VEX/priv/host_amd64_isel.c

   1
   2 /*---------------------------------------------------------------*/
   3 /*--- begin                                 host_amd64_isel.c ---*/
   4 /*---------------------------------------------------------------*/
   5
   6 /*
   7    This file is part of Valgrind, a dynamic binary instrumentation
   8    framework.
   9
  10    Copyright (C) 2004-2017 OpenWorks LLP
  11       info@open-works.net
  12
  13    This program is free software; you can redistribute it and/or
  14    modify it under the terms of the GNU General Public License as
  15    published by the Free Software Foundation; either version 2 of the
  16    License, or (at your option) any later version.
  17
  18    This program is distributed in the hope that it will be useful, but
  19    WITHOUT ANY WARRANTY; without even the implied warranty of
  20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  21    General Public License for more details.
  22
  23    You should have received a copy of the GNU General Public License
  24    along with this program; if not, see <http://www.gnu.org/licenses/>.
  25
  26    The GNU General Public License is contained in the file COPYING.
  27
  28    Neither the names of the U.S. Department of Energy nor the
  29    University of California nor the names of its contributors may be
  30    used to endorse or promote products derived from this software
  31    without prior written permission.
  32 */
  33
  34 #include "libvex_basictypes.h"
  35 #include "libvex_ir.h"
  36 #include "libvex.h"
  37
  38 #include "ir_match.h"
  39 #include "main_util.h"
  40 #include "main_globals.h"
  41 #include "host_generic_regs.h"
  42 #include "host_generic_simd64.h"
  43 #include "host_generic_simd128.h"
  44 #include "host_generic_simd256.h"
  45 #include "host_generic_maddf.h"
  46 #include "host_amd64_defs.h"
  47
  48
  49 /*---------------------------------------------------------*/
  50 /*--- x87/SSE control word stuff                        ---*/
  51 /*---------------------------------------------------------*/
  52
  53 /* Vex-generated code expects to run with the FPU set as follows: all
  54    exceptions masked, round-to-nearest, precision = 53 bits.  This
  55    corresponds to a FPU control word value of 0x027F.
  56
  57    Similarly the SSE control word (%mxcsr) should be 0x1F80.
  58
  59    %fpucw and %mxcsr should have these values on entry to
  60    Vex-generated code, and should those values should be
  61    unchanged at exit.
  62 */
  63
  64 #define DEFAULT_FPUCW 0x027F
  65
  66 #define DEFAULT_MXCSR 0x1F80
  67
  68 /* debugging only, do not use */
  69 /* define DEFAULT_FPUCW 0x037F */
  70
  71
  72 /*---------------------------------------------------------*/
  73 /*--- misc helpers                                      ---*/
  74 /*---------------------------------------------------------*/
  75
  76 /* These are duplicated in guest-amd64/toIR.c */
  77 static IRExpr* unop ( IROp op, IRExpr* a )
  78 {
  79    return IRExpr_Unop(op, a);
  80 }
  81
  82 static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
  83 {
  84    return IRExpr_Binop(op, a1, a2);
  85 }
  86
  87 static IRExpr* bind ( Int binder )
  88 {
  89    return IRExpr_Binder(binder);
  90 }
  91
  92 static Bool isZeroU8 ( const IRExpr* e )
  93 {
  94    return e->tag == Iex_Const
  95           && e->Iex.Const.con->tag == Ico_U8
  96           && e->Iex.Const.con->Ico.U8 == 0;
  97 }
  98
  99
 100 /*---------------------------------------------------------*/
 101 /*--- ISelEnv                                           ---*/
 102 /*---------------------------------------------------------*/
 103
 104 /* This carries around:
 105
 106    - A mapping from IRTemp to IRType, giving the type of any IRTemp we
 107      might encounter.  This is computed before insn selection starts,
 108      and does not change.
 109
 110    - A mapping from IRTemp to HReg.  This tells the insn selector
 111      which virtual register is associated with each IRTemp
 112      temporary.  This is computed before insn selection starts, and
 113      does not change.  We expect this mapping to map precisely the
 114      same set of IRTemps as the type mapping does.
 115
 116         - vregmap   holds the primary register for the IRTemp.
 117         - vregmapHI is only used for 128-bit integer-typed
 118              IRTemps.  It holds the identity of a second
 119              64-bit virtual HReg, which holds the high half
 120              of the value.
 121
 122    - The host subarchitecture we are selecting insns for.
 123      This is set at the start and does not change.
 124
 125    - The code array, that is, the insns selected so far.
 126
 127    - A counter, for generating new virtual registers.
 128
 129    - A Bool for indicating whether we may generate chain-me
 130      instructions for control flow transfers, or whether we must use
 131      XAssisted.
 132
 133    - The maximum guest address of any guest insn in this block.
 134      Actually, the address of the highest-addressed byte from any insn
 135      in this block.  Is set at the start and does not change.  This is
 136      used for detecting jumps which are definitely forward-edges from
 137      this block, and therefore can be made (chained) to the fast entry
 138      point of the destination, thereby avoiding the destination's
 139      event check.
 140
 141    Note, this is all host-independent.  (JRS 20050201: well, kinda
 142    ... not completely.  Compare with ISelEnv for X86.)
 143 */
 144
 145 typedef
 146    struct {
 147       /* Constant -- are set at the start and do not change. */
 148       IRTypeEnv*   type_env;
 149
 150       HReg*        vregmap;
 151       HReg*        vregmapHI;
 152       Int          n_vregmap;
 153
 154       UInt         hwcaps;
 155
 156       Bool         chainingAllowed;
 157       Addr64       max_ga;
 158
 159       /* These are modified as we go along. */
 160       HInstrArray* code;
 161       Int          vreg_ctr;
 162    }
 163    ISelEnv;
 164
 165
 166 static HReg lookupIRTemp ( ISelEnv* env, IRTemp tmp )
 167 {
 168    vassert(tmp >= 0);
 169    vassert(tmp < env->n_vregmap);
 170    return env->vregmap[tmp];
 171 }
 172
 173 static void lookupIRTempPair ( HReg* vrHI, HReg* vrLO,
 174                                ISelEnv* env, IRTemp tmp )
 175 {
 176    vassert(tmp >= 0);
 177    vassert(tmp < env->n_vregmap);
 178    vassert(! hregIsInvalid(env->vregmapHI[tmp]));
 179    *vrLO = env->vregmap[tmp];
 180    *vrHI = env->vregmapHI[tmp];
 181 }
 182
 183 static void addInstr ( ISelEnv* env, AMD64Instr* instr )
 184 {
 185    addHInstr(env->code, instr);
 186    if (vex_traceflags & VEX_TRACE_VCODE) {
 187       ppAMD64Instr(instr, True);
 188       vex_printf("\n");
 189    }
 190 }
 191
 192 static HReg newVRegI ( ISelEnv* env )
 193 {
 194    HReg reg = mkHReg(True/*virtual reg*/, HRcInt64, 0/*enc*/, env->vreg_ctr);
 195    env->vreg_ctr++;
 196    return reg;
 197 }
 198
 199 static HReg newVRegV ( ISelEnv* env )
 200 {
 201    HReg reg = mkHReg(True/*virtual reg*/, HRcVec128, 0/*enc*/, env->vreg_ctr);
 202    env->vreg_ctr++;
 203    return reg;
 204 }
 205
 206
 207 /*---------------------------------------------------------*/
 208 /*--- ISEL: Forward declarations                        ---*/
 209 /*---------------------------------------------------------*/
 210
 211 /* These are organised as iselXXX and iselXXX_wrk pairs.  The
 212    iselXXX_wrk do the real work, but are not to be called directly.
 213    For each XXX, iselXXX calls its iselXXX_wrk counterpart, then
 214    checks that all returned registers are virtual.  You should not
 215    call the _wrk version directly.
 216 */
 217 static AMD64RMI*     iselIntExpr_RMI_wrk ( ISelEnv* env, const IRExpr* e );
 218 static AMD64RMI*     iselIntExpr_RMI     ( ISelEnv* env, const IRExpr* e );
 219
 220 static AMD64RI*      iselIntExpr_RI_wrk  ( ISelEnv* env, const IRExpr* e );
 221 static AMD64RI*      iselIntExpr_RI      ( ISelEnv* env, const IRExpr* e );
 222
 223 static AMD64RM*      iselIntExpr_RM_wrk  ( ISelEnv* env, const IRExpr* e );
 224 static AMD64RM*      iselIntExpr_RM      ( ISelEnv* env, const IRExpr* e );
 225
 226 static HReg          iselIntExpr_R_wrk   ( ISelEnv* env, const IRExpr* e );
 227 static HReg          iselIntExpr_R       ( ISelEnv* env, const IRExpr* e );
 228
 229 static AMD64AMode*   iselIntExpr_AMode_wrk ( ISelEnv* env, const IRExpr* e );
 230 static AMD64AMode*   iselIntExpr_AMode     ( ISelEnv* env, const IRExpr* e );
 231
 232 static void          iselInt128Expr_wrk ( /*OUT*/HReg* rHi, HReg* rLo,
 233                                           ISelEnv* env, const IRExpr* e );
 234 static void          iselInt128Expr     ( /*OUT*/HReg* rHi, HReg* rLo,
 235                                           ISelEnv* env, const IRExpr* e );
 236
 237 static AMD64CondCode iselCondCode_wrk    ( ISelEnv* env, const IRExpr* e );
 238 static AMD64CondCode iselCondCode        ( ISelEnv* env, const IRExpr* e );
 239
 240 static HReg          iselDblExpr_wrk     ( ISelEnv* env, const IRExpr* e );
 241 static HReg          iselDblExpr         ( ISelEnv* env, const IRExpr* e );
 242
 243 static HReg          iselFltExpr_wrk     ( ISelEnv* env, const IRExpr* e );
 244 static HReg          iselFltExpr         ( ISelEnv* env, const IRExpr* e );
 245
 246 static HReg          iselVecExpr_wrk     ( ISelEnv* env, const IRExpr* e );
 247 static HReg          iselVecExpr         ( ISelEnv* env, const IRExpr* e );
 248
 249 static void          iselDVecExpr_wrk ( /*OUT*/HReg* rHi, HReg* rLo,
 250                                         ISelEnv* env, const IRExpr* e );
 251 static void          iselDVecExpr     ( /*OUT*/HReg* rHi, HReg* rLo,
 252                                         ISelEnv* env, const IRExpr* e );
 253
 254
 255 /*---------------------------------------------------------*/
 256 /*--- ISEL: Misc helpers                                ---*/
 257 /*---------------------------------------------------------*/
 258
 259 static Bool sane_AMode ( AMD64AMode* am )
 260 {
 261    switch (am->tag) {
 262       case Aam_IR:
 263          return
 264             toBool( hregClass(am->Aam.IR.reg) == HRcInt64
 265                     && (hregIsVirtual(am->Aam.IR.reg)
 266                         || sameHReg(am->Aam.IR.reg, hregAMD64_RBP())) );
 267       case Aam_IRRS:
 268          return
 269             toBool( hregClass(am->Aam.IRRS.base) == HRcInt64
 270                     && hregIsVirtual(am->Aam.IRRS.base)
 271                     && hregClass(am->Aam.IRRS.index) == HRcInt64
 272                     && hregIsVirtual(am->Aam.IRRS.index) );
 273       default:
 274         vpanic("sane_AMode: unknown amd64 amode tag");
 275    }
 276 }
 277
 278
 279 /* Can the lower 32 bits be signedly widened to produce the whole
 280    64-bit value?  In other words, are the top 33 bits either all 0 or
 281    all 1 ? */
 282 static Bool fitsIn32Bits ( ULong x )
 283 {
 284    Long y1;
 285    y1 = x << 32;
 286    y1 >>=/*s*/ 32;
 287    return toBool(x == y1);
 288 }
 289
 290 /* Is this a 64-bit zero expression? */
 291
 292 static Bool isZeroU64 ( const IRExpr* e )
 293 {
 294    return e->tag == Iex_Const
 295           && e->Iex.Const.con->tag == Ico_U64
 296           && e->Iex.Const.con->Ico.U64 == 0ULL;
 297 }
 298
 299 static Bool isZeroU32 ( const IRExpr* e )
 300 {
 301    return e->tag == Iex_Const
 302           && e->Iex.Const.con->tag == Ico_U32
 303           && e->Iex.Const.con->Ico.U32 == 0;
 304 }
 305
 306 /* Are both args atoms and the same?  This is copy of eqIRAtom
 307    that omits the assertions that the args are indeed atoms. */
 308
 309 static Bool areAtomsAndEqual ( const IRExpr* a1, const IRExpr* a2 )
 310 {
 311    if (a1->tag == Iex_RdTmp && a2->tag == Iex_RdTmp)
 312       return toBool(a1->Iex.RdTmp.tmp == a2->Iex.RdTmp.tmp);
 313    if (a1->tag == Iex_Const && a2->tag == Iex_Const)
 314       return eqIRConst(a1->Iex.Const.con, a2->Iex.Const.con);
 315    return False;
 316 }
 317
 318 /* Make a int reg-reg move. */
 319
 320 static AMD64Instr* mk_iMOVsd_RR ( HReg src, HReg dst )
 321 {
 322    vassert(hregClass(src) == HRcInt64);
 323    vassert(hregClass(dst) == HRcInt64);
 324    return AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst);
 325 }
 326
 327 /* Make a vector (128 bit) reg-reg move. */
 328
 329 static AMD64Instr* mk_vMOVsd_RR ( HReg src, HReg dst )
 330 {
 331    vassert(hregClass(src) == HRcVec128);
 332    vassert(hregClass(dst) == HRcVec128);
 333    return AMD64Instr_SseReRg(Asse_MOV, src, dst);
 334 }
 335
 336 /* Advance/retreat %rsp by n. */
 337
 338 static void add_to_rsp ( ISelEnv* env, Int n )
 339 {
 340    vassert(n > 0 && n < 256 && (n%8) == 0);
 341    addInstr(env,
 342             AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(n),
 343                                         hregAMD64_RSP()));
 344 }
 345
 346 static void sub_from_rsp ( ISelEnv* env, Int n )
 347 {
 348    vassert(n > 0 && n < 256 && (n%8) == 0);
 349    addInstr(env,
 350             AMD64Instr_Alu64R(Aalu_SUB, AMD64RMI_Imm(n),
 351                                         hregAMD64_RSP()));
 352 }
 353
 354 /* Push 64-bit constants on the stack. */
 355 static void push_uimm64( ISelEnv* env, ULong uimm64 )
 356 {
 357    /* If uimm64 can be expressed as the sign extension of its
 358       lower 32 bits, we can do it the easy way. */
 359    Long simm64 = (Long)uimm64;
 360    if ( simm64 == ((Long)(uimm64 << 32) >> 32) ) {
 361       addInstr( env, AMD64Instr_Push(AMD64RMI_Imm( (UInt)uimm64 )) );
 362    } else {
 363       HReg tmp = newVRegI(env);
 364       addInstr( env, AMD64Instr_Imm64(uimm64, tmp) );
 365       addInstr( env, AMD64Instr_Push(AMD64RMI_Reg(tmp)) );
 366    }
 367 }
 368
 369
 370 /* Used only in doHelperCall.  If possible, produce a single
 371    instruction which computes 'e' into 'dst'.  If not possible, return
 372    NULL. */
 373
 374 static AMD64Instr* iselIntExpr_single_instruction ( ISelEnv* env,
 375                                                     HReg     dst,
 376                                                     IRExpr*  e )
 377 {
 378    /* Per comments in doHelperCall below, appearance of
 379       Iex_VECRET implies ill-formed IR. */
 380    vassert(e->tag != Iex_VECRET);
 381
 382    /* In this case we give out a copy of the BaseBlock pointer. */
 383    if (UNLIKELY(e->tag == Iex_GSPTR)) {
 384       return mk_iMOVsd_RR( hregAMD64_RBP(), dst );
 385    }
 386
 387    vassert(typeOfIRExpr(env->type_env, e) == Ity_I64);
 388
 389    if (e->tag == Iex_Const) {
 390       vassert(e->Iex.Const.con->tag == Ico_U64);
 391       if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
 392          return AMD64Instr_Alu64R(
 393                    Aalu_MOV,
 394                    AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64)),
 395                    dst
 396                 );
 397       } else {
 398          return AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, dst);
 399       }
 400    }
 401
 402    if (e->tag == Iex_RdTmp) {
 403       HReg src = lookupIRTemp(env, e->Iex.RdTmp.tmp);
 404       return mk_iMOVsd_RR(src, dst);
 405    }
 406
 407    if (e->tag == Iex_Get) {
 408       vassert(e->Iex.Get.ty == Ity_I64);
 409       return AMD64Instr_Alu64R(
 410                 Aalu_MOV,
 411                 AMD64RMI_Mem(
 412                    AMD64AMode_IR(e->Iex.Get.offset,
 413                                  hregAMD64_RBP())),
 414                 dst);
 415    }
 416
 417    if (e->tag == Iex_Unop
 418        && e->Iex.Unop.op == Iop_32Uto64
 419        && e->Iex.Unop.arg->tag == Iex_RdTmp) {
 420       HReg src = lookupIRTemp(env, e->Iex.Unop.arg->Iex.RdTmp.tmp);
 421       return AMD64Instr_MovxLQ(False, src, dst);
 422    }
 423
 424    if (0) { ppIRExpr(e); vex_printf("\n"); }
 425
 426    return NULL;
 427 }
 428
 429
 430 /* Do a complete function call.  |guard| is a Ity_Bit expression
 431    indicating whether or not the call happens.  If guard==NULL, the
 432    call is unconditional.  |retloc| is set to indicate where the
 433    return value is after the call.  The caller (of this fn) must
 434    generate code to add |stackAdjustAfterCall| to the stack pointer
 435    after the call is done. */
 436
 437 static
 438 void doHelperCall ( /*OUT*/UInt*   stackAdjustAfterCall,
 439                     /*OUT*/RetLoc* retloc,
 440                     ISelEnv* env,
 441                     IRExpr* guard,
 442                     IRCallee* cee, IRType retTy, IRExpr** args )
 443 {
 444    AMD64CondCode cc;
 445    HReg          argregs[6];
 446    HReg          tmpregs[6];
 447    AMD64Instr*   fastinstrs[6];
 448    UInt          n_args, i;
 449
 450    /* Set default returns.  We'll update them later if needed. */
 451    *stackAdjustAfterCall = 0;
 452    *retloc               = mk_RetLoc_INVALID();
 453
 454    /* These are used for cross-checking that IR-level constraints on
 455       the use of IRExpr_VECRET() and IRExpr_GSPTR() are observed. */
 456    UInt nVECRETs = 0;
 457    UInt nGSPTRs  = 0;
 458
 459    /* Marshal args for a call and do the call.
 460
 461       This function only deals with a tiny set of possibilities, which
 462       cover all helpers in practice.  The restrictions are that only
 463       arguments in registers are supported, hence only 6x64 integer
 464       bits in total can be passed.  In fact the only supported arg
 465       type is I64.
 466
 467       The return type can be I{64,32,16,8} or V{128,256}.  In the
 468       latter two cases, it is expected that |args| will contain the
 469       special node IRExpr_VECRET(), in which case this routine
 470       generates code to allocate space on the stack for the vector
 471       return value.  Since we are not passing any scalars on the
 472       stack, it is enough to preallocate the return space before
 473       marshalling any arguments, in this case.
 474
 475       |args| may also contain IRExpr_GSPTR(), in which case the
 476       value in %rbp is passed as the corresponding argument.
 477
 478       Generating code which is both efficient and correct when
 479       parameters are to be passed in registers is difficult, for the
 480       reasons elaborated in detail in comments attached to
 481       doHelperCall() in priv/host-x86/isel.c.  Here, we use a variant
 482       of the method described in those comments.
 483
 484       The problem is split into two cases: the fast scheme and the
 485       slow scheme.  In the fast scheme, arguments are computed
 486       directly into the target (real) registers.  This is only safe
 487       when we can be sure that computation of each argument will not
 488       trash any real registers set by computation of any other
 489       argument.
 490
 491       In the slow scheme, all args are first computed into vregs, and
 492       once they are all done, they are moved to the relevant real
 493       regs.  This always gives correct code, but it also gives a bunch
 494       of vreg-to-rreg moves which are usually redundant but are hard
 495       for the register allocator to get rid of.
 496
 497       To decide which scheme to use, all argument expressions are
 498       first examined.  If they are all so simple that it is clear they
 499       will be evaluated without use of any fixed registers, use the
 500       fast scheme, else use the slow scheme.  Note also that only
 501       unconditional calls may use the fast scheme, since having to
 502       compute a condition expression could itself trash real
 503       registers.  Note that for simplicity, in the case where
 504       IRExpr_VECRET() is present, we use the slow scheme.  This is
 505       motivated by the desire to avoid any possible complexity
 506       w.r.t. nested calls.
 507
 508       Note this requires being able to examine an expression and
 509       determine whether or not evaluation of it might use a fixed
 510       register.  That requires knowledge of how the rest of this insn
 511       selector works.  Currently just the following 3 are regarded as
 512       safe -- hopefully they cover the majority of arguments in
 513       practice: IRExpr_Tmp IRExpr_Const IRExpr_Get.
 514    */
 515
 516    /* Note that the cee->regparms field is meaningless on AMD64 host
 517       (since there is only one calling convention) and so we always
 518       ignore it. */
 519    n_args = 0;
 520    for (i = 0; args[i]; i++)
 521       n_args++;
 522
 523    if (n_args > 6)
 524       vpanic("doHelperCall(AMD64): cannot currently handle > 6 args");
 525
 526    argregs[0] = hregAMD64_RDI();
 527    argregs[1] = hregAMD64_RSI();
 528    argregs[2] = hregAMD64_RDX();
 529    argregs[3] = hregAMD64_RCX();
 530    argregs[4] = hregAMD64_R8();
 531    argregs[5] = hregAMD64_R9();
 532
 533    tmpregs[0] = tmpregs[1] = tmpregs[2] =
 534    tmpregs[3] = tmpregs[4] = tmpregs[5] = INVALID_HREG;
 535
 536    fastinstrs[0] = fastinstrs[1] = fastinstrs[2] =
 537    fastinstrs[3] = fastinstrs[4] = fastinstrs[5] = NULL;
 538
 539    /* First decide which scheme (slow or fast) is to be used.  First
 540       assume the fast scheme, and select slow if any contraindications
 541       (wow) appear. */
 542
 543    /* We'll need space on the stack for the return value.  Avoid
 544       possible complications with nested calls by using the slow
 545       scheme. */
 546    if (retTy == Ity_V128 || retTy == Ity_V256)
 547       goto slowscheme;
 548
 549    if (guard) {
 550       if (guard->tag == Iex_Const
 551           && guard->Iex.Const.con->tag == Ico_U1
 552           && guard->Iex.Const.con->Ico.U1 == True) {
 553          /* unconditional */
 554       } else {
 555          /* Not manifestly unconditional -- be conservative. */
 556          goto slowscheme;
 557       }
 558    }
 559
 560    /* Ok, let's try for the fast scheme.  If it doesn't pan out, we'll
 561       use the slow scheme.  Because this is tentative, we can't call
 562       addInstr (that is, commit to) any instructions until we're
 563       handled all the arguments.  So park the resulting instructions
 564       in a buffer and emit that if we're successful. */
 565
 566    /* FAST SCHEME */
 567    /* In this loop, we process args that can be computed into the
 568       destination (real) register with a single instruction, without
 569       using any fixed regs.  That also includes IRExpr_GSPTR(), but
 570       not IRExpr_VECRET().  Indeed, if the IR is well-formed, we can
 571       never see IRExpr_VECRET() at this point, since the return-type
 572       check above should ensure all those cases use the slow scheme
 573       instead. */
 574    vassert(n_args >= 0 && n_args <= 6);
 575    for (i = 0; i < n_args; i++) {
 576       IRExpr* arg = args[i];
 577       if (LIKELY(!is_IRExpr_VECRET_or_GSPTR(arg))) {
 578          vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
 579       }
 580       fastinstrs[i]
 581          = iselIntExpr_single_instruction( env, argregs[i], args[i] );
 582       if (fastinstrs[i] == NULL)
 583          goto slowscheme;
 584    }
 585
 586    /* Looks like we're in luck.  Emit the accumulated instructions and
 587       move on to doing the call itself. */
 588    for (i = 0; i < n_args; i++)
 589       addInstr(env, fastinstrs[i]);
 590
 591    /* Fast scheme only applies for unconditional calls.  Hence: */
 592    cc = Acc_ALWAYS;
 593
 594    goto handle_call;
 595
 596
 597    /* SLOW SCHEME; move via temporaries */
 598   slowscheme:
 599    {}
 600 #  if 0 /* debug only */
 601    if (n_args > 0) {for (i = 0; args[i]; i++) {
 602    ppIRExpr(args[i]); vex_printf(" "); }
 603    vex_printf("\n");}
 604 #  endif
 605
 606    /* If we have a vector return type, allocate a place for it on the
 607       stack and record its address. */
 608    HReg r_vecRetAddr = INVALID_HREG;
 609    if (retTy == Ity_V128) {
 610       r_vecRetAddr = newVRegI(env);
 611       sub_from_rsp(env, 16);
 612       addInstr(env, mk_iMOVsd_RR( hregAMD64_RSP(), r_vecRetAddr ));
 613    }
 614    else if (retTy == Ity_V256) {
 615       r_vecRetAddr = newVRegI(env);
 616       sub_from_rsp(env, 32);
 617       addInstr(env, mk_iMOVsd_RR( hregAMD64_RSP(), r_vecRetAddr ));
 618    }
 619
 620    vassert(n_args >= 0 && n_args <= 6);
 621    for (i = 0; i < n_args; i++) {
 622       IRExpr* arg = args[i];
 623       if (UNLIKELY(arg->tag == Iex_GSPTR)) {
 624          tmpregs[i] = newVRegI(env);
 625          addInstr(env, mk_iMOVsd_RR( hregAMD64_RBP(), tmpregs[i]));
 626          nGSPTRs++;
 627       }
 628       else if (UNLIKELY(arg->tag == Iex_VECRET)) {
 629          /* We stashed the address of the return slot earlier, so just
 630             retrieve it now. */
 631          vassert(!hregIsInvalid(r_vecRetAddr));
 632          tmpregs[i] = r_vecRetAddr;
 633          nVECRETs++;
 634       }
 635       else {
 636          vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
 637          tmpregs[i] = iselIntExpr_R(env, args[i]);
 638       }
 639    }
 640
 641    /* Now we can compute the condition.  We can't do it earlier
 642       because the argument computations could trash the condition
 643       codes.  Be a bit clever to handle the common case where the
 644       guard is 1:Bit. */
 645    cc = Acc_ALWAYS;
 646    if (guard) {
 647       if (guard->tag == Iex_Const
 648           && guard->Iex.Const.con->tag == Ico_U1
 649           && guard->Iex.Const.con->Ico.U1 == True) {
 650          /* unconditional -- do nothing */
 651       } else {
 652          cc = iselCondCode( env, guard );
 653       }
 654    }
 655
 656    /* Move the args to their final destinations. */
 657    for (i = 0; i < n_args; i++) {
 658       /* None of these insns, including any spill code that might
 659          be generated, may alter the condition codes. */
 660       addInstr( env, mk_iMOVsd_RR( tmpregs[i], argregs[i] ) );
 661    }
 662
 663
 664    /* Do final checks, set the return values, and generate the call
 665       instruction proper. */
 666   handle_call:
 667
 668    if (retTy == Ity_V128 || retTy == Ity_V256) {
 669       vassert(nVECRETs == 1);
 670    } else {
 671       vassert(nVECRETs == 0);
 672    }
 673
 674    vassert(nGSPTRs == 0 || nGSPTRs == 1);
 675
 676    vassert(*stackAdjustAfterCall == 0);
 677    vassert(is_RetLoc_INVALID(*retloc));
 678    switch (retTy) {
 679          case Ity_INVALID:
 680             /* Function doesn't return a value. */
 681             *retloc = mk_RetLoc_simple(RLPri_None);
 682             break;
 683          case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
 684             *retloc = mk_RetLoc_simple(RLPri_Int);
 685             break;
 686          case Ity_V128:
 687             *retloc = mk_RetLoc_spRel(RLPri_V128SpRel, 0);
 688             *stackAdjustAfterCall = 16;
 689             break;
 690          case Ity_V256:
 691             *retloc = mk_RetLoc_spRel(RLPri_V256SpRel, 0);
 692             *stackAdjustAfterCall = 32;
 693             break;
 694          default:
 695             /* IR can denote other possible return types, but we don't
 696                handle those here. */
 697            vassert(0);
 698    }
 699
 700    /* Finally, generate the call itself.  This needs the *retloc value
 701       set in the switch above, which is why it's at the end. */
 702    addInstr(env,
 703             AMD64Instr_Call(cc, (Addr)cee->addr, n_args, *retloc));
 704 }
 705
 706
 707 /* Given a guest-state array descriptor, an index expression and a
 708    bias, generate an AMD64AMode holding the relevant guest state
 709    offset. */
 710
 711 static
 712 AMD64AMode* genGuestArrayOffset ( ISelEnv* env, IRRegArray* descr,
 713                                   IRExpr* off, Int bias )
 714 {
 715    HReg tmp, roff;
 716    Int  elemSz = sizeofIRType(descr->elemTy);
 717    Int  nElems = descr->nElems;
 718
 719    /* Throw out any cases not generated by an amd64 front end.  In
 720       theory there might be a day where we need to handle them -- if
 721       we ever run non-amd64-guest on amd64 host. */
 722
 723    if (nElems != 8 || (elemSz != 1 && elemSz != 8))
 724       vpanic("genGuestArrayOffset(amd64 host)");
 725
 726    /* Compute off into a reg, %off.  Then return:
 727
 728          movq %off, %tmp
 729          addq $bias, %tmp  (if bias != 0)
 730          andq %tmp, 7
 731          ... base(%rbp, %tmp, shift) ...
 732    */
 733    tmp  = newVRegI(env);
 734    roff = iselIntExpr_R(env, off);
 735    addInstr(env, mk_iMOVsd_RR(roff, tmp));
 736    if (bias != 0) {
 737       /* Make sure the bias is sane, in the sense that there are
 738          no significant bits above bit 30 in it. */
 739       vassert(-10000 < bias && bias < 10000);
 740       addInstr(env,
 741                AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(bias), tmp));
 742    }
 743    addInstr(env,
 744             AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(7), tmp));
 745    vassert(elemSz == 1 || elemSz == 8);
 746    return
 747       AMD64AMode_IRRS( descr->base, hregAMD64_RBP(), tmp,
 748                                     elemSz==8 ? 3 : 0);
 749 }
 750
 751
 752 /* Set the SSE unit's rounding mode to default (%mxcsr = 0x1F80) */
 753 static
 754 void set_SSE_rounding_default ( ISelEnv* env )
 755 {
 756    /* pushq $DEFAULT_MXCSR
 757       ldmxcsr 0(%rsp)
 758       addq $8, %rsp
 759    */
 760    AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
 761    addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(DEFAULT_MXCSR)));
 762    addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
 763    add_to_rsp(env, 8);
 764 }
 765
 766 /* Mess with the FPU's rounding mode: set to the default rounding mode
 767    (DEFAULT_FPUCW). */
 768 static
 769 void set_FPU_rounding_default ( ISelEnv* env )
 770 {
 771    /* movq $DEFAULT_FPUCW, -8(%rsp)
 772       fldcw -8(%esp)
 773    */
 774    AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
 775    addInstr(env, AMD64Instr_Alu64M(
 776                     Aalu_MOV, AMD64RI_Imm(DEFAULT_FPUCW), m8_rsp));
 777    addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
 778 }
 779
 780
 781 /* Mess with the SSE unit's rounding mode: 'mode' is an I32-typed
 782    expression denoting a value in the range 0 .. 3, indicating a round
 783    mode encoded as per type IRRoundingMode.  Set the SSE machinery to
 784    have the same rounding.
 785 */
 786 static
 787 void set_SSE_rounding_mode ( ISelEnv* env, IRExpr* mode )
 788 {
 789    /* Note: this sequence only makes sense because DEFAULT_MXCSR has
 790       both rounding bits == 0.  If that wasn't the case, we couldn't
 791       create a new rounding field simply by ORing the new value into
 792       place. */
 793
 794    /* movq $3, %reg
 795       andq [[mode]], %reg  -- shouldn't be needed; paranoia
 796       shlq $13, %reg
 797       orq $DEFAULT_MXCSR, %reg
 798       pushq %reg
 799       ldmxcsr 0(%esp)
 800       addq $8, %rsp
 801    */
 802    HReg        reg      = newVRegI(env);
 803    AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
 804    addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Imm(3), reg));
 805    addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
 806                                    iselIntExpr_RMI(env, mode), reg));
 807    addInstr(env, AMD64Instr_Sh64(Ash_SHL, 13, reg));
 808    addInstr(env, AMD64Instr_Alu64R(
 809                     Aalu_OR, AMD64RMI_Imm(DEFAULT_MXCSR), reg));
 810    addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(reg)));
 811    addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
 812    add_to_rsp(env, 8);
 813 }
 814
 815
 816 /* Mess with the FPU's rounding mode: 'mode' is an I32-typed
 817    expression denoting a value in the range 0 .. 3, indicating a round
 818    mode encoded as per type IRRoundingMode.  Set the x87 FPU to have
 819    the same rounding.
 820 */
 821 static
 822 void set_FPU_rounding_mode ( ISelEnv* env, IRExpr* mode )
 823 {
 824    HReg rrm  = iselIntExpr_R(env, mode);
 825    HReg rrm2 = newVRegI(env);
 826    AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
 827
 828    /* movq  %rrm, %rrm2
 829       andq  $3, %rrm2   -- shouldn't be needed; paranoia
 830       shlq  $10, %rrm2
 831       orq   $DEFAULT_FPUCW, %rrm2
 832       movq  %rrm2, -8(%rsp)
 833       fldcw -8(%esp)
 834    */
 835    addInstr(env, mk_iMOVsd_RR(rrm, rrm2));
 836    addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(3), rrm2));
 837    addInstr(env, AMD64Instr_Sh64(Ash_SHL, 10, rrm2));
 838    addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
 839                                    AMD64RMI_Imm(DEFAULT_FPUCW), rrm2));
 840    addInstr(env, AMD64Instr_Alu64M(Aalu_MOV,
 841                                    AMD64RI_Reg(rrm2), m8_rsp));
 842    addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
 843 }
 844
 845
 846 /* Generate all-zeroes into a new vector register.
 847 */
 848 static HReg generate_zeroes_V128 ( ISelEnv* env )
 849 {
 850    HReg dst = newVRegV(env);
 851    addInstr(env, AMD64Instr_SseReRg(Asse_XOR, dst, dst));
 852    return dst;
 853 }
 854
 855 /* Generate all-ones into a new vector register.
 856 */
 857 static HReg generate_ones_V128 ( ISelEnv* env )
 858 {
 859    HReg dst = newVRegV(env);
 860    addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, dst, dst));
 861    return dst;
 862 }
 863
 864
 865 /* Generate !src into a new vector register.  Amazing that there isn't
 866    a less crappy way to do this.
 867 */
 868 static HReg do_sse_NotV128 ( ISelEnv* env, HReg src )
 869 {
 870    HReg dst = generate_ones_V128(env);
 871    addInstr(env, AMD64Instr_SseReRg(Asse_XOR, src, dst));
 872    return dst;
 873 }
 874
 875
 876 /* Expand the given byte into a 64-bit word, by cloning each bit
 877    8 times. */
 878 static ULong bitmask8_to_bytemask64 ( UShort w8 )
 879 {
 880    vassert(w8 == (w8 & 0xFF));
 881    ULong w64 = 0;
 882    Int i;
 883    for (i = 0; i < 8; i++) {
 884       if (w8 & (1<<i))
 885          w64 |= (0xFFULL << (8 * i));
 886    }
 887    return w64;
 888 }
 889
 890
 891 /*---------------------------------------------------------*/
 892 /*--- ISEL: Integer expressions (64/32/16/8 bit)        ---*/
 893 /*---------------------------------------------------------*/
 894
 895 /* Select insns for an integer-typed expression, and add them to the
 896    code list.  Return a reg holding the result.  This reg will be a
 897    virtual register.  THE RETURNED REG MUST NOT BE MODIFIED.  If you
 898    want to modify it, ask for a new vreg, copy it in there, and modify
 899    the copy.  The register allocator will do its best to map both
 900    vregs to the same real register, so the copies will often disappear
 901    later in the game.
 902
 903    This should handle expressions of 64, 32, 16 and 8-bit type.  All
 904    results are returned in a 64-bit register.  For 32-, 16- and 8-bit
 905    expressions, the upper 32/48/56 bits are arbitrary, so you should
 906    mask or sign extend partial values if necessary.
 907 */
 908
 909 static HReg iselIntExpr_R ( ISelEnv* env, const IRExpr* e )
 910 {
 911    HReg r = iselIntExpr_R_wrk(env, e);
 912    /* sanity checks ... */
 913 #  if 0
 914    vex_printf("\niselIntExpr_R: "); ppIRExpr(e); vex_printf("\n");
 915 #  endif
 916    vassert(hregClass(r) == HRcInt64);
 917    vassert(hregIsVirtual(r));
 918    return r;
 919 }
 920
 921 /* DO NOT CALL THIS DIRECTLY ! */
 922 static HReg iselIntExpr_R_wrk ( ISelEnv* env, const IRExpr* e )
 923 {
 924    MatchInfo mi;
 925    DECLARE_PATTERN(p_1Uto8_64to1);
 926    DECLARE_PATTERN(p_LDle8_then_8Uto64);
 927    DECLARE_PATTERN(p_LDle16_then_16Uto64);
 928
 929    IRType ty = typeOfIRExpr(env->type_env,e);
 930    switch (ty) {
 931       case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: break;
 932       default: vassert(0);
 933    }
 934
 935    switch (e->tag) {
 936
 937    /* --------- TEMP --------- */
 938    case Iex_RdTmp: {
 939       return lookupIRTemp(env, e->Iex.RdTmp.tmp);
 940    }
 941
 942    /* --------- LOAD --------- */
 943    case Iex_Load: {
 944       HReg dst = newVRegI(env);
 945       AMD64AMode* amode = iselIntExpr_AMode ( env, e->Iex.Load.addr );
 946
 947       /* We can't handle big-endian loads, nor load-linked. */
 948       if (e->Iex.Load.end != Iend_LE)
 949          goto irreducible;
 950
 951       if (ty == Ity_I64) {
 952          addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,
 953                                          AMD64RMI_Mem(amode), dst) );
 954          return dst;
 955       }
 956       if (ty == Ity_I32) {
 957          addInstr(env, AMD64Instr_LoadEX(4,False,amode,dst));
 958          return dst;
 959       }
 960       if (ty == Ity_I16) {
 961          addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
 962          return dst;
 963       }
 964       if (ty == Ity_I8) {
 965          addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
 966          return dst;
 967       }
 968       break;
 969    }
 970
 971    /* --------- BINARY OP --------- */
 972    case Iex_Binop: {
 973       AMD64AluOp   aluOp;
 974       AMD64ShiftOp shOp;
 975
 976       /* Pattern: Sub64(0,x) */
 977       /*     and: Sub32(0,x) */
 978       if ((e->Iex.Binop.op == Iop_Sub64 && isZeroU64(e->Iex.Binop.arg1))
 979           || (e->Iex.Binop.op == Iop_Sub32 && isZeroU32(e->Iex.Binop.arg1))) {
 980          HReg dst = newVRegI(env);
 981          HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg2);
 982          addInstr(env, mk_iMOVsd_RR(reg,dst));
 983          addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
 984          return dst;
 985       }
 986
 987       /* Is it an addition or logical style op? */
 988       switch (e->Iex.Binop.op) {
 989          case Iop_Add8: case Iop_Add16: case Iop_Add32: case Iop_Add64:
 990             aluOp = Aalu_ADD; break;
 991          case Iop_Sub8: case Iop_Sub16: case Iop_Sub32: case Iop_Sub64:
 992             aluOp = Aalu_SUB; break;
 993          case Iop_And8: case Iop_And16: case Iop_And32: case Iop_And64:
 994             aluOp = Aalu_AND; break;
 995          case Iop_Or8:  case Iop_Or16:  case Iop_Or32:  case Iop_Or64:
 996             aluOp = Aalu_OR; break;
 997          case Iop_Xor8: case Iop_Xor16: case Iop_Xor32: case Iop_Xor64:
 998             aluOp = Aalu_XOR; break;
 999          case Iop_Mul16: case Iop_Mul32: case Iop_Mul64:
1000             aluOp = Aalu_MUL; break;
1001          default:
1002             aluOp = Aalu_INVALID; break;
1003       }
1004       /* For commutative ops we assume any literal
1005          values are on the second operand. */
1006       if (aluOp != Aalu_INVALID) {
1007          HReg dst      = newVRegI(env);
1008          HReg reg      = iselIntExpr_R(env, e->Iex.Binop.arg1);
1009          AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
1010          addInstr(env, mk_iMOVsd_RR(reg,dst));
1011          addInstr(env, AMD64Instr_Alu64R(aluOp, rmi, dst));
1012          return dst;
1013       }
1014
1015       /* Perhaps a shift op? */
1016       switch (e->Iex.Binop.op) {
1017          case Iop_Shl64: case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
1018             shOp = Ash_SHL; break;
1019          case Iop_Shr64: case Iop_Shr32: case Iop_Shr16: case Iop_Shr8:
1020             shOp = Ash_SHR; break;
1021          case Iop_Sar64: case Iop_Sar32: case Iop_Sar16: case Iop_Sar8:
1022             shOp = Ash_SAR; break;
1023          default:
1024             shOp = Ash_INVALID; break;
1025       }
1026       if (shOp != Ash_INVALID) {
1027          HReg dst = newVRegI(env);
1028
1029          /* regL = the value to be shifted */
1030          HReg regL   = iselIntExpr_R(env, e->Iex.Binop.arg1);
1031          addInstr(env, mk_iMOVsd_RR(regL,dst));
1032
1033          /* Do any necessary widening for 32/16/8 bit operands */
1034          switch (e->Iex.Binop.op) {
1035             case Iop_Shr64: case Iop_Shl64: case Iop_Sar64:
1036                break;
1037             case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
1038                break;
1039             case Iop_Shr8:
1040                addInstr(env, AMD64Instr_Alu64R(
1041                                 Aalu_AND, AMD64RMI_Imm(0xFF), dst));
1042                break;
1043             case Iop_Shr16:
1044                addInstr(env, AMD64Instr_Alu64R(
1045                                 Aalu_AND, AMD64RMI_Imm(0xFFFF), dst));
1046                break;
1047             case Iop_Shr32:
1048                addInstr(env, AMD64Instr_MovxLQ(False, dst, dst));
1049                break;
1050             case Iop_Sar8:
1051                addInstr(env, AMD64Instr_Sh64(Ash_SHL, 56, dst));
1052                addInstr(env, AMD64Instr_Sh64(Ash_SAR, 56, dst));
1053                break;
1054             case Iop_Sar16:
1055                addInstr(env, AMD64Instr_Sh64(Ash_SHL, 48, dst));
1056                addInstr(env, AMD64Instr_Sh64(Ash_SAR, 48, dst));
1057                break;
1058             case Iop_Sar32:
1059                addInstr(env, AMD64Instr_MovxLQ(True, dst, dst));
1060                break;
1061             default:
1062                ppIROp(e->Iex.Binop.op);
1063                vassert(0);
1064          }
1065
1066          /* Now consider the shift amount.  If it's a literal, we
1067             can do a much better job than the general case. */
1068          if (e->Iex.Binop.arg2->tag == Iex_Const) {
1069             /* assert that the IR is well-typed */
1070             Int nshift;
1071             vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
1072             nshift = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
1073             vassert(nshift >= 0);
1074             if (nshift > 0)
1075                /* Can't allow nshift==0 since that means %cl */
1076                addInstr(env, AMD64Instr_Sh64(shOp, nshift, dst));
1077          } else {
1078             /* General case; we have to force the amount into %cl. */
1079             HReg regR = iselIntExpr_R(env, e->Iex.Binop.arg2);
1080             addInstr(env, mk_iMOVsd_RR(regR,hregAMD64_RCX()));
1081             addInstr(env, AMD64Instr_Sh64(shOp, 0/* %cl */, dst));
1082          }
1083          return dst;
1084       }
1085
1086       /* Handle misc other scalar ops. */
1087       if (e->Iex.Binop.op == Iop_Max32U) {
1088          HReg src1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1089          HReg dst  = newVRegI(env);
1090          HReg src2 = iselIntExpr_R(env, e->Iex.Binop.arg2);
1091          addInstr(env, mk_iMOVsd_RR(src1, dst));
1092          addInstr(env, AMD64Instr_Alu32R(Aalu_CMP, AMD64RMI_Reg(src2), dst));
1093          addInstr(env, AMD64Instr_CMov64(Acc_B, src2, dst));
1094          return dst;
1095       }
1096
1097       if (e->Iex.Binop.op == Iop_DivModS64to32
1098           || e->Iex.Binop.op == Iop_DivModU64to32) {
1099          /* 64 x 32 -> (32(rem),32(div)) division */
1100          /* Get the 64-bit operand into edx:eax, and the other into
1101             any old R/M. */
1102          HReg      rax     = hregAMD64_RAX();
1103          HReg      rdx     = hregAMD64_RDX();
1104          HReg      dst     = newVRegI(env);
1105          Bool      syned   = toBool(e->Iex.Binop.op == Iop_DivModS64to32);
1106          AMD64RM*  rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
1107          /* Compute the left operand into a reg, and then
1108             put the top half in edx and the bottom in eax. */
1109          HReg left64 = iselIntExpr_R(env, e->Iex.Binop.arg1);
1110          addInstr(env, mk_iMOVsd_RR(left64, rdx));
1111          addInstr(env, mk_iMOVsd_RR(left64, rax));
1112          addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, rdx));
1113          addInstr(env, AMD64Instr_Div(syned, 4, rmRight));
1114          addInstr(env, AMD64Instr_MovxLQ(False, rdx, rdx));
1115          addInstr(env, AMD64Instr_MovxLQ(False, rax, rax));
1116          addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, rdx));
1117          addInstr(env, mk_iMOVsd_RR(rax, dst));
1118          addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(rdx), dst));
1119          return dst;
1120       }
1121
1122       if (e->Iex.Binop.op == Iop_32HLto64) {
1123          HReg hi32  = newVRegI(env);
1124          HReg lo32  = newVRegI(env);
1125          HReg hi32s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1126          HReg lo32s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1127          addInstr(env, mk_iMOVsd_RR(hi32s, hi32));
1128          addInstr(env, mk_iMOVsd_RR(lo32s, lo32));
1129          addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, hi32));
1130          addInstr(env, AMD64Instr_MovxLQ(False, lo32, lo32));
1131          addInstr(env, AMD64Instr_Alu64R(
1132                           Aalu_OR, AMD64RMI_Reg(lo32), hi32));
1133          return hi32;
1134       }
1135
1136       if (e->Iex.Binop.op == Iop_16HLto32) {
1137          HReg hi16  = newVRegI(env);
1138          HReg lo16  = newVRegI(env);
1139          HReg hi16s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1140          HReg lo16s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1141          addInstr(env, mk_iMOVsd_RR(hi16s, hi16));
1142          addInstr(env, mk_iMOVsd_RR(lo16s, lo16));
1143          addInstr(env, AMD64Instr_Sh64(Ash_SHL, 16, hi16));
1144          addInstr(env, AMD64Instr_Alu64R(
1145                           Aalu_AND, AMD64RMI_Imm(0xFFFF), lo16));
1146          addInstr(env, AMD64Instr_Alu64R(
1147                           Aalu_OR, AMD64RMI_Reg(lo16), hi16));
1148          return hi16;
1149       }
1150
1151       if (e->Iex.Binop.op == Iop_8HLto16) {
1152          HReg hi8  = newVRegI(env);
1153          HReg lo8  = newVRegI(env);
1154          HReg hi8s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1155          HReg lo8s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1156          addInstr(env, mk_iMOVsd_RR(hi8s, hi8));
1157          addInstr(env, mk_iMOVsd_RR(lo8s, lo8));
1158          addInstr(env, AMD64Instr_Sh64(Ash_SHL, 8, hi8));
1159          addInstr(env, AMD64Instr_Alu64R(
1160                           Aalu_AND, AMD64RMI_Imm(0xFF), lo8));
1161          addInstr(env, AMD64Instr_Alu64R(
1162                           Aalu_OR, AMD64RMI_Reg(lo8), hi8));
1163          return hi8;
1164       }
1165
1166       if (e->Iex.Binop.op == Iop_MullS32
1167           || e->Iex.Binop.op == Iop_MullS16
1168           || e->Iex.Binop.op == Iop_MullS8
1169           || e->Iex.Binop.op == Iop_MullU32
1170           || e->Iex.Binop.op == Iop_MullU16
1171           || e->Iex.Binop.op == Iop_MullU8) {
1172          HReg a32   = newVRegI(env);
1173          HReg b32   = newVRegI(env);
1174          HReg a32s  = iselIntExpr_R(env, e->Iex.Binop.arg1);
1175          HReg b32s  = iselIntExpr_R(env, e->Iex.Binop.arg2);
1176          Int          shift  = 0;
1177          AMD64ShiftOp shr_op = Ash_SHR;
1178          switch (e->Iex.Binop.op) {
1179             case Iop_MullS32: shr_op = Ash_SAR; shift = 32; break;
1180             case Iop_MullS16: shr_op = Ash_SAR; shift = 48; break;
1181             case Iop_MullS8:  shr_op = Ash_SAR; shift = 56; break;
1182             case Iop_MullU32: shr_op = Ash_SHR; shift = 32; break;
1183             case Iop_MullU16: shr_op = Ash_SHR; shift = 48; break;
1184             case Iop_MullU8:  shr_op = Ash_SHR; shift = 56; break;
1185             default: vassert(0);
1186          }
1187
1188          addInstr(env, mk_iMOVsd_RR(a32s, a32));
1189          addInstr(env, mk_iMOVsd_RR(b32s, b32));
1190          addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, a32));
1191          addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, b32));
1192          addInstr(env, AMD64Instr_Sh64(shr_op,  shift, a32));
1193          addInstr(env, AMD64Instr_Sh64(shr_op,  shift, b32));
1194          addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(a32), b32));
1195          return b32;
1196       }
1197
1198       if (e->Iex.Binop.op == Iop_CmpF64) {
1199          HReg fL = iselDblExpr(env, e->Iex.Binop.arg1);
1200          HReg fR = iselDblExpr(env, e->Iex.Binop.arg2);
1201          HReg dst = newVRegI(env);
1202          addInstr(env, AMD64Instr_SseUComIS(8,fL,fR,dst));
1203          /* Mask out irrelevant parts of the result so as to conform
1204             to the CmpF64 definition. */
1205          addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(0x45), dst));
1206          return dst;
1207       }
1208
1209       if (e->Iex.Binop.op == Iop_F64toI32S
1210           || e->Iex.Binop.op == Iop_F64toI64S) {
1211          Int  szD = e->Iex.Binop.op==Iop_F64toI32S ? 4 : 8;
1212          HReg rf  = iselDblExpr(env, e->Iex.Binop.arg2);
1213          HReg dst = newVRegI(env);
1214          set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
1215          addInstr(env, AMD64Instr_SseSF2SI( 8, szD, rf, dst ));
1216          set_SSE_rounding_default(env);
1217          return dst;
1218       }
1219
1220       /* Deal with 64-bit SIMD binary ops.  For the most part these are doable
1221          by using the equivalent 128-bit operation and ignoring the upper half
1222          of the result. */
1223       AMD64SseOp op = Asse_INVALID;
1224       Bool arg1isEReg = False;
1225       Bool preShift32R = False;
1226       switch (e->Iex.Binop.op) {
1227          // The following 3 could be done with 128 bit insns too, but
1228          // first require the inputs to be reformatted.
1229          //case Iop_QNarrowBin32Sto16Sx4:
1230          //op = Asse_PACKSSD; arg1isEReg = True; break;
1231          //case Iop_QNarrowBin16Sto8Sx8:
1232          //op = Asse_PACKSSW; arg1isEReg = True; break;
1233          //case Iop_QNarrowBin16Sto8Ux8:
1234          //op = Asse_PACKUSW; arg1isEReg = True; break;
1235
1236          case Iop_InterleaveHI8x8:
1237             op = Asse_UNPCKLB; arg1isEReg = True; preShift32R = True;
1238             break;
1239          case Iop_InterleaveHI16x4:
1240             op = Asse_UNPCKLW; arg1isEReg = True; preShift32R = True;
1241             break;
1242          case Iop_InterleaveHI32x2:
1243             op = Asse_UNPCKLD; arg1isEReg = True; preShift32R = True;
1244             break;
1245          case Iop_InterleaveLO8x8:
1246             op = Asse_UNPCKLB; arg1isEReg = True;
1247             break;
1248          case Iop_InterleaveLO16x4:
1249             op = Asse_UNPCKLW; arg1isEReg = True;
1250             break;
1251          case Iop_InterleaveLO32x2:
1252             op = Asse_UNPCKLD; arg1isEReg = True;
1253             break;
1254
1255          case Iop_Add8x8:     op = Asse_ADD8;     break;
1256          case Iop_Add16x4:    op = Asse_ADD16;    break;
1257          case Iop_Add32x2:    op = Asse_ADD32;    break;
1258          case Iop_QAdd8Sx8:   op = Asse_QADD8S;   break;
1259          case Iop_QAdd16Sx4:  op = Asse_QADD16S;  break;
1260          case Iop_QAdd8Ux8:   op = Asse_QADD8U;   break;
1261          case Iop_QAdd16Ux4:  op = Asse_QADD16U;  break;
1262          case Iop_Avg8Ux8:    op = Asse_AVG8U;    break;
1263          case Iop_Avg16Ux4:   op = Asse_AVG16U;   break;
1264          case Iop_CmpEQ8x8:   op = Asse_CMPEQ8;   break;
1265          case Iop_CmpEQ16x4:  op = Asse_CMPEQ16;  break;
1266          case Iop_CmpEQ32x2:  op = Asse_CMPEQ32;  break;
1267          case Iop_CmpGT8Sx8:  op = Asse_CMPGT8S;  break;
1268          case Iop_CmpGT16Sx4: op = Asse_CMPGT16S; break;
1269          case Iop_CmpGT32Sx2: op = Asse_CMPGT32S; break;
1270          case Iop_Max16Sx4:   op = Asse_MAX16S;   break;
1271          case Iop_Max8Ux8:    op = Asse_MAX8U;    break;
1272          case Iop_Min16Sx4:   op = Asse_MIN16S;   break;
1273          case Iop_Min8Ux8:    op = Asse_MIN8U;    break;
1274          case Iop_MulHi16Ux4: op = Asse_MULHI16U; break;
1275          case Iop_MulHi16Sx4: op = Asse_MULHI16S; break;
1276          case Iop_Mul16x4:    op = Asse_MUL16;    break;
1277          case Iop_Sub8x8:     op = Asse_SUB8;     break;
1278          case Iop_Sub16x4:    op = Asse_SUB16;    break;
1279          case Iop_Sub32x2:    op = Asse_SUB32;    break;
1280          case Iop_QSub8Sx8:   op = Asse_QSUB8S;   break;
1281          case Iop_QSub16Sx4:  op = Asse_QSUB16S;  break;
1282          case Iop_QSub8Ux8:   op = Asse_QSUB8U;   break;
1283          case Iop_QSub16Ux4:  op = Asse_QSUB16U;  break;
1284          default: break;
1285       }
1286       if (op != Asse_INVALID) {
1287          /* This isn't pretty, but .. move each arg to the low half of an XMM
1288             register, do the operation on the whole register, and move the
1289             result back to an integer register. */
1290          const IRExpr* arg1 = e->Iex.Binop.arg1;
1291          const IRExpr* arg2 = e->Iex.Binop.arg2;
1292          vassert(typeOfIRExpr(env->type_env, arg1) == Ity_I64);
1293          vassert(typeOfIRExpr(env->type_env, arg2) == Ity_I64);
1294          HReg iarg1 = iselIntExpr_R(env, arg1);
1295          HReg iarg2 = iselIntExpr_R(env, arg2);
1296          HReg varg1 = newVRegV(env);
1297          HReg varg2 = newVRegV(env);
1298          HReg idst  = newVRegI(env);
1299          addInstr(env, AMD64Instr_SseMOVQ(iarg1, varg1, True/*toXMM*/));
1300          addInstr(env, AMD64Instr_SseMOVQ(iarg2, varg2, True/*toXMM*/));
1301          if (arg1isEReg) {
1302             if (preShift32R) {
1303                addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 32, varg1));
1304                addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 32, varg2));
1305             }
1306             addInstr(env, AMD64Instr_SseReRg(op, varg1, varg2));
1307             addInstr(env, AMD64Instr_SseMOVQ(idst, varg2, False/*!toXMM*/));
1308          } else {
1309             vassert(!preShift32R);
1310             addInstr(env, AMD64Instr_SseReRg(op, varg2, varg1));
1311             addInstr(env, AMD64Instr_SseMOVQ(idst, varg1, False/*!toXMM*/));
1312          }
1313          return idst;
1314       }
1315
1316       UInt laneBits = 0;
1317       op = Asse_INVALID;
1318       switch (e->Iex.Binop.op) {
1319          case Iop_ShlN16x4: laneBits = 16; op = Asse_SHL16; break;
1320          case Iop_ShlN32x2: laneBits = 32; op = Asse_SHL32; break;
1321          case Iop_SarN16x4: laneBits = 16; op = Asse_SAR16; break;
1322          case Iop_SarN32x2: laneBits = 32; op = Asse_SAR32; break;
1323          case Iop_ShrN16x4: laneBits = 16; op = Asse_SHR16; break;
1324          case Iop_ShrN32x2: laneBits = 32; op = Asse_SHR32; break;
1325          default: break;
1326       }
1327       if (op != Asse_INVALID) {
1328          const IRExpr* arg1 = e->Iex.Binop.arg1;
1329          const IRExpr* arg2 = e->Iex.Binop.arg2;
1330          vassert(typeOfIRExpr(env->type_env, arg1) == Ity_I64);
1331          vassert(typeOfIRExpr(env->type_env, arg2) == Ity_I8);
1332          HReg igreg = iselIntExpr_R(env, arg1);
1333          HReg vgreg = newVRegV(env);
1334          HReg idst  = newVRegI(env);
1335          addInstr(env, AMD64Instr_SseMOVQ(igreg, vgreg, True/*toXMM*/));
1336          /* If it's a shift by an in-range immediate, generate a single
1337             instruction. */
1338          if (arg2->tag == Iex_Const) {
1339             IRConst* c = arg2->Iex.Const.con;
1340             vassert(c->tag == Ico_U8);
1341             UInt shift = c->Ico.U8;
1342             if (shift < laneBits) {
1343                addInstr(env, AMD64Instr_SseShiftN(op, shift, vgreg));
1344                addInstr(env, AMD64Instr_SseMOVQ(idst, vgreg, False/*!toXMM*/));
1345                return idst;
1346             }
1347          }
1348          /* Otherwise we have to do it the longwinded way. */
1349          HReg ishift = iselIntExpr_R(env, arg2);
1350          HReg vshift = newVRegV(env);
1351          addInstr(env, AMD64Instr_SseMOVQ(ishift, vshift, True/*toXMM*/));
1352          addInstr(env, AMD64Instr_SseReRg(op, vshift, vgreg));
1353          addInstr(env, AMD64Instr_SseMOVQ(idst, vgreg, False/*!toXMM*/));
1354          return idst;
1355       }
1356
1357       if (e->Iex.Binop.op == Iop_Mul32x2) {
1358          const IRExpr* arg1 = e->Iex.Binop.arg1;
1359          const IRExpr* arg2 = e->Iex.Binop.arg2;
1360          vassert(typeOfIRExpr(env->type_env, arg1) == Ity_I64);
1361          vassert(typeOfIRExpr(env->type_env, arg2) == Ity_I64);
1362          HReg s1 = iselIntExpr_R(env, arg1);
1363          HReg s2 = iselIntExpr_R(env, arg2);
1364          HReg resLo = newVRegI(env);
1365          // resLo = (s1 *64 s2) & 0xFFFF'FFFF
1366          addInstr(env, mk_iMOVsd_RR(s1, resLo));
1367          addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(s2), resLo));
1368          addInstr(env, AMD64Instr_MovxLQ(False, resLo, resLo));
1369
1370          // resHi = ((s1 >>u 32) *64 (s2 >>u 32)) << 32;
1371          HReg resHi = newVRegI(env);
1372          addInstr(env, mk_iMOVsd_RR(s1, resHi));
1373          addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, resHi));
1374          HReg tmp = newVRegI(env);
1375          addInstr(env, mk_iMOVsd_RR(s2, tmp));
1376          addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, tmp));
1377          addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(tmp), resHi));
1378          addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, resHi));
1379
1380          // final result = resHi | resLo
1381          addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(resHi), resLo));
1382          return resLo;
1383       }
1384
1385       // A few remaining SIMD64 ops require helper functions, at least for
1386       // now.
1387       Bool second_is_UInt = False;
1388       HWord fn = 0;
1389       switch (e->Iex.Binop.op) {
1390          case Iop_CatOddLanes16x4:
1391             fn = (HWord)h_generic_calc_CatOddLanes16x4; break;
1392          case Iop_CatEvenLanes16x4:
1393             fn = (HWord)h_generic_calc_CatEvenLanes16x4; break;
1394          case Iop_PermOrZero8x8:
1395             fn = (HWord)h_generic_calc_PermOrZero8x8; break;
1396
1397          case Iop_QNarrowBin32Sto16Sx4:
1398             fn = (HWord)h_generic_calc_QNarrowBin32Sto16Sx4; break;
1399          case Iop_QNarrowBin16Sto8Sx8:
1400             fn = (HWord)h_generic_calc_QNarrowBin16Sto8Sx8; break;
1401          case Iop_QNarrowBin16Sto8Ux8:
1402             fn = (HWord)h_generic_calc_QNarrowBin16Sto8Ux8; break;
1403
1404          case Iop_NarrowBin16to8x8:
1405             fn = (HWord)h_generic_calc_NarrowBin16to8x8; break;
1406          case Iop_NarrowBin32to16x4:
1407             fn = (HWord)h_generic_calc_NarrowBin32to16x4; break;
1408
1409          case Iop_SarN8x8:
1410             fn = (HWord)h_generic_calc_SarN8x8;
1411             second_is_UInt = True;
1412             break;
1413
1414          default:
1415             fn = (HWord)0; break;
1416       }
1417       if (fn != (HWord)0) {
1418          /* Note: the following assumes all helpers are of signature
1419                ULong fn ( ULong, ULong ), and they are
1420             not marked as regparm functions.
1421          */
1422          HReg dst  = newVRegI(env);
1423          HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
1424          HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
1425          if (second_is_UInt)
1426             addInstr(env, AMD64Instr_MovxLQ(False, argR, argR));
1427          addInstr(env, mk_iMOVsd_RR(argL, hregAMD64_RDI()) );
1428          addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RSI()) );
1429          addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 2,
1430                                         mk_RetLoc_simple(RLPri_Int) ));
1431          addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1432          return dst;
1433       }
1434
1435       // Half-float vector conversion
1436       if (e->Iex.Binop.op == Iop_F32toF16x4
1437           && (env->hwcaps & VEX_HWCAPS_AMD64_F16C)) {
1438          HReg srcV = iselVecExpr(env, e->Iex.Binop.arg2);
1439          HReg dstV = newVRegV(env);
1440          HReg dstI = newVRegI(env);
1441          set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
1442          addInstr(env, AMD64Instr_Sse32Fx4(Asse_F32toF16, srcV, dstV));
1443          set_SSE_rounding_default(env);
1444          addInstr(env, AMD64Instr_SseMOVQ(dstI, dstV, /*toXMM=*/False));
1445          return dstI;
1446       }
1447
1448       break;
1449    }
1450
1451    /* --------- UNARY OP --------- */
1452    case Iex_Unop: {
1453
1454       /* 1Uto8(64to1(expr64)) */
1455       {
1456          DEFINE_PATTERN( p_1Uto8_64to1,
1457                          unop(Iop_1Uto8, unop(Iop_64to1, bind(0))) );
1458          if (matchIRExpr(&mi,p_1Uto8_64to1,e)) {
1459             const IRExpr* expr64 = mi.bindee[0];
1460             HReg    dst    = newVRegI(env);
1461             HReg    src    = iselIntExpr_R(env, expr64);
1462             addInstr(env, mk_iMOVsd_RR(src,dst) );
1463             addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
1464                                             AMD64RMI_Imm(1), dst));
1465             return dst;
1466          }
1467       }
1468
1469       /* 8Uto64(LDle(expr64)) */
1470       {
1471          DEFINE_PATTERN(p_LDle8_then_8Uto64,
1472                         unop(Iop_8Uto64,
1473                              IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
1474          if (matchIRExpr(&mi,p_LDle8_then_8Uto64,e)) {
1475             HReg dst = newVRegI(env);
1476             AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
1477             addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
1478             return dst;
1479          }
1480       }
1481
1482       /* 16Uto64(LDle(expr64)) */
1483       {
1484          DEFINE_PATTERN(p_LDle16_then_16Uto64,
1485                         unop(Iop_16Uto64,
1486                              IRExpr_Load(Iend_LE,Ity_I16,bind(0))) );
1487          if (matchIRExpr(&mi,p_LDle16_then_16Uto64,e)) {
1488             HReg dst = newVRegI(env);
1489             AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
1490             addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
1491             return dst;
1492          }
1493       }
1494
1495       /* 32Uto64( Add32/Sub32/And32/Or32/Xor32(expr32, expr32) )
1496          Use 32 bit arithmetic and let the default zero-extend rule
1497          do the 32Uto64 for free. */
1498       if (e->Iex.Unop.op == Iop_32Uto64 && e->Iex.Unop.arg->tag == Iex_Binop) {
1499          IROp    opi  = e->Iex.Unop.arg->Iex.Binop.op; /* inner op */
1500          IRExpr* argL = e->Iex.Unop.arg->Iex.Binop.arg1;
1501          IRExpr* argR = e->Iex.Unop.arg->Iex.Binop.arg2;
1502          AMD64AluOp aluOp = Aalu_INVALID;
1503          switch (opi) {
1504             case Iop_Add32: aluOp = Aalu_ADD; break;
1505             case Iop_Sub32: aluOp = Aalu_SUB; break;
1506             case Iop_And32: aluOp = Aalu_AND; break;
1507             case Iop_Or32:  aluOp = Aalu_OR;  break;
1508             case Iop_Xor32: aluOp = Aalu_XOR; break;
1509             default: break;
1510          }
1511          if (aluOp != Aalu_INVALID) {
1512             /* For commutative ops we assume any literal values are on
1513                the second operand. */
1514             HReg dst      = newVRegI(env);
1515             HReg reg      = iselIntExpr_R(env, argL);
1516             AMD64RMI* rmi = iselIntExpr_RMI(env, argR);
1517             addInstr(env, mk_iMOVsd_RR(reg,dst));
1518             addInstr(env, AMD64Instr_Alu32R(aluOp, rmi, dst));
1519             return dst;
1520          }
1521          /* just fall through to normal handling for Iop_32Uto64 */
1522       }
1523
1524       /* Fallback cases */
1525       switch (e->Iex.Unop.op) {
1526          case Iop_32Uto64:
1527          case Iop_32Sto64: {
1528             HReg dst = newVRegI(env);
1529             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1530             addInstr(env, AMD64Instr_MovxLQ(e->Iex.Unop.op == Iop_32Sto64,
1531                                             src, dst) );
1532             return dst;
1533          }
1534          case Iop_128HIto64: {
1535             HReg rHi, rLo;
1536             iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
1537             return rHi; /* and abandon rLo */
1538          }
1539          case Iop_128to64: {
1540             HReg rHi, rLo;
1541             iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
1542             return rLo; /* and abandon rHi */
1543          }
1544          case Iop_8Uto16:
1545          case Iop_8Uto32:
1546          case Iop_8Uto64:
1547          case Iop_16Uto64:
1548          case Iop_16Uto32: {
1549             HReg dst     = newVRegI(env);
1550             HReg src     = iselIntExpr_R(env, e->Iex.Unop.arg);
1551             Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Uto32
1552                                    || e->Iex.Unop.op==Iop_16Uto64 );
1553             UInt mask    = srcIs16 ? 0xFFFF : 0xFF;
1554             addInstr(env, mk_iMOVsd_RR(src,dst) );
1555             addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
1556                                             AMD64RMI_Imm(mask), dst));
1557             return dst;
1558          }
1559          case Iop_8Sto16:
1560          case Iop_8Sto64:
1561          case Iop_8Sto32:
1562          case Iop_16Sto32:
1563          case Iop_16Sto64: {
1564             HReg dst     = newVRegI(env);
1565             HReg src     = iselIntExpr_R(env, e->Iex.Unop.arg);
1566             Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Sto32
1567                                    || e->Iex.Unop.op==Iop_16Sto64 );
1568             UInt amt     = srcIs16 ? 48 : 56;
1569             addInstr(env, mk_iMOVsd_RR(src,dst) );
1570             addInstr(env, AMD64Instr_Sh64(Ash_SHL, amt, dst));
1571             addInstr(env, AMD64Instr_Sh64(Ash_SAR, amt, dst));
1572             return dst;
1573          }
1574          case Iop_Not8:
1575          case Iop_Not16:
1576          case Iop_Not32:
1577          case Iop_Not64: {
1578             HReg dst = newVRegI(env);
1579             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1580             addInstr(env, mk_iMOVsd_RR(src,dst) );
1581             addInstr(env, AMD64Instr_Unary64(Aun_NOT,dst));
1582             return dst;
1583          }
1584          case Iop_16HIto8:
1585          case Iop_32HIto16:
1586          case Iop_64HIto32: {
1587             HReg dst  = newVRegI(env);
1588             HReg src  = iselIntExpr_R(env, e->Iex.Unop.arg);
1589             Int shift = 0;
1590             switch (e->Iex.Unop.op) {
1591                case Iop_16HIto8:  shift = 8;  break;
1592                case Iop_32HIto16: shift = 16; break;
1593                case Iop_64HIto32: shift = 32; break;
1594                default: vassert(0);
1595             }
1596             addInstr(env, mk_iMOVsd_RR(src,dst) );
1597             addInstr(env, AMD64Instr_Sh64(Ash_SHR, shift, dst));
1598             return dst;
1599          }
1600          case Iop_1Uto64:
1601          case Iop_1Uto32:
1602          case Iop_1Uto8: {
1603             HReg dst           = newVRegI(env);
1604             AMD64CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
1605             addInstr(env, AMD64Instr_Set64(cond,dst));
1606             return dst;
1607          }
1608          case Iop_1Sto8:
1609          case Iop_1Sto16:
1610          case Iop_1Sto32:
1611          case Iop_1Sto64: {
1612             /* could do better than this, but for now ... */
1613             HReg dst           = newVRegI(env);
1614             AMD64CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
1615             addInstr(env, AMD64Instr_Set64(cond,dst));
1616             addInstr(env, AMD64Instr_Sh64(Ash_SHL, 63, dst));
1617             addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1618             return dst;
1619          }
1620          case Iop_Ctz64: {
1621             /* Count trailing zeroes, implemented by amd64 'bsfq' */
1622             HReg dst = newVRegI(env);
1623             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1624             addInstr(env, AMD64Instr_Bsfr64(True,src,dst));
1625             return dst;
1626          }
1627          case Iop_Clz64: {
1628             /* Count leading zeroes.  Do 'bsrq' to establish the index
1629                of the highest set bit, and subtract that value from
1630                63. */
1631             HReg tmp = newVRegI(env);
1632             HReg dst = newVRegI(env);
1633             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1634             addInstr(env, AMD64Instr_Bsfr64(False,src,tmp));
1635             addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,
1636                                             AMD64RMI_Imm(63), dst));
1637             addInstr(env, AMD64Instr_Alu64R(Aalu_SUB,
1638                                             AMD64RMI_Reg(tmp), dst));
1639             return dst;
1640          }
1641
1642          case Iop_CmpwNEZ64: {
1643             HReg dst = newVRegI(env);
1644             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1645             addInstr(env, mk_iMOVsd_RR(src,dst));
1646             addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
1647             addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
1648                                             AMD64RMI_Reg(src), dst));
1649             addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1650             return dst;
1651          }
1652
1653          case Iop_CmpwNEZ32: {
1654             HReg src = newVRegI(env);
1655             HReg dst = newVRegI(env);
1656             HReg pre = iselIntExpr_R(env, e->Iex.Unop.arg);
1657             addInstr(env, mk_iMOVsd_RR(pre,src));
1658             addInstr(env, AMD64Instr_MovxLQ(False, src, src));
1659             addInstr(env, mk_iMOVsd_RR(src,dst));
1660             addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
1661             addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
1662                                             AMD64RMI_Reg(src), dst));
1663             addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
1664             return dst;
1665          }
1666
1667          case Iop_Left8:
1668          case Iop_Left16:
1669          case Iop_Left32:
1670          case Iop_Left64: {
1671             HReg dst = newVRegI(env);
1672             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1673             addInstr(env, mk_iMOVsd_RR(src, dst));
1674             addInstr(env, AMD64Instr_Unary64(Aun_NEG, dst));
1675             addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(src), dst));
1676             return dst;
1677          }
1678
1679          case Iop_V128to32: {
1680             HReg        dst     = newVRegI(env);
1681             HReg        vec     = iselVecExpr(env, e->Iex.Unop.arg);
1682             AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
1683             addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, rsp_m16));
1684             addInstr(env, AMD64Instr_LoadEX(4, False/*z-widen*/, rsp_m16, dst));
1685             return dst;
1686          }
1687
1688          /* V128{HI}to64 */
1689          case Iop_V128to64: {
1690             HReg dst = newVRegI(env);
1691             HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1692             addInstr(env, AMD64Instr_SseMOVQ(dst, vec, False/*!toXMM*/));
1693             return dst;
1694          }
1695          case Iop_V128HIto64: {
1696             HReg dst  = newVRegI(env);
1697             HReg vec  = iselVecExpr(env, e->Iex.Unop.arg);
1698             HReg vec2 = newVRegV(env);
1699             addInstr(env, mk_vMOVsd_RR(vec, vec2));
1700             addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, vec2));
1701             addInstr(env, AMD64Instr_SseMOVQ(dst, vec2, False/*!toXMM*/));
1702             return dst;
1703          }
1704
1705          /* V256to64_{3,2,1,0} */
1706          case Iop_V256to64_0: case Iop_V256to64_1:
1707          case Iop_V256to64_2: case Iop_V256to64_3: {
1708             HReg vHi, vLo, vec;
1709             iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg);
1710             /* Do the first part of the selection by deciding which of
1711                the 128 bit registers to look at, and second part using
1712                the same scheme as for V128{HI}to64 above. */
1713             Bool low64of128 = True;
1714             switch (e->Iex.Unop.op) {
1715                case Iop_V256to64_0: vec = vLo; low64of128 = True;  break;
1716                case Iop_V256to64_1: vec = vLo; low64of128 = False; break;
1717                case Iop_V256to64_2: vec = vHi; low64of128 = True;  break;
1718                case Iop_V256to64_3: vec = vHi; low64of128 = False; break;
1719                default: vassert(0);
1720             }
1721             HReg dst = newVRegI(env);
1722             if (low64of128) {
1723                addInstr(env, AMD64Instr_SseMOVQ(dst, vec, False/*!toXMM*/));
1724             } else {
1725                HReg vec2 = newVRegV(env);
1726                addInstr(env, mk_vMOVsd_RR(vec, vec2));
1727                addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, vec2));
1728                addInstr(env, AMD64Instr_SseMOVQ(dst, vec2, False/*!toXMM*/));
1729             }
1730             return dst;
1731          }
1732
1733          /* ReinterpF64asI64(e) */
1734          /* Given an IEEE754 double, produce an I64 with the same bit
1735             pattern. */
1736          case Iop_ReinterpF64asI64: {
1737             AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1738             HReg        dst    = newVRegI(env);
1739             HReg        src    = iselDblExpr(env, e->Iex.Unop.arg);
1740             /* paranoia */
1741             set_SSE_rounding_default(env);
1742             addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, src, m8_rsp));
1743             addInstr(env, AMD64Instr_Alu64R(
1744                              Aalu_MOV, AMD64RMI_Mem(m8_rsp), dst));
1745             return dst;
1746          }
1747
1748          /* ReinterpF32asI32(e) */
1749          /* Given an IEEE754 single, produce an I64 with the same bit
1750             pattern in the lower half. */
1751          case Iop_ReinterpF32asI32: {
1752             AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1753             HReg        dst    = newVRegI(env);
1754             HReg        src    = iselFltExpr(env, e->Iex.Unop.arg);
1755             /* paranoia */
1756             set_SSE_rounding_default(env);
1757             addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, src, m8_rsp));
1758             addInstr(env, AMD64Instr_LoadEX(4, False/*unsigned*/, m8_rsp, dst ));
1759             return dst;
1760          }
1761
1762          case Iop_16to8:
1763          case Iop_32to8:
1764          case Iop_64to8:
1765          case Iop_32to16:
1766          case Iop_64to16:
1767          case Iop_64to32:
1768             /* These are no-ops. */
1769             return iselIntExpr_R(env, e->Iex.Unop.arg);
1770
1771          case Iop_GetMSBs8x8: {
1772             /* Note: the following assumes the helper is of
1773                signature
1774                   UInt fn ( ULong ), and is not a regparm fn.
1775             */
1776             HReg dst = newVRegI(env);
1777             HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
1778             HWord fn = (HWord)h_generic_calc_GetMSBs8x8;
1779             addInstr(env, mk_iMOVsd_RR(arg, hregAMD64_RDI()) );
1780             addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
1781                                            1, mk_RetLoc_simple(RLPri_Int) ));
1782             /* MovxLQ is not exactly the right thing here.  We just
1783                need to get the bottom 8 bits of RAX into dst, and zero
1784                out everything else.  Assuming that the helper returns
1785                a UInt with the top 24 bits zeroed out, it'll do,
1786                though. */
1787             addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
1788             return dst;
1789          }
1790
1791          case Iop_GetMSBs8x16: {
1792             /* Note: the following assumes the helper is of signature
1793                   UInt fn ( ULong w64hi, ULong w64Lo ),
1794                and is not a regparm fn. */
1795             HReg dst = newVRegI(env);
1796             HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
1797             HReg rsp = hregAMD64_RSP();
1798             HWord fn = (HWord)h_generic_calc_GetMSBs8x16;
1799             AMD64AMode* m8_rsp  = AMD64AMode_IR( -8, rsp);
1800             AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
1801             addInstr(env, AMD64Instr_SseLdSt(False/*store*/,
1802                                              16, vec, m16_rsp));
1803             /* hi 64 bits into RDI -- the first arg */
1804             addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
1805                                              AMD64RMI_Mem(m8_rsp),
1806                                              hregAMD64_RDI() )); /* 1st arg */
1807             /* lo 64 bits into RSI -- the 2nd arg */
1808             addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
1809                                              AMD64RMI_Mem(m16_rsp),
1810                                              hregAMD64_RSI() )); /* 2nd arg */
1811             addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
1812                                            2, mk_RetLoc_simple(RLPri_Int) ));
1813             /* MovxLQ is not exactly the right thing here.  We just
1814                need to get the bottom 16 bits of RAX into dst, and zero
1815                out everything else.  Assuming that the helper returns
1816                a UInt with the top 16 bits zeroed out, it'll do,
1817                though. */
1818             addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
1819             return dst;
1820          }
1821
1822          default:
1823             break;
1824       }
1825
1826       /* Deal with unary 64-bit SIMD ops. */
1827       HWord fn = 0;
1828       switch (e->Iex.Unop.op) {
1829          case Iop_CmpNEZ32x2:
1830             fn = (HWord)h_generic_calc_CmpNEZ32x2; break;
1831          case Iop_CmpNEZ16x4:
1832             fn = (HWord)h_generic_calc_CmpNEZ16x4; break;
1833          case Iop_CmpNEZ8x8:
1834             fn = (HWord)h_generic_calc_CmpNEZ8x8; break;
1835          default:
1836             fn = (HWord)0; break;
1837       }
1838       if (fn != (HWord)0) {
1839          /* Note: the following assumes all helpers are of
1840             signature
1841                ULong fn ( ULong ), and they are
1842             not marked as regparm functions.
1843          */
1844          HReg dst = newVRegI(env);
1845          HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
1846          addInstr(env, mk_iMOVsd_RR(arg, hregAMD64_RDI()) );
1847          addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 1,
1848                                         mk_RetLoc_simple(RLPri_Int) ));
1849          addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1850          return dst;
1851       }
1852
1853       break;
1854    }
1855
1856    /* --------- GET --------- */
1857    case Iex_Get: {
1858       if (ty == Ity_I64) {
1859          HReg dst = newVRegI(env);
1860          addInstr(env, AMD64Instr_Alu64R(
1861                           Aalu_MOV,
1862                           AMD64RMI_Mem(
1863                              AMD64AMode_IR(e->Iex.Get.offset,
1864                                            hregAMD64_RBP())),
1865                           dst));
1866          return dst;
1867       }
1868       if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) {
1869          HReg dst = newVRegI(env);
1870          addInstr(env, AMD64Instr_LoadEX(
1871                           toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)),
1872                           False,
1873                           AMD64AMode_IR(e->Iex.Get.offset,hregAMD64_RBP()),
1874                           dst));
1875          return dst;
1876       }
1877       break;
1878    }
1879
1880    case Iex_GetI: {
1881       AMD64AMode* am
1882          = genGuestArrayOffset(
1883               env, e->Iex.GetI.descr,
1884                    e->Iex.GetI.ix, e->Iex.GetI.bias );
1885       HReg dst = newVRegI(env);
1886       if (ty == Ity_I8) {
1887          addInstr(env, AMD64Instr_LoadEX( 1, False, am, dst ));
1888          return dst;
1889       }
1890       if (ty == Ity_I64) {
1891          addInstr(env, AMD64Instr_Alu64R( Aalu_MOV, AMD64RMI_Mem(am), dst ));
1892          return dst;
1893       }
1894       break;
1895    }
1896
1897    /* --------- CCALL --------- */
1898    case Iex_CCall: {
1899       HReg    dst = newVRegI(env);
1900       vassert(ty == e->Iex.CCall.retty);
1901
1902       /* be very restrictive for now.  Only 64-bit ints allowed for
1903          args, and 64 or 32 bits for return type. */
1904       if (e->Iex.CCall.retty != Ity_I64 && e->Iex.CCall.retty != Ity_I32)
1905          goto irreducible;
1906
1907       /* Marshal args, do the call. */
1908       UInt   addToSp = 0;
1909       RetLoc rloc    = mk_RetLoc_INVALID();
1910       doHelperCall( &addToSp, &rloc, env, NULL/*guard*/,
1911                     e->Iex.CCall.cee, e->Iex.CCall.retty, e->Iex.CCall.args );
1912       vassert(is_sane_RetLoc(rloc));
1913       vassert(rloc.pri == RLPri_Int);
1914       vassert(addToSp == 0);
1915
1916       /* Move to dst, and zero out the top 32 bits if the result type is
1917          Ity_I32.  Probably overkill, but still .. */
1918       if (e->Iex.CCall.retty == Ity_I64)
1919          addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
1920       else
1921          addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
1922
1923       return dst;
1924    }
1925
1926    /* --------- LITERAL --------- */
1927    /* 64/32/16/8-bit literals */
1928    case Iex_Const:
1929       if (ty == Ity_I64) {
1930          HReg r = newVRegI(env);
1931          addInstr(env, AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, r));
1932          return r;
1933       } else {
1934          AMD64RMI* rmi = iselIntExpr_RMI ( env, e );
1935          HReg      r   = newVRegI(env);
1936          addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, rmi, r));
1937          return r;
1938       }
1939
1940    /* --------- MULTIPLEX --------- */
1941    case Iex_ITE: { // VFD
1942       if ((ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8)
1943           && typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1) {
1944          HReg     r1  = iselIntExpr_R(env, e->Iex.ITE.iftrue);
1945          HReg     r0  = iselIntExpr_R(env, e->Iex.ITE.iffalse);
1946          HReg     dst = newVRegI(env);
1947          addInstr(env, mk_iMOVsd_RR(r1,dst));
1948          AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
1949          addInstr(env, AMD64Instr_CMov64(cc ^ 1, r0, dst));
1950          return dst;
1951       }
1952       break;
1953    }
1954
1955    /* --------- TERNARY OP --------- */
1956    case Iex_Triop: {
1957       IRTriop *triop = e->Iex.Triop.details;
1958       /* C3210 flags following FPU partial remainder (fprem), both
1959          IEEE compliant (PREM1) and non-IEEE compliant (PREM). */
1960       if (triop->op == Iop_PRemC3210F64
1961           || triop->op == Iop_PRem1C3210F64) {
1962          AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
1963          HReg        arg1   = iselDblExpr(env, triop->arg2);
1964          HReg        arg2   = iselDblExpr(env, triop->arg3);
1965          HReg        dst    = newVRegI(env);
1966          addInstr(env, AMD64Instr_A87Free(2));
1967
1968          /* one arg -> top of x87 stack */
1969          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg2, m8_rsp));
1970          addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
1971
1972          /* other arg -> top of x87 stack */
1973          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg1, m8_rsp));
1974          addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
1975
1976          switch (triop->op) {
1977             case Iop_PRemC3210F64:
1978                addInstr(env, AMD64Instr_A87FpOp(Afp_PREM));
1979                break;
1980             case Iop_PRem1C3210F64:
1981                addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1));
1982                break;
1983             default:
1984                vassert(0);
1985          }
1986          /* Ignore the result, and instead make off with the FPU's
1987             C3210 flags (in the status word). */
1988          addInstr(env, AMD64Instr_A87StSW(m8_rsp));
1989          addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Mem(m8_rsp),dst));
1990          addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0x4700),dst));
1991          return dst;
1992       }
1993       break;
1994    }
1995
1996    default:
1997    break;
1998    } /* switch (e->tag) */
1999
2000    /* We get here if no pattern matched. */
2001   irreducible:
2002    ppIRExpr(e);
2003    vpanic("iselIntExpr_R(amd64): cannot reduce tree");
2004 }
2005
2006
2007 /*---------------------------------------------------------*/
2008 /*--- ISEL: Integer expression auxiliaries              ---*/
2009 /*---------------------------------------------------------*/
2010
2011 /* --------------------- AMODEs --------------------- */
2012
2013 /* Return an AMode which computes the value of the specified
2014    expression, possibly also adding insns to the code list as a
2015    result.  The expression may only be a 32-bit one.
2016 */
2017
2018 static AMD64AMode* iselIntExpr_AMode ( ISelEnv* env, const IRExpr* e )
2019 {
2020    AMD64AMode* am = iselIntExpr_AMode_wrk(env, e);
2021    vassert(sane_AMode(am));
2022    return am;
2023 }
2024
2025 /* DO NOT CALL THIS DIRECTLY ! */
2026 static AMD64AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, const IRExpr* e )
2027 {
2028    MatchInfo mi;
2029    DECLARE_PATTERN(p_complex);
2030    IRType ty = typeOfIRExpr(env->type_env,e);
2031    vassert(ty == Ity_I64);
2032
2033    /* Add64( Add64(expr1, Shl64(expr2, imm8)), simm32 ) */
2034    /*              bind0        bind1  bind2   bind3   */
2035    DEFINE_PATTERN(p_complex,
2036       binop( Iop_Add64,
2037              binop( Iop_Add64,
2038                     bind(0),
2039                     binop(Iop_Shl64, bind(1), bind(2))
2040                   ),
2041              bind(3)
2042            )
2043    );
2044    if (matchIRExpr(&mi, p_complex, e)) {
2045       const IRExpr* expr1  = mi.bindee[0];
2046       const IRExpr* expr2  = mi.bindee[1];
2047       const IRExpr* imm8   = mi.bindee[2];
2048       const IRExpr* simm32 = mi.bindee[3];
2049       if (imm8->tag == Iex_Const
2050           && imm8->Iex.Const.con->tag == Ico_U8
2051           && imm8->Iex.Const.con->Ico.U8 < 4
2052           /* imm8 is OK, now check simm32 */
2053           && simm32->tag == Iex_Const
2054           && simm32->Iex.Const.con->tag == Ico_U64
2055           && fitsIn32Bits(simm32->Iex.Const.con->Ico.U64)) {
2056          UInt shift = imm8->Iex.Const.con->Ico.U8;
2057          UInt offset = toUInt(simm32->Iex.Const.con->Ico.U64);
2058          HReg r1 = iselIntExpr_R(env, expr1);
2059          HReg r2 = iselIntExpr_R(env, expr2);
2060          vassert(shift == 0 || shift == 1 || shift == 2 || shift == 3);
2061          return AMD64AMode_IRRS(offset, r1, r2, shift);
2062       }
2063    }
2064
2065    /* Add64(expr1, Shl64(expr2, imm)) */
2066    if (e->tag == Iex_Binop
2067        && e->Iex.Binop.op == Iop_Add64
2068        && e->Iex.Binop.arg2->tag == Iex_Binop
2069        && e->Iex.Binop.arg2->Iex.Binop.op == Iop_Shl64
2070        && e->Iex.Binop.arg2->Iex.Binop.arg2->tag == Iex_Const
2071        && e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8) {
2072       UInt shift = e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
2073       if (shift == 1 || shift == 2 || shift == 3) {
2074          HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2075          HReg r2 = iselIntExpr_R(env, e->Iex.Binop.arg2->Iex.Binop.arg1 );
2076          return AMD64AMode_IRRS(0, r1, r2, shift);
2077       }
2078    }
2079
2080    /* Add64(expr,i) */
2081    if (e->tag == Iex_Binop
2082        && e->Iex.Binop.op == Iop_Add64
2083        && e->Iex.Binop.arg2->tag == Iex_Const
2084        && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U64
2085        && fitsIn32Bits(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)) {
2086       HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
2087       return AMD64AMode_IR(
2088                 toUInt(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64),
2089                 r1
2090              );
2091    }
2092
2093    /* Doesn't match anything in particular.  Generate it into
2094       a register and use that. */
2095    {
2096       HReg r1 = iselIntExpr_R(env, e);
2097       return AMD64AMode_IR(0, r1);
2098    }
2099 }
2100
2101
2102 /* --------------------- RMIs --------------------- */
2103
2104 /* Similarly, calculate an expression into an X86RMI operand.  As with
2105    iselIntExpr_R, the expression can have type 32, 16 or 8 bits.  */
2106
2107 static AMD64RMI* iselIntExpr_RMI ( ISelEnv* env, const IRExpr* e )
2108 {
2109    AMD64RMI* rmi = iselIntExpr_RMI_wrk(env, e);
2110    /* sanity checks ... */
2111    switch (rmi->tag) {
2112       case Armi_Imm:
2113          return rmi;
2114       case Armi_Reg:
2115          vassert(hregClass(rmi->Armi.Reg.reg) == HRcInt64);
2116          vassert(hregIsVirtual(rmi->Armi.Reg.reg));
2117          return rmi;
2118       case Armi_Mem:
2119          vassert(sane_AMode(rmi->Armi.Mem.am));
2120          return rmi;
2121       default:
2122          vpanic("iselIntExpr_RMI: unknown amd64 RMI tag");
2123    }
2124 }
2125
2126 /* DO NOT CALL THIS DIRECTLY ! */
2127 static AMD64RMI* iselIntExpr_RMI_wrk ( ISelEnv* env, const IRExpr* e )
2128 {
2129    IRType ty = typeOfIRExpr(env->type_env,e);
2130    vassert(ty == Ity_I64 || ty == Ity_I32
2131            || ty == Ity_I16 || ty == Ity_I8);
2132
2133    /* special case: immediate 64/32/16/8 */
2134    if (e->tag == Iex_Const) {
2135       switch (e->Iex.Const.con->tag) {
2136         case Ico_U64:
2137            if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
2138               return AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
2139            }
2140            break;
2141          case Ico_U32:
2142             return AMD64RMI_Imm(e->Iex.Const.con->Ico.U32); break;
2143          case Ico_U16:
2144             return AMD64RMI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16); break;
2145          case Ico_U8:
2146             return AMD64RMI_Imm(0xFF & e->Iex.Const.con->Ico.U8); break;
2147          default:
2148             vpanic("iselIntExpr_RMI.Iex_Const(amd64)");
2149       }
2150    }
2151
2152    /* special case: 64-bit GET */
2153    if (e->tag == Iex_Get && ty == Ity_I64) {
2154       return AMD64RMI_Mem(AMD64AMode_IR(e->Iex.Get.offset,
2155                                         hregAMD64_RBP()));
2156    }
2157
2158    /* special case: 64-bit load from memory */
2159    if (e->tag == Iex_Load && ty == Ity_I64
2160        && e->Iex.Load.end == Iend_LE) {
2161       AMD64AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2162       return AMD64RMI_Mem(am);
2163    }
2164
2165    /* default case: calculate into a register and return that */
2166    {
2167       HReg r = iselIntExpr_R ( env, e );
2168       return AMD64RMI_Reg(r);
2169    }
2170 }
2171
2172
2173 /* --------------------- RIs --------------------- */
2174
2175 /* Calculate an expression into an AMD64RI operand.  As with
2176    iselIntExpr_R, the expression can have type 64, 32, 16 or 8
2177    bits. */
2178
2179 static AMD64RI* iselIntExpr_RI ( ISelEnv* env, const IRExpr* e )
2180 {
2181    AMD64RI* ri = iselIntExpr_RI_wrk(env, e);
2182    /* sanity checks ... */
2183    switch (ri->tag) {
2184       case Ari_Imm:
2185          return ri;
2186       case Ari_Reg:
2187          vassert(hregClass(ri->Ari.Reg.reg) == HRcInt64);
2188          vassert(hregIsVirtual(ri->Ari.Reg.reg));
2189          return ri;
2190       default:
2191          vpanic("iselIntExpr_RI: unknown amd64 RI tag");
2192    }
2193 }
2194
2195 /* DO NOT CALL THIS DIRECTLY ! */
2196 static AMD64RI* iselIntExpr_RI_wrk ( ISelEnv* env, const IRExpr* e )
2197 {
2198    IRType ty = typeOfIRExpr(env->type_env,e);
2199    vassert(ty == Ity_I64 || ty == Ity_I32
2200            || ty == Ity_I16 || ty == Ity_I8);
2201
2202    /* special case: immediate */
2203    if (e->tag == Iex_Const) {
2204       switch (e->Iex.Const.con->tag) {
2205         case Ico_U64:
2206            if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
2207               return AMD64RI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
2208            }
2209            break;
2210          case Ico_U32:
2211             return AMD64RI_Imm(e->Iex.Const.con->Ico.U32);
2212          case Ico_U16:
2213             return AMD64RI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16);
2214          case Ico_U8:
2215             return AMD64RI_Imm(0xFF & e->Iex.Const.con->Ico.U8);
2216          default:
2217             vpanic("iselIntExpr_RMI.Iex_Const(amd64)");
2218       }
2219    }
2220
2221    /* default case: calculate into a register and return that */
2222    {
2223       HReg r = iselIntExpr_R ( env, e );
2224       return AMD64RI_Reg(r);
2225    }
2226 }
2227
2228
2229 /* --------------------- RMs --------------------- */
2230
2231 /* Similarly, calculate an expression into an AMD64RM operand.  As
2232    with iselIntExpr_R, the expression can have type 64, 32, 16 or 8
2233    bits.  */
2234
2235 static AMD64RM* iselIntExpr_RM ( ISelEnv* env, const IRExpr* e )
2236 {
2237    AMD64RM* rm = iselIntExpr_RM_wrk(env, e);
2238    /* sanity checks ... */
2239    switch (rm->tag) {
2240       case Arm_Reg:
2241          vassert(hregClass(rm->Arm.Reg.reg) == HRcInt64);
2242          vassert(hregIsVirtual(rm->Arm.Reg.reg));
2243          return rm;
2244       case Arm_Mem:
2245          vassert(sane_AMode(rm->Arm.Mem.am));
2246          return rm;
2247       default:
2248          vpanic("iselIntExpr_RM: unknown amd64 RM tag");
2249    }
2250 }
2251
2252 /* DO NOT CALL THIS DIRECTLY ! */
2253 static AMD64RM* iselIntExpr_RM_wrk ( ISelEnv* env, const IRExpr* e )
2254 {
2255    IRType ty = typeOfIRExpr(env->type_env,e);
2256    vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
2257
2258    /* special case: 64-bit GET */
2259    if (e->tag == Iex_Get && ty == Ity_I64) {
2260       return AMD64RM_Mem(AMD64AMode_IR(e->Iex.Get.offset,
2261                                        hregAMD64_RBP()));
2262    }
2263
2264    /* special case: load from memory */
2265
2266    /* default case: calculate into a register and return that */
2267    {
2268       HReg r = iselIntExpr_R ( env, e );
2269       return AMD64RM_Reg(r);
2270    }
2271 }
2272
2273
2274 /* --------------------- CONDCODE --------------------- */
2275
2276 /* Generate code to evaluated a bit-typed expression, returning the
2277    condition code which would correspond when the expression would
2278    notionally have returned 1. */
2279
2280 static AMD64CondCode iselCondCode ( ISelEnv* env, const IRExpr* e )
2281 {
2282    /* Uh, there's nothing we can sanity check here, unfortunately. */
2283    return iselCondCode_wrk(env,e);
2284 }
2285
2286 /* DO NOT CALL THIS DIRECTLY ! */
2287 static AMD64CondCode iselCondCode_wrk ( ISelEnv* env, const IRExpr* e )
2288 {
2289    vassert(e);
2290    vassert(typeOfIRExpr(env->type_env,e) == Ity_I1);
2291
2292    /* var */
2293    if (e->tag == Iex_RdTmp) {
2294       HReg r64 = lookupIRTemp(env, e->Iex.RdTmp.tmp);
2295       HReg dst = newVRegI(env);
2296       addInstr(env, mk_iMOVsd_RR(r64,dst));
2297       addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(1),dst));
2298       return Acc_NZ;
2299    }
2300
2301    /* Constant 1:Bit */
2302    if (e->tag == Iex_Const) {
2303       HReg r;
2304       vassert(e->Iex.Const.con->tag == Ico_U1);
2305       vassert(e->Iex.Const.con->Ico.U1 == True
2306               || e->Iex.Const.con->Ico.U1 == False);
2307       r = newVRegI(env);
2308       addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Imm(0),r));
2309       addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,AMD64RMI_Reg(r),r));
2310       return e->Iex.Const.con->Ico.U1 ? Acc_Z : Acc_NZ;
2311    }
2312
2313    /* Not1(...) */
2314    if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_Not1) {
2315       /* Generate code for the arg, and negate the test condition */
2316       return 1 ^ iselCondCode(env, e->Iex.Unop.arg);
2317    }
2318
2319    /* --- patterns rooted at: 64to1 --- */
2320
2321    /* 64to1 */
2322    if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_64to1) {
2323       HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
2324       addInstr(env, AMD64Instr_Test64(1,reg));
2325       return Acc_NZ;
2326    }
2327
2328    /* --- patterns rooted at: 32to1 --- */
2329
2330    /* 32to1 */
2331    if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_32to1) {
2332       HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
2333       addInstr(env, AMD64Instr_Test64(1,reg));
2334       return Acc_NZ;
2335    }
2336
2337    /* --- patterns rooted at: CmpNEZ8 --- */
2338
2339    /* CmpNEZ8(x) */
2340    if (e->tag == Iex_Unop
2341        && e->Iex.Unop.op == Iop_CmpNEZ8) {
2342       HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
2343       addInstr(env, AMD64Instr_Test64(0xFF,r));
2344       return Acc_NZ;
2345    }
2346
2347    /* --- patterns rooted at: CmpNEZ16 --- */
2348
2349    /* CmpNEZ16(x) */
2350    if (e->tag == Iex_Unop
2351        && e->Iex.Unop.op == Iop_CmpNEZ16) {
2352       HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
2353       addInstr(env, AMD64Instr_Test64(0xFFFF,r));
2354       return Acc_NZ;
2355    }
2356
2357    /* --- patterns rooted at: CmpNEZ32 --- */
2358
2359    if (e->tag == Iex_Unop
2360        && e->Iex.Unop.op == Iop_CmpNEZ32) {
2361       IRExpr* arg = e->Iex.Unop.arg;
2362       if (arg->tag == Iex_Binop
2363           && (arg->Iex.Binop.op == Iop_Or32
2364               || arg->Iex.Binop.op == Iop_And32)) {
2365          /* CmpNEZ32(Or32(x,y)) */
2366          /* CmpNEZ32(And32(x,y)) */
2367          HReg      r0   = iselIntExpr_R(env, arg->Iex.Binop.arg1);
2368          AMD64RMI* rmi1 = iselIntExpr_RMI(env, arg->Iex.Binop.arg2);
2369          HReg      tmp  = newVRegI(env);
2370          addInstr(env, mk_iMOVsd_RR(r0, tmp));
2371          addInstr(env, AMD64Instr_Alu32R(
2372                           arg->Iex.Binop.op == Iop_Or32 ? Aalu_OR : Aalu_AND,
2373                           rmi1, tmp));
2374          return Acc_NZ;
2375       }
2376       /* CmpNEZ32(x) */
2377       HReg      r1   = iselIntExpr_R(env, arg);
2378       AMD64RMI* rmi2 = AMD64RMI_Imm(0);
2379       addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1));
2380       return Acc_NZ;
2381    }
2382
2383    /* --- patterns rooted at: CmpNEZ64 --- */
2384
2385    if (e->tag == Iex_Unop
2386        && e->Iex.Unop.op == Iop_CmpNEZ64) {
2387       IRExpr* arg = e->Iex.Unop.arg;
2388       if (arg->tag == Iex_Binop
2389           && (arg->Iex.Binop.op == Iop_Or64
2390               || arg->Iex.Binop.op == Iop_And64)) {
2391          /* CmpNEZ64(Or64(x,y)) */
2392          /* CmpNEZ64(And64(x,y)) */
2393          HReg      r0   = iselIntExpr_R(env, arg->Iex.Binop.arg1);
2394          AMD64RMI* rmi1 = iselIntExpr_RMI(env, arg->Iex.Binop.arg2);
2395          HReg      tmp  = newVRegI(env);
2396          addInstr(env, mk_iMOVsd_RR(r0, tmp));
2397          addInstr(env, AMD64Instr_Alu64R(
2398                           arg->Iex.Binop.op == Iop_Or64 ? Aalu_OR : Aalu_AND,
2399                           rmi1, tmp));
2400          return Acc_NZ;
2401       }
2402       /* CmpNEZ64(x) */
2403       HReg      r1   = iselIntExpr_R(env, arg);
2404       AMD64RMI* rmi2 = AMD64RMI_Imm(0);
2405       addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
2406       return Acc_NZ;
2407    }
2408
2409    /* --- patterns rooted at: Cmp{EQ,NE}{8,16,32} --- */
2410
2411    /* CmpEQ8 / CmpNE8 */
2412    if (e->tag == Iex_Binop
2413        && (e->Iex.Binop.op == Iop_CmpEQ8
2414            || e->Iex.Binop.op == Iop_CmpNE8
2415            || e->Iex.Binop.op == Iop_CasCmpEQ8
2416            || e->Iex.Binop.op == Iop_CasCmpNE8)) {
2417       if (isZeroU8(e->Iex.Binop.arg2)) {
2418          HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
2419          addInstr(env, AMD64Instr_Test64(0xFF,r1));
2420          switch (e->Iex.Binop.op) {
2421             case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Acc_Z;
2422             case Iop_CmpNE8: case Iop_CasCmpNE8: return Acc_NZ;
2423             default: vpanic("iselCondCode(amd64): CmpXX8(expr,0:I8)");
2424          }
2425       } else {
2426          HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
2427          AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2428          HReg      r    = newVRegI(env);
2429          addInstr(env, mk_iMOVsd_RR(r1,r));
2430          addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
2431          addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFF),r));
2432          switch (e->Iex.Binop.op) {
2433             case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Acc_Z;
2434             case Iop_CmpNE8: case Iop_CasCmpNE8: return Acc_NZ;
2435             default: vpanic("iselCondCode(amd64): CmpXX8(expr,expr)");
2436          }
2437       }
2438    }
2439
2440    /* CmpEQ16 / CmpNE16 */
2441    if (e->tag == Iex_Binop
2442        && (e->Iex.Binop.op == Iop_CmpEQ16
2443            || e->Iex.Binop.op == Iop_CmpNE16
2444            || e->Iex.Binop.op == Iop_CasCmpEQ16
2445            || e->Iex.Binop.op == Iop_CasCmpNE16)) {
2446       HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
2447       AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2448       HReg      r    = newVRegI(env);
2449       addInstr(env, mk_iMOVsd_RR(r1,r));
2450       addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
2451       addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFFFF),r));
2452       switch (e->Iex.Binop.op) {
2453          case Iop_CmpEQ16: case Iop_CasCmpEQ16: return Acc_Z;
2454          case Iop_CmpNE16: case Iop_CasCmpNE16: return Acc_NZ;
2455          default: vpanic("iselCondCode(amd64): CmpXX16");
2456       }
2457    }
2458
2459    /* CmpNE64(ccall, 64-bit constant) (--smc-check=all optimisation).
2460       Saves a "movq %rax, %tmp" compared to the default route. */
2461    if (e->tag == Iex_Binop
2462        && e->Iex.Binop.op == Iop_CmpNE64
2463        && e->Iex.Binop.arg1->tag == Iex_CCall
2464        && e->Iex.Binop.arg2->tag == Iex_Const) {
2465       IRExpr* cal = e->Iex.Binop.arg1;
2466       IRExpr* con = e->Iex.Binop.arg2;
2467       HReg    tmp = newVRegI(env);
2468       /* clone & partial-eval of generic Iex_CCall and Iex_Const cases */
2469       vassert(cal->Iex.CCall.retty == Ity_I64); /* else ill-typed IR */
2470       vassert(con->Iex.Const.con->tag == Ico_U64);
2471       /* Marshal args, do the call. */
2472       UInt   addToSp = 0;
2473       RetLoc rloc    = mk_RetLoc_INVALID();
2474       doHelperCall( &addToSp, &rloc, env, NULL/*guard*/,
2475                     cal->Iex.CCall.cee,
2476                     cal->Iex.CCall.retty, cal->Iex.CCall.args );
2477       vassert(is_sane_RetLoc(rloc));
2478       vassert(rloc.pri == RLPri_Int);
2479       vassert(addToSp == 0);
2480       /* */
2481       addInstr(env, AMD64Instr_Imm64(con->Iex.Const.con->Ico.U64, tmp));
2482       addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,
2483                                       AMD64RMI_Reg(hregAMD64_RAX()), tmp));
2484       return Acc_NZ;
2485    }
2486
2487    /* Cmp*64*(x,y) */
2488    if (e->tag == Iex_Binop
2489        && (e->Iex.Binop.op == Iop_CmpEQ64
2490            || e->Iex.Binop.op == Iop_CmpNE64
2491            || e->Iex.Binop.op == Iop_CmpLT64S
2492            || e->Iex.Binop.op == Iop_CmpLT64U
2493            || e->Iex.Binop.op == Iop_CmpLE64S
2494            || e->Iex.Binop.op == Iop_CmpLE64U
2495            || e->Iex.Binop.op == Iop_CasCmpEQ64
2496            || e->Iex.Binop.op == Iop_CasCmpNE64
2497            || e->Iex.Binop.op == Iop_ExpCmpNE64)) {
2498       HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
2499       AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2500       addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
2501       switch (e->Iex.Binop.op) {
2502          case Iop_CmpEQ64: case Iop_CasCmpEQ64: return Acc_Z;
2503          case Iop_CmpNE64:
2504          case Iop_CasCmpNE64: case Iop_ExpCmpNE64: return Acc_NZ;
2505          case Iop_CmpLT64S: return Acc_L;
2506          case Iop_CmpLT64U: return Acc_B;
2507          case Iop_CmpLE64S: return Acc_LE;
2508          case Iop_CmpLE64U: return Acc_BE;
2509          default: vpanic("iselCondCode(amd64): CmpXX64");
2510       }
2511    }
2512
2513    /* Cmp*32*(x,y) */
2514    if (e->tag == Iex_Binop
2515        && (e->Iex.Binop.op == Iop_CmpEQ32
2516            || e->Iex.Binop.op == Iop_CmpNE32
2517            || e->Iex.Binop.op == Iop_CmpLT32S
2518            || e->Iex.Binop.op == Iop_CmpLT32U
2519            || e->Iex.Binop.op == Iop_CmpLE32S
2520            || e->Iex.Binop.op == Iop_CmpLE32U
2521            || e->Iex.Binop.op == Iop_CasCmpEQ32
2522            || e->Iex.Binop.op == Iop_CasCmpNE32
2523            || e->Iex.Binop.op == Iop_ExpCmpNE32)) {
2524       HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
2525       AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
2526       addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1));
2527       switch (e->Iex.Binop.op) {
2528          case Iop_CmpEQ32: case Iop_CasCmpEQ32: return Acc_Z;
2529          case Iop_CmpNE32:
2530          case Iop_CasCmpNE32: case Iop_ExpCmpNE32: return Acc_NZ;
2531          case Iop_CmpLT32S: return Acc_L;
2532          case Iop_CmpLT32U: return Acc_B;
2533          case Iop_CmpLE32S: return Acc_LE;
2534          case Iop_CmpLE32U: return Acc_BE;
2535          default: vpanic("iselCondCode(amd64): CmpXX32");
2536       }
2537    }
2538
2539    ppIRExpr(e);
2540    vpanic("iselCondCode(amd64)");
2541 }
2542
2543
2544 /*---------------------------------------------------------*/
2545 /*--- ISEL: Integer expressions (128 bit)               ---*/
2546 /*---------------------------------------------------------*/
2547
2548 /* Compute a 128-bit value into a register pair, which is returned as
2549    the first two parameters.  As with iselIntExpr_R, these may be
2550    either real or virtual regs; in any case they must not be changed
2551    by subsequent code emitted by the caller.  */
2552
2553 static void iselInt128Expr ( HReg* rHi, HReg* rLo,
2554                              ISelEnv* env, const IRExpr* e )
2555 {
2556    iselInt128Expr_wrk(rHi, rLo, env, e);
2557 #  if 0
2558    vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2559 #  endif
2560    vassert(hregClass(*rHi) == HRcInt64);
2561    vassert(hregIsVirtual(*rHi));
2562    vassert(hregClass(*rLo) == HRcInt64);
2563    vassert(hregIsVirtual(*rLo));
2564 }
2565
2566 /* DO NOT CALL THIS DIRECTLY ! */
2567 static void iselInt128Expr_wrk ( HReg* rHi, HReg* rLo,
2568                                  ISelEnv* env, const IRExpr* e )
2569 {
2570    vassert(e);
2571    vassert(typeOfIRExpr(env->type_env,e) == Ity_I128);
2572
2573    /* read 128-bit IRTemp */
2574    if (e->tag == Iex_RdTmp) {
2575       lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp);
2576       return;
2577    }
2578
2579    /* --------- BINARY ops --------- */
2580    if (e->tag == Iex_Binop) {
2581       switch (e->Iex.Binop.op) {
2582          /* 64 x 64 -> 128 multiply */
2583          case Iop_MullU64:
2584          case Iop_MullS64: {
2585             /* get one operand into %rax, and the other into a R/M.
2586                Need to make an educated guess about which is better in
2587                which. */
2588             HReg     tLo    = newVRegI(env);
2589             HReg     tHi    = newVRegI(env);
2590             Bool     syned  = toBool(e->Iex.Binop.op == Iop_MullS64);
2591             AMD64RM* rmLeft = iselIntExpr_RM(env, e->Iex.Binop.arg1);
2592             HReg     rRight = iselIntExpr_R(env, e->Iex.Binop.arg2);
2593             addInstr(env, mk_iMOVsd_RR(rRight, hregAMD64_RAX()));
2594             addInstr(env, AMD64Instr_MulL(syned, rmLeft));
2595             /* Result is now in RDX:RAX.  Tell the caller. */
2596             addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi));
2597             addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo));
2598             *rHi = tHi;
2599             *rLo = tLo;
2600             return;
2601          }
2602
2603          /* 128 x 64 -> (64(rem),64(div)) division */
2604          case Iop_DivModU128to64:
2605          case Iop_DivModS128to64: {
2606             /* Get the 128-bit operand into rdx:rax, and the other into
2607                any old R/M. */
2608             HReg sHi, sLo;
2609             HReg     tLo     = newVRegI(env);
2610             HReg     tHi     = newVRegI(env);
2611             Bool     syned   = toBool(e->Iex.Binop.op == Iop_DivModS128to64);
2612             AMD64RM* rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
2613             iselInt128Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
2614             addInstr(env, mk_iMOVsd_RR(sHi, hregAMD64_RDX()));
2615             addInstr(env, mk_iMOVsd_RR(sLo, hregAMD64_RAX()));
2616             addInstr(env, AMD64Instr_Div(syned, 8, rmRight));
2617             addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi));
2618             addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo));
2619             *rHi = tHi;
2620             *rLo = tLo;
2621             return;
2622          }
2623
2624          /* 64HLto128(e1,e2) */
2625          case Iop_64HLto128:
2626             *rHi = iselIntExpr_R(env, e->Iex.Binop.arg1);
2627             *rLo = iselIntExpr_R(env, e->Iex.Binop.arg2);
2628             return;
2629
2630          default:
2631             break;
2632       }
2633    } /* if (e->tag == Iex_Binop) */
2634
2635    ppIRExpr(e);
2636    vpanic("iselInt128Expr");
2637 }
2638
2639
2640 /*---------------------------------------------------------*/
2641 /*--- ISEL: Floating point expressions (32 bit)         ---*/
2642 /*---------------------------------------------------------*/
2643
2644 /* Nothing interesting here; really just wrappers for
2645    64-bit stuff. */
2646
2647 static HReg iselFltExpr ( ISelEnv* env, const IRExpr* e )
2648 {
2649    HReg r = iselFltExpr_wrk( env, e );
2650 #  if 0
2651    vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2652 #  endif
2653    vassert(hregClass(r) == HRcVec128);
2654    vassert(hregIsVirtual(r));
2655    return r;
2656 }
2657
2658 /* DO NOT CALL THIS DIRECTLY */
2659 static HReg iselFltExpr_wrk ( ISelEnv* env, const IRExpr* e )
2660 {
2661    IRType ty = typeOfIRExpr(env->type_env,e);
2662    vassert(ty == Ity_F32);
2663
2664    if (e->tag == Iex_RdTmp) {
2665       return lookupIRTemp(env, e->Iex.RdTmp.tmp);
2666    }
2667
2668    if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
2669       AMD64AMode* am;
2670       HReg res = newVRegV(env);
2671       vassert(e->Iex.Load.ty == Ity_F32);
2672       am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2673       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, res, am));
2674       return res;
2675    }
2676
2677    if (e->tag == Iex_Binop
2678        && e->Iex.Binop.op == Iop_F64toF32) {
2679       /* Although the result is still held in a standard SSE register,
2680          we need to round it to reflect the loss of accuracy/range
2681          entailed in casting it to a 32-bit float. */
2682       HReg dst = newVRegV(env);
2683       HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
2684       set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
2685       addInstr(env, AMD64Instr_SseSDSS(True/*D->S*/,src,dst));
2686       set_SSE_rounding_default( env );
2687       return dst;
2688    }
2689
2690    if (e->tag == Iex_Get) {
2691       AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset,
2692                                        hregAMD64_RBP() );
2693       HReg res = newVRegV(env);
2694       addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, res, am ));
2695       return res;
2696    }
2697
2698    if (e->tag == Iex_Unop
2699        && e->Iex.Unop.op == Iop_ReinterpI32asF32) {
2700        /* Given an I32, produce an IEEE754 float with the same bit
2701           pattern. */
2702        HReg        dst    = newVRegV(env);
2703        HReg        src    = iselIntExpr_R(env, e->Iex.Unop.arg);
2704        AMD64AMode* m4_rsp = AMD64AMode_IR(-4, hregAMD64_RSP());
2705        addInstr(env, AMD64Instr_Store(4, src, m4_rsp));
2706        addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, dst, m4_rsp ));
2707        return dst;
2708    }
2709
2710    if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF32toInt) {
2711       AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
2712       HReg        arg    = iselFltExpr(env, e->Iex.Binop.arg2);
2713       HReg        dst    = newVRegV(env);
2714
2715       /* rf now holds the value to be rounded.  The first thing to do
2716          is set the FPU's rounding mode accordingly. */
2717
2718       /* Set host x87 rounding mode */
2719       set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
2720
2721       addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, arg, m8_rsp));
2722       addInstr(env, AMD64Instr_A87Free(1));
2723       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 4));
2724       addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND));
2725       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 4));
2726       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, dst, m8_rsp));
2727
2728       /* Restore default x87 rounding. */
2729       set_FPU_rounding_default( env );
2730
2731       return dst;
2732    }
2733
2734    if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_NegF32) {
2735       /* Sigh ... very rough code.  Could do much better. */
2736       /* Get the 128-bit literal 00---0 10---0 into a register
2737          and xor it with the value to be negated. */
2738       HReg r1  = newVRegI(env);
2739       HReg dst = newVRegV(env);
2740       HReg tmp = newVRegV(env);
2741       HReg src = iselFltExpr(env, e->Iex.Unop.arg);
2742       AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
2743       addInstr(env, mk_vMOVsd_RR(src,tmp));
2744       addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
2745       addInstr(env, AMD64Instr_Imm64( 1ULL<<31, r1 ));
2746       addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(r1)));
2747       addInstr(env, AMD64Instr_SseLdSt(True, 16, dst, rsp0));
2748       addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, dst));
2749       add_to_rsp(env, 16);
2750       return dst;
2751    }
2752
2753    if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_MAddF32) {
2754       IRQop *qop = e->Iex.Qop.details;
2755       HReg dst  = newVRegV(env);
2756       HReg argX = iselFltExpr(env, qop->arg2);
2757       HReg argY = iselFltExpr(env, qop->arg3);
2758       HReg argZ = iselFltExpr(env, qop->arg4);
2759       /* XXXROUNDINGFIXME */
2760       /* set roundingmode here */
2761       /* subq $16, %rsp         -- make a space*/
2762       sub_from_rsp(env, 16);
2763       /* Prepare 4 arg regs:
2764          leaq 0(%rsp), %rdi
2765          leaq 4(%rsp), %rsi
2766          leaq 8(%rsp), %rdx
2767          leaq 12(%rsp), %rcx
2768       */
2769       addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, hregAMD64_RSP()),
2770                                      hregAMD64_RDI()));
2771       addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(4, hregAMD64_RSP()),
2772                                      hregAMD64_RSI()));
2773       addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(8, hregAMD64_RSP()),
2774                                      hregAMD64_RDX()));
2775       addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(12, hregAMD64_RSP()),
2776                                      hregAMD64_RCX()));
2777       /* Store the three args, at (%rsi), (%rdx) and (%rcx):
2778          movss  %argX, 0(%rsi)
2779          movss  %argY, 0(%rdx)
2780          movss  %argZ, 0(%rcx)
2781          */
2782       addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argX,
2783                                        AMD64AMode_IR(0, hregAMD64_RSI())));
2784       addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argY,
2785                                        AMD64AMode_IR(0, hregAMD64_RDX())));
2786       addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argZ,
2787                                        AMD64AMode_IR(0, hregAMD64_RCX())));
2788       /* call the helper */
2789       addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
2790                                      (ULong)(HWord)h_generic_calc_MAddF32,
2791                                      4, mk_RetLoc_simple(RLPri_None) ));
2792       /* fetch the result from memory, using %r_argp, which the
2793          register allocator will keep alive across the call. */
2794       addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 4, dst,
2795                                        AMD64AMode_IR(0, hregAMD64_RSP())));
2796       /* and finally, clear the space */
2797       add_to_rsp(env, 16);
2798       return dst;
2799    }
2800
2801    ppIRExpr(e);
2802    vpanic("iselFltExpr_wrk");
2803 }
2804
2805
2806 /*---------------------------------------------------------*/
2807 /*--- ISEL: Floating point expressions (64 bit)         ---*/
2808 /*---------------------------------------------------------*/
2809
2810 /* Compute a 64-bit floating point value into the lower half of an xmm
2811    register, the identity of which is returned.  As with
2812    iselIntExpr_R, the returned reg will be virtual, and it must not be
2813    changed by subsequent code emitted by the caller.
2814 */
2815
2816 /* IEEE 754 formats.  From http://www.freesoft.org/CIE/RFC/1832/32.htm:
2817
2818     Type                  S (1 bit)   E (11 bits)   F (52 bits)
2819     ----                  ---------   -----------   -----------
2820     signalling NaN        u           2047 (max)    .0uuuuu---u
2821                                                     (with at least
2822                                                      one 1 bit)
2823     quiet NaN             u           2047 (max)    .1uuuuu---u
2824
2825     negative infinity     1           2047 (max)    .000000---0
2826
2827     positive infinity     0           2047 (max)    .000000---0
2828
2829     negative zero         1           0             .000000---0
2830
2831     positive zero         0           0             .000000---0
2832 */
2833
2834 static HReg iselDblExpr ( ISelEnv* env, const IRExpr* e )
2835 {
2836    HReg r = iselDblExpr_wrk( env, e );
2837 #  if 0
2838    vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2839 #  endif
2840    vassert(hregClass(r) == HRcVec128);
2841    vassert(hregIsVirtual(r));
2842    return r;
2843 }
2844
2845 /* DO NOT CALL THIS DIRECTLY */
2846 static HReg iselDblExpr_wrk ( ISelEnv* env, const IRExpr* e )
2847 {
2848    IRType ty = typeOfIRExpr(env->type_env,e);
2849    vassert(e);
2850    vassert(ty == Ity_F64);
2851
2852    if (e->tag == Iex_RdTmp) {
2853       return lookupIRTemp(env, e->Iex.RdTmp.tmp);
2854    }
2855
2856    if (e->tag == Iex_Const) {
2857       union { ULong u64; Double f64; } u;
2858       HReg res = newVRegV(env);
2859       HReg tmp = newVRegI(env);
2860       vassert(sizeof(u) == 8);
2861       vassert(sizeof(u.u64) == 8);
2862       vassert(sizeof(u.f64) == 8);
2863
2864       if (e->Iex.Const.con->tag == Ico_F64) {
2865          u.f64 = e->Iex.Const.con->Ico.F64;
2866       }
2867       else if (e->Iex.Const.con->tag == Ico_F64i) {
2868          u.u64 = e->Iex.Const.con->Ico.F64i;
2869       }
2870       else
2871          vpanic("iselDblExpr(amd64): const");
2872
2873       addInstr(env, AMD64Instr_Imm64(u.u64, tmp));
2874       addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(tmp)));
2875       addInstr(env, AMD64Instr_SseLdSt(
2876                        True/*load*/, 8, res,
2877                        AMD64AMode_IR(0, hregAMD64_RSP())
2878               ));
2879       add_to_rsp(env, 8);
2880       return res;
2881    }
2882
2883    if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
2884       AMD64AMode* am;
2885       HReg res = newVRegV(env);
2886       vassert(e->Iex.Load.ty == Ity_F64);
2887       am = iselIntExpr_AMode(env, e->Iex.Load.addr);
2888       addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
2889       return res;
2890    }
2891
2892    if (e->tag == Iex_Get) {
2893       AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset,
2894                                       hregAMD64_RBP() );
2895       HReg res = newVRegV(env);
2896       addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
2897       return res;
2898    }
2899
2900    if (e->tag == Iex_GetI) {
2901       AMD64AMode* am
2902          = genGuestArrayOffset(
2903               env, e->Iex.GetI.descr,
2904                    e->Iex.GetI.ix, e->Iex.GetI.bias );
2905       HReg res = newVRegV(env);
2906       addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
2907       return res;
2908    }
2909
2910    if (e->tag == Iex_Triop) {
2911       IRTriop *triop = e->Iex.Triop.details;
2912       AMD64SseOp op = Asse_INVALID;
2913       switch (triop->op) {
2914          case Iop_AddF64: op = Asse_ADDF; break;
2915          case Iop_SubF64: op = Asse_SUBF; break;
2916          case Iop_MulF64: op = Asse_MULF; break;
2917          case Iop_DivF64: op = Asse_DIVF; break;
2918          default: break;
2919       }
2920       if (op != Asse_INVALID) {
2921          HReg dst  = newVRegV(env);
2922          HReg argL = iselDblExpr(env, triop->arg2);
2923          HReg argR = iselDblExpr(env, triop->arg3);
2924          addInstr(env, mk_vMOVsd_RR(argL, dst));
2925          /* XXXROUNDINGFIXME */
2926          /* set roundingmode here */
2927          addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
2928          return dst;
2929       }
2930    }
2931
2932    if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_MAddF64) {
2933       IRQop *qop = e->Iex.Qop.details;
2934       HReg dst  = newVRegV(env);
2935       HReg argX = iselDblExpr(env, qop->arg2);
2936       HReg argY = iselDblExpr(env, qop->arg3);
2937       HReg argZ = iselDblExpr(env, qop->arg4);
2938       /* XXXROUNDINGFIXME */
2939       /* set roundingmode here */
2940       /* subq $32, %rsp         -- make a space*/
2941       sub_from_rsp(env, 32);
2942       /* Prepare 4 arg regs:
2943          leaq 0(%rsp), %rdi
2944          leaq 8(%rsp), %rsi
2945          leaq 16(%rsp), %rdx
2946          leaq 24(%rsp), %rcx
2947       */
2948       addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, hregAMD64_RSP()),
2949                                      hregAMD64_RDI()));
2950       addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(8, hregAMD64_RSP()),
2951                                      hregAMD64_RSI()));
2952       addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, hregAMD64_RSP()),
2953                                      hregAMD64_RDX()));
2954       addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(24, hregAMD64_RSP()),
2955                                      hregAMD64_RCX()));
2956       /* Store the three args, at (%rsi), (%rdx) and (%rcx):
2957          movsd  %argX, 0(%rsi)
2958          movsd  %argY, 0(%rdx)
2959          movsd  %argZ, 0(%rcx)
2960          */
2961       addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argX,
2962                                        AMD64AMode_IR(0, hregAMD64_RSI())));
2963       addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argY,
2964                                        AMD64AMode_IR(0, hregAMD64_RDX())));
2965       addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argZ,
2966                                        AMD64AMode_IR(0, hregAMD64_RCX())));
2967       /* call the helper */
2968       addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
2969                                      (ULong)(HWord)h_generic_calc_MAddF64,
2970                                      4, mk_RetLoc_simple(RLPri_None) ));
2971       /* fetch the result from memory, using %r_argp, which the
2972          register allocator will keep alive across the call. */
2973       addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 8, dst,
2974                                        AMD64AMode_IR(0, hregAMD64_RSP())));
2975       /* and finally, clear the space */
2976       add_to_rsp(env, 32);
2977       return dst;
2978    }
2979
2980    if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF64toInt) {
2981       AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
2982       HReg        arg    = iselDblExpr(env, e->Iex.Binop.arg2);
2983       HReg        dst    = newVRegV(env);
2984
2985       /* rf now holds the value to be rounded.  The first thing to do
2986          is set the FPU's rounding mode accordingly. */
2987
2988       /* Set host x87 rounding mode */
2989       set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
2990
2991       addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp));
2992       addInstr(env, AMD64Instr_A87Free(1));
2993       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
2994       addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND));
2995       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
2996       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
2997
2998       /* Restore default x87 rounding. */
2999       set_FPU_rounding_default( env );
3000
3001       return dst;
3002    }
3003
3004    IRTriop *triop = e->Iex.Triop.details;
3005    if (e->tag == Iex_Triop
3006        && (triop->op == Iop_ScaleF64
3007            || triop->op == Iop_AtanF64
3008            || triop->op == Iop_Yl2xF64
3009            || triop->op == Iop_Yl2xp1F64
3010            || triop->op == Iop_PRemF64
3011            || triop->op == Iop_PRem1F64)
3012       ) {
3013       AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3014       HReg        arg1   = iselDblExpr(env, triop->arg2);
3015       HReg        arg2   = iselDblExpr(env, triop->arg3);
3016       HReg        dst    = newVRegV(env);
3017       Bool     arg2first = toBool(triop->op == Iop_ScaleF64
3018                                   || triop->op == Iop_PRemF64
3019                                   || triop->op == Iop_PRem1F64);
3020       addInstr(env, AMD64Instr_A87Free(2));
3021
3022       /* one arg -> top of x87 stack */
3023       addInstr(env, AMD64Instr_SseLdSt(
3024                        False/*store*/, 8, arg2first ? arg2 : arg1, m8_rsp));
3025       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
3026
3027       /* other arg -> top of x87 stack */
3028       addInstr(env, AMD64Instr_SseLdSt(
3029                        False/*store*/, 8, arg2first ? arg1 : arg2, m8_rsp));
3030       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
3031
3032       /* do it */
3033       /* XXXROUNDINGFIXME */
3034       /* set roundingmode here */
3035       switch (triop->op) {
3036          case Iop_ScaleF64:
3037             addInstr(env, AMD64Instr_A87FpOp(Afp_SCALE));
3038             break;
3039          case Iop_AtanF64:
3040             addInstr(env, AMD64Instr_A87FpOp(Afp_ATAN));
3041             break;
3042          case Iop_Yl2xF64:
3043             addInstr(env, AMD64Instr_A87FpOp(Afp_YL2X));
3044             break;
3045          case Iop_Yl2xp1F64:
3046             addInstr(env, AMD64Instr_A87FpOp(Afp_YL2XP1));
3047             break;
3048          case Iop_PRemF64:
3049             addInstr(env, AMD64Instr_A87FpOp(Afp_PREM));
3050             break;
3051          case Iop_PRem1F64:
3052             addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1));
3053             break;
3054          default:
3055             vassert(0);
3056       }
3057
3058       /* save result */
3059       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
3060       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3061       return dst;
3062    }
3063
3064    if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_I64StoF64) {
3065       HReg dst = newVRegV(env);
3066       HReg src = iselIntExpr_R(env, e->Iex.Binop.arg2);
3067       set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
3068       addInstr(env, AMD64Instr_SseSI2SF( 8, 8, src, dst ));
3069       set_SSE_rounding_default( env );
3070       return dst;
3071    }
3072
3073    if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_I32StoF64) {
3074       HReg dst = newVRegV(env);
3075       HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
3076       set_SSE_rounding_default( env );
3077       addInstr(env, AMD64Instr_SseSI2SF( 4, 8, src, dst ));
3078       return dst;
3079    }
3080
3081    if (e->tag == Iex_Unop
3082        && (e->Iex.Unop.op == Iop_NegF64
3083            || e->Iex.Unop.op == Iop_AbsF64)) {
3084       /* Sigh ... very rough code.  Could do much better. */
3085       /* Get the 128-bit literal 00---0 10---0 into a register
3086          and xor/nand it with the value to be negated. */
3087       HReg r1  = newVRegI(env);
3088       HReg dst = newVRegV(env);
3089       HReg tmp = newVRegV(env);
3090       HReg src = iselDblExpr(env, e->Iex.Unop.arg);
3091       AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3092       addInstr(env, mk_vMOVsd_RR(src,tmp));
3093       addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
3094       addInstr(env, AMD64Instr_Imm64( 1ULL<<63, r1 ));
3095       addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(r1)));
3096       addInstr(env, AMD64Instr_SseLdSt(True, 16, dst, rsp0));
3097
3098       if (e->Iex.Unop.op == Iop_NegF64)
3099          addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, dst));
3100       else
3101          addInstr(env, AMD64Instr_SseReRg(Asse_ANDN, tmp, dst));
3102
3103       add_to_rsp(env, 16);
3104       return dst;
3105    }
3106
3107    if (e->tag == Iex_Binop) {
3108       A87FpOp fpop = Afp_INVALID;
3109       switch (e->Iex.Binop.op) {
3110          case Iop_SqrtF64: fpop = Afp_SQRT; break;
3111          case Iop_SinF64:  fpop = Afp_SIN;  break;
3112          case Iop_CosF64:  fpop = Afp_COS;  break;
3113          case Iop_TanF64:  fpop = Afp_TAN;  break;
3114          case Iop_2xm1F64: fpop = Afp_2XM1; break;
3115          default: break;
3116       }
3117       if (fpop != Afp_INVALID) {
3118          AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3119          HReg        arg    = iselDblExpr(env, e->Iex.Binop.arg2);
3120          HReg        dst    = newVRegV(env);
3121          Int     nNeeded    = e->Iex.Binop.op==Iop_TanF64 ? 2 : 1;
3122          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp));
3123          addInstr(env, AMD64Instr_A87Free(nNeeded));
3124          addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
3125          /* XXXROUNDINGFIXME */
3126          /* set roundingmode here */
3127          /* Note that AMD64Instr_A87FpOp(Afp_TAN) sets the condition
3128             codes.  I don't think that matters, since this insn
3129             selector never generates such an instruction intervening
3130             between an flag-setting instruction and a flag-using
3131             instruction. */
3132          addInstr(env, AMD64Instr_A87FpOp(fpop));
3133          addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
3134          addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3135          return dst;
3136       }
3137    }
3138
3139    if (e->tag == Iex_Unop) {
3140       switch (e->Iex.Unop.op) {
3141 //..          case Iop_I32toF64: {
3142 //..             HReg dst = newVRegF(env);
3143 //..             HReg ri  = iselIntExpr_R(env, e->Iex.Unop.arg);
3144 //..             addInstr(env, X86Instr_Push(X86RMI_Reg(ri)));
3145 //..             set_FPU_rounding_default(env);
3146 //..             addInstr(env, X86Instr_FpLdStI(
3147 //..                              True/*load*/, 4, dst,
3148 //..                              X86AMode_IR(0, hregX86_ESP())));
3149 //..             add_to_esp(env, 4);
3150 //..             return dst;
3151 //..          }
3152          case Iop_ReinterpI64asF64: {
3153             /* Given an I64, produce an IEEE754 double with the same
3154                bit pattern. */
3155             AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
3156             HReg        dst    = newVRegV(env);
3157             AMD64RI*    src    = iselIntExpr_RI(env, e->Iex.Unop.arg);
3158             /* paranoia */
3159             set_SSE_rounding_default(env);
3160             addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, src, m8_rsp));
3161             addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
3162             return dst;
3163          }
3164          case Iop_F32toF64: {
3165             HReg f32;
3166             HReg f64 = newVRegV(env);
3167             /* this shouldn't be necessary, but be paranoid ... */
3168             set_SSE_rounding_default(env);
3169             f32 = iselFltExpr(env, e->Iex.Unop.arg);
3170             addInstr(env, AMD64Instr_SseSDSS(False/*S->D*/, f32, f64));
3171             return f64;
3172          }
3173          default:
3174             break;
3175       }
3176    }
3177
3178    /* --------- MULTIPLEX --------- */
3179    if (e->tag == Iex_ITE) { // VFD
3180       HReg r1, r0, dst;
3181       vassert(ty == Ity_F64);
3182       vassert(typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1);
3183       r1  = iselDblExpr(env, e->Iex.ITE.iftrue);
3184       r0  = iselDblExpr(env, e->Iex.ITE.iffalse);
3185       dst = newVRegV(env);
3186       addInstr(env, mk_vMOVsd_RR(r1,dst));
3187       AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
3188       addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0, dst));
3189       return dst;
3190    }
3191
3192    ppIRExpr(e);
3193    vpanic("iselDblExpr_wrk");
3194 }
3195
3196
3197 /*---------------------------------------------------------*/
3198 /*--- ISEL: SIMD (Vector) expressions, 128 bit.         ---*/
3199 /*---------------------------------------------------------*/
3200
3201 static HReg iselVecExpr ( ISelEnv* env, const IRExpr* e )
3202 {
3203    HReg r = iselVecExpr_wrk( env, e );
3204 #  if 0
3205    vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
3206 #  endif
3207    vassert(hregClass(r) == HRcVec128);
3208    vassert(hregIsVirtual(r));
3209    return r;
3210 }
3211
3212
3213 /* DO NOT CALL THIS DIRECTLY */
3214 static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e )
3215 {
3216    HWord      fn = 0; /* address of helper fn, if required */
3217    Bool       arg1isEReg = False;
3218    AMD64SseOp op = Asse_INVALID;
3219    vassert(e);
3220    IRType ty = typeOfIRExpr(env->type_env, e);
3221    vassert(ty == Ity_V128);
3222    UInt laneBits = 0;
3223
3224    if (e->tag == Iex_RdTmp) {
3225       return lookupIRTemp(env, e->Iex.RdTmp.tmp);
3226    }
3227
3228    if (e->tag == Iex_Get) {
3229       HReg dst = newVRegV(env);
3230       addInstr(env, AMD64Instr_SseLdSt(
3231                        True/*load*/,
3232                        16,
3233                        dst,
3234                        AMD64AMode_IR(e->Iex.Get.offset, hregAMD64_RBP())
3235                     )
3236               );
3237       return dst;
3238    }
3239
3240    if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
3241       HReg        dst = newVRegV(env);
3242       AMD64AMode* am  = iselIntExpr_AMode(env, e->Iex.Load.addr);
3243       addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, am ));
3244       return dst;
3245    }
3246
3247    if (e->tag == Iex_Const) {
3248       HReg dst = newVRegV(env);
3249       vassert(e->Iex.Const.con->tag == Ico_V128);
3250       switch (e->Iex.Const.con->Ico.V128) {
3251          case 0x0000:
3252             dst = generate_zeroes_V128(env);
3253             break;
3254          case 0xFFFF:
3255             dst = generate_ones_V128(env);
3256             break;
3257          default: {
3258             AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3259             /* do push_uimm64 twice, first time for the high-order half. */
3260             push_uimm64(env, bitmask8_to_bytemask64(
3261                                 (e->Iex.Const.con->Ico.V128 >> 8) & 0xFF
3262                        ));
3263             push_uimm64(env, bitmask8_to_bytemask64(
3264                                 (e->Iex.Const.con->Ico.V128 >> 0) & 0xFF
3265                        ));
3266             addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, rsp0 ));
3267             add_to_rsp(env, 16);
3268             break;
3269          }
3270       }
3271       return dst;
3272    }
3273
3274    if (e->tag == Iex_Unop) {
3275    switch (e->Iex.Unop.op) {
3276
3277       case Iop_NotV128: {
3278          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3279          return do_sse_NotV128(env, arg);
3280       }
3281
3282       case Iop_CmpNEZ64x2: {
3283          /* We can use SSE2 instructions for this. */
3284          /* Ideally, we want to do a 64Ix2 comparison against zero of
3285             the operand.  Problem is no such insn exists.  Solution
3286             therefore is to do a 32Ix4 comparison instead, and bitwise-
3287             negate (NOT) the result.  Let a,b,c,d be 32-bit lanes, and
3288             let the not'd result of this initial comparison be a:b:c:d.
3289             What we need to compute is (a|b):(a|b):(c|d):(c|d).  So, use
3290             pshufd to create a value b:a:d:c, and OR that with a:b:c:d,
3291             giving the required result.
3292
3293             The required selection sequence is 2,3,0,1, which
3294             according to Intel's documentation means the pshufd
3295             literal value is 0xB1, that is,
3296             (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)
3297          */
3298          HReg arg  = iselVecExpr(env, e->Iex.Unop.arg);
3299          HReg tmp  = generate_zeroes_V128(env);
3300          HReg dst  = newVRegV(env);
3301          addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, arg, tmp));
3302          tmp = do_sse_NotV128(env, tmp);
3303          addInstr(env, AMD64Instr_SseShuf(0xB1, tmp, dst));
3304          addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
3305          return dst;
3306       }
3307
3308       case Iop_CmpNEZ32x4: op = Asse_CMPEQ32; goto do_CmpNEZ_vector;
3309       case Iop_CmpNEZ16x8: op = Asse_CMPEQ16; goto do_CmpNEZ_vector;
3310       case Iop_CmpNEZ8x16: op = Asse_CMPEQ8;  goto do_CmpNEZ_vector;
3311       do_CmpNEZ_vector:
3312       {
3313          HReg arg  = iselVecExpr(env, e->Iex.Unop.arg);
3314          HReg tmp  = newVRegV(env);
3315          HReg zero = generate_zeroes_V128(env);
3316          HReg dst;
3317          addInstr(env, mk_vMOVsd_RR(arg, tmp));
3318          addInstr(env, AMD64Instr_SseReRg(op, zero, tmp));
3319          dst = do_sse_NotV128(env, tmp);
3320          return dst;
3321       }
3322
3323       case Iop_RecipEst32Fx4: op = Asse_RCPF;   goto do_32Fx4_unary;
3324       case Iop_RSqrtEst32Fx4: op = Asse_RSQRTF; goto do_32Fx4_unary;
3325       do_32Fx4_unary:
3326       {
3327          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3328          HReg dst = newVRegV(env);
3329          addInstr(env, AMD64Instr_Sse32Fx4(op, arg, dst));
3330          return dst;
3331       }
3332
3333       case Iop_RecipEst32F0x4: op = Asse_RCPF;   goto do_32F0x4_unary;
3334       case Iop_RSqrtEst32F0x4: op = Asse_RSQRTF; goto do_32F0x4_unary;
3335       case Iop_Sqrt32F0x4:     op = Asse_SQRTF;  goto do_32F0x4_unary;
3336       do_32F0x4_unary:
3337       {
3338          /* A bit subtle.  We have to copy the arg to the result
3339             register first, because actually doing the SSE scalar insn
3340             leaves the upper 3/4 of the destination register
3341             unchanged.  Whereas the required semantics of these
3342             primops is that the upper 3/4 is simply copied in from the
3343             argument. */
3344          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3345          HReg dst = newVRegV(env);
3346          addInstr(env, mk_vMOVsd_RR(arg, dst));
3347          addInstr(env, AMD64Instr_Sse32FLo(op, arg, dst));
3348          return dst;
3349       }
3350
3351       case Iop_Sqrt64F0x2:  op = Asse_SQRTF;  goto do_64F0x2_unary;
3352       do_64F0x2_unary:
3353       {
3354          /* A bit subtle.  We have to copy the arg to the result
3355             register first, because actually doing the SSE scalar insn
3356             leaves the upper half of the destination register
3357             unchanged.  Whereas the required semantics of these
3358             primops is that the upper half is simply copied in from the
3359             argument. */
3360          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
3361          HReg dst = newVRegV(env);
3362          addInstr(env, mk_vMOVsd_RR(arg, dst));
3363          addInstr(env, AMD64Instr_Sse64FLo(op, arg, dst));
3364          return dst;
3365       }
3366
3367       case Iop_32UtoV128: {
3368          // FIXME maybe just use MOVQ here?
3369          HReg        dst     = newVRegV(env);
3370          AMD64AMode* rsp_m32 = AMD64AMode_IR(-32, hregAMD64_RSP());
3371          AMD64RI*    ri      = iselIntExpr_RI(env, e->Iex.Unop.arg);
3372          addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, ri, rsp_m32));
3373          addInstr(env, AMD64Instr_SseLdzLO(4, dst, rsp_m32));
3374          return dst;
3375       }
3376
3377       case Iop_64UtoV128: {
3378          // FIXME maybe just use MOVQ here?
3379          HReg        dst  = newVRegV(env);
3380          AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3381          AMD64RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Unop.arg);
3382          addInstr(env, AMD64Instr_Push(rmi));
3383          addInstr(env, AMD64Instr_SseLdzLO(8, dst, rsp0));
3384          add_to_rsp(env, 8);
3385          return dst;
3386       }
3387
3388       case Iop_V256toV128_0:
3389       case Iop_V256toV128_1: {
3390          HReg vHi, vLo;
3391          iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg);
3392          return (e->Iex.Unop.op == Iop_V256toV128_1) ? vHi : vLo;
3393       }
3394
3395       case Iop_F16toF32x4: {
3396          if (env->hwcaps & VEX_HWCAPS_AMD64_F16C) {
3397             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
3398             HReg dst = newVRegV(env);
3399             addInstr(env, AMD64Instr_SseMOVQ(src, dst, /*toXMM=*/True));
3400             addInstr(env, AMD64Instr_Sse32Fx4(Asse_F16toF32, dst, dst));
3401             return dst;
3402          }
3403          break;
3404       }
3405
3406       default:
3407          break;
3408    } /* switch (e->Iex.Unop.op) */
3409    } /* if (e->tag == Iex_Unop) */
3410
3411    if (e->tag == Iex_Binop) {
3412    switch (e->Iex.Binop.op) {
3413
3414       case Iop_Sqrt64Fx2:
3415       case Iop_Sqrt32Fx4: {
3416          /* :: (rmode, vec) -> vec */
3417          HReg arg = iselVecExpr(env, e->Iex.Binop.arg2);
3418          HReg dst = newVRegV(env);
3419          /* XXXROUNDINGFIXME */
3420          /* set roundingmode here */
3421          addInstr(env, (e->Iex.Binop.op == Iop_Sqrt64Fx2
3422                            ? AMD64Instr_Sse64Fx2 : AMD64Instr_Sse32Fx4)
3423                        (Asse_SQRTF, arg, dst));
3424          return dst;
3425       }
3426
3427       /* FIXME: could we generate MOVQ here? */
3428       case Iop_SetV128lo64: {
3429          HReg dst  = newVRegV(env);
3430          HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
3431          HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
3432          AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
3433          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16));
3434          addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, AMD64RI_Reg(srcI), rsp_m16));
3435          addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16));
3436          return dst;
3437       }
3438
3439       /* FIXME: could we generate MOVD here? */
3440       case Iop_SetV128lo32: {
3441          HReg dst  = newVRegV(env);
3442          HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
3443          HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
3444          AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
3445          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16));
3446          addInstr(env, AMD64Instr_Store(4, srcI, rsp_m16));
3447          addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16));
3448          return dst;
3449       }
3450
3451       case Iop_64HLtoV128: {
3452          const IRExpr* arg1 = e->Iex.Binop.arg1;
3453          const IRExpr* arg2 = e->Iex.Binop.arg2;
3454          HReg dst = newVRegV(env);
3455          HReg tmp = newVRegV(env);
3456          HReg qHi = iselIntExpr_R(env, arg1);
3457          // If the args are trivially the same (tmp or const), use the same
3458          // source register for both, and only one movq since those are
3459          // (relatively) expensive.
3460          if (areAtomsAndEqual(arg1, arg2)) {
3461             addInstr(env, AMD64Instr_SseMOVQ(qHi, dst, True/*toXMM*/));
3462             addInstr(env, mk_vMOVsd_RR(dst, tmp));
3463             addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst));
3464             addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
3465          } else {
3466             HReg qLo = iselIntExpr_R(env, arg2);
3467             addInstr(env, AMD64Instr_SseMOVQ(qHi, dst, True/*toXMM*/));
3468             addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst));
3469             addInstr(env, AMD64Instr_SseMOVQ(qLo, tmp, True/*toXMM*/));
3470             addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
3471          }
3472          return dst;
3473       }
3474
3475       case Iop_CmpEQ32Fx4: op = Asse_CMPEQF; goto do_32Fx4;
3476       case Iop_CmpLT32Fx4: op = Asse_CMPLTF; goto do_32Fx4;
3477       case Iop_CmpLE32Fx4: op = Asse_CMPLEF; goto do_32Fx4;
3478       case Iop_CmpUN32Fx4: op = Asse_CMPUNF; goto do_32Fx4;
3479       case Iop_Max32Fx4:   op = Asse_MAXF;   goto do_32Fx4;
3480       case Iop_Min32Fx4:   op = Asse_MINF;   goto do_32Fx4;
3481       do_32Fx4:
3482       {
3483          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3484          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3485          HReg dst = newVRegV(env);
3486          addInstr(env, mk_vMOVsd_RR(argL, dst));
3487          addInstr(env, AMD64Instr_Sse32Fx4(op, argR, dst));
3488          return dst;
3489       }
3490
3491       case Iop_CmpEQ64Fx2: op = Asse_CMPEQF; goto do_64Fx2;
3492       case Iop_CmpLT64Fx2: op = Asse_CMPLTF; goto do_64Fx2;
3493       case Iop_CmpLE64Fx2: op = Asse_CMPLEF; goto do_64Fx2;
3494       case Iop_CmpUN64Fx2: op = Asse_CMPUNF; goto do_64Fx2;
3495       case Iop_Max64Fx2:   op = Asse_MAXF;   goto do_64Fx2;
3496       case Iop_Min64Fx2:   op = Asse_MINF;   goto do_64Fx2;
3497       do_64Fx2:
3498       {
3499          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3500          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3501          HReg dst = newVRegV(env);
3502          addInstr(env, mk_vMOVsd_RR(argL, dst));
3503          addInstr(env, AMD64Instr_Sse64Fx2(op, argR, dst));
3504          return dst;
3505       }
3506
3507       case Iop_CmpEQ32F0x4: op = Asse_CMPEQF; goto do_32F0x4;
3508       case Iop_CmpLT32F0x4: op = Asse_CMPLTF; goto do_32F0x4;
3509       case Iop_CmpLE32F0x4: op = Asse_CMPLEF; goto do_32F0x4;
3510       case Iop_CmpUN32F0x4: op = Asse_CMPUNF; goto do_32F0x4;
3511       case Iop_Add32F0x4:   op = Asse_ADDF;   goto do_32F0x4;
3512       case Iop_Div32F0x4:   op = Asse_DIVF;   goto do_32F0x4;
3513       case Iop_Max32F0x4:   op = Asse_MAXF;   goto do_32F0x4;
3514       case Iop_Min32F0x4:   op = Asse_MINF;   goto do_32F0x4;
3515       case Iop_Mul32F0x4:   op = Asse_MULF;   goto do_32F0x4;
3516       case Iop_Sub32F0x4:   op = Asse_SUBF;   goto do_32F0x4;
3517       do_32F0x4: {
3518          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3519          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3520          HReg dst = newVRegV(env);
3521          addInstr(env, mk_vMOVsd_RR(argL, dst));
3522          addInstr(env, AMD64Instr_Sse32FLo(op, argR, dst));
3523          return dst;
3524       }
3525
3526       case Iop_CmpEQ64F0x2: op = Asse_CMPEQF; goto do_64F0x2;
3527       case Iop_CmpLT64F0x2: op = Asse_CMPLTF; goto do_64F0x2;
3528       case Iop_CmpLE64F0x2: op = Asse_CMPLEF; goto do_64F0x2;
3529       case Iop_CmpUN64F0x2: op = Asse_CMPUNF; goto do_64F0x2;
3530       case Iop_Add64F0x2:   op = Asse_ADDF;   goto do_64F0x2;
3531       case Iop_Div64F0x2:   op = Asse_DIVF;   goto do_64F0x2;
3532       case Iop_Max64F0x2:   op = Asse_MAXF;   goto do_64F0x2;
3533       case Iop_Min64F0x2:   op = Asse_MINF;   goto do_64F0x2;
3534       case Iop_Mul64F0x2:   op = Asse_MULF;   goto do_64F0x2;
3535       case Iop_Sub64F0x2:   op = Asse_SUBF;   goto do_64F0x2;
3536       do_64F0x2: {
3537          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3538          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3539          HReg dst = newVRegV(env);
3540          addInstr(env, mk_vMOVsd_RR(argL, dst));
3541          addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
3542          return dst;
3543       }
3544
3545       case Iop_PermOrZero8x16:
3546          if (env->hwcaps & VEX_HWCAPS_AMD64_SSSE3) {
3547             op = Asse_PSHUFB;
3548             goto do_SseReRg;
3549          }
3550          // Otherwise we'll have to generate a call to
3551          // h_generic_calc_PermOrZero8x16 (ATK).  But that would only be for a
3552          // host which doesn't have SSSE3, in which case we don't expect this
3553          // IROp to enter the compilation pipeline in the first place.
3554          break;
3555
3556       case Iop_PwExtUSMulQAdd8x16:
3557          if (env->hwcaps & VEX_HWCAPS_AMD64_SSSE3) {
3558             op = Asse_PMADDUBSW;
3559             goto do_SseReRg;
3560          }
3561          break;
3562
3563       case Iop_QNarrowBin32Sto16Sx8:
3564          op = Asse_PACKSSD; arg1isEReg = True; goto do_SseReRg;
3565       case Iop_QNarrowBin16Sto8Sx16:
3566          op = Asse_PACKSSW; arg1isEReg = True; goto do_SseReRg;
3567       case Iop_QNarrowBin16Sto8Ux16:
3568          op = Asse_PACKUSW; arg1isEReg = True; goto do_SseReRg;
3569
3570       case Iop_InterleaveHI8x16:
3571          op = Asse_UNPCKHB; arg1isEReg = True; goto do_SseReRg;
3572       case Iop_InterleaveHI16x8:
3573          op = Asse_UNPCKHW; arg1isEReg = True; goto do_SseReRg;
3574       case Iop_InterleaveHI32x4:
3575          op = Asse_UNPCKHD; arg1isEReg = True; goto do_SseReRg;
3576       case Iop_InterleaveHI64x2:
3577          op = Asse_UNPCKHQ; arg1isEReg = True; goto do_SseReRg;
3578
3579       case Iop_InterleaveLO8x16:
3580          op = Asse_UNPCKLB; arg1isEReg = True; goto do_SseReRg;
3581       case Iop_InterleaveLO16x8:
3582          op = Asse_UNPCKLW; arg1isEReg = True; goto do_SseReRg;
3583       case Iop_InterleaveLO32x4:
3584          op = Asse_UNPCKLD; arg1isEReg = True; goto do_SseReRg;
3585       case Iop_InterleaveLO64x2:
3586          op = Asse_UNPCKLQ; arg1isEReg = True; goto do_SseReRg;
3587
3588       case Iop_AndV128:    op = Asse_AND;      goto do_SseReRg;
3589       case Iop_OrV128:     op = Asse_OR;       goto do_SseReRg;
3590       case Iop_XorV128:    op = Asse_XOR;      goto do_SseReRg;
3591       case Iop_Add8x16:    op = Asse_ADD8;     goto do_SseReRg;
3592       case Iop_Add16x8:    op = Asse_ADD16;    goto do_SseReRg;
3593       case Iop_Add32x4:    op = Asse_ADD32;    goto do_SseReRg;
3594       case Iop_Add64x2:    op = Asse_ADD64;    goto do_SseReRg;
3595       case Iop_QAdd8Sx16:  op = Asse_QADD8S;   goto do_SseReRg;
3596       case Iop_QAdd16Sx8:  op = Asse_QADD16S;  goto do_SseReRg;
3597       case Iop_QAdd8Ux16:  op = Asse_QADD8U;   goto do_SseReRg;
3598       case Iop_QAdd16Ux8:  op = Asse_QADD16U;  goto do_SseReRg;
3599       case Iop_Avg8Ux16:   op = Asse_AVG8U;    goto do_SseReRg;
3600       case Iop_Avg16Ux8:   op = Asse_AVG16U;   goto do_SseReRg;
3601       case Iop_CmpEQ8x16:  op = Asse_CMPEQ8;   goto do_SseReRg;
3602       case Iop_CmpEQ16x8:  op = Asse_CMPEQ16;  goto do_SseReRg;
3603       case Iop_CmpEQ32x4:  op = Asse_CMPEQ32;  goto do_SseReRg;
3604       case Iop_CmpGT8Sx16: op = Asse_CMPGT8S;  goto do_SseReRg;
3605       case Iop_CmpGT16Sx8: op = Asse_CMPGT16S; goto do_SseReRg;
3606       case Iop_CmpGT32Sx4: op = Asse_CMPGT32S; goto do_SseReRg;
3607       case Iop_Max16Sx8:   op = Asse_MAX16S;   goto do_SseReRg;
3608       case Iop_Max8Ux16:   op = Asse_MAX8U;    goto do_SseReRg;
3609       case Iop_Min16Sx8:   op = Asse_MIN16S;   goto do_SseReRg;
3610       case Iop_Min8Ux16:   op = Asse_MIN8U;    goto do_SseReRg;
3611       case Iop_MulHi16Ux8: op = Asse_MULHI16U; goto do_SseReRg;
3612       case Iop_MulHi16Sx8: op = Asse_MULHI16S; goto do_SseReRg;
3613       case Iop_Mul16x8:    op = Asse_MUL16;    goto do_SseReRg;
3614       case Iop_Sub8x16:    op = Asse_SUB8;     goto do_SseReRg;
3615       case Iop_Sub16x8:    op = Asse_SUB16;    goto do_SseReRg;
3616       case Iop_Sub32x4:    op = Asse_SUB32;    goto do_SseReRg;
3617       case Iop_Sub64x2:    op = Asse_SUB64;    goto do_SseReRg;
3618       case Iop_QSub8Sx16:  op = Asse_QSUB8S;   goto do_SseReRg;
3619       case Iop_QSub16Sx8:  op = Asse_QSUB16S;  goto do_SseReRg;
3620       case Iop_QSub8Ux16:  op = Asse_QSUB8U;   goto do_SseReRg;
3621       case Iop_QSub16Ux8:  op = Asse_QSUB16U;  goto do_SseReRg;
3622       do_SseReRg: {
3623          HReg arg1 = iselVecExpr(env, e->Iex.Binop.arg1);
3624          HReg arg2 = iselVecExpr(env, e->Iex.Binop.arg2);
3625          HReg dst = newVRegV(env);
3626          if (arg1isEReg) {
3627             addInstr(env, mk_vMOVsd_RR(arg2, dst));
3628             addInstr(env, AMD64Instr_SseReRg(op, arg1, dst));
3629          } else {
3630             addInstr(env, mk_vMOVsd_RR(arg1, dst));
3631             addInstr(env, AMD64Instr_SseReRg(op, arg2, dst));
3632          }
3633          return dst;
3634       }
3635
3636       case Iop_ShlN16x8: laneBits = 16; op = Asse_SHL16; goto do_SseShift;
3637       case Iop_ShlN32x4: laneBits = 32; op = Asse_SHL32; goto do_SseShift;
3638       case Iop_ShlN64x2: laneBits = 64; op = Asse_SHL64; goto do_SseShift;
3639       case Iop_SarN16x8: laneBits = 16; op = Asse_SAR16; goto do_SseShift;
3640       case Iop_SarN32x4: laneBits = 32; op = Asse_SAR32; goto do_SseShift;
3641       case Iop_ShrN16x8: laneBits = 16; op = Asse_SHR16; goto do_SseShift;
3642       case Iop_ShrN32x4: laneBits = 32; op = Asse_SHR32; goto do_SseShift;
3643       case Iop_ShrN64x2: laneBits = 64; op = Asse_SHR64; goto do_SseShift;
3644       do_SseShift: {
3645          HReg dst  = newVRegV(env);
3646          HReg greg = iselVecExpr(env, e->Iex.Binop.arg1);
3647          /* If it's a shift by an in-range immediate, generate a single
3648             instruction. */
3649          if (e->Iex.Binop.arg2->tag == Iex_Const) {
3650             IRConst* c = e->Iex.Binop.arg2->Iex.Const.con;
3651             vassert(c->tag == Ico_U8);
3652             UInt shift = c->Ico.U8;
3653             if (shift < laneBits) {
3654                addInstr(env, mk_vMOVsd_RR(greg, dst));
3655                addInstr(env, AMD64Instr_SseShiftN(op, shift, dst));
3656                return dst;
3657             }
3658          }
3659          /* Otherwise we have to do it the longwinded way. */
3660          AMD64RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
3661          AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
3662          HReg        ereg = newVRegV(env);
3663          addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
3664          addInstr(env, AMD64Instr_Push(rmi));
3665          addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
3666          addInstr(env, mk_vMOVsd_RR(greg, dst));
3667          addInstr(env, AMD64Instr_SseReRg(op, ereg, dst));
3668          add_to_rsp(env, 16);
3669          return dst;
3670       }
3671
3672       case Iop_Mul32x4:    fn = (HWord)h_generic_calc_Mul32x4;
3673                            goto do_SseAssistedBinary;
3674       case Iop_Max32Sx4:   fn = (HWord)h_generic_calc_Max32Sx4;
3675                            goto do_SseAssistedBinary;
3676       case Iop_Min32Sx4:   fn = (HWord)h_generic_calc_Min32Sx4;
3677                            goto do_SseAssistedBinary;
3678       case Iop_Max32Ux4:   fn = (HWord)h_generic_calc_Max32Ux4;
3679                            goto do_SseAssistedBinary;
3680       case Iop_Min32Ux4:   fn = (HWord)h_generic_calc_Min32Ux4;
3681                            goto do_SseAssistedBinary;
3682       case Iop_Max16Ux8:   fn = (HWord)h_generic_calc_Max16Ux8;
3683                            goto do_SseAssistedBinary;
3684       case Iop_Min16Ux8:   fn = (HWord)h_generic_calc_Min16Ux8;
3685                            goto do_SseAssistedBinary;
3686       case Iop_Max8Sx16:   fn = (HWord)h_generic_calc_Max8Sx16;
3687                            goto do_SseAssistedBinary;
3688       case Iop_Min8Sx16:   fn = (HWord)h_generic_calc_Min8Sx16;
3689                            goto do_SseAssistedBinary;
3690       case Iop_CmpEQ64x2:  fn = (HWord)h_generic_calc_CmpEQ64x2;
3691                            goto do_SseAssistedBinary;
3692       case Iop_CmpGT64Sx2: fn = (HWord)h_generic_calc_CmpGT64Sx2;
3693                            goto do_SseAssistedBinary;
3694       case Iop_Perm32x4:   fn = (HWord)h_generic_calc_Perm32x4;
3695                            goto do_SseAssistedBinary;
3696       case Iop_QNarrowBin32Sto16Ux8:
3697                            fn = (HWord)h_generic_calc_QNarrowBin32Sto16Ux8;
3698                            goto do_SseAssistedBinary;
3699       case Iop_NarrowBin16to8x16:
3700                            fn = (HWord)h_generic_calc_NarrowBin16to8x16;
3701                            goto do_SseAssistedBinary;
3702       case Iop_NarrowBin32to16x8:
3703                            fn = (HWord)h_generic_calc_NarrowBin32to16x8;
3704                            goto do_SseAssistedBinary;
3705       do_SseAssistedBinary: {
3706          /* RRRufff!  RRRufff code is what we're generating here.  Oh
3707             well. */
3708          vassert(fn != 0);
3709          HReg dst = newVRegV(env);
3710          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3711          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
3712          HReg argp = newVRegI(env);
3713          /* subq $112, %rsp         -- make a space*/
3714          sub_from_rsp(env, 112);
3715          /* leaq 48(%rsp), %r_argp  -- point into it */
3716          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
3717                                         argp));
3718          /* andq $-16, %r_argp      -- 16-align the pointer */
3719          addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
3720                                          AMD64RMI_Imm( ~(UInt)15 ),
3721                                          argp));
3722          /* Prepare 3 arg regs:
3723             leaq 0(%r_argp), %rdi
3724             leaq 16(%r_argp), %rsi
3725             leaq 32(%r_argp), %rdx
3726          */
3727          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
3728                                         hregAMD64_RDI()));
3729          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
3730                                         hregAMD64_RSI()));
3731          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
3732                                         hregAMD64_RDX()));
3733          /* Store the two args, at (%rsi) and (%rdx):
3734             movupd  %argL, 0(%rsi)
3735             movupd  %argR, 0(%rdx)
3736          */
3737          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
3738                                           AMD64AMode_IR(0, hregAMD64_RSI())));
3739          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argR,
3740                                           AMD64AMode_IR(0, hregAMD64_RDX())));
3741          /* call the helper */
3742          addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
3743                                         3, mk_RetLoc_simple(RLPri_None) ));
3744          /* fetch the result from memory, using %r_argp, which the
3745             register allocator will keep alive across the call. */
3746          addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
3747                                           AMD64AMode_IR(0, argp)));
3748          /* and finally, clear the space */
3749          add_to_rsp(env, 112);
3750          return dst;
3751       }
3752
3753       case Iop_SarN64x2: fn = (HWord)h_generic_calc_SarN64x2;
3754                          goto do_SseAssistedVectorAndScalar;
3755       case Iop_SarN8x16: fn = (HWord)h_generic_calc_SarN8x16;
3756                          goto do_SseAssistedVectorAndScalar;
3757       do_SseAssistedVectorAndScalar: {
3758          /* RRRufff!  RRRufff code is what we're generating here.  Oh
3759             well. */
3760          vassert(fn != 0);
3761          HReg dst = newVRegV(env);
3762          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
3763          HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
3764          HReg argp = newVRegI(env);
3765          /* subq $112, %rsp         -- make a space*/
3766          sub_from_rsp(env, 112);
3767          /* leaq 48(%rsp), %r_argp  -- point into it */
3768          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
3769                                         argp));
3770          /* andq $-16, %r_argp      -- 16-align the pointer */
3771          addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
3772                                          AMD64RMI_Imm( ~(UInt)15 ),
3773                                          argp));
3774          /* Prepare 2 vector arg regs:
3775             leaq 0(%r_argp), %rdi
3776             leaq 16(%r_argp), %rsi
3777          */
3778          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
3779                                         hregAMD64_RDI()));
3780          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
3781                                         hregAMD64_RSI()));
3782          /* Store the vector arg, at (%rsi):
3783             movupd  %argL, 0(%rsi)
3784          */
3785          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
3786                                           AMD64AMode_IR(0, hregAMD64_RSI())));
3787          /* And get the scalar value into rdx */
3788          addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RDX()));
3789
3790          /* call the helper */
3791          addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
3792                                         3, mk_RetLoc_simple(RLPri_None) ));
3793          /* fetch the result from memory, using %r_argp, which the
3794             register allocator will keep alive across the call. */
3795          addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
3796                                           AMD64AMode_IR(0, argp)));
3797          /* and finally, clear the space */
3798          add_to_rsp(env, 112);
3799          return dst;
3800       }
3801
3802       case Iop_I32StoF32x4:
3803       case Iop_F32toI32Sx4: {
3804          HReg arg = iselVecExpr(env, e->Iex.Binop.arg2);
3805          HReg dst = newVRegV(env);
3806          AMD64SseOp mop
3807             = e->Iex.Binop.op == Iop_I32StoF32x4 ? Asse_I2F : Asse_F2I;
3808          set_SSE_rounding_mode(env, e->Iex.Binop.arg1);
3809          addInstr(env, AMD64Instr_Sse32Fx4(mop, arg, dst));
3810          set_SSE_rounding_default(env);
3811          return dst;
3812       }
3813
3814       // Half-float vector conversion
3815       case Iop_F32toF16x8: {
3816          if (env->hwcaps & VEX_HWCAPS_AMD64_F16C) {
3817             HReg srcHi, srcLo;
3818             iselDVecExpr(&srcHi, &srcLo, env, e->Iex.Binop.arg2);
3819             HReg dstHi = newVRegV(env);
3820             HReg dstLo = newVRegV(env);
3821             set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
3822             addInstr(env, AMD64Instr_Sse32Fx4(Asse_F32toF16, srcHi, dstHi));
3823             addInstr(env, AMD64Instr_Sse32Fx4(Asse_F32toF16, srcLo, dstLo));
3824             set_SSE_rounding_default(env);
3825             // Now we have the result in dstHi[63:0] and dstLo[63:0], but we
3826             // need to compact all that into one register.  There's probably a
3827             // more elegant way to do this, but ..
3828             addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstHi));
3829             // dstHi is now 127:64 = useful data, 63:0 = zero
3830             addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstLo));
3831             addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, dstLo));
3832             // dstLo is now 127:64 = zero, 63:0 = useful data
3833             addInstr(env, AMD64Instr_SseReRg(Asse_OR, dstHi, dstLo));
3834             return dstLo;
3835          }
3836          break;
3837       }
3838
3839       default:
3840          break;
3841    } /* switch (e->Iex.Binop.op) */
3842    } /* if (e->tag == Iex_Binop) */
3843
3844    if (e->tag == Iex_Triop) {
3845    IRTriop *triop = e->Iex.Triop.details;
3846    switch (triop->op) {
3847
3848       case Iop_Add64Fx2: op = Asse_ADDF; goto do_64Fx2_w_rm;
3849       case Iop_Sub64Fx2: op = Asse_SUBF; goto do_64Fx2_w_rm;
3850       case Iop_Mul64Fx2: op = Asse_MULF; goto do_64Fx2_w_rm;
3851       case Iop_Div64Fx2: op = Asse_DIVF; goto do_64Fx2_w_rm;
3852       do_64Fx2_w_rm:
3853       {
3854          HReg argL = iselVecExpr(env, triop->arg2);
3855          HReg argR = iselVecExpr(env, triop->arg3);
3856          HReg dst = newVRegV(env);
3857          addInstr(env, mk_vMOVsd_RR(argL, dst));
3858          /* XXXROUNDINGFIXME */
3859          /* set roundingmode here */
3860          addInstr(env, AMD64Instr_Sse64Fx2(op, argR, dst));
3861          return dst;
3862       }
3863
3864       case Iop_Add32Fx4: op = Asse_ADDF; goto do_32Fx4_w_rm;
3865       case Iop_Sub32Fx4: op = Asse_SUBF; goto do_32Fx4_w_rm;
3866       case Iop_Mul32Fx4: op = Asse_MULF; goto do_32Fx4_w_rm;
3867       case Iop_Div32Fx4: op = Asse_DIVF; goto do_32Fx4_w_rm;
3868       do_32Fx4_w_rm:
3869       {
3870          HReg argL = iselVecExpr(env, triop->arg2);
3871          HReg argR = iselVecExpr(env, triop->arg3);
3872          HReg dst = newVRegV(env);
3873          addInstr(env, mk_vMOVsd_RR(argL, dst));
3874          /* XXXROUNDINGFIXME */
3875          /* set roundingmode here */
3876          addInstr(env, AMD64Instr_Sse32Fx4(op, argR, dst));
3877          return dst;
3878       }
3879
3880       default:
3881          break;
3882    } /* switch (triop->op) */
3883    } /* if (e->tag == Iex_Triop) */
3884
3885    if (e->tag == Iex_ITE) { // VFD
3886       HReg r1  = iselVecExpr(env, e->Iex.ITE.iftrue);
3887       HReg r0  = iselVecExpr(env, e->Iex.ITE.iffalse);
3888       HReg dst = newVRegV(env);
3889       addInstr(env, mk_vMOVsd_RR(r1,dst));
3890       AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
3891       addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0, dst));
3892       return dst;
3893    }
3894
3895    //vec_fail:
3896    vex_printf("iselVecExpr (amd64, subarch = %s): can't reduce\n",
3897               LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps));
3898    ppIRExpr(e);
3899    vpanic("iselVecExpr_wrk");
3900 }
3901
3902
3903 /*---------------------------------------------------------*/
3904 /*--- ISEL: SIMD (V256) expressions, into 2 XMM regs.    --*/
3905 /*---------------------------------------------------------*/
3906
3907 static void iselDVecExpr ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
3908                            ISelEnv* env, const IRExpr* e )
3909 {
3910    iselDVecExpr_wrk( rHi, rLo, env, e );
3911 #  if 0
3912    vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
3913 #  endif
3914    vassert(hregClass(*rHi) == HRcVec128);
3915    vassert(hregClass(*rLo) == HRcVec128);
3916    vassert(hregIsVirtual(*rHi));
3917    vassert(hregIsVirtual(*rLo));
3918 }
3919
3920
3921 /* DO NOT CALL THIS DIRECTLY */
3922 static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
3923                                ISelEnv* env, const IRExpr* e )
3924 {
3925    HWord fn = 0; /* address of helper fn, if required */
3926    vassert(e);
3927    IRType ty = typeOfIRExpr(env->type_env, e);
3928    vassert(ty == Ity_V256);
3929    UInt laneBits = 0;
3930
3931    AMD64SseOp op = Asse_INVALID;
3932
3933    /* read 256-bit IRTemp */
3934    if (e->tag == Iex_RdTmp) {
3935       lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp);
3936       return;
3937    }
3938
3939    if (e->tag == Iex_Get) {
3940       HReg        vHi  = newVRegV(env);
3941       HReg        vLo  = newVRegV(env);
3942       HReg        rbp  = hregAMD64_RBP();
3943       AMD64AMode* am0  = AMD64AMode_IR(e->Iex.Get.offset + 0,  rbp);
3944       AMD64AMode* am16 = AMD64AMode_IR(e->Iex.Get.offset + 16, rbp);
3945       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, am0));
3946       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, am16));
3947       *rHi = vHi;
3948       *rLo = vLo;
3949       return;
3950    }
3951
3952    if (e->tag == Iex_Load) {
3953       HReg        vHi  = newVRegV(env);
3954       HReg        vLo  = newVRegV(env);
3955       HReg        rA   = iselIntExpr_R(env, e->Iex.Load.addr);
3956       AMD64AMode* am0  = AMD64AMode_IR(0,  rA);
3957       AMD64AMode* am16 = AMD64AMode_IR(16, rA);
3958       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, am0));
3959       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, am16));
3960       *rHi = vHi;
3961       *rLo = vLo;
3962       return;
3963    }
3964
3965    if (e->tag == Iex_Const) {
3966       vassert(e->Iex.Const.con->tag == Ico_V256);
3967       switch (e->Iex.Const.con->Ico.V256) {
3968          case 0x00000000: {
3969             HReg vHi = generate_zeroes_V128(env);
3970             HReg vLo = newVRegV(env);
3971             addInstr(env, mk_vMOVsd_RR(vHi, vLo));
3972             *rHi = vHi;
3973             *rLo = vLo;
3974             return;
3975          }
3976          default:
3977             break; /* give up.   Until such time as is necessary. */
3978       }
3979    }
3980
3981    if (e->tag == Iex_Unop) {
3982    switch (e->Iex.Unop.op) {
3983
3984       case Iop_NotV256: {
3985          HReg argHi, argLo;
3986          iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
3987          *rHi = do_sse_NotV128(env, argHi);
3988          *rLo = do_sse_NotV128(env, argLo);
3989          return;
3990       }
3991
3992       case Iop_RecipEst32Fx8: op = Asse_RCPF;   goto do_32Fx8_unary;
3993       case Iop_Sqrt32Fx8:     op = Asse_SQRTF;  goto do_32Fx8_unary;
3994       case Iop_RSqrtEst32Fx8: op = Asse_RSQRTF; goto do_32Fx8_unary;
3995       do_32Fx8_unary:
3996       {
3997          HReg argHi, argLo;
3998          iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
3999          HReg dstHi = newVRegV(env);
4000          HReg dstLo = newVRegV(env);
4001          addInstr(env, AMD64Instr_Sse32Fx4(op, argHi, dstHi));
4002          addInstr(env, AMD64Instr_Sse32Fx4(op, argLo, dstLo));
4003          *rHi = dstHi;
4004          *rLo = dstLo;
4005          return;
4006       }
4007
4008       case Iop_Sqrt64Fx4:  op = Asse_SQRTF;  goto do_64Fx4_unary;
4009       do_64Fx4_unary:
4010       {
4011          HReg argHi, argLo;
4012          iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
4013          HReg dstHi = newVRegV(env);
4014          HReg dstLo = newVRegV(env);
4015          addInstr(env, AMD64Instr_Sse64Fx2(op, argHi, dstHi));
4016          addInstr(env, AMD64Instr_Sse64Fx2(op, argLo, dstLo));
4017          *rHi = dstHi;
4018          *rLo = dstLo;
4019          return;
4020       }
4021
4022       case Iop_CmpNEZ64x4: {
4023          /* We can use SSE2 instructions for this. */
4024          /* Same scheme as Iop_CmpNEZ64x2, except twice as wide
4025             (obviously).  See comment on Iop_CmpNEZ64x2 for
4026             explanation of what's going on here. */
4027          HReg argHi, argLo;
4028          iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
4029          HReg tmpHi  = generate_zeroes_V128(env);
4030          HReg tmpLo  = newVRegV(env);
4031          addInstr(env, mk_vMOVsd_RR(tmpHi, tmpLo));
4032          HReg dstHi  = newVRegV(env);
4033          HReg dstLo  = newVRegV(env);
4034          addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, argHi, tmpHi));
4035          addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, argLo, tmpLo));
4036          tmpHi = do_sse_NotV128(env, tmpHi);
4037          tmpLo = do_sse_NotV128(env, tmpLo);
4038          addInstr(env, AMD64Instr_SseShuf(0xB1, tmpHi, dstHi));
4039          addInstr(env, AMD64Instr_SseShuf(0xB1, tmpLo, dstLo));
4040          addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmpHi, dstHi));
4041          addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmpLo, dstLo));
4042          *rHi = dstHi;
4043          *rLo = dstLo;
4044          return;
4045       }
4046
4047       case Iop_CmpNEZ32x8: op = Asse_CMPEQ32; goto do_CmpNEZ_vector;
4048       case Iop_CmpNEZ16x16: op = Asse_CMPEQ16; goto do_CmpNEZ_vector;
4049       case Iop_CmpNEZ8x32: op = Asse_CMPEQ8;  goto do_CmpNEZ_vector;
4050       do_CmpNEZ_vector:
4051       {
4052          HReg argHi, argLo;
4053          iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
4054          HReg tmpHi = newVRegV(env);
4055          HReg tmpLo = newVRegV(env);
4056          HReg zero  = generate_zeroes_V128(env);
4057          HReg dstHi, dstLo;
4058          addInstr(env, mk_vMOVsd_RR(argHi, tmpHi));
4059          addInstr(env, mk_vMOVsd_RR(argLo, tmpLo));
4060          addInstr(env, AMD64Instr_SseReRg(op, zero, tmpHi));
4061          addInstr(env, AMD64Instr_SseReRg(op, zero, tmpLo));
4062          dstHi = do_sse_NotV128(env, tmpHi);
4063          dstLo = do_sse_NotV128(env, tmpLo);
4064          *rHi = dstHi;
4065          *rLo = dstLo;
4066          return;
4067       }
4068
4069       case Iop_F16toF32x8: {
4070          if (env->hwcaps & VEX_HWCAPS_AMD64_F16C) {
4071             HReg src     = iselVecExpr(env, e->Iex.Unop.arg);
4072             HReg srcCopy = newVRegV(env);
4073             HReg dstHi   = newVRegV(env);
4074             HReg dstLo   = newVRegV(env);
4075             // Copy src, since we'll need to modify it.
4076             addInstr(env, mk_vMOVsd_RR(src, srcCopy));
4077             addInstr(env, AMD64Instr_Sse32Fx4(Asse_F16toF32, srcCopy, dstLo));
4078             addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, srcCopy));
4079             addInstr(env, AMD64Instr_Sse32Fx4(Asse_F16toF32, srcCopy, dstHi));
4080             *rHi = dstHi;
4081             *rLo = dstLo;
4082             return;
4083          }
4084          break;
4085       }
4086
4087       default:
4088          break;
4089    } /* switch (e->Iex.Unop.op) */
4090    } /* if (e->tag == Iex_Unop) */
4091
4092    if (e->tag == Iex_Binop) {
4093    switch (e->Iex.Binop.op) {
4094
4095       case Iop_Max64Fx4:   op = Asse_MAXF;   goto do_64Fx4;
4096       case Iop_Min64Fx4:   op = Asse_MINF;   goto do_64Fx4;
4097       do_64Fx4:
4098       {
4099          HReg argLhi, argLlo, argRhi, argRlo;
4100          iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4101          iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4102          HReg dstHi = newVRegV(env);
4103          HReg dstLo = newVRegV(env);
4104          addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4105          addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4106          addInstr(env, AMD64Instr_Sse64Fx2(op, argRhi, dstHi));
4107          addInstr(env, AMD64Instr_Sse64Fx2(op, argRlo, dstLo));
4108          *rHi = dstHi;
4109          *rLo = dstLo;
4110          return;
4111       }
4112
4113       case Iop_Max32Fx8:   op = Asse_MAXF;   goto do_32Fx8;
4114       case Iop_Min32Fx8:   op = Asse_MINF;   goto do_32Fx8;
4115       do_32Fx8:
4116       {
4117          HReg argLhi, argLlo, argRhi, argRlo;
4118          iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4119          iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4120          HReg dstHi = newVRegV(env);
4121          HReg dstLo = newVRegV(env);
4122          addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4123          addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4124          addInstr(env, AMD64Instr_Sse32Fx4(op, argRhi, dstHi));
4125          addInstr(env, AMD64Instr_Sse32Fx4(op, argRlo, dstLo));
4126          *rHi = dstHi;
4127          *rLo = dstLo;
4128          return;
4129       }
4130
4131       case Iop_AndV256:    op = Asse_AND;      goto do_SseReRg;
4132       case Iop_OrV256:     op = Asse_OR;       goto do_SseReRg;
4133       case Iop_XorV256:    op = Asse_XOR;      goto do_SseReRg;
4134       case Iop_Add8x32:    op = Asse_ADD8;     goto do_SseReRg;
4135       case Iop_Add16x16:   op = Asse_ADD16;    goto do_SseReRg;
4136       case Iop_Add32x8:    op = Asse_ADD32;    goto do_SseReRg;
4137       case Iop_Add64x4:    op = Asse_ADD64;    goto do_SseReRg;
4138       case Iop_QAdd8Sx32:  op = Asse_QADD8S;   goto do_SseReRg;
4139       case Iop_QAdd16Sx16: op = Asse_QADD16S;  goto do_SseReRg;
4140       case Iop_QAdd8Ux32:  op = Asse_QADD8U;   goto do_SseReRg;
4141       case Iop_QAdd16Ux16: op = Asse_QADD16U;  goto do_SseReRg;
4142       case Iop_Avg8Ux32:   op = Asse_AVG8U;    goto do_SseReRg;
4143       case Iop_Avg16Ux16:  op = Asse_AVG16U;   goto do_SseReRg;
4144       case Iop_CmpEQ8x32:  op = Asse_CMPEQ8;   goto do_SseReRg;
4145       case Iop_CmpEQ16x16: op = Asse_CMPEQ16;  goto do_SseReRg;
4146       case Iop_CmpEQ32x8:  op = Asse_CMPEQ32;  goto do_SseReRg;
4147       case Iop_CmpGT8Sx32: op = Asse_CMPGT8S;  goto do_SseReRg;
4148       case Iop_CmpGT16Sx16: op = Asse_CMPGT16S; goto do_SseReRg;
4149       case Iop_CmpGT32Sx8: op = Asse_CMPGT32S; goto do_SseReRg;
4150       case Iop_Max16Sx16:  op = Asse_MAX16S;   goto do_SseReRg;
4151       case Iop_Max8Ux32:   op = Asse_MAX8U;    goto do_SseReRg;
4152       case Iop_Min16Sx16:  op = Asse_MIN16S;   goto do_SseReRg;
4153       case Iop_Min8Ux32:   op = Asse_MIN8U;    goto do_SseReRg;
4154       case Iop_MulHi16Ux16: op = Asse_MULHI16U; goto do_SseReRg;
4155       case Iop_MulHi16Sx16: op = Asse_MULHI16S; goto do_SseReRg;
4156       case Iop_Mul16x16:   op = Asse_MUL16;    goto do_SseReRg;
4157       case Iop_Sub8x32:    op = Asse_SUB8;     goto do_SseReRg;
4158       case Iop_Sub16x16:   op = Asse_SUB16;    goto do_SseReRg;
4159       case Iop_Sub32x8:    op = Asse_SUB32;    goto do_SseReRg;
4160       case Iop_Sub64x4:    op = Asse_SUB64;    goto do_SseReRg;
4161       case Iop_QSub8Sx32:  op = Asse_QSUB8S;   goto do_SseReRg;
4162       case Iop_QSub16Sx16: op = Asse_QSUB16S;  goto do_SseReRg;
4163       case Iop_QSub8Ux32:  op = Asse_QSUB8U;   goto do_SseReRg;
4164       case Iop_QSub16Ux16: op = Asse_QSUB16U;  goto do_SseReRg;
4165       do_SseReRg:
4166       {
4167          HReg argLhi, argLlo, argRhi, argRlo;
4168          iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4169          iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4170          HReg dstHi = newVRegV(env);
4171          HReg dstLo = newVRegV(env);
4172          addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4173          addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4174          addInstr(env, AMD64Instr_SseReRg(op, argRhi, dstHi));
4175          addInstr(env, AMD64Instr_SseReRg(op, argRlo, dstLo));
4176          *rHi = dstHi;
4177          *rLo = dstLo;
4178          return;
4179       }
4180
4181       case Iop_ShlN16x16: laneBits = 16; op = Asse_SHL16; goto do_SseShift;
4182       case Iop_ShlN32x8:  laneBits = 32; op = Asse_SHL32; goto do_SseShift;
4183       case Iop_ShlN64x4:  laneBits = 64; op = Asse_SHL64; goto do_SseShift;
4184       case Iop_SarN16x16: laneBits = 16; op = Asse_SAR16; goto do_SseShift;
4185       case Iop_SarN32x8:  laneBits = 32; op = Asse_SAR32; goto do_SseShift;
4186       case Iop_ShrN16x16: laneBits = 16; op = Asse_SHR16; goto do_SseShift;
4187       case Iop_ShrN32x8:  laneBits = 32; op = Asse_SHR32; goto do_SseShift;
4188       case Iop_ShrN64x4:  laneBits = 64; op = Asse_SHR64; goto do_SseShift;
4189       do_SseShift: {
4190          HReg dstHi = newVRegV(env);
4191          HReg dstLo = newVRegV(env);
4192          HReg gregHi, gregLo;
4193          iselDVecExpr(&gregHi, &gregLo, env, e->Iex.Binop.arg1);
4194          /* If it's a shift by an in-range immediate, generate two single
4195             instructions. */
4196          if (e->Iex.Binop.arg2->tag == Iex_Const) {
4197             IRConst* c = e->Iex.Binop.arg2->Iex.Const.con;
4198             vassert(c->tag == Ico_U8);
4199             UInt shift = c->Ico.U8;
4200             if (shift < laneBits) {
4201                addInstr(env, mk_vMOVsd_RR(gregHi, dstHi));
4202                addInstr(env, AMD64Instr_SseShiftN(op, shift, dstHi));
4203                addInstr(env, mk_vMOVsd_RR(gregLo, dstLo));
4204                addInstr(env, AMD64Instr_SseShiftN(op, shift, dstLo));
4205                *rHi = dstHi;
4206                *rLo = dstLo;
4207                return;
4208             }
4209          }
4210          /* Otherwise we have to do it the longwinded way. */
4211          AMD64RMI*   rmi   = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
4212          AMD64AMode* rsp0  = AMD64AMode_IR(0, hregAMD64_RSP());
4213          HReg        ereg  = newVRegV(env);
4214          addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
4215          addInstr(env, AMD64Instr_Push(rmi));
4216          addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
4217          addInstr(env, mk_vMOVsd_RR(gregHi, dstHi));
4218          addInstr(env, AMD64Instr_SseReRg(op, ereg, dstHi));
4219          addInstr(env, mk_vMOVsd_RR(gregLo, dstLo));
4220          addInstr(env, AMD64Instr_SseReRg(op, ereg, dstLo));
4221          add_to_rsp(env, 16);
4222          *rHi = dstHi;
4223          *rLo = dstLo;
4224          return;
4225       }
4226
4227       case Iop_V128HLtoV256: {
4228          // Curiously, there doesn't seem to be any benefit to be had here by
4229          // checking whether arg1 and arg2 are the same, in the style of how
4230          // (eg) 64HLtoV128 is handled elsewhere in this file.
4231          *rHi = iselVecExpr(env, e->Iex.Binop.arg1);
4232          *rLo = iselVecExpr(env, e->Iex.Binop.arg2);
4233          return;
4234       }
4235
4236       case Iop_Mul32x8:    fn = (HWord)h_generic_calc_Mul32x4;
4237                            goto do_SseAssistedBinary;
4238       case Iop_Max32Sx8:   fn = (HWord)h_generic_calc_Max32Sx4;
4239                            goto do_SseAssistedBinary;
4240       case Iop_Min32Sx8:   fn = (HWord)h_generic_calc_Min32Sx4;
4241                            goto do_SseAssistedBinary;
4242       case Iop_Max32Ux8:   fn = (HWord)h_generic_calc_Max32Ux4;
4243                            goto do_SseAssistedBinary;
4244       case Iop_Min32Ux8:   fn = (HWord)h_generic_calc_Min32Ux4;
4245                            goto do_SseAssistedBinary;
4246       case Iop_Max16Ux16:  fn = (HWord)h_generic_calc_Max16Ux8;
4247                            goto do_SseAssistedBinary;
4248       case Iop_Min16Ux16:  fn = (HWord)h_generic_calc_Min16Ux8;
4249                            goto do_SseAssistedBinary;
4250       case Iop_Max8Sx32:   fn = (HWord)h_generic_calc_Max8Sx16;
4251                            goto do_SseAssistedBinary;
4252       case Iop_Min8Sx32:   fn = (HWord)h_generic_calc_Min8Sx16;
4253                            goto do_SseAssistedBinary;
4254       case Iop_CmpEQ64x4:  fn = (HWord)h_generic_calc_CmpEQ64x2;
4255                            goto do_SseAssistedBinary;
4256       case Iop_CmpGT64Sx4: fn = (HWord)h_generic_calc_CmpGT64Sx2;
4257                            goto do_SseAssistedBinary;
4258       do_SseAssistedBinary: {
4259          /* RRRufff!  RRRufff code is what we're generating here.  Oh
4260             well. */
4261          vassert(fn != 0);
4262          HReg dstHi = newVRegV(env);
4263          HReg dstLo = newVRegV(env);
4264          HReg argLhi, argLlo, argRhi, argRlo;
4265          iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4266          iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4267          HReg argp = newVRegI(env);
4268          /* subq $160, %rsp         -- make a space*/
4269          sub_from_rsp(env, 160);
4270          /* leaq 48(%rsp), %r_argp  -- point into it */
4271          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
4272                                         argp));
4273          /* andq $-16, %r_argp      -- 16-align the pointer */
4274          addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
4275                                          AMD64RMI_Imm( ~(UInt)15 ),
4276                                          argp));
4277          /* Prepare 3 arg regs:
4278             leaq 0(%r_argp), %rdi
4279             leaq 16(%r_argp), %rsi
4280             leaq 32(%r_argp), %rdx
4281          */
4282          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
4283                                         hregAMD64_RDI()));
4284          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
4285                                         hregAMD64_RSI()));
4286          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
4287                                         hregAMD64_RDX()));
4288          /* Store the two high args, at (%rsi) and (%rdx):
4289             movupd  %argLhi, 0(%rsi)
4290             movupd  %argRhi, 0(%rdx)
4291          */
4292          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLhi,
4293                                           AMD64AMode_IR(0, hregAMD64_RSI())));
4294          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRhi,
4295                                           AMD64AMode_IR(0, hregAMD64_RDX())));
4296          /* Store the two low args, at 48(%rsi) and 48(%rdx):
4297             movupd  %argLlo, 48(%rsi)
4298             movupd  %argRlo, 48(%rdx)
4299          */
4300          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLlo,
4301                                           AMD64AMode_IR(48, hregAMD64_RSI())));
4302          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRlo,
4303                                           AMD64AMode_IR(48, hregAMD64_RDX())));
4304          /* call the helper */
4305          addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
4306                                         mk_RetLoc_simple(RLPri_None) ));
4307          /* Prepare 3 arg regs:
4308             leaq 48(%r_argp), %rdi
4309             leaq 64(%r_argp), %rsi
4310             leaq 80(%r_argp), %rdx
4311          */
4312          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, argp),
4313                                         hregAMD64_RDI()));
4314          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(64, argp),
4315                                         hregAMD64_RSI()));
4316          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(80, argp),
4317                                         hregAMD64_RDX()));
4318          /* call the helper */
4319          addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
4320                                         mk_RetLoc_simple(RLPri_None) ));
4321          /* fetch the result from memory, using %r_argp, which the
4322             register allocator will keep alive across the call. */
4323          addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstHi,
4324                                           AMD64AMode_IR(0, argp)));
4325          addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstLo,
4326                                           AMD64AMode_IR(48, argp)));
4327          /* and finally, clear the space */
4328          add_to_rsp(env, 160);
4329          *rHi = dstHi;
4330          *rLo = dstLo;
4331          return;
4332       }
4333
4334       case Iop_Perm32x8:   fn = (HWord)h_generic_calc_Perm32x8;
4335                            goto do_SseAssistedBinary256;
4336       do_SseAssistedBinary256: {
4337          /* RRRufff!  RRRufff code is what we're generating here.  Oh
4338             well. */
4339          vassert(fn != 0);
4340          HReg dstHi = newVRegV(env);
4341          HReg dstLo = newVRegV(env);
4342          HReg argLhi, argLlo, argRhi, argRlo;
4343          iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
4344          iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
4345          HReg argp = newVRegI(env);
4346          /* subq $160, %rsp         -- make a space*/
4347          sub_from_rsp(env, 160);
4348          /* leaq 48(%rsp), %r_argp  -- point into it */
4349          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
4350                                         argp));
4351          /* andq $-16, %r_argp      -- 16-align the pointer */
4352          addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
4353                                          AMD64RMI_Imm( ~(UInt)15 ),
4354                                          argp));
4355          /* Prepare 3 arg regs:
4356             leaq 0(%r_argp), %rdi
4357             leaq 32(%r_argp), %rsi
4358             leaq 64(%r_argp), %rdx
4359          */
4360          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
4361                                         hregAMD64_RDI()));
4362          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
4363                                         hregAMD64_RSI()));
4364          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(64, argp),
4365                                         hregAMD64_RDX()));
4366          /* Store the two args, at (%rsi) and (%rdx):
4367             movupd  %argLlo, 0(%rsi)
4368             movupd  %argLhi, 16(%rsi)
4369             movupd  %argRlo, 0(%rdx)
4370             movupd  %argRhi, 16(%rdx)
4371          */
4372          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLlo,
4373                                           AMD64AMode_IR(0, hregAMD64_RSI())));
4374          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLhi,
4375                                           AMD64AMode_IR(16, hregAMD64_RSI())));
4376          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRlo,
4377                                           AMD64AMode_IR(0, hregAMD64_RDX())));
4378          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRhi,
4379                                           AMD64AMode_IR(16, hregAMD64_RDX())));
4380          /* call the helper */
4381          addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
4382                                         mk_RetLoc_simple(RLPri_None) ));
4383          /* fetch the result from memory, using %r_argp, which the
4384             register allocator will keep alive across the call. */
4385          addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstLo,
4386                                           AMD64AMode_IR(0, argp)));
4387          addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstHi,
4388                                           AMD64AMode_IR(16, argp)));
4389          /* and finally, clear the space */
4390          add_to_rsp(env, 160);
4391          *rHi = dstHi;
4392          *rLo = dstLo;
4393          return;
4394       }
4395
4396       case Iop_I32StoF32x8:
4397       case Iop_F32toI32Sx8: {
4398          HReg argHi, argLo;
4399          iselDVecExpr(&argHi, &argLo, env, e->Iex.Binop.arg2);
4400          HReg dstHi = newVRegV(env);
4401          HReg dstLo = newVRegV(env);
4402          AMD64SseOp mop
4403             = e->Iex.Binop.op == Iop_I32StoF32x8 ? Asse_I2F : Asse_F2I;
4404          set_SSE_rounding_mode(env, e->Iex.Binop.arg1);
4405          addInstr(env, AMD64Instr_Sse32Fx4(mop, argHi, dstHi));
4406          addInstr(env, AMD64Instr_Sse32Fx4(mop, argLo, dstLo));
4407          set_SSE_rounding_default(env);
4408          *rHi = dstHi;
4409          *rLo = dstLo;
4410          return;
4411       }
4412
4413       default:
4414          break;
4415    } /* switch (e->Iex.Binop.op) */
4416    } /* if (e->tag == Iex_Binop) */
4417
4418    if (e->tag == Iex_Triop) {
4419    IRTriop *triop = e->Iex.Triop.details;
4420    switch (triop->op) {
4421
4422       case Iop_Add64Fx4: op = Asse_ADDF; goto do_64Fx4_w_rm;
4423       case Iop_Sub64Fx4: op = Asse_SUBF; goto do_64Fx4_w_rm;
4424       case Iop_Mul64Fx4: op = Asse_MULF; goto do_64Fx4_w_rm;
4425       case Iop_Div64Fx4: op = Asse_DIVF; goto do_64Fx4_w_rm;
4426       do_64Fx4_w_rm:
4427       {
4428          HReg argLhi, argLlo, argRhi, argRlo;
4429          iselDVecExpr(&argLhi, &argLlo, env, triop->arg2);
4430          iselDVecExpr(&argRhi, &argRlo, env, triop->arg3);
4431          HReg dstHi = newVRegV(env);
4432          HReg dstLo = newVRegV(env);
4433          addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4434          addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4435          /* XXXROUNDINGFIXME */
4436          /* set roundingmode here */
4437          addInstr(env, AMD64Instr_Sse64Fx2(op, argRhi, dstHi));
4438          addInstr(env, AMD64Instr_Sse64Fx2(op, argRlo, dstLo));
4439          *rHi = dstHi;
4440          *rLo = dstLo;
4441          return;
4442       }
4443
4444       case Iop_Add32Fx8: op = Asse_ADDF; goto do_32Fx8_w_rm;
4445       case Iop_Sub32Fx8: op = Asse_SUBF; goto do_32Fx8_w_rm;
4446       case Iop_Mul32Fx8: op = Asse_MULF; goto do_32Fx8_w_rm;
4447       case Iop_Div32Fx8: op = Asse_DIVF; goto do_32Fx8_w_rm;
4448       do_32Fx8_w_rm:
4449       {
4450          HReg argLhi, argLlo, argRhi, argRlo;
4451          iselDVecExpr(&argLhi, &argLlo, env, triop->arg2);
4452          iselDVecExpr(&argRhi, &argRlo, env, triop->arg3);
4453          HReg dstHi = newVRegV(env);
4454          HReg dstLo = newVRegV(env);
4455          addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
4456          addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
4457          /* XXXROUNDINGFIXME */
4458          /* set roundingmode here */
4459          addInstr(env, AMD64Instr_Sse32Fx4(op, argRhi, dstHi));
4460          addInstr(env, AMD64Instr_Sse32Fx4(op, argRlo, dstLo));
4461          *rHi = dstHi;
4462          *rLo = dstLo;
4463          return;
4464       }
4465
4466       default:
4467          break;
4468    } /* switch (triop->op) */
4469    } /* if (e->tag == Iex_Triop) */
4470
4471
4472    if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_64x4toV256) {
4473       const IRExpr* arg1 = e->Iex.Qop.details->arg1;
4474       const IRExpr* arg2 = e->Iex.Qop.details->arg2;
4475       const IRExpr* arg3 = e->Iex.Qop.details->arg3;
4476       const IRExpr* arg4 = e->Iex.Qop.details->arg4;
4477       // If the args are trivially the same (tmp or const), use the same
4478       // source register for all four, and only one movq since those are
4479       // (relatively) expensive.
4480       if (areAtomsAndEqual(arg1, arg2)
4481           && areAtomsAndEqual(arg1, arg3) && areAtomsAndEqual(arg1, arg4)) {
4482          HReg q3 = iselIntExpr_R(env, e->Iex.Qop.details->arg1);
4483          HReg tmp = newVRegV(env);
4484          HReg dst = newVRegV(env);
4485          addInstr(env, AMD64Instr_SseMOVQ(q3, dst, True/*toXMM*/));
4486          addInstr(env, mk_vMOVsd_RR(dst, tmp));
4487          addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst));
4488          addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
4489          *rHi = dst;
4490          *rLo = dst;
4491       } else {
4492          /* arg1 is the most significant (Q3), arg4 the least (Q0) */
4493          HReg q3 = iselIntExpr_R(env, arg1);
4494          HReg q2 = iselIntExpr_R(env, arg2);
4495          HReg q1 = iselIntExpr_R(env, arg3);
4496          HReg q0 = iselIntExpr_R(env, arg4);
4497          HReg tmp = newVRegV(env);
4498          HReg dstHi = newVRegV(env);
4499          HReg dstLo = newVRegV(env);
4500          addInstr(env, AMD64Instr_SseMOVQ(q3, dstHi, True/*toXMM*/));
4501          addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstHi));
4502          addInstr(env, AMD64Instr_SseMOVQ(q2, tmp, True/*toXMM*/));
4503          addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dstHi));
4504          addInstr(env, AMD64Instr_SseMOVQ(q1, dstLo, True/*toXMM*/));
4505          addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstLo));
4506          addInstr(env, AMD64Instr_SseMOVQ(q0, tmp, True/*toXMM*/));
4507          addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dstLo));
4508          *rHi = dstHi;
4509          *rLo = dstLo;
4510       }
4511       return;
4512    }
4513
4514    if (e->tag == Iex_ITE) {
4515       HReg r1Hi, r1Lo, r0Hi, r0Lo;
4516       iselDVecExpr(&r1Hi, &r1Lo, env, e->Iex.ITE.iftrue);
4517       iselDVecExpr(&r0Hi, &r0Lo, env, e->Iex.ITE.iffalse);
4518       HReg dstHi = newVRegV(env);
4519       HReg dstLo = newVRegV(env);
4520       addInstr(env, mk_vMOVsd_RR(r1Hi,dstHi));
4521       addInstr(env, mk_vMOVsd_RR(r1Lo,dstLo));
4522       AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
4523       addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0Hi, dstHi));
4524       addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0Lo, dstLo));
4525       *rHi = dstHi;
4526       *rLo = dstLo;
4527       return;
4528    }
4529
4530    //avx_fail:
4531    vex_printf("iselDVecExpr (amd64, subarch = %s): can't reduce\n",
4532               LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps));
4533    ppIRExpr(e);
4534    vpanic("iselDVecExpr_wrk");
4535 }
4536
4537
4538 /*---------------------------------------------------------*/
4539 /*--- ISEL: Statements                                  ---*/
4540 /*---------------------------------------------------------*/
4541
4542 static void iselStmt ( ISelEnv* env, IRStmt* stmt )
4543 {
4544    if (vex_traceflags & VEX_TRACE_VCODE) {
4545       vex_printf("\n-- ");
4546       ppIRStmt(stmt);
4547       vex_printf("\n");
4548    }
4549
4550    switch (stmt->tag) {
4551
4552    /* --------- LOADG (guarded load) --------- */
4553    case Ist_LoadG: {
4554       IRLoadG* lg = stmt->Ist.LoadG.details;
4555       if (lg->end != Iend_LE)
4556          goto stmt_fail;
4557
4558       UChar szB = 0; /* invalid */
4559       switch (lg->cvt) {
4560          case ILGop_Ident32:   szB = 4;  break;
4561          case ILGop_Ident64:   szB = 8;  break;
4562          case ILGop_IdentV128: szB = 16; break;
4563          default: break;
4564       }
4565       if (szB == 0)
4566          goto stmt_fail;
4567
4568       AMD64AMode* amAddr
4569          = iselIntExpr_AMode(env, lg->addr);
4570       HReg rAlt
4571          = szB == 16 ? iselVecExpr(env, lg->alt)
4572                      : iselIntExpr_R(env, lg->alt);
4573       HReg rDst
4574          = lookupIRTemp(env, lg->dst);
4575
4576       /* Get the alt value into the dst.  We'll do a conditional load
4577          which overwrites it -- or not -- with loaded data. */
4578       if (szB == 16) {
4579          addInstr(env, mk_vMOVsd_RR(rAlt, rDst));
4580       } else {
4581          addInstr(env, mk_iMOVsd_RR(rAlt, rDst));
4582       }
4583       AMD64CondCode cc = iselCondCode(env, lg->guard);
4584       if (szB == 16) {
4585          addInstr(env, AMD64Instr_SseCLoad(cc, amAddr, rDst));
4586       } else {
4587          addInstr(env, AMD64Instr_CLoad(cc, szB, amAddr, rDst));
4588       }
4589       return;
4590    }
4591
4592    /* --------- STOREG (guarded store) --------- */
4593    case Ist_StoreG: {
4594       IRStoreG* sg = stmt->Ist.StoreG.details;
4595       if (sg->end != Iend_LE)
4596          goto stmt_fail;
4597
4598       UChar szB = 0; /* invalid */
4599       switch (typeOfIRExpr(env->type_env, sg->data)) {
4600          case Ity_I32:  szB = 4; break;
4601          case Ity_I64:  szB = 8; break;
4602          case Ity_V128: szB = 16; break;
4603          default: break;
4604       }
4605       if (szB == 0)
4606          goto stmt_fail;
4607
4608       AMD64AMode* amAddr
4609          = iselIntExpr_AMode(env, sg->addr);
4610       HReg rSrc
4611          = szB == 16 ? iselVecExpr(env, sg->data)
4612                      : iselIntExpr_R(env, sg->data);
4613       AMD64CondCode cc
4614          = iselCondCode(env, sg->guard);
4615       if (szB == 16) {
4616          addInstr(env, AMD64Instr_SseCStore(cc, rSrc, amAddr));
4617       } else {
4618          addInstr(env, AMD64Instr_CStore(cc, szB, rSrc, amAddr));
4619       }
4620       return;
4621    }
4622
4623    /* --------- STORE --------- */
4624    case Ist_Store: {
4625       IRType    tya   = typeOfIRExpr(env->type_env, stmt->Ist.Store.addr);
4626       IRType    tyd   = typeOfIRExpr(env->type_env, stmt->Ist.Store.data);
4627       IREndness end   = stmt->Ist.Store.end;
4628
4629       if (tya != Ity_I64 || end != Iend_LE)
4630          goto stmt_fail;
4631
4632       if (tyd == Ity_I64) {
4633          AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4634          AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Store.data);
4635          addInstr(env, AMD64Instr_Alu64M(Aalu_MOV,ri,am));
4636          return;
4637       }
4638       if (tyd == Ity_I8 || tyd == Ity_I16 || tyd == Ity_I32) {
4639          AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4640          HReg r = iselIntExpr_R(env, stmt->Ist.Store.data);
4641          addInstr(env, AMD64Instr_Store(
4642                           toUChar(tyd==Ity_I8 ? 1 : (tyd==Ity_I16 ? 2 : 4)),
4643                           r,am));
4644          return;
4645       }
4646       if (tyd == Ity_F64) {
4647          AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4648          HReg r = iselDblExpr(env, stmt->Ist.Store.data);
4649          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, r, am));
4650          return;
4651       }
4652       if (tyd == Ity_F32) {
4653          AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4654          HReg r = iselFltExpr(env, stmt->Ist.Store.data);
4655          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, r, am));
4656          return;
4657       }
4658       if (tyd == Ity_V128) {
4659          AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
4660          HReg r = iselVecExpr(env, stmt->Ist.Store.data);
4661          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, r, am));
4662          return;
4663       }
4664       if (tyd == Ity_V256) {
4665          HReg        rA   = iselIntExpr_R(env, stmt->Ist.Store.addr);
4666          AMD64AMode* am0  = AMD64AMode_IR(0,  rA);
4667          AMD64AMode* am16 = AMD64AMode_IR(16, rA);
4668          HReg vHi, vLo;
4669          iselDVecExpr(&vHi, &vLo, env, stmt->Ist.Store.data);
4670          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vLo, am0));
4671          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vHi, am16));
4672          return;
4673       }
4674       break;
4675    }
4676
4677    /* --------- PUT --------- */
4678    case Ist_Put: {
4679       IRType ty = typeOfIRExpr(env->type_env, stmt->Ist.Put.data);
4680       if (ty == Ity_I64) {
4681          /* We're going to write to memory, so compute the RHS into an
4682             AMD64RI. */
4683          AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Put.data);
4684          addInstr(env,
4685                   AMD64Instr_Alu64M(
4686                      Aalu_MOV,
4687                      ri,
4688                      AMD64AMode_IR(stmt->Ist.Put.offset,
4689                                    hregAMD64_RBP())
4690                  ));
4691          return;
4692       }
4693       if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) {
4694          HReg r = iselIntExpr_R(env, stmt->Ist.Put.data);
4695          addInstr(env, AMD64Instr_Store(
4696                           toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)),
4697                           r,
4698                           AMD64AMode_IR(stmt->Ist.Put.offset,
4699                                         hregAMD64_RBP())));
4700          return;
4701       }
4702       if (ty == Ity_F32) {
4703          HReg f32 = iselFltExpr(env, stmt->Ist.Put.data);
4704          AMD64AMode* am = AMD64AMode_IR(stmt->Ist.Put.offset, hregAMD64_RBP());
4705          set_SSE_rounding_default(env); /* paranoia */
4706          addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 4, f32, am ));
4707          return;
4708       }
4709       if (ty == Ity_F64) {
4710          HReg f64 = iselDblExpr(env, stmt->Ist.Put.data);
4711          AMD64AMode* am = AMD64AMode_IR( stmt->Ist.Put.offset,
4712                                          hregAMD64_RBP() );
4713          addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, f64, am ));
4714          return;
4715       }
4716       if (ty == Ity_V128) {
4717          HReg        vec = iselVecExpr(env, stmt->Ist.Put.data);
4718          AMD64AMode* am  = AMD64AMode_IR(stmt->Ist.Put.offset,
4719                                          hregAMD64_RBP());
4720          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, am));
4721          return;
4722       }
4723       if (ty == Ity_V256) {
4724          HReg vHi, vLo;
4725          iselDVecExpr(&vHi, &vLo, env, stmt->Ist.Put.data);
4726          HReg        rbp  = hregAMD64_RBP();
4727          AMD64AMode* am0  = AMD64AMode_IR(stmt->Ist.Put.offset + 0,  rbp);
4728          AMD64AMode* am16 = AMD64AMode_IR(stmt->Ist.Put.offset + 16, rbp);
4729          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vLo, am0));
4730          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vHi, am16));
4731          return;
4732       }
4733       break;
4734    }
4735
4736    /* --------- Indexed PUT --------- */
4737    case Ist_PutI: {
4738       IRPutI *puti = stmt->Ist.PutI.details;
4739
4740       AMD64AMode* am
4741          = genGuestArrayOffset(
4742               env, puti->descr,
4743                    puti->ix, puti->bias );
4744
4745       IRType ty = typeOfIRExpr(env->type_env, puti->data);
4746       if (ty == Ity_F64) {
4747          HReg val = iselDblExpr(env, puti->data);
4748          addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, val, am ));
4749          return;
4750       }
4751       if (ty == Ity_I8) {
4752          HReg r = iselIntExpr_R(env, puti->data);
4753          addInstr(env, AMD64Instr_Store( 1, r, am ));
4754          return;
4755       }
4756       if (ty == Ity_I64) {
4757          AMD64RI* ri = iselIntExpr_RI(env, puti->data);
4758          addInstr(env, AMD64Instr_Alu64M( Aalu_MOV, ri, am ));
4759          return;
4760       }
4761       break;
4762    }
4763
4764    /* --------- TMP --------- */
4765    case Ist_WrTmp: {
4766       IRTemp tmp = stmt->Ist.WrTmp.tmp;
4767       IRType ty = typeOfIRTemp(env->type_env, tmp);
4768
4769       /* optimisation: if stmt->Ist.WrTmp.data is Add64(..,..),
4770          compute it into an AMode and then use LEA.  This usually
4771          produces fewer instructions, often because (for memcheck
4772          created IR) we get t = address-expression, (t is later used
4773          twice) and so doing this naturally turns address-expression
4774          back into an AMD64 amode. */
4775       if (ty == Ity_I64
4776           && stmt->Ist.WrTmp.data->tag == Iex_Binop
4777           && stmt->Ist.WrTmp.data->Iex.Binop.op == Iop_Add64) {
4778          AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.WrTmp.data);
4779          HReg dst = lookupIRTemp(env, tmp);
4780          if (am->tag == Aam_IR && am->Aam.IR.imm == 0) {
4781             /* Hmm, iselIntExpr_AMode wimped out and just computed the
4782                value into a register.  Just emit a normal reg-reg move
4783                so reg-alloc can coalesce it away in the usual way. */
4784             HReg src = am->Aam.IR.reg;
4785             addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst));
4786          } else {
4787             addInstr(env, AMD64Instr_Lea64(am,dst));
4788          }
4789          return;
4790       }
4791
4792       if (ty == Ity_I64 || ty == Ity_I32
4793           || ty == Ity_I16 || ty == Ity_I8) {
4794          AMD64RMI* rmi = iselIntExpr_RMI(env, stmt->Ist.WrTmp.data);
4795          HReg dst = lookupIRTemp(env, tmp);
4796          addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,rmi,dst));
4797          return;
4798       }
4799       if (ty == Ity_I128) {
4800          HReg rHi, rLo, dstHi, dstLo;
4801          iselInt128Expr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
4802          lookupIRTempPair( &dstHi, &dstLo, env, tmp);
4803          addInstr(env, mk_iMOVsd_RR(rHi,dstHi) );
4804          addInstr(env, mk_iMOVsd_RR(rLo,dstLo) );
4805          return;
4806       }
4807       if (ty == Ity_I1) {
4808          AMD64CondCode cond = iselCondCode(env, stmt->Ist.WrTmp.data);
4809          HReg dst = lookupIRTemp(env, tmp);
4810          addInstr(env, AMD64Instr_Set64(cond, dst));
4811          return;
4812       }
4813       if (ty == Ity_F64) {
4814          HReg dst = lookupIRTemp(env, tmp);
4815          HReg src = iselDblExpr(env, stmt->Ist.WrTmp.data);
4816          addInstr(env, mk_vMOVsd_RR(src, dst));
4817          return;
4818       }
4819       if (ty == Ity_F32) {
4820          HReg dst = lookupIRTemp(env, tmp);
4821          HReg src = iselFltExpr(env, stmt->Ist.WrTmp.data);
4822          addInstr(env, mk_vMOVsd_RR(src, dst));
4823          return;
4824       }
4825       if (ty == Ity_V128) {
4826          HReg dst = lookupIRTemp(env, tmp);
4827          HReg src = iselVecExpr(env, stmt->Ist.WrTmp.data);
4828          addInstr(env, mk_vMOVsd_RR(src, dst));
4829          return;
4830       }
4831       if (ty == Ity_V256) {
4832          HReg rHi, rLo, dstHi, dstLo;
4833          iselDVecExpr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
4834          lookupIRTempPair( &dstHi, &dstLo, env, tmp);
4835          addInstr(env, mk_vMOVsd_RR(rHi,dstHi) );
4836          addInstr(env, mk_vMOVsd_RR(rLo,dstLo) );
4837          return;
4838       }
4839       break;
4840    }
4841
4842    /* --------- Call to DIRTY helper --------- */
4843    case Ist_Dirty: {
4844       IRDirty* d = stmt->Ist.Dirty.details;
4845
4846       /* Figure out the return type, if any. */
4847       IRType retty = Ity_INVALID;
4848       if (d->tmp != IRTemp_INVALID)
4849          retty = typeOfIRTemp(env->type_env, d->tmp);
4850
4851       /* Throw out any return types we don't know about. */
4852       Bool retty_ok = False;
4853       switch (retty) {
4854          case Ity_INVALID: /* function doesn't return anything */
4855          case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
4856          case Ity_V128: case Ity_V256:
4857             retty_ok = True; break;
4858          default:
4859             break;
4860       }
4861       if (!retty_ok)
4862          break; /* will go to stmt_fail: */
4863
4864       /* Marshal args, do the call, and set the return value to
4865          0x555..555 if this is a conditional call that returns a value
4866          and the call is skipped. */
4867       UInt   addToSp = 0;
4868       RetLoc rloc    = mk_RetLoc_INVALID();
4869       doHelperCall( &addToSp, &rloc, env, d->guard, d->cee, retty, d->args );
4870       vassert(is_sane_RetLoc(rloc));
4871
4872       /* Now figure out what to do with the returned value, if any. */
4873       switch (retty) {
4874          case Ity_INVALID: {
4875             /* No return value.  Nothing to do. */
4876             vassert(d->tmp == IRTemp_INVALID);
4877             vassert(rloc.pri == RLPri_None);
4878             vassert(addToSp == 0);
4879             return;
4880          }
4881          case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: {
4882             /* The returned value is in %rax.  Park it in the register
4883                associated with tmp. */
4884             vassert(rloc.pri == RLPri_Int);
4885             vassert(addToSp == 0);
4886             HReg dst = lookupIRTemp(env, d->tmp);
4887             addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(),dst) );
4888             return;
4889          }
4890          case Ity_V128: {
4891             /* The returned value is on the stack, and rloc.spOff
4892                tells us where.  Fish it off the stack and then move
4893                the stack pointer upwards to clear it, as directed by
4894                doHelperCall. */
4895             vassert(rloc.pri == RLPri_V128SpRel);
4896             vassert(addToSp >= 16);
4897             HReg        dst = lookupIRTemp(env, d->tmp);
4898             AMD64AMode* am  = AMD64AMode_IR(rloc.spOff, hregAMD64_RSP());
4899             addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, am ));
4900             add_to_rsp(env, addToSp);
4901             return;
4902          }
4903          case Ity_V256: {
4904             /* See comments for Ity_V128. */
4905             vassert(rloc.pri == RLPri_V256SpRel);
4906             vassert(addToSp >= 32);
4907             HReg        dstLo, dstHi;
4908             lookupIRTempPair(&dstHi, &dstLo, env, d->tmp);
4909             AMD64AMode* amLo  = AMD64AMode_IR(rloc.spOff, hregAMD64_RSP());
4910             addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dstLo, amLo ));
4911             AMD64AMode* amHi  = AMD64AMode_IR(rloc.spOff+16, hregAMD64_RSP());
4912             addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dstHi, amHi ));
4913             add_to_rsp(env, addToSp);
4914             return;
4915          }
4916          default:
4917             /*NOTREACHED*/
4918             vassert(0);
4919       }
4920       break;
4921    }
4922
4923    /* --------- MEM FENCE --------- */
4924    case Ist_MBE:
4925       switch (stmt->Ist.MBE.event) {
4926          case Imbe_Fence:
4927             addInstr(env, AMD64Instr_MFence());
4928             return;
4929          default:
4930             break;
4931       }
4932       break;
4933
4934    /* --------- ACAS --------- */
4935    case Ist_CAS:
4936       if (stmt->Ist.CAS.details->oldHi == IRTemp_INVALID) {
4937          /* "normal" singleton CAS */
4938          UChar  sz;
4939          IRCAS* cas = stmt->Ist.CAS.details;
4940          IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
4941          /* get: cas->expd into %rax, and cas->data into %rbx */
4942          AMD64AMode* am = iselIntExpr_AMode(env, cas->addr);
4943          HReg rData = iselIntExpr_R(env, cas->dataLo);
4944          HReg rExpd = iselIntExpr_R(env, cas->expdLo);
4945          HReg rOld  = lookupIRTemp(env, cas->oldLo);
4946          vassert(cas->expdHi == NULL);
4947          vassert(cas->dataHi == NULL);
4948          addInstr(env, mk_iMOVsd_RR(rExpd, rOld));
4949          addInstr(env, mk_iMOVsd_RR(rExpd, hregAMD64_RAX()));
4950          addInstr(env, mk_iMOVsd_RR(rData, hregAMD64_RBX()));
4951          switch (ty) {
4952             case Ity_I64: sz = 8; break;
4953             case Ity_I32: sz = 4; break;
4954             case Ity_I16: sz = 2; break;
4955             case Ity_I8:  sz = 1; break;
4956             default: goto unhandled_cas;
4957          }
4958          addInstr(env, AMD64Instr_ACAS(am, sz));
4959          addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RAX(), rOld));
4960          return;
4961       } else {
4962          /* double CAS */
4963          UChar  sz;
4964          IRCAS* cas = stmt->Ist.CAS.details;
4965          IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
4966          /* only 32-bit and 64-bit allowed in this case */
4967          /* get: cas->expdLo into %rax, and cas->dataLo into %rbx */
4968          /* get: cas->expdHi into %rdx, and cas->dataHi into %rcx */
4969          AMD64AMode* am = iselIntExpr_AMode(env, cas->addr);
4970          HReg rDataHi = iselIntExpr_R(env, cas->dataHi);
4971          HReg rDataLo = iselIntExpr_R(env, cas->dataLo);
4972          HReg rExpdHi = iselIntExpr_R(env, cas->expdHi);
4973          HReg rExpdLo = iselIntExpr_R(env, cas->expdLo);
4974          HReg rOldHi  = lookupIRTemp(env, cas->oldHi);
4975          HReg rOldLo  = lookupIRTemp(env, cas->oldLo);
4976          switch (ty) {
4977             case Ity_I64:
4978                if (!(env->hwcaps & VEX_HWCAPS_AMD64_CX16))
4979                   goto unhandled_cas; /* we'd have to generate
4980                                          cmpxchg16b, but the host
4981                                          doesn't support that */
4982                sz = 8;
4983                break;
4984             case Ity_I32:
4985                sz = 4;
4986                break;
4987             default:
4988                goto unhandled_cas;
4989          }
4990          addInstr(env, mk_iMOVsd_RR(rExpdHi, rOldHi));
4991          addInstr(env, mk_iMOVsd_RR(rExpdLo, rOldLo));
4992          addInstr(env, mk_iMOVsd_RR(rExpdHi, hregAMD64_RDX()));
4993          addInstr(env, mk_iMOVsd_RR(rExpdLo, hregAMD64_RAX()));
4994          addInstr(env, mk_iMOVsd_RR(rDataHi, hregAMD64_RCX()));
4995          addInstr(env, mk_iMOVsd_RR(rDataLo, hregAMD64_RBX()));
4996          addInstr(env, AMD64Instr_DACAS(am, sz));
4997          addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RDX(), rOldHi));
4998          addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RAX(), rOldLo));
4999          return;
5000       }
5001       unhandled_cas:
5002       break;
5003
5004    /* --------- INSTR MARK --------- */
5005    /* Doesn't generate any executable code ... */
5006    case Ist_IMark:
5007        return;
5008
5009    /* --------- ABI HINT --------- */
5010    /* These have no meaning (denotation in the IR) and so we ignore
5011       them ... if any actually made it this far. */
5012    case Ist_AbiHint:
5013        return;
5014
5015    /* --------- NO-OP --------- */
5016    case Ist_NoOp:
5017        return;
5018
5019    /* --------- EXIT --------- */
5020    case Ist_Exit: {
5021       if (stmt->Ist.Exit.dst->tag != Ico_U64)
5022          vpanic("iselStmt(amd64): Ist_Exit: dst is not a 64-bit value");
5023
5024       AMD64CondCode cc    = iselCondCode(env, stmt->Ist.Exit.guard);
5025       AMD64AMode*   amRIP = AMD64AMode_IR(stmt->Ist.Exit.offsIP,
5026                                           hregAMD64_RBP());
5027
5028       /* Case: boring transfer to known address */
5029       if (stmt->Ist.Exit.jk == Ijk_Boring) {
5030          if (env->chainingAllowed) {
5031             /* .. almost always true .. */
5032             /* Skip the event check at the dst if this is a forwards
5033                edge. */
5034             Bool toFastEP
5035                = ((Addr64)stmt->Ist.Exit.dst->Ico.U64) > env->max_ga;
5036             if (0) vex_printf("%s", toFastEP ? "Y" : ",");
5037             addInstr(env, AMD64Instr_XDirect(stmt->Ist.Exit.dst->Ico.U64,
5038                                              amRIP, cc, toFastEP));
5039          } else {
5040             /* .. very occasionally .. */
5041             /* We can't use chaining, so ask for an assisted transfer,
5042                as that's the only alternative that is allowable. */
5043             HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
5044             addInstr(env, AMD64Instr_XAssisted(r, amRIP, cc, Ijk_Boring));
5045          }
5046          return;
5047       }
5048
5049       /* Case: assisted transfer to arbitrary address */
5050       switch (stmt->Ist.Exit.jk) {
5051          /* Keep this list in sync with that in iselNext below */
5052          case Ijk_ClientReq:
5053          case Ijk_EmWarn:
5054          case Ijk_NoDecode:
5055          case Ijk_NoRedir:
5056          case Ijk_SigSEGV:
5057          case Ijk_SigTRAP:
5058          case Ijk_Sys_syscall:
5059          case Ijk_Sys_int210:
5060          case Ijk_InvalICache:
5061          case Ijk_Yield:
5062          {
5063             HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
5064             addInstr(env, AMD64Instr_XAssisted(r, amRIP, cc, stmt->Ist.Exit.jk));
5065             return;
5066          }
5067          default:
5068             break;
5069       }
5070
5071       /* Do we ever expect to see any other kind? */
5072       goto stmt_fail;
5073    }
5074
5075    default: break;
5076    }
5077   stmt_fail:
5078    ppIRStmt(stmt);
5079    vpanic("iselStmt(amd64)");
5080 }
5081
5082
5083 /*---------------------------------------------------------*/
5084 /*--- ISEL: Basic block terminators (Nexts)             ---*/
5085 /*---------------------------------------------------------*/
5086
5087 static void iselNext ( ISelEnv* env,
5088                        IRExpr* next, IRJumpKind jk, Int offsIP )
5089 {
5090    if (vex_traceflags & VEX_TRACE_VCODE) {
5091       vex_printf( "\n-- PUT(%d) = ", offsIP);
5092       ppIRExpr( next );
5093       vex_printf( "; exit-");
5094       ppIRJumpKind(jk);
5095       vex_printf( "\n");
5096    }
5097
5098    /* Case: boring transfer to known address */
5099    if (next->tag == Iex_Const) {
5100       IRConst* cdst = next->Iex.Const.con;
5101       vassert(cdst->tag == Ico_U64);
5102       if (jk == Ijk_Boring || jk == Ijk_Call) {
5103          /* Boring transfer to known address */
5104          AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
5105          if (env->chainingAllowed) {
5106             /* .. almost always true .. */
5107             /* Skip the event check at the dst if this is a forwards
5108                edge. */
5109             Bool toFastEP
5110                = ((Addr64)cdst->Ico.U64) > env->max_ga;
5111             if (0) vex_printf("%s", toFastEP ? "X" : ".");
5112             addInstr(env, AMD64Instr_XDirect(cdst->Ico.U64,
5113                                              amRIP, Acc_ALWAYS,
5114                                              toFastEP));
5115          } else {
5116             /* .. very occasionally .. */
5117             /* We can't use chaining, so ask for an indirect transfer,
5118                as that's the cheapest alternative that is
5119                allowable. */
5120             HReg r = iselIntExpr_R(env, next);
5121             addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS,
5122                                                Ijk_Boring));
5123          }
5124          return;
5125       }
5126    }
5127
5128    /* Case: call/return (==boring) transfer to any address */
5129    switch (jk) {
5130       case Ijk_Boring: case Ijk_Ret: case Ijk_Call: {
5131          HReg        r     = iselIntExpr_R(env, next);
5132          AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
5133          if (env->chainingAllowed) {
5134             addInstr(env, AMD64Instr_XIndir(r, amRIP, Acc_ALWAYS));
5135          } else {
5136             addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS,
5137                                                Ijk_Boring));
5138          }
5139          return;
5140       }
5141       default:
5142          break;
5143    }
5144
5145    /* Case: assisted transfer to arbitrary address */
5146    switch (jk) {
5147       /* Keep this list in sync with that for Ist_Exit above */
5148       case Ijk_ClientReq:
5149       case Ijk_EmWarn:
5150       case Ijk_NoDecode:
5151       case Ijk_NoRedir:
5152       case Ijk_SigSEGV:
5153       case Ijk_SigTRAP:
5154       case Ijk_Sys_syscall:
5155       case Ijk_Sys_int210:
5156       case Ijk_InvalICache:
5157       case Ijk_Yield: {
5158          HReg        r     = iselIntExpr_R(env, next);
5159          AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
5160          addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS, jk));
5161          return;
5162       }
5163       default:
5164          break;
5165    }
5166
5167    vex_printf( "\n-- PUT(%d) = ", offsIP);
5168    ppIRExpr( next );
5169    vex_printf( "; exit-");
5170    ppIRJumpKind(jk);
5171    vex_printf( "\n");
5172    vassert(0); // are we expecting any other kind?
5173 }
5174
5175
5176 /*---------------------------------------------------------*/
5177 /*--- Insn selector top-level                           ---*/
5178 /*---------------------------------------------------------*/
5179
5180 /* Translate an entire SB to amd64 code. */
5181
5182 HInstrArray* iselSB_AMD64 ( const IRSB* bb,
5183                             VexArch      arch_host,
5184                             const VexArchInfo* archinfo_host,
5185                             const VexAbiInfo*  vbi/*UNUSED*/,
5186                             Int offs_Host_EvC_Counter,
5187                             Int offs_Host_EvC_FailAddr,
5188                             Bool chainingAllowed,
5189                             Bool addProfInc,
5190                             Addr max_ga )
5191 {
5192    Int        i, j;
5193    HReg       hreg, hregHI;
5194    ISelEnv*   env;
5195    UInt       hwcaps_host = archinfo_host->hwcaps;
5196    AMD64AMode *amCounter, *amFailAddr;
5197
5198    /* sanity ... */
5199    vassert(arch_host == VexArchAMD64);
5200    vassert(0 == (hwcaps_host
5201                  & ~(VEX_HWCAPS_AMD64_SSE3
5202                      | VEX_HWCAPS_AMD64_SSSE3
5203                      | VEX_HWCAPS_AMD64_CX16
5204                      | VEX_HWCAPS_AMD64_LZCNT
5205                      | VEX_HWCAPS_AMD64_AVX
5206                      | VEX_HWCAPS_AMD64_RDTSCP
5207                      | VEX_HWCAPS_AMD64_BMI
5208                      | VEX_HWCAPS_AMD64_AVX2
5209                      | VEX_HWCAPS_AMD64_F16C
5210                      | VEX_HWCAPS_AMD64_RDRAND)));
5211
5212    /* Check that the host's endianness is as expected. */
5213    vassert(archinfo_host->endness == VexEndnessLE);
5214
5215    /* Make up an initial environment to use. */
5216    env = LibVEX_Alloc_inline(sizeof(ISelEnv));
5217    env->vreg_ctr = 0;
5218
5219    /* Set up output code array. */
5220    env->code = newHInstrArray();
5221
5222    /* Copy BB's type env. */
5223    env->type_env = bb->tyenv;
5224
5225    /* Make up an IRTemp -> virtual HReg mapping.  This doesn't
5226       change as we go along. */
5227    env->n_vregmap = bb->tyenv->types_used;
5228    env->vregmap   = LibVEX_Alloc_inline(env->n_vregmap * sizeof(HReg));
5229    env->vregmapHI = LibVEX_Alloc_inline(env->n_vregmap * sizeof(HReg));
5230
5231    /* and finally ... */
5232    env->chainingAllowed = chainingAllowed;
5233    env->hwcaps          = hwcaps_host;
5234    env->max_ga          = max_ga;
5235
5236    /* For each IR temporary, allocate a suitably-kinded virtual
5237       register. */
5238    j = 0;
5239    for (i = 0; i < env->n_vregmap; i++) {
5240       hregHI = hreg = INVALID_HREG;
5241       switch (bb->tyenv->types[i]) {
5242          case Ity_I1:
5243          case Ity_I8: case Ity_I16: case Ity_I32: case Ity_I64:
5244             hreg = mkHReg(True, HRcInt64, 0, j++);
5245             break;
5246          case Ity_I128:
5247             hreg   = mkHReg(True, HRcInt64, 0, j++);
5248             hregHI = mkHReg(True, HRcInt64, 0, j++);
5249             break;
5250          case Ity_F32:
5251          case Ity_F64:
5252          case Ity_V128:
5253             hreg = mkHReg(True, HRcVec128, 0, j++);
5254             break;
5255          case Ity_V256:
5256             hreg   = mkHReg(True, HRcVec128, 0, j++);
5257             hregHI = mkHReg(True, HRcVec128, 0, j++);
5258             break;
5259          default:
5260             ppIRType(bb->tyenv->types[i]);
5261             vpanic("iselBB(amd64): IRTemp type");
5262       }
5263       env->vregmap[i]   = hreg;
5264       env->vregmapHI[i] = hregHI;
5265    }
5266    env->vreg_ctr = j;
5267
5268    /* The very first instruction must be an event check. */
5269    amCounter  = AMD64AMode_IR(offs_Host_EvC_Counter,  hregAMD64_RBP());
5270    amFailAddr = AMD64AMode_IR(offs_Host_EvC_FailAddr, hregAMD64_RBP());
5271    addInstr(env, AMD64Instr_EvCheck(amCounter, amFailAddr));
5272
5273    /* Possibly a block counter increment (for profiling).  At this
5274       point we don't know the address of the counter, so just pretend
5275       it is zero.  It will have to be patched later, but before this
5276       translation is used, by a call to LibVEX_patchProfCtr. */
5277    if (addProfInc) {
5278       addInstr(env, AMD64Instr_ProfInc());
5279    }
5280
5281    /* Ok, finally we can iterate over the statements. */
5282    for (i = 0; i < bb->stmts_used; i++)
5283       if (bb->stmts[i])
5284          iselStmt(env, bb->stmts[i]);
5285
5286    iselNext(env, bb->next, bb->jumpkind, bb->offsIP);
5287
5288    /* record the number of vregs we used. */
5289    env->code->n_vregs = env->vreg_ctr;
5290    return env->code;
5291 }
5292
5293
5294 /*---------------------------------------------------------------*/
5295 /*--- end                                   host_amd64_isel.c ---*/
5296 /*---------------------------------------------------------------*/