Add a test program for the membarrier() system call
[valgrind.git] / VEX / priv / host_arm64_isel.c
blob50f9205d1d5d22c9ff949c9a05f90eeb64362c8b
2 /*---------------------------------------------------------------*/
3 /*--- begin host_arm64_isel.c ---*/
4 /*---------------------------------------------------------------*/
6 /*
7 This file is part of Valgrind, a dynamic binary instrumentation
8 framework.
10 Copyright (C) 2013-2017 OpenWorks
11 info@open-works.net
13 This program is free software; you can redistribute it and/or
14 modify it under the terms of the GNU General Public License as
15 published by the Free Software Foundation; either version 2 of the
16 License, or (at your option) any later version.
18 This program is distributed in the hope that it will be useful, but
19 WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 General Public License for more details.
23 You should have received a copy of the GNU General Public License
24 along with this program; if not, write to the Free Software
25 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26 02110-1301, USA.
28 The GNU General Public License is contained in the file COPYING.
31 #include "libvex_basictypes.h"
32 #include "libvex_ir.h"
33 #include "libvex.h"
34 #include "ir_match.h"
36 #include "main_util.h"
37 #include "main_globals.h"
38 #include "host_generic_regs.h"
39 #include "host_generic_simd64.h" // for 32-bit SIMD helpers
40 #include "host_arm64_defs.h"
43 /*---------------------------------------------------------*/
44 /*--- ISelEnv ---*/
45 /*---------------------------------------------------------*/
47 /* This carries around:
49 - A mapping from IRTemp to IRType, giving the type of any IRTemp we
50 might encounter. This is computed before insn selection starts,
51 and does not change.
53 - A mapping from IRTemp to HReg. This tells the insn selector
54 which virtual register is associated with each IRTemp temporary.
55 This is computed before insn selection starts, and does not
56 change. We expect this mapping to map precisely the same set of
57 IRTemps as the type mapping does.
59 |vregmap| holds the primary register for the IRTemp.
60 |vregmapHI| is only used for 128-bit integer-typed
61 IRTemps. It holds the identity of a second
62 64-bit virtual HReg, which holds the high half
63 of the value.
65 - The code array, that is, the insns selected so far.
67 - A counter, for generating new virtual registers.
69 - The host hardware capabilities word. This is set at the start
70 and does not change.
72 - A Bool for indicating whether we may generate chain-me
73 instructions for control flow transfers, or whether we must use
74 XAssisted.
76 - The maximum guest address of any guest insn in this block.
77 Actually, the address of the highest-addressed byte from any insn
78 in this block. Is set at the start and does not change. This is
79 used for detecting jumps which are definitely forward-edges from
80 this block, and therefore can be made (chained) to the fast entry
81 point of the destination, thereby avoiding the destination's
82 event check.
84 - An IRExpr*, which may be NULL, holding the IR expression (an
85 IRRoundingMode-encoded value) to which the FPU's rounding mode
86 was most recently set. Setting to NULL is always safe. Used to
87 avoid redundant settings of the FPU's rounding mode, as
88 described in set_FPCR_rounding_mode below.
90 Note, this is all (well, mostly) host-independent.
typedef
   struct {
      /* Constant -- are set at the start and do not change. */
      IRTypeEnv*   type_env;    /* IRTemp -> IRType map for this block */

      HReg*        vregmap;     /* IRTemp -> primary virtual register */
      HReg*        vregmapHI;   /* second (high-half) vreg, used only for
                                   128-bit integer-typed IRTemps */
      Int          n_vregmap;   /* number of entries in the two maps */

      UInt         hwcaps;      /* host hardware capabilities word */

      Bool         chainingAllowed; /* may we generate chain-me transfers? */
      Addr64       max_ga;      /* highest guest insn address in this block */

      /* These are modified as we go along. */
      HInstrArray* code;        /* the insns selected so far */
      Int          vreg_ctr;    /* counter for allocating new vregs */

      IRExpr*      previous_rm; /* last rounding-mode expr written to FPCR,
                                   or NULL; see set_FPCR_rounding_mode */
   }
   ISelEnv;
115 static HReg lookupIRTemp ( ISelEnv* env, IRTemp tmp )
117 vassert(tmp >= 0);
118 vassert(tmp < env->n_vregmap);
119 return env->vregmap[tmp];
122 static void lookupIRTempPair ( HReg* vrHI, HReg* vrLO,
123 ISelEnv* env, IRTemp tmp )
125 vassert(tmp >= 0);
126 vassert(tmp < env->n_vregmap);
127 vassert(! hregIsInvalid(env->vregmapHI[tmp]));
128 *vrLO = env->vregmap[tmp];
129 *vrHI = env->vregmapHI[tmp];
132 static void addInstr ( ISelEnv* env, ARM64Instr* instr )
134 addHInstr(env->code, instr);
135 if (vex_traceflags & VEX_TRACE_VCODE) {
136 ppARM64Instr(instr);
137 vex_printf("\n");
141 static HReg newVRegI ( ISelEnv* env )
143 HReg reg = mkHReg(True/*virtual reg*/, HRcInt64, 0, env->vreg_ctr);
144 env->vreg_ctr++;
145 return reg;
148 static HReg newVRegD ( ISelEnv* env )
150 HReg reg = mkHReg(True/*virtual reg*/, HRcFlt64, 0, env->vreg_ctr);
151 env->vreg_ctr++;
152 return reg;
155 static HReg newVRegV ( ISelEnv* env )
157 HReg reg = mkHReg(True/*virtual reg*/, HRcVec128, 0, env->vreg_ctr);
158 env->vreg_ctr++;
159 return reg;
163 /*---------------------------------------------------------*/
164 /*--- ISEL: Forward declarations ---*/
165 /*---------------------------------------------------------*/
167 /* These are organised as iselXXX and iselXXX_wrk pairs. The
168 iselXXX_wrk do the real work, but are not to be called directly.
169 For each XXX, iselXXX calls its iselXXX_wrk counterpart, then
170 checks that all returned registers are virtual. You should not
171 call the _wrk version directly.
173 Because some forms of ARM64 memory amodes are implicitly scaled by
174 the access size, iselIntExpr_AMode takes an IRType which tells it
175 the type of the access for which the amode is to be used. This
176 type needs to be correct, else you'll get incorrect code.
178 static ARM64AMode* iselIntExpr_AMode_wrk ( ISelEnv* env,
179 IRExpr* e, IRType dty );
180 static ARM64AMode* iselIntExpr_AMode ( ISelEnv* env,
181 IRExpr* e, IRType dty );
183 static ARM64RIA* iselIntExpr_RIA_wrk ( ISelEnv* env, IRExpr* e );
184 static ARM64RIA* iselIntExpr_RIA ( ISelEnv* env, IRExpr* e );
186 static ARM64RIL* iselIntExpr_RIL_wrk ( ISelEnv* env, IRExpr* e );
187 static ARM64RIL* iselIntExpr_RIL ( ISelEnv* env, IRExpr* e );
189 static ARM64RI6* iselIntExpr_RI6_wrk ( ISelEnv* env, IRExpr* e );
190 static ARM64RI6* iselIntExpr_RI6 ( ISelEnv* env, IRExpr* e );
192 static ARM64CondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e );
193 static ARM64CondCode iselCondCode ( ISelEnv* env, IRExpr* e );
195 static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e );
196 static HReg iselIntExpr_R ( ISelEnv* env, IRExpr* e );
198 static void iselInt128Expr_wrk ( /*OUT*/HReg* rHi, HReg* rLo,
199 ISelEnv* env, IRExpr* e );
200 static void iselInt128Expr ( /*OUT*/HReg* rHi, HReg* rLo,
201 ISelEnv* env, IRExpr* e );
203 static HReg iselDblExpr_wrk ( ISelEnv* env, IRExpr* e );
204 static HReg iselDblExpr ( ISelEnv* env, IRExpr* e );
206 static HReg iselFltExpr_wrk ( ISelEnv* env, IRExpr* e );
207 static HReg iselFltExpr ( ISelEnv* env, IRExpr* e );
209 static HReg iselF16Expr_wrk ( ISelEnv* env, IRExpr* e );
210 static HReg iselF16Expr ( ISelEnv* env, IRExpr* e );
212 static HReg iselV128Expr_wrk ( ISelEnv* env, IRExpr* e );
213 static HReg iselV128Expr ( ISelEnv* env, IRExpr* e );
215 static void iselV256Expr_wrk ( /*OUT*/HReg* rHi, HReg* rLo,
216 ISelEnv* env, IRExpr* e );
217 static void iselV256Expr ( /*OUT*/HReg* rHi, HReg* rLo,
218 ISelEnv* env, IRExpr* e );
220 static ARM64RIL* mb_mkARM64RIL_I ( ULong imm64 );
223 /*---------------------------------------------------------*/
224 /*--- ISEL: Misc helpers ---*/
225 /*---------------------------------------------------------*/
227 /* Generate an amode suitable for a 64-bit sized access relative to
228 the baseblock register (X21). This generates an RI12 amode, which
229 means its scaled by the access size, which is why the access size
230 -- 64 bit -- is stated explicitly here. Consequently |off| needs
231 to be divisible by 8. */
232 static ARM64AMode* mk_baseblock_64bit_access_amode ( UInt off )
234 vassert(off < (8 << 12)); /* otherwise it's unrepresentable */
235 vassert((off & 7) == 0); /* ditto */
236 return ARM64AMode_RI12(hregARM64_X21(), off >> 3, 8/*scale*/);
239 /* Ditto, for 32 bit accesses. */
240 static ARM64AMode* mk_baseblock_32bit_access_amode ( UInt off )
242 vassert(off < (4 << 12)); /* otherwise it's unrepresentable */
243 vassert((off & 3) == 0); /* ditto */
244 return ARM64AMode_RI12(hregARM64_X21(), off >> 2, 4/*scale*/);
247 /* Ditto, for 16 bit accesses. */
248 static ARM64AMode* mk_baseblock_16bit_access_amode ( UInt off )
250 vassert(off < (2 << 12)); /* otherwise it's unrepresentable */
251 vassert((off & 1) == 0); /* ditto */
252 return ARM64AMode_RI12(hregARM64_X21(), off >> 1, 2/*scale*/);
255 /* Ditto, for 8 bit accesses. */
256 static ARM64AMode* mk_baseblock_8bit_access_amode ( UInt off )
258 vassert(off < (1 << 12)); /* otherwise it's unrepresentable */
259 return ARM64AMode_RI12(hregARM64_X21(), off >> 0, 1/*scale*/);
262 static HReg mk_baseblock_128bit_access_addr ( ISelEnv* env, UInt off )
264 vassert(off < (1<<12));
265 HReg r = newVRegI(env);
266 addInstr(env, ARM64Instr_Arith(r, hregARM64_X21(),
267 ARM64RIA_I12(off,0), True/*isAdd*/));
268 return r;
271 static HReg get_baseblock_register ( void )
273 return hregARM64_X21();
276 /* Generate code to zero extend a 32 bit value in 'src' to 64 bits, in
277 a new register, and return the new register. */
278 static HReg widen_z_32_to_64 ( ISelEnv* env, HReg src )
280 HReg dst = newVRegI(env);
281 ARM64RIL* mask = ARM64RIL_I13(1, 0, 31); /* encodes 0xFFFFFFFF */
282 addInstr(env, ARM64Instr_Logic(dst, src, mask, ARM64lo_AND));
283 return dst;
286 /* Generate code to sign extend a 16 bit value in 'src' to 64 bits, in
287 a new register, and return the new register. */
288 static HReg widen_s_16_to_64 ( ISelEnv* env, HReg src )
290 HReg dst = newVRegI(env);
291 ARM64RI6* n48 = ARM64RI6_I6(48);
292 addInstr(env, ARM64Instr_Shift(dst, src, n48, ARM64sh_SHL));
293 addInstr(env, ARM64Instr_Shift(dst, dst, n48, ARM64sh_SAR));
294 return dst;
297 /* Generate code to zero extend a 16 bit value in 'src' to 64 bits, in
298 a new register, and return the new register. */
299 static HReg widen_z_16_to_64 ( ISelEnv* env, HReg src )
301 HReg dst = newVRegI(env);
302 ARM64RIL* mask = ARM64RIL_I13(1, 0, 15); /* encodes 0xFFFF */
303 addInstr(env, ARM64Instr_Logic(dst, src, mask, ARM64lo_AND));
304 return dst;
307 /* Generate code to sign extend a 32 bit value in 'src' to 64 bits, in
308 a new register, and return the new register. */
309 static HReg widen_s_32_to_64 ( ISelEnv* env, HReg src )
311 HReg dst = newVRegI(env);
312 ARM64RI6* n32 = ARM64RI6_I6(32);
313 addInstr(env, ARM64Instr_Shift(dst, src, n32, ARM64sh_SHL));
314 addInstr(env, ARM64Instr_Shift(dst, dst, n32, ARM64sh_SAR));
315 return dst;
318 /* Generate code to sign extend a 8 bit value in 'src' to 64 bits, in
319 a new register, and return the new register. */
320 static HReg widen_s_8_to_64 ( ISelEnv* env, HReg src )
322 HReg dst = newVRegI(env);
323 ARM64RI6* n56 = ARM64RI6_I6(56);
324 addInstr(env, ARM64Instr_Shift(dst, src, n56, ARM64sh_SHL));
325 addInstr(env, ARM64Instr_Shift(dst, dst, n56, ARM64sh_SAR));
326 return dst;
329 static HReg widen_z_8_to_64 ( ISelEnv* env, HReg src )
331 HReg dst = newVRegI(env);
332 ARM64RIL* mask = ARM64RIL_I13(1, 0, 7); /* encodes 0xFF */
333 addInstr(env, ARM64Instr_Logic(dst, src, mask, ARM64lo_AND));
334 return dst;
337 /* Is this IRExpr_Const(IRConst_U64(0)) ? */
338 static Bool isZeroU64 ( IRExpr* e ) {
339 if (e->tag != Iex_Const) return False;
340 IRConst* con = e->Iex.Const.con;
341 vassert(con->tag == Ico_U64);
342 return con->Ico.U64 == 0;
346 /*---------------------------------------------------------*/
347 /*--- ISEL: FP rounding mode helpers ---*/
348 /*---------------------------------------------------------*/
350 /* Set the FP rounding mode: 'mode' is an I32-typed expression
351 denoting a value in the range 0 .. 3, indicating a round mode
352 encoded as per type IRRoundingMode -- the first four values only
353 (Irrm_NEAREST, Irrm_NegINF, Irrm_PosINF, Irrm_ZERO). Set the ARM64
354 FSCR to have the same rounding.
356 For speed & simplicity, we're setting the *entire* FPCR here.
358 Setting the rounding mode is expensive. So this function tries to
359 avoid repeatedly setting the rounding mode to the same thing by
360 first comparing 'mode' to the 'mode' tree supplied in the previous
361 call to this function, if any. (The previous value is stored in
362 env->previous_rm.) If 'mode' is a single IR temporary 't' and
363 env->previous_rm is also just 't', then the setting is skipped.
365 This is safe because of the SSA property of IR: an IR temporary can
366 only be defined once and so will have the same value regardless of
367 where it appears in the block. Cool stuff, SSA.
369 A safety condition: all attempts to set the RM must be aware of
370 this mechanism - by being routed through the functions here.
372 Of course this only helps if blocks where the RM is set more than
373 once and it is set to the same value each time, *and* that value is
374 held in the same IR temporary each time. In order to assure the
375 latter as much as possible, the IR optimiser takes care to do CSE
376 on any block with any sign of floating point activity.
/* Set the FPCR rounding mode from |mode|, an I32-typed IRRoundingMode
   expression (values 0..3 only).  Emits nothing if |mode| is the same
   IR temporary as on the previous call (safe because of SSA: a temp
   has one value for the whole block).  Sets the *entire* FPCR, with
   all non-rounding bits zero. */
static
void set_FPCR_rounding_mode ( ISelEnv* env, IRExpr* mode )
{
   vassert(typeOfIRExpr(env->type_env,mode) == Ity_I32);

   /* Do we need to do anything? */
   if (env->previous_rm
       && env->previous_rm->tag == Iex_RdTmp
       && mode->tag == Iex_RdTmp
       && env->previous_rm->Iex.RdTmp.tmp == mode->Iex.RdTmp.tmp) {
      /* no - setting it to what it was before.  */
      vassert(typeOfIRExpr(env->type_env, env->previous_rm) == Ity_I32);
      return;
   }

   /* No luck - we better set it, and remember what we set it to. */
   env->previous_rm = mode;

   /* Only supporting the rounding-mode bits - the rest of FPCR is set
      to zero - so we can set the whole register at once (faster). */

   /* This isn't simple, because 'mode' carries an IR rounding
      encoding, and we need to translate that to an ARM64 FP one:
      The IR encoding:
         00  to nearest (the default)
         10  to +infinity
         01  to -infinity
         11  to zero
      The ARM64 FP encoding:
         00  to nearest
         01  to +infinity
         10  to -infinity
         11  to zero
      Easy enough to do; just swap the two bits.
   */
   HReg irrm = iselIntExpr_R(env, mode);
   HReg tL   = newVRegI(env);
   HReg tR   = newVRegI(env);
   HReg t3   = newVRegI(env);
   /* tL = irrm << 1;
      tR = irrm >> 1;  if we're lucky, these will issue together
      tL &= 2;
      tR &= 1;         ditto
      t3 = tL | tR;
      t3 <<= 22;
      fmxr fpscr, t3
      NOTE(review): "fmxr fpscr" is the ARM32 mnemonic; the actual
      write is done by ARM64Instr_FPCR below.  The <<22 places the
      two swapped bits at FPCR.RMode -- TODO confirm bit position
      against the ARMv8 ARM. */
   ARM64RIL* ril_one = mb_mkARM64RIL_I(1);
   ARM64RIL* ril_two = mb_mkARM64RIL_I(2);
   vassert(ril_one && ril_two);
   addInstr(env, ARM64Instr_Shift(tL, irrm, ARM64RI6_I6(1), ARM64sh_SHL));
   addInstr(env, ARM64Instr_Shift(tR, irrm, ARM64RI6_I6(1), ARM64sh_SHR));
   addInstr(env, ARM64Instr_Logic(tL, tL, ril_two, ARM64lo_AND));
   addInstr(env, ARM64Instr_Logic(tR, tR, ril_one, ARM64lo_AND));
   addInstr(env, ARM64Instr_Logic(t3, tL, ARM64RIL_R(tR), ARM64lo_OR));
   addInstr(env, ARM64Instr_Shift(t3, t3, ARM64RI6_I6(22), ARM64sh_SHL));
   addInstr(env, ARM64Instr_FPCR(True/*toFPCR*/, t3));
}
438 /*---------------------------------------------------------*/
439 /*--- ISEL: Function call helpers ---*/
440 /*---------------------------------------------------------*/
442 /* Used only in doHelperCall. See big comment in doHelperCall re
443 handling of register-parameter args. This function figures out
444 whether evaluation of an expression might require use of a fixed
445 register. If in doubt return True (safe but suboptimal).
447 static
448 Bool mightRequireFixedRegs ( IRExpr* e )
450 if (UNLIKELY(is_IRExpr_VECRET_or_GSPTR(e))) {
451 // These are always "safe" -- either a copy of SP in some
452 // arbitrary vreg, or a copy of x21, respectively.
453 return False;
455 /* Else it's a "normal" expression. */
456 switch (e->tag) {
457 case Iex_RdTmp: case Iex_Const: case Iex_Get:
458 return False;
459 default:
460 return True;
465 /* Do a complete function call. |guard| is a Ity_Bit expression
466 indicating whether or not the call happens. If guard==NULL, the
467 call is unconditional. |retloc| is set to indicate where the
468 return value is after the call. The caller (of this fn) must
469 generate code to add |stackAdjustAfterCall| to the stack pointer
470 after the call is done. Returns True iff it managed to handle this
471 combination of arg/return types, else returns False. */
/* Do a complete helper-function call.  |guard| is an Ity_Bit guard
   expression, or NULL for an unconditional call.  On success, sets
   |*retloc| to where the return value will be, and
   |*stackAdjustAfterCall| to the SP adjustment the caller must emit
   after the call.  Returns True iff this arg/return-type combination
   was handled. */
static
Bool doHelperCall ( /*OUT*/UInt*   stackAdjustAfterCall,
                    /*OUT*/RetLoc* retloc,
                    ISelEnv* env,
                    IRExpr* guard,
                    IRCallee* cee, IRType retTy, IRExpr** args )
{
   ARM64CondCode cc;
   HReg          argregs[ARM64_N_ARGREGS];
   HReg          tmpregs[ARM64_N_ARGREGS];
   Bool          go_fast;
   Int           n_args, i, nextArgReg;
   Addr64        target;

   vassert(ARM64_N_ARGREGS == 8);

   /* Set default returns.  We'll update them later if needed. */
   *stackAdjustAfterCall = 0;
   *retloc               = mk_RetLoc_INVALID();

   /* These are used for cross-checking that IR-level constraints on
      the use of IRExpr_VECRET() and IRExpr_GSPTR() are observed. */
   UInt nVECRETs = 0;
   UInt nGSPTRs  = 0;

   /* Marshal args for a call and do the call.

      This function only deals with a tiny set of possibilities, which
      cover all helpers in practice.  The restrictions are that only
      arguments in registers are supported, hence only
      ARM64_N_REGPARMS x 64 integer bits in total can be passed.  In
      fact the only supported arg type is I64.

      The return type can be I{64,32} or V128.  In the V128 case, it
      is expected that |args| will contain the special node
      IRExpr_VECRET(), in which case this routine generates code to
      allocate space on the stack for the vector return value.  Since
      we are not passing any scalars on the stack, it is enough to
      preallocate the return space before marshalling any arguments,
      in this case.

      |args| may also contain IRExpr_GSPTR(), in which case the
      value in x21 is passed as the corresponding argument.

      Generating code which is both efficient and correct when
      parameters are to be passed in registers is difficult, for the
      reasons elaborated in detail in comments attached to
      doHelperCall() in priv/host-x86/isel.c.  Here, we use a variant
      of the method described in those comments.

      The problem is split into two cases: the fast scheme and the
      slow scheme.  In the fast scheme, arguments are computed
      directly into the target (real) registers.  This is only safe
      when we can be sure that computation of each argument will not
      trash any real registers set by computation of any other
      argument.

      In the slow scheme, all args are first computed into vregs, and
      once they are all done, they are moved to the relevant real
      regs.  This always gives correct code, but it also gives a bunch
      of vreg-to-rreg moves which are usually redundant but are hard
      for the register allocator to get rid of.

      To decide which scheme to use, all argument expressions are
      first examined.  If they are all so simple that it is clear they
      will be evaluated without use of any fixed registers, use the
      fast scheme, else use the slow scheme.  Note also that only
      unconditional calls may use the fast scheme, since having to
      compute a condition expression could itself trash real
      registers.

      Note this requires being able to examine an expression and
      determine whether or not evaluation of it might use a fixed
      register.  That requires knowledge of how the rest of this insn
      selector works.  Currently just the following 3 are regarded as
      safe -- hopefully they cover the majority of arguments in
      practice: IRExpr_Tmp IRExpr_Const IRExpr_Get.
   */

   /* Note that the cee->regparms field is meaningless on ARM64 hosts
      (since there is only one calling convention) and so we always
      ignore it. */
   n_args = 0;
   for (i = 0; args[i]; i++) {
      IRExpr* arg = args[i];
      if (UNLIKELY(arg->tag == Iex_VECRET)) {
         nVECRETs++;
      } else if (UNLIKELY(arg->tag == Iex_GSPTR)) {
         nGSPTRs++;
      }
      n_args++;
   }

   /* If this fails, the IR is ill-formed */
   vassert(nGSPTRs == 0 || nGSPTRs == 1);

   /* If we have a VECRET, allocate space on the stack for the return
      value, and record the stack pointer after that. */
   HReg r_vecRetAddr = INVALID_HREG;
   if (nVECRETs == 1) {
      vassert(retTy == Ity_V128 || retTy == Ity_V256);
      vassert(retTy != Ity_V256); // we don't handle that yet (if ever)
      r_vecRetAddr = newVRegI(env);
      addInstr(env, ARM64Instr_AddToSP(-16));
      addInstr(env, ARM64Instr_FromSP(r_vecRetAddr));
   } else {
      // If either of these fail, the IR is ill-formed
      vassert(retTy != Ity_V128 && retTy != Ity_V256);
      vassert(nVECRETs == 0);
   }

   /* The AAPCS64 integer argument registers, in order. */
   argregs[0] = hregARM64_X0();
   argregs[1] = hregARM64_X1();
   argregs[2] = hregARM64_X2();
   argregs[3] = hregARM64_X3();
   argregs[4] = hregARM64_X4();
   argregs[5] = hregARM64_X5();
   argregs[6] = hregARM64_X6();
   argregs[7] = hregARM64_X7();

   tmpregs[0] = tmpregs[1] = tmpregs[2] = tmpregs[3] = INVALID_HREG;
   tmpregs[4] = tmpregs[5] = tmpregs[6] = tmpregs[7] = INVALID_HREG;

   /* First decide which scheme (slow or fast) is to be used.  First
      assume the fast scheme, and select slow if any contraindications
      (wow) appear. */

   go_fast = True;

   /* A non-trivially-true guard forces the slow scheme: computing the
      condition later must not trash already-loaded real arg regs. */
   if (guard) {
      if (guard->tag == Iex_Const
          && guard->Iex.Const.con->tag == Ico_U1
          && guard->Iex.Const.con->Ico.U1 == True) {
         /* unconditional */
      } else {
         /* Not manifestly unconditional -- be conservative. */
         go_fast = False;
      }
   }

   if (go_fast) {
      for (i = 0; i < n_args; i++) {
         if (mightRequireFixedRegs(args[i])) {
            go_fast = False;
            break;
         }
      }
   }

   if (go_fast) {
      if (retTy == Ity_V128 || retTy == Ity_V256)
         go_fast = False;
   }

   /* At this point the scheme to use has been established.  Generate
      code to get the arg values into the argument rregs.  If we run
      out of arg regs, give up. */

   if (go_fast) {

      /* FAST SCHEME */
      nextArgReg = 0;

      for (i = 0; i < n_args; i++) {
         IRExpr* arg = args[i];

         IRType  aTy = Ity_INVALID;
         if (LIKELY(!is_IRExpr_VECRET_or_GSPTR(arg)))
            aTy = typeOfIRExpr(env->type_env, args[i]);

         if (nextArgReg >= ARM64_N_ARGREGS)
            return False; /* out of argregs */

         if (aTy == Ity_I64) {
            addInstr(env, ARM64Instr_MovI( argregs[nextArgReg],
                                           iselIntExpr_R(env, args[i]) ));
            nextArgReg++;
         }
         else if (arg->tag == Iex_GSPTR) {
            vassert(0); //ATC
            addInstr(env, ARM64Instr_MovI( argregs[nextArgReg],
                                           hregARM64_X21() ));
            nextArgReg++;
         }
         else if (arg->tag == Iex_VECRET) {
            // because of the go_fast logic above, we can't get here,
            // since vector return values makes us use the slow path
            // instead.
            vassert(0);
         }
         else
            return False; /* unhandled arg type */
      }

      /* Fast scheme only applies for unconditional calls.  Hence: */
      cc = ARM64cc_AL;

   } else {

      /* SLOW SCHEME; move via temporaries */
      nextArgReg = 0;

      for (i = 0; i < n_args; i++) {
         IRExpr* arg = args[i];

         IRType  aTy = Ity_INVALID;
         if (LIKELY(!is_IRExpr_VECRET_or_GSPTR(arg)))
            aTy = typeOfIRExpr(env->type_env, args[i]);

         if (nextArgReg >= ARM64_N_ARGREGS)
            return False; /* out of argregs */

         if (aTy == Ity_I64) {
            tmpregs[nextArgReg] = iselIntExpr_R(env, args[i]);
            nextArgReg++;
         }
         else if (arg->tag == Iex_GSPTR) {
            vassert(0); //ATC
            tmpregs[nextArgReg] = hregARM64_X21();
            nextArgReg++;
         }
         else if (arg->tag == Iex_VECRET) {
            vassert(!hregIsInvalid(r_vecRetAddr));
            tmpregs[nextArgReg] = r_vecRetAddr;
            nextArgReg++;
         }
         else
            return False; /* unhandled arg type */
      }

      /* Now we can compute the condition.  We can't do it earlier
         because the argument computations could trash the condition
         codes.  Be a bit clever to handle the common case where the
         guard is 1:Bit. */
      cc = ARM64cc_AL;
      if (guard) {
         if (guard->tag == Iex_Const
             && guard->Iex.Const.con->tag == Ico_U1
             && guard->Iex.Const.con->Ico.U1 == True) {
            /* unconditional -- do nothing */
         } else {
            cc = iselCondCode( env, guard );
         }
      }

      /* Move the args to their final destinations. */
      for (i = 0; i < nextArgReg; i++) {
         vassert(!(hregIsInvalid(tmpregs[i])));
         /* None of these insns, including any spill code that might
            be generated, may alter the condition codes. */
         addInstr( env, ARM64Instr_MovI( argregs[i], tmpregs[i] ) );
      }

   }

   /* Should be assured by checks above */
   vassert(nextArgReg <= ARM64_N_ARGREGS);

   /* Do final checks, set the return values, and generate the call
      instruction proper. */
   vassert(nGSPTRs == 0 || nGSPTRs == 1);
   vassert(nVECRETs == ((retTy == Ity_V128 || retTy == Ity_V256) ? 1 : 0));
   vassert(*stackAdjustAfterCall == 0);
   vassert(is_RetLoc_INVALID(*retloc));
   switch (retTy) {
      case Ity_INVALID:
         /* Function doesn't return a value. */
         *retloc = mk_RetLoc_simple(RLPri_None);
         break;
      case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
         *retloc = mk_RetLoc_simple(RLPri_Int);
         break;
      case Ity_V128:
         *retloc = mk_RetLoc_spRel(RLPri_V128SpRel, 0);
         *stackAdjustAfterCall = 16;
         break;
      case Ity_V256:
         vassert(0); // ATC
         *retloc = mk_RetLoc_spRel(RLPri_V256SpRel, 0);
         *stackAdjustAfterCall = 32;
         break;
      default:
         /* IR can denote other possible return types, but we don't
            handle those here. */
         vassert(0);
   }

   /* Finally, generate the call itself.  This needs the *retloc value
      set in the switch above, which is why it's at the end. */

   /* nextArgReg doles out argument registers.  Since these are
      assigned in the order x0 .. x7, its numeric value at this point,
      which must be between 0 and 8 inclusive, is going to be equal to
      the number of arg regs in use for the call.  Hence bake that
      number into the call (we'll need to know it when doing register
      allocation, to know what regs the call reads.) */

   target = (Addr)cee->addr;
   addInstr(env, ARM64Instr_Call( cc, target, nextArgReg, *retloc ));

   return True; /* success */
}
778 /*---------------------------------------------------------*/
779 /*--- ISEL: Integer expressions (64/32 bit) ---*/
780 /*---------------------------------------------------------*/
782 /* Select insns for an integer-typed expression, and add them to the
783 code list. Return a reg holding the result. This reg will be a
784 virtual register. THE RETURNED REG MUST NOT BE MODIFIED. If you
785 want to modify it, ask for a new vreg, copy it in there, and modify
786 the copy. The register allocator will do its best to map both
787 vregs to the same real register, so the copies will often disappear
788 later in the game.
790 This should handle expressions of 64- and 32-bit type. All results
791 are returned in a 64-bit register. For 32-bit expressions, the
792 upper 32 bits are arbitrary, so you should mask or sign extend
793 partial values if necessary.
796 /* --------------------- AMode --------------------- */
798 /* Return an AMode which computes the value of the specified
799 expression, possibly also adding insns to the code list as a
800 result. The expression may only be a 64-bit one.
803 static Bool isValidScale ( UChar scale )
805 switch (scale) {
806 case 1: case 2: case 4: case 8: /* case 16: ??*/ return True;
807 default: return False;
/* Sanity-check an amode produced by iselIntExpr_AMode_wrk: the base
   (and index) registers must be 64-bit-integer-class virtual regs,
   and any immediate must lie in its encodable range. */
static Bool sane_AMode ( ARM64AMode* am )
{
   switch (am->tag) {
      case ARM64am_RI9:
         /* reg + simm9, unscaled: immediate must fit in 9 signed bits */
         return
            toBool( hregClass(am->ARM64am.RI9.reg) == HRcInt64
                    && (hregIsVirtual(am->ARM64am.RI9.reg)
                        /* || sameHReg(am->ARM64am.RI9.reg,
                                       hregARM64_X21()) */ )
                    && am->ARM64am.RI9.simm9 >= -256
                    && am->ARM64am.RI9.simm9 <= 255 );
      case ARM64am_RI12:
         /* reg + uimm12, scaled by the access size */
         return
            toBool( hregClass(am->ARM64am.RI12.reg) == HRcInt64
                    && (hregIsVirtual(am->ARM64am.RI12.reg)
                        /* || sameHReg(am->ARM64am.RI12.reg,
                                       hregARM64_X21()) */ )
                    && am->ARM64am.RI12.uimm12 < 4096
                    && isValidScale(am->ARM64am.RI12.szB) );
      case ARM64am_RR:
         /* reg + reg */
         return
            toBool( hregClass(am->ARM64am.RR.base) == HRcInt64
                    && hregIsVirtual(am->ARM64am.RR.base)
                    && hregClass(am->ARM64am.RR.index) == HRcInt64
                    && hregIsVirtual(am->ARM64am.RR.index) );
      default:
         vpanic("sane_AMode: unknown ARM64 AMode1 tag");
   }
}
841 static
842 ARM64AMode* iselIntExpr_AMode ( ISelEnv* env, IRExpr* e, IRType dty )
844 ARM64AMode* am = iselIntExpr_AMode_wrk(env, e, dty);
845 vassert(sane_AMode(am));
846 return am;
/* DO NOT CALL THIS DIRECTLY!  Worker for iselIntExpr_AMode.  Tries,
   in order: reg+simm9 (unscaled), reg+uimm12 (scaled by the access
   size implied by |dty|), reg+reg, and finally falls back to
   computing |e| into a register and using a zero offset. */
static
ARM64AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, IRExpr* e, IRType dty )
{
   IRType ty = typeOfIRExpr(env->type_env,e);
   vassert(ty == Ity_I64);

   /* log2 of the access size in bytes, from |dty| */
   ULong szBbits = 0;
   switch (dty) {
      case Ity_I64: szBbits = 3; break;
      case Ity_I32: szBbits = 2; break;
      case Ity_I16: szBbits = 1; break;
      case Ity_I8:  szBbits = 0; break;
      default: vassert(0);
   }

   /* {Add64,Sub64}(expr,simm9).  We don't care about |dty| here since
      we're going to create an amode suitable for LDU* or STU*
      instructions, which use unscaled immediate offsets.  */
   if (e->tag == Iex_Binop
       && (e->Iex.Binop.op == Iop_Add64 || e->Iex.Binop.op == Iop_Sub64)
       && e->Iex.Binop.arg2->tag == Iex_Const
       && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U64) {
      Long simm = (Long)e->Iex.Binop.arg2->Iex.Const.con->Ico.U64;
      if (simm >= -255 && simm <= 255) {
         /* Although the gating condition might seem to be
               simm >= -256 && simm <= 255
            we will need to negate simm in the case where the op is Sub64.
            Hence limit the lower value to -255 in order that its negation
            is representable. */
         HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg1);
         if (e->Iex.Binop.op == Iop_Sub64) simm = -simm;
         return ARM64AMode_RI9(reg, (Int)simm);
      }
   }

   /* Add64(expr, uimm12 * transfer-size) */
   if (e->tag == Iex_Binop
       && e->Iex.Binop.op == Iop_Add64
       && e->Iex.Binop.arg2->tag == Iex_Const
       && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U64) {
      ULong uimm = e->Iex.Binop.arg2->Iex.Const.con->Ico.U64;
      ULong szB  = 1 << szBbits;
      if (0 == (uimm & (szB-1)) /* "uimm is szB-aligned" */
          && (uimm >> szBbits) < 4096) {
         HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg1);
         return ARM64AMode_RI12(reg, (UInt)(uimm >> szBbits), (UChar)szB);
      }
   }

   /* Add64(expr1, expr2) */
   if (e->tag == Iex_Binop
       && e->Iex.Binop.op == Iop_Add64) {
      HReg reg1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
      HReg reg2 = iselIntExpr_R(env, e->Iex.Binop.arg2);
      return ARM64AMode_RR(reg1, reg2);
   }

   /* Doesn't match anything in particular.  Generate it into
      a register and use that. */
   HReg reg = iselIntExpr_R(env, e);
   return ARM64AMode_RI9(reg, 0);
}
913 /* --------------------- RIA --------------------- */
915 /* Select instructions to generate 'e' into a RIA. */
/* Public wrapper for iselIntExpr_RIA_wrk: selects |e| into a
   reg-or-arith-immediate operand, then sanity-checks the result
   (immediate fits in 12 bits with shift 0 or 12; register is a
   virtual 64-bit integer reg). */
static ARM64RIA* iselIntExpr_RIA ( ISelEnv* env, IRExpr* e )
{
   ARM64RIA* ri = iselIntExpr_RIA_wrk(env, e);
   /* sanity checks ... */
   switch (ri->tag) {
      case ARM64riA_I12:
         vassert(ri->ARM64riA.I12.imm12 < 4096);
         vassert(ri->ARM64riA.I12.shift == 0 || ri->ARM64riA.I12.shift == 12);
         return ri;
      case ARM64riA_R:
         vassert(hregClass(ri->ARM64riA.R.reg) == HRcInt64);
         vassert(hregIsVirtual(ri->ARM64riA.R.reg));
         return ri;
      default:
         vpanic("iselIntExpr_RIA: unknown arm RIA tag");
   }
}
/* DO NOT CALL THIS DIRECTLY ! */
/* Worker for iselIntExpr_RIA: produce an I12 immediate operand when
   the constant fits in bits [11:0] or [23:12], else evaluate |e|
   into a register. */
static ARM64RIA* iselIntExpr_RIA_wrk ( ISelEnv* env, IRExpr* e )
{
   IRType ty = typeOfIRExpr(env->type_env,e);
   vassert(ty == Ity_I64 || ty == Ity_I32);

   /* special case: immediate */
   if (e->tag == Iex_Const) {
      ULong u = 0xF000000ULL; /* invalid */
      switch (e->Iex.Const.con->tag) {
         case Ico_U64: u = e->Iex.Const.con->Ico.U64; break;
         case Ico_U32: u = e->Iex.Const.con->Ico.U32; break;
         default: vpanic("iselIntExpr_RIA.Iex_Const(arm64)");
      }
      /* fits in imm12, shift 0? */
      if (0 == (u & ~(0xFFFULL << 0)))
         return ARM64RIA_I12((UShort)((u >> 0) & 0xFFFULL), 0);
      /* fits in imm12, shift 12? */
      if (0 == (u & ~(0xFFFULL << 12)))
         return ARM64RIA_I12((UShort)((u >> 12) & 0xFFFULL), 12);
      /* else fail, fall through to default case */
   }

   /* default case: calculate into a register and return that */
   {
      HReg r = iselIntExpr_R ( env, e );
      return ARM64RIA_R(r);
   }
}
964 /* --------------------- RIL --------------------- */
966 /* Select instructions to generate 'e' into a RIL. At this point we
967 have to deal with the strange bitfield-immediate encoding for logic
968 instructions. */
971 // The following four functions
972 // CountLeadingZeros CountTrailingZeros CountSetBits isImmLogical
973 // are copied, with modifications, from
974 // https://github.com/armvixl/vixl/blob/master/src/a64/assembler-a64.cc
975 // which has the following copyright notice:
977 Copyright 2013, ARM Limited
978 All rights reserved.
980 Redistribution and use in source and binary forms, with or without
981 modification, are permitted provided that the following conditions are met:
983 * Redistributions of source code must retain the above copyright notice,
984 this list of conditions and the following disclaimer.
985 * Redistributions in binary form must reproduce the above copyright notice,
986 this list of conditions and the following disclaimer in the documentation
987 and/or other materials provided with the distribution.
988 * Neither the name of ARM Limited nor the names of its contributors may be
989 used to endorse or promote products derived from this software without
990 specific prior written permission.
992 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND
993 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
994 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
995 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
996 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
997 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
998 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
999 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
1000 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
1001 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
1004 static Int CountLeadingZeros(ULong value, Int width)
1006 vassert(width == 32 || width == 64);
1007 Int count = 0;
1008 ULong bit_test = 1ULL << (width - 1);
1009 while ((count < width) && ((bit_test & value) == 0)) {
1010 count++;
1011 bit_test >>= 1;
1013 return count;
1016 static Int CountTrailingZeros(ULong value, Int width)
1018 vassert(width == 32 || width == 64);
1019 Int count = 0;
1020 while ((count < width) && (((value >> count) & 1) == 0)) {
1021 count++;
1023 return count;
1026 static Int CountSetBits(ULong value, Int width)
1028 // TODO: Other widths could be added here, as the implementation already
1029 // supports them.
1030 vassert(width == 32 || width == 64);
1032 // Mask out unused bits to ensure that they are not counted.
1033 value &= (0xffffffffffffffffULL >> (64-width));
1035 // Add up the set bits.
1036 // The algorithm works by adding pairs of bit fields together iteratively,
1037 // where the size of each bit field doubles each time.
1038 // An example for an 8-bit value:
1039 // Bits: h g f e d c b a
1040 // \ | \ | \ | \ |
1041 // value = h+g f+e d+c b+a
1042 // \ | \ |
1043 // value = h+g+f+e d+c+b+a
1044 // \ |
1045 // value = h+g+f+e+d+c+b+a
1046 value = ((value >> 1) & 0x5555555555555555ULL)
1047 + (value & 0x5555555555555555ULL);
1048 value = ((value >> 2) & 0x3333333333333333ULL)
1049 + (value & 0x3333333333333333ULL);
1050 value = ((value >> 4) & 0x0f0f0f0f0f0f0f0fULL)
1051 + (value & 0x0f0f0f0f0f0f0f0fULL);
1052 value = ((value >> 8) & 0x00ff00ff00ff00ffULL)
1053 + (value & 0x00ff00ff00ff00ffULL);
1054 value = ((value >> 16) & 0x0000ffff0000ffffULL)
1055 + (value & 0x0000ffff0000ffffULL);
1056 value = ((value >> 32) & 0x00000000ffffffffULL)
1057 + (value & 0x00000000ffffffffULL);
1059 return value;
static Bool isImmLogical ( /*OUT*/UInt* n,
                           /*OUT*/UInt* imm_s, /*OUT*/UInt* imm_r,
                           ULong value, UInt width )
{
  // Test if a given value can be encoded in the immediate field of a
  // logical instruction.

  // If it can be encoded, the function returns true, and values
  // pointed to by n, imm_s and imm_r are updated with immediates
  // encoded in the format required by the corresponding fields in the
  // logical instruction. If it can not be encoded, the function
  // returns false, and the values pointed to by n, imm_s and imm_r
  // are undefined.
  vassert(n != NULL && imm_s != NULL && imm_r != NULL);
  vassert(width == 32 || width == 64);

  // Logical immediates are encoded using parameters n, imm_s and imm_r using
  // the following table:
  //
  //  N   imms    immr    size        S             R
  //  1  ssssss  rrrrrr    64    UInt(ssssss)  UInt(rrrrrr)
  //  0  0sssss  xrrrrr    32    UInt(sssss)   UInt(rrrrr)
  //  0  10ssss  xxrrrr    16    UInt(ssss)    UInt(rrrr)
  //  0  110sss  xxxrrr     8    UInt(sss)     UInt(rrr)
  //  0  1110ss  xxxxrr     4    UInt(ss)      UInt(rr)
  //  0  11110s  xxxxxr     2    UInt(s)       UInt(r)
  // (s bits must not be all set)
  //
  // A pattern is constructed of size bits, where the least significant S+1
  // bits are set. The pattern is rotated right by R, and repeated across a
  // 32 or 64-bit value, depending on destination register width.
  //
  // To test if an arbitrary immediate can be encoded using this scheme, an
  // iterative algorithm is used.
  //
  // TODO: This code does not consider using X/W register overlap to support
  // 64-bit immediates where the top 32-bits are zero, and the bottom 32-bits
  // are an encodable logical immediate.

  // 1. If the value has all set or all clear bits, it can't be encoded.
  if ((value == 0) || (value == 0xffffffffffffffffULL) ||
      ((width == 32) && (value == 0xffffffff))) {
    return False;
  }

  UInt lead_zero  = CountLeadingZeros(value, width);
  UInt lead_one   = CountLeadingZeros(~value, width);
  UInt trail_zero = CountTrailingZeros(value, width);
  UInt trail_one  = CountTrailingZeros(~value, width);
  UInt set_bits   = CountSetBits(value, width);

  // The fixed bits in the immediate s field.
  // If width == 64 (X reg), start at 0xFFFFFF80.
  // If width == 32 (W reg), start at 0xFFFFFFC0, as the iteration for 64-bit
  // widths won't be executed.
  Int imm_s_fixed = (width == 64) ? -128 : -64;
  Int imm_s_mask = 0x3F;

  for (;;) {
    // 2. If the value is two bits wide, it can be encoded.
    if (width == 2) {
      *n = 0;
      *imm_s = 0x3C;
      *imm_r = (value & 3) - 1;
      return True;
    }

    // The output fields are written speculatively here; they are only
    // meaningful if one of the success tests in steps 3/4 below fires
    // on this iteration.
    *n = (width == 64) ? 1 : 0;
    *imm_s = ((imm_s_fixed | (set_bits - 1)) & imm_s_mask);
    if ((lead_zero + set_bits) == width) {
      *imm_r = 0;
    } else {
      *imm_r = (lead_zero > 0) ? (width - trail_zero) : lead_one;
    }

    // 3. If the sum of leading zeros, trailing zeros and set bits is equal to
    //    the bit width of the value, it can be encoded.
    if (lead_zero + trail_zero + set_bits == width) {
      return True;
    }

    // 4. If the sum of leading ones, trailing ones and unset bits in the
    //    value is equal to the bit width of the value, it can be encoded.
    if (lead_one + trail_one + (width - set_bits) == width) {
      return True;
    }

    // 5. If the most-significant half of the bitwise value is equal to the
    //    least-significant half, return to step 2 using the least-significant
    //    half of the value.
    ULong mask = (1ULL << (width >> 1)) - 1;
    if ((value & mask) == ((value >> (width >> 1)) & mask)) {
      width >>= 1;
      set_bits >>= 1;
      imm_s_fixed >>= 1;
      continue;
    }

    // 6. Otherwise, the value can't be encoded.
    return False;
  }
}
1166 /* Create a RIL for the given immediate, if it is representable, or
1167 return NULL if not. */
1169 static ARM64RIL* mb_mkARM64RIL_I ( ULong imm64 )
1171 UInt n = 0, imm_s = 0, imm_r = 0;
1172 Bool ok = isImmLogical(&n, &imm_s, &imm_r, imm64, 64);
1173 if (!ok) return NULL;
1174 vassert(n < 2 && imm_s < 64 && imm_r < 64);
1175 return ARM64RIL_I13(n, imm_r, imm_s);
1178 /* So, finally .. */
1180 static ARM64RIL* iselIntExpr_RIL ( ISelEnv* env, IRExpr* e )
1182 ARM64RIL* ri = iselIntExpr_RIL_wrk(env, e);
1183 /* sanity checks ... */
1184 switch (ri->tag) {
1185 case ARM64riL_I13:
1186 vassert(ri->ARM64riL.I13.bitN < 2);
1187 vassert(ri->ARM64riL.I13.immR < 64);
1188 vassert(ri->ARM64riL.I13.immS < 64);
1189 return ri;
1190 case ARM64riL_R:
1191 vassert(hregClass(ri->ARM64riL.R.reg) == HRcInt64);
1192 vassert(hregIsVirtual(ri->ARM64riL.R.reg));
1193 return ri;
1194 default:
1195 vpanic("iselIntExpr_RIL: unknown arm RIL tag");
1199 /* DO NOT CALL THIS DIRECTLY ! */
1200 static ARM64RIL* iselIntExpr_RIL_wrk ( ISelEnv* env, IRExpr* e )
1202 IRType ty = typeOfIRExpr(env->type_env,e);
1203 vassert(ty == Ity_I64 || ty == Ity_I32);
1205 /* special case: immediate */
1206 if (e->tag == Iex_Const) {
1207 ARM64RIL* maybe = NULL;
1208 if (ty == Ity_I64) {
1209 vassert(e->Iex.Const.con->tag == Ico_U64);
1210 maybe = mb_mkARM64RIL_I(e->Iex.Const.con->Ico.U64);
1211 } else {
1212 vassert(ty == Ity_I32);
1213 vassert(e->Iex.Const.con->tag == Ico_U32);
1214 UInt u32 = e->Iex.Const.con->Ico.U32;
1215 ULong u64 = (ULong)u32;
1216 /* First try with 32 leading zeroes. */
1217 maybe = mb_mkARM64RIL_I(u64);
1218 /* If that doesn't work, try with 2 copies, since it doesn't
1219 matter what winds up in the upper 32 bits. */
1220 if (!maybe) {
1221 maybe = mb_mkARM64RIL_I((u64 << 32) | u64);
1224 if (maybe) return maybe;
1225 /* else fail, fall through to default case */
1228 /* default case: calculate into a register and return that */
1230 HReg r = iselIntExpr_R ( env, e );
1231 return ARM64RIL_R(r);
1236 /* --------------------- RI6 --------------------- */
1238 /* Select instructions to generate 'e' into a RI6. */
1240 static ARM64RI6* iselIntExpr_RI6 ( ISelEnv* env, IRExpr* e )
1242 ARM64RI6* ri = iselIntExpr_RI6_wrk(env, e);
1243 /* sanity checks ... */
1244 switch (ri->tag) {
1245 case ARM64ri6_I6:
1246 vassert(ri->ARM64ri6.I6.imm6 < 64);
1247 vassert(ri->ARM64ri6.I6.imm6 > 0);
1248 return ri;
1249 case ARM64ri6_R:
1250 vassert(hregClass(ri->ARM64ri6.R.reg) == HRcInt64);
1251 vassert(hregIsVirtual(ri->ARM64ri6.R.reg));
1252 return ri;
1253 default:
1254 vpanic("iselIntExpr_RI6: unknown arm RI6 tag");
1258 /* DO NOT CALL THIS DIRECTLY ! */
1259 static ARM64RI6* iselIntExpr_RI6_wrk ( ISelEnv* env, IRExpr* e )
1261 IRType ty = typeOfIRExpr(env->type_env,e);
1262 vassert(ty == Ity_I64 || ty == Ity_I8);
1264 /* special case: immediate */
1265 if (e->tag == Iex_Const) {
1266 switch (e->Iex.Const.con->tag) {
1267 case Ico_U8: {
1268 UInt u = e->Iex.Const.con->Ico.U8;
1269 if (u > 0 && u < 64)
1270 return ARM64RI6_I6(u);
1271 break;
1272 default:
1273 break;
1276 /* else fail, fall through to default case */
1279 /* default case: calculate into a register and return that */
1281 HReg r = iselIntExpr_R ( env, e );
1282 return ARM64RI6_R(r);
1287 /* ------------------- CondCode ------------------- */
1289 /* Generate code to evaluated a bit-typed expression, returning the
1290 condition code which would correspond when the expression would
1291 notionally have returned 1. */
1293 static ARM64CondCode iselCondCode ( ISelEnv* env, IRExpr* e )
1295 ARM64CondCode cc = iselCondCode_wrk(env,e);
1296 vassert(cc != ARM64cc_NV);
1297 return cc;
/* DO NOT CALL THIS DIRECTLY: worker for iselCondCode above. */
static ARM64CondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e )
{
   vassert(e);
   vassert(typeOfIRExpr(env->type_env,e) == Ity_I1);

   /* var */
   if (e->tag == Iex_RdTmp) {
      HReg rTmp = lookupIRTemp(env, e->Iex.RdTmp.tmp);
      /* Cmp doesn't modify rTmp; so this is OK. */
      ARM64RIL* one = mb_mkARM64RIL_I(1);
      vassert(one);
      // TST rTmp, #1 : NE <=> the I1 value is 1
      addInstr(env, ARM64Instr_Test(rTmp, one));
      return ARM64cc_NE;
   }

   /* Not1(e) */
   if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_Not1) {
      /* Generate code for the arg, and negate the test condition */
      ARM64CondCode cc = iselCondCode(env, e->Iex.Unop.arg);
      if (cc == ARM64cc_AL || cc == ARM64cc_NV) {
         return ARM64cc_AL;
      } else {
         // ARM64 condition codes come in even/odd pairs where flipping
         // the bottom bit inverts the condition (EQ<->NE, CS<->CC, ...);
         // AL/NV are excluded above.
         return 1 ^ cc;
      }
   }

   /* --- patterns rooted at: 64to1 --- */

   if (e->tag == Iex_Unop
       && e->Iex.Unop.op == Iop_64to1) {
      HReg rTmp = iselIntExpr_R(env, e->Iex.Unop.arg);
      ARM64RIL* one = mb_mkARM64RIL_I(1);
      vassert(one); /* '1' must be representable */
      addInstr(env, ARM64Instr_Test(rTmp, one));
      return ARM64cc_NE;
   }

   /* --- patterns rooted at: CmpNEZ8 --- */

   if (e->tag == Iex_Unop
       && e->Iex.Unop.op == Iop_CmpNEZ8) {
      HReg r1 = iselIntExpr_R(env, e->Iex.Unop.arg);
      // Test only the low 8 bits.  0xFF is an encodable logical
      // immediate, so this cannot yield NULL.
      ARM64RIL* xFF = mb_mkARM64RIL_I(0xFF);
      addInstr(env, ARM64Instr_Test(r1, xFF));
      return ARM64cc_NE;
   }

   /* --- patterns rooted at: CmpNEZ16 --- */

   if (e->tag == Iex_Unop
       && e->Iex.Unop.op == Iop_CmpNEZ16) {
      HReg r1 = iselIntExpr_R(env, e->Iex.Unop.arg);
      // Test only the low 16 bits; 0xFFFF is likewise encodable.
      ARM64RIL* xFFFF = mb_mkARM64RIL_I(0xFFFF);
      addInstr(env, ARM64Instr_Test(r1, xFFFF));
      return ARM64cc_NE;
   }

   /* --- patterns rooted at: CmpNEZ64 --- */

   if (e->tag == Iex_Unop
       && e->Iex.Unop.op == Iop_CmpNEZ64) {
      HReg r1 = iselIntExpr_R(env, e->Iex.Unop.arg);
      // Compare against zero; NE <=> nonzero.
      ARM64RIA* zero = ARM64RIA_I12(0,0);
      addInstr(env, ARM64Instr_Cmp(r1, zero, True/*is64*/));
      return ARM64cc_NE;
   }

   /* --- patterns rooted at: CmpNEZ32 --- */

   if (e->tag == Iex_Unop
       && e->Iex.Unop.op == Iop_CmpNEZ32) {
      HReg r1 = iselIntExpr_R(env, e->Iex.Unop.arg);
      ARM64RIA* zero = ARM64RIA_I12(0,0);
      // 32-bit compare ignores the upper half of r1.
      addInstr(env, ARM64Instr_Cmp(r1, zero, False/*!is64*/));
      return ARM64cc_NE;
   }

   /* --- Cmp*64*(x,y) --- */
   if (e->tag == Iex_Binop
       && (e->Iex.Binop.op == Iop_CmpEQ64
           || e->Iex.Binop.op == Iop_CmpNE64
           || e->Iex.Binop.op == Iop_CmpLT64S
           || e->Iex.Binop.op == Iop_CmpLT64U
           || e->Iex.Binop.op == Iop_CmpLE64S
           || e->Iex.Binop.op == Iop_CmpLE64U
           || e->Iex.Binop.op == Iop_CasCmpEQ64)) {
      HReg      argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
      ARM64RIA* argR = iselIntExpr_RIA(env, e->Iex.Binop.arg2);
      addInstr(env, ARM64Instr_Cmp(argL, argR, True/*is64*/));
      // Map the IR comparison to the condition that holds after CMP.
      switch (e->Iex.Binop.op) {
         case Iop_CmpEQ64: case Iop_CasCmpEQ64: return ARM64cc_EQ;
         case Iop_CmpNE64:  return ARM64cc_NE;
         case Iop_CmpLT64S: return ARM64cc_LT;
         case Iop_CmpLT64U: return ARM64cc_CC;
         case Iop_CmpLE64S: return ARM64cc_LE;
         case Iop_CmpLE64U: return ARM64cc_LS;
         default: vpanic("iselCondCode(arm64): CmpXX64");
      }
   }

   /* --- Cmp*32*(x,y) --- */
   if (e->tag == Iex_Binop
       && (e->Iex.Binop.op == Iop_CmpEQ32
           || e->Iex.Binop.op == Iop_CmpNE32
           || e->Iex.Binop.op == Iop_CmpLT32S
           || e->Iex.Binop.op == Iop_CmpLT32U
           || e->Iex.Binop.op == Iop_CmpLE32S
           || e->Iex.Binop.op == Iop_CmpLE32U
           || e->Iex.Binop.op == Iop_CasCmpEQ32)) {
      HReg      argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
      ARM64RIA* argR = iselIntExpr_RIA(env, e->Iex.Binop.arg2);
      addInstr(env, ARM64Instr_Cmp(argL, argR, False/*!is64*/));
      switch (e->Iex.Binop.op) {
         case Iop_CmpEQ32: case Iop_CasCmpEQ32: return ARM64cc_EQ;
         case Iop_CmpNE32:  return ARM64cc_NE;
         case Iop_CmpLT32S: return ARM64cc_LT;
         case Iop_CmpLT32U: return ARM64cc_CC;
         case Iop_CmpLE32S: return ARM64cc_LE;
         case Iop_CmpLE32U: return ARM64cc_LS;
         default: vpanic("iselCondCode(arm64): CmpXX32");
      }
   }

   /* --- Cmp*16*(x,y) --- */
   if (e->tag == Iex_Binop
       && (e->Iex.Binop.op == Iop_CasCmpEQ16)) {
      HReg argL  = iselIntExpr_R(env, e->Iex.Binop.arg1);
      HReg argR  = iselIntExpr_R(env, e->Iex.Binop.arg2);
      // No 16-bit compare exists; zero-extend both operands and do a
      // 64-bit compare instead.
      HReg argL2 = widen_z_16_to_64(env, argL);
      HReg argR2 = widen_z_16_to_64(env, argR);
      addInstr(env, ARM64Instr_Cmp(argL2, ARM64RIA_R(argR2), True/*is64*/));
      switch (e->Iex.Binop.op) {
         case Iop_CasCmpEQ16: return ARM64cc_EQ;
         default: vpanic("iselCondCode(arm64): CmpXX16");
      }
   }

   /* --- Cmp*8*(x,y) --- */
   if (e->tag == Iex_Binop
       && (e->Iex.Binop.op == Iop_CasCmpEQ8)) {
      HReg argL  = iselIntExpr_R(env, e->Iex.Binop.arg1);
      HReg argR  = iselIntExpr_R(env, e->Iex.Binop.arg2);
      // Same trick as for 16 bits: widen to 64 and compare there.
      HReg argL2 = widen_z_8_to_64(env, argL);
      HReg argR2 = widen_z_8_to_64(env, argR);
      addInstr(env, ARM64Instr_Cmp(argL2, ARM64RIA_R(argR2), True/*is64*/));
      switch (e->Iex.Binop.op) {
         case Iop_CasCmpEQ8: return ARM64cc_EQ;
         default: vpanic("iselCondCode(arm64): CmpXX8");
      }
   }

   ppIRExpr(e);
   vpanic("iselCondCode");
}
1456 /* --------------------- Reg --------------------- */
1458 static HReg iselIntExpr_R ( ISelEnv* env, IRExpr* e )
1460 HReg r = iselIntExpr_R_wrk(env, e);
1461 /* sanity checks ... */
1462 # if 0
1463 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
1464 # endif
1465 vassert(hregClass(r) == HRcInt64);
1466 vassert(hregIsVirtual(r));
1467 return r;
1470 /* DO NOT CALL THIS DIRECTLY ! */
1471 static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e )
1473 IRType ty = typeOfIRExpr(env->type_env,e);
1474 vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
1476 switch (e->tag) {
1478 /* --------- TEMP --------- */
1479 case Iex_RdTmp: {
1480 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
1483 /* --------- LOAD --------- */
1484 case Iex_Load: {
1485 HReg dst = newVRegI(env);
1487 if (e->Iex.Load.end != Iend_LE)
1488 goto irreducible;
1490 if (ty == Ity_I64) {
1491 ARM64AMode* amode = iselIntExpr_AMode ( env, e->Iex.Load.addr, ty );
1492 addInstr(env, ARM64Instr_LdSt64(True/*isLoad*/, dst, amode));
1493 return dst;
1495 if (ty == Ity_I32) {
1496 ARM64AMode* amode = iselIntExpr_AMode ( env, e->Iex.Load.addr, ty );
1497 addInstr(env, ARM64Instr_LdSt32(True/*isLoad*/, dst, amode));
1498 return dst;
1500 if (ty == Ity_I16) {
1501 ARM64AMode* amode = iselIntExpr_AMode ( env, e->Iex.Load.addr, ty );
1502 addInstr(env, ARM64Instr_LdSt16(True/*isLoad*/, dst, amode));
1503 return dst;
1505 if (ty == Ity_I8) {
1506 ARM64AMode* amode = iselIntExpr_AMode ( env, e->Iex.Load.addr, ty );
1507 addInstr(env, ARM64Instr_LdSt8(True/*isLoad*/, dst, amode));
1508 return dst;
1510 break;
1513 /* --------- BINARY OP --------- */
1514 case Iex_Binop: {
1516 ARM64LogicOp lop = 0; /* invalid */
1517 ARM64ShiftOp sop = 0; /* invalid */
1519 /* Special-case 0-x into a Neg instruction. Not because it's
1520 particularly useful but more so as to give value flow using
1521 this instruction, so as to check its assembly correctness for
1522 implementation of Left32/Left64. */
1523 switch (e->Iex.Binop.op) {
1524 case Iop_Sub64:
1525 if (isZeroU64(e->Iex.Binop.arg1)) {
1526 HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
1527 HReg dst = newVRegI(env);
1528 addInstr(env, ARM64Instr_Unary(dst, argR, ARM64un_NEG));
1529 return dst;
1531 break;
1532 default:
1533 break;
1536 /* ADD/SUB */
1537 switch (e->Iex.Binop.op) {
1538 case Iop_Add64: case Iop_Add32:
1539 case Iop_Sub64: case Iop_Sub32: {
1540 Bool isAdd = e->Iex.Binop.op == Iop_Add64
1541 || e->Iex.Binop.op == Iop_Add32;
1542 HReg dst = newVRegI(env);
1543 HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
1544 ARM64RIA* argR = iselIntExpr_RIA(env, e->Iex.Binop.arg2);
1545 addInstr(env, ARM64Instr_Arith(dst, argL, argR, isAdd));
1546 return dst;
1548 default:
1549 break;
1552 /* AND/OR/XOR */
1553 switch (e->Iex.Binop.op) {
1554 case Iop_And64: case Iop_And32: lop = ARM64lo_AND; goto log_binop;
1555 case Iop_Or64: case Iop_Or32: lop = ARM64lo_OR; goto log_binop;
1556 case Iop_Xor64: case Iop_Xor32: lop = ARM64lo_XOR; goto log_binop;
1557 log_binop: {
1558 HReg dst = newVRegI(env);
1559 HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
1560 ARM64RIL* argR = iselIntExpr_RIL(env, e->Iex.Binop.arg2);
1561 addInstr(env, ARM64Instr_Logic(dst, argL, argR, lop));
1562 return dst;
1564 default:
1565 break;
1568 /* SHL/SHR/SAR */
1569 switch (e->Iex.Binop.op) {
1570 case Iop_Shr64: sop = ARM64sh_SHR; goto sh_binop;
1571 case Iop_Sar64: sop = ARM64sh_SAR; goto sh_binop;
1572 case Iop_Shl64: case Iop_Shl32: sop = ARM64sh_SHL; goto sh_binop;
1573 sh_binop: {
1574 HReg dst = newVRegI(env);
1575 HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
1576 ARM64RI6* argR = iselIntExpr_RI6(env, e->Iex.Binop.arg2);
1577 addInstr(env, ARM64Instr_Shift(dst, argL, argR, sop));
1578 return dst;
1580 case Iop_Shr32:
1581 case Iop_Sar32: {
1582 Bool zx = e->Iex.Binop.op == Iop_Shr32;
1583 HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
1584 ARM64RI6* argR = iselIntExpr_RI6(env, e->Iex.Binop.arg2);
1585 HReg dst = zx ? widen_z_32_to_64(env, argL)
1586 : widen_s_32_to_64(env, argL);
1587 addInstr(env, ARM64Instr_Shift(dst, dst, argR, ARM64sh_SHR));
1588 return dst;
1590 default: break;
1593 /* MUL */
1594 if (e->Iex.Binop.op == Iop_Mul64 || e->Iex.Binop.op == Iop_Mul32) {
1595 HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
1596 HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
1597 HReg dst = newVRegI(env);
1598 addInstr(env, ARM64Instr_Mul(dst, argL, argR, ARM64mul_PLAIN));
1599 return dst;
1602 /* MULL */
1603 if (e->Iex.Binop.op == Iop_MullU32 || e->Iex.Binop.op == Iop_MullS32) {
1604 Bool isS = e->Iex.Binop.op == Iop_MullS32;
1605 HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
1606 HReg extL = (isS ? widen_s_32_to_64 : widen_z_32_to_64)(env, argL);
1607 HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
1608 HReg extR = (isS ? widen_s_32_to_64 : widen_z_32_to_64)(env, argR);
1609 HReg dst = newVRegI(env);
1610 addInstr(env, ARM64Instr_Mul(dst, extL, extR, ARM64mul_PLAIN));
1611 return dst;
1614 /* Handle misc other ops. */
1616 if (e->Iex.Binop.op == Iop_Max32U) {
1617 HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
1618 HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
1619 HReg dst = newVRegI(env);
1620 addInstr(env, ARM64Instr_Cmp(argL, ARM64RIA_R(argR), False/*!is64*/));
1621 addInstr(env, ARM64Instr_CSel(dst, argL, argR, ARM64cc_CS));
1622 return dst;
1625 if (e->Iex.Binop.op == Iop_32HLto64) {
1626 HReg hi32s = iselIntExpr_R(env, e->Iex.Binop.arg1);
1627 HReg lo32s = iselIntExpr_R(env, e->Iex.Binop.arg2);
1628 HReg lo32 = widen_z_32_to_64(env, lo32s);
1629 HReg hi32 = newVRegI(env);
1630 addInstr(env, ARM64Instr_Shift(hi32, hi32s, ARM64RI6_I6(32),
1631 ARM64sh_SHL));
1632 addInstr(env, ARM64Instr_Logic(hi32, hi32, ARM64RIL_R(lo32),
1633 ARM64lo_OR));
1634 return hi32;
1637 if (e->Iex.Binop.op == Iop_CmpF64 || e->Iex.Binop.op == Iop_CmpF32) {
1638 Bool isD = e->Iex.Binop.op == Iop_CmpF64;
1639 HReg dL = (isD ? iselDblExpr : iselFltExpr)(env, e->Iex.Binop.arg1);
1640 HReg dR = (isD ? iselDblExpr : iselFltExpr)(env, e->Iex.Binop.arg2);
1641 HReg dst = newVRegI(env);
1642 HReg imm = newVRegI(env);
1643 /* Do the compare (FCMP), which sets NZCV in PSTATE. Then
1644 create in dst, the IRCmpF64Result encoded result. */
1645 addInstr(env, (isD ? ARM64Instr_VCmpD : ARM64Instr_VCmpS)(dL, dR));
1646 addInstr(env, ARM64Instr_Imm64(dst, 0));
1647 addInstr(env, ARM64Instr_Imm64(imm, 0x40)); // 0x40 = Ircr_EQ
1648 addInstr(env, ARM64Instr_CSel(dst, imm, dst, ARM64cc_EQ));
1649 addInstr(env, ARM64Instr_Imm64(imm, 0x01)); // 0x01 = Ircr_LT
1650 addInstr(env, ARM64Instr_CSel(dst, imm, dst, ARM64cc_MI));
1651 addInstr(env, ARM64Instr_Imm64(imm, 0x00)); // 0x00 = Ircr_GT
1652 addInstr(env, ARM64Instr_CSel(dst, imm, dst, ARM64cc_GT));
1653 addInstr(env, ARM64Instr_Imm64(imm, 0x45)); // 0x45 = Ircr_UN
1654 addInstr(env, ARM64Instr_CSel(dst, imm, dst, ARM64cc_VS));
1655 return dst;
1658 { /* local scope */
1659 ARM64CvtOp cvt_op = ARM64cvt_INVALID;
1660 Bool srcIsD = False;
1661 switch (e->Iex.Binop.op) {
1662 case Iop_F64toI64S:
1663 cvt_op = ARM64cvt_F64_I64S; srcIsD = True; break;
1664 case Iop_F64toI64U:
1665 cvt_op = ARM64cvt_F64_I64U; srcIsD = True; break;
1666 case Iop_F64toI32S:
1667 cvt_op = ARM64cvt_F64_I32S; srcIsD = True; break;
1668 case Iop_F64toI32U:
1669 cvt_op = ARM64cvt_F64_I32U; srcIsD = True; break;
1670 case Iop_F32toI32S:
1671 cvt_op = ARM64cvt_F32_I32S; srcIsD = False; break;
1672 case Iop_F32toI32U:
1673 cvt_op = ARM64cvt_F32_I32U; srcIsD = False; break;
1674 case Iop_F32toI64S:
1675 cvt_op = ARM64cvt_F32_I64S; srcIsD = False; break;
1676 case Iop_F32toI64U:
1677 cvt_op = ARM64cvt_F32_I64U; srcIsD = False; break;
1678 default:
1679 break;
1681 if (cvt_op != ARM64cvt_INVALID) {
1682 /* This is all a bit dodgy, because we can't handle a
1683 non-constant (not-known-at-JIT-time) rounding mode
1684 indication. That's because there's no instruction
1685 AFAICS that does this conversion but rounds according to
1686 FPCR.RM, so we have to bake the rounding mode into the
1687 instruction right now. But that should be OK because
1688 (1) the front end attaches a literal Irrm_ value to the
1689 conversion binop, and (2) iropt will never float that
1690 off via CSE, into a literal. Hence we should always
1691 have an Irrm_ value as the first arg. */
1692 IRExpr* arg1 = e->Iex.Binop.arg1;
1693 if (arg1->tag != Iex_Const) goto irreducible;
1694 IRConst* arg1con = arg1->Iex.Const.con;
1695 vassert(arg1con->tag == Ico_U32); // else ill-typed IR
1696 UInt irrm = arg1con->Ico.U32;
1697 /* Find the ARM-encoded equivalent for |irrm|. */
1698 UInt armrm = 4; /* impossible */
1699 switch (irrm) {
1700 case Irrm_NEAREST: armrm = 0; break;
1701 case Irrm_NegINF: armrm = 2; break;
1702 case Irrm_PosINF: armrm = 1; break;
1703 case Irrm_ZERO: armrm = 3; break;
1704 default: goto irreducible;
1706 HReg src = (srcIsD ? iselDblExpr : iselFltExpr)
1707 (env, e->Iex.Binop.arg2);
1708 HReg dst = newVRegI(env);
1709 addInstr(env, ARM64Instr_VCvtF2I(cvt_op, dst, src, armrm));
1710 return dst;
1712 } /* local scope */
1714 /* All cases involving host-side helper calls. */
1715 void* fn = NULL;
1716 switch (e->Iex.Binop.op) {
1717 case Iop_DivU32:
1718 fn = &h_calc_udiv32_w_arm_semantics; break;
1719 case Iop_DivS32:
1720 fn = &h_calc_sdiv32_w_arm_semantics; break;
1721 case Iop_DivU64:
1722 fn = &h_calc_udiv64_w_arm_semantics; break;
1723 case Iop_DivS64:
1724 fn = &h_calc_sdiv64_w_arm_semantics; break;
1725 default:
1726 break;
1729 if (fn) {
1730 HReg regL = iselIntExpr_R(env, e->Iex.Binop.arg1);
1731 HReg regR = iselIntExpr_R(env, e->Iex.Binop.arg2);
1732 HReg res = newVRegI(env);
1733 addInstr(env, ARM64Instr_MovI(hregARM64_X0(), regL));
1734 addInstr(env, ARM64Instr_MovI(hregARM64_X1(), regR));
1735 addInstr(env, ARM64Instr_Call( ARM64cc_AL, (Addr)fn,
1736 2, mk_RetLoc_simple(RLPri_Int) ));
1737 addInstr(env, ARM64Instr_MovI(res, hregARM64_X0()));
1738 return res;
1741 break;
1744 /* --------- UNARY OP --------- */
1745 case Iex_Unop: {
1747 switch (e->Iex.Unop.op) {
1748 case Iop_16Uto64: {
1749 /* This probably doesn't occur often enough to be worth
1750 rolling the extension into the load. */
1751 IRExpr* arg = e->Iex.Unop.arg;
1752 HReg src = iselIntExpr_R(env, arg);
1753 HReg dst = widen_z_16_to_64(env, src);
1754 return dst;
1756 case Iop_32Uto64: {
1757 IRExpr* arg = e->Iex.Unop.arg;
1758 if (arg->tag == Iex_Load) {
1759 /* This correctly zero extends because _LdSt32 is
1760 defined to do a zero extending load. */
1761 HReg dst = newVRegI(env);
1762 ARM64AMode* am
1763 = iselIntExpr_AMode(env, arg->Iex.Load.addr, Ity_I32);
1764 addInstr(env, ARM64Instr_LdSt32(True/*isLoad*/, dst, am));
1765 return dst;
1767 /* else be lame and mask it */
1768 HReg src = iselIntExpr_R(env, arg);
1769 HReg dst = widen_z_32_to_64(env, src);
1770 return dst;
1772 case Iop_8Uto32: /* Just freeload on the 8Uto64 case */
1773 case Iop_8Uto64: {
1774 IRExpr* arg = e->Iex.Unop.arg;
1775 if (arg->tag == Iex_Load) {
1776 /* This correctly zero extends because _LdSt8 is
1777 defined to do a zero extending load. */
1778 HReg dst = newVRegI(env);
1779 ARM64AMode* am
1780 = iselIntExpr_AMode(env, arg->Iex.Load.addr, Ity_I8);
1781 addInstr(env, ARM64Instr_LdSt8(True/*isLoad*/, dst, am));
1782 return dst;
1784 /* else be lame and mask it */
1785 HReg src = iselIntExpr_R(env, arg);
1786 HReg dst = widen_z_8_to_64(env, src);
1787 return dst;
1789 case Iop_128HIto64: {
1790 HReg rHi, rLo;
1791 iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
1792 return rHi; /* and abandon rLo */
1794 case Iop_8Sto32: case Iop_8Sto64: {
1795 IRExpr* arg = e->Iex.Unop.arg;
1796 HReg src = iselIntExpr_R(env, arg);
1797 HReg dst = widen_s_8_to_64(env, src);
1798 return dst;
1800 case Iop_16Sto32: case Iop_16Sto64: {
1801 IRExpr* arg = e->Iex.Unop.arg;
1802 HReg src = iselIntExpr_R(env, arg);
1803 HReg dst = widen_s_16_to_64(env, src);
1804 return dst;
1806 case Iop_32Sto64: {
1807 IRExpr* arg = e->Iex.Unop.arg;
1808 HReg src = iselIntExpr_R(env, arg);
1809 HReg dst = widen_s_32_to_64(env, src);
1810 return dst;
1812 case Iop_Not32:
1813 case Iop_Not64: {
1814 HReg dst = newVRegI(env);
1815 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1816 addInstr(env, ARM64Instr_Unary(dst, src, ARM64un_NOT));
1817 return dst;
1819 case Iop_Clz64: {
1820 HReg dst = newVRegI(env);
1821 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1822 addInstr(env, ARM64Instr_Unary(dst, src, ARM64un_CLZ));
1823 return dst;
1825 case Iop_Left32:
1826 case Iop_Left64: {
1827 /* Left64(src) = src | -src. Left32 can use the same
1828 implementation since in that case we don't care what
1829 the upper 32 bits become. */
1830 HReg dst = newVRegI(env);
1831 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1832 addInstr(env, ARM64Instr_Unary(dst, src, ARM64un_NEG));
1833 addInstr(env, ARM64Instr_Logic(dst, dst, ARM64RIL_R(src),
1834 ARM64lo_OR));
1835 return dst;
1837 case Iop_CmpwNEZ64: {
1838 /* CmpwNEZ64(src) = (src == 0) ? 0...0 : 1...1
1839 = Left64(src) >>s 63 */
1840 HReg dst = newVRegI(env);
1841 HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
1842 addInstr(env, ARM64Instr_Unary(dst, src, ARM64un_NEG));
1843 addInstr(env, ARM64Instr_Logic(dst, dst, ARM64RIL_R(src),
1844 ARM64lo_OR));
1845 addInstr(env, ARM64Instr_Shift(dst, dst, ARM64RI6_I6(63),
1846 ARM64sh_SAR));
1847 return dst;
1849 case Iop_CmpwNEZ32: {
1850 /* CmpwNEZ32(src) = CmpwNEZ64(src & 0xFFFFFFFF)
1851 = Left64(src & 0xFFFFFFFF) >>s 63 */
1852 HReg dst = newVRegI(env);
1853 HReg pre = iselIntExpr_R(env, e->Iex.Unop.arg);
1854 HReg src = widen_z_32_to_64(env, pre);
1855 addInstr(env, ARM64Instr_Unary(dst, src, ARM64un_NEG));
1856 addInstr(env, ARM64Instr_Logic(dst, dst, ARM64RIL_R(src),
1857 ARM64lo_OR));
1858 addInstr(env, ARM64Instr_Shift(dst, dst, ARM64RI6_I6(63),
1859 ARM64sh_SAR));
1860 return dst;
1862 case Iop_V128to64: case Iop_V128HIto64: {
1863 HReg dst = newVRegI(env);
1864 HReg src = iselV128Expr(env, e->Iex.Unop.arg);
1865 UInt laneNo = (e->Iex.Unop.op == Iop_V128HIto64) ? 1 : 0;
1866 addInstr(env, ARM64Instr_VXfromQ(dst, src, laneNo));
1867 return dst;
1869 case Iop_ReinterpF64asI64: {
1870 HReg dst = newVRegI(env);
1871 HReg src = iselDblExpr(env, e->Iex.Unop.arg);
1872 addInstr(env, ARM64Instr_VXfromDorS(dst, src, True/*fromD*/));
1873 return dst;
1875 case Iop_ReinterpF32asI32: {
1876 HReg dst = newVRegI(env);
1877 HReg src = iselFltExpr(env, e->Iex.Unop.arg);
1878 addInstr(env, ARM64Instr_VXfromDorS(dst, src, False/*!fromD*/));
1879 return dst;
1881 case Iop_1Sto16:
1882 case Iop_1Sto32:
1883 case Iop_1Sto64: {
1884 /* As with the iselStmt case for 'tmp:I1 = expr', we could
1885 do a lot better here if it ever became necessary. */
1886 HReg zero = newVRegI(env);
1887 HReg one = newVRegI(env);
1888 HReg dst = newVRegI(env);
1889 addInstr(env, ARM64Instr_Imm64(zero, 0));
1890 addInstr(env, ARM64Instr_Imm64(one, 1));
1891 ARM64CondCode cc = iselCondCode(env, e->Iex.Unop.arg);
1892 addInstr(env, ARM64Instr_CSel(dst, one, zero, cc));
1893 addInstr(env, ARM64Instr_Shift(dst, dst, ARM64RI6_I6(63),
1894 ARM64sh_SHL));
1895 addInstr(env, ARM64Instr_Shift(dst, dst, ARM64RI6_I6(63),
1896 ARM64sh_SAR));
1897 return dst;
1899 case Iop_NarrowUn16to8x8:
1900 case Iop_NarrowUn32to16x4:
1901 case Iop_NarrowUn64to32x2:
1902 case Iop_QNarrowUn16Sto8Sx8:
1903 case Iop_QNarrowUn32Sto16Sx4:
1904 case Iop_QNarrowUn64Sto32Sx2:
1905 case Iop_QNarrowUn16Uto8Ux8:
1906 case Iop_QNarrowUn32Uto16Ux4:
1907 case Iop_QNarrowUn64Uto32Ux2:
1908 case Iop_QNarrowUn16Sto8Ux8:
1909 case Iop_QNarrowUn32Sto16Ux4:
1910 case Iop_QNarrowUn64Sto32Ux2:
1912 HReg src = iselV128Expr(env, e->Iex.Unop.arg);
1913 HReg tmp = newVRegV(env);
1914 HReg dst = newVRegI(env);
1915 UInt dszBlg2 = 3; /* illegal */
1916 ARM64VecNarrowOp op = ARM64vecna_INVALID;
1917 switch (e->Iex.Unop.op) {
1918 case Iop_NarrowUn16to8x8:
1919 dszBlg2 = 0; op = ARM64vecna_XTN; break;
1920 case Iop_NarrowUn32to16x4:
1921 dszBlg2 = 1; op = ARM64vecna_XTN; break;
1922 case Iop_NarrowUn64to32x2:
1923 dszBlg2 = 2; op = ARM64vecna_XTN; break;
1924 case Iop_QNarrowUn16Sto8Sx8:
1925 dszBlg2 = 0; op = ARM64vecna_SQXTN; break;
1926 case Iop_QNarrowUn32Sto16Sx4:
1927 dszBlg2 = 1; op = ARM64vecna_SQXTN; break;
1928 case Iop_QNarrowUn64Sto32Sx2:
1929 dszBlg2 = 2; op = ARM64vecna_SQXTN; break;
1930 case Iop_QNarrowUn16Uto8Ux8:
1931 dszBlg2 = 0; op = ARM64vecna_UQXTN; break;
1932 case Iop_QNarrowUn32Uto16Ux4:
1933 dszBlg2 = 1; op = ARM64vecna_UQXTN; break;
1934 case Iop_QNarrowUn64Uto32Ux2:
1935 dszBlg2 = 2; op = ARM64vecna_UQXTN; break;
1936 case Iop_QNarrowUn16Sto8Ux8:
1937 dszBlg2 = 0; op = ARM64vecna_SQXTUN; break;
1938 case Iop_QNarrowUn32Sto16Ux4:
1939 dszBlg2 = 1; op = ARM64vecna_SQXTUN; break;
1940 case Iop_QNarrowUn64Sto32Ux2:
1941 dszBlg2 = 2; op = ARM64vecna_SQXTUN; break;
1942 default:
1943 vassert(0);
1945 addInstr(env, ARM64Instr_VNarrowV(op, dszBlg2, tmp, src));
1946 addInstr(env, ARM64Instr_VXfromQ(dst, tmp, 0/*laneNo*/));
1947 return dst;
1949 case Iop_1Uto64: {
1950 /* 1Uto64(tmp). */
1951 HReg dst = newVRegI(env);
1952 if (e->Iex.Unop.arg->tag == Iex_RdTmp) {
1953 ARM64RIL* one = mb_mkARM64RIL_I(1);
1954 HReg src = lookupIRTemp(env, e->Iex.Unop.arg->Iex.RdTmp.tmp);
1955 vassert(one);
1956 addInstr(env, ARM64Instr_Logic(dst, src, one, ARM64lo_AND));
1957 } else {
1958 /* CLONE-01 */
1959 HReg zero = newVRegI(env);
1960 HReg one = newVRegI(env);
1961 addInstr(env, ARM64Instr_Imm64(zero, 0));
1962 addInstr(env, ARM64Instr_Imm64(one, 1));
1963 ARM64CondCode cc = iselCondCode(env, e->Iex.Unop.arg);
1964 addInstr(env, ARM64Instr_CSel(dst, one, zero, cc));
1966 return dst;
1968 case Iop_64to32:
1969 case Iop_64to16:
1970 case Iop_64to8:
1971 /* These are no-ops. */
1972 return iselIntExpr_R(env, e->Iex.Unop.arg);
1974 default:
1975 break;
1978 break;
1981 /* --------- GET --------- */
1982 case Iex_Get: {
1983 if (ty == Ity_I64
1984 && 0 == (e->Iex.Get.offset & 7) && e->Iex.Get.offset < (8<<12)-8) {
1985 HReg dst = newVRegI(env);
1986 ARM64AMode* am
1987 = mk_baseblock_64bit_access_amode(e->Iex.Get.offset);
1988 addInstr(env, ARM64Instr_LdSt64(True/*isLoad*/, dst, am));
1989 return dst;
1991 if (ty == Ity_I32
1992 && 0 == (e->Iex.Get.offset & 3) && e->Iex.Get.offset < (4<<12)-4) {
1993 HReg dst = newVRegI(env);
1994 ARM64AMode* am
1995 = mk_baseblock_32bit_access_amode(e->Iex.Get.offset);
1996 addInstr(env, ARM64Instr_LdSt32(True/*isLoad*/, dst, am));
1997 return dst;
1999 if (ty == Ity_I16
2000 && 0 == (e->Iex.Get.offset & 1) && e->Iex.Get.offset < (2<<12)-2) {
2001 HReg dst = newVRegI(env);
2002 ARM64AMode* am
2003 = mk_baseblock_16bit_access_amode(e->Iex.Get.offset);
2004 addInstr(env, ARM64Instr_LdSt16(True/*isLoad*/, dst, am));
2005 return dst;
2007 if (ty == Ity_I8
2008 /* && no alignment check */ && e->Iex.Get.offset < (1<<12)-1) {
2009 HReg dst = newVRegI(env);
2010 ARM64AMode* am
2011 = mk_baseblock_8bit_access_amode(e->Iex.Get.offset);
2012 addInstr(env, ARM64Instr_LdSt8(True/*isLoad*/, dst, am));
2013 return dst;
2015 break;
2018 /* --------- CCALL --------- */
2019 case Iex_CCall: {
2020 HReg dst = newVRegI(env);
2021 vassert(ty == e->Iex.CCall.retty);
2023 /* be very restrictive for now. Only 64-bit ints allowed for
2024 args, and 64 bits for return type. Don't forget to change
2025 the RetLoc if more types are allowed in future. */
2026 if (e->Iex.CCall.retty != Ity_I64)
2027 goto irreducible;
2029 /* Marshal args, do the call, clear stack. */
2030 UInt addToSp = 0;
2031 RetLoc rloc = mk_RetLoc_INVALID();
2032 Bool ok = doHelperCall( &addToSp, &rloc, env, NULL/*guard*/,
2033 e->Iex.CCall.cee, e->Iex.CCall.retty,
2034 e->Iex.CCall.args );
2035 /* */
2036 if (ok) {
2037 vassert(is_sane_RetLoc(rloc));
2038 vassert(rloc.pri == RLPri_Int);
2039 vassert(addToSp == 0);
2040 addInstr(env, ARM64Instr_MovI(dst, hregARM64_X0()));
2041 return dst;
2043 /* else fall through; will hit the irreducible: label */
2046 /* --------- LITERAL --------- */
2047 /* 64-bit literals */
2048 case Iex_Const: {
2049 ULong u = 0;
2050 HReg dst = newVRegI(env);
2051 switch (e->Iex.Const.con->tag) {
2052 case Ico_U64: u = e->Iex.Const.con->Ico.U64; break;
2053 case Ico_U32: u = e->Iex.Const.con->Ico.U32; break;
2054 case Ico_U16: u = e->Iex.Const.con->Ico.U16; break;
2055 case Ico_U8: u = e->Iex.Const.con->Ico.U8; break;
2056 default: ppIRExpr(e); vpanic("iselIntExpr_R.Iex_Const(arm64)");
2058 addInstr(env, ARM64Instr_Imm64(dst, u));
2059 return dst;
2062 /* --------- MULTIPLEX --------- */
2063 case Iex_ITE: {
2064 /* ITE(ccexpr, iftrue, iffalse) */
2065 if (ty == Ity_I64 || ty == Ity_I32) {
2066 ARM64CondCode cc;
2067 HReg r1 = iselIntExpr_R(env, e->Iex.ITE.iftrue);
2068 HReg r0 = iselIntExpr_R(env, e->Iex.ITE.iffalse);
2069 HReg dst = newVRegI(env);
2070 cc = iselCondCode(env, e->Iex.ITE.cond);
2071 addInstr(env, ARM64Instr_CSel(dst, r1, r0, cc));
2072 return dst;
2074 break;
2077 default:
2078 break;
2079 } /* switch (e->tag) */
2081 /* We get here if no pattern matched. */
2082 irreducible:
2083 ppIRExpr(e);
2084 vpanic("iselIntExpr_R: cannot reduce tree");
2088 /*---------------------------------------------------------*/
2089 /*--- ISEL: Integer expressions (128 bit) ---*/
2090 /*---------------------------------------------------------*/
2092 /* Compute a 128-bit value into a register pair, which is returned as
2093 the first two parameters. As with iselIntExpr_R, these may be
2094 either real or virtual regs; in any case they must not be changed
2095 by subsequent code emitted by the caller. */
2097 static void iselInt128Expr ( HReg* rHi, HReg* rLo,
2098 ISelEnv* env, IRExpr* e )
2100 iselInt128Expr_wrk(rHi, rLo, env, e);
2101 # if 0
2102 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
2103 # endif
2104 vassert(hregClass(*rHi) == HRcInt64);
2105 vassert(hregIsVirtual(*rHi));
2106 vassert(hregClass(*rLo) == HRcInt64);
2107 vassert(hregIsVirtual(*rLo));
2110 /* DO NOT CALL THIS DIRECTLY ! */
2111 static void iselInt128Expr_wrk ( HReg* rHi, HReg* rLo,
2112 ISelEnv* env, IRExpr* e )
2114 vassert(e);
2115 vassert(typeOfIRExpr(env->type_env,e) == Ity_I128);
2117 /* --------- BINARY ops --------- */
2118 if (e->tag == Iex_Binop) {
2119 switch (e->Iex.Binop.op) {
2120 /* 64 x 64 -> 128 multiply */
2121 case Iop_MullU64:
2122 case Iop_MullS64: {
2123 Bool syned = toBool(e->Iex.Binop.op == Iop_MullS64);
2124 HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
2125 HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
2126 HReg dstLo = newVRegI(env);
2127 HReg dstHi = newVRegI(env);
2128 addInstr(env, ARM64Instr_Mul(dstLo, argL, argR,
2129 ARM64mul_PLAIN));
2130 addInstr(env, ARM64Instr_Mul(dstHi, argL, argR,
2131 syned ? ARM64mul_SX : ARM64mul_ZX));
2132 *rHi = dstHi;
2133 *rLo = dstLo;
2134 return;
2136 /* 64HLto128(e1,e2) */
2137 case Iop_64HLto128:
2138 *rHi = iselIntExpr_R(env, e->Iex.Binop.arg1);
2139 *rLo = iselIntExpr_R(env, e->Iex.Binop.arg2);
2140 return;
2141 default:
2142 break;
2144 } /* if (e->tag == Iex_Binop) */
2146 ppIRExpr(e);
2147 vpanic("iselInt128Expr(arm64)");
2151 /*---------------------------------------------------------*/
2152 /*--- ISEL: Vector expressions (128 bit) ---*/
2153 /*---------------------------------------------------------*/
2155 static HReg iselV128Expr ( ISelEnv* env, IRExpr* e )
2157 HReg r = iselV128Expr_wrk( env, e );
2158 vassert(hregClass(r) == HRcVec128);
2159 vassert(hregIsVirtual(r));
2160 return r;
2163 /* DO NOT CALL THIS DIRECTLY */
2164 static HReg iselV128Expr_wrk ( ISelEnv* env, IRExpr* e )
2166 IRType ty = typeOfIRExpr(env->type_env, e);
2167 vassert(e);
2168 vassert(ty == Ity_V128);
2170 if (e->tag == Iex_RdTmp) {
2171 return lookupIRTemp(env, e->Iex.RdTmp.tmp);
2174 if (e->tag == Iex_Const) {
2175 /* Only a very limited range of constants is handled. */
2176 vassert(e->Iex.Const.con->tag == Ico_V128);
2177 UShort con = e->Iex.Const.con->Ico.V128;
2178 HReg res = newVRegV(env);
2179 switch (con) {
2180 case 0x0000: case 0x000F: case 0x003F: case 0x00FF: case 0xFFFF:
2181 addInstr(env, ARM64Instr_VImmQ(res, con));
2182 return res;
2183 case 0x00F0:
2184 addInstr(env, ARM64Instr_VImmQ(res, 0x000F));
2185 addInstr(env, ARM64Instr_VExtV(res, res, res, 12));
2186 return res;
2187 case 0x0F00:
2188 addInstr(env, ARM64Instr_VImmQ(res, 0x000F));
2189 addInstr(env, ARM64Instr_VExtV(res, res, res, 8));
2190 return res;
2191 case 0x0FF0:
2192 addInstr(env, ARM64Instr_VImmQ(res, 0x00FF));
2193 addInstr(env, ARM64Instr_VExtV(res, res, res, 12));
2194 return res;
2195 case 0x0FFF:
2196 addInstr(env, ARM64Instr_VImmQ(res, 0x000F));
2197 addInstr(env, ARM64Instr_VExtV(res, res, res, 4));
2198 addInstr(env, ARM64Instr_VUnaryV(ARM64vecu_NOT, res, res));
2199 return res;
2200 case 0xF000:
2201 addInstr(env, ARM64Instr_VImmQ(res, 0x000F));
2202 addInstr(env, ARM64Instr_VExtV(res, res, res, 4));
2203 return res;
2204 case 0xFF00:
2205 addInstr(env, ARM64Instr_VImmQ(res, 0x00FF));
2206 addInstr(env, ARM64Instr_VExtV(res, res, res, 8));
2207 return res;
2208 default:
2209 break;
2211 /* Unhandled */
2212 goto v128_expr_bad;
2215 if (e->tag == Iex_Load) {
2216 HReg res = newVRegV(env);
2217 HReg rN = iselIntExpr_R(env, e->Iex.Load.addr);
2218 vassert(ty == Ity_V128);
2219 addInstr(env, ARM64Instr_VLdStQ(True/*isLoad*/, res, rN));
2220 return res;
2223 if (e->tag == Iex_Get) {
2224 UInt offs = (UInt)e->Iex.Get.offset;
2225 if (offs < (1<<12)) {
2226 HReg addr = mk_baseblock_128bit_access_addr(env, offs);
2227 HReg res = newVRegV(env);
2228 vassert(ty == Ity_V128);
2229 addInstr(env, ARM64Instr_VLdStQ(True/*isLoad*/, res, addr));
2230 return res;
2232 goto v128_expr_bad;
2235 if (e->tag == Iex_Unop) {
2237 /* Iop_ZeroHIXXofV128 cases */
2238 UShort imm16 = 0;
2239 switch (e->Iex.Unop.op) {
2240 case Iop_ZeroHI64ofV128: imm16 = 0x00FF; break;
2241 case Iop_ZeroHI96ofV128: imm16 = 0x000F; break;
2242 case Iop_ZeroHI112ofV128: imm16 = 0x0003; break;
2243 case Iop_ZeroHI120ofV128: imm16 = 0x0001; break;
2244 default: break;
2246 if (imm16 != 0) {
2247 HReg src = iselV128Expr(env, e->Iex.Unop.arg);
2248 HReg imm = newVRegV(env);
2249 HReg res = newVRegV(env);
2250 addInstr(env, ARM64Instr_VImmQ(imm, imm16));
2251 addInstr(env, ARM64Instr_VBinV(ARM64vecb_AND, res, src, imm));
2252 return res;
2255 /* Other cases */
2256 switch (e->Iex.Unop.op) {
2257 case Iop_NotV128:
2258 case Iop_Abs64Fx2: case Iop_Abs32Fx4:
2259 case Iop_Neg64Fx2: case Iop_Neg32Fx4:
2260 case Iop_Abs64x2: case Iop_Abs32x4:
2261 case Iop_Abs16x8: case Iop_Abs8x16:
2262 case Iop_Cls32x4: case Iop_Cls16x8: case Iop_Cls8x16:
2263 case Iop_Clz32x4: case Iop_Clz16x8: case Iop_Clz8x16:
2264 case Iop_Cnt8x16:
2265 case Iop_Reverse1sIn8_x16:
2266 case Iop_Reverse8sIn16_x8:
2267 case Iop_Reverse8sIn32_x4: case Iop_Reverse16sIn32_x4:
2268 case Iop_Reverse8sIn64_x2: case Iop_Reverse16sIn64_x2:
2269 case Iop_Reverse32sIn64_x2:
2270 case Iop_RecipEst32Ux4:
2271 case Iop_RSqrtEst32Ux4:
2272 case Iop_RecipEst64Fx2: case Iop_RecipEst32Fx4:
2273 case Iop_RSqrtEst64Fx2: case Iop_RSqrtEst32Fx4:
2275 HReg res = newVRegV(env);
2276 HReg arg = iselV128Expr(env, e->Iex.Unop.arg);
2277 Bool setRM = False;
2278 ARM64VecUnaryOp op = ARM64vecu_INVALID;
2279 switch (e->Iex.Unop.op) {
2280 case Iop_NotV128: op = ARM64vecu_NOT; break;
2281 case Iop_Abs64Fx2: op = ARM64vecu_FABS64x2; break;
2282 case Iop_Abs32Fx4: op = ARM64vecu_FABS32x4; break;
2283 case Iop_Neg64Fx2: op = ARM64vecu_FNEG64x2; break;
2284 case Iop_Neg32Fx4: op = ARM64vecu_FNEG32x4; break;
2285 case Iop_Abs64x2: op = ARM64vecu_ABS64x2; break;
2286 case Iop_Abs32x4: op = ARM64vecu_ABS32x4; break;
2287 case Iop_Abs16x8: op = ARM64vecu_ABS16x8; break;
2288 case Iop_Abs8x16: op = ARM64vecu_ABS8x16; break;
2289 case Iop_Cls32x4: op = ARM64vecu_CLS32x4; break;
2290 case Iop_Cls16x8: op = ARM64vecu_CLS16x8; break;
2291 case Iop_Cls8x16: op = ARM64vecu_CLS8x16; break;
2292 case Iop_Clz32x4: op = ARM64vecu_CLZ32x4; break;
2293 case Iop_Clz16x8: op = ARM64vecu_CLZ16x8; break;
2294 case Iop_Clz8x16: op = ARM64vecu_CLZ8x16; break;
2295 case Iop_Cnt8x16: op = ARM64vecu_CNT8x16; break;
2296 case Iop_Reverse1sIn8_x16: op = ARM64vecu_RBIT; break;
2297 case Iop_Reverse8sIn16_x8: op = ARM64vecu_REV1616B; break;
2298 case Iop_Reverse8sIn32_x4: op = ARM64vecu_REV3216B; break;
2299 case Iop_Reverse16sIn32_x4: op = ARM64vecu_REV328H; break;
2300 case Iop_Reverse8sIn64_x2: op = ARM64vecu_REV6416B; break;
2301 case Iop_Reverse16sIn64_x2: op = ARM64vecu_REV648H; break;
2302 case Iop_Reverse32sIn64_x2: op = ARM64vecu_REV644S; break;
2303 case Iop_RecipEst32Ux4: op = ARM64vecu_URECPE32x4; break;
2304 case Iop_RSqrtEst32Ux4: op = ARM64vecu_URSQRTE32x4; break;
2305 case Iop_RecipEst64Fx2: setRM = True;
2306 op = ARM64vecu_FRECPE64x2; break;
2307 case Iop_RecipEst32Fx4: setRM = True;
2308 op = ARM64vecu_FRECPE32x4; break;
2309 case Iop_RSqrtEst64Fx2: setRM = True;
2310 op = ARM64vecu_FRSQRTE64x2; break;
2311 case Iop_RSqrtEst32Fx4: setRM = True;
2312 op = ARM64vecu_FRSQRTE32x4; break;
2313 default: vassert(0);
2315 if (setRM) {
2316 // This is a bit of a kludge. We should do rm properly for
2317 // these recip-est insns, but that would require changing the
2318 // primop's type to take an rmode.
2319 set_FPCR_rounding_mode(env, IRExpr_Const(
2320 IRConst_U32(Irrm_NEAREST)));
2322 addInstr(env, ARM64Instr_VUnaryV(op, res, arg));
2323 return res;
2325 case Iop_CmpNEZ8x16:
2326 case Iop_CmpNEZ16x8:
2327 case Iop_CmpNEZ32x4:
2328 case Iop_CmpNEZ64x2: {
2329 HReg arg = iselV128Expr(env, e->Iex.Unop.arg);
2330 HReg zero = newVRegV(env);
2331 HReg res = newVRegV(env);
2332 ARM64VecBinOp cmp = ARM64vecb_INVALID;
2333 switch (e->Iex.Unop.op) {
2334 case Iop_CmpNEZ64x2: cmp = ARM64vecb_CMEQ64x2; break;
2335 case Iop_CmpNEZ32x4: cmp = ARM64vecb_CMEQ32x4; break;
2336 case Iop_CmpNEZ16x8: cmp = ARM64vecb_CMEQ16x8; break;
2337 case Iop_CmpNEZ8x16: cmp = ARM64vecb_CMEQ8x16; break;
2338 default: vassert(0);
2340 // This is pretty feeble. Better: use CMP against zero
2341 // and avoid the extra instruction and extra register.
2342 addInstr(env, ARM64Instr_VImmQ(zero, 0x0000));
2343 addInstr(env, ARM64Instr_VBinV(cmp, res, arg, zero));
2344 addInstr(env, ARM64Instr_VUnaryV(ARM64vecu_NOT, res, res));
2345 return res;
2347 case Iop_V256toV128_0:
2348 case Iop_V256toV128_1: {
2349 HReg vHi, vLo;
2350 iselV256Expr(&vHi, &vLo, env, e->Iex.Unop.arg);
2351 return (e->Iex.Unop.op == Iop_V256toV128_1) ? vHi : vLo;
2353 case Iop_64UtoV128: {
2354 HReg res = newVRegV(env);
2355 HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
2356 addInstr(env, ARM64Instr_VQfromX(res, arg));
2357 return res;
2359 case Iop_Widen8Sto16x8: {
2360 HReg res = newVRegV(env);
2361 HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
2362 addInstr(env, ARM64Instr_VQfromX(res, arg));
2363 addInstr(env, ARM64Instr_VBinV(ARM64vecb_ZIP18x16, res, res, res));
2364 addInstr(env, ARM64Instr_VShiftImmV(ARM64vecshi_SSHR16x8,
2365 res, res, 8));
2366 return res;
2368 case Iop_Widen16Sto32x4: {
2369 HReg res = newVRegV(env);
2370 HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
2371 addInstr(env, ARM64Instr_VQfromX(res, arg));
2372 addInstr(env, ARM64Instr_VBinV(ARM64vecb_ZIP116x8, res, res, res));
2373 addInstr(env, ARM64Instr_VShiftImmV(ARM64vecshi_SSHR32x4,
2374 res, res, 16));
2375 return res;
2377 case Iop_Widen32Sto64x2: {
2378 HReg res = newVRegV(env);
2379 HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
2380 addInstr(env, ARM64Instr_VQfromX(res, arg));
2381 addInstr(env, ARM64Instr_VBinV(ARM64vecb_ZIP132x4, res, res, res));
2382 addInstr(env, ARM64Instr_VShiftImmV(ARM64vecshi_SSHR64x2,
2383 res, res, 32));
2384 return res;
2386 /* ... */
2387 default:
2388 break;
2389 } /* switch on the unop */
2390 } /* if (e->tag == Iex_Unop) */
2392 if (e->tag == Iex_Binop) {
2393 switch (e->Iex.Binop.op) {
2394 case Iop_Sqrt32Fx4:
2395 case Iop_Sqrt64Fx2: {
2396 HReg arg = iselV128Expr(env, e->Iex.Binop.arg2);
2397 HReg res = newVRegV(env);
2398 set_FPCR_rounding_mode(env, e->Iex.Binop.arg1);
2399 ARM64VecUnaryOp op
2400 = e->Iex.Binop.op == Iop_Sqrt32Fx4
2401 ? ARM64vecu_FSQRT32x4 : ARM64vecu_FSQRT64x2;
2402 addInstr(env, ARM64Instr_VUnaryV(op, res, arg));
2403 return res;
2405 case Iop_64HLtoV128: {
2406 HReg res = newVRegV(env);
2407 HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
2408 HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
2409 addInstr(env, ARM64Instr_VQfromXX(res, argL, argR));
2410 return res;
2412 /* -- Cases where we can generate a simple three-reg instruction. -- */
2413 case Iop_AndV128:
2414 case Iop_OrV128:
2415 case Iop_XorV128:
2416 case Iop_Max32Ux4: case Iop_Max16Ux8: case Iop_Max8Ux16:
2417 case Iop_Min32Ux4: case Iop_Min16Ux8: case Iop_Min8Ux16:
2418 case Iop_Max32Sx4: case Iop_Max16Sx8: case Iop_Max8Sx16:
2419 case Iop_Min32Sx4: case Iop_Min16Sx8: case Iop_Min8Sx16:
2420 case Iop_Add64x2: case Iop_Add32x4:
2421 case Iop_Add16x8: case Iop_Add8x16:
2422 case Iop_Sub64x2: case Iop_Sub32x4:
2423 case Iop_Sub16x8: case Iop_Sub8x16:
2424 case Iop_Mul32x4: case Iop_Mul16x8: case Iop_Mul8x16:
2425 case Iop_CmpEQ64x2: case Iop_CmpEQ32x4:
2426 case Iop_CmpEQ16x8: case Iop_CmpEQ8x16:
2427 case Iop_CmpGT64Ux2: case Iop_CmpGT32Ux4:
2428 case Iop_CmpGT16Ux8: case Iop_CmpGT8Ux16:
2429 case Iop_CmpGT64Sx2: case Iop_CmpGT32Sx4:
2430 case Iop_CmpGT16Sx8: case Iop_CmpGT8Sx16:
2431 case Iop_CmpEQ64Fx2: case Iop_CmpEQ32Fx4:
2432 case Iop_CmpLE64Fx2: case Iop_CmpLE32Fx4:
2433 case Iop_CmpLT64Fx2: case Iop_CmpLT32Fx4:
2434 case Iop_Perm8x16:
2435 case Iop_InterleaveLO64x2: case Iop_CatEvenLanes32x4:
2436 case Iop_CatEvenLanes16x8: case Iop_CatEvenLanes8x16:
2437 case Iop_InterleaveHI64x2: case Iop_CatOddLanes32x4:
2438 case Iop_CatOddLanes16x8: case Iop_CatOddLanes8x16:
2439 case Iop_InterleaveHI32x4:
2440 case Iop_InterleaveHI16x8: case Iop_InterleaveHI8x16:
2441 case Iop_InterleaveLO32x4:
2442 case Iop_InterleaveLO16x8: case Iop_InterleaveLO8x16:
2443 case Iop_PolynomialMul8x16:
2444 case Iop_QAdd64Sx2: case Iop_QAdd32Sx4:
2445 case Iop_QAdd16Sx8: case Iop_QAdd8Sx16:
2446 case Iop_QAdd64Ux2: case Iop_QAdd32Ux4:
2447 case Iop_QAdd16Ux8: case Iop_QAdd8Ux16:
2448 case Iop_QSub64Sx2: case Iop_QSub32Sx4:
2449 case Iop_QSub16Sx8: case Iop_QSub8Sx16:
2450 case Iop_QSub64Ux2: case Iop_QSub32Ux4:
2451 case Iop_QSub16Ux8: case Iop_QSub8Ux16:
2452 case Iop_QDMulHi32Sx4: case Iop_QDMulHi16Sx8:
2453 case Iop_QRDMulHi32Sx4: case Iop_QRDMulHi16Sx8:
2454 case Iop_Sh8Sx16: case Iop_Sh16Sx8:
2455 case Iop_Sh32Sx4: case Iop_Sh64Sx2:
2456 case Iop_Sh8Ux16: case Iop_Sh16Ux8:
2457 case Iop_Sh32Ux4: case Iop_Sh64Ux2:
2458 case Iop_Rsh8Sx16: case Iop_Rsh16Sx8:
2459 case Iop_Rsh32Sx4: case Iop_Rsh64Sx2:
2460 case Iop_Rsh8Ux16: case Iop_Rsh16Ux8:
2461 case Iop_Rsh32Ux4: case Iop_Rsh64Ux2:
2462 case Iop_Max64Fx2: case Iop_Max32Fx4:
2463 case Iop_Min64Fx2: case Iop_Min32Fx4:
2464 case Iop_RecipStep64Fx2: case Iop_RecipStep32Fx4:
2465 case Iop_RSqrtStep64Fx2: case Iop_RSqrtStep32Fx4:
2467 HReg res = newVRegV(env);
2468 HReg argL = iselV128Expr(env, e->Iex.Binop.arg1);
2469 HReg argR = iselV128Expr(env, e->Iex.Binop.arg2);
2470 Bool sw = False;
2471 Bool setRM = False;
2472 ARM64VecBinOp op = ARM64vecb_INVALID;
2473 switch (e->Iex.Binop.op) {
2474 case Iop_AndV128: op = ARM64vecb_AND; break;
2475 case Iop_OrV128: op = ARM64vecb_ORR; break;
2476 case Iop_XorV128: op = ARM64vecb_XOR; break;
2477 case Iop_Max32Ux4: op = ARM64vecb_UMAX32x4; break;
2478 case Iop_Max16Ux8: op = ARM64vecb_UMAX16x8; break;
2479 case Iop_Max8Ux16: op = ARM64vecb_UMAX8x16; break;
2480 case Iop_Min32Ux4: op = ARM64vecb_UMIN32x4; break;
2481 case Iop_Min16Ux8: op = ARM64vecb_UMIN16x8; break;
2482 case Iop_Min8Ux16: op = ARM64vecb_UMIN8x16; break;
2483 case Iop_Max32Sx4: op = ARM64vecb_SMAX32x4; break;
2484 case Iop_Max16Sx8: op = ARM64vecb_SMAX16x8; break;
2485 case Iop_Max8Sx16: op = ARM64vecb_SMAX8x16; break;
2486 case Iop_Min32Sx4: op = ARM64vecb_SMIN32x4; break;
2487 case Iop_Min16Sx8: op = ARM64vecb_SMIN16x8; break;
2488 case Iop_Min8Sx16: op = ARM64vecb_SMIN8x16; break;
2489 case Iop_Add64x2: op = ARM64vecb_ADD64x2; break;
2490 case Iop_Add32x4: op = ARM64vecb_ADD32x4; break;
2491 case Iop_Add16x8: op = ARM64vecb_ADD16x8; break;
2492 case Iop_Add8x16: op = ARM64vecb_ADD8x16; break;
2493 case Iop_Sub64x2: op = ARM64vecb_SUB64x2; break;
2494 case Iop_Sub32x4: op = ARM64vecb_SUB32x4; break;
2495 case Iop_Sub16x8: op = ARM64vecb_SUB16x8; break;
2496 case Iop_Sub8x16: op = ARM64vecb_SUB8x16; break;
2497 case Iop_Mul32x4: op = ARM64vecb_MUL32x4; break;
2498 case Iop_Mul16x8: op = ARM64vecb_MUL16x8; break;
2499 case Iop_Mul8x16: op = ARM64vecb_MUL8x16; break;
2500 case Iop_CmpEQ64x2: op = ARM64vecb_CMEQ64x2; break;
2501 case Iop_CmpEQ32x4: op = ARM64vecb_CMEQ32x4; break;
2502 case Iop_CmpEQ16x8: op = ARM64vecb_CMEQ16x8; break;
2503 case Iop_CmpEQ8x16: op = ARM64vecb_CMEQ8x16; break;
2504 case Iop_CmpGT64Ux2: op = ARM64vecb_CMHI64x2; break;
2505 case Iop_CmpGT32Ux4: op = ARM64vecb_CMHI32x4; break;
2506 case Iop_CmpGT16Ux8: op = ARM64vecb_CMHI16x8; break;
2507 case Iop_CmpGT8Ux16: op = ARM64vecb_CMHI8x16; break;
2508 case Iop_CmpGT64Sx2: op = ARM64vecb_CMGT64x2; break;
2509 case Iop_CmpGT32Sx4: op = ARM64vecb_CMGT32x4; break;
2510 case Iop_CmpGT16Sx8: op = ARM64vecb_CMGT16x8; break;
2511 case Iop_CmpGT8Sx16: op = ARM64vecb_CMGT8x16; break;
2512 case Iop_CmpEQ64Fx2: op = ARM64vecb_FCMEQ64x2; break;
2513 case Iop_CmpEQ32Fx4: op = ARM64vecb_FCMEQ32x4; break;
2514 case Iop_CmpLE64Fx2: op = ARM64vecb_FCMGE64x2; sw = True; break;
2515 case Iop_CmpLE32Fx4: op = ARM64vecb_FCMGE32x4; sw = True; break;
2516 case Iop_CmpLT64Fx2: op = ARM64vecb_FCMGT64x2; sw = True; break;
2517 case Iop_CmpLT32Fx4: op = ARM64vecb_FCMGT32x4; sw = True; break;
2518 case Iop_Perm8x16: op = ARM64vecb_TBL1; break;
2519 case Iop_InterleaveLO64x2: op = ARM64vecb_UZP164x2; sw = True;
2520 break;
2521 case Iop_CatEvenLanes32x4: op = ARM64vecb_UZP132x4; sw = True;
2522 break;
2523 case Iop_CatEvenLanes16x8: op = ARM64vecb_UZP116x8; sw = True;
2524 break;
2525 case Iop_CatEvenLanes8x16: op = ARM64vecb_UZP18x16; sw = True;
2526 break;
2527 case Iop_InterleaveHI64x2: op = ARM64vecb_UZP264x2; sw = True;
2528 break;
2529 case Iop_CatOddLanes32x4: op = ARM64vecb_UZP232x4; sw = True;
2530 break;
2531 case Iop_CatOddLanes16x8: op = ARM64vecb_UZP216x8; sw = True;
2532 break;
2533 case Iop_CatOddLanes8x16: op = ARM64vecb_UZP28x16; sw = True;
2534 break;
2535 case Iop_InterleaveHI32x4: op = ARM64vecb_ZIP232x4; sw = True;
2536 break;
2537 case Iop_InterleaveHI16x8: op = ARM64vecb_ZIP216x8; sw = True;
2538 break;
2539 case Iop_InterleaveHI8x16: op = ARM64vecb_ZIP28x16; sw = True;
2540 break;
2541 case Iop_InterleaveLO32x4: op = ARM64vecb_ZIP132x4; sw = True;
2542 break;
2543 case Iop_InterleaveLO16x8: op = ARM64vecb_ZIP116x8; sw = True;
2544 break;
2545 case Iop_InterleaveLO8x16: op = ARM64vecb_ZIP18x16; sw = True;
2546 break;
2547 case Iop_PolynomialMul8x16: op = ARM64vecb_PMUL8x16; break;
2548 case Iop_QAdd64Sx2: op = ARM64vecb_SQADD64x2; break;
2549 case Iop_QAdd32Sx4: op = ARM64vecb_SQADD32x4; break;
2550 case Iop_QAdd16Sx8: op = ARM64vecb_SQADD16x8; break;
2551 case Iop_QAdd8Sx16: op = ARM64vecb_SQADD8x16; break;
2552 case Iop_QAdd64Ux2: op = ARM64vecb_UQADD64x2; break;
2553 case Iop_QAdd32Ux4: op = ARM64vecb_UQADD32x4; break;
2554 case Iop_QAdd16Ux8: op = ARM64vecb_UQADD16x8; break;
2555 case Iop_QAdd8Ux16: op = ARM64vecb_UQADD8x16; break;
2556 case Iop_QSub64Sx2: op = ARM64vecb_SQSUB64x2; break;
2557 case Iop_QSub32Sx4: op = ARM64vecb_SQSUB32x4; break;
2558 case Iop_QSub16Sx8: op = ARM64vecb_SQSUB16x8; break;
2559 case Iop_QSub8Sx16: op = ARM64vecb_SQSUB8x16; break;
2560 case Iop_QSub64Ux2: op = ARM64vecb_UQSUB64x2; break;
2561 case Iop_QSub32Ux4: op = ARM64vecb_UQSUB32x4; break;
2562 case Iop_QSub16Ux8: op = ARM64vecb_UQSUB16x8; break;
2563 case Iop_QSub8Ux16: op = ARM64vecb_UQSUB8x16; break;
2564 case Iop_QDMulHi32Sx4: op = ARM64vecb_SQDMULH32x4; break;
2565 case Iop_QDMulHi16Sx8: op = ARM64vecb_SQDMULH16x8; break;
2566 case Iop_QRDMulHi32Sx4: op = ARM64vecb_SQRDMULH32x4; break;
2567 case Iop_QRDMulHi16Sx8: op = ARM64vecb_SQRDMULH16x8; break;
2568 case Iop_Sh8Sx16: op = ARM64vecb_SSHL8x16; break;
2569 case Iop_Sh16Sx8: op = ARM64vecb_SSHL16x8; break;
2570 case Iop_Sh32Sx4: op = ARM64vecb_SSHL32x4; break;
2571 case Iop_Sh64Sx2: op = ARM64vecb_SSHL64x2; break;
2572 case Iop_Sh8Ux16: op = ARM64vecb_USHL8x16; break;
2573 case Iop_Sh16Ux8: op = ARM64vecb_USHL16x8; break;
2574 case Iop_Sh32Ux4: op = ARM64vecb_USHL32x4; break;
2575 case Iop_Sh64Ux2: op = ARM64vecb_USHL64x2; break;
2576 case Iop_Rsh8Sx16: op = ARM64vecb_SRSHL8x16; break;
2577 case Iop_Rsh16Sx8: op = ARM64vecb_SRSHL16x8; break;
2578 case Iop_Rsh32Sx4: op = ARM64vecb_SRSHL32x4; break;
2579 case Iop_Rsh64Sx2: op = ARM64vecb_SRSHL64x2; break;
2580 case Iop_Rsh8Ux16: op = ARM64vecb_URSHL8x16; break;
2581 case Iop_Rsh16Ux8: op = ARM64vecb_URSHL16x8; break;
2582 case Iop_Rsh32Ux4: op = ARM64vecb_URSHL32x4; break;
2583 case Iop_Rsh64Ux2: op = ARM64vecb_URSHL64x2; break;
2584 case Iop_Max64Fx2: op = ARM64vecb_FMAX64x2; break;
2585 case Iop_Max32Fx4: op = ARM64vecb_FMAX32x4; break;
2586 case Iop_Min64Fx2: op = ARM64vecb_FMIN64x2; break;
2587 case Iop_Min32Fx4: op = ARM64vecb_FMIN32x4; break;
2588 case Iop_RecipStep64Fx2: setRM = True;
2589 op = ARM64vecb_FRECPS64x2; break;
2590 case Iop_RecipStep32Fx4: setRM = True;
2591 op = ARM64vecb_FRECPS32x4; break;
2592 case Iop_RSqrtStep64Fx2: setRM = True;
2593 op = ARM64vecb_FRSQRTS64x2; break;
2594 case Iop_RSqrtStep32Fx4: setRM = True;
2595 op = ARM64vecb_FRSQRTS32x4; break;
2596 default: vassert(0);
2598 if (setRM) {
2599 // This is a bit of a kludge. We should do rm properly for
2600 // these recip-step insns, but that would require changing the
2601 // primop's type to take an rmode.
2602 set_FPCR_rounding_mode(env, IRExpr_Const(
2603 IRConst_U32(Irrm_NEAREST)));
2605 if (sw) {
2606 addInstr(env, ARM64Instr_VBinV(op, res, argR, argL));
2607 } else {
2608 addInstr(env, ARM64Instr_VBinV(op, res, argL, argR));
2610 return res;
2612 /* -- These only have 2 operand instructions, so we have to first move
2613 the first argument into a new register, for modification. -- */
2614 case Iop_QAddExtUSsatSS8x16: case Iop_QAddExtUSsatSS16x8:
2615 case Iop_QAddExtUSsatSS32x4: case Iop_QAddExtUSsatSS64x2:
2616 case Iop_QAddExtSUsatUU8x16: case Iop_QAddExtSUsatUU16x8:
2617 case Iop_QAddExtSUsatUU32x4: case Iop_QAddExtSUsatUU64x2:
2619 HReg res = newVRegV(env);
2620 HReg argL = iselV128Expr(env, e->Iex.Binop.arg1);
2621 HReg argR = iselV128Expr(env, e->Iex.Binop.arg2);
2622 ARM64VecModifyOp op = ARM64vecmo_INVALID;
2623 switch (e->Iex.Binop.op) {
2624 /* In the following 8 cases, the US - SU switching is intended.
2625 See comments on the libvex_ir.h for details. Also in the
2626 ARM64 front end, where used these primops are generated. */
2627 case Iop_QAddExtUSsatSS8x16: op = ARM64vecmo_SUQADD8x16; break;
2628 case Iop_QAddExtUSsatSS16x8: op = ARM64vecmo_SUQADD16x8; break;
2629 case Iop_QAddExtUSsatSS32x4: op = ARM64vecmo_SUQADD32x4; break;
2630 case Iop_QAddExtUSsatSS64x2: op = ARM64vecmo_SUQADD64x2; break;
2631 case Iop_QAddExtSUsatUU8x16: op = ARM64vecmo_USQADD8x16; break;
2632 case Iop_QAddExtSUsatUU16x8: op = ARM64vecmo_USQADD16x8; break;
2633 case Iop_QAddExtSUsatUU32x4: op = ARM64vecmo_USQADD32x4; break;
2634 case Iop_QAddExtSUsatUU64x2: op = ARM64vecmo_USQADD64x2; break;
2635 default: vassert(0);
2637 /* The order of the operands is important. Although this is
2638 basically addition, the two operands are extended differently,
2639 making it important to get them into the correct registers in
2640 the instruction. */
2641 addInstr(env, ARM64Instr_VMov(16, res, argR));
2642 addInstr(env, ARM64Instr_VModifyV(op, res, argL));
2643 return res;
2645 /* -- Shifts by an immediate. -- */
2646 case Iop_ShrN64x2: case Iop_ShrN32x4:
2647 case Iop_ShrN16x8: case Iop_ShrN8x16:
2648 case Iop_SarN64x2: case Iop_SarN32x4:
2649 case Iop_SarN16x8: case Iop_SarN8x16:
2650 case Iop_ShlN64x2: case Iop_ShlN32x4:
2651 case Iop_ShlN16x8: case Iop_ShlN8x16:
2652 case Iop_QShlNsatUU64x2: case Iop_QShlNsatUU32x4:
2653 case Iop_QShlNsatUU16x8: case Iop_QShlNsatUU8x16:
2654 case Iop_QShlNsatSS64x2: case Iop_QShlNsatSS32x4:
2655 case Iop_QShlNsatSS16x8: case Iop_QShlNsatSS8x16:
2656 case Iop_QShlNsatSU64x2: case Iop_QShlNsatSU32x4:
2657 case Iop_QShlNsatSU16x8: case Iop_QShlNsatSU8x16:
2659 IRExpr* argL = e->Iex.Binop.arg1;
2660 IRExpr* argR = e->Iex.Binop.arg2;
2661 if (argR->tag == Iex_Const && argR->Iex.Const.con->tag == Ico_U8) {
2662 UInt amt = argR->Iex.Const.con->Ico.U8;
2663 UInt limLo = 0;
2664 UInt limHi = 0;
2665 ARM64VecShiftImmOp op = ARM64vecshi_INVALID;
2666 /* Establish the instruction to use. */
2667 switch (e->Iex.Binop.op) {
2668 case Iop_ShrN64x2: op = ARM64vecshi_USHR64x2; break;
2669 case Iop_ShrN32x4: op = ARM64vecshi_USHR32x4; break;
2670 case Iop_ShrN16x8: op = ARM64vecshi_USHR16x8; break;
2671 case Iop_ShrN8x16: op = ARM64vecshi_USHR8x16; break;
2672 case Iop_SarN64x2: op = ARM64vecshi_SSHR64x2; break;
2673 case Iop_SarN32x4: op = ARM64vecshi_SSHR32x4; break;
2674 case Iop_SarN16x8: op = ARM64vecshi_SSHR16x8; break;
2675 case Iop_SarN8x16: op = ARM64vecshi_SSHR8x16; break;
2676 case Iop_ShlN64x2: op = ARM64vecshi_SHL64x2; break;
2677 case Iop_ShlN32x4: op = ARM64vecshi_SHL32x4; break;
2678 case Iop_ShlN16x8: op = ARM64vecshi_SHL16x8; break;
2679 case Iop_ShlN8x16: op = ARM64vecshi_SHL8x16; break;
2680 case Iop_QShlNsatUU64x2: op = ARM64vecshi_UQSHL64x2; break;
2681 case Iop_QShlNsatUU32x4: op = ARM64vecshi_UQSHL32x4; break;
2682 case Iop_QShlNsatUU16x8: op = ARM64vecshi_UQSHL16x8; break;
2683 case Iop_QShlNsatUU8x16: op = ARM64vecshi_UQSHL8x16; break;
2684 case Iop_QShlNsatSS64x2: op = ARM64vecshi_SQSHL64x2; break;
2685 case Iop_QShlNsatSS32x4: op = ARM64vecshi_SQSHL32x4; break;
2686 case Iop_QShlNsatSS16x8: op = ARM64vecshi_SQSHL16x8; break;
2687 case Iop_QShlNsatSS8x16: op = ARM64vecshi_SQSHL8x16; break;
2688 case Iop_QShlNsatSU64x2: op = ARM64vecshi_SQSHLU64x2; break;
2689 case Iop_QShlNsatSU32x4: op = ARM64vecshi_SQSHLU32x4; break;
2690 case Iop_QShlNsatSU16x8: op = ARM64vecshi_SQSHLU16x8; break;
2691 case Iop_QShlNsatSU8x16: op = ARM64vecshi_SQSHLU8x16; break;
2692 default: vassert(0);
2694 /* Establish the shift limits, for sanity check purposes only. */
2695 switch (e->Iex.Binop.op) {
2696 case Iop_ShrN64x2: limLo = 1; limHi = 64; break;
2697 case Iop_ShrN32x4: limLo = 1; limHi = 32; break;
2698 case Iop_ShrN16x8: limLo = 1; limHi = 16; break;
2699 case Iop_ShrN8x16: limLo = 1; limHi = 8; break;
2700 case Iop_SarN64x2: limLo = 1; limHi = 64; break;
2701 case Iop_SarN32x4: limLo = 1; limHi = 32; break;
2702 case Iop_SarN16x8: limLo = 1; limHi = 16; break;
2703 case Iop_SarN8x16: limLo = 1; limHi = 8; break;
2704 case Iop_ShlN64x2: limLo = 0; limHi = 63; break;
2705 case Iop_ShlN32x4: limLo = 0; limHi = 31; break;
2706 case Iop_ShlN16x8: limLo = 0; limHi = 15; break;
2707 case Iop_ShlN8x16: limLo = 0; limHi = 7; break;
2708 case Iop_QShlNsatUU64x2: limLo = 0; limHi = 63; break;
2709 case Iop_QShlNsatUU32x4: limLo = 0; limHi = 31; break;
2710 case Iop_QShlNsatUU16x8: limLo = 0; limHi = 15; break;
2711 case Iop_QShlNsatUU8x16: limLo = 0; limHi = 7; break;
2712 case Iop_QShlNsatSS64x2: limLo = 0; limHi = 63; break;
2713 case Iop_QShlNsatSS32x4: limLo = 0; limHi = 31; break;
2714 case Iop_QShlNsatSS16x8: limLo = 0; limHi = 15; break;
2715 case Iop_QShlNsatSS8x16: limLo = 0; limHi = 7; break;
2716 case Iop_QShlNsatSU64x2: limLo = 0; limHi = 63; break;
2717 case Iop_QShlNsatSU32x4: limLo = 0; limHi = 31; break;
2718 case Iop_QShlNsatSU16x8: limLo = 0; limHi = 15; break;
2719 case Iop_QShlNsatSU8x16: limLo = 0; limHi = 7; break;
2720 default: vassert(0);
2722 /* For left shifts, the allowable amt values are
2723 0 .. lane_bits-1. For right shifts the allowable
2724 values are 1 .. lane_bits. */
2725 if (op != ARM64vecshi_INVALID && amt >= limLo && amt <= limHi) {
2726 HReg src = iselV128Expr(env, argL);
2727 HReg dst = newVRegV(env);
2728 addInstr(env, ARM64Instr_VShiftImmV(op, dst, src, amt));
2729 return dst;
2731 /* Special case some no-op shifts that the arm64 front end
2732 throws at us. We can't generate any instructions for these,
2733 but we don't need to either. */
2734 switch (e->Iex.Binop.op) {
2735 case Iop_ShrN64x2: case Iop_ShrN32x4:
2736 case Iop_ShrN16x8: case Iop_ShrN8x16:
2737 if (amt == 0) {
2738 return iselV128Expr(env, argL);
2740 break;
2741 default:
2742 break;
2744 /* otherwise unhandled */
2746 /* else fall out; this is unhandled */
2747 break;
2749 /* -- Saturating narrowing by an immediate -- */
2750 /* uu */
2751 case Iop_QandQShrNnarrow16Uto8Ux8:
2752 case Iop_QandQShrNnarrow32Uto16Ux4:
2753 case Iop_QandQShrNnarrow64Uto32Ux2:
2754 /* ss */
2755 case Iop_QandQSarNnarrow16Sto8Sx8:
2756 case Iop_QandQSarNnarrow32Sto16Sx4:
2757 case Iop_QandQSarNnarrow64Sto32Sx2:
2758 /* su */
2759 case Iop_QandQSarNnarrow16Sto8Ux8:
2760 case Iop_QandQSarNnarrow32Sto16Ux4:
2761 case Iop_QandQSarNnarrow64Sto32Ux2:
2762 /* ruu */
2763 case Iop_QandQRShrNnarrow16Uto8Ux8:
2764 case Iop_QandQRShrNnarrow32Uto16Ux4:
2765 case Iop_QandQRShrNnarrow64Uto32Ux2:
2766 /* rss */
2767 case Iop_QandQRSarNnarrow16Sto8Sx8:
2768 case Iop_QandQRSarNnarrow32Sto16Sx4:
2769 case Iop_QandQRSarNnarrow64Sto32Sx2:
2770 /* rsu */
2771 case Iop_QandQRSarNnarrow16Sto8Ux8:
2772 case Iop_QandQRSarNnarrow32Sto16Ux4:
2773 case Iop_QandQRSarNnarrow64Sto32Ux2:
2775 IRExpr* argL = e->Iex.Binop.arg1;
2776 IRExpr* argR = e->Iex.Binop.arg2;
2777 if (argR->tag == Iex_Const && argR->Iex.Const.con->tag == Ico_U8) {
2778 UInt amt = argR->Iex.Const.con->Ico.U8;
2779 UInt limit = 0;
2780 ARM64VecShiftImmOp op = ARM64vecshi_INVALID;
2781 switch (e->Iex.Binop.op) {
2782 /* uu */
2783 case Iop_QandQShrNnarrow64Uto32Ux2:
2784 op = ARM64vecshi_UQSHRN2SD; limit = 64; break;
2785 case Iop_QandQShrNnarrow32Uto16Ux4:
2786 op = ARM64vecshi_UQSHRN4HS; limit = 32; break;
2787 case Iop_QandQShrNnarrow16Uto8Ux8:
2788 op = ARM64vecshi_UQSHRN8BH; limit = 16; break;
2789 /* ss */
2790 case Iop_QandQSarNnarrow64Sto32Sx2:
2791 op = ARM64vecshi_SQSHRN2SD; limit = 64; break;
2792 case Iop_QandQSarNnarrow32Sto16Sx4:
2793 op = ARM64vecshi_SQSHRN4HS; limit = 32; break;
2794 case Iop_QandQSarNnarrow16Sto8Sx8:
2795 op = ARM64vecshi_SQSHRN8BH; limit = 16; break;
2796 /* su */
2797 case Iop_QandQSarNnarrow64Sto32Ux2:
2798 op = ARM64vecshi_SQSHRUN2SD; limit = 64; break;
2799 case Iop_QandQSarNnarrow32Sto16Ux4:
2800 op = ARM64vecshi_SQSHRUN4HS; limit = 32; break;
2801 case Iop_QandQSarNnarrow16Sto8Ux8:
2802 op = ARM64vecshi_SQSHRUN8BH; limit = 16; break;
2803 /* ruu */
2804 case Iop_QandQRShrNnarrow64Uto32Ux2:
2805 op = ARM64vecshi_UQRSHRN2SD; limit = 64; break;
2806 case Iop_QandQRShrNnarrow32Uto16Ux4:
2807 op = ARM64vecshi_UQRSHRN4HS; limit = 32; break;
2808 case Iop_QandQRShrNnarrow16Uto8Ux8:
2809 op = ARM64vecshi_UQRSHRN8BH; limit = 16; break;
2810 /* rss */
2811 case Iop_QandQRSarNnarrow64Sto32Sx2:
2812 op = ARM64vecshi_SQRSHRN2SD; limit = 64; break;
2813 case Iop_QandQRSarNnarrow32Sto16Sx4:
2814 op = ARM64vecshi_SQRSHRN4HS; limit = 32; break;
2815 case Iop_QandQRSarNnarrow16Sto8Sx8:
2816 op = ARM64vecshi_SQRSHRN8BH; limit = 16; break;
2817 /* rsu */
2818 case Iop_QandQRSarNnarrow64Sto32Ux2:
2819 op = ARM64vecshi_SQRSHRUN2SD; limit = 64; break;
2820 case Iop_QandQRSarNnarrow32Sto16Ux4:
2821 op = ARM64vecshi_SQRSHRUN4HS; limit = 32; break;
2822 case Iop_QandQRSarNnarrow16Sto8Ux8:
2823 op = ARM64vecshi_SQRSHRUN8BH; limit = 16; break;
2824 /**/
2825 default:
2826 vassert(0);
2828 if (op != ARM64vecshi_INVALID && amt >= 1 && amt <= limit) {
2829 HReg src = iselV128Expr(env, argL);
2830 HReg dst = newVRegV(env);
2831 HReg fpsr = newVRegI(env);
2832 /* Clear FPSR.Q, do the operation, and return both its
2833 result and the new value of FPSR.Q. We can simply
2834 zero out FPSR since all the other bits have no relevance
2835 in VEX generated code. */
2836 addInstr(env, ARM64Instr_Imm64(fpsr, 0));
2837 addInstr(env, ARM64Instr_FPSR(True/*toFPSR*/, fpsr));
2838 addInstr(env, ARM64Instr_VShiftImmV(op, dst, src, amt));
2839 addInstr(env, ARM64Instr_FPSR(False/*!toFPSR*/, fpsr));
2840 addInstr(env, ARM64Instr_Shift(fpsr, fpsr, ARM64RI6_I6(27),
2841 ARM64sh_SHR));
2842 ARM64RIL* ril_one = mb_mkARM64RIL_I(1);
2843 vassert(ril_one);
2844 addInstr(env, ARM64Instr_Logic(fpsr,
2845 fpsr, ril_one, ARM64lo_AND));
2846 /* Now we have: the main (shift) result in the bottom half
2847 of |dst|, and the Q bit at the bottom of |fpsr|.
2848 Combining them with a "InterleaveLO64x2" style operation
2849 produces a 128 bit value, dst[63:0]:fpsr[63:0],
2850 which is what we want. */
2851 HReg scratch = newVRegV(env);
2852 addInstr(env, ARM64Instr_VQfromX(scratch, fpsr));
2853 addInstr(env, ARM64Instr_VBinV(ARM64vecb_UZP164x2,
2854 dst, dst, scratch));
2855 return dst;
2858 /* else fall out; this is unhandled */
2859 break;
2862 // Use Iop_SliceV128 in preference to Iop_ShlV128 and Iop_ShrV128,
2863 // as it is in some ways more general and often leads to better
2864 // code overall.
2865 case Iop_ShlV128:
2866 case Iop_ShrV128: {
2867 Bool isSHR = e->Iex.Binop.op == Iop_ShrV128;
2868 /* This is tricky. Generate an EXT instruction with zeroes in
2869 the high operand (shift right) or low operand (shift left).
2870 Note that we can only slice in the EXT instruction at a byte
2871 level of granularity, so the shift amount needs careful
2872 checking. */
2873 IRExpr* argL = e->Iex.Binop.arg1;
2874 IRExpr* argR = e->Iex.Binop.arg2;
2875 if (argR->tag == Iex_Const && argR->Iex.Const.con->tag == Ico_U8) {
2876 UInt amt = argR->Iex.Const.con->Ico.U8;
2877 Bool amtOK = False;
2878 switch (amt) {
2879 case 0x08: case 0x10: case 0x18: case 0x20: case 0x28:
2880 case 0x30: case 0x38: case 0x40: case 0x48: case 0x50:
2881 case 0x58: case 0x60: case 0x68: case 0x70: case 0x78:
2882 amtOK = True; break;
2884 /* We could also deal with amt==0 by copying the source to
2885 the destination, but there's no need for that so far. */
2886 if (amtOK) {
2887 HReg src = iselV128Expr(env, argL);
2888 HReg srcZ = newVRegV(env);
2889 addInstr(env, ARM64Instr_VImmQ(srcZ, 0x0000));
2890 UInt immB = amt / 8;
2891 vassert(immB >= 1 && immB <= 15);
2892 HReg dst = newVRegV(env);
2893 if (isSHR) {
2894 addInstr(env, ARM64Instr_VExtV(dst, src/*lo*/, srcZ/*hi*/,
2895 immB));
2896 } else {
2897 addInstr(env, ARM64Instr_VExtV(dst, srcZ/*lo*/, src/*hi*/,
2898 16 - immB));
2900 return dst;
2903 /* else fall out; this is unhandled */
2904 break;
2907 case Iop_PolynomialMull8x8:
2908 case Iop_Mull32Ux2:
2909 case Iop_Mull16Ux4:
2910 case Iop_Mull8Ux8:
2911 case Iop_Mull32Sx2:
2912 case Iop_Mull16Sx4:
2913 case Iop_Mull8Sx8:
2914 case Iop_QDMull32Sx2:
2915 case Iop_QDMull16Sx4:
2917 HReg iSrcL = iselIntExpr_R(env, e->Iex.Binop.arg1);
2918 HReg iSrcR = iselIntExpr_R(env, e->Iex.Binop.arg2);
2919 HReg vSrcL = newVRegV(env);
2920 HReg vSrcR = newVRegV(env);
2921 HReg dst = newVRegV(env);
2922 ARM64VecBinOp op = ARM64vecb_INVALID;
2923 switch (e->Iex.Binop.op) {
2924 case Iop_PolynomialMull8x8: op = ARM64vecb_PMULL8x8; break;
2925 case Iop_Mull32Ux2: op = ARM64vecb_UMULL2DSS; break;
2926 case Iop_Mull16Ux4: op = ARM64vecb_UMULL4SHH; break;
2927 case Iop_Mull8Ux8: op = ARM64vecb_UMULL8HBB; break;
2928 case Iop_Mull32Sx2: op = ARM64vecb_SMULL2DSS; break;
2929 case Iop_Mull16Sx4: op = ARM64vecb_SMULL4SHH; break;
2930 case Iop_Mull8Sx8: op = ARM64vecb_SMULL8HBB; break;
2931 case Iop_QDMull32Sx2: op = ARM64vecb_SQDMULL2DSS; break;
2932 case Iop_QDMull16Sx4: op = ARM64vecb_SQDMULL4SHH; break;
2933 default: vassert(0);
2935 addInstr(env, ARM64Instr_VQfromXX(vSrcL, iSrcL, iSrcL));
2936 addInstr(env, ARM64Instr_VQfromXX(vSrcR, iSrcR, iSrcR));
2937 addInstr(env, ARM64Instr_VBinV(op, dst, vSrcL, vSrcR));
2938 return dst;
2941 /* ... */
2942 default:
2943 break;
2944 } /* switch on the binop */
2945 } /* if (e->tag == Iex_Binop) */
2947 if (e->tag == Iex_Triop) {
2948 IRTriop* triop = e->Iex.Triop.details;
2949 ARM64VecBinOp vecbop = ARM64vecb_INVALID;
2950 switch (triop->op) {
2951 case Iop_Add64Fx2: vecbop = ARM64vecb_FADD64x2; break;
2952 case Iop_Sub64Fx2: vecbop = ARM64vecb_FSUB64x2; break;
2953 case Iop_Mul64Fx2: vecbop = ARM64vecb_FMUL64x2; break;
2954 case Iop_Div64Fx2: vecbop = ARM64vecb_FDIV64x2; break;
2955 case Iop_Add32Fx4: vecbop = ARM64vecb_FADD32x4; break;
2956 case Iop_Sub32Fx4: vecbop = ARM64vecb_FSUB32x4; break;
2957 case Iop_Mul32Fx4: vecbop = ARM64vecb_FMUL32x4; break;
2958 case Iop_Div32Fx4: vecbop = ARM64vecb_FDIV32x4; break;
2959 default: break;
2961 if (vecbop != ARM64vecb_INVALID) {
2962 HReg argL = iselV128Expr(env, triop->arg2);
2963 HReg argR = iselV128Expr(env, triop->arg3);
2964 HReg dst = newVRegV(env);
2965 set_FPCR_rounding_mode(env, triop->arg1);
2966 addInstr(env, ARM64Instr_VBinV(vecbop, dst, argL, argR));
2967 return dst;
2970 if (triop->op == Iop_SliceV128) {
2971 /* Note that, compared to ShlV128/ShrV128 just above, the shift
2972 amount here is in bytes, not bits. */
2973 IRExpr* argHi = triop->arg1;
2974 IRExpr* argLo = triop->arg2;
2975 IRExpr* argAmt = triop->arg3;
2976 if (argAmt->tag == Iex_Const && argAmt->Iex.Const.con->tag == Ico_U8) {
2977 UInt amt = argAmt->Iex.Const.con->Ico.U8;
2978 Bool amtOK = amt >= 1 && amt <= 15;
2979 /* We could also deal with amt==0 by copying argLO to
2980 the destination, but there's no need for that so far. */
2981 if (amtOK) {
2982 HReg srcHi = iselV128Expr(env, argHi);
2983 HReg srcLo = iselV128Expr(env, argLo);
2984 HReg dst = newVRegV(env);
2985 addInstr(env, ARM64Instr_VExtV(dst, srcLo, srcHi, amt));
2986 return dst;
2989 /* else fall out; this is unhandled */
2992 } /* if (e->tag == Iex_Triop) */
2994 v128_expr_bad:
2995 ppIRExpr(e);
2996 vpanic("iselV128Expr_wrk");
3000 /*---------------------------------------------------------*/
3001 /*--- ISEL: Floating point expressions (64 bit) ---*/
3002 /*---------------------------------------------------------*/
3004 /* Compute a 64-bit floating point value into a register, the identity
3005 of which is returned. As with iselIntExpr_R, the reg may be either
3006 real or virtual; in any case it must not be changed by subsequent
3007 code emitted by the caller. */
3009 static HReg iselDblExpr ( ISelEnv* env, IRExpr* e )
3011 HReg r = iselDblExpr_wrk( env, e );
3012 # if 0
3013 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
3014 # endif
3015 vassert(hregClass(r) == HRcFlt64);
3016 vassert(hregIsVirtual(r));
3017 return r;
/* DO NOT CALL THIS DIRECTLY.  Worker for iselDblExpr: selects ARM64
   instructions to compute a 64-bit FP (Ity_F64) expression into a
   D register and returns that register. */
static HReg iselDblExpr_wrk ( ISelEnv* env, IRExpr* e )
{
   IRType ty = typeOfIRExpr(env->type_env,e);
   vassert(e);
   vassert(ty == Ity_F64);

   /* Temporary: hand back the register already bound to it. */
   if (e->tag == Iex_RdTmp) {
      return lookupIRTemp(env, e->Iex.RdTmp.tmp);
   }

   /* F64 constants: materialise the 64-bit pattern in an integer
      register, then transfer it to a D register. */
   if (e->tag == Iex_Const) {
      IRConst* con = e->Iex.Const.con;
      if (con->tag == Ico_F64i) {
         HReg src = newVRegI(env);
         HReg dst = newVRegD(env);
         addInstr(env, ARM64Instr_Imm64(src, con->Ico.F64i));
         addInstr(env, ARM64Instr_VDfromX(dst, src));
         return dst;
      }
      if (con->tag == Ico_F64) {
         HReg src = newVRegI(env);
         HReg dst = newVRegD(env);
         /* Reinterpret the double's bits as a ULong via a union, so
            as to avoid any numeric (value-changing) conversion. */
         union { Double d64; ULong u64; } u;
         vassert(sizeof(u) == 8);
         u.d64 = con->Ico.F64;
         addInstr(env, ARM64Instr_Imm64(src, u.u64));
         addInstr(env, ARM64Instr_VDfromX(dst, src));
         return dst;
      }
   }

   /* 64-bit FP load; only little-endian is handled. */
   if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
      vassert(e->Iex.Load.ty == Ity_F64);
      HReg addr = iselIntExpr_R(env, e->Iex.Load.addr);
      HReg res = newVRegD(env);
      addInstr(env, ARM64Instr_VLdStD(True/*isLoad*/, res, addr, 0));
      return res;
   }

   /* Guest-state read, provided the offset is expressible in
      VLdStD's immediate form (8-aligned, < 32768). */
   if (e->tag == Iex_Get) {
      Int offs = e->Iex.Get.offset;
      if (offs >= 0 && offs < 32768 && 0 == (offs & 7)) {
         HReg rD = newVRegD(env);
         HReg rN = get_baseblock_register();
         addInstr(env, ARM64Instr_VLdStD(True/*isLoad*/, rD, rN, offs));
         return rD;
      }
   }

   if (e->tag == Iex_Unop) {
      switch (e->Iex.Unop.op) {
         case Iop_NegF64: {
            HReg src = iselDblExpr(env, e->Iex.Unop.arg);
            HReg dst = newVRegD(env);
            addInstr(env, ARM64Instr_VUnaryD(ARM64fpu_NEG, dst, src));
            return dst;
         }
         case Iop_AbsF64: {
            HReg src = iselDblExpr(env, e->Iex.Unop.arg);
            HReg dst = newVRegD(env);
            addInstr(env, ARM64Instr_VUnaryD(ARM64fpu_ABS, dst, src));
            return dst;
         }
         case Iop_F32toF64: {
            /* Widening conversion; exact, hence no rounding mode. */
            HReg src = iselFltExpr(env, e->Iex.Unop.arg);
            HReg dst = newVRegD(env);
            addInstr(env, ARM64Instr_VCvtSD(True/*sToD*/, dst, src));
            return dst;
         }
         case Iop_F16toF64: {
            /* Widening conversion; exact, hence no rounding mode. */
            HReg src = iselF16Expr(env, e->Iex.Unop.arg);
            HReg dst = newVRegD(env);
            addInstr(env, ARM64Instr_VCvtHD(True/*hToD*/, dst, src));
            return dst;
         }
         case Iop_I32UtoF64:
         case Iop_I32StoF64: {
            /* Rounding mode is not involved here, since the
               conversion can always be done without loss of
               precision. */
            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
            HReg dst = newVRegD(env);
            Bool syned = e->Iex.Unop.op == Iop_I32StoF64;
            ARM64CvtOp cvt_op = syned ? ARM64cvt_F64_I32S : ARM64cvt_F64_I32U;
            addInstr(env, ARM64Instr_VCvtI2F(cvt_op, dst, src));
            return dst;
         }
         default:
            break;
      }
   }

   if (e->tag == Iex_Binop) {
      switch (e->Iex.Binop.op) {
         /* Unary FP operations that carry an explicit IR rounding
            mode in arg1; arg2 is the operand. */
         case Iop_RoundF64toInt:
         case Iop_SqrtF64:
         case Iop_RecpExpF64: {
            HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
            HReg dst = newVRegD(env);
            set_FPCR_rounding_mode(env, e->Iex.Binop.arg1);
            ARM64FpUnaryOp op = ARM64fpu_INVALID;
            switch (e->Iex.Binop.op) {
               case Iop_RoundF64toInt: op = ARM64fpu_RINT; break;
               case Iop_SqrtF64: op = ARM64fpu_SQRT; break;
               case Iop_RecpExpF64: op = ARM64fpu_RECPX; break;
               default: vassert(0);
            }
            addInstr(env, ARM64Instr_VUnaryD(op, dst, src));
            return dst;
         }
         case Iop_I64StoF64:
         case Iop_I64UtoF64: {
            /* 64-bit int -> F64 can be inexact, so the rounding mode
               (arg1) must be installed first. */
            ARM64CvtOp cvt_op = e->Iex.Binop.op == Iop_I64StoF64
                                   ? ARM64cvt_F64_I64S : ARM64cvt_F64_I64U;
            HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
            set_FPCR_rounding_mode(env, e->Iex.Binop.arg1);
            HReg dstS = newVRegD(env);
            addInstr(env, ARM64Instr_VCvtI2F(cvt_op, dstS, srcI));
            return dstS;
         }
         default:
            break;
      }
   }

   /* Triops: the scalar FP arithmetic ops, with the rounding mode
      in arg1 and the operands in arg2/arg3. */
   if (e->tag == Iex_Triop) {
      IRTriop* triop = e->Iex.Triop.details;
      ARM64FpBinOp dblop = ARM64fpb_INVALID;
      switch (triop->op) {
         case Iop_DivF64: dblop = ARM64fpb_DIV; break;
         case Iop_MulF64: dblop = ARM64fpb_MUL; break;
         case Iop_SubF64: dblop = ARM64fpb_SUB; break;
         case Iop_AddF64: dblop = ARM64fpb_ADD; break;
         default: break;
      }
      if (dblop != ARM64fpb_INVALID) {
         HReg argL = iselDblExpr(env, triop->arg2);
         HReg argR = iselDblExpr(env, triop->arg3);
         HReg dst = newVRegD(env);
         set_FPCR_rounding_mode(env, triop->arg1);
         addInstr(env, ARM64Instr_VBinD(dblop, dst, argL, argR));
         return dst;
      }
   }

   /* ITE(ccexpr, iftrue, iffalse) -> FP conditional select. */
   if (e->tag == Iex_ITE) {
      ARM64CondCode cc;
      HReg r1 = iselDblExpr(env, e->Iex.ITE.iftrue);
      HReg r0 = iselDblExpr(env, e->Iex.ITE.iffalse);
      HReg dst = newVRegD(env);
      cc = iselCondCode(env, e->Iex.ITE.cond);
      addInstr(env, ARM64Instr_VFCSel(dst, r1, r0, cc, True/*64-bit*/));
      return dst;
   }

   ppIRExpr(e);
   vpanic("iselDblExpr_wrk");
}
3182 /*---------------------------------------------------------*/
3183 /*--- ISEL: Floating point expressions (32 bit) ---*/
3184 /*---------------------------------------------------------*/
3186 /* Compute a 32-bit floating point value into a register, the identity
3187 of which is returned. As with iselIntExpr_R, the reg may be either
3188 real or virtual; in any case it must not be changed by subsequent
3189 code emitted by the caller. Values are generated into HRcFlt64
3190 registers despite the values themselves being Ity_F32s. */
3192 static HReg iselFltExpr ( ISelEnv* env, IRExpr* e )
3194 HReg r = iselFltExpr_wrk( env, e );
3195 # if 0
3196 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
3197 # endif
3198 vassert(hregClass(r) == HRcFlt64);
3199 vassert(hregIsVirtual(r));
3200 return r;
/* DO NOT CALL THIS DIRECTLY.  Worker for iselFltExpr: selects ARM64
   instructions to compute a 32-bit FP (Ity_F32) expression; the
   result nonetheless lives in an HRcFlt64-class register. */
static HReg iselFltExpr_wrk ( ISelEnv* env, IRExpr* e )
{
   IRType ty = typeOfIRExpr(env->type_env,e);
   vassert(e);
   vassert(ty == Ity_F32);

   /* Temporary: hand back the register already bound to it. */
   if (e->tag == Iex_RdTmp) {
      return lookupIRTemp(env, e->Iex.RdTmp.tmp);
   }

   if (e->tag == Iex_Const) {
      /* This is something of a kludge.  Since a 32 bit floating point
         zero is just .. all zeroes, just create a 64 bit zero word
         and transfer it.  This avoids having to create a SfromW
         instruction for this specific case. */
      IRConst* con = e->Iex.Const.con;
      if (con->tag == Ico_F32i && con->Ico.F32i == 0) {
         HReg src = newVRegI(env);
         HReg dst = newVRegD(env);
         addInstr(env, ARM64Instr_Imm64(src, 0));
         addInstr(env, ARM64Instr_VDfromX(dst, src));
         return dst;
      }
      if (con->tag == Ico_F32) {
         HReg src = newVRegI(env);
         HReg dst = newVRegD(env);
         /* Bit-copy the F32 into the low 32 bits of an integer
            register (union avoids a value-changing conversion),
            then move it across to the FP register file. */
         union { Float f32; UInt u32; } u;
         vassert(sizeof(u) == 4);
         u.f32 = con->Ico.F32;
         addInstr(env, ARM64Instr_Imm64(src, (ULong)u.u32));
         addInstr(env, ARM64Instr_VDfromX(dst, src));
         return dst;
      }
   }

   /* 32-bit FP load; only little-endian is handled. */
   if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
      vassert(e->Iex.Load.ty == Ity_F32);
      HReg addr = iselIntExpr_R(env, e->Iex.Load.addr);
      HReg res = newVRegD(env);
      addInstr(env, ARM64Instr_VLdStS(True/*isLoad*/, res, addr, 0));
      return res;
   }

   /* Guest-state read, provided the offset fits VLdStS's immediate
      form (4-aligned, < 16384). */
   if (e->tag == Iex_Get) {
      Int offs = e->Iex.Get.offset;
      if (offs >= 0 && offs < 16384 && 0 == (offs & 3)) {
         HReg rD = newVRegD(env);
         HReg rN = get_baseblock_register();
         addInstr(env, ARM64Instr_VLdStS(True/*isLoad*/, rD, rN, offs));
         return rD;
      }
   }

   if (e->tag == Iex_Unop) {
      switch (e->Iex.Unop.op) {
         case Iop_NegF32: {
            HReg src = iselFltExpr(env, e->Iex.Unop.arg);
            HReg dst = newVRegD(env);
            addInstr(env, ARM64Instr_VUnaryS(ARM64fpu_NEG, dst, src));
            return dst;
         }
         case Iop_AbsF32: {
            HReg src = iselFltExpr(env, e->Iex.Unop.arg);
            HReg dst = newVRegD(env);
            addInstr(env, ARM64Instr_VUnaryS(ARM64fpu_ABS, dst, src));
            return dst;
         }
         case Iop_F16toF32: {
            /* Widening conversion; exact, hence no rounding mode. */
            HReg src = iselF16Expr(env, e->Iex.Unop.arg);
            HReg dst = newVRegD(env);
            addInstr(env, ARM64Instr_VCvtHS(True/*hToS*/, dst, src));
            return dst;
         }
         default:
            break;
      }
   }

   if (e->tag == Iex_Binop) {
      switch (e->Iex.Binop.op) {
         /* Unary FP operations that carry an explicit IR rounding
            mode in arg1; arg2 is the operand. */
         case Iop_RoundF32toInt:
         case Iop_SqrtF32:
         case Iop_RecpExpF32: {
            HReg src = iselFltExpr(env, e->Iex.Binop.arg2);
            HReg dst = newVRegD(env);
            set_FPCR_rounding_mode(env, e->Iex.Binop.arg1);
            ARM64FpUnaryOp op = ARM64fpu_INVALID;
            switch (e->Iex.Binop.op) {
               case Iop_RoundF32toInt: op = ARM64fpu_RINT; break;
               case Iop_SqrtF32: op = ARM64fpu_SQRT; break;
               case Iop_RecpExpF32: op = ARM64fpu_RECPX; break;
               default: vassert(0);
            }
            addInstr(env, ARM64Instr_VUnaryS(op, dst, src));
            return dst;
         }
         case Iop_F64toF32: {
            /* Narrowing conversion; may round, so honour arg1. */
            HReg srcD = iselDblExpr(env, e->Iex.Binop.arg2);
            set_FPCR_rounding_mode(env, e->Iex.Binop.arg1);
            HReg dstS = newVRegD(env);
            addInstr(env, ARM64Instr_VCvtSD(False/*!sToD*/, dstS, srcD));
            return dstS;
         }
         case Iop_I32UtoF32:
         case Iop_I32StoF32:
         case Iop_I64UtoF32:
         case Iop_I64StoF32: {
            /* Int -> F32 may be inexact, so the rounding mode (arg1)
               must be installed before converting. */
            ARM64CvtOp cvt_op = ARM64cvt_INVALID;
            switch (e->Iex.Binop.op) {
               case Iop_I32UtoF32: cvt_op = ARM64cvt_F32_I32U; break;
               case Iop_I32StoF32: cvt_op = ARM64cvt_F32_I32S; break;
               case Iop_I64UtoF32: cvt_op = ARM64cvt_F32_I64U; break;
               case Iop_I64StoF32: cvt_op = ARM64cvt_F32_I64S; break;
               default: vassert(0);
            }
            HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
            set_FPCR_rounding_mode(env, e->Iex.Binop.arg1);
            HReg dstS = newVRegD(env);
            addInstr(env, ARM64Instr_VCvtI2F(cvt_op, dstS, srcI));
            return dstS;
         }
         default:
            break;
      }
   }

   /* Triops: the scalar FP arithmetic ops, with the rounding mode
      in arg1 and the operands in arg2/arg3. */
   if (e->tag == Iex_Triop) {
      IRTriop* triop = e->Iex.Triop.details;
      ARM64FpBinOp sglop = ARM64fpb_INVALID;
      switch (triop->op) {
         case Iop_DivF32: sglop = ARM64fpb_DIV; break;
         case Iop_MulF32: sglop = ARM64fpb_MUL; break;
         case Iop_SubF32: sglop = ARM64fpb_SUB; break;
         case Iop_AddF32: sglop = ARM64fpb_ADD; break;
         default: break;
      }
      if (sglop != ARM64fpb_INVALID) {
         HReg argL = iselFltExpr(env, triop->arg2);
         HReg argR = iselFltExpr(env, triop->arg3);
         HReg dst = newVRegD(env);
         set_FPCR_rounding_mode(env, triop->arg1);
         addInstr(env, ARM64Instr_VBinS(sglop, dst, argL, argR));
         return dst;
      }
   }

   /* ITE(ccexpr, iftrue, iffalse) -> FP conditional select. */
   if (e->tag == Iex_ITE) {
      ARM64CondCode cc;
      HReg r1 = iselFltExpr(env, e->Iex.ITE.iftrue);
      HReg r0 = iselFltExpr(env, e->Iex.ITE.iffalse);
      HReg dst = newVRegD(env);
      cc = iselCondCode(env, e->Iex.ITE.cond);
      addInstr(env, ARM64Instr_VFCSel(dst, r1, r0, cc, False/*!64-bit*/));
      return dst;
   }

   ppIRExpr(e);
   vpanic("iselFltExpr_wrk");
}
3366 /*---------------------------------------------------------*/
3367 /*--- ISEL: Floating point expressions (16 bit) ---*/
3368 /*---------------------------------------------------------*/
3370 /* Compute a 16-bit floating point value into a register, the identity
3371 of which is returned. As with iselIntExpr_R, the reg may be either
3372 real or virtual; in any case it must not be changed by subsequent
3373 code emitted by the caller. Values are generated into HRcFlt64
3374 registers despite the values themselves being Ity_F16s. */
3376 static HReg iselF16Expr ( ISelEnv* env, IRExpr* e )
3378 HReg r = iselF16Expr_wrk( env, e );
3379 # if 0
3380 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
3381 # endif
3382 vassert(hregClass(r) == HRcFlt64);
3383 vassert(hregIsVirtual(r));
3384 return r;
3387 /* DO NOT CALL THIS DIRECTLY */
3388 static HReg iselF16Expr_wrk ( ISelEnv* env, IRExpr* e )
3390 IRType ty = typeOfIRExpr(env->type_env,e);
3391 vassert(e);
3392 vassert(ty == Ity_F16);
3394 if (e->tag == Iex_Get) {
3395 Int offs = e->Iex.Get.offset;
3396 if (offs >= 0 && offs < 8192 && 0 == (offs & 1)) {
3397 HReg rD = newVRegD(env);
3398 HReg rN = get_baseblock_register();
3399 addInstr(env, ARM64Instr_VLdStH(True/*isLoad*/, rD, rN, offs));
3400 return rD;
3404 if (e->tag == Iex_Binop) {
3405 switch (e->Iex.Binop.op) {
3406 case Iop_F32toF16: {
3407 HReg srcS = iselFltExpr(env, e->Iex.Binop.arg2);
3408 set_FPCR_rounding_mode(env, e->Iex.Binop.arg1);
3409 HReg dstH = newVRegD(env);
3410 addInstr(env, ARM64Instr_VCvtHS(False/*!hToS*/, dstH, srcS));
3411 return dstH;
3413 case Iop_F64toF16: {
3414 HReg srcD = iselDblExpr(env, e->Iex.Binop.arg2);
3415 set_FPCR_rounding_mode(env, e->Iex.Binop.arg1);
3416 HReg dstH = newVRegD(env);
3417 addInstr(env, ARM64Instr_VCvtHD(False/*!hToD*/, dstH, srcD));
3418 return dstH;
3420 default:
3421 break;
3425 ppIRExpr(e);
3426 vpanic("iselF16Expr_wrk");
3430 /*---------------------------------------------------------*/
3431 /*--- ISEL: Vector expressions (256 bit) ---*/
3432 /*---------------------------------------------------------*/
3434 static void iselV256Expr ( /*OUT*/HReg* rHi, HReg* rLo,
3435 ISelEnv* env, IRExpr* e )
3437 iselV256Expr_wrk( rHi, rLo, env, e );
3438 vassert(hregClass(*rHi) == HRcVec128);
3439 vassert(hregClass(*rLo) == HRcVec128);
3440 vassert(hregIsVirtual(*rHi));
3441 vassert(hregIsVirtual(*rLo));
/* DO NOT CALL THIS DIRECTLY.  Worker for iselV256Expr: computes a
   256-bit value as a pair of V128 registers (*rHi, *rLo). */
static void iselV256Expr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
                               ISelEnv* env, IRExpr* e )
{
   vassert(e);
   IRType ty = typeOfIRExpr(env->type_env,e);
   vassert(ty == Ity_V256);

   /* read 256-bit IRTemp */
   if (e->tag == Iex_RdTmp) {
      lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp);
      return;
   }

   if (e->tag == Iex_Binop) {
      switch (e->Iex.Binop.op) {
         case Iop_V128HLtoV256: {
            /* Simple concatenation: arg1 is the high half. */
            *rHi = iselV128Expr(env, e->Iex.Binop.arg1);
            *rLo = iselV128Expr(env, e->Iex.Binop.arg2);
            return;
         }
         /* Saturating vector shifts that also report saturation:
            the low V128 is the shift result, the high V128 carries
            the FPSR.QC flag in its least significant bit. */
         case Iop_QandSQsh64x2:
         case Iop_QandSQsh32x4:
         case Iop_QandSQsh16x8:
         case Iop_QandSQsh8x16:
         case Iop_QandUQsh64x2:
         case Iop_QandUQsh32x4:
         case Iop_QandUQsh16x8:
         case Iop_QandUQsh8x16:
         case Iop_QandSQRsh64x2:
         case Iop_QandSQRsh32x4:
         case Iop_QandSQRsh16x8:
         case Iop_QandSQRsh8x16:
         case Iop_QandUQRsh64x2:
         case Iop_QandUQRsh32x4:
         case Iop_QandUQRsh16x8:
         case Iop_QandUQRsh8x16:
         {
            HReg argL = iselV128Expr(env, e->Iex.Binop.arg1);
            HReg argR = iselV128Expr(env, e->Iex.Binop.arg2);
            HReg fpsr = newVRegI(env);
            HReg resHi = newVRegV(env);
            HReg resLo = newVRegV(env);
            ARM64VecBinOp op = ARM64vecb_INVALID;
            switch (e->Iex.Binop.op) {
               case Iop_QandSQsh64x2: op = ARM64vecb_SQSHL64x2; break;
               case Iop_QandSQsh32x4: op = ARM64vecb_SQSHL32x4; break;
               case Iop_QandSQsh16x8: op = ARM64vecb_SQSHL16x8; break;
               case Iop_QandSQsh8x16: op = ARM64vecb_SQSHL8x16; break;
               case Iop_QandUQsh64x2: op = ARM64vecb_UQSHL64x2; break;
               case Iop_QandUQsh32x4: op = ARM64vecb_UQSHL32x4; break;
               case Iop_QandUQsh16x8: op = ARM64vecb_UQSHL16x8; break;
               case Iop_QandUQsh8x16: op = ARM64vecb_UQSHL8x16; break;
               case Iop_QandSQRsh64x2: op = ARM64vecb_SQRSHL64x2; break;
               case Iop_QandSQRsh32x4: op = ARM64vecb_SQRSHL32x4; break;
               case Iop_QandSQRsh16x8: op = ARM64vecb_SQRSHL16x8; break;
               case Iop_QandSQRsh8x16: op = ARM64vecb_SQRSHL8x16; break;
               case Iop_QandUQRsh64x2: op = ARM64vecb_UQRSHL64x2; break;
               case Iop_QandUQRsh32x4: op = ARM64vecb_UQRSHL32x4; break;
               case Iop_QandUQRsh16x8: op = ARM64vecb_UQRSHL16x8; break;
               case Iop_QandUQRsh8x16: op = ARM64vecb_UQRSHL8x16; break;
               default: vassert(0);
            }
            /* Clear FPSR.Q, do the operation, and return both its result
               and the new value of FPSR.Q.  We can simply zero out FPSR
               since all the other bits have no relevance in VEX generated
               code. */
            addInstr(env, ARM64Instr_Imm64(fpsr, 0));
            addInstr(env, ARM64Instr_FPSR(True/*toFPSR*/, fpsr));
            addInstr(env, ARM64Instr_VBinV(op, resLo, argL, argR));
            addInstr(env, ARM64Instr_FPSR(False/*!toFPSR*/, fpsr));
            /* Move the QC flag (bit 27 of FPSR) down to bit 0 and
               isolate it. */
            addInstr(env, ARM64Instr_Shift(fpsr, fpsr, ARM64RI6_I6(27),
                                                       ARM64sh_SHR));
            ARM64RIL* ril_one = mb_mkARM64RIL_I(1);
            vassert(ril_one);
            addInstr(env, ARM64Instr_Logic(fpsr, fpsr, ril_one, ARM64lo_AND));
            /* Now we have: the main (shift) result in |resLo|, and the
               Q bit at the bottom of |fpsr|. */
            addInstr(env, ARM64Instr_VQfromX(resHi, fpsr));
            *rHi = resHi;
            *rLo = resLo;
            return;
         }
         /* ... */
         default:
            break;
      } /* switch on the binop */
   } /* if (e->tag == Iex_Binop) */

   ppIRExpr(e);
   vpanic("iselV256Expr_wrk");
}
3539 /*---------------------------------------------------------*/
3540 /*--- ISEL: Statements ---*/
3541 /*---------------------------------------------------------*/
3543 static void iselStmt ( ISelEnv* env, IRStmt* stmt )
3545 if (vex_traceflags & VEX_TRACE_VCODE) {
3546 vex_printf("\n-- ");
3547 ppIRStmt(stmt);
3548 vex_printf("\n");
3550 switch (stmt->tag) {
3552 /* --------- STORE --------- */
3553 /* little-endian write to memory */
3554 case Ist_Store: {
3555 IRType tya = typeOfIRExpr(env->type_env, stmt->Ist.Store.addr);
3556 IRType tyd = typeOfIRExpr(env->type_env, stmt->Ist.Store.data);
3557 IREndness end = stmt->Ist.Store.end;
3559 if (tya != Ity_I64 || end != Iend_LE)
3560 goto stmt_fail;
3562 if (tyd == Ity_I64) {
3563 HReg rD = iselIntExpr_R(env, stmt->Ist.Store.data);
3564 ARM64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr, tyd);
3565 addInstr(env, ARM64Instr_LdSt64(False/*!isLoad*/, rD, am));
3566 return;
3568 if (tyd == Ity_I32) {
3569 HReg rD = iselIntExpr_R(env, stmt->Ist.Store.data);
3570 ARM64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr, tyd);
3571 addInstr(env, ARM64Instr_LdSt32(False/*!isLoad*/, rD, am));
3572 return;
3574 if (tyd == Ity_I16) {
3575 HReg rD = iselIntExpr_R(env, stmt->Ist.Store.data);
3576 ARM64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr, tyd);
3577 addInstr(env, ARM64Instr_LdSt16(False/*!isLoad*/, rD, am));
3578 return;
3580 if (tyd == Ity_I8) {
3581 HReg rD = iselIntExpr_R(env, stmt->Ist.Store.data);
3582 ARM64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr, tyd);
3583 addInstr(env, ARM64Instr_LdSt8(False/*!isLoad*/, rD, am));
3584 return;
3586 if (tyd == Ity_V128) {
3587 HReg qD = iselV128Expr(env, stmt->Ist.Store.data);
3588 HReg addr = iselIntExpr_R(env, stmt->Ist.Store.addr);
3589 addInstr(env, ARM64Instr_VLdStQ(False/*!isLoad*/, qD, addr));
3590 return;
3592 if (tyd == Ity_F64) {
3593 HReg dD = iselDblExpr(env, stmt->Ist.Store.data);
3594 HReg addr = iselIntExpr_R(env, stmt->Ist.Store.addr);
3595 addInstr(env, ARM64Instr_VLdStD(False/*!isLoad*/, dD, addr, 0));
3596 return;
3598 if (tyd == Ity_F32) {
3599 HReg sD = iselFltExpr(env, stmt->Ist.Store.data);
3600 HReg addr = iselIntExpr_R(env, stmt->Ist.Store.addr);
3601 addInstr(env, ARM64Instr_VLdStS(False/*!isLoad*/, sD, addr, 0));
3602 return;
3604 break;
3607 /* --------- PUT --------- */
3608 /* write guest state, fixed offset */
3609 case Ist_Put: {
3610 IRType tyd = typeOfIRExpr(env->type_env, stmt->Ist.Put.data);
3611 UInt offs = (UInt)stmt->Ist.Put.offset;
3612 if (tyd == Ity_I64 && 0 == (offs & 7) && offs < (8<<12)) {
3613 HReg rD = iselIntExpr_R(env, stmt->Ist.Put.data);
3614 ARM64AMode* am = mk_baseblock_64bit_access_amode(offs);
3615 addInstr(env, ARM64Instr_LdSt64(False/*!isLoad*/, rD, am));
3616 return;
3618 if (tyd == Ity_I32 && 0 == (offs & 3) && offs < (4<<12)) {
3619 HReg rD = iselIntExpr_R(env, stmt->Ist.Put.data);
3620 ARM64AMode* am = mk_baseblock_32bit_access_amode(offs);
3621 addInstr(env, ARM64Instr_LdSt32(False/*!isLoad*/, rD, am));
3622 return;
3624 if (tyd == Ity_I16 && 0 == (offs & 1) && offs < (2<<12)) {
3625 HReg rD = iselIntExpr_R(env, stmt->Ist.Put.data);
3626 ARM64AMode* am = mk_baseblock_16bit_access_amode(offs);
3627 addInstr(env, ARM64Instr_LdSt16(False/*!isLoad*/, rD, am));
3628 return;
3630 if (tyd == Ity_I8 && offs < (1<<12)) {
3631 HReg rD = iselIntExpr_R(env, stmt->Ist.Put.data);
3632 ARM64AMode* am = mk_baseblock_8bit_access_amode(offs);
3633 addInstr(env, ARM64Instr_LdSt8(False/*!isLoad*/, rD, am));
3634 return;
3636 if (tyd == Ity_V128 && offs < (1<<12)) {
3637 HReg qD = iselV128Expr(env, stmt->Ist.Put.data);
3638 HReg addr = mk_baseblock_128bit_access_addr(env, offs);
3639 addInstr(env, ARM64Instr_VLdStQ(False/*!isLoad*/, qD, addr));
3640 return;
3642 if (tyd == Ity_F64 && 0 == (offs & 7) && offs < (8<<12)) {
3643 HReg dD = iselDblExpr(env, stmt->Ist.Put.data);
3644 HReg bbp = get_baseblock_register();
3645 addInstr(env, ARM64Instr_VLdStD(False/*!isLoad*/, dD, bbp, offs));
3646 return;
3648 if (tyd == Ity_F32 && 0 == (offs & 3) && offs < (4<<12)) {
3649 HReg sD = iselFltExpr(env, stmt->Ist.Put.data);
3650 HReg bbp = get_baseblock_register();
3651 addInstr(env, ARM64Instr_VLdStS(False/*!isLoad*/, sD, bbp, offs));
3652 return;
3654 if (tyd == Ity_F16 && 0 == (offs & 1) && offs < (2<<12)) {
3655 HReg hD = iselF16Expr(env, stmt->Ist.Put.data);
3656 HReg bbp = get_baseblock_register();
3657 addInstr(env, ARM64Instr_VLdStH(False/*!isLoad*/, hD, bbp, offs));
3658 return;
3661 break;
3664 /* --------- TMP --------- */
3665 /* assign value to temporary */
3666 case Ist_WrTmp: {
3667 IRTemp tmp = stmt->Ist.WrTmp.tmp;
3668 IRType ty = typeOfIRTemp(env->type_env, tmp);
3670 if (ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8) {
3671 /* We could do a lot better here. But for the time being: */
3672 HReg dst = lookupIRTemp(env, tmp);
3673 HReg rD = iselIntExpr_R(env, stmt->Ist.WrTmp.data);
3674 addInstr(env, ARM64Instr_MovI(dst, rD));
3675 return;
3677 if (ty == Ity_I1) {
3678 /* Here, we are generating a I1 value into a 64 bit register.
3679 Make sure the value in the register is only zero or one,
3680 but no other. This allows optimisation of the
3681 1Uto64(tmp:I1) case, by making it simply a copy of the
3682 register holding 'tmp'. The point being that the value in
3683 the register holding 'tmp' can only have been created
3684 here. LATER: that seems dangerous; safer to do 'tmp & 1'
3685 in that case. Also, could do this just with a single CINC
3686 insn. */
3687 /* CLONE-01 */
3688 HReg zero = newVRegI(env);
3689 HReg one = newVRegI(env);
3690 HReg dst = lookupIRTemp(env, tmp);
3691 addInstr(env, ARM64Instr_Imm64(zero, 0));
3692 addInstr(env, ARM64Instr_Imm64(one, 1));
3693 ARM64CondCode cc = iselCondCode(env, stmt->Ist.WrTmp.data);
3694 addInstr(env, ARM64Instr_CSel(dst, one, zero, cc));
3695 return;
3697 if (ty == Ity_F64) {
3698 HReg src = iselDblExpr(env, stmt->Ist.WrTmp.data);
3699 HReg dst = lookupIRTemp(env, tmp);
3700 addInstr(env, ARM64Instr_VMov(8, dst, src));
3701 return;
3703 if (ty == Ity_F32) {
3704 HReg src = iselFltExpr(env, stmt->Ist.WrTmp.data);
3705 HReg dst = lookupIRTemp(env, tmp);
3706 addInstr(env, ARM64Instr_VMov(8/*yes, really*/, dst, src));
3707 return;
3709 if (ty == Ity_V128) {
3710 HReg src = iselV128Expr(env, stmt->Ist.WrTmp.data);
3711 HReg dst = lookupIRTemp(env, tmp);
3712 addInstr(env, ARM64Instr_VMov(16, dst, src));
3713 return;
3715 if (ty == Ity_V256) {
3716 HReg srcHi, srcLo, dstHi, dstLo;
3717 iselV256Expr(&srcHi,&srcLo, env, stmt->Ist.WrTmp.data);
3718 lookupIRTempPair( &dstHi, &dstLo, env, tmp);
3719 addInstr(env, ARM64Instr_VMov(16, dstHi, srcHi));
3720 addInstr(env, ARM64Instr_VMov(16, dstLo, srcLo));
3721 return;
3723 break;
3726 /* --------- Call to DIRTY helper --------- */
3727 /* call complex ("dirty") helper function */
3728 case Ist_Dirty: {
3729 IRDirty* d = stmt->Ist.Dirty.details;
3731 /* Figure out the return type, if any. */
3732 IRType retty = Ity_INVALID;
3733 if (d->tmp != IRTemp_INVALID)
3734 retty = typeOfIRTemp(env->type_env, d->tmp);
3736 Bool retty_ok = False;
3737 switch (retty) {
3738 case Ity_INVALID: /* function doesn't return anything */
3739 case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
3740 case Ity_V128:
3741 retty_ok = True; break;
3742 default:
3743 break;
3745 if (!retty_ok)
3746 break; /* will go to stmt_fail: */
3748 /* Marshal args, do the call, and set the return value to 0x555..555
3749 if this is a conditional call that returns a value and the
3750 call is skipped. */
3751 UInt addToSp = 0;
3752 RetLoc rloc = mk_RetLoc_INVALID();
3753 doHelperCall( &addToSp, &rloc, env, d->guard, d->cee, retty, d->args );
3754 vassert(is_sane_RetLoc(rloc));
3756 /* Now figure out what to do with the returned value, if any. */
3757 switch (retty) {
3758 case Ity_INVALID: {
3759 /* No return value. Nothing to do. */
3760 vassert(d->tmp == IRTemp_INVALID);
3761 vassert(rloc.pri == RLPri_None);
3762 vassert(addToSp == 0);
3763 return;
3765 case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: {
3766 vassert(rloc.pri == RLPri_Int);
3767 vassert(addToSp == 0);
3768 /* The returned value is in x0. Park it in the register
3769 associated with tmp. */
3770 HReg dst = lookupIRTemp(env, d->tmp);
3771 addInstr(env, ARM64Instr_MovI(dst, hregARM64_X0()) );
3772 return;
3774 case Ity_V128: {
3775 /* The returned value is on the stack, and *retloc tells
3776 us where. Fish it off the stack and then move the
3777 stack pointer upwards to clear it, as directed by
3778 doHelperCall. */
3779 vassert(rloc.pri == RLPri_V128SpRel);
3780 vassert(rloc.spOff < 256); // stay sane
3781 vassert(addToSp >= 16); // ditto
3782 vassert(addToSp < 256); // ditto
3783 HReg dst = lookupIRTemp(env, d->tmp);
3784 HReg tmp = newVRegI(env); // the address of the returned value
3785 addInstr(env, ARM64Instr_FromSP(tmp)); // tmp = SP
3786 addInstr(env, ARM64Instr_Arith(tmp, tmp,
3787 ARM64RIA_I12((UShort)rloc.spOff, 0),
3788 True/*isAdd*/ ));
3789 addInstr(env, ARM64Instr_VLdStQ(True/*isLoad*/, dst, tmp));
3790 addInstr(env, ARM64Instr_AddToSP(addToSp));
3791 return;
3793 default:
3794 /*NOTREACHED*/
3795 vassert(0);
3797 break;
3800 /* --------- Load Linked and Store Conditional --------- */
3801 case Ist_LLSC: {
3802 if (stmt->Ist.LLSC.storedata == NULL) {
3803 /* LL */
3804 IRTemp res = stmt->Ist.LLSC.result;
3805 IRType ty = typeOfIRTemp(env->type_env, res);
3806 if (ty == Ity_I64 || ty == Ity_I32
3807 || ty == Ity_I16 || ty == Ity_I8) {
3808 Int szB = 0;
3809 HReg r_dst = lookupIRTemp(env, res);
3810 HReg raddr = iselIntExpr_R(env, stmt->Ist.LLSC.addr);
3811 switch (ty) {
3812 case Ity_I8: szB = 1; break;
3813 case Ity_I16: szB = 2; break;
3814 case Ity_I32: szB = 4; break;
3815 case Ity_I64: szB = 8; break;
3816 default: vassert(0);
3818 addInstr(env, ARM64Instr_MovI(hregARM64_X4(), raddr));
3819 addInstr(env, ARM64Instr_LdrEX(szB));
3820 addInstr(env, ARM64Instr_MovI(r_dst, hregARM64_X2()));
3821 return;
3823 goto stmt_fail;
3824 } else {
3825 /* SC */
3826 IRType tyd = typeOfIRExpr(env->type_env, stmt->Ist.LLSC.storedata);
3827 if (tyd == Ity_I64 || tyd == Ity_I32
3828 || tyd == Ity_I16 || tyd == Ity_I8) {
3829 Int szB = 0;
3830 HReg rD = iselIntExpr_R(env, stmt->Ist.LLSC.storedata);
3831 HReg rA = iselIntExpr_R(env, stmt->Ist.LLSC.addr);
3832 switch (tyd) {
3833 case Ity_I8: szB = 1; break;
3834 case Ity_I16: szB = 2; break;
3835 case Ity_I32: szB = 4; break;
3836 case Ity_I64: szB = 8; break;
3837 default: vassert(0);
3839 addInstr(env, ARM64Instr_MovI(hregARM64_X2(), rD));
3840 addInstr(env, ARM64Instr_MovI(hregARM64_X4(), rA));
3841 addInstr(env, ARM64Instr_StrEX(szB));
3842 } else {
3843 goto stmt_fail;
3845 /* now r0 is 1 if failed, 0 if success. Change to IR
3846 conventions (0 is fail, 1 is success). Also transfer
3847 result to r_res. */
3848 IRTemp res = stmt->Ist.LLSC.result;
3849 IRType ty = typeOfIRTemp(env->type_env, res);
3850 HReg r_res = lookupIRTemp(env, res);
3851 ARM64RIL* one = mb_mkARM64RIL_I(1);
3852 vassert(ty == Ity_I1);
3853 vassert(one);
3854 addInstr(env, ARM64Instr_Logic(r_res, hregARM64_X0(), one,
3855 ARM64lo_XOR));
3856 /* And be conservative -- mask off all but the lowest bit. */
3857 addInstr(env, ARM64Instr_Logic(r_res, r_res, one,
3858 ARM64lo_AND));
3859 return;
3861 break;
3864 /* --------- ACAS --------- */
3865 case Ist_CAS: {
3866 if (stmt->Ist.CAS.details->oldHi == IRTemp_INVALID) {
3867 /* "normal" singleton CAS */
3868 UChar sz;
3869 IRCAS* cas = stmt->Ist.CAS.details;
3870 IRType ty = typeOfIRExpr(env->type_env, cas->dataLo);
3871 switch (ty) {
3872 case Ity_I64: sz = 8; break;
3873 case Ity_I32: sz = 4; break;
3874 case Ity_I16: sz = 2; break;
3875 case Ity_I8: sz = 1; break;
3876 default: goto unhandled_cas;
3878 HReg rAddr = iselIntExpr_R(env, cas->addr);
3879 HReg rExpd = iselIntExpr_R(env, cas->expdLo);
3880 HReg rData = iselIntExpr_R(env, cas->dataLo);
3881 vassert(cas->expdHi == NULL);
3882 vassert(cas->dataHi == NULL);
3883 addInstr(env, ARM64Instr_MovI(hregARM64_X3(), rAddr));
3884 addInstr(env, ARM64Instr_MovI(hregARM64_X5(), rExpd));
3885 addInstr(env, ARM64Instr_MovI(hregARM64_X7(), rData));
3886 addInstr(env, ARM64Instr_CAS(sz));
3887 /* Now we have the lowest szB bytes of x1 are either equal to
3888 the lowest szB bytes of x5, indicating success, or they
3889 aren't, indicating failure. The IR semantics actually
3890 require us to return the old value at the location,
3891 regardless of success or failure, but in the case of
3892 failure it's not clear how to do this, since
3893 ARM64Instr_CAS can't provide that. Instead we'll just
3894 return the relevant bit of x1, since that's at least
3895 guaranteed to be different from the lowest bits of x5 on
3896 failure. */
3897 HReg rResult = hregARM64_X1();
3898 switch (sz) {
3899 case 8: break;
3900 case 4: rResult = widen_z_32_to_64(env, rResult); break;
3901 case 2: rResult = widen_z_16_to_64(env, rResult); break;
3902 case 1: rResult = widen_z_8_to_64(env, rResult); break;
3903 default: vassert(0);
3905 // "old" in this case is interpreted somewhat liberally, per
3906 // the previous comment.
3907 HReg rOld = lookupIRTemp(env, cas->oldLo);
3908 addInstr(env, ARM64Instr_MovI(rOld, rResult));
3909 return;
3911 unhandled_cas:
3912 break;
3915 /* --------- MEM FENCE --------- */
3916 case Ist_MBE:
3917 switch (stmt->Ist.MBE.event) {
3918 case Imbe_Fence:
3919 addInstr(env, ARM64Instr_MFence());
3920 return;
3921 case Imbe_CancelReservation:
3922 addInstr(env, ARM64Instr_ClrEX());
3923 return;
3924 default:
3925 break;
3927 break;
3929 /* --------- INSTR MARK --------- */
3930 /* Doesn't generate any executable code ... */
3931 case Ist_IMark:
3932 return;
3934 /* --------- ABI HINT --------- */
3935 /* These have no meaning (denotation in the IR) and so we ignore
3936 them ... if any actually made it this far. */
3937 case Ist_AbiHint:
3938 return;
3940 /* --------- NO-OP --------- */
3941 case Ist_NoOp:
3942 return;
3944 /* --------- EXIT --------- */
3945 case Ist_Exit: {
3946 if (stmt->Ist.Exit.dst->tag != Ico_U64)
3947 vpanic("isel_arm: Ist_Exit: dst is not a 64-bit value");
3949 ARM64CondCode cc
3950 = iselCondCode(env, stmt->Ist.Exit.guard);
3951 ARM64AMode* amPC
3952 = mk_baseblock_64bit_access_amode(stmt->Ist.Exit.offsIP);
3954 /* Case: boring transfer to known address */
3955 if (stmt->Ist.Exit.jk == Ijk_Boring) {
3956 if (env->chainingAllowed) {
3957 /* .. almost always true .. */
3958 /* Skip the event check at the dst if this is a forwards
3959 edge. */
3960 Bool toFastEP
3961 = ((Addr64)stmt->Ist.Exit.dst->Ico.U64) > env->max_ga;
3962 if (0) vex_printf("%s", toFastEP ? "Y" : ",");
3963 addInstr(env, ARM64Instr_XDirect(stmt->Ist.Exit.dst->Ico.U64,
3964 amPC, cc, toFastEP));
3965 } else {
3966 /* .. very occasionally .. */
3967 /* We can't use chaining, so ask for an assisted transfer,
3968 as that's the only alternative that is allowable. */
3969 HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
3970 addInstr(env, ARM64Instr_XAssisted(r, amPC, cc, Ijk_Boring));
3972 return;
3975 /* Case: assisted transfer to arbitrary address */
3976 switch (stmt->Ist.Exit.jk) {
3977 /* Keep this list in sync with that for iselNext below */
3978 case Ijk_ClientReq:
3979 case Ijk_NoDecode:
3980 case Ijk_NoRedir:
3981 case Ijk_Sys_syscall:
3982 case Ijk_InvalICache:
3983 case Ijk_FlushDCache:
3984 case Ijk_SigTRAP:
3985 case Ijk_Yield: {
3986 HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
3987 addInstr(env, ARM64Instr_XAssisted(r, amPC, cc,
3988 stmt->Ist.Exit.jk));
3989 return;
3991 default:
3992 break;
3995 /* Do we ever expect to see any other kind? */
3996 goto stmt_fail;
3999 default: break;
4001 stmt_fail:
4002 ppIRStmt(stmt);
4003 vpanic("iselStmt");
4007 /*---------------------------------------------------------*/
4008 /*--- ISEL: Basic block terminators (Nexts) ---*/
4009 /*---------------------------------------------------------*/
4011 static void iselNext ( ISelEnv* env,
4012 IRExpr* next, IRJumpKind jk, Int offsIP )
4014 if (vex_traceflags & VEX_TRACE_VCODE) {
4015 vex_printf( "\n-- PUT(%d) = ", offsIP);
4016 ppIRExpr( next );
4017 vex_printf( "; exit-");
4018 ppIRJumpKind(jk);
4019 vex_printf( "\n");
4022 /* Case: boring transfer to known address */
4023 if (next->tag == Iex_Const) {
4024 IRConst* cdst = next->Iex.Const.con;
4025 vassert(cdst->tag == Ico_U64);
4026 if (jk == Ijk_Boring || jk == Ijk_Call) {
4027 /* Boring transfer to known address */
4028 ARM64AMode* amPC = mk_baseblock_64bit_access_amode(offsIP);
4029 if (env->chainingAllowed) {
4030 /* .. almost always true .. */
4031 /* Skip the event check at the dst if this is a forwards
4032 edge. */
4033 Bool toFastEP
4034 = ((Addr64)cdst->Ico.U64) > env->max_ga;
4035 if (0) vex_printf("%s", toFastEP ? "X" : ".");
4036 addInstr(env, ARM64Instr_XDirect(cdst->Ico.U64,
4037 amPC, ARM64cc_AL,
4038 toFastEP));
4039 } else {
4040 /* .. very occasionally .. */
4041 /* We can't use chaining, so ask for an assisted transfer,
4042 as that's the only alternative that is allowable. */
4043 HReg r = iselIntExpr_R(env, next);
4044 addInstr(env, ARM64Instr_XAssisted(r, amPC, ARM64cc_AL,
4045 Ijk_Boring));
4047 return;
4051 /* Case: call/return (==boring) transfer to any address */
4052 switch (jk) {
4053 case Ijk_Boring: case Ijk_Ret: case Ijk_Call: {
4054 HReg r = iselIntExpr_R(env, next);
4055 ARM64AMode* amPC = mk_baseblock_64bit_access_amode(offsIP);
4056 if (env->chainingAllowed) {
4057 addInstr(env, ARM64Instr_XIndir(r, amPC, ARM64cc_AL));
4058 } else {
4059 addInstr(env, ARM64Instr_XAssisted(r, amPC, ARM64cc_AL,
4060 Ijk_Boring));
4062 return;
4064 default:
4065 break;
4068 /* Case: assisted transfer to arbitrary address */
4069 switch (jk) {
4070 /* Keep this list in sync with that for Ist_Exit above */
4071 case Ijk_ClientReq:
4072 case Ijk_NoDecode:
4073 case Ijk_NoRedir:
4074 case Ijk_Sys_syscall:
4075 case Ijk_InvalICache:
4076 case Ijk_FlushDCache:
4077 case Ijk_SigTRAP:
4078 case Ijk_Yield:
4080 HReg r = iselIntExpr_R(env, next);
4081 ARM64AMode* amPC = mk_baseblock_64bit_access_amode(offsIP);
4082 addInstr(env, ARM64Instr_XAssisted(r, amPC, ARM64cc_AL, jk));
4083 return;
4085 default:
4086 break;
4089 vex_printf( "\n-- PUT(%d) = ", offsIP);
4090 ppIRExpr( next );
4091 vex_printf( "; exit-");
4092 ppIRJumpKind(jk);
4093 vex_printf( "\n");
4094 vassert(0); // are we expecting any other kind?
4098 /*---------------------------------------------------------*/
4099 /*--- Insn selector top-level ---*/
4100 /*---------------------------------------------------------*/
4102 /* Translate an entire SB to arm64 code. */
4104 HInstrArray* iselSB_ARM64 ( const IRSB* bb,
4105 VexArch arch_host,
4106 const VexArchInfo* archinfo_host,
4107 const VexAbiInfo* vbi/*UNUSED*/,
4108 Int offs_Host_EvC_Counter,
4109 Int offs_Host_EvC_FailAddr,
4110 Bool chainingAllowed,
4111 Bool addProfInc,
4112 Addr max_ga )
4114 Int i, j;
4115 HReg hreg, hregHI;
4116 ISelEnv* env;
4117 UInt hwcaps_host = archinfo_host->hwcaps;
4118 ARM64AMode *amCounter, *amFailAddr;
4120 /* sanity ... */
4121 vassert(arch_host == VexArchARM64);
4123 /* Check that the host's endianness is as expected. */
4124 vassert(archinfo_host->endness == VexEndnessLE);
4126 /* guard against unexpected space regressions */
4127 vassert(sizeof(ARM64Instr) <= 32);
4129 /* Make up an initial environment to use. */
4130 env = LibVEX_Alloc_inline(sizeof(ISelEnv));
4131 env->vreg_ctr = 0;
4133 /* Set up output code array. */
4134 env->code = newHInstrArray();
4136 /* Copy BB's type env. */
4137 env->type_env = bb->tyenv;
4139 /* Make up an IRTemp -> virtual HReg mapping. This doesn't
4140 change as we go along. */
4141 env->n_vregmap = bb->tyenv->types_used;
4142 env->vregmap = LibVEX_Alloc_inline(env->n_vregmap * sizeof(HReg));
4143 env->vregmapHI = LibVEX_Alloc_inline(env->n_vregmap * sizeof(HReg));
4145 /* and finally ... */
4146 env->chainingAllowed = chainingAllowed;
4147 env->hwcaps = hwcaps_host;
4148 env->previous_rm = NULL;
4149 env->max_ga = max_ga;
4151 /* For each IR temporary, allocate a suitably-kinded virtual
4152 register. */
4153 j = 0;
4154 for (i = 0; i < env->n_vregmap; i++) {
4155 hregHI = hreg = INVALID_HREG;
4156 switch (bb->tyenv->types[i]) {
4157 case Ity_I1:
4158 case Ity_I8: case Ity_I16: case Ity_I32: case Ity_I64:
4159 hreg = mkHReg(True, HRcInt64, 0, j++);
4160 break;
4161 case Ity_I128:
4162 hreg = mkHReg(True, HRcInt64, 0, j++);
4163 hregHI = mkHReg(True, HRcInt64, 0, j++);
4164 break;
4165 case Ity_F16: // we'll use HRcFlt64 regs for F16 too
4166 case Ity_F32: // we'll use HRcFlt64 regs for F32 too
4167 case Ity_F64:
4168 hreg = mkHReg(True, HRcFlt64, 0, j++);
4169 break;
4170 case Ity_V128:
4171 hreg = mkHReg(True, HRcVec128, 0, j++);
4172 break;
4173 case Ity_V256:
4174 hreg = mkHReg(True, HRcVec128, 0, j++);
4175 hregHI = mkHReg(True, HRcVec128, 0, j++);
4176 break;
4177 default:
4178 ppIRType(bb->tyenv->types[i]);
4179 vpanic("iselBB(arm64): IRTemp type");
4181 env->vregmap[i] = hreg;
4182 env->vregmapHI[i] = hregHI;
4184 env->vreg_ctr = j;
4186 /* The very first instruction must be an event check. */
4187 amCounter = ARM64AMode_RI9(hregARM64_X21(), offs_Host_EvC_Counter);
4188 amFailAddr = ARM64AMode_RI9(hregARM64_X21(), offs_Host_EvC_FailAddr);
4189 addInstr(env, ARM64Instr_EvCheck(amCounter, amFailAddr));
4191 /* Possibly a block counter increment (for profiling). At this
4192 point we don't know the address of the counter, so just pretend
4193 it is zero. It will have to be patched later, but before this
4194 translation is used, by a call to LibVEX_patchProfCtr. */
4195 if (addProfInc) {
4196 addInstr(env, ARM64Instr_ProfInc());
4199 /* Ok, finally we can iterate over the statements. */
4200 for (i = 0; i < bb->stmts_used; i++)
4201 iselStmt(env, bb->stmts[i]);
4203 iselNext(env, bb->next, bb->jumpkind, bb->offsIP);
4205 /* record the number of vregs we used. */
4206 env->code->n_vregs = env->vreg_ctr;
4207 return env->code;
4211 /*---------------------------------------------------------------*/
4212 /*--- end host_arm64_isel.c ---*/
4213 /*---------------------------------------------------------------*/