/* -*- mode: C; c-basic-offset: 3; -*- */

/*---------------------------------------------------------------*/
/*--- begin                                          ir_opt.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2017 OpenWorks LLP
      info@open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

#include "libvex_basictypes.h"
#include "libvex_ir.h"
#include "libvex.h"

#include "main_util.h"
#include "main_globals.h"
#include "ir_opt.h"


/* Set to 1 for lots of debugging output. */
#define DEBUG_IROPT 0

/* Set to 1 to gather some statistics.  Currently only for sameIRExprs. */
#define STATS_IROPT 0


/* What iropt does, 29 Dec 04.

   It takes an IRSB and produces a new one with the same meaning,
   defined thus:

   After execution of the new BB, all guest state and guest memory is
   the same as after execution of the original.  This is true
   regardless of how the block was exited (at the end vs side exit).

   In addition, parts of the guest state will be identical to that
   created by execution of the original at the following observation
   points:

   * In a dirty helper call, any parts of the guest state that the
     helper states that it reads or modifies will be up to date.
     Also, guest memory will be up to date.  Parts of the guest state
     not marked as being read or modified by the helper cannot be
     assumed to be up-to-date at the point where the helper is called.

   * If iropt_register_updates == VexRegUpdSpAtMemAccess :
     The guest state is only up to date as explained above
     (i.e. at SB exits and as specified by dirty helper call).
     Also, the stack pointer register is up to date at memory
     exception points (as this is needed for the stack extension
     logic in m_signals.c).

   * If iropt_register_updates == VexRegUpdUnwindregsAtMemAccess :
     Immediately prior to any load or store, those parts of the guest
     state marked as requiring precise exceptions will be up to date.
     Also, guest memory will be up to date.  Parts of the guest state
     not marked as requiring precise exceptions cannot be assumed to
     be up-to-date at the point of the load/store.

   * If iropt_register_updates == VexRegUpdAllregsAtMemAccess:
     Same as minimal, but all the guest state is up to date at memory
     exception points.

   * If iropt_register_updates == VexRegUpdAllregsAtEachInsn :
     Guest state is up to date at each instruction.

   The relative order of loads and stores (including loads/stores of
   guest memory done by dirty helpers annotated as such) is not
   changed.  However, the relative order of loads with no intervening
   stores/modifies may be changed.

   Transformation order
   ~~~~~~~~~~~~~~~~~~~~

   There are three levels of optimisation, controlled by
   vex_control.iropt_level.  Define first:

   "Cheap transformations" are the following sequence:
      * Redundant-Get removal
      * Redundant-Put removal
      * Constant propagation/folding
      * Dead code removal
      * Specialisation of clean helper functions
      * Dead code removal

   "Expensive transformations" are the following sequence:
      * CSE
      * Folding of add/sub chains
      * Redundant-GetI removal
      * Redundant-PutI removal
      * Dead code removal

   Then the transformations are as follows, as defined by
   vex_control.iropt_level:

   Level 0:
      * Flatten into atomic form.

   Level 1: the following sequence:
      * Flatten into atomic form.
      * Cheap transformations.

   Level 2: the following sequence:
      * Flatten into atomic form.
      * Cheap transformations.
      * If block contains any floating or vector types, CSE.
      * If block contains GetI or PutI, Expensive transformations.
      * Try unrolling loops.  Three possible outcomes:
        - No effect: do nothing more.
        - Unrolled a loop, and block does not contain GetI or PutI:
          Do: * CSE
              * Dead code removal
        - Unrolled a loop, and block contains GetI or PutI:
          Do: * Expensive transformations
              * Cheap transformations
*/

/* Implementation notes, 29 Dec 04.

   TODO (important): I think rPutI removal ignores precise exceptions
   and is therefore in a sense, wrong.  In the sense that PutIs are
   assumed not to write parts of the guest state that we need to have
   up-to-date at loads/stores.  So far on x86 guest that has not
   mattered since indeed only the x87 FP registers and tags are
   accessed using GetI/PutI, and there is no need so far for them to
   be up to date at mem exception points.  The rPutI pass should be
   fixed.

   TODO: improve pessimistic handling of precise exceptions
     in the tree builder.

   TODO: check interaction of rGetI and dirty helpers.

   F64i constants are treated differently from other constants.
   They are not regarded as atoms, and instead lifted off and
   bound to temps.  This allows them to participate in CSE, which
   is important for getting good performance for x86 guest code.

   CSE up F64 literals (already doing F64is)

   CSE: consider carefully the requirement for precise exns
     prior to making CSE any more aggressive. */


/*---------------------------------------------------------------*/
/*--- Finite mappery, of a sort                               ---*/
/*---------------------------------------------------------------*/

/* General map from HWord-sized thing to HWord-sized thing.  Could be
   by hashing, but it's not clear whether or not this would really be
   any faster. */

typedef
   struct {
      Bool*  inuse;
      HWord* key;
      HWord* val;
      Int    size;
      Int    used;
   }
   HashHW;

static HashHW* newHHW ( void )
{
   HashHW* h = LibVEX_Alloc_inline(sizeof(HashHW));
   h->size   = 8;
   h->used   = 0;
   h->inuse  = LibVEX_Alloc_inline(h->size * sizeof(Bool));
   h->key    = LibVEX_Alloc_inline(h->size * sizeof(HWord));
   h->val    = LibVEX_Alloc_inline(h->size * sizeof(HWord));
   return h;
}

/* Look up key in the map. */

static Bool lookupHHW ( HashHW* h, /*OUT*/HWord* val, HWord key )
{
   Int i;
   /* vex_printf("lookupHHW(%llx)\n", key ); */
   for (i = 0; i < h->used; i++) {
      if (h->inuse[i] && h->key[i] == key) {
         if (val)
            *val = h->val[i];
         return True;
      }
   }
   return False;
}

/* Add key->val to the map.  Replaces any existing binding for key. */

static void addToHHW ( HashHW* h, HWord key, HWord val )
{
   Int i, j;
   /* vex_printf("addToHHW(%llx, %llx)\n", key, val); */

   /* Find and replace existing binding, if any. */
   for (i = 0; i < h->used; i++) {
      if (h->inuse[i] && h->key[i] == key) {
         h->val[i] = val;
         return;
      }
   }

   /* Ensure a space is available. */
   if (h->used == h->size) {
      /* Copy into arrays twice the size. */
      Bool*  inuse2 = LibVEX_Alloc_inline(2 * h->size * sizeof(Bool));
      HWord* key2   = LibVEX_Alloc_inline(2 * h->size * sizeof(HWord));
      HWord* val2   = LibVEX_Alloc_inline(2 * h->size * sizeof(HWord));
      for (i = j = 0; i < h->size; i++) {
         if (!h->inuse[i]) continue;
         inuse2[j] = True;
         key2[j] = h->key[i];
         val2[j] = h->val[i];
         j++;
      }
      h->used = j;
      h->size *= 2;
      h->inuse = inuse2;
      h->key   = key2;
      h->val   = val2;
   }

   /* Finally, add it. */
   vassert(h->used < h->size);
   h->inuse[h->used] = True;
   h->key[h->used]   = key;
   h->val[h->used]   = val;
   h->used++;
}
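
/* Illustrative sketch (not part of the original file): how the three
   HashHW operations above compose.  The map is a simple linear-scan
   association list; addToHHW overwrites an existing binding in place,
   and lookupHHW tolerates a NULL 'val' out-parameter. */
#if 0
static void example_HashHW_usage ( void )
{
   HashHW* env = newHHW();
   HWord   v   = 0;
   addToHHW(env, (HWord)0x10, (HWord)0xAA);     /* bind 0x10 -> 0xAA */
   addToHHW(env, (HWord)0x10, (HWord)0xBB);     /* rebinding replaces it */
   if (lookupHHW(env, &v, (HWord)0x10))
      vassert(v == 0xBB);
   vassert(!lookupHHW(env, NULL, (HWord)0x20)); /* absent key */
}
#endif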


/*---------------------------------------------------------------*/
/*--- Flattening out a BB into atomic SSA form                ---*/
/*---------------------------------------------------------------*/

/* Non-critical helper, heuristic for reducing the number of tmp-tmp
   copies made by flattening.  If in doubt return False. */

static Bool isFlat ( IRExpr* e )
{
   if (e->tag == Iex_Get)
      return True;
   if (e->tag == Iex_Binop)
      return toBool( isIRAtom(e->Iex.Binop.arg1)
                     && isIRAtom(e->Iex.Binop.arg2) );
   if (e->tag == Iex_Load)
      return isIRAtom(e->Iex.Load.addr);
   return False;
}

/* Flatten out 'ex' so it is atomic, returning a new expression with
   the same value, after having appended extra IRTemp assignments to
   the end of 'bb'. */

static IRExpr* flatten_Expr ( IRSB* bb, IRExpr* ex )
{
   Int i;
   IRExpr** newargs;
   IRType ty = typeOfIRExpr(bb->tyenv, ex);
   IRTemp t1;

   switch (ex->tag) {

      case Iex_GetI:
         t1 = newIRTemp(bb->tyenv, ty);
         addStmtToIRSB(bb, IRStmt_WrTmp(t1,
            IRExpr_GetI(ex->Iex.GetI.descr,
                        flatten_Expr(bb, ex->Iex.GetI.ix),
                        ex->Iex.GetI.bias)));
         return IRExpr_RdTmp(t1);

      case Iex_Get:
         t1 = newIRTemp(bb->tyenv, ty);
         addStmtToIRSB(bb,
            IRStmt_WrTmp(t1, ex));
         return IRExpr_RdTmp(t1);

      case Iex_Qop: {
         IRQop* qop = ex->Iex.Qop.details;
         t1 = newIRTemp(bb->tyenv, ty);
         addStmtToIRSB(bb, IRStmt_WrTmp(t1,
            IRExpr_Qop(qop->op,
                       flatten_Expr(bb, qop->arg1),
                       flatten_Expr(bb, qop->arg2),
                       flatten_Expr(bb, qop->arg3),
                       flatten_Expr(bb, qop->arg4))));
         return IRExpr_RdTmp(t1);
      }

      case Iex_Triop: {
         IRTriop* triop = ex->Iex.Triop.details;
         t1 = newIRTemp(bb->tyenv, ty);
         addStmtToIRSB(bb, IRStmt_WrTmp(t1,
            IRExpr_Triop(triop->op,
                         flatten_Expr(bb, triop->arg1),
                         flatten_Expr(bb, triop->arg2),
                         flatten_Expr(bb, triop->arg3))));
         return IRExpr_RdTmp(t1);
      }

      case Iex_Binop:
         t1 = newIRTemp(bb->tyenv, ty);
         addStmtToIRSB(bb, IRStmt_WrTmp(t1,
            IRExpr_Binop(ex->Iex.Binop.op,
                         flatten_Expr(bb, ex->Iex.Binop.arg1),
                         flatten_Expr(bb, ex->Iex.Binop.arg2))));
         return IRExpr_RdTmp(t1);

      case Iex_Unop:
         t1 = newIRTemp(bb->tyenv, ty);
         addStmtToIRSB(bb, IRStmt_WrTmp(t1,
            IRExpr_Unop(ex->Iex.Unop.op,
                        flatten_Expr(bb, ex->Iex.Unop.arg))));
         return IRExpr_RdTmp(t1);

      case Iex_Load:
         t1 = newIRTemp(bb->tyenv, ty);
         addStmtToIRSB(bb, IRStmt_WrTmp(t1,
            IRExpr_Load(ex->Iex.Load.end,
                        ex->Iex.Load.ty,
                        flatten_Expr(bb, ex->Iex.Load.addr))));
         return IRExpr_RdTmp(t1);

      case Iex_CCall:
         newargs = shallowCopyIRExprVec(ex->Iex.CCall.args);
         for (i = 0; newargs[i]; i++)
            newargs[i] = flatten_Expr(bb, newargs[i]);
         t1 = newIRTemp(bb->tyenv, ty);
         addStmtToIRSB(bb, IRStmt_WrTmp(t1,
            IRExpr_CCall(ex->Iex.CCall.cee,
                         ex->Iex.CCall.retty,
                         newargs)));
         return IRExpr_RdTmp(t1);

      case Iex_ITE:
         t1 = newIRTemp(bb->tyenv, ty);
         addStmtToIRSB(bb, IRStmt_WrTmp(t1,
            IRExpr_ITE(flatten_Expr(bb, ex->Iex.ITE.cond),
                       flatten_Expr(bb, ex->Iex.ITE.iftrue),
                       flatten_Expr(bb, ex->Iex.ITE.iffalse))));
         return IRExpr_RdTmp(t1);

      case Iex_Const:
         /* Lift F64i constants out onto temps so they can be CSEd
            later. */
         if (ex->Iex.Const.con->tag == Ico_F64i) {
            t1 = newIRTemp(bb->tyenv, ty);
            addStmtToIRSB(bb, IRStmt_WrTmp(t1,
               IRExpr_Const(ex->Iex.Const.con)));
            return IRExpr_RdTmp(t1);
         } else {
            /* Leave all other constants alone. */
            return ex;
         }

      case Iex_RdTmp:
         return ex;

      default:
         vex_printf("\n");
         ppIRExpr(ex);
         vex_printf("\n");
         vpanic("flatten_Expr");
   }
}

/* Append a completely flattened form of 'st' to the end of 'bb'. */

static void flatten_Stmt ( IRSB* bb, IRStmt* st )
{
   Int i;
   IRExpr   *e1, *e2, *e3, *e4, *e5;
   IRDirty  *d, *d2;
   IRCAS    *cas, *cas2;
   IRPutI   *puti, *puti2;
   IRLoadG  *lg;
   IRStoreG *sg;
   switch (st->tag) {
      case Ist_Put:
         if (isIRAtom(st->Ist.Put.data)) {
            /* optimisation to reduce the amount of heap wasted
               by the flattener */
            addStmtToIRSB(bb, st);
         } else {
            /* general case, always correct */
            e1 = flatten_Expr(bb, st->Ist.Put.data);
            addStmtToIRSB(bb, IRStmt_Put(st->Ist.Put.offset, e1));
         }
         break;
      case Ist_PutI:
         puti = st->Ist.PutI.details;
         e1 = flatten_Expr(bb, puti->ix);
         e2 = flatten_Expr(bb, puti->data);
         puti2 = mkIRPutI(puti->descr, e1, puti->bias, e2);
         addStmtToIRSB(bb, IRStmt_PutI(puti2));
         break;
      case Ist_WrTmp:
         if (isFlat(st->Ist.WrTmp.data)) {
            /* optimisation, to reduce the number of tmp-tmp
               copies generated */
            addStmtToIRSB(bb, st);
         } else {
            /* general case, always correct */
            e1 = flatten_Expr(bb, st->Ist.WrTmp.data);
            addStmtToIRSB(bb, IRStmt_WrTmp(st->Ist.WrTmp.tmp, e1));
         }
         break;
      case Ist_Store:
         e1 = flatten_Expr(bb, st->Ist.Store.addr);
         e2 = flatten_Expr(bb, st->Ist.Store.data);
         addStmtToIRSB(bb, IRStmt_Store(st->Ist.Store.end, e1,e2));
         break;
      case Ist_StoreG:
         sg = st->Ist.StoreG.details;
         e1 = flatten_Expr(bb, sg->addr);
         e2 = flatten_Expr(bb, sg->data);
         e3 = flatten_Expr(bb, sg->guard);
         addStmtToIRSB(bb, IRStmt_StoreG(sg->end, e1, e2, e3));
         break;
      case Ist_LoadG:
         lg = st->Ist.LoadG.details;
         e1 = flatten_Expr(bb, lg->addr);
         e2 = flatten_Expr(bb, lg->alt);
         e3 = flatten_Expr(bb, lg->guard);
         addStmtToIRSB(bb, IRStmt_LoadG(lg->end, lg->cvt, lg->dst,
                                        e1, e2, e3));
         break;
      case Ist_CAS:
         cas  = st->Ist.CAS.details;
         e1   = flatten_Expr(bb, cas->addr);
         e2   = cas->expdHi ? flatten_Expr(bb, cas->expdHi) : NULL;
         e3   = flatten_Expr(bb, cas->expdLo);
         e4   = cas->dataHi ? flatten_Expr(bb, cas->dataHi) : NULL;
         e5   = flatten_Expr(bb, cas->dataLo);
         cas2 = mkIRCAS( cas->oldHi, cas->oldLo, cas->end,
                         e1, e2, e3, e4, e5 );
         addStmtToIRSB(bb, IRStmt_CAS(cas2));
         break;
      case Ist_LLSC:
         e1 = flatten_Expr(bb, st->Ist.LLSC.addr);
         e2 = st->Ist.LLSC.storedata
                 ? flatten_Expr(bb, st->Ist.LLSC.storedata)
                 : NULL;
         addStmtToIRSB(bb, IRStmt_LLSC(st->Ist.LLSC.end,
                                       st->Ist.LLSC.result, e1, e2));
         break;
      case Ist_Dirty:
         d = st->Ist.Dirty.details;
         d2 = emptyIRDirty();
         *d2 = *d;
         d2->args = shallowCopyIRExprVec(d2->args);
         if (d2->mFx != Ifx_None) {
            d2->mAddr = flatten_Expr(bb, d2->mAddr);
         } else {
            vassert(d2->mAddr == NULL);
         }
         d2->guard = flatten_Expr(bb, d2->guard);
         for (i = 0; d2->args[i]; i++) {
            IRExpr* arg = d2->args[i];
            if (LIKELY(!is_IRExpr_VECRET_or_GSPTR(arg)))
               d2->args[i] = flatten_Expr(bb, arg);
         }
         addStmtToIRSB(bb, IRStmt_Dirty(d2));
         break;
      case Ist_NoOp:
      case Ist_MBE:
      case Ist_IMark:
         addStmtToIRSB(bb, st);
         break;
      case Ist_AbiHint:
         e1 = flatten_Expr(bb, st->Ist.AbiHint.base);
         e2 = flatten_Expr(bb, st->Ist.AbiHint.nia);
         addStmtToIRSB(bb, IRStmt_AbiHint(e1, st->Ist.AbiHint.len, e2));
         break;
      case Ist_Exit:
         e1 = flatten_Expr(bb, st->Ist.Exit.guard);
         addStmtToIRSB(bb, IRStmt_Exit(e1, st->Ist.Exit.jk,
                                       st->Ist.Exit.dst,
                                       st->Ist.Exit.offsIP));
         break;
      default:
         vex_printf("\n");
         ppIRStmt(st);
         vex_printf("\n");
         vpanic("flatten_Stmt");
   }
}

static IRSB* flatten_BB ( IRSB* in )
{
   Int   i;
   IRSB* out;
   out = emptyIRSB();
   out->tyenv = deepCopyIRTypeEnv( in->tyenv );
   for (i = 0; i < in->stmts_used; i++)
      if (in->stmts[i])
         flatten_Stmt( out, in->stmts[i] );
   out->next     = flatten_Expr( out, in->next );
   out->jumpkind = in->jumpkind;
   out->offsIP   = in->offsIP;
   return out;
}
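
/* Illustrative sketch (not part of the original file): the effect of
   flattening on one nested statement.  A statement such as

      PUT(0) = Add32(GET:I32(8), Shl32(GET:I32(12), 0x1:I8))

   is rewritten by flatten_Stmt/flatten_Expr into a sequence whose
   operands are all atoms (RdTmp or Const), roughly:

      t1 = GET:I32(8)
      t2 = GET:I32(12)
      t3 = Shl32(t2, 0x1:I8)
      t4 = Add32(t1, t3)
      PUT(0) = t4

   so that every subsequent pass can rely on the flat invariant. */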


/*---------------------------------------------------------------*/
/*--- In-place removal of redundant GETs                      ---*/
/*---------------------------------------------------------------*/

/* Scan forwards, building up an environment binding (min offset, max
   offset) pairs to values, which will either be temps or constants.

   On seeing 't = Get(minoff,maxoff)', look up (minoff,maxoff) in the
   env and if it matches, replace the Get with the stored value.  If
   there is no match, add a (minoff,maxoff) :-> t binding.

   On seeing 'Put (minoff,maxoff) = t or c', first remove in the env
   any binding which fully or partially overlaps with (minoff,maxoff).
   Then add a new (minoff,maxoff) :-> t or c binding. */

/* Extract the min/max offsets from a guest state array descriptor. */

inline
static void getArrayBounds ( IRRegArray* descr,
                             UInt* minoff, UInt* maxoff )
{
   *minoff = descr->base;
   *maxoff = *minoff + descr->nElems*sizeofIRType(descr->elemTy) - 1;
   vassert((*minoff & ~0xFFFF) == 0);
   vassert((*maxoff & ~0xFFFF) == 0);
   vassert(*minoff <= *maxoff);
}

/* Create keys, of the form ((minoffset << 16) | maxoffset). */

static UInt mk_key_GetPut ( Int offset, IRType ty )
{
   /* offset should fit in 16 bits. */
   UInt minoff = offset;
   UInt maxoff = minoff + sizeofIRType(ty) - 1;
   vassert((minoff & ~0xFFFF) == 0);
   vassert((maxoff & ~0xFFFF) == 0);
   return (minoff << 16) | maxoff;
}

static UInt mk_key_GetIPutI ( IRRegArray* descr )
{
   UInt minoff, maxoff;
   getArrayBounds( descr, &minoff, &maxoff );
   vassert((minoff & ~0xFFFF) == 0);
   vassert((maxoff & ~0xFFFF) == 0);
   return (minoff << 16) | maxoff;
}
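
/* Worked example (illustrative, not part of the original file): a Put
   of an I32 at guest offset 16 covers bytes 16..19, so
   mk_key_GetPut(16, Ity_I32) == (16 << 16) | 19 == 0x00100013.  A Get
   of an I64 at offset 18 covers bytes 18..25, giving key 0x00120019.
   The two ranges overlap, which invalidateOverlaps() below detects by
   unpacking and comparing the 16-bit halves of the keys. */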

/* Supposing h has keys of the form generated by mk_key_GetPut and
   mk_key_GetIPutI, invalidate any key which overlaps (k_lo
   .. k_hi). */

static void invalidateOverlaps ( HashHW* h, UInt k_lo, UInt k_hi )
{
   Int  j;
   UInt e_lo, e_hi;
   vassert(k_lo <= k_hi);
   /* invalidate any env entries which in any way overlap (k_lo
      .. k_hi) */
   /* vex_printf("invalidate %d .. %d\n", k_lo, k_hi ); */

   for (j = 0; j < h->used; j++) {
      if (!h->inuse[j])
         continue;
      e_lo = (((UInt)h->key[j]) >> 16) & 0xFFFF;
      e_hi = ((UInt)h->key[j]) & 0xFFFF;
      vassert(e_lo <= e_hi);
      if (e_hi < k_lo || k_hi < e_lo)
         continue; /* no overlap possible */
      else
         /* overlap; invalidate */
         h->inuse[j] = False;
   }
}

static void redundant_get_removal_BB ( IRSB* bb )
{
   HashHW* env = newHHW();
   UInt    key = 0; /* keep gcc -O happy */
   Int     i, j;
   HWord   val;

   for (i = 0; i < bb->stmts_used; i++) {
      IRStmt* st = bb->stmts[i];

      if (st->tag == Ist_NoOp)
         continue;

      /* Deal with Gets */
      if (st->tag == Ist_WrTmp
          && st->Ist.WrTmp.data->tag == Iex_Get) {
         /* st is 't = Get(...)'.  Look up in the environment and see
            if the Get can be replaced. */
         IRExpr* get = st->Ist.WrTmp.data;
         key = (HWord)mk_key_GetPut( get->Iex.Get.offset,
                                     get->Iex.Get.ty );
         if (lookupHHW(env, &val, (HWord)key)) {
            /* found it */
            /* Note, we could do better here.  If the types are
               different we don't do the substitution, since doing so
               could lead to invalidly-typed IR.  An improvement would
               be to stick in a reinterpret-style cast, although that
               would make maintaining flatness more difficult. */
            IRExpr* valE    = (IRExpr*)val;
            Bool    typesOK = toBool( typeOfIRExpr(bb->tyenv,valE)
                                      == st->Ist.WrTmp.data->Iex.Get.ty );
            if (typesOK && DEBUG_IROPT) {
               vex_printf("rGET: "); ppIRExpr(get);
               vex_printf("  ->  "); ppIRExpr(valE);
               vex_printf("\n");
            }
            if (typesOK)
               bb->stmts[i] = IRStmt_WrTmp(st->Ist.WrTmp.tmp, valE);
         } else {
            /* Not found, but at least we know that t and the Get(...)
               are now associated.  So add a binding to reflect that
               fact. */
            addToHHW( env, (HWord)key,
                           (HWord)(void*)(IRExpr_RdTmp(st->Ist.WrTmp.tmp)) );
         }
      }

      /* Deal with Puts: invalidate any env entries overlapped by this
         Put */
      if (st->tag == Ist_Put || st->tag == Ist_PutI) {
         UInt k_lo, k_hi;
         if (st->tag == Ist_Put) {
            key = mk_key_GetPut( st->Ist.Put.offset,
                                 typeOfIRExpr(bb->tyenv,st->Ist.Put.data) );
         } else {
            vassert(st->tag == Ist_PutI);
            key = mk_key_GetIPutI( st->Ist.PutI.details->descr );
         }

         k_lo = (key >> 16) & 0xFFFF;
         k_hi = key & 0xFFFF;
         invalidateOverlaps(env, k_lo, k_hi);
      }
      else
      if (st->tag == Ist_Dirty) {
         /* Deal with dirty helpers which write or modify guest state.
            Invalidate the entire env.  We could do a lot better
            here. */
         IRDirty* d      = st->Ist.Dirty.details;
         Bool     writes = False;
         for (j = 0; j < d->nFxState; j++) {
            if (d->fxState[j].fx == Ifx_Modify
                || d->fxState[j].fx == Ifx_Write)
               writes = True;
         }
         if (writes) {
            /* dump the entire env (not clever, but correct ...) */
            for (j = 0; j < env->used; j++)
               env->inuse[j] = False;
            if (0) vex_printf("rGET: trash env due to dirty helper\n");
         }
      }

      /* add this one to the env, if appropriate */
      if (st->tag == Ist_Put) {
         vassert(isIRAtom(st->Ist.Put.data));
         addToHHW( env, (HWord)key, (HWord)(st->Ist.Put.data));
      }

   } /* for (i = 0; i < bb->stmts_used; i++) */

}
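
/* Illustrative sketch (not part of the original file): the effect of
   redundant_get_removal_BB on a flat block.  Two reads of the same
   guest slice with no intervening overlapping write share one value:

      t1 = GET:I32(8)            t1 = GET:I32(8)
      t2 = GET:I32(8)     ==>    t2 = t1
      PUT(10) = t3               PUT(10) = t3
      t4 = GET:I32(8)            t4 = GET:I32(8)

   The Put at offsets 10..13 partially overlaps 8..11, so it invalidates
   the binding and the last Get is kept.  A Put of an I32 at exactly
   offset 8 would instead let t4 pick up t3 directly. */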
706 /*---------------------------------------------------------------*/
707 /*--- In-place removal of redundant PUTs ---*/
708 /*---------------------------------------------------------------*/
710 /* Find any Get uses in st and invalidate any partially or fully
711 overlapping ranges listed in env. Due to the flattening phase, the
712 only stmt kind we expect to find a Get on is IRStmt_WrTmp. */
714 static void handle_gets_Stmt (
715 HashHW* env,
716 IRStmt* st,
717 Bool (*preciseMemExnsFn)(Int,Int,VexRegisterUpdates),
718 VexRegisterUpdates pxControl
721 Int j;
722 UInt key = 0; /* keep gcc -O happy */
723 Bool isGet;
724 Bool memRW = False;
725 IRExpr* e;
727 switch (st->tag) {
729 /* This is the only interesting case. Deal with Gets in the RHS
730 expression. */
731 case Ist_WrTmp:
732 e = st->Ist.WrTmp.data;
733 switch (e->tag) {
734 case Iex_Get:
735 isGet = True;
736 key = mk_key_GetPut ( e->Iex.Get.offset, e->Iex.Get.ty );
737 break;
738 case Iex_GetI:
739 isGet = True;
740 key = mk_key_GetIPutI ( e->Iex.GetI.descr );
741 break;
742 case Iex_Load:
743 isGet = False;
744 memRW = True;
745 break;
746 default:
747 isGet = False;
749 if (isGet) {
750 UInt k_lo, k_hi;
751 k_lo = (key >> 16) & 0xFFFF;
752 k_hi = key & 0xFFFF;
753 invalidateOverlaps(env, k_lo, k_hi);
755 break;
757 /* Be very conservative for dirty helper calls; dump the entire
758 environment. The helper might read guest state, in which
759 case it needs to be flushed first. Also, the helper might
760 access guest memory, in which case all parts of the guest
761 state requiring precise exceptions needs to be flushed. The
762 crude solution is just to flush everything; we could easily
763 enough do a lot better if needed. */
764 /* Probably also overly-conservative, but also dump everything
765 if we hit a memory bus event (fence, lock, unlock). Ditto
766 AbiHints, CASs, LLs and SCs. */
767 case Ist_AbiHint:
768 vassert(isIRAtom(st->Ist.AbiHint.base));
769 vassert(isIRAtom(st->Ist.AbiHint.nia));
770 /* fall through */
771 case Ist_MBE:
772 case Ist_Dirty:
773 case Ist_CAS:
774 case Ist_LLSC:
775 for (j = 0; j < env->used; j++)
776 env->inuse[j] = False;
777 break;
779 /* all other cases are boring. */
780 case Ist_Store:
781 vassert(isIRAtom(st->Ist.Store.addr));
782 vassert(isIRAtom(st->Ist.Store.data));
783 memRW = True;
784 break;
785 case Ist_StoreG: {
786 IRStoreG* sg = st->Ist.StoreG.details;
787 vassert(isIRAtom(sg->addr));
788 vassert(isIRAtom(sg->data));
789 vassert(isIRAtom(sg->guard));
790 memRW = True;
791 break;
793 case Ist_LoadG: {
794 IRLoadG* lg = st->Ist.LoadG.details;
795 vassert(isIRAtom(lg->addr));
796 vassert(isIRAtom(lg->alt));
797 vassert(isIRAtom(lg->guard));
798 memRW = True;
799 break;
801 case Ist_Exit:
802 vassert(isIRAtom(st->Ist.Exit.guard));
803 break;
805 case Ist_Put:
806 vassert(isIRAtom(st->Ist.Put.data));
807 break;
809 case Ist_PutI:
810 vassert(isIRAtom(st->Ist.PutI.details->ix));
811 vassert(isIRAtom(st->Ist.PutI.details->data));
812 break;
814 case Ist_NoOp:
815 case Ist_IMark:
816 break;
818 default:
819 vex_printf("\n");
820 ppIRStmt(st);
821 vex_printf("\n");
822 vpanic("handle_gets_Stmt");
825 if (memRW) {
826 /* This statement accesses memory. So we might need to dump all parts
827 of the environment corresponding to guest state that may not
828 be reordered with respect to memory references. That means
829 at least the stack pointer. */
830 switch (pxControl) {
831 case VexRegUpdAllregsAtMemAccess:
832 /* Precise exceptions required at mem access.
833 Flush all guest state. */
834 for (j = 0; j < env->used; j++)
835 env->inuse[j] = False;
836 break;
837 case VexRegUpdSpAtMemAccess:
838 /* We need to dump the stack pointer
839 (needed for stack extension in m_signals.c).
840 preciseMemExnsFn will use vex_control.iropt_register_updates
841 to verify only the sp is to be checked. */
842 /* fallthrough */
843 case VexRegUpdUnwindregsAtMemAccess:
844 for (j = 0; j < env->used; j++) {
845 if (!env->inuse[j])
846 continue;
847 /* Just flush the minimal amount required, as computed by
848 preciseMemExnsFn. */
849 HWord k_lo = (env->key[j] >> 16) & 0xFFFF;
850 HWord k_hi = env->key[j] & 0xFFFF;
851 if (preciseMemExnsFn( k_lo, k_hi, pxControl ))
852 env->inuse[j] = False;
854 break;
855 case VexRegUpdAllregsAtEachInsn:
856 // VexRegUpdAllregsAtEachInsn cannot happen here.
857 // fall through
858 case VexRegUpd_INVALID:
859 default:
860 vassert(0);
862 } /* if (memRW) */

/* Scan backwards, building up a set of (min offset, max
   offset) pairs, indicating those parts of the guest state
   for which the next event is a write.

   On seeing a conditional exit, empty the set.

   On seeing 'Put (minoff,maxoff) = t or c', if (minoff,maxoff) is
   completely within the set, remove the Put.  Otherwise, add
   (minoff,maxoff) to the set.

   On seeing 'Get (minoff,maxoff)', remove any part of the set
   overlapping (minoff,maxoff).  The same has to happen for any events
   which implicitly read parts of the guest state: dirty helper calls
   and loads/stores.
*/
883 static void redundant_put_removal_BB (
884 IRSB* bb,
885 Bool (*preciseMemExnsFn)(Int,Int,VexRegisterUpdates),
886 VexRegisterUpdates pxControl
889 Int i, j;
890 Bool isPut;
891 IRStmt* st;
892 UInt key = 0; /* keep gcc -O happy */
894 // vassert(pxControl < VexRegUpdAllregsAtEachInsn);
896 HashHW* env = newHHW();
898 /* Initialise the running env with the fact that the final exit
899 writes the IP (or, whatever it claims to write. We don't
900 care.) */
901 key = mk_key_GetPut(bb->offsIP, typeOfIRExpr(bb->tyenv, bb->next));
902 addToHHW(env, (HWord)key, 0);
904 /* And now scan backwards through the statements. */
905 for (i = bb->stmts_used-1; i >= 0; i--) {
906 st = bb->stmts[i];
908 if (st->tag == Ist_NoOp)
909 continue;
911 /* Deal with conditional exits. */
912 if (st->tag == Ist_Exit) {
913 //Bool re_add;
914 /* Need to throw out from the env, any part of it which
915 doesn't overlap with the guest state written by this exit.
916 Since the exit only writes one section, it's simplest to
917 do this: (1) check whether env contains a write that
918 completely overlaps the write done by this exit; (2) empty
919 out env; and (3) if (1) was true, add the write done by
920 this exit.
922 To make (1) a bit simpler, merely search for a write that
923 exactly matches the one done by this exit. That's safe
924 because it will fail as often or more often than a full
925 overlap check, and failure to find an overlapping write in
926 env is the safe case (we just nuke env if that
927 happens). */
928 //vassert(isIRAtom(st->Ist.Exit.guard));
929 /* (1) */
930 //key = mk_key_GetPut(st->Ist.Exit.offsIP,
931 // typeOfIRConst(st->Ist.Exit.dst));
932 //re_add = lookupHHW(env, NULL, key);
933 /* (2) */
934 for (j = 0; j < env->used; j++)
935 env->inuse[j] = False;
936 /* (3) */
937 //if (0 && re_add)
938 // addToHHW(env, (HWord)key, 0);
939 continue;
942 /* Deal with Puts */
943 switch (st->tag) {
944 case Ist_Put:
945 isPut = True;
946 key = mk_key_GetPut( st->Ist.Put.offset,
947 typeOfIRExpr(bb->tyenv,st->Ist.Put.data) );
948 vassert(isIRAtom(st->Ist.Put.data));
949 break;
950 case Ist_PutI:
951 isPut = True;
952 key = mk_key_GetIPutI( st->Ist.PutI.details->descr );
953 vassert(isIRAtom(st->Ist.PutI.details->ix));
954 vassert(isIRAtom(st->Ist.PutI.details->data));
955 break;
956 default:
957 isPut = False;
959 if (isPut && st->tag != Ist_PutI) {
960 /* See if any single entry in env overlaps this Put. This is
961 simplistic in that the transformation is valid if, say, two
962 or more entries in the env overlap this Put, but the use of
963 lookupHHW will only find a single entry which exactly
964 overlaps this Put. This is suboptimal but safe. */
965 if (lookupHHW(env, NULL, (HWord)key)) {
966 /* This Put is redundant because a later one will overwrite
967 it. So NULL (nop) it out. */
968 if (DEBUG_IROPT) {
969 vex_printf("rPUT: "); ppIRStmt(st);
970 vex_printf("\n");
972 bb->stmts[i] = IRStmt_NoOp();
973 } else {
974 /* We can't demonstrate that this Put is redundant, so add it
975 to the running collection. */
976 addToHHW(env, (HWord)key, 0);
978 continue;
981 /* Deal with Gets. These remove bits of the environment since
982 appearance of a Get means that the next event for that slice
983 of the guest state is no longer a write, but a read. Also
984 deals with implicit reads of guest state needed to maintain
985 precise exceptions. */
986 handle_gets_Stmt( env, st, preciseMemExnsFn, pxControl );
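
/* Illustrative sketch (not part of the original file): the effect of
   redundant_put_removal_BB.  Scanning backwards, a Put whose exact
   (minoff,maxoff) range is written again later, with no intervening
   read, conditional exit, or memory access that requires it to be up
   to date, is turned into a no-op:

      PUT(0) = t1         ==>    IR-NoOp
      t2 = Add32(t1,t1)          t2 = Add32(t1,t1)
      PUT(0) = t2                PUT(0) = t2

   An intervening GET:I32(0), exit, or (depending on pxControl) a
   load/store covering a precise-exception slice would keep the first
   Put alive. */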
991 /*---------------------------------------------------------------*/
992 /*--- Constant propagation and folding ---*/
993 /*---------------------------------------------------------------*/
995 #if STATS_IROPT
996 /* How often sameIRExprs was invoked */
997 static UInt invocation_count;
998 /* How often sameIRExprs recursed through IRTemp assignments */
999 static UInt recursion_count;
1000 /* How often sameIRExprs found identical IRExprs */
1001 static UInt success_count;
1002 /* How often recursing through assignments to IRTemps helped
1003 establishing equality. */
1004 static UInt recursion_success_count;
1005 /* Whether or not recursing through an IRTemp assignment helped
1006 establishing IRExpr equality for a given sameIRExprs invocation. */
1007 static Bool recursion_helped;
1008 /* Whether or not a given sameIRExprs invocation recursed through an
1009 IRTemp assignment */
1010 static Bool recursed;
1011 /* Maximum number of nodes ever visited when comparing two IRExprs. */
1012 static UInt max_nodes_visited;
1013 #endif /* STATS_IROPT */
1015 /* Count the number of nodes visited for a given sameIRExprs invocation. */
1016 static UInt num_nodes_visited;
1018 /* Do not visit more than NODE_LIMIT nodes when comparing two IRExprs.
1019 This is to guard against performance degradation by visiting large
1020 trees without success. */
1021 #define NODE_LIMIT 30

/* The env in this section is a map from IRTemp to IRExpr*,
   that is, an array indexed by IRTemp. */

/* Do both expressions compute the same value? The answer is generally
   conservative, i.e. it will report that the expressions do not compute
   the same value when in fact they do. The reason is that we do not
   keep track of changes in the guest state and memory. Thusly, two
   Get's, GetI's or Load's, even when accessing the same location, will be
   assumed to compute different values. After all the accesses may happen
   at different times and the guest state / memory can have changed in
   the meantime.

   XXX IMPORTANT XXX the two expressions must have the same IR type.
   DO NOT CALL HERE WITH DIFFERENTLY-TYPED EXPRESSIONS. */

/* JRS 20-Mar-2012: split sameIRExprs_aux into a fast inlineable
   wrapper that deals with the common tags-don't-match case, and a
   slower out of line general case.  Saves a few insns. */
1043 __attribute__((noinline))
1044 static Bool sameIRExprs_aux2 ( IRExpr** env, IRExpr* e1, IRExpr* e2 );
1046 inline
1047 static Bool sameIRExprs_aux ( IRExpr** env, IRExpr* e1, IRExpr* e2 )
1049 if (e1->tag != e2->tag) return False;
1050 return sameIRExprs_aux2(env, e1, e2);
1053 __attribute__((noinline))
1054 static Bool sameIRExprs_aux2 ( IRExpr** env, IRExpr* e1, IRExpr* e2 )
1056 if (num_nodes_visited++ > NODE_LIMIT) return False;
1058 switch (e1->tag) {
1059 case Iex_RdTmp:
1060 if (e1->Iex.RdTmp.tmp == e2->Iex.RdTmp.tmp) return True;
1062 if (env[e1->Iex.RdTmp.tmp] && env[e2->Iex.RdTmp.tmp]) {
1063 Bool same = sameIRExprs_aux(env, env[e1->Iex.RdTmp.tmp],
1064 env[e2->Iex.RdTmp.tmp]);
1065 #if STATS_IROPT
1066 recursed = True;
1067 if (same) recursion_helped = True;
1068 #endif
1069 return same;
1071 return False;
1073 case Iex_Get:
1074 case Iex_GetI:
1075 case Iex_Load:
1076 /* Guest state / memory could have changed in the meantime. */
1077 return False;
1079 case Iex_Binop:
1080 return toBool( e1->Iex.Binop.op == e2->Iex.Binop.op
1081 && sameIRExprs_aux( env, e1->Iex.Binop.arg1,
1082 e2->Iex.Binop.arg1 )
1083 && sameIRExprs_aux( env, e1->Iex.Binop.arg2,
1084 e2->Iex.Binop.arg2 ));
1086 case Iex_Unop:
1087 return toBool( e1->Iex.Unop.op == e2->Iex.Unop.op
1088 && sameIRExprs_aux( env, e1->Iex.Unop.arg,
1089 e2->Iex.Unop.arg ));
1091 case Iex_Const: {
1092 IRConst *c1 = e1->Iex.Const.con;
1093 IRConst *c2 = e2->Iex.Const.con;
1094 vassert(c1->tag == c2->tag);
1095 switch (c1->tag) {
1096 case Ico_U1: return toBool( c1->Ico.U1 == c2->Ico.U1 );
1097 case Ico_U8: return toBool( c1->Ico.U8 == c2->Ico.U8 );
1098 case Ico_U16: return toBool( c1->Ico.U16 == c2->Ico.U16 );
1099 case Ico_U32: return toBool( c1->Ico.U32 == c2->Ico.U32 );
1100 case Ico_U64: return toBool( c1->Ico.U64 == c2->Ico.U64 );
1101 default: break;
1103 return False;
1106 case Iex_Triop: {
1107 IRTriop *tri1 = e1->Iex.Triop.details;
1108 IRTriop *tri2 = e2->Iex.Triop.details;
1109 return toBool( tri1->op == tri2->op
1110 && sameIRExprs_aux( env, tri1->arg1, tri2->arg1 )
1111 && sameIRExprs_aux( env, tri1->arg2, tri2->arg2 )
1112 && sameIRExprs_aux( env, tri1->arg3, tri2->arg3 ));
1115 case Iex_ITE:
1116 return toBool( sameIRExprs_aux( env, e1->Iex.ITE.cond,
1117 e2->Iex.ITE.cond )
1118 && sameIRExprs_aux( env, e1->Iex.ITE.iftrue,
1119 e2->Iex.ITE.iftrue )
1120 && sameIRExprs_aux( env, e1->Iex.ITE.iffalse,
1121 e2->Iex.ITE.iffalse ));
1123 default:
1124 /* Not very likely to be "same". */
1125 break;
1128 return False;
1131 inline
1132 static Bool sameIRExprs ( IRExpr** env, IRExpr* e1, IRExpr* e2 )
1134 Bool same;
1136 num_nodes_visited = 0;
1137 same = sameIRExprs_aux(env, e1, e2);
1139 #if STATS_IROPT
1140 ++invocation_count;
1141 if (recursed) ++recursion_count;
1142 success_count += same;
1143 if (same && recursion_helped)
1144 ++recursion_success_count;
1145 if (num_nodes_visited > max_nodes_visited)
1146 max_nodes_visited = num_nodes_visited;
1147 recursed = False; /* reset */
1148 recursion_helped = False; /* reset */
1149 #endif /* STATS_IROPT */
1151 return same;
1155 /* Debugging-only hack (not used in production runs): make a guess
1156 whether sameIRExprs might assert due to the two args being of
1157 different types. If in doubt return False. Is only used when
1158 --vex-iropt-level > 0, that is, vex_control.iropt_verbosity > 0.
1159 Bad because it duplicates functionality from typeOfIRExpr. See
1160 comment on the single use point below for rationale. */
1161 static
1162 Bool debug_only_hack_sameIRExprs_might_assert ( IRExpr* e1, IRExpr* e2 )
1164 if (e1->tag != e2->tag) return False;
1165 switch (e1->tag) {
1166 case Iex_Const: {
1167 /* The only interesting case */
1168 IRConst *c1 = e1->Iex.Const.con;
1169 IRConst *c2 = e2->Iex.Const.con;
1170 return c1->tag != c2->tag;
1172 default:
1173 break;
1175 return False;
1179 /* Is this literally IRExpr_Const(IRConst_U32(0)) ? */
1180 static Bool isZeroU32 ( IRExpr* e )
1182 return toBool( e->tag == Iex_Const
1183 && e->Iex.Const.con->tag == Ico_U32
1184 && e->Iex.Const.con->Ico.U32 == 0);
1187 /* Is this literally IRExpr_Const(IRConst_U64(0)) ?
1188 Currently unused; commented out to avoid compiler warning */
1189 #if 0
1190 static Bool isZeroU64 ( IRExpr* e )
1192 return toBool( e->tag == Iex_Const
1193 && e->Iex.Const.con->tag == Ico_U64
1194 && e->Iex.Const.con->Ico.U64 == 0);
1196 #endif
1198 /* Is this literally IRExpr_Const(IRConst_V128(0)) ? */
1199 static Bool isZeroV128 ( IRExpr* e )
1201 return toBool( e->tag == Iex_Const
1202 && e->Iex.Const.con->tag == Ico_V128
1203 && e->Iex.Const.con->Ico.V128 == 0x0000);
1206 /* Is this literally IRExpr_Const(IRConst_V128(1...1)) ? */
1207 static Bool isOnesV128 ( IRExpr* e )
1209 return toBool( e->tag == Iex_Const
1210 && e->Iex.Const.con->tag == Ico_V128
1211 && e->Iex.Const.con->Ico.V128 == 0xFFFF);
1214 /* Is this literally IRExpr_Const(IRConst_V256(0)) ? */
1215 static Bool isZeroV256 ( IRExpr* e )
1217 return toBool( e->tag == Iex_Const
1218 && e->Iex.Const.con->tag == Ico_V256
1219 && e->Iex.Const.con->Ico.V256 == 0x00000000);
1222 /* Is this an integer constant with value 0 ? */
1223 static Bool isZeroU ( IRExpr* e )
1225 if (e->tag != Iex_Const) return False;
1226 switch (e->Iex.Const.con->tag) {
1227 case Ico_U1: return toBool( e->Iex.Const.con->Ico.U1 == 0);
1228 case Ico_U8: return toBool( e->Iex.Const.con->Ico.U8 == 0);
1229 case Ico_U16: return toBool( e->Iex.Const.con->Ico.U16 == 0);
1230 case Ico_U32: return toBool( e->Iex.Const.con->Ico.U32 == 0);
1231 case Ico_U64: return toBool( e->Iex.Const.con->Ico.U64 == 0);
1232 case Ico_V256: return toBool( e->Iex.Const.con->Ico.V256 == 0x00000000);
1233 default: vpanic("isZeroU");
1237 /* Is this an integer constant with value 1---1b ? */
1238 static Bool isOnesU ( IRExpr* e )
1240 if (e->tag != Iex_Const) return False;
1241 switch (e->Iex.Const.con->tag) {
1242 case Ico_U1: return toBool( e->Iex.Const.con->Ico.U1 == True);
1243 case Ico_U8: return toBool( e->Iex.Const.con->Ico.U8 == 0xFF);
1244 case Ico_U16: return toBool( e->Iex.Const.con->Ico.U16 == 0xFFFF);
1245 case Ico_U32: return toBool( e->Iex.Const.con->Ico.U32
1246 == 0xFFFFFFFF);
1247 case Ico_U64: return toBool( e->Iex.Const.con->Ico.U64
1248 == 0xFFFFFFFFFFFFFFFFULL);
1249 default: ppIRExpr(e); vpanic("isOnesU");
1253 static Bool notBool ( Bool b )
1255 if (b == True) return False;
1256 if (b == False) return True;
1257 vpanic("notBool");
1260 /* Make a zero which has the same type as the result of the given
1261 primop. */
1262 static IRExpr* mkZeroOfPrimopResultType ( IROp op )
1264 switch (op) {
1265 case Iop_CmpNE32: return IRExpr_Const(IRConst_U1(toBool(0)));
1266 case Iop_Xor8: return IRExpr_Const(IRConst_U8(0));
1267 case Iop_Xor16: return IRExpr_Const(IRConst_U16(0));
1268 case Iop_Sub32:
1269 case Iop_Xor32: return IRExpr_Const(IRConst_U32(0));
1270 case Iop_And64:
1271 case Iop_Sub64:
1272 case Iop_Xor64: return IRExpr_Const(IRConst_U64(0));
1273 case Iop_XorV128:
1274 case Iop_AndV128: return IRExpr_Const(IRConst_V128(0));
1275 case Iop_XorV256:
1276 case Iop_AndV256: return IRExpr_Const(IRConst_V256(0));
1277 default: vpanic("mkZeroOfPrimopResultType: bad primop");
1281 /* Make a value containing all 1-bits, which has the same type as the
1282 result of the given primop. */
1283 static IRExpr* mkOnesOfPrimopResultType ( IROp op )
1285 switch (op) {
1286 case Iop_CmpEQ32:
1287 case Iop_CmpEQ64:
1288 return IRExpr_Const(IRConst_U1(toBool(1)));
1289 case Iop_Or8:
1290 return IRExpr_Const(IRConst_U8(0xFF));
1291 case Iop_Or16:
1292 return IRExpr_Const(IRConst_U16(0xFFFF));
1293 case Iop_Or32:
1294 return IRExpr_Const(IRConst_U32(0xFFFFFFFF));
1295 case Iop_CmpEQ8x8:
1296 case Iop_CmpEQ16x4:
1297 case Iop_CmpEQ32x2:
1298 case Iop_Or64:
1299 return IRExpr_Const(IRConst_U64(0xFFFFFFFFFFFFFFFFULL));
1300 case Iop_CmpEQ8x16:
1301 case Iop_CmpEQ16x8:
1302 case Iop_CmpEQ32x4:
1303 case Iop_CmpEQ64x2:
1304 return IRExpr_Const(IRConst_V128(0xFFFF));
1305 case Iop_CmpEQ8x32:
1306 case Iop_CmpEQ16x16:
1307 case Iop_CmpEQ32x8:
1308 case Iop_CmpEQ64x4:
1309 return IRExpr_Const(IRConst_V256(0xFFFFFFFF));
1310 default:
1311 ppIROp(op);
1312 vpanic("mkOnesOfPrimopResultType: bad primop");
1316 /* Helpers for folding Clz32/64. */
1317 static UInt fold_Clz64 ( ULong value )
1319 UInt i;
1320 vassert(value != 0ULL); /* no defined semantics for arg==0 */
1321 for (i = 0; i < 64; ++i) {
1322 if (0ULL != (value & (((ULong)1) << (63 - i)))) return i;
1324 vassert(0);
1325 /*NOTREACHED*/
1326 return 0;
1329 static UInt fold_Clz32 ( UInt value )
1331 UInt i;
1332 vassert(value != 0); /* no defined semantics for arg==0 */
1333 for (i = 0; i < 32; ++i) {
1334 if (0 != (value & (((UInt)1) << (31 - i)))) return i;
1336 vassert(0);
1337 /*NOTREACHED*/
1338 return 0;
1341 /* V64 holds 8 summary-constant bits in V128/V256 style. Convert to
1342 the corresponding real constant. */
1343 //XXX re-check this before use
1344 //static ULong de_summarise_V64 ( UChar v64 )
1346 // ULong r = 0;
1347 // if (v64 & (1<<0)) r |= 0x00000000000000FFULL;
1348 // if (v64 & (1<<1)) r |= 0x000000000000FF00ULL;
1349 // if (v64 & (1<<2)) r |= 0x0000000000FF0000ULL;
1350 // if (v64 & (1<<3)) r |= 0x00000000FF000000ULL;
1351 // if (v64 & (1<<4)) r |= 0x000000FF00000000ULL;
1352 // if (v64 & (1<<5)) r |= 0x0000FF0000000000ULL;
1353 // if (v64 & (1<<6)) r |= 0x00FF000000000000ULL;
1354 // if (v64 & (1<<7)) r |= 0xFF00000000000000ULL;
1355 // return r;
1358 /* Helper for arbitrary expression pattern matching in flat IR. If
1359 'e' is a reference to a tmp, look it up in env -- repeatedly, if
1360 necessary -- until it resolves to a non-tmp. Note that this can
1361 return NULL if it can't resolve 'e' to a new expression, which will
1362 be the case if 'e' is instead defined by an IRStmt (IRDirty or
1363 LLSC). */
1364 static IRExpr* chase ( IRExpr** env, IRExpr* e )
1366 /* Why is this loop guaranteed to terminate? Because all tmps must
1367 have definitions before use, hence a tmp cannot be bound
1368 (directly or indirectly) to itself. */
1369 while (e->tag == Iex_RdTmp) {
1370 if (0) { vex_printf("chase "); ppIRExpr(e); vex_printf("\n"); }
1371 e = env[(Int)e->Iex.RdTmp.tmp];
1372 if (e == NULL) break;
1374 return e;
1377 /* Similar to |chase|, but follows at most one level of tmp reference. */
1378 static IRExpr* chase1 ( IRExpr** env, IRExpr* e )
1380 if (e == NULL || e->tag != Iex_RdTmp)
1381 return e;
1382 else
1383 return env[(Int)e->Iex.RdTmp.tmp];
1386 __attribute__((noinline))
1387 static IRExpr* fold_Expr_WRK ( IRExpr** env, IRExpr* e )
1389 Int shift;
1390 IRExpr* e2 = e; /* e2 is the result of folding e, if possible */
1392 switch (e->tag) {
1393 case Iex_Unop:
1394 /* UNARY ops */
1395 if (e->Iex.Unop.arg->tag == Iex_Const) {
1397 /* cases where the arg is a const */
1398 switch (e->Iex.Unop.op) {
1399 case Iop_1Uto8:
1400 e2 = IRExpr_Const(IRConst_U8(toUChar(
1401 e->Iex.Unop.arg->Iex.Const.con->Ico.U1
1402 ? 1 : 0)));
1403 break;
1404 case Iop_1Uto32:
1405 e2 = IRExpr_Const(IRConst_U32(
1406 e->Iex.Unop.arg->Iex.Const.con->Ico.U1
1407 ? 1 : 0));
1408 break;
1409 case Iop_1Uto64:
1410 e2 = IRExpr_Const(IRConst_U64(
1411 e->Iex.Unop.arg->Iex.Const.con->Ico.U1
1412 ? 1 : 0));
1413 break;
1415 case Iop_1Sto8:
1416 e2 = IRExpr_Const(IRConst_U8(toUChar(
1417 e->Iex.Unop.arg->Iex.Const.con->Ico.U1
1418 ? 0xFF : 0)));
1419 break;
1420 case Iop_1Sto16:
1421 e2 = IRExpr_Const(IRConst_U16(toUShort(
1422 e->Iex.Unop.arg->Iex.Const.con->Ico.U1
1423 ? 0xFFFF : 0)));
1424 break;
1425 case Iop_1Sto32:
1426 e2 = IRExpr_Const(IRConst_U32(
1427 e->Iex.Unop.arg->Iex.Const.con->Ico.U1
1428 ? 0xFFFFFFFF : 0));
1429 break;
1430 case Iop_1Sto64:
1431 e2 = IRExpr_Const(IRConst_U64(
1432 e->Iex.Unop.arg->Iex.Const.con->Ico.U1
1433 ? 0xFFFFFFFFFFFFFFFFULL : 0));
1434 break;
1436 case Iop_8Sto32: {
1437 UInt u32 = e->Iex.Unop.arg->Iex.Const.con->Ico.U8;
1438 u32 <<= 24;
1439 u32 = (Int)u32 >> 24; /* signed shift */
1440 e2 = IRExpr_Const(IRConst_U32(u32));
1441 break;
1443 case Iop_16Sto32: {
1444 UInt u32 = e->Iex.Unop.arg->Iex.Const.con->Ico.U16;
1445 u32 <<= 16;
1446 u32 = (Int)u32 >> 16; /* signed shift */
1447 e2 = IRExpr_Const(IRConst_U32(u32));
1448 break;
1450 case Iop_8Uto64:
1451 e2 = IRExpr_Const(IRConst_U64(
1452 0xFFULL & e->Iex.Unop.arg->Iex.Const.con->Ico.U8));
1453 break;
1454 case Iop_16Uto64:
1455 e2 = IRExpr_Const(IRConst_U64(
1456 0xFFFFULL & e->Iex.Unop.arg->Iex.Const.con->Ico.U16));
1457 break;
1458 case Iop_8Uto32:
1459 e2 = IRExpr_Const(IRConst_U32(
1460 0xFF & e->Iex.Unop.arg->Iex.Const.con->Ico.U8));
1461 break;
1462 case Iop_8Sto16: {
1463 UShort u16 = e->Iex.Unop.arg->Iex.Const.con->Ico.U8;
1464 u16 <<= 8;
1465 u16 = (Short)u16 >> 8; /* signed shift */
1466 e2 = IRExpr_Const(IRConst_U16(u16));
1467 break;
1469 case Iop_8Uto16:
1470 e2 = IRExpr_Const(IRConst_U16(
1471 0xFF & e->Iex.Unop.arg->Iex.Const.con->Ico.U8));
1472 break;
1473 case Iop_16Uto32:
1474 e2 = IRExpr_Const(IRConst_U32(
1475 0xFFFF & e->Iex.Unop.arg->Iex.Const.con->Ico.U16));
1476 break;
1477 case Iop_32to16:
1478 e2 = IRExpr_Const(IRConst_U16(toUShort(
1479 0xFFFF & e->Iex.Unop.arg->Iex.Const.con->Ico.U32)));
1480 break;
1481 case Iop_32to8:
1482 e2 = IRExpr_Const(IRConst_U8(toUChar(
1483 0xFF & e->Iex.Unop.arg->Iex.Const.con->Ico.U32)));
1484 break;
1485 case Iop_32to1:
1486 e2 = IRExpr_Const(IRConst_U1(toBool(
1487 1 == (1 & e->Iex.Unop.arg->Iex.Const.con->Ico.U32)
1488 )));
1489 break;
1490 case Iop_64to1:
1491 e2 = IRExpr_Const(IRConst_U1(toBool(
1492 1 == (1 & e->Iex.Unop.arg->Iex.Const.con->Ico.U64)
1493 )));
1494 break;
1496 case Iop_NotV128:
1497 e2 = IRExpr_Const(IRConst_V128(
1498 ~ (e->Iex.Unop.arg->Iex.Const.con->Ico.V128)));
1499 break;
1500 case Iop_Not64:
1501 e2 = IRExpr_Const(IRConst_U64(
1502 ~ (e->Iex.Unop.arg->Iex.Const.con->Ico.U64)));
1503 break;
1504 case Iop_Not32:
1505 e2 = IRExpr_Const(IRConst_U32(
1506 ~ (e->Iex.Unop.arg->Iex.Const.con->Ico.U32)));
1507 break;
1508 case Iop_Not16:
1509 e2 = IRExpr_Const(IRConst_U16(toUShort(
1510 ~ (e->Iex.Unop.arg->Iex.Const.con->Ico.U16))));
1511 break;
1512 case Iop_Not8:
1513 e2 = IRExpr_Const(IRConst_U8(toUChar(
1514 ~ (e->Iex.Unop.arg->Iex.Const.con->Ico.U8))));
1515 break;
1517 case Iop_Not1:
1518 e2 = IRExpr_Const(IRConst_U1(
1519 notBool(e->Iex.Unop.arg->Iex.Const.con->Ico.U1)));
1520 break;
1522 case Iop_64to8: {
1523 ULong w64 = e->Iex.Unop.arg->Iex.Const.con->Ico.U64;
1524 w64 &= 0xFFULL;
1525 e2 = IRExpr_Const(IRConst_U8( (UChar)w64 ));
1526 break;
1528 case Iop_64to16: {
1529 ULong w64 = e->Iex.Unop.arg->Iex.Const.con->Ico.U64;
1530 w64 &= 0xFFFFULL;
1531 e2 = IRExpr_Const(IRConst_U16( (UShort)w64 ));
1532 break;
1534 case Iop_64to32: {
1535 ULong w64 = e->Iex.Unop.arg->Iex.Const.con->Ico.U64;
1536 w64 &= 0x00000000FFFFFFFFULL;
1537 e2 = IRExpr_Const(IRConst_U32( (UInt)w64 ));
1538 break;
1540 case Iop_64HIto32: {
1541 ULong w64 = e->Iex.Unop.arg->Iex.Const.con->Ico.U64;
1542 w64 >>= 32;
1543 e2 = IRExpr_Const(IRConst_U32( (UInt)w64 ));
1544 break;
1546 case Iop_32Uto64:
1547 e2 = IRExpr_Const(IRConst_U64(
1548 0xFFFFFFFFULL
1549 & e->Iex.Unop.arg->Iex.Const.con->Ico.U32));
1550 break;
1551 case Iop_16Sto64: {
1552 ULong u64 = e->Iex.Unop.arg->Iex.Const.con->Ico.U16;
1553 u64 <<= 48;
1554 u64 = (Long)u64 >> 48; /* signed shift */
1555 e2 = IRExpr_Const(IRConst_U64(u64));
1556 break;
1558 case Iop_32Sto64: {
1559 ULong u64 = e->Iex.Unop.arg->Iex.Const.con->Ico.U32;
1560 u64 <<= 32;
1561 u64 = (Long)u64 >> 32; /* signed shift */
1562 e2 = IRExpr_Const(IRConst_U64(u64));
1563 break;
1566 case Iop_16to8: {
1567 UShort w16 = e->Iex.Unop.arg->Iex.Const.con->Ico.U16;
1568 w16 &= 0xFF;
1569 e2 = IRExpr_Const(IRConst_U8( (UChar)w16 ));
1570 break;
1572 case Iop_16HIto8: {
1573 UShort w16 = e->Iex.Unop.arg->Iex.Const.con->Ico.U16;
1574 w16 >>= 8;
1575 w16 &= 0xFF;
1576 e2 = IRExpr_Const(IRConst_U8( (UChar)w16 ));
1577 break;
1580 case Iop_CmpNEZ8:
1581 e2 = IRExpr_Const(IRConst_U1(toBool(
1582 0 !=
1583 (0xFF & e->Iex.Unop.arg->Iex.Const.con->Ico.U8)
1584 )));
1585 break;
1586 case Iop_CmpNEZ32:
1587 e2 = IRExpr_Const(IRConst_U1(toBool(
1588 0 !=
1589 (0xFFFFFFFF & e->Iex.Unop.arg->Iex.Const.con->Ico.U32)
1590 )));
1591 break;
1592 case Iop_CmpNEZ64:
1593 e2 = IRExpr_Const(IRConst_U1(toBool(
1594 0ULL != e->Iex.Unop.arg->Iex.Const.con->Ico.U64
1595 )));
1596 break;
1598 case Iop_CmpwNEZ32: {
1599 UInt w32 = e->Iex.Unop.arg->Iex.Const.con->Ico.U32;
1600 if (w32 == 0)
1601 e2 = IRExpr_Const(IRConst_U32( 0 ));
1602 else
1603 e2 = IRExpr_Const(IRConst_U32( 0xFFFFFFFF ));
1604 break;
1606 case Iop_CmpwNEZ64: {
1607 ULong w64 = e->Iex.Unop.arg->Iex.Const.con->Ico.U64;
1608 if (w64 == 0)
1609 e2 = IRExpr_Const(IRConst_U64( 0 ));
1610 else
1611 e2 = IRExpr_Const(IRConst_U64( 0xFFFFFFFFFFFFFFFFULL ));
1612 break;
1615 case Iop_Left32: {
1616 UInt u32 = e->Iex.Unop.arg->Iex.Const.con->Ico.U32;
1617 Int s32 = (Int)(u32 & 0xFFFFFFFF);
1618 s32 = (s32 | (-s32));
1619 e2 = IRExpr_Const( IRConst_U32( (UInt)s32 ));
1620 break;
1623 case Iop_Left64: {
1624 ULong u64 = e->Iex.Unop.arg->Iex.Const.con->Ico.U64;
1625 Long s64 = (Long)u64;
1626 s64 = (s64 | (-s64));
1627 e2 = IRExpr_Const( IRConst_U64( (ULong)s64 ));
1628 break;
1631 case Iop_Clz32: {
1632 UInt u32 = e->Iex.Unop.arg->Iex.Const.con->Ico.U32;
1633 if (u32 != 0)
1634 e2 = IRExpr_Const(IRConst_U32(fold_Clz32(u32)));
1635 break;
1637 case Iop_Clz64: {
1638 ULong u64 = e->Iex.Unop.arg->Iex.Const.con->Ico.U64;
1639 if (u64 != 0ULL)
1640 e2 = IRExpr_Const(IRConst_U64(fold_Clz64(u64)));
1641 break;
1644 /* For these vector ones, can't fold all cases, but at least
1645 do the most obvious one. Could do better here using
1646 summarise/desummarise of vector constants, but too
1647 difficult to verify; hence just handle the zero cases. */
1648 case Iop_32UtoV128: {
1649 UInt u32 = e->Iex.Unop.arg->Iex.Const.con->Ico.U32;
1650 if (0 == u32) {
1651 e2 = IRExpr_Const(IRConst_V128(0x0000));
1652 } else {
1653 goto unhandled;
1655 break;
1657 case Iop_V128to64: {
1658 UShort v128 = e->Iex.Unop.arg->Iex.Const.con->Ico.V128;
1659 if (0 == ((v128 >> 0) & 0xFF)) {
1660 e2 = IRExpr_Const(IRConst_U64(0));
1661 } else {
1662 goto unhandled;
1664 break;
1666 case Iop_V128HIto64: {
1667 UShort v128 = e->Iex.Unop.arg->Iex.Const.con->Ico.V128;
1668 if (0 == ((v128 >> 8) & 0xFF)) {
1669 e2 = IRExpr_Const(IRConst_U64(0));
1670 } else {
1671 goto unhandled;
1673 break;
1675 case Iop_64UtoV128: {
1676 ULong u64 = e->Iex.Unop.arg->Iex.Const.con->Ico.U64;
1677 if (0 == u64) {
1678 e2 = IRExpr_Const(IRConst_V128(0x0000));
1679 } else {
1680 goto unhandled;
1682 break;
1685 /* Even stupider (although still correct ..) */
1686 case Iop_V256to64_0: case Iop_V256to64_1:
1687 case Iop_V256to64_2: case Iop_V256to64_3: {
1688 UInt v256 = e->Iex.Unop.arg->Iex.Const.con->Ico.V256;
1689 if (v256 == 0x00000000) {
1690 e2 = IRExpr_Const(IRConst_U64(0));
1691 } else {
1692 goto unhandled;
1694 break;
1697 /* Similarly .. */
1698 case Iop_V256toV128_0: case Iop_V256toV128_1: {
1699 UInt v256 = e->Iex.Unop.arg->Iex.Const.con->Ico.V256;
1700 if (v256 == 0x00000000) {
1701 e2 = IRExpr_Const(IRConst_V128(0x0000));
1702 } else {
1703 goto unhandled;
1705 break;
1708 case Iop_ZeroHI64ofV128: {
1709 /* Could do better here -- only need to look at the bottom 64 bits
1710 of the argument, really. */
1711 UShort v128 = e->Iex.Unop.arg->Iex.Const.con->Ico.V128;
1712 if (v128 == 0x0000) {
1713 e2 = IRExpr_Const(IRConst_V128(0x0000));
1714 } else {
1715 goto unhandled;
1717 break;
1720 default:
1721 goto unhandled;
1722 } // switch (e->Iex.Unop.op)
1724 } else {
1726 /* other cases (identities, etc) */
1727 switch (e->Iex.Unop.op) {
1728 case Iop_PopCount64: {
1729 // PopCount64( And64( Add64(x,-1), Not64(x) ) ) ==> CtzNat64(x)
1730 // bindings:
1731 // a1:And64( a11:Add64(a111:x,a112:-1), a12:Not64(a121:x) )
1732 IRExpr* a1 = chase(env, e->Iex.Unop.arg);
1733 if (!a1)
1734 goto nomatch;
1735 if (a1->tag != Iex_Binop || a1->Iex.Binop.op != Iop_And64)
1736 goto nomatch;
1737 // a1 is established
1738 IRExpr* a11 = chase(env, a1->Iex.Binop.arg1);
1739 if (!a11)
1740 goto nomatch;
1741 if (a11->tag != Iex_Binop || a11->Iex.Binop.op != Iop_Add64)
1742 goto nomatch;
1743 // a11 is established
1744 IRExpr* a12 = chase(env, a1->Iex.Binop.arg2);
1745 if (!a12)
1746 goto nomatch;
1747 if (a12->tag != Iex_Unop || a12->Iex.Unop.op != Iop_Not64)
1748 goto nomatch;
1749 // a12 is established
1750 IRExpr* a111 = a11->Iex.Binop.arg1;
1751 IRExpr* a112 = chase(env, a11->Iex.Binop.arg2);
1752 IRExpr* a121 = a12->Iex.Unop.arg;
1753 if (!a111 || !a112 || !a121)
1754 goto nomatch;
1755 // a111 and a121 need to be the same temp.
1756 if (!eqIRAtom(a111, a121))
1757 goto nomatch;
1758 // Finally, a112 must be a 64-bit version of -1.
1759 if (!isOnesU(a112))
1760 goto nomatch;
1761 // Match established. Transform.
1762 e2 = IRExpr_Unop(Iop_CtzNat64, a111);
1763 break;
1764 nomatch:
1765 break;
1767 default:
1768 break;
1769 } // switch (e->Iex.Unop.op)
1771 } // if (e->Iex.Unop.arg->tag == Iex_Const)
1772 break;
1774 case Iex_Binop:
1775 /* BINARY ops */
1776 if (e->Iex.Binop.arg1->tag == Iex_Const
1777 && e->Iex.Binop.arg2->tag == Iex_Const) {
1778 /* cases where both args are consts */
1779 switch (e->Iex.Binop.op) {
1781 /* -- Or -- */
1782 case Iop_Or8:
1783 e2 = IRExpr_Const(IRConst_U8(toUChar(
1784 (e->Iex.Binop.arg1->Iex.Const.con->Ico.U8
1785 | e->Iex.Binop.arg2->Iex.Const.con->Ico.U8))));
1786 break;
1787 case Iop_Or16:
1788 e2 = IRExpr_Const(IRConst_U16(toUShort(
1789 (e->Iex.Binop.arg1->Iex.Const.con->Ico.U16
1790 | e->Iex.Binop.arg2->Iex.Const.con->Ico.U16))));
1791 break;
1792 case Iop_Or32:
1793 e2 = IRExpr_Const(IRConst_U32(
1794 (e->Iex.Binop.arg1->Iex.Const.con->Ico.U32
1795 | e->Iex.Binop.arg2->Iex.Const.con->Ico.U32)));
1796 break;
1797 case Iop_Or64:
1798 e2 = IRExpr_Const(IRConst_U64(
1799 (e->Iex.Binop.arg1->Iex.Const.con->Ico.U64
1800 | e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)));
1801 break;
1802 case Iop_OrV128:
1803 e2 = IRExpr_Const(IRConst_V128(
1804 (e->Iex.Binop.arg1->Iex.Const.con->Ico.V128
1805 | e->Iex.Binop.arg2->Iex.Const.con->Ico.V128)));
1806 break;
1808 /* -- Xor -- */
1809 case Iop_Xor8:
1810 e2 = IRExpr_Const(IRConst_U8(toUChar(
1811 (e->Iex.Binop.arg1->Iex.Const.con->Ico.U8
1812 ^ e->Iex.Binop.arg2->Iex.Const.con->Ico.U8))));
1813 break;
1814 case Iop_Xor16:
1815 e2 = IRExpr_Const(IRConst_U16(toUShort(
1816 (e->Iex.Binop.arg1->Iex.Const.con->Ico.U16
1817 ^ e->Iex.Binop.arg2->Iex.Const.con->Ico.U16))));
1818 break;
1819 case Iop_Xor32:
1820 e2 = IRExpr_Const(IRConst_U32(
1821 (e->Iex.Binop.arg1->Iex.Const.con->Ico.U32
1822 ^ e->Iex.Binop.arg2->Iex.Const.con->Ico.U32)));
1823 break;
1824 case Iop_Xor64:
1825 e2 = IRExpr_Const(IRConst_U64(
1826 (e->Iex.Binop.arg1->Iex.Const.con->Ico.U64
1827 ^ e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)));
1828 break;
1829 case Iop_XorV128:
1830 e2 = IRExpr_Const(IRConst_V128(
1831 (e->Iex.Binop.arg1->Iex.Const.con->Ico.V128
1832 ^ e->Iex.Binop.arg2->Iex.Const.con->Ico.V128)));
1833 break;
1835 /* -- And -- */
1836 case Iop_And1:
1837 e2 = IRExpr_Const(IRConst_U1(toBool(
1838 (e->Iex.Binop.arg1->Iex.Const.con->Ico.U1
1839 & e->Iex.Binop.arg2->Iex.Const.con->Ico.U1))));
1840 break;
1841 case Iop_And8:
1842 e2 = IRExpr_Const(IRConst_U8(toUChar(
1843 (e->Iex.Binop.arg1->Iex.Const.con->Ico.U8
1844 & e->Iex.Binop.arg2->Iex.Const.con->Ico.U8))));
1845 break;
1846 case Iop_And16:
1847 e2 = IRExpr_Const(IRConst_U16(toUShort(
1848 (e->Iex.Binop.arg1->Iex.Const.con->Ico.U16
1849 & e->Iex.Binop.arg2->Iex.Const.con->Ico.U16))));
1850 break;
1851 case Iop_And32:
1852 e2 = IRExpr_Const(IRConst_U32(
1853 (e->Iex.Binop.arg1->Iex.Const.con->Ico.U32
1854 & e->Iex.Binop.arg2->Iex.Const.con->Ico.U32)));
1855 break;
1856 case Iop_And64:
1857 e2 = IRExpr_Const(IRConst_U64(
1858 (e->Iex.Binop.arg1->Iex.Const.con->Ico.U64
1859 & e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)));
1860 break;
1861 case Iop_AndV128:
1862 e2 = IRExpr_Const(IRConst_V128(
1863 (e->Iex.Binop.arg1->Iex.Const.con->Ico.V128
1864 & e->Iex.Binop.arg2->Iex.Const.con->Ico.V128)));
1865 break;
1867 /* -- Add -- */
1868 case Iop_Add8:
1869 e2 = IRExpr_Const(IRConst_U8(toUChar(
1870 (e->Iex.Binop.arg1->Iex.Const.con->Ico.U8
1871 + e->Iex.Binop.arg2->Iex.Const.con->Ico.U8))));
1872 break;
1873 case Iop_Add32:
1874 e2 = IRExpr_Const(IRConst_U32(
1875 (e->Iex.Binop.arg1->Iex.Const.con->Ico.U32
1876 + e->Iex.Binop.arg2->Iex.Const.con->Ico.U32)));
1877 break;
1878 case Iop_Add64:
1879 e2 = IRExpr_Const(IRConst_U64(
1880 (e->Iex.Binop.arg1->Iex.Const.con->Ico.U64
1881 + e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)));
1882 break;
1884 /* -- Sub -- */
1885 case Iop_Sub8:
1886 e2 = IRExpr_Const(IRConst_U8(toUChar(
1887 (e->Iex.Binop.arg1->Iex.Const.con->Ico.U8
1888 - e->Iex.Binop.arg2->Iex.Const.con->Ico.U8))));
1889 break;
1890 case Iop_Sub32:
1891 e2 = IRExpr_Const(IRConst_U32(
1892 (e->Iex.Binop.arg1->Iex.Const.con->Ico.U32
1893 - e->Iex.Binop.arg2->Iex.Const.con->Ico.U32)));
1894 break;
1895 case Iop_Sub64:
1896 e2 = IRExpr_Const(IRConst_U64(
1897 (e->Iex.Binop.arg1->Iex.Const.con->Ico.U64
1898 - e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)));
1899 break;
1901 /* -- Max32U -- */
1902 case Iop_Max32U: {
1903 UInt u32a = e->Iex.Binop.arg1->Iex.Const.con->Ico.U32;
1904 UInt u32b = e->Iex.Binop.arg2->Iex.Const.con->Ico.U32;
1905 UInt res = u32a > u32b ? u32a : u32b;
1906 e2 = IRExpr_Const(IRConst_U32(res));
1907 break;
1910 /* -- Mul -- */
1911 case Iop_Mul32:
1912 e2 = IRExpr_Const(IRConst_U32(
1913 (e->Iex.Binop.arg1->Iex.Const.con->Ico.U32
1914 * e->Iex.Binop.arg2->Iex.Const.con->Ico.U32)));
1915 break;
1916 case Iop_Mul64:
1917 e2 = IRExpr_Const(IRConst_U64(
1918 (e->Iex.Binop.arg1->Iex.Const.con->Ico.U64
1919 * e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)));
1920 break;
1922 case Iop_MullS32: {
1923 /* very paranoid */
1924 UInt u32a = e->Iex.Binop.arg1->Iex.Const.con->Ico.U32;
1925 UInt u32b = e->Iex.Binop.arg2->Iex.Const.con->Ico.U32;
1926 Int s32a = (Int)u32a;
1927 Int s32b = (Int)u32b;
1928 Long s64a = (Long)s32a;
1929 Long s64b = (Long)s32b;
1930 Long sres = s64a * s64b;
1931 ULong ures = (ULong)sres;
1932 e2 = IRExpr_Const(IRConst_U64(ures));
1933 break;
1936 /* -- Shl -- */
1937 case Iop_Shl32:
1938 vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
1939 shift = (Int)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U8);
1940 if (shift >= 0 && shift <= 31)
1941 e2 = IRExpr_Const(IRConst_U32(
1942 (e->Iex.Binop.arg1->Iex.Const.con->Ico.U32
1943 << shift)));
1944 break;
1945 case Iop_Shl64:
1946 vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
1947 shift = (Int)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U8);
1948 if (shift >= 0 && shift <= 63)
1949 e2 = IRExpr_Const(IRConst_U64(
1950 (e->Iex.Binop.arg1->Iex.Const.con->Ico.U64
1951 << shift)));
1952 break;
1954 /* -- Sar -- */
1955 case Iop_Sar32: {
1956 /* paranoid ... */
1957 /*signed*/ Int s32;
1958 vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
1959 s32 = (Int)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U32);
1960 shift = (Int)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U8);
1961 if (shift >= 0 && shift <= 31) {
1962 s32 >>=/*signed*/ shift;
1963 e2 = IRExpr_Const(IRConst_U32((UInt)s32));
1965 break;
1967 case Iop_Sar64: {
1968 /* paranoid ... */
1969 /*signed*/ Long s64;
1970 vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
1971 s64 = (Long)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U64);
1972 shift = (Int)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U8);
1973 if (shift >= 0 && shift <= 63) {
1974 s64 >>=/*signed*/ shift;
1975 e2 = IRExpr_Const(IRConst_U64((ULong)s64));
1977 break;
1980 /* -- Shr -- */
1981 case Iop_Shr32: {
1982 /* paranoid ... */
1983 /*unsigned*/ UInt u32;
1984 vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
1985 u32 = (UInt)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U32);
1986 shift = (Int)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U8);
1987 if (shift >= 0 && shift <= 31) {
1988 u32 >>=/*unsigned*/ shift;
1989 e2 = IRExpr_Const(IRConst_U32(u32));
1991 break;
1993 case Iop_Shr64: {
1994 /* paranoid ... */
1995 /*unsigned*/ ULong u64;
1996 vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
1997 u64 = (ULong)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U64);
1998 shift = (Int)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U8);
1999 if (shift >= 0 && shift <= 63) {
2000 u64 >>=/*unsigned*/ shift;
2001 e2 = IRExpr_Const(IRConst_U64(u64));
2003 break;
2006 /* -- CmpEQ -- */
2007 case Iop_CmpEQ32:
2008 e2 = IRExpr_Const(IRConst_U1(toBool(
2009 (e->Iex.Binop.arg1->Iex.Const.con->Ico.U32
2010 == e->Iex.Binop.arg2->Iex.Const.con->Ico.U32))));
2011 break;
2012 case Iop_CmpEQ64:
2013 e2 = IRExpr_Const(IRConst_U1(toBool(
2014 (e->Iex.Binop.arg1->Iex.Const.con->Ico.U64
2015 == e->Iex.Binop.arg2->Iex.Const.con->Ico.U64))));
2016 break;
2018 /* -- CmpNE -- */
2019 case Iop_CmpNE8:
2020 case Iop_CasCmpNE8:
2021 case Iop_ExpCmpNE8:
2022 e2 = IRExpr_Const(IRConst_U1(toBool(
2023 ((0xFF & e->Iex.Binop.arg1->Iex.Const.con->Ico.U8)
2024 != (0xFF & e->Iex.Binop.arg2->Iex.Const.con->Ico.U8)))));
2025 break;
2026 case Iop_CmpNE32:
2027 case Iop_CasCmpNE32:
2028 case Iop_ExpCmpNE32:
2029 e2 = IRExpr_Const(IRConst_U1(toBool(
2030 (e->Iex.Binop.arg1->Iex.Const.con->Ico.U32
2031 != e->Iex.Binop.arg2->Iex.Const.con->Ico.U32))));
2032 break;
2033 case Iop_CmpNE64:
2034 case Iop_CasCmpNE64:
2035 case Iop_ExpCmpNE64:
2036 e2 = IRExpr_Const(IRConst_U1(toBool(
2037 (e->Iex.Binop.arg1->Iex.Const.con->Ico.U64
2038 != e->Iex.Binop.arg2->Iex.Const.con->Ico.U64))));
2039 break;
2041 /* -- CmpLEU -- */
2042 case Iop_CmpLE32U:
2043 e2 = IRExpr_Const(IRConst_U1(toBool(
2044 ((UInt)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U32)
2045 <= (UInt)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U32)))));
2046 break;
2047 case Iop_CmpLE64U:
2048 e2 = IRExpr_Const(IRConst_U1(toBool(
2049 ((ULong)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U64)
2050 <= (ULong)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)))));
2051 break;
2053 /* -- CmpLES -- */
2054 case Iop_CmpLE32S:
2055 e2 = IRExpr_Const(IRConst_U1(toBool(
2056 ((Int)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U32)
2057 <= (Int)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U32)))));
2058 break;
2059 case Iop_CmpLE64S:
2060 e2 = IRExpr_Const(IRConst_U1(toBool(
2061 ((Long)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U64)
2062 <= (Long)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)))));
2063 break;
2065 /* -- CmpLTS -- */
2066 case Iop_CmpLT32S:
2067 e2 = IRExpr_Const(IRConst_U1(toBool(
2068 ((Int)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U32)
2069 < (Int)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U32)))));
2070 break;
2071 case Iop_CmpLT64S:
2072 e2 = IRExpr_Const(IRConst_U1(toBool(
2073 ((Long)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U64)
2074 < (Long)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)))));
2075 break;
2077 /* -- CmpLTU -- */
2078 case Iop_CmpLT32U:
2079 e2 = IRExpr_Const(IRConst_U1(toBool(
2080 ((UInt)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U32)
2081 < (UInt)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U32)))));
2082 break;
2083 case Iop_CmpLT64U:
2084 e2 = IRExpr_Const(IRConst_U1(toBool(
2085 ((ULong)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U64)
2086 < (ULong)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)))));
2087 break;
2089 /* -- CmpORD -- */
2090 case Iop_CmpORD32S: {
2091 /* very paranoid */
2092 UInt u32a = e->Iex.Binop.arg1->Iex.Const.con->Ico.U32;
2093 UInt u32b = e->Iex.Binop.arg2->Iex.Const.con->Ico.U32;
2094 Int s32a = (Int)u32a;
2095 Int s32b = (Int)u32b;
2096 Int r = 0x2; /* EQ */
2097 if (s32a < s32b) {
2098 r = 0x8; /* LT */
2100 else if (s32a > s32b) {
2101 r = 0x4; /* GT */
2103 e2 = IRExpr_Const(IRConst_U32(r));
2104 break;
2107 /* -- nHLto2n -- */
2108 case Iop_32HLto64:
2109 e2 = IRExpr_Const(IRConst_U64(
2110 (((ULong)(e->Iex.Binop.arg1
2111 ->Iex.Const.con->Ico.U32)) << 32)
2112 | ((ULong)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U32))
2114 break;
2115 case Iop_64HLto128:
2116 /* We can't fold this, because there is no way to
2117                            express the result in IR, but at least pretend to
2118 handle it, so as to stop getting blasted with
2119 no-rule-for-this-primop messages. */
2120 break;
2121 /* For this vector one, can't fold all cases, but at
2122 least do the most obvious one. Could do better here
2123 using summarise/desummarise of vector constants, but
2124 too difficult to verify; hence just handle the zero
2125 cases. */
2126 case Iop_64HLtoV128: {
2127 ULong argHi = e->Iex.Binop.arg1->Iex.Const.con->Ico.U64;
2128 ULong argLo = e->Iex.Binop.arg2->Iex.Const.con->Ico.U64;
2129 if (0 == argHi && 0 == argLo) {
2130 e2 = IRExpr_Const(IRConst_V128(0));
2131 } else {
2132 goto unhandled;
2134 break;
2136 /* Same reasoning for the 256-bit version. */
2137 case Iop_V128HLtoV256: {
2138 IRExpr* argHi = e->Iex.Binop.arg1;
2139 IRExpr* argLo = e->Iex.Binop.arg2;
2140 if (isZeroV128(argHi) && isZeroV128(argLo)) {
2141 e2 = IRExpr_Const(IRConst_V256(0));
2142 } else {
2143 goto unhandled;
2145 break;
2148 /* -- V128 stuff -- */
2149 case Iop_InterleaveLO64x2: // This, and the HI64, are created
2150 case Iop_InterleaveHI64x2: // by the amd64 PTEST translation
2151 case Iop_InterleaveLO8x16: {
2152 /* This turns up a lot in Memcheck instrumentation of
2153 Icc generated code. I don't know why. */
2154 UShort arg1 = e->Iex.Binop.arg1->Iex.Const.con->Ico.V128;
2155 UShort arg2 = e->Iex.Binop.arg2->Iex.Const.con->Ico.V128;
2156 if (0 == arg1 && 0 == arg2) {
2157 e2 = IRExpr_Const(IRConst_V128(0));
2158 } else {
2159 goto unhandled;
2161 break;
2164 default:
2165 goto unhandled;
2168 } else {
2170 /* other cases (identities, etc) */
2171 switch (e->Iex.Binop.op) {
2173 case Iop_Shl32:
2174 case Iop_Shl64:
2175 case Iop_Shr64:
2176 case Iop_Sar64:
2177 /* Shl32/Shl64/Shr64/Sar64(x,0) ==> x */
2178 if (isZeroU(e->Iex.Binop.arg2)) {
2179 e2 = e->Iex.Binop.arg1;
2180 break;
2182                      /* Shl32/Shl64/Shr64/Sar64(0,x) ==> 0 */
2183 if (isZeroU(e->Iex.Binop.arg1)) {
2184 e2 = e->Iex.Binop.arg1;
2185 break;
2187 break;
2189 case Iop_Sar32:
2190 case Iop_Shr32:
2191 /* Shr32/Sar32(x,0) ==> x */
2192 if (isZeroU(e->Iex.Binop.arg2)) {
2193 e2 = e->Iex.Binop.arg1;
2194 break;
2196 break;
2198 case Iop_Or8:
2199 case Iop_Or16:
2200 case Iop_Or32:
2201 case Iop_Or64:
2202 case Iop_Max32U:
2203 /* Or8/Or16/Or32/Or64/Max32U(x,0) ==> x */
2204 if (isZeroU(e->Iex.Binop.arg2)) {
2205 e2 = e->Iex.Binop.arg1;
2206 break;
2208 /* Or8/Or16/Or32/Or64/Max32U(0,x) ==> x */
2209 if (isZeroU(e->Iex.Binop.arg1)) {
2210 e2 = e->Iex.Binop.arg2;
2211 break;
2213 /* Or8/Or16/Or32/Or64/Max32U(x,1---1b) ==> 1---1b */
2214 /* Or8/Or16/Or32/Or64/Max32U(1---1b,x) ==> 1---1b */
2215 if (isOnesU(e->Iex.Binop.arg1) || isOnesU(e->Iex.Binop.arg2)) {
2216 e2 = mkOnesOfPrimopResultType(e->Iex.Binop.op);
2217 break;
2219 /* Or8/Or16/Or32/Or64/Max32U(t,t) ==> t, for some IRTemp t */
2220 if (sameIRExprs(env, e->Iex.Binop.arg1, e->Iex.Binop.arg2)) {
2221 e2 = e->Iex.Binop.arg1;
2222 break;
2224 break;
2226 case Iop_Add8:
2227 /* Add8(t,t) ==> t << 1.
2228 Memcheck doesn't understand that
2229 x+x produces a defined least significant bit, and it seems
2230 simplest just to get rid of the problem by rewriting it
2231 out, since the opportunity to do so exists. */
2232 if (sameIRExprs(env, e->Iex.Binop.arg1, e->Iex.Binop.arg2)) {
2233 e2 = IRExpr_Binop(Iop_Shl8, e->Iex.Binop.arg1,
2234 IRExpr_Const(IRConst_U8(1)));
2235 break;
2237 break;
2239 /* NB no Add16(t,t) case yet as no known test case exists */
2241 case Iop_Add32:
2242 case Iop_Add64:
2243 /* Add32/Add64(x,0) ==> x */
2244 if (isZeroU(e->Iex.Binop.arg2)) {
2245 e2 = e->Iex.Binop.arg1;
2246 break;
2248 /* Add32/Add64(0,x) ==> x */
2249 if (isZeroU(e->Iex.Binop.arg1)) {
2250 e2 = e->Iex.Binop.arg2;
2251 break;
2253 /* Add32/Add64(t,t) ==> t << 1. Same rationale as for Add8. */
2254 if (sameIRExprs(env, e->Iex.Binop.arg1, e->Iex.Binop.arg2)) {
2255 e2 = IRExpr_Binop(
2256 e->Iex.Binop.op == Iop_Add32 ? Iop_Shl32 : Iop_Shl64,
2257 e->Iex.Binop.arg1, IRExpr_Const(IRConst_U8(1)));
2258 break;
2260 break;
2262 case Iop_Sub32:
2263 case Iop_Sub64:
2264 /* Sub32/Sub64(x,0) ==> x */
2265 if (isZeroU(e->Iex.Binop.arg2)) {
2266 e2 = e->Iex.Binop.arg1;
2267 break;
2269 /* Sub32/Sub64(t,t) ==> 0, for some IRTemp t */
2270 if (sameIRExprs(env, e->Iex.Binop.arg1, e->Iex.Binop.arg2)) {
2271 e2 = mkZeroOfPrimopResultType(e->Iex.Binop.op);
2272 break;
2274 break;
2275 case Iop_Sub8x16:
2276 /* Sub8x16(x,0) ==> x */
2277 if (isZeroV128(e->Iex.Binop.arg2)) {
2278 e2 = e->Iex.Binop.arg1;
2279 break;
2281 break;
2283 case Iop_And1:
2284 case Iop_And8:
2285 case Iop_And16:
2286 case Iop_And32:
2287 case Iop_And64:
2288 /* And1/And8/And16/And32/And64(x,1---1b) ==> x */
2289 if (isOnesU(e->Iex.Binop.arg2)) {
2290 e2 = e->Iex.Binop.arg1;
2291 break;
2293 /* And1/And8/And16/And32/And64(1---1b,x) ==> x */
2294 if (isOnesU(e->Iex.Binop.arg1)) {
2295 e2 = e->Iex.Binop.arg2;
2296 break;
2298 /* And1/And8/And16/And32/And64(x,0) ==> 0 */
2299 if (isZeroU(e->Iex.Binop.arg2)) {
2300 e2 = e->Iex.Binop.arg2;
2301 break;
2303 /* And1/And8/And16/And32/And64(0,x) ==> 0 */
2304 if (isZeroU(e->Iex.Binop.arg1)) {
2305 e2 = e->Iex.Binop.arg1;
2306 break;
2308 /* And1/And8/And16/And32/And64(t,t) ==> t, for some IRTemp t */
2309 if (sameIRExprs(env, e->Iex.Binop.arg1, e->Iex.Binop.arg2)) {
2310 e2 = e->Iex.Binop.arg1;
2311 break;
2313 break;
2315 case Iop_AndV128:
2316 case Iop_AndV256:
2317                      /* AndV128/AndV256(t,t)
2318                         ==> t, for some IRTemp t */
2319 if (sameIRExprs(env, e->Iex.Binop.arg1, e->Iex.Binop.arg2)) {
2320 e2 = e->Iex.Binop.arg1;
2321 break;
2323 /* Deal with either arg zero. Could handle other And
2324 cases here too. */
2325 if (e->Iex.Binop.op == Iop_AndV256
2326 && (isZeroV256(e->Iex.Binop.arg1)
2327 || isZeroV256(e->Iex.Binop.arg2))) {
2328 e2 = mkZeroOfPrimopResultType(e->Iex.Binop.op);
2329 break;
2330 } else if (e->Iex.Binop.op == Iop_AndV128
2331 && (isZeroV128(e->Iex.Binop.arg1)
2332 || isZeroV128(e->Iex.Binop.arg2))) {
2333 e2 = mkZeroOfPrimopResultType(e->Iex.Binop.op);
2334 break;
2336 /* AndV128(t,1...1) ==> t. The amd64 front end generates these
2337 for *CMP{P,S}{S,D} etc. */
2338 if (e->Iex.Binop.op == Iop_AndV128) {
2339 if (isOnesV128(e->Iex.Binop.arg2)) {
2340 e2 = e->Iex.Binop.arg1;
2341 break;
2344 /* AndV128( x, NotV128( x ) ) ==> 0...0 and
2345 AndV256( x, NotV256( x ) ) ==> 0...0
2346 This is generated by amd64 `ptest %xmmReg, %xmmReg`
2347 (the same reg both times)
2348 See https://bugzilla.redhat.com/show_bug.cgi?id=2257546 */
2349 if (e->Iex.Binop.op == Iop_AndV128
2350 || e->Iex.Binop.op == Iop_AndV256) {
2351 Bool isV256 = e->Iex.Binop.op == Iop_AndV256;
2352 IRExpr* x1 = chase(env, e->Iex.Binop.arg1);
2353 IRExpr* rhs = chase(env, e->Iex.Binop.arg2);
2354 if (x1 && rhs
2355 && rhs->tag == Iex_Unop
2356 && rhs->Iex.Unop.op == (isV256 ? Iop_NotV256
2357 : Iop_NotV128)) {
2358 IRExpr* x2 = chase(env, rhs->Iex.Unop.arg);
2359 if (x2 && sameIRExprs(env, x1, x2)) {
2360 e2 = mkZeroOfPrimopResultType(e->Iex.Binop.op);
2361 break;
2365 break;
2367 case Iop_OrV128:
2368 case Iop_OrV256:
2369                      /* OrV128/OrV256(t,t) ==> t, for some IRTemp t */
2370 if (sameIRExprs(env, e->Iex.Binop.arg1, e->Iex.Binop.arg2)) {
2371 e2 = e->Iex.Binop.arg1;
2372 break;
2374 /* OrV128(t,0) ==> t */
2375 if (e->Iex.Binop.op == Iop_OrV128) {
2376 if (isZeroV128(e->Iex.Binop.arg2)) {
2377 e2 = e->Iex.Binop.arg1;
2378 break;
2380 if (isZeroV128(e->Iex.Binop.arg1)) {
2381 e2 = e->Iex.Binop.arg2;
2382 break;
2385 /* OrV256(t,0) ==> t */
2386 if (e->Iex.Binop.op == Iop_OrV256) {
2387 if (isZeroV256(e->Iex.Binop.arg2)) {
2388 e2 = e->Iex.Binop.arg1;
2389 break;
2391 //Disabled because there's no known test case right now.
2392 //if (isZeroV256(e->Iex.Binop.arg1)) {
2393 // e2 = e->Iex.Binop.arg2;
2394 // break;
2397 break;
2399 case Iop_Xor8:
2400 case Iop_Xor16:
2401 case Iop_Xor32:
2402 case Iop_Xor64:
2403 case Iop_XorV128:
2404 case Iop_XorV256:
2405 /* Xor8/16/32/64/V128/V256(t,t) ==> 0, for some IRTemp t */
2406 if (sameIRExprs(env, e->Iex.Binop.arg1, e->Iex.Binop.arg2)) {
2407 e2 = mkZeroOfPrimopResultType(e->Iex.Binop.op);
2408 break;
2410 /* XorV128(t,0) ==> t */
2411 if (e->Iex.Binop.op == Iop_XorV128) {
2412 if (isZeroV128(e->Iex.Binop.arg2)) {
2413 e2 = e->Iex.Binop.arg1;
2414 break;
2416 //Disabled because there's no known test case right now.
2417 //if (isZeroV128(e->Iex.Binop.arg1)) {
2418 // e2 = e->Iex.Binop.arg2;
2419 // break;
2421 } else {
2422 /* Xor8/16/32/64(0,t) ==> t */
2423 if (isZeroU(e->Iex.Binop.arg1)) {
2424 e2 = e->Iex.Binop.arg2;
2425 break;
2427 /* Xor8/16/32/64(t,0) ==> t */
2428 if (isZeroU(e->Iex.Binop.arg2)) {
2429 e2 = e->Iex.Binop.arg1;
2430 break;
2433 break;
2435 case Iop_CmpNE32:
2436 /* CmpNE32(t,t) ==> 0, for some IRTemp t */
2437 if (sameIRExprs(env, e->Iex.Binop.arg1, e->Iex.Binop.arg2)) {
2438 e2 = mkZeroOfPrimopResultType(e->Iex.Binop.op);
2439 break;
2441 /* CmpNE32(1Uto32(b), 0) ==> b */
2442 if (isZeroU32(e->Iex.Binop.arg2)) {
2443 IRExpr* a1 = chase(env, e->Iex.Binop.arg1);
2444 if (a1 && a1->tag == Iex_Unop
2445 && a1->Iex.Unop.op == Iop_1Uto32) {
2446 e2 = a1->Iex.Unop.arg;
2447 break;
2450 break;
2452 // in total 32 bits
2453 case Iop_CmpEQ32:
2454 // in total 64 bits
2455 case Iop_CmpEQ64:
2456 case Iop_CmpEQ8x8:
2457 case Iop_CmpEQ16x4:
2458 case Iop_CmpEQ32x2:
2459 // in total 128 bits
2460 case Iop_CmpEQ8x16:
2461 case Iop_CmpEQ16x8:
2462 case Iop_CmpEQ32x4:
2463 case Iop_CmpEQ64x2:
2464 // in total 256 bits
2465 case Iop_CmpEQ8x32:
2466 case Iop_CmpEQ16x16:
2467 case Iop_CmpEQ32x8:
2468 case Iop_CmpEQ64x4:
2469 if (sameIRExprs(env, e->Iex.Binop.arg1, e->Iex.Binop.arg2)) {
2470 e2 = mkOnesOfPrimopResultType(e->Iex.Binop.op);
2471 break;
2473 break;
2475 default:
2476 break;
2479 break;
2481 case Iex_ITE:
2482 /* ITE */
2483          /* is the discriminant a constant? */
2484 if (e->Iex.ITE.cond->tag == Iex_Const) {
2485             /* assured to us by the IR type rules */
2486 vassert(e->Iex.ITE.cond->Iex.Const.con->tag == Ico_U1);
2487 e2 = e->Iex.ITE.cond->Iex.Const.con->Ico.U1
2488 ? e->Iex.ITE.iftrue : e->Iex.ITE.iffalse;
2490 else
2491 /* are the arms identical? (pretty weedy test) */
2492 if (sameIRExprs(env, e->Iex.ITE.iftrue,
2493 e->Iex.ITE.iffalse)) {
2494 e2 = e->Iex.ITE.iffalse;
2496 break;
2498 default:
2499 /* not considered */
2500 break;
2503 /* Show cases where we've found but not folded 'op(t,t)'. Be
2504 careful not to call sameIRExprs with values of different types,
2505 though, else it will assert (and so it should!). We can't
2506 conveniently call typeOfIRExpr on the two args without a whole
2507 bunch of extra plumbing to pass in a type env, so just use a
2508 hacky test to check the arguments are not anything that might
2509 sameIRExprs to assert. This is only OK because this kludge is
2510 only used for debug printing, not for "real" operation. For
2511 "real" operation (ie, all other calls to sameIRExprs), it is
2512       essential that the two args have the same type.
2514 The "right" solution is to plumb the containing block's
2515 IRTypeEnv through to here and use typeOfIRExpr to be sure. But
2516 that's a bunch of extra parameter passing which will just slow
2517 down the normal case, for no purpose. */
2518 if (vex_control.iropt_verbosity > 0
2519 && e == e2
2520 && e->tag == Iex_Binop
2521 && !debug_only_hack_sameIRExprs_might_assert(e->Iex.Binop.arg1,
2522 e->Iex.Binop.arg2)
2523 && sameIRExprs(env, e->Iex.Binop.arg1, e->Iex.Binop.arg2)) {
2524 vex_printf("vex iropt: fold_Expr_WRK: no ident rule for: ");
2525 ppIRExpr(e);
2526 vex_printf("\n");
2529 /* Show the overall results of folding. */
2530 if (DEBUG_IROPT && e2 != e) {
2531 vex_printf("FOLD: ");
2532 ppIRExpr(e); vex_printf(" -> ");
2533 ppIRExpr(e2); vex_printf("\n");
2536 return e2;
2538 unhandled:
2539 # if 0
2540 vex_printf("\n\n");
2541 ppIRExpr(e);
2542 vpanic("fold_Expr: no rule for the above");
2543 # else
2544 if (vex_control.iropt_verbosity > 0) {
2545 vex_printf("vex iropt: fold_Expr_WRK: no const rule for: ");
2546 ppIRExpr(e);
2547 vex_printf("\n");
2549 return e2;
2550 # endif
2553 /* Fold |e| as much as possible, given the bindings in |env|. If no folding is
2554 possible, just return |e|. Also, if |env| is NULL, don't even try to
2555 fold; just return |e| directly. */
2556 inline
2557 static IRExpr* fold_Expr ( IRExpr** env, IRExpr* e )
2559 return env == NULL ? e : fold_Expr_WRK(env, e);
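   /* Added illustration, not part of the original sources: a minimal
      sketch of the folder at work, using only constructors already seen
      in this file.  |ex| and |env1| are made-up names for the example. */
   # if 0
   IRExpr* ex = IRExpr_Binop(Iop_Add32,
                             IRExpr_Const(IRConst_U32(4)),
                             IRExpr_Const(IRConst_U32(5)));
   IRExpr** env1 = LibVEX_Alloc_inline(1 * sizeof(IRExpr*));
   env1[0] = NULL;
   /* Both args are constants, so no bindings are consulted: the result
      is the single constant 0x9:I32. */
   ppIRExpr(fold_Expr(env1, ex)); vex_printf("\n");
   /* With a NULL env, no folding is attempted and |ex| comes back
      unchanged. */
   ppIRExpr(fold_Expr(NULL, ex)); vex_printf("\n");
   # endif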
2562 /* Apply the subst to a simple 1-level expression -- guaranteed to be
2563 1-level due to previous flattening pass. */
2565 static IRExpr* subst_Expr ( IRExpr** env, IRExpr* ex )
2567 switch (ex->tag) {
2568 case Iex_RdTmp:
2569 if (env[(Int)ex->Iex.RdTmp.tmp] != NULL) {
2570 IRExpr *rhs = env[(Int)ex->Iex.RdTmp.tmp];
2571 if (rhs->tag == Iex_RdTmp)
2572 return rhs;
2573 if (rhs->tag == Iex_Const
2574 && rhs->Iex.Const.con->tag != Ico_F64i)
2575 return rhs;
2577 /* not bound in env */
2578 return ex;
2580 case Iex_Const:
2581 case Iex_Get:
2582 return ex;
2584 case Iex_GetI:
2585 vassert(isIRAtom(ex->Iex.GetI.ix));
2586 return IRExpr_GetI(
2587 ex->Iex.GetI.descr,
2588 subst_Expr(env, ex->Iex.GetI.ix),
2589 ex->Iex.GetI.bias
2592 case Iex_Qop: {
2593 IRQop* qop = ex->Iex.Qop.details;
2594 vassert(isIRAtom(qop->arg1));
2595 vassert(isIRAtom(qop->arg2));
2596 vassert(isIRAtom(qop->arg3));
2597 vassert(isIRAtom(qop->arg4));
2598 return IRExpr_Qop(
2599 qop->op,
2600 subst_Expr(env, qop->arg1),
2601 subst_Expr(env, qop->arg2),
2602 subst_Expr(env, qop->arg3),
2603 subst_Expr(env, qop->arg4)
2607 case Iex_Triop: {
2608 IRTriop* triop = ex->Iex.Triop.details;
2609 vassert(isIRAtom(triop->arg1));
2610 vassert(isIRAtom(triop->arg2));
2611 vassert(isIRAtom(triop->arg3));
2612 return IRExpr_Triop(
2613 triop->op,
2614 subst_Expr(env, triop->arg1),
2615 subst_Expr(env, triop->arg2),
2616 subst_Expr(env, triop->arg3)
2620 case Iex_Binop:
2621 vassert(isIRAtom(ex->Iex.Binop.arg1));
2622 vassert(isIRAtom(ex->Iex.Binop.arg2));
2623 return IRExpr_Binop(
2624 ex->Iex.Binop.op,
2625 subst_Expr(env, ex->Iex.Binop.arg1),
2626 subst_Expr(env, ex->Iex.Binop.arg2)
2629 case Iex_Unop:
2630 vassert(isIRAtom(ex->Iex.Unop.arg));
2631 return IRExpr_Unop(
2632 ex->Iex.Unop.op,
2633 subst_Expr(env, ex->Iex.Unop.arg)
2636 case Iex_Load:
2637 vassert(isIRAtom(ex->Iex.Load.addr));
2638 return IRExpr_Load(
2639 ex->Iex.Load.end,
2640 ex->Iex.Load.ty,
2641 subst_Expr(env, ex->Iex.Load.addr)
2644 case Iex_CCall: {
2645 Int i;
2646 IRExpr** args2 = shallowCopyIRExprVec(ex->Iex.CCall.args);
2647 for (i = 0; args2[i]; i++) {
2648 vassert(isIRAtom(args2[i]));
2649 args2[i] = subst_Expr(env, args2[i]);
2651 return IRExpr_CCall(
2652 ex->Iex.CCall.cee,
2653 ex->Iex.CCall.retty,
2654 args2
2658 case Iex_ITE:
2659 vassert(isIRAtom(ex->Iex.ITE.cond));
2660 vassert(isIRAtom(ex->Iex.ITE.iftrue));
2661 vassert(isIRAtom(ex->Iex.ITE.iffalse));
2662 return IRExpr_ITE(
2663 subst_Expr(env, ex->Iex.ITE.cond),
2664 subst_Expr(env, ex->Iex.ITE.iftrue),
2665 subst_Expr(env, ex->Iex.ITE.iffalse)
2668 default:
2669 vex_printf("\n\n"); ppIRExpr(ex);
2670 vpanic("subst_Expr");
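   /* Added illustration, not part of the original sources: if env maps
      t2 |-> 0x10:I32 and leaves t7 unbound, then subst_Expr rewrites
      Add32(t2,t7) into Add32(0x10:I32,t7).  The temp numbers are made
      up for the example. */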
2676 /* Apply the subst to stmt, then, if |doFolding| is |True|, fold the result as
2677 much as possible. Much simplified due to stmt being previously flattened.
2678 As a result of this, the stmt may wind up being turned into a no-op.
2680 static IRStmt* subst_and_maybe_fold_Stmt ( Bool doFolding,
2681 IRExpr** env, IRStmt* st )
2683 # if 0
2684 vex_printf("\nsubst and maybe fold stmt\n");
2685 ppIRStmt(st);
2686 vex_printf("\n");
2687 # endif
2689 IRExpr** s_env = env;
2690 IRExpr** f_env = doFolding ? env : NULL;
2692 switch (st->tag) {
2693 case Ist_AbiHint:
2694 vassert(isIRAtom(st->Ist.AbiHint.base));
2695 vassert(isIRAtom(st->Ist.AbiHint.nia));
2696 return IRStmt_AbiHint(
2697 fold_Expr(f_env, subst_Expr(s_env, st->Ist.AbiHint.base)),
2698 st->Ist.AbiHint.len,
2699 fold_Expr(f_env, subst_Expr(s_env, st->Ist.AbiHint.nia))
2701 case Ist_Put:
2702 vassert(isIRAtom(st->Ist.Put.data));
2703 return IRStmt_Put(
2704 st->Ist.Put.offset,
2705 fold_Expr(f_env, subst_Expr(s_env, st->Ist.Put.data))
2708 case Ist_PutI: {
2709 IRPutI *puti, *puti2;
2710 puti = st->Ist.PutI.details;
2711 vassert(isIRAtom(puti->ix));
2712 vassert(isIRAtom(puti->data));
2713 puti2 = mkIRPutI(puti->descr,
2714 fold_Expr(f_env, subst_Expr(s_env, puti->ix)),
2715 puti->bias,
2716 fold_Expr(f_env, subst_Expr(s_env, puti->data)));
2717 return IRStmt_PutI(puti2);
2720 case Ist_WrTmp:
2721 /* This is the one place where an expr (st->Ist.WrTmp.data) is
2722 allowed to be more than just a constant or a tmp. */
2723 return IRStmt_WrTmp(
2724 st->Ist.WrTmp.tmp,
2725 fold_Expr(f_env, subst_Expr(s_env, st->Ist.WrTmp.data))
2728 case Ist_Store:
2729 vassert(isIRAtom(st->Ist.Store.addr));
2730 vassert(isIRAtom(st->Ist.Store.data));
2731 return IRStmt_Store(
2732 st->Ist.Store.end,
2733 fold_Expr(f_env, subst_Expr(s_env, st->Ist.Store.addr)),
2734 fold_Expr(f_env, subst_Expr(s_env, st->Ist.Store.data))
2737 case Ist_StoreG: {
2738 IRStoreG* sg = st->Ist.StoreG.details;
2739 vassert(isIRAtom(sg->addr));
2740 vassert(isIRAtom(sg->data));
2741 vassert(isIRAtom(sg->guard));
2742 IRExpr* faddr = fold_Expr(f_env, subst_Expr(s_env, sg->addr));
2743 IRExpr* fdata = fold_Expr(f_env, subst_Expr(s_env, sg->data));
2744 IRExpr* fguard = fold_Expr(f_env, subst_Expr(s_env, sg->guard));
2745 if (fguard->tag == Iex_Const) {
2746 /* The condition on this store has folded down to a constant. */
2747 vassert(fguard->Iex.Const.con->tag == Ico_U1);
2748 if (fguard->Iex.Const.con->Ico.U1 == False) {
2749 return IRStmt_NoOp();
2750 } else {
2751 vassert(fguard->Iex.Const.con->Ico.U1 == True);
2752 return IRStmt_Store(sg->end, faddr, fdata);
2755 return IRStmt_StoreG(sg->end, faddr, fdata, fguard);
2758 case Ist_LoadG: {
2759 /* This is complicated. If the guard folds down to 'false',
2760 we can replace it with an assignment 'dst := alt', but if
2761 the guard folds down to 'true', we can't conveniently
2762 replace it with an unconditional load, because doing so
2763 requires generating a new temporary, and that is not easy
2764 to do at this point. */
2765 IRLoadG* lg = st->Ist.LoadG.details;
2766 vassert(isIRAtom(lg->addr));
2767 vassert(isIRAtom(lg->alt));
2768 vassert(isIRAtom(lg->guard));
2769 IRExpr* faddr = fold_Expr(f_env, subst_Expr(s_env, lg->addr));
2770 IRExpr* falt = fold_Expr(f_env, subst_Expr(s_env, lg->alt));
2771 IRExpr* fguard = fold_Expr(f_env, subst_Expr(s_env, lg->guard));
2772 if (fguard->tag == Iex_Const) {
2773 /* The condition on this load has folded down to a constant. */
2774 vassert(fguard->Iex.Const.con->tag == Ico_U1);
2775 if (fguard->Iex.Const.con->Ico.U1 == False) {
2776 /* The load is not going to happen -- instead 'alt' is
2777 assigned to 'dst'. */
2778 return IRStmt_WrTmp(lg->dst, falt);
2779 } else {
2780 vassert(fguard->Iex.Const.con->Ico.U1 == True);
2781 /* The load is always going to happen. We want to
2782 convert to an unconditional load and assign to 'dst'
2783 (IRStmt_WrTmp). Problem is we need an extra temp to
2784 hold the loaded value, but none is available.
2785 Instead, reconstitute the conditional load (with
2786 folded args, of course) and let the caller of this
2787 routine deal with the problem. */
2790 return IRStmt_LoadG(lg->end, lg->cvt, lg->dst, faddr, falt, fguard);
2793 case Ist_CAS: {
2794 IRCAS *cas, *cas2;
2795 cas = st->Ist.CAS.details;
2796 vassert(isIRAtom(cas->addr));
2797 vassert(cas->expdHi == NULL || isIRAtom(cas->expdHi));
2798 vassert(isIRAtom(cas->expdLo));
2799 vassert(cas->dataHi == NULL || isIRAtom(cas->dataHi));
2800 vassert(isIRAtom(cas->dataLo));
2801 cas2 = mkIRCAS(
2802 cas->oldHi, cas->oldLo, cas->end,
2803 fold_Expr(f_env, subst_Expr(s_env, cas->addr)),
2804 cas->expdHi ? fold_Expr(f_env,
2805 subst_Expr(s_env, cas->expdHi))
2806 : NULL,
2807 fold_Expr(f_env, subst_Expr(s_env, cas->expdLo)),
2808 cas->dataHi ? fold_Expr(f_env,
2809 subst_Expr(s_env, cas->dataHi))
2810 : NULL,
2811 fold_Expr(f_env, subst_Expr(s_env, cas->dataLo))
2813 return IRStmt_CAS(cas2);
2816 case Ist_LLSC:
2817 vassert(isIRAtom(st->Ist.LLSC.addr));
2818 if (st->Ist.LLSC.storedata)
2819 vassert(isIRAtom(st->Ist.LLSC.storedata));
2820 return IRStmt_LLSC(
2821 st->Ist.LLSC.end,
2822 st->Ist.LLSC.result,
2823 fold_Expr(f_env, subst_Expr(s_env, st->Ist.LLSC.addr)),
2824 st->Ist.LLSC.storedata
2825 ? fold_Expr(f_env,
2826 subst_Expr(s_env, st->Ist.LLSC.storedata))
2827 : NULL
2830 case Ist_Dirty: {
2831 Int i;
2832 IRDirty *d, *d2;
2833 d = st->Ist.Dirty.details;
2834 d2 = emptyIRDirty();
2835 *d2 = *d;
2836 d2->args = shallowCopyIRExprVec(d2->args);
2837 if (d2->mFx != Ifx_None) {
2838 vassert(isIRAtom(d2->mAddr));
2839 d2->mAddr = fold_Expr(f_env, subst_Expr(s_env, d2->mAddr));
2841 vassert(isIRAtom(d2->guard));
2842 d2->guard = fold_Expr(f_env, subst_Expr(s_env, d2->guard));
2843 for (i = 0; d2->args[i]; i++) {
2844 IRExpr* arg = d2->args[i];
2845 if (LIKELY(!is_IRExpr_VECRET_or_GSPTR(arg))) {
2846 vassert(isIRAtom(arg));
2847 d2->args[i] = fold_Expr(f_env, subst_Expr(s_env, arg));
2850 return IRStmt_Dirty(d2);
2853 case Ist_IMark:
2854 return IRStmt_IMark(st->Ist.IMark.addr,
2855 st->Ist.IMark.len,
2856 st->Ist.IMark.delta);
2858 case Ist_NoOp:
2859 return IRStmt_NoOp();
2861 case Ist_MBE:
2862 return IRStmt_MBE(st->Ist.MBE.event);
2864 case Ist_Exit: {
2865 IRExpr* fcond;
2866 vassert(isIRAtom(st->Ist.Exit.guard));
2867 fcond = fold_Expr(f_env, subst_Expr(s_env, st->Ist.Exit.guard));
2868 if (fcond->tag == Iex_Const) {
2869 /* Interesting. The condition on this exit has folded down to
2870 a constant. */
2871 vassert(fcond->Iex.Const.con->tag == Ico_U1);
2872 if (fcond->Iex.Const.con->Ico.U1 == False) {
2873 /* exit is never going to happen, so dump the statement. */
2874 return IRStmt_NoOp();
2875 } else {
2876 vassert(fcond->Iex.Const.con->Ico.U1 == True);
2877 /* Hmmm. The exit has become unconditional. Leave it
2878 as it is for now, since we'd have to truncate the BB
2879 at this point, which is tricky. Such truncation is
2880 done later by the dead-code elimination pass. */
2881 /* fall out into the reconstruct-the-exit code. */
2882 if (vex_control.iropt_verbosity > 0)
2883 /* really a misuse of vex_control.iropt_verbosity */
2884 vex_printf("vex iropt: IRStmt_Exit became unconditional\n");
2887 return IRStmt_Exit(fcond, st->Ist.Exit.jk,
2888 st->Ist.Exit.dst, st->Ist.Exit.offsIP);
2891 default:
2892 vex_printf("\n"); ppIRStmt(st);
2893          vpanic("subst_and_maybe_fold_Stmt");
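   /* Added illustration, not part of the original sources: if an earlier
      statement bound t4 to the constant 0:I1 (so env maps t4 |-> 0:I1),
      then an Exit guarded by t4 has its guard substituted to 0:I1; the
      Ist_Exit case above sees a constant-False condition and rewrites the
      whole statement to a NoOp.  The temp number is made up for the
      example. */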
2898 __attribute__((noinline))
2899 static IRSB* cprop_BB_WRK ( IRSB* in, Bool mustRetainNoOps, Bool doFolding )
2901 Int i;
2902 IRSB* out;
2903 IRStmt* st2;
2904 Int n_tmps = in->tyenv->types_used;
2905 IRExpr** env = LibVEX_Alloc_inline(n_tmps * sizeof(IRExpr*));
2906 /* Keep track of IRStmt_LoadGs that we need to revisit after
2907 processing all the other statements. */
2908 const Int N_FIXUPS = 16;
2909 Int fixups[N_FIXUPS]; /* indices in the stmt array of 'out' */
2910 Int n_fixups = 0;
2912 out = emptyIRSB();
2913 out->tyenv = deepCopyIRTypeEnv( in->tyenv );
2915    /* Set up the env which travels forward.  This holds a
2916 substitution, mapping IRTemps to IRExprs. The environment
2917 is to be applied as we move along. Keys are IRTemps.
2918 Values are IRExpr*s.
2920 for (i = 0; i < n_tmps; i++)
2921 env[i] = NULL;
2923 /* For each original SSA-form stmt ... */
2924 for (i = 0; i < in->stmts_used; i++) {
2926 /* First apply the substitution to the current stmt. This
2927 propagates in any constants and tmp-tmp assignments
2928 accumulated prior to this point. As part of the subst_Stmt
2929 call, also then fold any constant expressions resulting. */
2931 st2 = in->stmts[i];
2933 /* perhaps st2 is already a no-op? */
2934 if (st2->tag == Ist_NoOp && !mustRetainNoOps) continue;
2936 st2 = subst_and_maybe_fold_Stmt( doFolding, env, st2 );
2938 /* Deal with some post-folding special cases. */
2939 switch (st2->tag) {
2941 /* If the statement has been folded into a no-op, forget
2942 it. */
2943 case Ist_NoOp:
2944 if (mustRetainNoOps) {
2945 break;
2946 } else {
2947 continue;
2950 /* If the statement assigns to an IRTemp add it to the
2951 running environment. This is for the benefit of copy
2952 propagation and to allow sameIRExpr look through
2953 IRTemps. */
2954 case Ist_WrTmp: {
2955 vassert(env[(Int)(st2->Ist.WrTmp.tmp)] == NULL);
2956 env[(Int)(st2->Ist.WrTmp.tmp)] = st2->Ist.WrTmp.data;
2958 /* 't1 = t2' -- don't add to BB; will be optimized out */
2959 if (st2->Ist.WrTmp.data->tag == Iex_RdTmp)
2960 continue;
2962 /* 't = const' && 'const != F64i' -- don't add to BB
2963 Note, we choose not to propagate const when const is an
2964 F64i, so that F64i literals can be CSE'd later. This
2965 helps x86 floating point code generation. */
2966 if (st2->Ist.WrTmp.data->tag == Iex_Const
2967 && st2->Ist.WrTmp.data->Iex.Const.con->tag != Ico_F64i) {
2968 continue;
2970 /* else add it to the output, as normal */
2971 break;
2974 case Ist_LoadG: {
2975 IRLoadG* lg = st2->Ist.LoadG.details;
2976 IRExpr* guard = lg->guard;
2977 if (guard->tag == Iex_Const) {
2978 /* The guard has folded to a constant, and that
2979 constant must be 1:I1, since subst_and_maybe_fold_Stmt
2980 folds out the case 0:I1 by itself. */
2981 vassert(guard->Iex.Const.con->tag == Ico_U1);
2982 vassert(guard->Iex.Const.con->Ico.U1 == True);
2983 /* Add a NoOp here as a placeholder, and make a note of
2984 where it is in the output block. Afterwards we'll
2985 come back here and transform the NoOp and the LoadG
2986 into a load-convert pair. The fixups[] entry
2987 refers to the inserted NoOp, and we expect to find
2988 the relevant LoadG immediately after it. */
2989 vassert(n_fixups >= 0 && n_fixups <= N_FIXUPS);
2990 if (n_fixups < N_FIXUPS) {
2991 fixups[n_fixups++] = out->stmts_used;
2992 addStmtToIRSB( out, IRStmt_NoOp() );
2995 /* And always add the LoadG to the output, regardless. */
2996 break;
2999 default:
3000 break;
3003 /* Not interesting, copy st2 into the output block. */
3004 addStmtToIRSB( out, st2 );
3007 # if STATS_IROPT
3008 vex_printf("sameIRExpr: invoked = %u/%u equal = %u/%u max_nodes = %u\n",
3009 invocation_count, recursion_count, success_count,
3010 recursion_success_count, max_nodes_visited);
3011 # endif
3013 out->next = subst_Expr( env, in->next );
3014 out->jumpkind = in->jumpkind;
3015 out->offsIP = in->offsIP;
3017 /* Process any leftover unconditional LoadGs that we noticed
3018 in the main pass. */
3019 vassert(n_fixups >= 0 && n_fixups <= N_FIXUPS);
3020 for (i = 0; i < n_fixups; i++) {
3021 Int ix = fixups[i];
3022 /* Carefully verify that the LoadG has the expected form. */
3023 vassert(ix >= 0 && ix+1 < out->stmts_used);
3024 IRStmt* nop = out->stmts[ix];
3025 IRStmt* lgu = out->stmts[ix+1];
3026 vassert(nop->tag == Ist_NoOp);
3027 vassert(lgu->tag == Ist_LoadG);
3028 IRLoadG* lg = lgu->Ist.LoadG.details;
3029 IRExpr* guard = lg->guard;
3030 vassert(guard->Iex.Const.con->tag == Ico_U1);
3031 vassert(guard->Iex.Const.con->Ico.U1 == True);
3032 /* Figure out the load and result types, and the implied
3033 conversion operation. */
3034 IRType cvtRes = Ity_INVALID, cvtArg = Ity_INVALID;
3035 typeOfIRLoadGOp(lg->cvt, &cvtRes, &cvtArg);
3036 IROp cvtOp = Iop_INVALID;
3037 switch (lg->cvt) {
3038 case ILGop_IdentV128:
3039 case ILGop_Ident64:
3040 case ILGop_Ident32: break;
3041 case ILGop_8Uto32: cvtOp = Iop_8Uto32; break;
3042 case ILGop_8Sto32: cvtOp = Iop_8Sto32; break;
3043 case ILGop_16Uto32: cvtOp = Iop_16Uto32; break;
3044 case ILGop_16Sto32: cvtOp = Iop_16Sto32; break;
3045 default: vpanic("cprop_BB: unhandled ILGOp");
3047 /* Replace the placeholder NoOp by the required unconditional
3048 load. */
3049 IRTemp tLoaded = newIRTemp(out->tyenv, cvtArg);
3050 out->stmts[ix]
3051 = IRStmt_WrTmp(tLoaded,
3052 IRExpr_Load(lg->end, cvtArg, lg->addr));
3053 /* Replace the LoadG by a conversion from the loaded value's
3054 type to the required result type. */
3055 out->stmts[ix+1]
3056 = IRStmt_WrTmp(
3057 lg->dst, cvtOp == Iop_INVALID
3058 ? IRExpr_RdTmp(tLoaded)
3059 : IRExpr_Unop(cvtOp, IRExpr_RdTmp(tLoaded)));
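      /* Added illustration, not part of the original sources: a guarded
         load of an I8 at address t7, converted with ILGop_8Uto32 into t9,
         with alternative t8 and a guard that has folded to 1:I1, is
         rewritten by this fixup into the unconditional pair
            t20 = LDle:I8(t7)
            t9  = 8Uto32(t20)
         where t20 stands for the freshly allocated tLoaded temp; the temp
         numbers are made up for the example. */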
3062 return out;
3066 IRSB* cprop_BB ( IRSB* in ) {
3067 return cprop_BB_WRK(in, /*mustRetainNoOps=*/False, /*doFolding=*/True);
3071 /*---------------------------------------------------------------*/
3072 /*--- Dead code (t = E) removal ---*/
3073 /*---------------------------------------------------------------*/
3075 /* As a side effect, also removes all code following an unconditional
3076 side exit. */
3078 /* The 'set' carried around below is an array of Bools indexed by IRTemp
3079    -- really just representing a set of IRTemps.
3082 inline
3083 static void addUses_Temp ( Bool* set, IRTemp tmp )
3085 set[(Int)tmp] = True;
3088 static void addUses_Expr ( Bool* set, IRExpr* e )
3090 Int i;
3091 switch (e->tag) {
3092 case Iex_GetI:
3093 addUses_Expr(set, e->Iex.GetI.ix);
3094 return;
3095 case Iex_ITE:
3096 addUses_Expr(set, e->Iex.ITE.cond);
3097 addUses_Expr(set, e->Iex.ITE.iftrue);
3098 addUses_Expr(set, e->Iex.ITE.iffalse);
3099 return;
3100 case Iex_CCall:
3101 for (i = 0; e->Iex.CCall.args[i]; i++)
3102 addUses_Expr(set, e->Iex.CCall.args[i]);
3103 return;
3104 case Iex_Load:
3105 addUses_Expr(set, e->Iex.Load.addr);
3106 return;
3107 case Iex_Qop:
3108 addUses_Expr(set, e->Iex.Qop.details->arg1);
3109 addUses_Expr(set, e->Iex.Qop.details->arg2);
3110 addUses_Expr(set, e->Iex.Qop.details->arg3);
3111 addUses_Expr(set, e->Iex.Qop.details->arg4);
3112 return;
3113 case Iex_Triop:
3114 addUses_Expr(set, e->Iex.Triop.details->arg1);
3115 addUses_Expr(set, e->Iex.Triop.details->arg2);
3116 addUses_Expr(set, e->Iex.Triop.details->arg3);
3117 return;
3118 case Iex_Binop:
3119 addUses_Expr(set, e->Iex.Binop.arg1);
3120 addUses_Expr(set, e->Iex.Binop.arg2);
3121 return;
3122 case Iex_Unop:
3123 addUses_Expr(set, e->Iex.Unop.arg);
3124 return;
3125 case Iex_RdTmp:
3126 addUses_Temp(set, e->Iex.RdTmp.tmp);
3127 return;
3128 case Iex_Const:
3129 case Iex_Get:
3130 return;
3131 default:
3132 vex_printf("\n");
3133 ppIRExpr(e);
3134 vpanic("addUses_Expr");
3138 static void addUses_Stmt ( Bool* set, IRStmt* st )
3140 Int i;
3141 IRDirty* d;
3142 IRCAS* cas;
3143 switch (st->tag) {
3144 case Ist_AbiHint:
3145 addUses_Expr(set, st->Ist.AbiHint.base);
3146 addUses_Expr(set, st->Ist.AbiHint.nia);
3147 return;
3148 case Ist_PutI:
3149 addUses_Expr(set, st->Ist.PutI.details->ix);
3150 addUses_Expr(set, st->Ist.PutI.details->data);
3151 return;
3152 case Ist_WrTmp:
3153 addUses_Expr(set, st->Ist.WrTmp.data);
3154 return;
3155 case Ist_Put:
3156 addUses_Expr(set, st->Ist.Put.data);
3157 return;
3158 case Ist_Store:
3159 addUses_Expr(set, st->Ist.Store.addr);
3160 addUses_Expr(set, st->Ist.Store.data);
3161 return;
3162 case Ist_StoreG: {
3163 IRStoreG* sg = st->Ist.StoreG.details;
3164 addUses_Expr(set, sg->addr);
3165 addUses_Expr(set, sg->data);
3166 addUses_Expr(set, sg->guard);
3167 return;
3169 case Ist_LoadG: {
3170 IRLoadG* lg = st->Ist.LoadG.details;
3171 addUses_Expr(set, lg->addr);
3172 addUses_Expr(set, lg->alt);
3173 addUses_Expr(set, lg->guard);
3174 return;
3176 case Ist_CAS:
3177 cas = st->Ist.CAS.details;
3178 addUses_Expr(set, cas->addr);
3179 if (cas->expdHi)
3180 addUses_Expr(set, cas->expdHi);
3181 addUses_Expr(set, cas->expdLo);
3182 if (cas->dataHi)
3183 addUses_Expr(set, cas->dataHi);
3184 addUses_Expr(set, cas->dataLo);
3185 return;
3186 case Ist_LLSC:
3187 addUses_Expr(set, st->Ist.LLSC.addr);
3188 if (st->Ist.LLSC.storedata)
3189 addUses_Expr(set, st->Ist.LLSC.storedata);
3190 return;
3191 case Ist_Dirty:
3192 d = st->Ist.Dirty.details;
3193 if (d->mFx != Ifx_None)
3194 addUses_Expr(set, d->mAddr);
3195 addUses_Expr(set, d->guard);
3196 for (i = 0; d->args[i] != NULL; i++) {
3197 IRExpr* arg = d->args[i];
3198 if (LIKELY(!is_IRExpr_VECRET_or_GSPTR(arg)))
3199 addUses_Expr(set, arg);
3201 return;
3202 case Ist_NoOp:
3203 case Ist_IMark:
3204 case Ist_MBE:
3205 return;
3206 case Ist_Exit:
3207 addUses_Expr(set, st->Ist.Exit.guard);
3208 return;
3209 default:
3210 vex_printf("\n");
3211 ppIRStmt(st);
3212 vpanic("addUses_Stmt");
3217 /* Is this literally IRExpr_Const(IRConst_U1(False)) ? */
3218 static Bool isZeroU1 ( IRExpr* e )
3220 return toBool( e->tag == Iex_Const
3221 && e->Iex.Const.con->tag == Ico_U1
3222 && e->Iex.Const.con->Ico.U1 == False );
3225 /* Is this literally IRExpr_Const(IRConst_U1(True)) ? */
3226 static Bool isOneU1 ( IRExpr* e )
3228 return toBool( e->tag == Iex_Const
3229 && e->Iex.Const.con->tag == Ico_U1
3230 && e->Iex.Const.con->Ico.U1 == True );
3234 /* Note, this destructively modifies the given IRSB. */
3236 /* Scan backwards through statements, carrying a set of IRTemps which
3237 are known to be used after the current point. On encountering 't =
3238 E', delete the binding if it is not used. Otherwise, add any temp
3239 uses to the set and keep on moving backwards.
3241 As an enhancement, the first (backwards) pass searches for IR exits
3242 with always-taken conditions and notes the location of the earliest
3243 one in the block. If any such are found, a second pass copies the
3244 exit destination and jump kind to the bb-end. Then, the exit and
3245 all statements following it are turned into no-ops.
3248 /* notstatic */ void do_deadcode_BB ( IRSB* bb )
3250 Int i, i_unconditional_exit;
3251 Int n_tmps = bb->tyenv->types_used;
3252 Bool* set = LibVEX_Alloc_inline(n_tmps * sizeof(Bool));
3253 IRStmt* st;
3255 for (i = 0; i < n_tmps; i++)
3256 set[i] = False;
3258 /* start off by recording IRTemp uses in the next field. */
3259 addUses_Expr(set, bb->next);
3261 /* First pass */
3263 /* Work backwards through the stmts */
3264 i_unconditional_exit = -1;
3265 for (i = bb->stmts_used-1; i >= 0; i--) {
3266 st = bb->stmts[i];
3267 if (st->tag == Ist_NoOp)
3268 continue;
3269 /* take note of any unconditional exits */
3270 if (st->tag == Ist_Exit
3271 && isOneU1(st->Ist.Exit.guard))
3272 i_unconditional_exit = i;
3273 if (st->tag == Ist_WrTmp
3274 && set[(Int)(st->Ist.WrTmp.tmp)] == False) {
3275 /* it's an IRTemp which never got used. Delete it. */
3276 if (DEBUG_IROPT) {
3277 vex_printf("DEAD: ");
3278 ppIRStmt(st);
3279 vex_printf("\n");
3281 bb->stmts[i] = IRStmt_NoOp();
3283 else
3284 if (st->tag == Ist_Dirty
3285 && st->Ist.Dirty.details->guard
3286 && isZeroU1(st->Ist.Dirty.details->guard)) {
3287 /* This is a dirty helper which will never get called.
3288 Delete it. */
3289 bb->stmts[i] = IRStmt_NoOp();
3291 else {
3292 /* Note any IRTemp uses made by the current statement. */
3293 addUses_Stmt(set, st);
3297 /* Optional second pass: if any unconditional exits were found,
3298 delete them and all following statements. */
3300 if (i_unconditional_exit != -1) {
3301 if (0) vex_printf("ZAPPING ALL FORWARDS from %d\n",
3302 i_unconditional_exit);
3303 vassert(i_unconditional_exit >= 0
3304 && i_unconditional_exit < bb->stmts_used);
3305 bb->next
3306 = IRExpr_Const( bb->stmts[i_unconditional_exit]->Ist.Exit.dst );
3307 bb->jumpkind
3308 = bb->stmts[i_unconditional_exit]->Ist.Exit.jk;
3309 bb->offsIP
3310 = bb->stmts[i_unconditional_exit]->Ist.Exit.offsIP;
3311 for (i = i_unconditional_exit; i < bb->stmts_used; i++)
3312 bb->stmts[i] = IRStmt_NoOp();
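   /* Added illustration, not part of the original sources: in a block
         t1 = GET:I32(0)
         t2 = Add32(t1,t1)
         PUT(0) = t1
      the backwards scan finds that t2 is never used, so its binding is
      replaced by a NoOp, while the binding of t1 survives because the
      PUT uses it.  The temp numbers and offsets are made up for the
      example. */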
3317 /*---------------------------------------------------------------*/
3318 /*--- Specialisation of helper function calls, in ---*/
3319 /*--- collaboration with the front end ---*/
3320 /*---------------------------------------------------------------*/
3322 static
3323 IRSB* spec_helpers_BB(
3324 IRSB* bb,
3325 IRExpr* (*specHelper) (const HChar*, IRExpr**, IRStmt**, Int)
3328 Int i;
3329 IRStmt* st;
3330 IRExpr* ex;
3331 Bool any = False;
3333 for (i = bb->stmts_used-1; i >= 0; i--) {
3334 st = bb->stmts[i];
3336 if (st->tag != Ist_WrTmp
3337 || st->Ist.WrTmp.data->tag != Iex_CCall)
3338 continue;
3340 ex = (*specHelper)( st->Ist.WrTmp.data->Iex.CCall.cee->name,
3341 st->Ist.WrTmp.data->Iex.CCall.args,
3342 &bb->stmts[0], i );
3343 if (!ex)
3344 /* the front end can't think of a suitable replacement */
3345 continue;
3347 /* We got something better. Install it in the bb. */
3348 any = True;
3349 bb->stmts[i]
3350 = IRStmt_WrTmp(st->Ist.WrTmp.tmp, ex);
3352 if (0) {
3353 vex_printf("SPEC: ");
3354 ppIRExpr(st->Ist.WrTmp.data);
3355 vex_printf(" --> ");
3356 ppIRExpr(ex);
3357 vex_printf("\n");
3361 if (any)
3362 bb = flatten_BB(bb);
3363 return bb;
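   /* Added note, not part of the original sources: specHelper is free to
      return an arbitrary (non-flat) expression tree for a recognised
      helper call, which is why the block is re-flattened whenever at
      least one call has been replaced. */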
3367 /*---------------------------------------------------------------*/
3368 /*--- Determination of guest state aliasing relationships ---*/
3369 /*---------------------------------------------------------------*/
3371 /* These are helper functions for CSE and GetI/PutI transformations.
3373 Determine, to the extent possible, the relationship between two
3374 guest state accesses. The possible outcomes are:
3376 * Exact alias. These two accesses denote precisely the same
3377 piece of the guest state.
3379 * Definitely no alias. These two accesses are guaranteed not to
3380 overlap any part of the guest state.
3382 * Unknown -- if neither of the above can be established.
3384 If in doubt, return Unknown. */
3386 typedef
3387 enum { ExactAlias, NoAlias, UnknownAlias }
3388 GSAliasing;
3391 /* Produces the alias relation between an indexed guest
3392 state access and a non-indexed access. */
3394 static
3395 GSAliasing getAliasingRelation_IC ( IRRegArray* descr1, IRExpr* ix1,
3396 Int offset2, IRType ty2 )
3398 UInt minoff1, maxoff1, minoff2, maxoff2;
3400 getArrayBounds( descr1, &minoff1, &maxoff1 );
3401 minoff2 = offset2;
3402 maxoff2 = minoff2 + sizeofIRType(ty2) - 1;
3404 if (maxoff1 < minoff2 || maxoff2 < minoff1)
3405 return NoAlias;
3407 /* Could probably do better here if required. For the moment
3408 however just claim not to know anything more. */
3409 return UnknownAlias;
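   /* Added illustration, not part of the original sources: if the array
      described by descr1 covers guest offsets 100..131 and the scalar
      access is 4 bytes at offset 200, the ranges are disjoint and the
      result is NoAlias; a 4-byte access at offset 120 overlaps the array,
      and since the index expression is not examined further, the result
      is UnknownAlias.  The offsets are made up for the example. */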
3413 /* Produces the alias relation between two indexed guest state
3414 accesses. */
3416 static
3417 GSAliasing getAliasingRelation_II (
3418 IRRegArray* descr1, IRExpr* ix1, Int bias1,
3419 IRRegArray* descr2, IRExpr* ix2, Int bias2
3422 UInt minoff1, maxoff1, minoff2, maxoff2;
3423 Int iters;
3425 /* First try hard to show they don't alias. */
3426 getArrayBounds( descr1, &minoff1, &maxoff1 );
3427 getArrayBounds( descr2, &minoff2, &maxoff2 );
3428 if (maxoff1 < minoff2 || maxoff2 < minoff1)
3429 return NoAlias;
3431 /* So the two arrays at least partially overlap. To get any
3432 further we'll have to be sure that the descriptors are
3433 identical. */
3434 if (!eqIRRegArray(descr1, descr2))
3435 return UnknownAlias;
3437 /* The descriptors are identical. Now the only difference can be
3438 in the index expressions. If they cannot be shown to be
3439 identical, we have to say we don't know what the aliasing
3440 relation will be. Now, since the IR is flattened, the index
3441 expressions should be atoms -- either consts or tmps. So that
3442 makes the comparison simple. */
3443 vassert(isIRAtom(ix1));
3444 vassert(isIRAtom(ix2));
3445 if (!eqIRAtom(ix1,ix2))
3446 return UnknownAlias;
3448 /* Ok, the index expressions are identical. So now the only way
3449 they can be different is in the bias. Normalise this
3450 paranoidly, to reliably establish equality/non-equality. */
3452 /* So now we know that the GetI and PutI index the same array
3453 with the same base. Are the offsets the same, modulo the
3454 array size? Do this paranoidly. */
3455 vassert(descr1->nElems == descr2->nElems);
3456 vassert(descr1->elemTy == descr2->elemTy);
3457 vassert(descr1->base == descr2->base);
3458 iters = 0;
3459 while (bias1 < 0 || bias2 < 0) {
3460 bias1 += descr1->nElems;
3461 bias2 += descr1->nElems;
3462 iters++;
3463 if (iters > 10)
3464 vpanic("getAliasingRelation: iters");
3466 vassert(bias1 >= 0 && bias2 >= 0);
3467 bias1 %= descr1->nElems;
3468 bias2 %= descr1->nElems;
3469 vassert(bias1 >= 0 && bias1 < descr1->nElems);
3470 vassert(bias2 >= 0 && bias2 < descr1->nElems);
3472    /* Finally, bias1 and bias2 are normalised into the range
3473       0 .. descr1->nElems - 1.  And so we can establish
3474 equality/non-equality. */
3476 return bias1==bias2 ? ExactAlias : NoAlias;
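   /* Added illustration, not part of the original sources: with identical
      descriptors of nElems == 8 and equal index atoms, biases of -3 and 5
      are first shifted non-negative (-3+8 == 5, 5+8 == 13) and then
      reduced mod 8, giving 5 and 5, hence ExactAlias; biases of -3 and 6
      would normalise to 5 and 6, hence NoAlias.  The numbers are made up
      for the example. */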
3480 /*---------------------------------------------------------------*/
3481 /*--- Common Subexpression Elimination ---*/
3482 /*---------------------------------------------------------------*/
3484 /* Expensive in time and space. */
3486 /* Uses two environments:
3487 a IRTemp -> IRTemp mapping
3488 a mapping from AvailExpr* to IRTemp
3491 typedef
3492 struct {
3493 enum { TCc, TCt } tag;
3494 union { IRTemp tmp; IRConst* con; } u;
3496 TmpOrConst;
3498 static Bool eqTmpOrConst ( TmpOrConst* tc1, TmpOrConst* tc2 )
3500 if (tc1->tag != tc2->tag)
3501 return False;
3502 switch (tc1->tag) {
3503 case TCc:
3504 return eqIRConst(tc1->u.con, tc2->u.con);
3505 case TCt:
3506 return tc1->u.tmp == tc2->u.tmp;
3507 default:
3508 vpanic("eqTmpOrConst");
3512 static Bool eqIRCallee ( IRCallee* cee1, IRCallee* cee2 )
3514 Bool eq = cee1->addr == cee2->addr;
3515 if (eq) {
3516 vassert(cee1->regparms == cee2->regparms);
3517 vassert(cee1->mcx_mask == cee2->mcx_mask);
3518 /* Names should be the same too, but we don't bother to
3519 check. */
3521 return eq;
3524 /* Convert an atomic IRExpr* to a TmpOrConst. */
3525 static void irExpr_to_TmpOrConst ( /*OUT*/TmpOrConst* tc, IRExpr* e )
3527 switch (e->tag) {
3528 case Iex_RdTmp:
3529 tc->tag = TCt;
3530 tc->u.tmp = e->Iex.RdTmp.tmp;
3531 break;
3532 case Iex_Const:
3533 tc->tag = TCc;
3534 tc->u.con = e->Iex.Const.con;
3535 break;
3536 default:
3537 /* Getting here is a serious error. It means that the
3538 presented arg isn't an IR atom, as it should be. */
3539 vpanic("irExpr_to_TmpOrConst");
3543 /* Convert a TmpOrConst to an atomic IRExpr*. */
3544 static IRExpr* tmpOrConst_to_IRExpr ( TmpOrConst* tc )
3546 switch (tc->tag) {
3547 case TCc: return IRExpr_Const(tc->u.con);
3548 case TCt: return IRExpr_RdTmp(tc->u.tmp);
3549 default: vpanic("tmpOrConst_to_IRExpr");
3553 /* Convert a NULL terminated IRExpr* vector to an array of
3554 TmpOrConsts, and a length. */
3555 static void irExprVec_to_TmpOrConsts ( /*OUT*/TmpOrConst** outs,
3556 /*OUT*/Int* nOuts,
3557 IRExpr** ins )
3559 Int i, n;
3560 /* We have to make two passes, one to count, one to copy. */
3561 for (n = 0; ins[n]; n++)
3563 *outs = LibVEX_Alloc_inline(n * sizeof(TmpOrConst));
3564 *nOuts = n;
3565 /* and now copy .. */
3566 for (i = 0; i < n; i++) {
3567 IRExpr* arg = ins[i];
3568 TmpOrConst* dst = &(*outs)[i];
3569 irExpr_to_TmpOrConst(dst, arg);
3573 typedef
3574 struct {
3575 enum { Ut, Btt, Btc, Bct, Cf64i, Ittt, Itct, Ittc, Itcc, GetIt,
3576 CCall, Load
3577 } tag;
3578 union {
3579 /* unop(tmp) */
3580 struct {
3581 IROp op;
3582 IRTemp arg;
3583 } Ut;
3584 /* binop(tmp,tmp) */
3585 struct {
3586 IROp op;
3587 IRTemp arg1;
3588 IRTemp arg2;
3589 } Btt;
3590 /* binop(tmp,const) */
3591 struct {
3592 IROp op;
3593 IRTemp arg1;
3594 IRConst con2;
3595 } Btc;
3596 /* binop(const,tmp) */
3597 struct {
3598 IROp op;
3599 IRConst con1;
3600 IRTemp arg2;
3601 } Bct;
3602 /* F64i-style const */
3603 struct {
3604 ULong f64i;
3605 } Cf64i;
3606 /* ITE(tmp,tmp,tmp) */
3607 struct {
3608 IRTemp co;
3609 IRTemp e1;
3610 IRTemp e0;
3611 } Ittt;
3612 /* ITE(tmp,tmp,const) */
3613 struct {
3614 IRTemp co;
3615 IRTemp e1;
3616 IRConst con0;
3617 } Ittc;
3618 /* ITE(tmp,const,tmp) */
3619 struct {
3620 IRTemp co;
3621 IRConst con1;
3622 IRTemp e0;
3623 } Itct;
3624 /* ITE(tmp,const,const) */
3625 struct {
3626 IRTemp co;
3627 IRConst con1;
3628 IRConst con0;
3629 } Itcc;
3630 /* GetI(descr,tmp,bias)*/
3631 struct {
3632 IRRegArray* descr;
3633 IRTemp ix;
3634 Int bias;
3635 } GetIt;
3636 /* Clean helper call */
3637 struct {
3638 IRCallee* cee;
3639 TmpOrConst* args;
3640 Int nArgs;
3641 IRType retty;
3642 } CCall;
3643 /* Load(end,ty,addr) */
3644 struct {
3645 IREndness end;
3646 IRType ty;
3647 TmpOrConst addr;
3648 } Load;
3649 } u;
3651 AvailExpr;
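/* Added illustration, not part of the original sources: the RHS
   Add32(t3,0x4:I32) is summarised as a Btc node with op == Iop_Add32,
   arg1 == t3 and con2 == 0x4:I32; a later statement computing the same
   RHS yields an equal AvailExpr, so CSE can reuse the earlier temp
   instead of recomputing it.  The temp number is made up for the
   example. */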
3653 static Bool eq_AvailExpr ( AvailExpr* a1, AvailExpr* a2 )
3655 if (LIKELY(a1->tag != a2->tag))
3656 return False;
3657 switch (a1->tag) {
3658 case Ut:
3659 return toBool(
3660 a1->u.Ut.op == a2->u.Ut.op
3661 && a1->u.Ut.arg == a2->u.Ut.arg);
3662 case Btt:
3663 return toBool(
3664 a1->u.Btt.op == a2->u.Btt.op
3665 && a1->u.Btt.arg1 == a2->u.Btt.arg1
3666 && a1->u.Btt.arg2 == a2->u.Btt.arg2);
3667 case Btc:
3668 return toBool(
3669 a1->u.Btc.op == a2->u.Btc.op
3670 && a1->u.Btc.arg1 == a2->u.Btc.arg1
3671 && eqIRConst(&a1->u.Btc.con2, &a2->u.Btc.con2));
3672 case Bct:
3673 return toBool(
3674 a1->u.Bct.op == a2->u.Bct.op
3675 && a1->u.Bct.arg2 == a2->u.Bct.arg2
3676 && eqIRConst(&a1->u.Bct.con1, &a2->u.Bct.con1));
3677 case Cf64i:
3678 return toBool(a1->u.Cf64i.f64i == a2->u.Cf64i.f64i);
3679 case Ittt:
3680 return toBool(a1->u.Ittt.co == a2->u.Ittt.co
3681 && a1->u.Ittt.e1 == a2->u.Ittt.e1
3682 && a1->u.Ittt.e0 == a2->u.Ittt.e0);
3683 case Ittc:
3684 return toBool(a1->u.Ittc.co == a2->u.Ittc.co
3685 && a1->u.Ittc.e1 == a2->u.Ittc.e1
3686 && eqIRConst(&a1->u.Ittc.con0, &a2->u.Ittc.con0));
3687 case Itct:
3688 return toBool(a1->u.Itct.co == a2->u.Itct.co
3689 && eqIRConst(&a1->u.Itct.con1, &a2->u.Itct.con1)
3690 && a1->u.Itct.e0 == a2->u.Itct.e0);
3691 case Itcc:
3692 return toBool(a1->u.Itcc.co == a2->u.Itcc.co
3693 && eqIRConst(&a1->u.Itcc.con1, &a2->u.Itcc.con1)
3694 && eqIRConst(&a1->u.Itcc.con0, &a2->u.Itcc.con0));
3695 case GetIt:
3696 return toBool(eqIRRegArray(a1->u.GetIt.descr, a2->u.GetIt.descr)
3697 && a1->u.GetIt.ix == a2->u.GetIt.ix
3698 && a1->u.GetIt.bias == a2->u.GetIt.bias);
3699 case CCall: {
3700 Int i, n;
3701 Bool eq = a1->u.CCall.nArgs == a2->u.CCall.nArgs
3702 && eqIRCallee(a1->u.CCall.cee, a2->u.CCall.cee);
3703 if (eq) {
3704 n = a1->u.CCall.nArgs;
3705 for (i = 0; i < n; i++) {
3706 if (!eqTmpOrConst( &a1->u.CCall.args[i],
3707 &a2->u.CCall.args[i] )) {
3708 eq = False;
3709 break;
3713 if (eq) vassert(a1->u.CCall.retty == a2->u.CCall.retty);
3714 return eq;
3716 case Load: {
3717 Bool eq = toBool(a1->u.Load.end == a2->u.Load.end
3718 && a1->u.Load.ty == a2->u.Load.ty
3719 && eqTmpOrConst(&a1->u.Load.addr, &a2->u.Load.addr));
3720 return eq;
3722 default:
3723 vpanic("eq_AvailExpr");
3727 static IRExpr* availExpr_to_IRExpr ( AvailExpr* ae )
3729 IRConst *con, *con0, *con1;
3730 switch (ae->tag) {
3731 case Ut:
3732 return IRExpr_Unop( ae->u.Ut.op, IRExpr_RdTmp(ae->u.Ut.arg) );
3733 case Btt:
3734 return IRExpr_Binop( ae->u.Btt.op,
3735 IRExpr_RdTmp(ae->u.Btt.arg1),
3736 IRExpr_RdTmp(ae->u.Btt.arg2) );
3737 case Btc:
3738 con = LibVEX_Alloc_inline(sizeof(IRConst));
3739 *con = ae->u.Btc.con2;
3740 return IRExpr_Binop( ae->u.Btc.op,
3741 IRExpr_RdTmp(ae->u.Btc.arg1),
3742 IRExpr_Const(con) );
3743 case Bct:
3744 con = LibVEX_Alloc_inline(sizeof(IRConst));
3745 *con = ae->u.Bct.con1;
3746 return IRExpr_Binop( ae->u.Bct.op,
3747 IRExpr_Const(con),
3748 IRExpr_RdTmp(ae->u.Bct.arg2) );
3749 case Cf64i:
3750 return IRExpr_Const(IRConst_F64i(ae->u.Cf64i.f64i));
3751 case Ittt:
3752 return IRExpr_ITE(IRExpr_RdTmp(ae->u.Ittt.co),
3753 IRExpr_RdTmp(ae->u.Ittt.e1),
3754 IRExpr_RdTmp(ae->u.Ittt.e0));
3755 case Ittc:
3756 con0 = LibVEX_Alloc_inline(sizeof(IRConst));
3757 *con0 = ae->u.Ittc.con0;
3758 return IRExpr_ITE(IRExpr_RdTmp(ae->u.Ittc.co),
3759 IRExpr_RdTmp(ae->u.Ittc.e1),
3760 IRExpr_Const(con0));
3761 case Itct:
3762 con1 = LibVEX_Alloc_inline(sizeof(IRConst));
3763 *con1 = ae->u.Itct.con1;
3764 return IRExpr_ITE(IRExpr_RdTmp(ae->u.Itct.co),
3765 IRExpr_Const(con1),
3766 IRExpr_RdTmp(ae->u.Itct.e0));
3768 case Itcc:
3769 con0 = LibVEX_Alloc_inline(sizeof(IRConst));
3770 con1 = LibVEX_Alloc_inline(sizeof(IRConst));
3771 *con0 = ae->u.Itcc.con0;
3772 *con1 = ae->u.Itcc.con1;
3773 return IRExpr_ITE(IRExpr_RdTmp(ae->u.Itcc.co),
3774 IRExpr_Const(con1),
3775 IRExpr_Const(con0));
3776 case GetIt:
3777 return IRExpr_GetI(ae->u.GetIt.descr,
3778 IRExpr_RdTmp(ae->u.GetIt.ix),
3779 ae->u.GetIt.bias);
3780 case CCall: {
3781 Int i, n = ae->u.CCall.nArgs;
3782 vassert(n >= 0);
3783 IRExpr** vec = LibVEX_Alloc_inline((n+1) * sizeof(IRExpr*));
3784 vec[n] = NULL;
3785 for (i = 0; i < n; i++) {
3786 vec[i] = tmpOrConst_to_IRExpr(&ae->u.CCall.args[i]);
3788 return IRExpr_CCall(ae->u.CCall.cee,
3789 ae->u.CCall.retty,
3790 vec);
3792 case Load:
3793 return IRExpr_Load(ae->u.Load.end, ae->u.Load.ty,
3794 tmpOrConst_to_IRExpr(&ae->u.Load.addr));
3795 default:
3796 vpanic("availExpr_to_IRExpr");
3800 inline
3801 static IRTemp subst_AvailExpr_Temp ( HashHW* env, IRTemp tmp )
3803 HWord res;
3804 /* env :: IRTemp -> IRTemp */
3805 if (lookupHHW( env, &res, (HWord)tmp ))
3806 return (IRTemp)res;
3807 else
3808 return tmp;
3811 inline
3812 static void subst_AvailExpr_TmpOrConst ( /*MB_MOD*/TmpOrConst* tc,
3813 HashHW* env )
3815 /* env :: IRTemp -> IRTemp */
3816 if (tc->tag == TCt) {
3817 tc->u.tmp = subst_AvailExpr_Temp( env, tc->u.tmp );
3821 static void subst_AvailExpr ( HashHW* env, AvailExpr* ae )
3823 /* env :: IRTemp -> IRTemp */
3824 switch (ae->tag) {
3825 case Ut:
3826 ae->u.Ut.arg = subst_AvailExpr_Temp( env, ae->u.Ut.arg );
3827 break;
3828 case Btt:
3829 ae->u.Btt.arg1 = subst_AvailExpr_Temp( env, ae->u.Btt.arg1 );
3830 ae->u.Btt.arg2 = subst_AvailExpr_Temp( env, ae->u.Btt.arg2 );
3831 break;
3832 case Btc:
3833 ae->u.Btc.arg1 = subst_AvailExpr_Temp( env, ae->u.Btc.arg1 );
3834 break;
3835 case Bct:
3836 ae->u.Bct.arg2 = subst_AvailExpr_Temp( env, ae->u.Bct.arg2 );
3837 break;
3838 case Cf64i:
3839 break;
3840 case Ittt:
3841 ae->u.Ittt.co = subst_AvailExpr_Temp( env, ae->u.Ittt.co );
3842 ae->u.Ittt.e1 = subst_AvailExpr_Temp( env, ae->u.Ittt.e1 );
3843 ae->u.Ittt.e0 = subst_AvailExpr_Temp( env, ae->u.Ittt.e0 );
3844 break;
3845 case Ittc:
3846 ae->u.Ittc.co = subst_AvailExpr_Temp( env, ae->u.Ittc.co );
3847 ae->u.Ittc.e1 = subst_AvailExpr_Temp( env, ae->u.Ittc.e1 );
3848 break;
3849 case Itct:
3850 ae->u.Itct.co = subst_AvailExpr_Temp( env, ae->u.Itct.co );
3851 ae->u.Itct.e0 = subst_AvailExpr_Temp( env, ae->u.Itct.e0 );
3852 break;
3853 case Itcc:
3854 ae->u.Itcc.co = subst_AvailExpr_Temp( env, ae->u.Itcc.co );
3855 break;
3856 case GetIt:
3857 ae->u.GetIt.ix = subst_AvailExpr_Temp( env, ae->u.GetIt.ix );
3858 break;
3859 case CCall: {
3860 Int i, n = ae->u.CCall.nArgs;
3861 for (i = 0; i < n; i++) {
3862 subst_AvailExpr_TmpOrConst(&ae->u.CCall.args[i], env);
3864 break;
3866 case Load:
3867 subst_AvailExpr_TmpOrConst(&ae->u.Load.addr, env);
3868 break;
3869 default:
3870 vpanic("subst_AvailExpr");
3874 static AvailExpr* irExpr_to_AvailExpr ( IRExpr* e, Bool allowLoadsToBeCSEd )
3876 AvailExpr* ae;
3878 switch (e->tag) {
3879 case Iex_Unop:
3880 if (e->Iex.Unop.arg->tag == Iex_RdTmp) {
3881 ae = LibVEX_Alloc_inline(sizeof(AvailExpr));
3882 ae->tag = Ut;
3883 ae->u.Ut.op = e->Iex.Unop.op;
3884 ae->u.Ut.arg = e->Iex.Unop.arg->Iex.RdTmp.tmp;
3885 return ae;
3887 break;
3889 case Iex_Binop:
3890 if (e->Iex.Binop.arg1->tag == Iex_RdTmp) {
3891 if (e->Iex.Binop.arg2->tag == Iex_RdTmp) {
3892 ae = LibVEX_Alloc_inline(sizeof(AvailExpr));
3893 ae->tag = Btt;
3894 ae->u.Btt.op = e->Iex.Binop.op;
3895 ae->u.Btt.arg1 = e->Iex.Binop.arg1->Iex.RdTmp.tmp;
3896 ae->u.Btt.arg2 = e->Iex.Binop.arg2->Iex.RdTmp.tmp;
3897 return ae;
3899 if (e->Iex.Binop.arg2->tag == Iex_Const) {
3900 ae = LibVEX_Alloc_inline(sizeof(AvailExpr));
3901 ae->tag = Btc;
3902 ae->u.Btc.op = e->Iex.Binop.op;
3903 ae->u.Btc.arg1 = e->Iex.Binop.arg1->Iex.RdTmp.tmp;
3904 ae->u.Btc.con2 = *(e->Iex.Binop.arg2->Iex.Const.con);
3905 return ae;
3907 } else if (e->Iex.Binop.arg1->tag == Iex_Const
3908 && e->Iex.Binop.arg2->tag == Iex_RdTmp) {
3909 ae = LibVEX_Alloc_inline(sizeof(AvailExpr));
3910 ae->tag = Bct;
3911 ae->u.Bct.op = e->Iex.Binop.op;
3912 ae->u.Bct.arg2 = e->Iex.Binop.arg2->Iex.RdTmp.tmp;
3913 ae->u.Bct.con1 = *(e->Iex.Binop.arg1->Iex.Const.con);
3914 return ae;
3916 break;
3918 case Iex_Const:
3919 if (e->Iex.Const.con->tag == Ico_F64i) {
3920 ae = LibVEX_Alloc_inline(sizeof(AvailExpr));
3921 ae->tag = Cf64i;
3922 ae->u.Cf64i.f64i = e->Iex.Const.con->Ico.F64i;
3923 return ae;
3925 break;
3927 case Iex_ITE:
3928 if (e->Iex.ITE.cond->tag == Iex_RdTmp) {
3929 if (e->Iex.ITE.iffalse->tag == Iex_RdTmp) {
3930 if (e->Iex.ITE.iftrue->tag == Iex_RdTmp) {
3931 ae = LibVEX_Alloc_inline(sizeof(AvailExpr));
3932 ae->tag = Ittt;
3933 ae->u.Ittt.co = e->Iex.ITE.cond->Iex.RdTmp.tmp;
3934 ae->u.Ittt.e1 = e->Iex.ITE.iftrue->Iex.RdTmp.tmp;
3935 ae->u.Ittt.e0 = e->Iex.ITE.iffalse->Iex.RdTmp.tmp;
3936 return ae;
3938 if (e->Iex.ITE.iftrue->tag == Iex_Const) {
3939 ae = LibVEX_Alloc_inline(sizeof(AvailExpr));
3940 ae->tag = Itct;
3941 ae->u.Itct.co = e->Iex.ITE.cond->Iex.RdTmp.tmp;
3942 ae->u.Itct.con1 = *(e->Iex.ITE.iftrue->Iex.Const.con);
3943 ae->u.Itct.e0 = e->Iex.ITE.iffalse->Iex.RdTmp.tmp;
3944 return ae;
3946 } else if (e->Iex.ITE.iffalse->tag == Iex_Const) {
3947 if (e->Iex.ITE.iftrue->tag == Iex_RdTmp) {
3948 ae = LibVEX_Alloc_inline(sizeof(AvailExpr));
3949 ae->tag = Ittc;
3950 ae->u.Ittc.co = e->Iex.ITE.cond->Iex.RdTmp.tmp;
3951 ae->u.Ittc.e1 = e->Iex.ITE.iftrue->Iex.RdTmp.tmp;
3952 ae->u.Ittc.con0 = *(e->Iex.ITE.iffalse->Iex.Const.con);
3953 return ae;
3955 if (e->Iex.ITE.iftrue->tag == Iex_Const) {
3956 ae = LibVEX_Alloc_inline(sizeof(AvailExpr));
3957 ae->tag = Itcc;
3958 ae->u.Itcc.co = e->Iex.ITE.cond->Iex.RdTmp.tmp;
3959 ae->u.Itcc.con1 = *(e->Iex.ITE.iftrue->Iex.Const.con);
3960 ae->u.Itcc.con0 = *(e->Iex.ITE.iffalse->Iex.Const.con);
3961 return ae;
3965 break;
3967 case Iex_GetI:
3968 if (e->Iex.GetI.ix->tag == Iex_RdTmp) {
3969 ae = LibVEX_Alloc_inline(sizeof(AvailExpr));
3970 ae->tag = GetIt;
3971 ae->u.GetIt.descr = e->Iex.GetI.descr;
3972 ae->u.GetIt.ix = e->Iex.GetI.ix->Iex.RdTmp.tmp;
3973 ae->u.GetIt.bias = e->Iex.GetI.bias;
3974 return ae;
3976 break;
3978 case Iex_CCall:
3979 ae = LibVEX_Alloc_inline(sizeof(AvailExpr));
3980 ae->tag = CCall;
3981 /* Ok to share only the cee, since it is immutable. */
3982 ae->u.CCall.cee = e->Iex.CCall.cee;
3983 ae->u.CCall.retty = e->Iex.CCall.retty;
3984 /* irExprVec_to_TmpOrConsts will assert if the args are
3985 neither tmps nor constants, but that's ok .. that's all they
3986 should be. */
3987 irExprVec_to_TmpOrConsts(
3988 &ae->u.CCall.args, &ae->u.CCall.nArgs,
3989 e->Iex.CCall.args
3991 return ae;
3993 case Iex_Load:
3994 /* If the caller of do_cse_BB has requested that loads also
3995 be CSEd, convert them into AvailExprs. If not, we'll just
3996 return NULL here, and the load is never considered
3997 "available", which effectively disables CSEing of loads, as
3998 desired. */
3999 if (allowLoadsToBeCSEd) {
4000 ae = LibVEX_Alloc_inline(sizeof(AvailExpr));
4001 ae->tag = Load;
4002 ae->u.Load.end = e->Iex.Load.end;
4003 ae->u.Load.ty = e->Iex.Load.ty;
4004 irExpr_to_TmpOrConst(&ae->u.Load.addr, e->Iex.Load.addr);
4005 return ae;
4007 break;
4009 default:
4010 break;
4013 return NULL;
4017 /* The BB is modified in-place. Returns True if any changes were
4018 made. The caller can choose whether or not loads should be CSEd.
4019 In the normal course of things we don't do that, since CSEing loads
4020 is something of a dodgy proposition if the guest program is doing
4021 some screwy stuff to do with races and spinloops. */
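/* For example, given
      t5 = Add32(t1,t2)
      ...
      t9 = Add32(t1,t2)
   the second statement is rewritten as "t9 = t5" and the binding
   t9 -> t5 is remembered for later substitutions.  Only GetI and
   Load bindings can be invalidated by intervening statements;
   pure-operator bindings like the one above cannot. */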
4023 static Bool do_cse_BB ( IRSB* bb, Bool allowLoadsToBeCSEd )
4025 Int i, j, paranoia;
4026 IRTemp t, q;
4027 IRStmt* st;
4028 AvailExpr* eprime;
4029 AvailExpr* ae;
4030 Bool invalidate;
4031 Bool anyDone = False;
4033 HashHW* tenv = newHHW(); /* :: IRTemp -> IRTemp */
4034 HashHW* aenv = newHHW(); /* :: AvailExpr* -> IRTemp */
4036 vassert(sizeof(IRTemp) <= sizeof(HWord));
4038 if (0) { ppIRSB(bb); vex_printf("\n\n"); }
4040 /* Iterate forwards over the stmts.
4041 On seeing "t = E", where E is one of the AvailExpr forms:
4042 let E' = apply tenv substitution to E
4043 search aenv for E'
4044 if a mapping E' -> q is found,
4045 replace this stmt by "t = q"
4046 and add binding t -> q to tenv
4047 else
4048 add binding E' -> t to aenv
4049 replace this stmt by "t = E'"
4051 Other statements are only interesting to the extent that they
4052 might invalidate some of the expressions in aenv. So there is
4053 an invalidate-bindings check for each statement seen.
4055 for (i = 0; i < bb->stmts_used; i++) {
4056 st = bb->stmts[i];
4058 /* ------ BEGIN invalidate aenv bindings ------ */
4059 /* This is critical: remove from aenv any E' -> .. bindings
4060 which might be invalidated by this statement. The only
4061 vulnerable kinds of bindings are the GetI and Load kinds.
4062 Dirty call - dump (paranoia level -> 2)
4063 Store - dump (ditto)
4064 Put, PutI - dump unless no-overlap is proven (.. -> 1)
4065 Uses getAliasingRelation_IC and getAliasingRelation_II
4066 to do the no-overlap assessments needed for Put/PutI.
4068 switch (st->tag) {
4069 case Ist_Dirty: case Ist_Store: case Ist_MBE:
4070 case Ist_CAS: case Ist_LLSC:
4071 case Ist_StoreG:
4072 paranoia = 2; break;
4073 case Ist_Put: case Ist_PutI:
4074 paranoia = 1; break;
4075 case Ist_NoOp: case Ist_IMark: case Ist_AbiHint:
4076 case Ist_WrTmp: case Ist_Exit: case Ist_LoadG:
4077 paranoia = 0; break;
4078 default:
4079 vpanic("do_cse_BB(1)");
4082 if (paranoia > 0) {
4083 for (j = 0; j < aenv->used; j++) {
4084 if (!aenv->inuse[j])
4085 continue;
4086 ae = (AvailExpr*)aenv->key[j];
4087 if (ae->tag != GetIt && ae->tag != Load)
4088 continue;
4089 invalidate = False;
4090 if (paranoia >= 2) {
4091 invalidate = True;
4092 } else {
4093 vassert(paranoia == 1);
4094 if (ae->tag == Load) {
4095 /* Loads can be invalidated by anything that could
4096 possibly touch memory. But in that case we
4097 should have |paranoia| == 2 and we won't get
4098 here. So there's nothing to do; we don't have to
4099 invalidate the load. */
4101 else
4102 if (st->tag == Ist_Put) {
4103 if (getAliasingRelation_IC(
4104 ae->u.GetIt.descr,
4105 IRExpr_RdTmp(ae->u.GetIt.ix),
4106 st->Ist.Put.offset,
4107 typeOfIRExpr(bb->tyenv,st->Ist.Put.data)
4108 ) != NoAlias)
4109 invalidate = True;
4111 else
4112 if (st->tag == Ist_PutI) {
4113 IRPutI *puti = st->Ist.PutI.details;
4114 if (getAliasingRelation_II(
4115 ae->u.GetIt.descr,
4116 IRExpr_RdTmp(ae->u.GetIt.ix),
4117 ae->u.GetIt.bias,
4118 puti->descr,
4119 puti->ix,
4120 puti->bias
4121 ) != NoAlias)
4122 invalidate = True;
4124 else
4125 vpanic("do_cse_BB(2)");
4128 if (invalidate) {
4129 aenv->inuse[j] = False;
4130 aenv->key[j] = (HWord)NULL; /* be sure */
4132 } /* for j */
4133 } /* paranoia > 0 */
4135 /* ------ END invalidate aenv bindings ------ */
4137 /* ignore not-interestings */
4138 if (st->tag != Ist_WrTmp)
4139 continue;
4141 t = st->Ist.WrTmp.tmp;
4142 eprime = irExpr_to_AvailExpr(st->Ist.WrTmp.data, allowLoadsToBeCSEd);
4143 /* ignore if not of AvailExpr form */
4144 if (!eprime)
4145 continue;
4147 /* vex_printf("considering: " ); ppIRStmt(st); vex_printf("\n"); */
4149 /* apply tenv */
4150 subst_AvailExpr( tenv, eprime );
4152 /* search aenv for eprime, unfortunately the hard way */
4153 for (j = 0; j < aenv->used; j++)
4154 if (aenv->inuse[j] && eq_AvailExpr(eprime, (AvailExpr*)aenv->key[j]))
4155 break;
4157 if (j < aenv->used) {
4158 /* A binding E' -> q was found. Replace stmt by "t = q" and
4159 note the t->q binding in tenv. */
4160 /* (this is the core of the CSE action) */
4161 q = (IRTemp)aenv->val[j];
4162 bb->stmts[i] = IRStmt_WrTmp( t, IRExpr_RdTmp(q) );
4163 addToHHW( tenv, (HWord)t, (HWord)q );
4164 anyDone = True;
4165 } else {
4166 /* No binding was found, so instead we add E' -> t to our
4167 collection of available expressions, replace this stmt
4168 with "t = E'", and move on. */
4169 bb->stmts[i] = IRStmt_WrTmp( t, availExpr_to_IRExpr(eprime) );
4170 addToHHW( aenv, (HWord)eprime, (HWord)t );
4175 ppIRSB(bb);
4176 sanityCheckIRSB(bb, Ity_I32);
4177 vex_printf("\n\n");
4179 return anyDone;
4183 /*---------------------------------------------------------------*/
4184 /*--- Add32/Sub32 chain collapsing ---*/
4185 /*---------------------------------------------------------------*/
4187 /* ----- Helper functions for Add32/Sub32 chain collapsing ----- */
4189 /* Is this expression "Add32(tmp,const)" or "Sub32(tmp,const)" ? If
4190 yes, set *tmp and *i32 appropriately. *i32 is set as if the
4191 root node is Add32, not Sub32. */
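/* For example, "Add32(t7,0x10:I32)" yields *tmp = t7, *i32 = 16,
   while "Sub32(t7,0x4:I32)" yields *tmp = t7, *i32 = -4. */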
4193 static Bool isAdd32OrSub32 ( IRExpr* e, IRTemp* tmp, Int* i32 )
4195 if (e->tag != Iex_Binop)
4196 return False;
4197 if (e->Iex.Binop.op != Iop_Add32 && e->Iex.Binop.op != Iop_Sub32)
4198 return False;
4199 if (e->Iex.Binop.arg1->tag != Iex_RdTmp)
4200 return False;
4201 if (e->Iex.Binop.arg2->tag != Iex_Const)
4202 return False;
4203 *tmp = e->Iex.Binop.arg1->Iex.RdTmp.tmp;
4204 *i32 = (Int)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U32);
4205 if (e->Iex.Binop.op == Iop_Sub32)
4206 *i32 = -*i32;
4207 return True;
4211 /* Figure out if tmp can be expressed as tmp2 +32 const, for some
4212 other tmp2. Scan backwards from the specified start point -- an
4213 optimisation. */
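/* For example, given
      t9  = Add32(t7,0x10:I32)
      t11 = Sub32(t9,0x4:I32)
   a query for t11 produces *tmp2 = t7 and *i32 = 12, since t11
   equals t7 + 12. */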
4215 static Bool collapseChain ( IRSB* bb, Int startHere,
4216 IRTemp tmp,
4217 IRTemp* tmp2, Int* i32 )
4219 Int j, ii;
4220 IRTemp vv;
4221 IRStmt* st;
4222 IRExpr* e;
4224 /* the (var, con) pair contains the current 'representation' for
4225 'tmp'. We start with 'tmp + 0'. */
4226 IRTemp var = tmp;
4227 Int con = 0;
4229 /* Scan backwards to see if tmp can be replaced by some other tmp
4230 +/- a constant. */
4231 for (j = startHere; j >= 0; j--) {
4232 st = bb->stmts[j];
4233 if (st->tag != Ist_WrTmp)
4234 continue;
4235 if (st->Ist.WrTmp.tmp != var)
4236 continue;
4237 e = st->Ist.WrTmp.data;
4238 if (!isAdd32OrSub32(e, &vv, &ii))
4239 break;
4240 var = vv;
4241 con += ii;
4243 if (j == -1)
4244 /* no earlier binding for var .. ill-formed IR */
4245 vpanic("collapseChain");
4247 /* so, did we find anything interesting? */
4248 if (var == tmp)
4249 return False; /* no .. */
4251 *tmp2 = var;
4252 *i32 = con;
4253 return True;
4257 /* ------- Main function for Add32/Sub32 chain collapsing ------ */
4259 static void collapse_AddSub_chains_BB ( IRSB* bb )
4261 IRStmt *st;
4262 IRTemp var, var2;
4263 Int i, con, con2;
4265 for (i = bb->stmts_used-1; i >= 0; i--) {
4266 st = bb->stmts[i];
4267 if (st->tag == Ist_NoOp)
4268 continue;
4270 /* Try to collapse 't1 = Add32/Sub32(t2, con)'. */
4272 if (st->tag == Ist_WrTmp
4273 && isAdd32OrSub32(st->Ist.WrTmp.data, &var, &con)) {
4275 /* So e1 is of the form Add32(var,con) or Sub32(var,-con).
4276 Find out if var can be expressed as var2 + con2. */
4277 if (collapseChain(bb, i-1, var, &var2, &con2)) {
4278 if (DEBUG_IROPT) {
4279 vex_printf("replacing1 ");
4280 ppIRStmt(st);
4281 vex_printf(" with ");
4283 con2 += con;
4284 bb->stmts[i]
4285 = IRStmt_WrTmp(
4286 st->Ist.WrTmp.tmp,
4287 (con2 >= 0)
4288 ? IRExpr_Binop(Iop_Add32,
4289 IRExpr_RdTmp(var2),
4290 IRExpr_Const(IRConst_U32(con2)))
4291 : IRExpr_Binop(Iop_Sub32,
4292 IRExpr_RdTmp(var2),
4293 IRExpr_Const(IRConst_U32(-con2)))
4295 if (DEBUG_IROPT) {
4296 ppIRStmt(bb->stmts[i]);
4297 vex_printf("\n");
4301 continue;
4304 /* Try to collapse 't1 = GetI[t2, con]'. */
4306 if (st->tag == Ist_WrTmp
4307 && st->Ist.WrTmp.data->tag == Iex_GetI
4308 && st->Ist.WrTmp.data->Iex.GetI.ix->tag == Iex_RdTmp
4309 && collapseChain(bb, i-1, st->Ist.WrTmp.data->Iex.GetI.ix
4310 ->Iex.RdTmp.tmp, &var2, &con2)) {
4311 if (DEBUG_IROPT) {
4312 vex_printf("replacing3 ");
4313 ppIRStmt(st);
4314 vex_printf(" with ");
4316 con2 += st->Ist.WrTmp.data->Iex.GetI.bias;
4317 bb->stmts[i]
4318 = IRStmt_WrTmp(
4319 st->Ist.WrTmp.tmp,
4320 IRExpr_GetI(st->Ist.WrTmp.data->Iex.GetI.descr,
4321 IRExpr_RdTmp(var2),
4322 con2));
4323 if (DEBUG_IROPT) {
4324 ppIRStmt(bb->stmts[i]);
4325 vex_printf("\n");
4327 continue;
4330 /* Perhaps st is PutI[t, con] ? */
4331 IRPutI *puti = st->Ist.PutI.details;
4332 if (st->tag == Ist_PutI
4333 && puti->ix->tag == Iex_RdTmp
4334 && collapseChain(bb, i-1, puti->ix->Iex.RdTmp.tmp,
4335 &var2, &con2)) {
4336 if (DEBUG_IROPT) {
4337 vex_printf("replacing2 ");
4338 ppIRStmt(st);
4339 vex_printf(" with ");
4341 con2 += puti->bias;
4342 bb->stmts[i]
4343 = IRStmt_PutI(mkIRPutI(puti->descr,
4344 IRExpr_RdTmp(var2),
4345 con2,
4346 puti->data));
4347 if (DEBUG_IROPT) {
4348 ppIRStmt(bb->stmts[i]);
4349 vex_printf("\n");
4351 continue;
4354 } /* for */
4358 /*---------------------------------------------------------------*/
4359 /*--- PutI/GetI transformations ---*/
4360 /*---------------------------------------------------------------*/
4362 /* Given the parts (descr, tmp, bias) for a GetI, scan backwards from
4363 the given starting point to find, if any, a PutI which writes
4364 exactly the same piece of guest state, and so return the expression
4365 that the PutI writes. This is the core of PutI-GetI forwarding. */
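/* For example, when scanning backwards on behalf of
   "t12 = GETI(descr)[t3,7]": a PutI to exactly the same
   (descr, ix, bias) triple yields its data expression, whereas a
   Put/PutI whose aliasing relation is unknown, or a dirty call with
   any guest-state effects, makes the search fail with NULL. */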
4367 static
4368 IRExpr* findPutI ( IRSB* bb, Int startHere,
4369 IRRegArray* descrG, IRExpr* ixG, Int biasG )
4371 Int j;
4372 IRStmt* st;
4373 GSAliasing relation;
4375 if (0) {
4376 vex_printf("\nfindPutI ");
4377 ppIRRegArray(descrG);
4378 vex_printf(" ");
4379 ppIRExpr(ixG);
4380 vex_printf(" %d\n", biasG);
4383 /* Scan backwards in bb from startHere to find a suitable PutI
4384 binding for (descrG, ixG, biasG), if any. */
4386 for (j = startHere; j >= 0; j--) {
4387 st = bb->stmts[j];
4388 if (st->tag == Ist_NoOp)
4389 continue;
4391 if (st->tag == Ist_Put) {
4392 /* Non-indexed Put. This can't give a binding, but we do
4393 need to check it doesn't invalidate the search by
4394 overlapping any part of the indexed guest state. */
4396 relation
4397 = getAliasingRelation_IC(
4398 descrG, ixG,
4399 st->Ist.Put.offset,
4400 typeOfIRExpr(bb->tyenv,st->Ist.Put.data) );
4402 if (relation == NoAlias) {
4403 /* we're OK; keep going */
4404 continue;
4405 } else {
4406 /* relation == UnknownAlias || relation == ExactAlias */
4407 /* If this assertion fails, we've found a Put which writes
4408 an area of guest state which is read by a GetI. Which
4409 is unlikely (although not per se wrong). */
4410 vassert(relation != ExactAlias);
4411 /* This Put potentially writes guest state that the GetI
4412 reads; we must fail. */
4413 return NULL;
4417 if (st->tag == Ist_PutI) {
4418 IRPutI *puti = st->Ist.PutI.details;
4420 relation = getAliasingRelation_II(
4421 descrG, ixG, biasG,
4422 puti->descr,
4423 puti->ix,
4424 puti->bias
4427 if (relation == NoAlias) {
4428 /* This PutI definitely doesn't overlap. Ignore it and
4429 keep going. */
4430 continue; /* the for j loop */
4433 if (relation == UnknownAlias) {
4434 /* We don't know if this PutI writes to the same guest
4435 state that the GetI reads, or not. So we have to give up. */
4436 return NULL;
4439 /* Otherwise, we've found what we're looking for. */
4440 vassert(relation == ExactAlias);
4441 return puti->data;
4443 } /* if (st->tag == Ist_PutI) */
4445 if (st->tag == Ist_Dirty) {
4446 /* Be conservative. If the dirty call has any guest effects at
4447 all, give up. We could do better -- only give up if there
4448 are any guest writes/modifies. */
4449 if (st->Ist.Dirty.details->nFxState > 0)
4450 return NULL;
4453 } /* for */
4455 /* No valid replacement was found. */
4456 return NULL;
4461 /* Assuming pi is a PutI stmt, is s2 identical to it (in the sense
4462 that it writes exactly the same piece of guest state) ? Safe
4463 answer: False. */
4465 static Bool identicalPutIs ( IRStmt* pi, IRStmt* s2 )
4467 vassert(pi->tag == Ist_PutI);
4468 if (s2->tag != Ist_PutI)
4469 return False;
4471 IRPutI *p1 = pi->Ist.PutI.details;
4472 IRPutI *p2 = s2->Ist.PutI.details;
4474 return toBool(
4475 getAliasingRelation_II(
4476 p1->descr, p1->ix, p1->bias,
4477 p2->descr, p2->ix, p2->bias
4479 == ExactAlias
4484 /* Assuming pi is a PutI stmt, is s2 a Get/GetI/Put/PutI which might
4485 overlap it? Safe answer: True. Note, we could do a lot better
4486 than this if needed. */
4488 static
4489 Bool guestAccessWhichMightOverlapPutI (
4490 IRTypeEnv* tyenv, IRStmt* pi, IRStmt* s2
4493 GSAliasing relation;
4494 UInt minoffP, maxoffP;
4496 vassert(pi->tag == Ist_PutI);
4498 IRPutI *p1 = pi->Ist.PutI.details;
4500 getArrayBounds(p1->descr, &minoffP, &maxoffP);
4501 switch (s2->tag) {
4503 case Ist_NoOp:
4504 case Ist_IMark:
4505 return False;
4507 case Ist_MBE:
4508 case Ist_AbiHint:
4509 /* just be paranoid ... these should be rare. */
4510 return True;
4512 case Ist_CAS:
4513 /* This is unbelievably lame, but it's probably not
4514 significant from a performance point of view. Really, a
4515 CAS is a load-store op, so it should be safe to say False.
4516 However .. */
4517 return True;
4519 case Ist_Dirty:
4520 /* If the dirty call has any guest effects at all, give up.
4521 Probably could do better. */
4522 if (s2->Ist.Dirty.details->nFxState > 0)
4523 return True;
4524 return False;
4526 case Ist_Put:
4527 vassert(isIRAtom(s2->Ist.Put.data));
4528 relation
4529 = getAliasingRelation_IC(
4530 p1->descr, p1->ix,
4531 s2->Ist.Put.offset,
4532 typeOfIRExpr(tyenv,s2->Ist.Put.data)
4534 goto have_relation;
4536 case Ist_PutI: {
4537 IRPutI *p2 = s2->Ist.PutI.details;
4539 vassert(isIRAtom(p2->ix));
4540 vassert(isIRAtom(p2->data));
4541 relation
4542 = getAliasingRelation_II(
4543 p1->descr, p1->ix, p1->bias,
4544 p2->descr, p2->ix, p2->bias
4546 goto have_relation;
4549 case Ist_WrTmp:
4550 if (s2->Ist.WrTmp.data->tag == Iex_GetI) {
4551 relation
4552 = getAliasingRelation_II(
4553 p1->descr, p1->ix, p1->bias,
4554 s2->Ist.WrTmp.data->Iex.GetI.descr,
4555 s2->Ist.WrTmp.data->Iex.GetI.ix,
4556 s2->Ist.WrTmp.data->Iex.GetI.bias
4558 goto have_relation;
4560 if (s2->Ist.WrTmp.data->tag == Iex_Get) {
4561 relation
4562 = getAliasingRelation_IC(
4563 p1->descr, p1->ix,
4564 s2->Ist.WrTmp.data->Iex.Get.offset,
4565 s2->Ist.WrTmp.data->Iex.Get.ty
4567 goto have_relation;
4569 return False;
4571 case Ist_Store:
4572 vassert(isIRAtom(s2->Ist.Store.addr));
4573 vassert(isIRAtom(s2->Ist.Store.data));
4574 return False;
4576 default:
4577 vex_printf("\n"); ppIRStmt(s2); vex_printf("\n");
4578 vpanic("guestAccessWhichMightOverlapPutI");
4581 have_relation:
4582 if (relation == NoAlias)
4583 return False;
4584 else
4585 return True; /* ExactAlias or UnknownAlias */
4590 /* ---------- PutI/GetI transformations main functions --------- */
4592 /* Remove redundant GetIs, to the extent that they can be detected.
4593 bb is modified in-place. */
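/* For example,
      PUTI(descr)[t3,7] = t8
      ...
      t12 = GETI(descr)[t3,7]
   becomes "t12 = t8", provided t8 is an atom of the element type and
   nothing in between might clobber that piece of guest state. */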
4595 static
4596 void do_redundant_GetI_elimination ( IRSB* bb )
4598 Int i;
4599 IRStmt* st;
4601 for (i = bb->stmts_used-1; i >= 0; i--) {
4602 st = bb->stmts[i];
4603 if (st->tag == Ist_NoOp)
4604 continue;
4606 if (st->tag == Ist_WrTmp
4607 && st->Ist.WrTmp.data->tag == Iex_GetI
4608 && st->Ist.WrTmp.data->Iex.GetI.ix->tag == Iex_RdTmp) {
4609 IRRegArray* descr = st->Ist.WrTmp.data->Iex.GetI.descr;
4610 IRExpr* ix = st->Ist.WrTmp.data->Iex.GetI.ix;
4611 Int bias = st->Ist.WrTmp.data->Iex.GetI.bias;
4612 IRExpr* replacement = findPutI(bb, i-1, descr, ix, bias);
4613 if (replacement
4614 && isIRAtom(replacement)
4615 /* Make sure we're doing a type-safe transformation! */
4616 && typeOfIRExpr(bb->tyenv, replacement) == descr->elemTy) {
4617 if (DEBUG_IROPT) {
4618 vex_printf("rGI: ");
4619 ppIRExpr(st->Ist.WrTmp.data);
4620 vex_printf(" -> ");
4621 ppIRExpr(replacement);
4622 vex_printf("\n");
4624 bb->stmts[i] = IRStmt_WrTmp(st->Ist.WrTmp.tmp, replacement);
4632 /* Remove redundant PutIs, to the extent that they can be detected.
4633 bb is modified in-place. */
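/* For example, in
      PUTI(descr)[t3,7] = t8
      ...
      PUTI(descr)[t3,7] = t9
   the first PutI is overwritten before it can be observed, so it is
   replaced by a NoOp -- provided no conditional exit, overlapping
   guest access or dirty call appears in between. */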
4635 static
4636 void do_redundant_PutI_elimination ( IRSB* bb, VexRegisterUpdates pxControl )
4638 Int i, j;
4639 Bool delete;
4640 IRStmt *st, *stj;
4642 vassert(pxControl < VexRegUpdAllregsAtEachInsn);
4644 for (i = 0; i < bb->stmts_used; i++) {
4645 st = bb->stmts[i];
4646 if (st->tag != Ist_PutI)
4647 continue;
4648 /* Ok, search forwards from here to see if we can find another
4649 PutI which makes this one redundant, and dodging various
4650 hazards. Search forwards:
4651 * If conditional exit, give up (because anything after that
4652 does not postdominate this put).
4653 * If a Get which might overlap, give up (because this PutI
4654 is not necessarily dead).
4655 * If a Put which is identical, stop with success.
4656 * If a Put which might overlap, but is not identical, give up.
4657 * If a dirty helper call which might write guest state, give up.
4658 * If a Put which definitely doesn't overlap, or any other
4659 kind of stmt, continue.
4661 delete = False;
4662 for (j = i+1; j < bb->stmts_used; j++) {
4663 stj = bb->stmts[j];
4664 if (stj->tag == Ist_NoOp)
4665 continue;
4666 if (identicalPutIs(st, stj)) {
4667 /* success! */
4668 delete = True;
4669 break;
4671 if (stj->tag == Ist_Exit)
4672 /* give up */
4673 break;
4674 if (stj->tag == Ist_Dirty)
4675 /* give up; could do better here */
4676 break;
4677 if (guestAccessWhichMightOverlapPutI(bb->tyenv, st, stj))
4678 /* give up */
4679 break;
4682 if (delete) {
4683 if (DEBUG_IROPT) {
4684 vex_printf("rPI: ");
4685 ppIRStmt(st);
4686 vex_printf("\n");
4688 bb->stmts[i] = IRStmt_NoOp();
4695 /*---------------------------------------------------------------*/
4696 /*--- Loop unrolling ---*/
4697 /*---------------------------------------------------------------*/
4699 /* Adjust all tmp values (names) in e by delta. e is destructively
4700 modified. */
4702 static void deltaIRExpr ( IRExpr* e, Int delta )
4704 Int i;
4705 switch (e->tag) {
4706 case Iex_RdTmp:
4707 e->Iex.RdTmp.tmp += delta;
4708 break;
4709 case Iex_Get:
4710 case Iex_Const:
4711 break;
4712 case Iex_GetI:
4713 deltaIRExpr(e->Iex.GetI.ix, delta);
4714 break;
4715 case Iex_Qop:
4716 deltaIRExpr(e->Iex.Qop.details->arg1, delta);
4717 deltaIRExpr(e->Iex.Qop.details->arg2, delta);
4718 deltaIRExpr(e->Iex.Qop.details->arg3, delta);
4719 deltaIRExpr(e->Iex.Qop.details->arg4, delta);
4720 break;
4721 case Iex_Triop:
4722 deltaIRExpr(e->Iex.Triop.details->arg1, delta);
4723 deltaIRExpr(e->Iex.Triop.details->arg2, delta);
4724 deltaIRExpr(e->Iex.Triop.details->arg3, delta);
4725 break;
4726 case Iex_Binop:
4727 deltaIRExpr(e->Iex.Binop.arg1, delta);
4728 deltaIRExpr(e->Iex.Binop.arg2, delta);
4729 break;
4730 case Iex_Unop:
4731 deltaIRExpr(e->Iex.Unop.arg, delta);
4732 break;
4733 case Iex_Load:
4734 deltaIRExpr(e->Iex.Load.addr, delta);
4735 break;
4736 case Iex_CCall:
4737 for (i = 0; e->Iex.CCall.args[i]; i++)
4738 deltaIRExpr(e->Iex.CCall.args[i], delta);
4739 break;
4740 case Iex_ITE:
4741 deltaIRExpr(e->Iex.ITE.cond, delta);
4742 deltaIRExpr(e->Iex.ITE.iftrue, delta);
4743 deltaIRExpr(e->Iex.ITE.iffalse, delta);
4744 break;
4745 default:
4746 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
4747 vpanic("deltaIRExpr");
4751 /* Adjust all tmp values (names) in st by delta. st is destructively
4752 modified. */
4754 /*static*/ void deltaIRStmt ( IRStmt* st, Int delta )
4756 Int i;
4757 IRDirty* d;
4758 switch (st->tag) {
4759 case Ist_NoOp:
4760 case Ist_IMark:
4761 case Ist_MBE:
4762 break;
4763 case Ist_AbiHint:
4764 deltaIRExpr(st->Ist.AbiHint.base, delta);
4765 deltaIRExpr(st->Ist.AbiHint.nia, delta);
4766 break;
4767 case Ist_Put:
4768 deltaIRExpr(st->Ist.Put.data, delta);
4769 break;
4770 case Ist_PutI:
4771 deltaIRExpr(st->Ist.PutI.details->ix, delta);
4772 deltaIRExpr(st->Ist.PutI.details->data, delta);
4773 break;
4774 case Ist_WrTmp:
4775 st->Ist.WrTmp.tmp += delta;
4776 deltaIRExpr(st->Ist.WrTmp.data, delta);
4777 break;
4778 case Ist_Exit:
4779 deltaIRExpr(st->Ist.Exit.guard, delta);
4780 break;
4781 case Ist_Store:
4782 deltaIRExpr(st->Ist.Store.addr, delta);
4783 deltaIRExpr(st->Ist.Store.data, delta);
4784 break;
4785 case Ist_StoreG: {
4786 IRStoreG* sg = st->Ist.StoreG.details;
4787 deltaIRExpr(sg->addr, delta);
4788 deltaIRExpr(sg->data, delta);
4789 deltaIRExpr(sg->guard, delta);
4790 break;
4792 case Ist_LoadG: {
4793 IRLoadG* lg = st->Ist.LoadG.details;
4794 lg->dst += delta;
4795 deltaIRExpr(lg->addr, delta);
4796 deltaIRExpr(lg->alt, delta);
4797 deltaIRExpr(lg->guard, delta);
4798 break;
4800 case Ist_CAS:
4801 if (st->Ist.CAS.details->oldHi != IRTemp_INVALID)
4802 st->Ist.CAS.details->oldHi += delta;
4803 st->Ist.CAS.details->oldLo += delta;
4804 deltaIRExpr(st->Ist.CAS.details->addr, delta);
4805 if (st->Ist.CAS.details->expdHi)
4806 deltaIRExpr(st->Ist.CAS.details->expdHi, delta);
4807 deltaIRExpr(st->Ist.CAS.details->expdLo, delta);
4808 if (st->Ist.CAS.details->dataHi)
4809 deltaIRExpr(st->Ist.CAS.details->dataHi, delta);
4810 deltaIRExpr(st->Ist.CAS.details->dataLo, delta);
4811 break;
4812 case Ist_LLSC:
4813 st->Ist.LLSC.result += delta;
4814 deltaIRExpr(st->Ist.LLSC.addr, delta);
4815 if (st->Ist.LLSC.storedata)
4816 deltaIRExpr(st->Ist.LLSC.storedata, delta);
4817 break;
4818 case Ist_Dirty:
4819 d = st->Ist.Dirty.details;
4820 deltaIRExpr(d->guard, delta);
4821 for (i = 0; d->args[i]; i++) {
4822 IRExpr* arg = d->args[i];
4823 if (LIKELY(!is_IRExpr_VECRET_or_GSPTR(arg)))
4824 deltaIRExpr(arg, delta);
4826 if (d->tmp != IRTemp_INVALID)
4827 d->tmp += delta;
4828 if (d->mAddr)
4829 deltaIRExpr(d->mAddr, delta);
4830 break;
4831 default:
4832 vex_printf("\n"); ppIRStmt(st); vex_printf("\n");
4833 vpanic("deltaIRStmt");
4838 /* If possible, return a loop-unrolled version of bb0. The original
4839 is changed. If not possible, return NULL. */
4841 /* The two schemas considered are:
4843 X: BODY; goto X
4845 which unrolls to (eg) X: BODY;BODY; goto X
4849 X: BODY; if (c) goto X; goto Y
4850 which trivially transforms to
4851 X: BODY; if (!c) goto Y; goto X;
4852 so it falls in the scope of the first case.
4854 X and Y must be literal (guest) addresses.
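For example, with an unroll factor of 2, "X: BODY; goto X" becomes
"X: BODY; BODY'; goto X", where BODY' is a copy of BODY whose
temporaries have been renamed to fresh ones (see deltaIRStmt).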
4857 static Int calc_unroll_factor( IRSB* bb )
4859 Int n_stmts, i;
4861 n_stmts = 0;
4862 for (i = 0; i < bb->stmts_used; i++) {
4863 if (bb->stmts[i]->tag != Ist_NoOp)
4864 n_stmts++;
4867 if (n_stmts <= vex_control.iropt_unroll_thresh/8) {
4868 if (vex_control.iropt_verbosity > 0)
4869 vex_printf("vex iropt: 8 x unrolling (%d sts -> %d sts)\n",
4870 n_stmts, 8* n_stmts);
4871 return 8;
4873 if (n_stmts <= vex_control.iropt_unroll_thresh/4) {
4874 if (vex_control.iropt_verbosity > 0)
4875 vex_printf("vex iropt: 4 x unrolling (%d sts -> %d sts)\n",
4876 n_stmts, 4* n_stmts);
4877 return 4;
4880 if (n_stmts <= vex_control.iropt_unroll_thresh/2) {
4881 if (vex_control.iropt_verbosity > 0)
4882 vex_printf("vex iropt: 2 x unrolling (%d sts -> %d sts)\n",
4883 n_stmts, 2* n_stmts);
4884 return 2;
4887 if (vex_control.iropt_verbosity > 0)
4888 vex_printf("vex iropt: not unrolling (%d sts)\n", n_stmts);
4890 return 1;
4894 static IRSB* maybe_loop_unroll_BB ( IRSB* bb0, Addr my_addr )
4896 Int i, j, jmax, n_vars;
4897 Bool xxx_known;
4898 Addr64 xxx_value, yyy_value;
4899 IRExpr* udst;
4900 IRStmt* st;
4901 IRConst* con;
4902 IRSB *bb1, *bb2;
4903 Int unroll_factor;
4905 if (vex_control.iropt_unroll_thresh <= 0)
4906 return NULL;
4908 /* First off, figure out if we can unroll this loop. Do this
4909 without modifying bb0. */
4911 if (bb0->jumpkind != Ijk_Boring)
4912 return NULL;
4914 xxx_known = False;
4915 xxx_value = 0;
4917 /* Extract the next-guest address. If it isn't a literal, we
4918 have to give up. */
4920 udst = bb0->next;
4921 if (udst->tag == Iex_Const
4922 && (udst->Iex.Const.con->tag == Ico_U32
4923 || udst->Iex.Const.con->tag == Ico_U64)) {
4924 /* The BB ends in a jump to a literal location. */
4925 xxx_known = True;
4926 xxx_value = udst->Iex.Const.con->tag == Ico_U64
4927 ? udst->Iex.Const.con->Ico.U64
4928 : (Addr64)(udst->Iex.Const.con->Ico.U32);
4931 if (!xxx_known)
4932 return NULL;
4934 /* Now we know the BB ends in a jump to a literal location. If
4935 it's a jump to itself (viz, idiom #1), move directly to the
4936 unrolling stage, first cloning the bb so the original isn't
4937 modified. */
4938 if (xxx_value == my_addr) {
4939 unroll_factor = calc_unroll_factor( bb0 );
4940 if (unroll_factor < 2)
4941 return NULL;
4942 bb1 = deepCopyIRSB( bb0 );
4943 bb0 = NULL;
4944 udst = NULL; /* is now invalid */
4945 goto do_unroll;
4948 /* Search for the second idiomatic form:
4949 X: BODY; if (c) goto X; goto Y
4950 We know Y, but need to establish that the last stmt
4951 is 'if (c) goto X'.
4953 yyy_value = xxx_value;
4954 for (i = bb0->stmts_used-1; i >= 0; i--)
4955 if (bb0->stmts[i])
4956 break;
4958 if (i < 0)
4959 return NULL; /* block with no stmts. Strange. */
4961 st = bb0->stmts[i];
4962 if (st->tag != Ist_Exit)
4963 return NULL;
4964 if (st->Ist.Exit.jk != Ijk_Boring)
4965 return NULL;
4967 con = st->Ist.Exit.dst;
4968 vassert(con->tag == Ico_U32 || con->tag == Ico_U64);
4970 xxx_value = con->tag == Ico_U64
4971 ? st->Ist.Exit.dst->Ico.U64
4972 : (Addr64)(st->Ist.Exit.dst->Ico.U32);
4974 /* If this assertion fails, we have some kind of type error. */
4975 vassert(con->tag == udst->Iex.Const.con->tag);
4977 if (xxx_value != my_addr)
4978 /* We didn't find either idiom. Give up. */
4979 return NULL;
4981 /* Ok, we found idiom #2. Copy the BB, switch around the xxx and
4982 yyy values (which makes it look like idiom #1), and go into
4983 unrolling proper. This means finding (again) the last stmt, in
4984 the copied BB. */
4986 unroll_factor = calc_unroll_factor( bb0 );
4987 if (unroll_factor < 2)
4988 return NULL;
4990 bb1 = deepCopyIRSB( bb0 );
4991 bb0 = NULL;
4992 udst = NULL; /* is now invalid */
4993 for (i = bb1->stmts_used-1; i >= 0; i--)
4994 if (bb1->stmts[i])
4995 break;
4997 /* The next bunch of assertions should be true since we already
4998 found and checked the last stmt in the original bb. */
5000 vassert(i >= 0);
5002 st = bb1->stmts[i];
5003 vassert(st->tag == Ist_Exit);
5005 con = st->Ist.Exit.dst;
5006 vassert(con->tag == Ico_U32 || con->tag == Ico_U64);
5008 udst = bb1->next;
5009 vassert(udst->tag == Iex_Const);
5010 vassert(udst->Iex.Const.con->tag == Ico_U32
5011 || udst->Iex.Const.con->tag == Ico_U64);
5012 vassert(con->tag == udst->Iex.Const.con->tag);
5014 /* switch the xxx and yyy fields around */
5015 if (con->tag == Ico_U64) {
5016 udst->Iex.Const.con->Ico.U64 = xxx_value;
5017 con->Ico.U64 = yyy_value;
5018 } else {
5019 udst->Iex.Const.con->Ico.U32 = (UInt)xxx_value;
5020 con->Ico.U32 = (UInt)yyy_value;
5023 /* negate the test condition */
5024 st->Ist.Exit.guard
5025 = IRExpr_Unop(Iop_Not1,deepCopyIRExpr(st->Ist.Exit.guard));
5027 /* --- The unroller proper. Both idioms are by now --- */
5028 /* --- converted to idiom 1. --- */
5030 do_unroll:
5032 vassert(unroll_factor == 2
5033 || unroll_factor == 4
5034 || unroll_factor == 8);
5036 jmax = unroll_factor==8 ? 3 : (unroll_factor==4 ? 2 : 1);
5037 for (j = 1; j <= jmax; j++) {
5039 n_vars = bb1->tyenv->types_used;
5041 bb2 = deepCopyIRSB(bb1);
5042 for (i = 0; i < n_vars; i++)
5043 (void)newIRTemp(bb1->tyenv, bb2->tyenv->types[i]);
5045 for (i = 0; i < bb2->stmts_used; i++) {
5046 /* deltaIRStmt destructively modifies the stmt, but
5047 that's OK since bb2 is a complete fresh copy of bb1. */
5048 deltaIRStmt(bb2->stmts[i], n_vars);
5049 addStmtToIRSB(bb1, bb2->stmts[i]);
5053 if (DEBUG_IROPT) {
5054 vex_printf("\nUNROLLED (%lx)\n", my_addr);
5055 ppIRSB(bb1);
5056 vex_printf("\n");
5059 /* Flattening; sigh. The unroller succeeds in breaking flatness
5060 by negating the test condition. This should be fixed properly.
5061 For the moment use this shotgun approach. */
5062 return flatten_BB(bb1);
5066 /*---------------------------------------------------------------*/
5067 /*--- The tree builder ---*/
5068 /*---------------------------------------------------------------*/
5070 /* This isn't part of IR optimisation. Really it's a pass done prior
5071 to instruction selection, which improves the code that the
5072 instruction selector can produce. */
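/* For example, if t3 is used exactly once, the pair
      t3 = Add32(t1,t2)
      STle(t7) = t3
   is turned into the single statement
      STle(t7) = Add32(t1,t2)
   so the instruction selector sees a complete expression tree rather
   than a chain of single-operation bindings. */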
5074 /* --- The 'tmp' environment is the central data structure here --- */
5076 /* The number of outstanding bindings we're prepared to track.
5077 The number of times the env becomes full and we have to dump
5078 the oldest binding (hence reducing code quality) falls very
5079 rapidly as the env size increases. 8 gives reasonable performance
5080 under most circumstances. */
5081 #define A_NENV 10
5083 /* An interval. Used to record the bytes in the guest state accessed
5084 by a Put[I] statement or by (one or more) Get[I] expression(s). In
5085 case of several Get[I] expressions, the lower/upper bounds are recorded.
5086 This is conservative but cheap.
5087 E.g. a Put of 8 bytes at address 100 would be recorded as [100,107].
5088 E.g. an expression that reads 8 bytes at offset 100 and 4 bytes at
5089 offset 200 would be recorded as [100,203] */
5090 typedef
5091 struct {
5092 Bool present;
5093 Int low;
5094 Int high;
5096 Interval;
5098 static inline Bool
5099 intervals_overlap(Interval i1, Interval i2)
5101 return (i1.low >= i2.low && i1.low <= i2.high) ||
5102 (i2.low >= i1.low && i2.low <= i1.high);
5105 static inline void
5106 update_interval(Interval *i, Int low, Int high)
5108 vassert(low <= high);
5110 if (i->present) {
5111 if (low < i->low) i->low = low;
5112 if (high > i->high) i->high = high;
5113 } else {
5114 i->present = True;
5115 i->low = low;
5116 i->high = high;
5121 /* bindee == NULL === slot is not in use
5122 bindee != NULL === slot is in use
5124 typedef
5125 struct {
5126 IRTemp binder;
5127 IRExpr* bindee;
5128 Bool doesLoad;
5129 /* Record the bytes of the guest state BINDEE reads from. */
5130 Interval getInterval;
5132 ATmpInfo;
5134 __attribute__((unused))
5135 static void ppAEnv ( ATmpInfo* env )
5137 Int i;
5138 for (i = 0; i < A_NENV; i++) {
5139 vex_printf("%d tmp %d val ", i, (Int)env[i].binder);
5140 if (env[i].bindee)
5141 ppIRExpr(env[i].bindee);
5142 else
5143 vex_printf("(null)");
5144 vex_printf("\n");
5148 /* --- Tree-traversal fns --- */
5150 /* Traverse an expr, and detect if any part of it reads memory or does
5151 a Get. Be careful ... this really controls how much the
5152 tree-builder can reorder the code, so getting it right is critical.
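E.g. a Load sets *doesLoad, while a Get or GetI merges the byte
range it reads into *getInterval via update_interval.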
5154 static void setHints_Expr (Bool* doesLoad, Interval* getInterval, IRExpr* e )
5156 Int i;
5157 switch (e->tag) {
5158 case Iex_CCall:
5159 for (i = 0; e->Iex.CCall.args[i]; i++)
5160 setHints_Expr(doesLoad, getInterval, e->Iex.CCall.args[i]);
5161 return;
5162 case Iex_ITE:
5163 setHints_Expr(doesLoad, getInterval, e->Iex.ITE.cond);
5164 setHints_Expr(doesLoad, getInterval, e->Iex.ITE.iftrue);
5165 setHints_Expr(doesLoad, getInterval, e->Iex.ITE.iffalse);
5166 return;
5167 case Iex_Qop:
5168 setHints_Expr(doesLoad, getInterval, e->Iex.Qop.details->arg1);
5169 setHints_Expr(doesLoad, getInterval, e->Iex.Qop.details->arg2);
5170 setHints_Expr(doesLoad, getInterval, e->Iex.Qop.details->arg3);
5171 setHints_Expr(doesLoad, getInterval, e->Iex.Qop.details->arg4);
5172 return;
5173 case Iex_Triop:
5174 setHints_Expr(doesLoad, getInterval, e->Iex.Triop.details->arg1);
5175 setHints_Expr(doesLoad, getInterval, e->Iex.Triop.details->arg2);
5176 setHints_Expr(doesLoad, getInterval, e->Iex.Triop.details->arg3);
5177 return;
5178 case Iex_Binop:
5179 setHints_Expr(doesLoad, getInterval, e->Iex.Binop.arg1);
5180 setHints_Expr(doesLoad, getInterval, e->Iex.Binop.arg2);
5181 return;
5182 case Iex_Unop:
5183 setHints_Expr(doesLoad, getInterval, e->Iex.Unop.arg);
5184 return;
5185 case Iex_Load:
5186 *doesLoad = True;
5187 setHints_Expr(doesLoad, getInterval, e->Iex.Load.addr);
5188 return;
5189 case Iex_Get: {
5190 Int low = e->Iex.Get.offset;
5191 Int high = low + sizeofIRType(e->Iex.Get.ty) - 1;
5192 update_interval(getInterval, low, high);
5193 return;
5195 case Iex_GetI: {
5196 IRRegArray *descr = e->Iex.GetI.descr;
5197 Int size = sizeofIRType(descr->elemTy);
5198 Int low = descr->base;
5199 Int high = low + descr->nElems * size - 1;
5200 update_interval(getInterval, low, high);
5201 setHints_Expr(doesLoad, getInterval, e->Iex.GetI.ix);
5202 return;
5204 case Iex_RdTmp:
5205 case Iex_Const:
5206 return;
5207 default:
5208 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
5209 vpanic("setHints_Expr");
5214 /* Add a binding to the front of the env and slide all the rest
5215 backwards. It should be the case that the last slot is free. */
5216 static void addToEnvFront ( ATmpInfo* env, IRTemp binder, IRExpr* bindee )
5218 Int i;
5219 vassert(env[A_NENV-1].bindee == NULL);
5220 for (i = A_NENV-1; i >= 1; i--)
5221 env[i] = env[i-1];
5222 env[0].binder = binder;
5223 env[0].bindee = bindee;
5224 env[0].doesLoad = False; /* filled in later */
5225 env[0].getInterval.present = False; /* filled in later */
5226 env[0].getInterval.low = -1; /* filled in later */
5227 env[0].getInterval.high = -1; /* filled in later */
5230 /* Given uses :: array of UShort, indexed by IRTemp
5231 Add the use-occurrences of temps in this expression
5232 to the uses array.
5234 static void aoccCount_Expr ( UShort* uses, IRExpr* e )
5236 Int i;
5238 switch (e->tag) {
5240 case Iex_RdTmp: /* the only interesting case */
5241 uses[e->Iex.RdTmp.tmp]++;
5242 return;
5244 case Iex_ITE:
5245 aoccCount_Expr(uses, e->Iex.ITE.cond);
5246 aoccCount_Expr(uses, e->Iex.ITE.iftrue);
5247 aoccCount_Expr(uses, e->Iex.ITE.iffalse);
5248 return;
5250 case Iex_Qop:
5251 aoccCount_Expr(uses, e->Iex.Qop.details->arg1);
5252 aoccCount_Expr(uses, e->Iex.Qop.details->arg2);
5253 aoccCount_Expr(uses, e->Iex.Qop.details->arg3);
5254 aoccCount_Expr(uses, e->Iex.Qop.details->arg4);
5255 return;
5257 case Iex_Triop:
5258 aoccCount_Expr(uses, e->Iex.Triop.details->arg1);
5259 aoccCount_Expr(uses, e->Iex.Triop.details->arg2);
5260 aoccCount_Expr(uses, e->Iex.Triop.details->arg3);
5261 return;
5263 case Iex_Binop:
5264 aoccCount_Expr(uses, e->Iex.Binop.arg1);
5265 aoccCount_Expr(uses, e->Iex.Binop.arg2);
5266 return;
5268 case Iex_Unop:
5269 aoccCount_Expr(uses, e->Iex.Unop.arg);
5270 return;
5272 case Iex_Load:
5273 aoccCount_Expr(uses, e->Iex.Load.addr);
5274 return;
5276 case Iex_CCall:
5277 for (i = 0; e->Iex.CCall.args[i]; i++)
5278 aoccCount_Expr(uses, e->Iex.CCall.args[i]);
5279 return;
5281 case Iex_GetI:
5282 aoccCount_Expr(uses, e->Iex.GetI.ix);
5283 return;
5285 case Iex_Const:
5286 case Iex_Get:
5287 return;
5289 default:
5290 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
5291 vpanic("aoccCount_Expr");
5296 /* Given uses :: array of UShort, indexed by IRTemp
5297 Add the use-occurrences of temps in this statement
5298 to the uses array.
5300 static void aoccCount_Stmt ( UShort* uses, IRStmt* st )
5302 Int i;
5303 IRDirty* d;
5304 IRCAS* cas;
5305 switch (st->tag) {
5306 case Ist_AbiHint:
5307 aoccCount_Expr(uses, st->Ist.AbiHint.base);
5308 aoccCount_Expr(uses, st->Ist.AbiHint.nia);
5309 return;
5310 case Ist_WrTmp:
5311 aoccCount_Expr(uses, st->Ist.WrTmp.data);
5312 return;
5313 case Ist_Put:
5314 aoccCount_Expr(uses, st->Ist.Put.data);
5315 return;
5316 case Ist_PutI:
5317 aoccCount_Expr(uses, st->Ist.PutI.details->ix);
5318 aoccCount_Expr(uses, st->Ist.PutI.details->data);
5319 return;
5320 case Ist_Store:
5321 aoccCount_Expr(uses, st->Ist.Store.addr);
5322 aoccCount_Expr(uses, st->Ist.Store.data);
5323 return;
5324 case Ist_StoreG: {
5325 IRStoreG* sg = st->Ist.StoreG.details;
5326 aoccCount_Expr(uses, sg->addr);
5327 aoccCount_Expr(uses, sg->data);
5328 aoccCount_Expr(uses, sg->guard);
5329 return;
5331 case Ist_LoadG: {
5332 IRLoadG* lg = st->Ist.LoadG.details;
5333 aoccCount_Expr(uses, lg->addr);
5334 aoccCount_Expr(uses, lg->alt);
5335 aoccCount_Expr(uses, lg->guard);
5336 return;
5338 case Ist_CAS:
5339 cas = st->Ist.CAS.details;
5340 aoccCount_Expr(uses, cas->addr);
5341 if (cas->expdHi)
5342 aoccCount_Expr(uses, cas->expdHi);
5343 aoccCount_Expr(uses, cas->expdLo);
5344 if (cas->dataHi)
5345 aoccCount_Expr(uses, cas->dataHi);
5346 aoccCount_Expr(uses, cas->dataLo);
5347 return;
5348 case Ist_LLSC:
5349 aoccCount_Expr(uses, st->Ist.LLSC.addr);
5350 if (st->Ist.LLSC.storedata)
5351 aoccCount_Expr(uses, st->Ist.LLSC.storedata);
5352 return;
5353 case Ist_Dirty:
5354 d = st->Ist.Dirty.details;
5355 if (d->mFx != Ifx_None)
5356 aoccCount_Expr(uses, d->mAddr);
5357 aoccCount_Expr(uses, d->guard);
5358 for (i = 0; d->args[i]; i++) {
5359 IRExpr* arg = d->args[i];
5360 if (LIKELY(!is_IRExpr_VECRET_or_GSPTR(arg)))
5361 aoccCount_Expr(uses, arg);
5363 return;
5364 case Ist_NoOp:
5365 case Ist_IMark:
5366 case Ist_MBE:
5367 return;
5368 case Ist_Exit:
5369 aoccCount_Expr(uses, st->Ist.Exit.guard);
5370 return;
5371 default:
5372 vex_printf("\n"); ppIRStmt(st); vex_printf("\n");
5373 vpanic("aoccCount_Stmt");
5377 /* Look up a binding for tmp in the env. If found, return the bound
5378 expression, and set the env's binding to NULL so it is marked as
5379 used. If not found, return NULL. */
5381 static IRExpr* atbSubst_Temp ( ATmpInfo* env, IRTemp tmp )
5383 Int i;
5384 for (i = 0; i < A_NENV; i++) {
5385 if (env[i].binder == tmp && env[i].bindee != NULL) {
5386 IRExpr* bindee = env[i].bindee;
5387 env[i].bindee = NULL;
5388 return bindee;
5391 return NULL;
5394 /* Traverse e, looking for temps. For each observed temp, see if env
5395 contains a binding for the temp, and if so return the bound value.
5396 The env has the property that any binding it holds is
5397 'single-shot', so once a binding is used, it is marked as no longer
5398 available, by setting its .bindee field to NULL. */
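/* For example, if the env binds t3 -> Add32(t1,t2), the first
   occurrence of t3 met during the traversal is replaced by
   Add32(t1,t2) and the binding is cleared.  This is safe because
   only temps that are used exactly once are ever entered into the
   env. */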
5400 static inline Bool is_Unop ( IRExpr* e, IROp op ) {
5401 return e->tag == Iex_Unop && e->Iex.Unop.op == op;
5403 static inline Bool is_Binop ( IRExpr* e, IROp op ) {
5404 return e->tag == Iex_Binop && e->Iex.Binop.op == op;
5407 static IRExpr* fold_IRExpr_Binop ( IROp op, IRExpr* a1, IRExpr* a2 )
5409 switch (op) {
5410 case Iop_Or32:
5411 /* Or32( CmpwNEZ32(x), CmpwNEZ32(y) ) --> CmpwNEZ32( Or32( x, y ) ) */
5412 if (is_Unop(a1, Iop_CmpwNEZ32) && is_Unop(a2, Iop_CmpwNEZ32))
5413 return IRExpr_Unop( Iop_CmpwNEZ32,
5414 IRExpr_Binop( Iop_Or32, a1->Iex.Unop.arg,
5415 a2->Iex.Unop.arg ) );
5416 break;
5418 case Iop_CmpNE32:
5419 /* Since X has type Ity_I1 we can simplify:
5420 CmpNE32(1Uto32(X),0) ==> X */
5421 if (is_Unop(a1, Iop_1Uto32) && isZeroU32(a2))
5422 return a1->Iex.Unop.arg;
5423 break;
5425 default:
5426 break;
5428 /* no reduction rule applies */
5429 return IRExpr_Binop( op, a1, a2 );
5432 static IRExpr* fold_IRExpr_Unop ( IROp op, IRExpr* aa )
5434 switch (op) {
5435 case Iop_CmpwNEZ64:
5436 /* CmpwNEZ64( CmpwNEZ64 ( x ) ) --> CmpwNEZ64 ( x ) */
5437 if (is_Unop(aa, Iop_CmpwNEZ64))
5438 return IRExpr_Unop( Iop_CmpwNEZ64, aa->Iex.Unop.arg );
5439 /* CmpwNEZ64( Or64 ( CmpwNEZ64(x), y ) ) --> CmpwNEZ64( Or64( x, y ) ) */
5440 if (is_Binop(aa, Iop_Or64)
5441 && is_Unop(aa->Iex.Binop.arg1, Iop_CmpwNEZ64))
5442 return fold_IRExpr_Unop(
5443 Iop_CmpwNEZ64,
5444 IRExpr_Binop(Iop_Or64,
5445 aa->Iex.Binop.arg1->Iex.Unop.arg,
5446 aa->Iex.Binop.arg2));
5447 /* CmpwNEZ64( Or64 ( x, CmpwNEZ64(y) ) ) --> CmpwNEZ64( Or64( x, y ) ) */
5448 if (is_Binop(aa, Iop_Or64)
5449 && is_Unop(aa->Iex.Binop.arg2, Iop_CmpwNEZ64))
5450 return fold_IRExpr_Unop(
5451 Iop_CmpwNEZ64,
5452 IRExpr_Binop(Iop_Or64,
5453 aa->Iex.Binop.arg1,
5454 aa->Iex.Binop.arg2->Iex.Unop.arg));
5455 break;
5456 case Iop_CmpNEZ64:
5457 /* CmpNEZ64( Left64(x) ) --> CmpNEZ64(x) */
5458 if (is_Unop(aa, Iop_Left64))
5459 return IRExpr_Unop(Iop_CmpNEZ64, aa->Iex.Unop.arg);
5460 /* CmpNEZ64( 1Uto64(X) ) --> X */
5461 if (is_Unop(aa, Iop_1Uto64))
5462 return aa->Iex.Unop.arg;
5463 break;
5464 case Iop_CmpwNEZ32:
5465 /* CmpwNEZ32( CmpwNEZ32 ( x ) ) --> CmpwNEZ32 ( x ) */
5466 if (is_Unop(aa, Iop_CmpwNEZ32))
5467 return IRExpr_Unop( Iop_CmpwNEZ32, aa->Iex.Unop.arg );
5468 break;
5469 case Iop_CmpNEZ32:
5470 /* CmpNEZ32( Left32(x) ) --> CmpNEZ32(x) */
5471 if (is_Unop(aa, Iop_Left32))
5472 return IRExpr_Unop(Iop_CmpNEZ32, aa->Iex.Unop.arg);
5473 /* CmpNEZ32( 1Uto32(X) ) --> X */
5474 if (is_Unop(aa, Iop_1Uto32))
5475 return aa->Iex.Unop.arg;
5476 /* CmpNEZ32( 64to32( CmpwNEZ64(X) ) ) --> CmpNEZ64(X) */
5477 if (is_Unop(aa, Iop_64to32) && is_Unop(aa->Iex.Unop.arg, Iop_CmpwNEZ64))
5478 return IRExpr_Unop(Iop_CmpNEZ64, aa->Iex.Unop.arg->Iex.Unop.arg);
5479 break;
5480 case Iop_CmpNEZ8:
5481 /* CmpNEZ8( 1Uto8(X) ) --> X */
5482 if (is_Unop(aa, Iop_1Uto8))
5483 return aa->Iex.Unop.arg;
5484 break;
5485 case Iop_Left32:
5486 /* Left32( Left32(x) ) --> Left32(x) */
5487 if (is_Unop(aa, Iop_Left32))
5488 return IRExpr_Unop( Iop_Left32, aa->Iex.Unop.arg );
5489 break;
5490 case Iop_Left64:
5491 /* Left64( Left64(x) ) --> Left64(x) */
5492 if (is_Unop(aa, Iop_Left64))
5493 return IRExpr_Unop( Iop_Left64, aa->Iex.Unop.arg );
5494 break;
5495 case Iop_ZeroHI64ofV128:
5496 /* ZeroHI64ofV128( ZeroHI64ofV128(x) ) --> ZeroHI64ofV128(x) */
5497 if (is_Unop(aa, Iop_ZeroHI64ofV128))
5498 return IRExpr_Unop( Iop_ZeroHI64ofV128, aa->Iex.Unop.arg );
5499 break;
5500 case Iop_32to1:
5501 /* 32to1( 1Uto32 ( x ) ) --> x */
5502 if (is_Unop(aa, Iop_1Uto32))
5503 return aa->Iex.Unop.arg;
5504 /* 32to1( CmpwNEZ32 ( x )) --> CmpNEZ32(x) */
5505 if (is_Unop(aa, Iop_CmpwNEZ32))
5506 return IRExpr_Unop( Iop_CmpNEZ32, aa->Iex.Unop.arg );
5507 break;
5508 case Iop_64to1:
5509 /* 64to1( 1Uto64 ( x ) ) --> x */
5510 if (is_Unop(aa, Iop_1Uto64))
5511 return aa->Iex.Unop.arg;
5512 /* 64to1( CmpwNEZ64 ( x )) --> CmpNEZ64(x) */
5513 if (is_Unop(aa, Iop_CmpwNEZ64))
5514 return IRExpr_Unop( Iop_CmpNEZ64, aa->Iex.Unop.arg );
5515 break;
5516 case Iop_64to32:
5517 /* 64to32( 32Uto64 ( x )) --> x */
5518 if (is_Unop(aa, Iop_32Uto64))
5519 return aa->Iex.Unop.arg;
5520 /* 64to32( 8Uto64 ( x )) --> 8Uto32(x) */
5521 if (is_Unop(aa, Iop_8Uto64))
5522 return IRExpr_Unop(Iop_8Uto32, aa->Iex.Unop.arg);
5523 break;
5524 case Iop_64to16:
5525 /* 64to16( 16Uto64 ( x )) --> x */
5526 if (is_Unop(aa, Iop_16Uto64))
5527 return aa->Iex.Unop.arg;
5528 /* 64to16( 32Uto64 ( x )) --> 32to16(x) */
5529 if (is_Unop(aa, Iop_32Uto64))
5530 return IRExpr_Unop(Iop_32to16, aa->Iex.Unop.arg);
5531 break;
5533 case Iop_32Uto64:
5534 /* 32Uto64( 8Uto32( x )) --> 8Uto64(x) */
5535 if (is_Unop(aa, Iop_8Uto32))
5536 return IRExpr_Unop(Iop_8Uto64, aa->Iex.Unop.arg);
5537 /* 32Uto64( 16Uto32( x )) --> 16Uto64(x) */
5538 if (is_Unop(aa, Iop_16Uto32))
5539 return IRExpr_Unop(Iop_16Uto64, aa->Iex.Unop.arg);
5540 /* 32Uto64(64to32( Shr64( 32Uto64(64to32(x)), sh )))
5541 --> Shr64( 32Uto64(64to32(x)), sh ) */
5542 if (is_Unop(aa, Iop_64to32)
5543 && is_Binop(aa->Iex.Unop.arg, Iop_Shr64)
5544 && is_Unop(aa->Iex.Unop.arg->Iex.Binop.arg1, Iop_32Uto64)
5545 && is_Unop(aa->Iex.Unop.arg->Iex.Binop.arg1->Iex.Unop.arg,
5546 Iop_64to32)) {
5547 return aa->Iex.Unop.arg;
5549 /* 32Uto64(64to32( Shl64( 32Uto64(64to32(x)), sh )))
5550 --> 32Uto64(64to32( Shl64( x, sh ))) */
5551 if (is_Unop(aa, Iop_64to32)
5552 && is_Binop(aa->Iex.Unop.arg, Iop_Shl64)
5553 && is_Unop(aa->Iex.Unop.arg->Iex.Binop.arg1, Iop_32Uto64)
5554 && is_Unop(aa->Iex.Unop.arg->Iex.Binop.arg1->Iex.Unop.arg,
5555 Iop_64to32)) {
5556 return
5557 IRExpr_Unop(
5558 Iop_32Uto64,
5559 IRExpr_Unop(
5560 Iop_64to32,
5561 IRExpr_Binop(
5562 Iop_Shl64,
5563 aa->Iex.Unop.arg->Iex.Binop.arg1->Iex.Unop.arg->Iex.Unop.arg,
5564 aa->Iex.Unop.arg->Iex.Binop.arg2
5565 )));
5567 break;
5569 case Iop_1Sto32:
5570 /* 1Sto32( CmpNEZ8( 32to8( 1Uto32( CmpNEZ32( x ))))) -> CmpwNEZ32(x) */
5571 if (is_Unop(aa, Iop_CmpNEZ8)
5572 && is_Unop(aa->Iex.Unop.arg, Iop_32to8)
5573 && is_Unop(aa->Iex.Unop.arg->Iex.Unop.arg, Iop_1Uto32)
5574 && is_Unop(aa->Iex.Unop.arg->Iex.Unop.arg->Iex.Unop.arg,
5575 Iop_CmpNEZ32)) {
5576 return IRExpr_Unop( Iop_CmpwNEZ32,
5577 aa->Iex.Unop.arg->Iex.Unop.arg
5578 ->Iex.Unop.arg->Iex.Unop.arg);
5580 break;
5582 default:
5583 break;
5585 /* no reduction rule applies */
5586 return IRExpr_Unop( op, aa );
5589 static IRExpr* atbSubst_Expr ( ATmpInfo* env, IRExpr* e )
5591 IRExpr* e2;
5592 IRExpr** args2;
5593 Int i;
5595 switch (e->tag) {
5597 case Iex_CCall:
5598 args2 = shallowCopyIRExprVec(e->Iex.CCall.args);
5599 for (i = 0; args2[i]; i++)
5600 args2[i] = atbSubst_Expr(env,args2[i]);
5601 return IRExpr_CCall(
5602 e->Iex.CCall.cee,
5603 e->Iex.CCall.retty,
5604 args2
5606 case Iex_RdTmp:
5607 e2 = atbSubst_Temp(env, e->Iex.RdTmp.tmp);
5608 return e2 ? e2 : e;
5609 case Iex_ITE:
5610 return IRExpr_ITE(
5611 atbSubst_Expr(env, e->Iex.ITE.cond),
5612 atbSubst_Expr(env, e->Iex.ITE.iftrue),
5613 atbSubst_Expr(env, e->Iex.ITE.iffalse)
5615 case Iex_Qop:
5616 return IRExpr_Qop(
5617 e->Iex.Qop.details->op,
5618 atbSubst_Expr(env, e->Iex.Qop.details->arg1),
5619 atbSubst_Expr(env, e->Iex.Qop.details->arg2),
5620 atbSubst_Expr(env, e->Iex.Qop.details->arg3),
5621 atbSubst_Expr(env, e->Iex.Qop.details->arg4)
5623 case Iex_Triop:
5624 return IRExpr_Triop(
5625 e->Iex.Triop.details->op,
5626 atbSubst_Expr(env, e->Iex.Triop.details->arg1),
5627 atbSubst_Expr(env, e->Iex.Triop.details->arg2),
5628 atbSubst_Expr(env, e->Iex.Triop.details->arg3)
5630 case Iex_Binop:
5631 return fold_IRExpr_Binop(
5632 e->Iex.Binop.op,
5633 atbSubst_Expr(env, e->Iex.Binop.arg1),
5634 atbSubst_Expr(env, e->Iex.Binop.arg2)
5636 case Iex_Unop:
5637 return fold_IRExpr_Unop(
5638 e->Iex.Unop.op,
5639 atbSubst_Expr(env, e->Iex.Unop.arg)
5641 case Iex_Load:
5642 return IRExpr_Load(
5643 e->Iex.Load.end,
5644 e->Iex.Load.ty,
5645 atbSubst_Expr(env, e->Iex.Load.addr)
5647 case Iex_GetI:
5648 return IRExpr_GetI(
5649 e->Iex.GetI.descr,
5650 atbSubst_Expr(env, e->Iex.GetI.ix),
5651 e->Iex.GetI.bias
5653 case Iex_Const:
5654 case Iex_Get:
5655 return e;
5656 default:
5657 vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
5658 vpanic("atbSubst_Expr");
5662 /* Same deal as atbSubst_Expr, except for stmts. */
5664 static IRStmt* atbSubst_Stmt ( ATmpInfo* env, IRStmt* st )
5666 Int i;
5667 IRDirty *d, *d2;
5668 IRCAS *cas, *cas2;
5669 IRPutI *puti, *puti2;
5671 switch (st->tag) {
5672 case Ist_AbiHint:
5673 return IRStmt_AbiHint(
5674 atbSubst_Expr(env, st->Ist.AbiHint.base),
5675 st->Ist.AbiHint.len,
5676 atbSubst_Expr(env, st->Ist.AbiHint.nia)
5678 case Ist_Store:
5679 return IRStmt_Store(
5680 st->Ist.Store.end,
5681 atbSubst_Expr(env, st->Ist.Store.addr),
5682 atbSubst_Expr(env, st->Ist.Store.data)
5684 case Ist_StoreG: {
5685 IRStoreG* sg = st->Ist.StoreG.details;
5686 return IRStmt_StoreG(sg->end,
5687 atbSubst_Expr(env, sg->addr),
5688 atbSubst_Expr(env, sg->data),
5689 atbSubst_Expr(env, sg->guard));
5691 case Ist_LoadG: {
5692 IRLoadG* lg = st->Ist.LoadG.details;
5693 return IRStmt_LoadG(lg->end, lg->cvt, lg->dst,
5694 atbSubst_Expr(env, lg->addr),
5695 atbSubst_Expr(env, lg->alt),
5696 atbSubst_Expr(env, lg->guard));
5698 case Ist_WrTmp:
5699 return IRStmt_WrTmp(
5700 st->Ist.WrTmp.tmp,
5701 atbSubst_Expr(env, st->Ist.WrTmp.data)
5703 case Ist_Put:
5704 return IRStmt_Put(
5705 st->Ist.Put.offset,
5706 atbSubst_Expr(env, st->Ist.Put.data)
5708 case Ist_PutI:
5709 puti = st->Ist.PutI.details;
5710 puti2 = mkIRPutI(puti->descr,
5711 atbSubst_Expr(env, puti->ix),
5712 puti->bias,
5713 atbSubst_Expr(env, puti->data));
5714 return IRStmt_PutI(puti2);
5716 case Ist_Exit:
5717 return IRStmt_Exit(
5718 atbSubst_Expr(env, st->Ist.Exit.guard),
5719 st->Ist.Exit.jk,
5720 st->Ist.Exit.dst,
5721 st->Ist.Exit.offsIP
5723 case Ist_IMark:
5724 return IRStmt_IMark(st->Ist.IMark.addr,
5725 st->Ist.IMark.len,
5726 st->Ist.IMark.delta);
5727 case Ist_NoOp:
5728 return IRStmt_NoOp();
5729 case Ist_MBE:
5730 return IRStmt_MBE(st->Ist.MBE.event);
5731 case Ist_CAS:
5732 cas = st->Ist.CAS.details;
5733 cas2 = mkIRCAS(
5734 cas->oldHi, cas->oldLo, cas->end,
5735 atbSubst_Expr(env, cas->addr),
5736 cas->expdHi ? atbSubst_Expr(env, cas->expdHi) : NULL,
5737 atbSubst_Expr(env, cas->expdLo),
5738 cas->dataHi ? atbSubst_Expr(env, cas->dataHi) : NULL,
5739 atbSubst_Expr(env, cas->dataLo)
5741 return IRStmt_CAS(cas2);
5742 case Ist_LLSC:
5743 return IRStmt_LLSC(
5744 st->Ist.LLSC.end,
5745 st->Ist.LLSC.result,
5746 atbSubst_Expr(env, st->Ist.LLSC.addr),
5747 st->Ist.LLSC.storedata
5748 ? atbSubst_Expr(env, st->Ist.LLSC.storedata) : NULL
5750 case Ist_Dirty:
5751 d = st->Ist.Dirty.details;
5752 d2 = emptyIRDirty();
5753 *d2 = *d;
5754 if (d2->mFx != Ifx_None)
5755 d2->mAddr = atbSubst_Expr(env, d2->mAddr);
5756 d2->guard = atbSubst_Expr(env, d2->guard);
5757 for (i = 0; d2->args[i]; i++) {
5758 IRExpr* arg = d2->args[i];
5759 if (LIKELY(!is_IRExpr_VECRET_or_GSPTR(arg)))
5760 d2->args[i] = atbSubst_Expr(env, arg);
5762 return IRStmt_Dirty(d2);
5763 default:
5764 vex_printf("\n"); ppIRStmt(st); vex_printf("\n");
5765 vpanic("atbSubst_Stmt");
5769 inline
5770 static Bool dirty_helper_stores ( const IRDirty *d )
5772 return d->mFx == Ifx_Write || d->mFx == Ifx_Modify;
5775 inline
5776 static Interval dirty_helper_puts (
5777 const IRDirty *d,
5778 Bool (*preciseMemExnsFn)(Int,Int,VexRegisterUpdates),
5779 VexRegisterUpdates pxControl,
5780 /*OUT*/Bool *requiresPreciseMemExns
5783 Int i;
5784 Interval interval;
5786 /* Passing the guest state pointer opens the door to modifying the
5787 guest state under the covers. It's not allowed, but let's be
5788 extra conservative and assume the worst. */
5789 for (i = 0; d->args[i]; i++) {
5790 if (UNLIKELY(d->args[i]->tag == Iex_GSPTR)) {
5791 *requiresPreciseMemExns = True;
5792 /* Assume all guest state is written. */
5793 interval.present = True;
5794 interval.low = 0;
5795 interval.high = 0x7FFFFFFF;
5796 return interval;
5800 /* Check the side effects on the guest state */
5801 interval.present = False;
5802 interval.low = interval.high = -1;
5803 *requiresPreciseMemExns = False;
5805 for (i = 0; i < d->nFxState; ++i) {
5806 if (d->fxState[i].fx != Ifx_Read) {
5807 Int offset = d->fxState[i].offset;
5808 Int size = d->fxState[i].size;
5809 Int nRepeats = d->fxState[i].nRepeats;
5810 Int repeatLen = d->fxState[i].repeatLen;
5812 if (preciseMemExnsFn(offset,
5813 offset + nRepeats * repeatLen + size - 1,
5814 pxControl)) {
5815 *requiresPreciseMemExns = True;
5817 update_interval(&interval, offset,
5818 offset + nRepeats * repeatLen + size - 1);
5822 return interval;
5825 /* Return an interval if st modifies the guest state. Via
5826 requiresPreciseMemExns return whether or not that modification
5827 requires precise exceptions. */
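/* For example, "PUT(72) = t5" with t5 of type I64 yields the
   interval [72,79], and *requiresPreciseMemExns is whatever
   preciseMemExnsFn reports for that byte range. */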
5828 static Interval stmt_modifies_guest_state (
5829 IRSB *bb, const IRStmt *st,
5830 Bool (*preciseMemExnsFn)(Int,Int,VexRegisterUpdates),
5831 VexRegisterUpdates pxControl,
5832 /*OUT*/Bool *requiresPreciseMemExns
5835 Interval interval;
5837 switch (st->tag) {
5838 case Ist_Put: {
5839 Int offset = st->Ist.Put.offset;
5840 Int size = sizeofIRType(typeOfIRExpr(bb->tyenv, st->Ist.Put.data));
5842 *requiresPreciseMemExns
5843 = preciseMemExnsFn(offset, offset + size - 1, pxControl);
5844 interval.present = True;
5845 interval.low = offset;
5846 interval.high = offset + size - 1;
5847 return interval;
5850 case Ist_PutI: {
5851 IRRegArray *descr = st->Ist.PutI.details->descr;
5852 Int offset = descr->base;
5853 Int size = sizeofIRType(descr->elemTy);
5855 /* We quietly assume here that all segments are contiguous and there
5856 are no holes. This is to avoid a loop. The assumption is conservative
5857 in the sense that we might report that precise memory exceptions are
5858 needed when in fact they are not. */
5859 *requiresPreciseMemExns
5860 = preciseMemExnsFn(offset, offset + descr->nElems * size - 1,
5861 pxControl);
5862 interval.present = True;
5863 interval.low = offset;
5864 interval.high = offset + descr->nElems * size - 1;
5865 return interval;
5868 case Ist_Dirty:
5869 return dirty_helper_puts(st->Ist.Dirty.details,
5870 preciseMemExnsFn, pxControl,
5871 requiresPreciseMemExns);
5873 default:
5874 *requiresPreciseMemExns = False;
5875 interval.present = False;
5876 interval.low = -1;
5877 interval.high = -1;
5878 return interval;
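
/* Illustration (hypothetical offsets): a "PUT(16) = t3:I64" is reported
   by stmt_modifies_guest_state as the interval [16,23].  In
   ado_treebuild_BB below, a binding whose GET interval is, say,
   [20,27] overlaps [16,23] and is therefore flushed out before the
   PUT is emitted, whereas a binding reading only [32,39] is left
   in place. */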
5882 /* notstatic */ Addr ado_treebuild_BB (
5883 IRSB* bb,
5884 Bool (*preciseMemExnsFn)(Int,Int,VexRegisterUpdates),
5885 VexRegisterUpdates pxControl
5888 Int i, j, k, m;
5889 Bool stmtStores, invalidateMe;
5890 Interval putInterval;
5891 IRStmt* st;
5892 IRStmt* st2;
5893 ATmpInfo env[A_NENV];
5895 Bool max_ga_known = False;
5896 Addr max_ga = 0;
5898 Int n_tmps = bb->tyenv->types_used;
5899 UShort* uses = LibVEX_Alloc_inline(n_tmps * sizeof(UShort));
5901 /* Phase 1. Scan forwards in bb, counting use occurrences of each
5902 temp. Also count occurrences in the bb->next field. Take the
5903 opportunity to also find the maximum guest address in the block,
5904 since that will be needed later for deciding when we can safely
5905 elide event checks. */
5907 for (i = 0; i < n_tmps; i++)
5908 uses[i] = 0;
5910 for (i = 0; i < bb->stmts_used; i++) {
5911 st = bb->stmts[i];
5912 switch (st->tag) {
5913 case Ist_NoOp:
5914 continue;
5915 case Ist_IMark: {
5916 UInt len = st->Ist.IMark.len;
5917 Addr mga = st->Ist.IMark.addr + (len < 1 ? 1 : len) - 1;
5918 max_ga_known = True;
5919 if (mga > max_ga)
5920 max_ga = mga;
5921 break;
5923 default:
5924 break;
5926 aoccCount_Stmt( uses, st );
5928 aoccCount_Expr(uses, bb->next );
5930 # if 0
5931 for (i = 0; i < n_tmps; i++) {
5932 if (uses[i] == 0)
5933 continue;
5934 ppIRTemp( (IRTemp)i );
5935 vex_printf(" used %d\n", (Int)uses[i] );
5937 # endif
5939 /* Phase 2. Scan forwards in bb. For each statement in turn:
5941 If the env is full, emit the end element. This guarantees
5942 there is at least one free slot in the following.
5944 On seeing 't = E', occ(t)==1,
5945 let E'=env(E)
5946 delete this stmt
5947 add t -> E' to the front of the env
5948 Examine E' and set the hints for E' appropriately
5949 (doesLoad? doesGet?)
5951 On seeing any other stmt,
5952 let stmt' = env(stmt)
5953 remove from env any 't=E' binds invalidated by stmt
5954 emit the invalidated stmts
5955 emit stmt'
5956 compact any holes in env
5957 by sliding entries towards the front
5959 Finally, apply env to bb->next.
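
      For example (purely illustrative IR), given

         t1 = GET:I32(8)
         t2 = Add32(t1,0x4:I32)
         PUT(0) = t2

      with t1 and t2 each used exactly once, both bindings are absorbed
      into the env and the block is rewritten as

         PUT(0) = Add32(GET:I32(8),0x4:I32)

      Had a store, or a PUT requiring precise exceptions, intervened
      between 't1 = GET:I32(8)' and its use, the t1 binding would have
      been flushed back out as an ordinary WrTmp instead.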
5962 for (i = 0; i < A_NENV; i++) {
5963 env[i].bindee = NULL;
5964 env[i].binder = IRTemp_INVALID;
5967 /* The stmts in bb are being reordered, and we are guaranteed to
5968 end up with no more than the number we started with. Use i to
5969 be the cursor of the current stmt examined and j <= i to be that
5970 for the current stmt being written.
5972 j = 0;
5973 for (i = 0; i < bb->stmts_used; i++) {
5975 st = bb->stmts[i];
5976 if (st->tag == Ist_NoOp)
5977 continue;
5979 /* Ensure there's at least one space in the env, by emitting
5980 the oldest binding if necessary. */
5981 if (env[A_NENV-1].bindee != NULL) {
5982 bb->stmts[j] = IRStmt_WrTmp( env[A_NENV-1].binder,
5983 env[A_NENV-1].bindee );
5984 j++;
5985 vassert(j <= i);
5986 env[A_NENV-1].bindee = NULL;
5989 /* Consider current stmt. */
5990 if (st->tag == Ist_WrTmp && uses[st->Ist.WrTmp.tmp] <= 1) {
5991 IRExpr *e, *e2;
5993 /* optional extra: dump dead bindings as we find them.
5994 Removes the need for a prior dead-code removal pass. */
5995 if (uses[st->Ist.WrTmp.tmp] == 0) {
5996 if (0) vex_printf("DEAD binding\n");
5997 continue; /* for (i = 0; i < bb->stmts_used; i++) loop */
5999 vassert(uses[st->Ist.WrTmp.tmp] == 1);
6001 /* ok, we have 't = E', occ(t)==1. Do the abovementioned
6002 actions. */
6003 e = st->Ist.WrTmp.data;
6004 e2 = atbSubst_Expr(env, e);
6005 addToEnvFront(env, st->Ist.WrTmp.tmp, e2);
6006 setHints_Expr(&env[0].doesLoad, &env[0].getInterval, e2);
6007 /* don't advance j, as we are deleting this stmt and instead
6008 holding it temporarily in the env. */
6009 continue; /* for (i = 0; i < bb->stmts_used; i++) loop */
6012 /* we get here for any other kind of statement. */
6013 /* 'use up' any bindings required by the current statement. */
6014 st2 = atbSubst_Stmt(env, st);
6016 /* Now, before this stmt, dump any bindings in env that it
6017 invalidates. These need to be dumped in the order in which
6018 they originally entered env -- that means from oldest to
6019 youngest. */
6021         /* putInterval/stmtStores characterise what the stmt under
6022            consideration does, or might do (erring towards True is safe). */
6024 Bool putRequiresPreciseMemExns;
6025 putInterval = stmt_modifies_guest_state(
6026 bb, st, preciseMemExnsFn, pxControl,
6027 &putRequiresPreciseMemExns
6030         /* stmtStores is True if this stmt writes memory or might do so (==> we don't
6031 want to reorder other loads or stores relative to it). Also,
6032 both LL and SC fall under this classification, since we
6033 really ought to be conservative and not reorder any other
6034 memory transactions relative to them. */
6035 stmtStores
6036 = toBool( st->tag == Ist_Store
6037 || (st->tag == Ist_Dirty
6038 && dirty_helper_stores(st->Ist.Dirty.details))
6039 || st->tag == Ist_LLSC
6040 || st->tag == Ist_CAS );
6042 for (k = A_NENV-1; k >= 0; k--) {
6043 if (env[k].bindee == NULL)
6044 continue;
6045 /* Compare the actions of this stmt with the actions of
6046 binding 'k', to see if they invalidate the binding. */
6047 invalidateMe
6048 = toBool(
6049 /* a store invalidates loaded data */
6050 (env[k].doesLoad && stmtStores)
6051 /* a put invalidates get'd data, if they overlap */
6052 || ((env[k].getInterval.present && putInterval.present) &&
6053 intervals_overlap(env[k].getInterval, putInterval))
6054               /* a put invalidates loaded data. That means, in essence, that
6055                  a load expression cannot be substituted into a statement
6056                  that follows the put. But there is nothing wrong with doing
6057                  so except when the put statement requires precise exceptions.
6058 Think of a load that is moved past a put where the put
6059 updates the IP in the guest state. If the load generates
6060 a segfault, the wrong address (line number) would be
6061 reported. */
6062 || (env[k].doesLoad && putInterval.present &&
6063 putRequiresPreciseMemExns)
6064 /* probably overly conservative: a memory bus event
6065 invalidates absolutely everything, so that all
6066 computation prior to it is forced to complete before
6067 proceeding with the event (fence,lock,unlock). */
6068 || st->tag == Ist_MBE
6069 /* also be (probably overly) paranoid re AbiHints */
6070 || st->tag == Ist_AbiHint
6072 if (invalidateMe) {
6073 bb->stmts[j] = IRStmt_WrTmp( env[k].binder, env[k].bindee );
6074 j++;
6075 vassert(j <= i);
6076 env[k].bindee = NULL;
6080 /* Slide in-use entries in env up to the front */
6081 m = 0;
6082 for (k = 0; k < A_NENV; k++) {
6083 if (env[k].bindee != NULL) {
6084 env[m] = env[k];
6085 m++;
6088 for (m = m; m < A_NENV; m++) {
6089 env[m].bindee = NULL;
6092 /* finally, emit the substituted statement */
6093 bb->stmts[j] = st2;
6094 /* vex_printf("**2 "); ppIRStmt(bb->stmts[j]); vex_printf("\n"); */
6095 j++;
6097 vassert(j <= i+1);
6098 } /* for each stmt in the original bb ... */
6100    /* Finally ... substitute the ->next field as much as possible, and
6101       dump any left-over bindings.  Hmm.  Perhaps there should be no
6102       left-over bindings?  Or are any left-over bindings
6103       by definition dead? */
6104 bb->next = atbSubst_Expr(env, bb->next);
6105 bb->stmts_used = j;
6107 return max_ga_known ? max_ga : ~(Addr)0;
6111 /*---------------------------------------------------------------*/
6112 /*--- MSVC specific transformation hacks ---*/
6113 /*---------------------------------------------------------------*/
6115 /* The purpose of all this is to find MSVC's idiom for non-constant
6116 bitfield assignment, "a ^ ((a ^ b) & c)", and transform it into
6117    gcc's idiom "(a & ~c) | (b & c)".  The motivation is that Memcheck
6118    generates a lot of false positives from the MSVC version because it
6119 doesn't understand that XORing an undefined bit with itself gives a
6120 defined result.
6122 This isn't a problem for the simple case "x ^ x", because iropt
6123 folds it to a constant zero before Memcheck ever sees it. But in
6124 this case we have an intervening "& c" which defeats the simple
6125 case. So we have to carefully inspect all expressions rooted at an
6126 XOR to see if any of them match "a ^ ((a ^ b) & c)", or any of the
6127 7 other variants resulting from swapping the order of arguments to
6128 the three binary operations. If we get a match, we then replace
6129 the tree with "(a & ~c) | (b & c)", and Memcheck is happy.
6131 The key difficulty is to spot the two uses of "a". To normalise
6132 the IR to maximise the chances of success, we first do a CSE pass,
6133 with CSEing of loads enabled, since the two "a" expressions may be
6134 loads, which need to be commoned up. Then we do a constant folding
6135 pass, so as to remove any tmp-to-tmp assignment chains that would
6136 make matching the original expression more difficult.
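
/* Purely illustrative sketch (not part of the VEX build; the helper
   name is made up): exhaustively checks that the MSVC idiom and the
   gcc idiom agree for all 8-bit inputs, which is what justifies the
   rewrite performed below. */
#if 0
#include <assert.h>
static void check_bitfield_idioms_agree ( void )
{
   unsigned a, b, c;
   for (a = 0; a < 256; a++)
      for (b = 0; b < 256; b++)
         for (c = 0; c < 256; c++) {
            unsigned msvc = a ^ ((a ^ b) & c);    /* MSVC idiom */
            unsigned gcc_ = (a & ~c) | (b & c);   /* gcc idiom  */
            assert(msvc == gcc_);                 /* identical results */
         }
}
#endif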
6140 /* Helper function for debug printing */
6141 __attribute__((unused))
6142 static void print_flat_expr ( IRExpr** env, IRExpr* e )
6144 if (e == NULL) {
6145 vex_printf("?");
6146 return;
6148 switch (e->tag) {
6149 case Iex_Binop: {
6150 ppIROp(e->Iex.Binop.op);
6151 vex_printf("(");
6152 print_flat_expr(env, e->Iex.Binop.arg1);
6153 vex_printf(",");
6154 print_flat_expr(env, e->Iex.Binop.arg2);
6155 vex_printf(")");
6156 break;
6158 case Iex_Unop: {
6159 ppIROp(e->Iex.Unop.op);
6160 vex_printf("(");
6161 print_flat_expr(env, e->Iex.Unop.arg);
6162 vex_printf(")");
6163 break;
6165 case Iex_RdTmp:
6166 ppIRTemp(e->Iex.RdTmp.tmp);
6167 vex_printf("=");
6168 print_flat_expr(env, chase1(env, e));
6169 break;
6170 case Iex_Const:
6171 case Iex_CCall:
6172 case Iex_Load:
6173 case Iex_ITE:
6174 case Iex_Get:
6175 ppIRExpr(e);
6176 break;
6177 default:
6178 vex_printf("FAIL: "); ppIRExpr(e); vex_printf("\n");
6179 vassert(0);
6183 /* Spot a ^ ((a ^ b) & c) for a,b and c tmp-or-const (atoms)
6184 or any of the other 7 variants generated by switching the order
6185 of arguments to the outer ^, the inner ^ and the &.
6187 static UInt spotBitfieldAssignment ( /*OUT*/IRExpr** aa, /*OUT*/IRExpr** bb,
6188 /*OUT*/IRExpr** cc,
6189 IRExpr** env, IRExpr* e,
6190 IROp opAND, IROp opXOR)
6192 # define ISBIN(_e,_op) ((_e) && (_e)->tag == Iex_Binop \
6193 && (_e)->Iex.Binop.op == (_op))
6194 # define ISATOM(_e) isIRAtom(_e)
6195 # define STEP(_e) chase1(env, (_e))
6196 # define LL(_e) ((_e)->Iex.Binop.arg1)
6197 # define RR(_e) ((_e)->Iex.Binop.arg2)
6199 IRExpr *a1, *and, *xor, *c, *a2bL, *a2bR;
6201 /* This is common to all 8 cases */
6202 if (!ISBIN(e, opXOR)) goto fail;
6204 /* -----and------ */
6205 /* --xor--- */
6206 /* find variant 1: a1 ^ ((a2 ^ b) & c) */
6207 /* find variant 2: a1 ^ ((b ^ a2) & c) */
6208 a1 = and = xor = c = a2bL = a2bR = NULL;
6210 a1 = LL(e);
6211 and = STEP(RR(e));
6212 if (!ISBIN(and, opAND)) goto v34;
6213 xor = STEP(LL(and));
6214 c = RR(and);
6215 if (!ISBIN(xor, opXOR)) goto v34;
6216 a2bL = LL(xor);
6217 a2bR = RR(xor);
6219 if (eqIRAtom(a1, a2bL) && !eqIRAtom(a1, a2bR)) {
6220 *aa = a1;
6221 *bb = a2bR;
6222 *cc = c;
6223 return 1;
6225 if (eqIRAtom(a1, a2bR) && !eqIRAtom(a1, a2bL)) {
6226 *aa = a1;
6227 *bb = a2bL;
6228 *cc = c;
6229 return 2;
6232 v34:
6233 /* -----and------ */
6234 /* --xor--- */
6235 /* find variant 3: ((a2 ^ b) & c) ^ a1 */
6236 /* find variant 4: ((b ^ a2) & c) ^ a1 */
6237 a1 = and = xor = c = a2bL = a2bR = NULL;
6239 a1 = RR(e);
6240 and = STEP(LL(e));
6241 if (!ISBIN(and, opAND)) goto v56;
6242 xor = STEP(LL(and));
6243 c = RR(and);
6244 if (!ISBIN(xor, opXOR)) goto v56;
6245 a2bL = LL(xor);
6246 a2bR = RR(xor);
6248 if (eqIRAtom(a1, a2bL) && !eqIRAtom(a1, a2bR)) {
6249 *aa = a1;
6250 *bb = a2bR;
6251 *cc = c;
6252 return 3;
6254 if (eqIRAtom(a1, a2bR) && !eqIRAtom(a1, a2bL)) {
6255 *aa = a1;
6256 *bb = a2bL;
6257 *cc = c;
6258 return 4;
6261 v56:
6262 /* -----and------ */
6263 /* --xor--- */
6264 /* find variant 5: a1 ^ (c & (a2 ^ b)) */
6265 /* find variant 6: a1 ^ (c & (b ^ a2)) */
6266 a1 = and = xor = c = a2bL = a2bR = NULL;
6268 a1 = LL(e);
6269 and = STEP(RR(e));
6270 if (!ISBIN(and, opAND)) goto v78;
6271 xor = STEP(RR(and));
6272 c = LL(and);
6273 if (!ISBIN(xor, opXOR)) goto v78;
6274 a2bL = LL(xor);
6275 a2bR = RR(xor);
6277 if (eqIRAtom(a1, a2bL) && !eqIRAtom(a1, a2bR)) {
6278 *aa = a1;
6279 *bb = a2bR;
6280 *cc = c;
6281 vassert(5-5); // ATC
6282 return 5;
6284 if (eqIRAtom(a1, a2bR) && !eqIRAtom(a1, a2bL)) {
6285 *aa = a1;
6286 *bb = a2bL;
6287 *cc = c;
6288 vassert(6-6); // ATC
6289 return 6;
6292 v78:
6293 /* -----and------ */
6294 /* --xor--- */
6295 /* find variant 7: (c & (a2 ^ b)) ^ a1 */
6296 /* find variant 8: (c & (b ^ a2)) ^ a1 */
6297 a1 = and = xor = c = a2bL = a2bR = NULL;
6299 a1 = RR(e);
6300 and = STEP(LL(e));
6301 if (!ISBIN(and, opAND)) goto fail;
6302 xor = STEP(RR(and));
6303 c = LL(and);
6304 if (!ISBIN(xor, opXOR)) goto fail;
6305 a2bL = LL(xor);
6306 a2bR = RR(xor);
6308 if (eqIRAtom(a1, a2bL) && !eqIRAtom(a1, a2bR)) {
6309 *aa = a1;
6310 *bb = a2bR;
6311 *cc = c;
6312 return 7;
6314 if (eqIRAtom(a1, a2bR) && !eqIRAtom(a1, a2bL)) {
6315 *aa = a1;
6316 *bb = a2bL;
6317 *cc = c;
6318 return 8;
6321 fail:
6322 return 0;
6324 # undef ISBIN
6325 # undef ISATOM
6326 # undef STEP
6327 # undef LL
6328 # undef RR
6331 /* If |e| is of the form a ^ ((a ^ b) & c) (or any of the 7 other
6332 variants thereof generated by switching arguments around), return
6333 the IRExpr* for (a & ~c) | (b & c). Else return NULL. */
6334 static IRExpr* do_XOR_TRANSFORMS_IRExpr ( IRExpr** env, IRExpr* e )
6336 if (e->tag != Iex_Binop)
6337 return NULL;
6339 const HChar* tyNm = NULL;
6340 IROp opOR = Iop_INVALID;
6341 IROp opAND = Iop_INVALID;
6342 IROp opNOT = Iop_INVALID;
6343 IROp opXOR = Iop_INVALID;
6344 switch (e->Iex.Binop.op) {
6345 case Iop_Xor32:
6346 tyNm = "I32";
6347 opOR = Iop_Or32; opAND = Iop_And32;
6348 opNOT = Iop_Not32; opXOR = Iop_Xor32;
6349 break;
6350 case Iop_Xor16:
6351 tyNm = "I16";
6352 opOR = Iop_Or16; opAND = Iop_And16;
6353 opNOT = Iop_Not16; opXOR = Iop_Xor16;
6354 break;
6355 case Iop_Xor8:
6356 tyNm = "I8";
6357 opOR = Iop_Or8; opAND = Iop_And8;
6358 opNOT = Iop_Not8; opXOR = Iop_Xor8;
6359 break;
6360 default:
6361 return NULL;
6364 IRExpr* aa = NULL;
6365 IRExpr* bb = NULL;
6366 IRExpr* cc = NULL;
6367 UInt variant = spotBitfieldAssignment(&aa, &bb, &cc, env, e, opAND, opXOR);
6368 if (variant > 0) {
6369 static UInt ctr = 0;
6370 if (0)
6371 vex_printf("XXXXXXXXXX Bitfield Assignment number %u, "
6372 "type %s, variant %u\n",
6373 ++ctr, tyNm, variant);
6374 /* It's vitally important that the returned aa, bb and cc are
6375 atoms -- either constants or tmps. If it's anything else
6376         (eg, a GET) then incorporating them in a tree at this point
6377         in the SB may erroneously pull them forwards (eg, past a PUT
6378         that originally came after the GET) and so transform the IR
6379 wrongly. spotBitfieldAssignment should guarantee only to
6380 give us atoms, but we check here anyway. */
6381 vassert(aa && isIRAtom(aa));
6382 vassert(bb && isIRAtom(bb));
6383 vassert(cc && isIRAtom(cc));
6384 return IRExpr_Binop(
6385 opOR,
6386 IRExpr_Binop(opAND, aa, IRExpr_Unop(opNOT, cc)),
6387 IRExpr_Binop(opAND, bb, cc)
6390 return NULL;
6394 /* SB is modified in-place. Visit all the IRExprs and, for those
6395 which are allowed to be non-atomic, perform the XOR transform if
6396    possible.  This makes |sb| non-flat, but that's OK; the caller
6397 can re-flatten it. Returns True iff any changes were made. */
6398 static Bool do_XOR_TRANSFORM_IRSB ( IRSB* sb )
6400 Int i;
6401 Bool changed = False;
6403 /* Make the tmp->expr environment, so we can use it for
6404 chasing expressions. */
6405 Int n_tmps = sb->tyenv->types_used;
6406 IRExpr** env = LibVEX_Alloc_inline(n_tmps * sizeof(IRExpr*));
6407 for (i = 0; i < n_tmps; i++)
6408 env[i] = NULL;
6410 for (i = 0; i < sb->stmts_used; i++) {
6411 IRStmt* st = sb->stmts[i];
6412 if (st->tag != Ist_WrTmp)
6413 continue;
6414 IRTemp t = st->Ist.WrTmp.tmp;
6415 vassert(t < n_tmps);
6416 env[t] = st->Ist.WrTmp.data;
6419 for (i = 0; i < sb->stmts_used; i++) {
6420 IRStmt* st = sb->stmts[i];
6422 switch (st->tag) {
6423 case Ist_AbiHint:
6424 vassert(isIRAtom(st->Ist.AbiHint.base));
6425 vassert(isIRAtom(st->Ist.AbiHint.nia));
6426 break;
6427 case Ist_Put:
6428 vassert(isIRAtom(st->Ist.Put.data));
6429 break;
6430 case Ist_PutI: {
6431 IRPutI* puti = st->Ist.PutI.details;
6432 vassert(isIRAtom(puti->ix));
6433 vassert(isIRAtom(puti->data));
6434 break;
6436 case Ist_WrTmp: {
6437 /* This is the one place where an expr (st->Ist.WrTmp.data) is
6438 allowed to be more than just a constant or a tmp. */
6439 IRExpr* mb_new_data
6440 = do_XOR_TRANSFORMS_IRExpr(env, st->Ist.WrTmp.data);
6441 if (mb_new_data) {
6442 //ppIRSB(sb);
6443 st->Ist.WrTmp.data = mb_new_data;
6444 //ppIRSB(sb);
6445 changed = True;
6447 break;
6449 case Ist_Store:
6450 vassert(isIRAtom(st->Ist.Store.addr));
6451 vassert(isIRAtom(st->Ist.Store.data));
6452 break;
6453 case Ist_StoreG: {
6454 IRStoreG* sg = st->Ist.StoreG.details;
6455 vassert(isIRAtom(sg->addr));
6456 vassert(isIRAtom(sg->data));
6457 vassert(isIRAtom(sg->guard));
6458 break;
6460 case Ist_LoadG: {
6461 IRLoadG* lg = st->Ist.LoadG.details;
6462 vassert(isIRAtom(lg->addr));
6463 vassert(isIRAtom(lg->alt));
6464 vassert(isIRAtom(lg->guard));
6465 break;
6467 case Ist_CAS: {
6468 IRCAS* cas = st->Ist.CAS.details;
6469 vassert(isIRAtom(cas->addr));
6470 vassert(cas->expdHi == NULL || isIRAtom(cas->expdHi));
6471 vassert(isIRAtom(cas->expdLo));
6472 vassert(cas->dataHi == NULL || isIRAtom(cas->dataHi));
6473 vassert(isIRAtom(cas->dataLo));
6474 break;
6476 case Ist_LLSC:
6477 vassert(isIRAtom(st->Ist.LLSC.addr));
6478 if (st->Ist.LLSC.storedata)
6479 vassert(isIRAtom(st->Ist.LLSC.storedata));
6480 break;
6481 case Ist_Dirty: {
6482 IRDirty* d = st->Ist.Dirty.details;
6483 if (d->mFx != Ifx_None) {
6484 vassert(isIRAtom(d->mAddr));
6486 vassert(isIRAtom(d->guard));
6487 for (Int j = 0; d->args[j]; j++) {
6488 IRExpr* arg = d->args[j];
6489 if (LIKELY(!is_IRExpr_VECRET_or_GSPTR(arg))) {
6490 vassert(isIRAtom(arg));
6493 break;
6495 case Ist_IMark:
6496 case Ist_NoOp:
6497 case Ist_MBE:
6498 break;
6499 case Ist_Exit:
6500 vassert(isIRAtom(st->Ist.Exit.guard));
6501 break;
6502 default:
6503 vex_printf("\n"); ppIRStmt(st);
6504 vpanic("do_XOR_TRANSFORMS_IRSB");
6508 vassert(isIRAtom(sb->next));
6509 return changed;
6513 static IRSB* do_MSVC_HACKS ( IRSB* sb )
6515 // Normalise as much as we can. This is the one-and-only place
6516 // where we call do_cse_BB with allowLoadsToBeCSEd set to True.
6517 Bool any_cse_changes = do_cse_BB( sb, True/*allowLoadsToBeCSEd*/ );
6518 if (any_cse_changes) {
6519 // CSEing might have created dead code. Remove it.
6520 sb = cprop_BB ( sb );
6521 do_deadcode_BB(sb);
6524 // Visit all atoms, do the transformation proper. bb is modified
6525 // in-place.
6526 Bool changed = do_XOR_TRANSFORM_IRSB(sb);
6528 if (changed) {
6529 // The transformation generates non-flat expressions, so we now
6530 // need to re-flatten the block.
6531 sb = flatten_BB(sb);
6534 return sb;
6538 /*---------------------------------------------------------------*/
6539 /*--- iropt main ---*/
6540 /*---------------------------------------------------------------*/
6542 static Bool iropt_verbose = False; /* True; */
6545 /* Do a simple cleanup pass on bb. This is: redundant Get removal,
6546 redundant Put removal, constant propagation, dead code removal,
6547 clean helper specialisation, and dead code removal (again).
6551 static
6552 IRSB* cheap_transformations (
6553 IRSB* bb,
6554 IRExpr* (*specHelper) (const HChar*, IRExpr**, IRStmt**, Int),
6555 Bool (*preciseMemExnsFn)(Int,Int,VexRegisterUpdates),
6556 VexRegisterUpdates pxControl
6559 redundant_get_removal_BB ( bb );
6560 if (iropt_verbose) {
6561 vex_printf("\n========= REDUNDANT GET\n\n" );
6562 ppIRSB(bb);
6565 if (pxControl < VexRegUpdAllregsAtEachInsn) {
6566 redundant_put_removal_BB ( bb, preciseMemExnsFn, pxControl );
6568 if (iropt_verbose) {
6569 vex_printf("\n========= REDUNDANT PUT\n\n" );
6570 ppIRSB(bb);
6573 bb = cprop_BB ( bb );
6574 if (iropt_verbose) {
6575 vex_printf("\n========= CPROPD\n\n" );
6576 ppIRSB(bb);
6579 do_deadcode_BB ( bb );
6580 if (iropt_verbose) {
6581 vex_printf("\n========= DEAD\n\n" );
6582 ppIRSB(bb);
6585 bb = spec_helpers_BB ( bb, specHelper );
6586 do_deadcode_BB ( bb );
6587 if (iropt_verbose) {
6588 vex_printf("\n========= SPECd \n\n" );
6589 ppIRSB(bb);
6592 return bb;
6596 /* Do some more expensive transformations on bb, which are aimed at
6597 optimising as much as possible in the presence of GetI and PutI. */
6599 static
6600 IRSB* expensive_transformations( IRSB* bb, VexRegisterUpdates pxControl )
6602 (void)do_cse_BB( bb, False/*!allowLoadsToBeCSEd*/ );
6603 collapse_AddSub_chains_BB( bb );
6604 do_redundant_GetI_elimination( bb );
6605 if (pxControl < VexRegUpdAllregsAtEachInsn) {
6606 do_redundant_PutI_elimination( bb, pxControl );
6608 do_deadcode_BB( bb );
6609 return bb;
6613 /* Scan a flattened BB to look for signs that more expensive
6614 optimisations might be useful:
6615 - find out if there are any GetIs and PutIs
6616 - find out if there are any floating or vector-typed temporaries
6619 static void considerExpensives ( /*OUT*/Bool* hasGetIorPutI,
6620 /*OUT*/Bool* hasVorFtemps,
6621 IRSB* bb )
6623 Int i, j;
6624 IRStmt* st;
6625 IRDirty* d;
6626 IRCAS* cas;
6628 *hasGetIorPutI = False;
6629 *hasVorFtemps = False;
6631 for (i = 0; i < bb->stmts_used; i++) {
6632 st = bb->stmts[i];
6633 switch (st->tag) {
6634 case Ist_AbiHint:
6635 vassert(isIRAtom(st->Ist.AbiHint.base));
6636 vassert(isIRAtom(st->Ist.AbiHint.nia));
6637 break;
6638 case Ist_PutI:
6639 *hasGetIorPutI = True;
6640 break;
6641 case Ist_WrTmp:
6642 if (st->Ist.WrTmp.data->tag == Iex_GetI)
6643 *hasGetIorPutI = True;
6644 switch (typeOfIRTemp(bb->tyenv, st->Ist.WrTmp.tmp)) {
6645 case Ity_I1: case Ity_I8: case Ity_I16:
6646 case Ity_I32: case Ity_I64: case Ity_I128:
6647 break;
6648 case Ity_F16: case Ity_F32: case Ity_F64: case Ity_F128:
6649 case Ity_V128: case Ity_V256:
6650 *hasVorFtemps = True;
6651 break;
6652 case Ity_D32: case Ity_D64: case Ity_D128:
6653 *hasVorFtemps = True;
6654 break;
6655 default:
6656 goto bad;
6658 break;
6659 case Ist_Put:
6660 vassert(isIRAtom(st->Ist.Put.data));
6661 break;
6662 case Ist_Store:
6663 vassert(isIRAtom(st->Ist.Store.addr));
6664 vassert(isIRAtom(st->Ist.Store.data));
6665 break;
6666 case Ist_StoreG: {
6667 IRStoreG* sg = st->Ist.StoreG.details;
6668 vassert(isIRAtom(sg->addr));
6669 vassert(isIRAtom(sg->data));
6670 vassert(isIRAtom(sg->guard));
6671 break;
6673 case Ist_LoadG: {
6674 IRLoadG* lg = st->Ist.LoadG.details;
6675 vassert(isIRAtom(lg->addr));
6676 vassert(isIRAtom(lg->alt));
6677 vassert(isIRAtom(lg->guard));
6678 break;
6680 case Ist_CAS:
6681 cas = st->Ist.CAS.details;
6682 vassert(isIRAtom(cas->addr));
6683 vassert(cas->expdHi == NULL || isIRAtom(cas->expdHi));
6684 vassert(isIRAtom(cas->expdLo));
6685 vassert(cas->dataHi == NULL || isIRAtom(cas->dataHi));
6686 vassert(isIRAtom(cas->dataLo));
6687 break;
6688 case Ist_LLSC:
6689 vassert(isIRAtom(st->Ist.LLSC.addr));
6690 if (st->Ist.LLSC.storedata)
6691 vassert(isIRAtom(st->Ist.LLSC.storedata));
6692 break;
6693 case Ist_Dirty:
6694 d = st->Ist.Dirty.details;
6695 vassert(isIRAtom(d->guard));
6696 for (j = 0; d->args[j]; j++) {
6697 IRExpr* arg = d->args[j];
6698 if (LIKELY(!is_IRExpr_VECRET_or_GSPTR(arg)))
6699 vassert(isIRAtom(arg));
6701 if (d->mFx != Ifx_None)
6702 vassert(isIRAtom(d->mAddr));
6703 break;
6704 case Ist_NoOp:
6705 case Ist_IMark:
6706 case Ist_MBE:
6707 break;
6708 case Ist_Exit:
6709 vassert(isIRAtom(st->Ist.Exit.guard));
6710 break;
6711 default:
6712 bad:
6713 ppIRStmt(st);
6714 vpanic("considerExpensives");
6720 /* ---------------- The main iropt entry point. ---------------- */
6722 /* exported from this file */
6723 /* Rules of the game:
6725 - IRExpr/IRStmt trees should be treated as immutable, as they
6726 may get shared. So never change a field of such a tree node;
6727 instead construct and return a new one if needed.
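
      For instance (illustrative only): to change the data field of an
      Ist_Put, build IRStmt_Put(st->Ist.Put.offset, newData) and use
      that in place of the old statement, rather than assigning to
      st->Ist.Put.data directly.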
6729 IRSB* do_iropt_BB(
6730 IRSB* bb0,
6731 IRExpr* (*specHelper) (const HChar*, IRExpr**, IRStmt**, Int),
6732 Bool (*preciseMemExnsFn)(Int,Int,VexRegisterUpdates),
6733 VexRegisterUpdates pxControl,
6734 Addr guest_addr,
6735 VexArch guest_arch
6738 static Int n_total = 0;
6739 static Int n_expensive = 0;
6741 Bool hasGetIorPutI, hasVorFtemps;
6743 n_total++;
6745 /* Flatness: this function assumes that the incoming block is already flat.
6746 That's because all blocks that arrive here should already have been
6747 processed by do_minimal_initial_iropt_BB. And that will have flattened
6748 them out. */
6749 // FIXME Remove this assertion once the 'grail' machinery seems stable
6750 // FIXME2 The TOC-redirect-hacks generators in m_translate.c -- gen_PUSH()
6751 // and gen_PO() -- don't generate flat IR, and so cause this assertion
6752 // to fail. For the time being, hack around this by flattening,
6753 // rather than asserting for flatness, on the afflicted platforms.
6754 // This is a kludge, yes.
6755 if (guest_arch == VexArchPPC64) {
6756 bb0 = flatten_BB(bb0); // Kludge!
6757 } else {
6758 vassert(isFlatIRSB(bb0)); // How it Really Should Be (tm).
6761 /* If at level 0, stop now. */
6762 if (vex_control.iropt_level <= 0) return bb0;
6764 /* Now do a preliminary cleanup pass, and figure out if we also
6765 need to do 'expensive' optimisations. Expensive optimisations
6766 are deemed necessary if the block contains any GetIs or PutIs.
6767 If needed, do expensive transformations and then another cheap
6768 cleanup pass. */
6770 IRSB* bb = cheap_transformations( bb0, specHelper,
6771 preciseMemExnsFn, pxControl );
6773 if (guest_arch == VexArchARM) {
6774 /* Translating Thumb2 code produces a lot of chaff. We have to
6775 work extra hard to get rid of it. */
6776 bb = cprop_BB(bb);
6777 bb = spec_helpers_BB ( bb, specHelper );
6778 if (pxControl < VexRegUpdAllregsAtEachInsn) {
6779 redundant_put_removal_BB ( bb, preciseMemExnsFn, pxControl );
6781 do_cse_BB( bb, False/*!allowLoadsToBeCSEd*/ );
6782 do_deadcode_BB( bb );
6785 if (vex_control.iropt_level > 1) {
6787 /* Peer at what we have, to decide how much more effort to throw
6788 at it. */
6789 considerExpensives( &hasGetIorPutI, &hasVorFtemps, bb );
6791 if (hasVorFtemps && !hasGetIorPutI) {
6792 /* If any evidence of FP or Vector activity, CSE, as that
6793 tends to mop up all manner of lardy code to do with
6794 rounding modes. Don't bother if hasGetIorPutI since that
6795 case leads into the expensive transformations, which do
6796 CSE anyway. */
6797 (void)do_cse_BB( bb, False/*!allowLoadsToBeCSEd*/ );
6798 do_deadcode_BB( bb );
6801 if (hasGetIorPutI) {
6802 Bool cses;
6803 n_expensive++;
6804 if (DEBUG_IROPT)
6805 vex_printf("***** EXPENSIVE %d %d\n", n_total, n_expensive);
6806 bb = expensive_transformations( bb, pxControl );
6807 bb = cheap_transformations( bb, specHelper,
6808 preciseMemExnsFn, pxControl );
6809 /* Potentially common up GetIs */
6810 cses = do_cse_BB( bb, False/*!allowLoadsToBeCSEd*/ );
6811 if (cses)
6812 bb = cheap_transformations( bb, specHelper,
6813 preciseMemExnsFn, pxControl );
6816 ///////////////////////////////////////////////////////////
6817 // BEGIN MSVC optimised code transformation hacks
6818 if (0)
6819 bb = do_MSVC_HACKS(bb);
6820 // END MSVC optimised code transformation hacks
6821 ///////////////////////////////////////////////////////////
6823 /* Now have a go at unrolling simple (single-BB) loops. If
6824 successful, clean up the results as much as possible. */
6826 IRSB* bb2 = maybe_loop_unroll_BB( bb, guest_addr );
6827 if (bb2) {
6828 bb = cheap_transformations( bb2, specHelper,
6829 preciseMemExnsFn, pxControl );
6830 if (hasGetIorPutI) {
6831 bb = expensive_transformations( bb, pxControl );
6832 bb = cheap_transformations( bb, specHelper,
6833 preciseMemExnsFn, pxControl );
6834 } else {
6835 /* at least do CSE and dead code removal */
6836 do_cse_BB( bb, False/*!allowLoadsToBeCSEd*/ );
6837 do_deadcode_BB( bb );
6839 if (0) vex_printf("vex iropt: unrolled a loop\n");
6844 return bb;
6847 IRSB* do_minimal_initial_iropt_BB(IRSB* bb0) {
6848 /* First flatten the block out, since all other phases assume flat code. */
6849 IRSB* bb = flatten_BB ( bb0 );
6851 if (iropt_verbose) {
6852 vex_printf("\n========= FLAT\n\n" );
6853 ppIRSB(bb);
6856 // Remove redundant GETs
6857 redundant_get_removal_BB ( bb );
6859 // Do minimal constant prop: copy prop and constant prop only. No folding.
6860 // JRS FIXME 2019Nov25: this is too weak to be effective on arm32. For that,
6861 // specifying doFolding=True makes a huge difference.
6862 bb = cprop_BB_WRK ( bb, /*mustRetainNoOps=*/True,
6863 /*doFolding=*/False );
6865 // Minor tidying of the block end, to remove a redundant Put of the IP right
6866 // at the end:
6868 ------ IMark(0x401FEC9, 2, 0) ------
6869 t18 = GET:I64(168)
6870 t19 = amd64g_calculate_condition[mcx=0x13]{0x58155130}(0x4:I64,0x5:I64,..
6871 t14 = 64to1(t19)
6872 if (t14) { PUT(184) = 0x401FED6:I64; exit-Boring }
6873 PUT(184) = 0x401FECB:I64 <--------------------------------
6874 PUT(184) = 0x401FECB:I64; exit-Boring
6876 if (bb->stmts_used > 0) {
6877 const IRStmt* last = bb->stmts[bb->stmts_used - 1];
6878 if (last->tag == Ist_Put && last->Ist.Put.offset == bb->offsIP
6879 && eqIRAtom(last->Ist.Put.data, bb->next)) {
6880 bb->stmts_used--;
6884 return bb;
6887 /* Copy the contents of |src| to the end of |dst|. This entails fixing up the
6888 tmp numbers in |src| accordingly. The final destination of |dst| is thrown
6889 away and replaced by the final destination of |src|. This function doesn't
6890 make any assessment of whether it's meaningful or valid to concatenate the
6891 two IRSBs; it just *does* the concatenation. */
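
/* Example of the tmp renumbering (hypothetical counts): if |dst| already
   has 5 tmps (t0..t4) and the statements of |src| refer to t0..t2, then
   delta is 5 and the copied statements refer to t5..t7 instead, so they
   cannot collide with |dst|'s existing tmps. */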
6892 void concatenate_irsbs ( IRSB* dst, IRSB* src )
6894    // FIXME this is almost identical to code at the end of maybe_loop_unroll_BB.
6895 // Maybe try to common it up.
6896 Int delta = dst->tyenv->types_used;
6897 for (Int i = 0; i < src->tyenv->types_used; i++) {
6898 (void)newIRTemp(dst->tyenv, src->tyenv->types[i]);
6901 for (Int i = 0; i < src->stmts_used; i++) {
6902 IRStmt* s = deepCopyIRStmt(src->stmts[i]);
6903 deltaIRStmt(s, delta);
6904 addStmtToIRSB(dst, s);
6906 deltaIRExpr(src->next, delta);
6908 dst->next = src->next;
6909 dst->jumpkind = src->jumpkind;
6910 vassert(dst->offsIP == src->offsIP);
6913 /*---------------------------------------------------------------*/
6914 /*--- end ir_opt.c ---*/
6915 /*---------------------------------------------------------------*/