/*--------------------------------------------------------------------*/
/*--- Instrument IR to perform memory checking operations.         ---*/
/*---                                               mc_translate.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of MemCheck, a heavyweight Valgrind tool for
   detecting memory errors.

   Copyright (C) 2000-2017 Julian Seward
      jseward@acm.org

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307, USA.

   The GNU General Public License is contained in the file COPYING.
*/
#include "pub_tool_basics.h"
#include "pub_tool_poolalloc.h"     // For mc_include.h
#include "pub_tool_hashtable.h"     // For mc_include.h
#include "pub_tool_libcassert.h"
#include "pub_tool_libcprint.h"
#include "pub_tool_tooliface.h"
#include "pub_tool_machine.h"       // VG_(fnptr_to_fnentry)
#include "pub_tool_xarray.h"
#include "pub_tool_mallocfree.h"
#include "pub_tool_libcbase.h"

#include "mc_include.h"
/* FIXMEs JRS 2011-June-16.

   Check the interpretation for vector narrowing and widening ops,
   particularly the saturating ones.  I suspect they are either overly
   pessimistic and/or wrong.

   Iop_QandSQsh64x2 and friends (vector-by-vector bidirectional
   saturating shifts): the interpretation is overly pessimistic.
   See comments on the relevant cases below for details.

   Iop_Sh64Sx2 and friends (vector-by-vector bidirectional shifts,
   both rounding and non-rounding variants): ditto
*/
/* This file implements the Memcheck instrumentation, and in
   particular contains the core of its undefined value detection
   machinery.  For a comprehensive background of the terminology,
   algorithms and rationale used herein, read:

     Using Valgrind to detect undefined value errors with
     bit-precision

     Julian Seward and Nicholas Nethercote

     2005 USENIX Annual Technical Conference (General Track),
     Anaheim, CA, USA, April 10-15, 2005.

   ----

   Here is as good a place as any to record exactly when V bits are and
   should be checked, why, and what function is responsible.

   Memcheck complains when an undefined value is used:

   1. In the condition of a conditional branch.  Because it could cause
      incorrect control flow, and thus cause incorrect externally-visible
      behaviour.  [mc_translate.c:complainIfUndefined]

   2. As an argument to a system call, or as the value that specifies
      the system call number.  Because it could cause an incorrect
      externally-visible side effect.  [mc_translate.c:mc_pre_reg_read]

   3. As the address in a load or store.  Because it could cause an
      incorrect value to be used later, which could cause externally-visible
      behaviour (eg. via incorrect control flow or an incorrect system call
      argument)  [complainIfUndefined]

   4. As the target address of a branch.  Because it could cause incorrect
      control flow.  [complainIfUndefined]

   5. As an argument to setenv, unsetenv, or putenv.  Because it could put
      an incorrect value into the external environment.
      [mc_replace_strmem.c:VG_WRAP_FUNCTION_ZU(*, *env)]

   6. As the index in a GETI or PUTI operation.  I'm not sure why... (njn).
      [complainIfUndefined]

   7. As an argument to the VALGRIND_CHECK_MEM_IS_DEFINED and
      VALGRIND_CHECK_VALUE_IS_DEFINED client requests.  Because the user
      requested it.  [in memcheck.h]

   Memcheck also complains, but should not, when an undefined value is used:

   8. As the shift value in certain SIMD shift operations (but not in the
      standard integer shift operations).  This inconsistency is due to
      historical reasons.  [complainIfUndefined]

   Memcheck does not complain, but should, when an undefined value is used:

   9. As an input to a client request.  Because the client request may
      affect the visible behaviour -- see bug #144362 for an example
      involving the malloc replacements in vg_replace_malloc.c and
      VALGRIND_NON_SIMD_CALL* requests, where an uninitialised argument
      isn't identified.  That bug report also has some info on how to solve
      the problem.  [valgrind.h:VALGRIND_DO_CLIENT_REQUEST]

   In practice, 1 and 2 account for the vast majority of cases.
*/
/* Generation of addr-definedness, addr-validity and
   guard-definedness checks pertaining to loads and stores (Iex_Load,
   Ist_Store, IRLoadG, IRStoreG, LLSC, CAS and Dirty memory
   loads/stores) was re-checked 11 May 2013. */


/*------------------------------------------------------------*/
/*--- Forward decls                                        ---*/
/*------------------------------------------------------------*/
struct _MCEnv;

// See below for comments explaining what this is for.
typedef
   enum __attribute__((packed)) { HuUnU=0, HuPCa=1, HuOth=2 }
   HowUsed;

static IRType  shadowTypeV ( IRType ty );
static IRExpr* expr2vbits ( struct _MCEnv* mce, IRExpr* e,
                            HowUsed hu/*use HuOth if unknown*/ );
static IRTemp  findShadowTmpB ( struct _MCEnv* mce, IRTemp orig );

static IRExpr *i128_const_zero(void);
/*------------------------------------------------------------*/
/*--- Memcheck running state, and tmp management.          ---*/
/*------------------------------------------------------------*/

/* For a few (maybe 1%) IROps, we have both a cheaper, less exact vbit
   propagation scheme, and a more expensive, more precise vbit propagation
   scheme.  This enum describes, for such an IROp, which scheme to use. */
typedef
   enum {
      // Use the cheaper, less-exact variant.
      DLcheap=4,
      // Choose between cheap and expensive based on analysis of the block
      // to be instrumented.  Note that the choice may be done on a
      // per-instance basis of the IROp that this DetailLevel describes.
      DLauto,
      // Use the more expensive, more-exact variant.
      DLexpensive
   }
   DetailLevel;
/* A readonly part of the running state.  For IROps that have both a
   less-exact and more-exact interpretation, records which interpretation is
   to be used. */
typedef
   struct {
      // For Add32/64 and Sub32/64, all 3 settings are allowed.  For the
      // DLauto case, a per-instance decision is to be made by inspecting
      // the associated tmp's entry in MCEnv.tmpHowUsed.
      DetailLevel dl_Add32;
      DetailLevel dl_Add64;
      DetailLevel dl_Sub32;
      DetailLevel dl_Sub64;
      // For Cmp{EQ,NE}{64,32,16,8}, only DLcheap and DLexpensive are
      // allowed.
      DetailLevel dl_CmpEQ64_CmpNE64;
      DetailLevel dl_CmpEQ32_CmpNE32;
      DetailLevel dl_CmpEQ16_CmpNE16;
      DetailLevel dl_CmpEQ8_CmpNE8;
   }
   DetailLevelByOp;
static void DetailLevelByOp__set_all ( /*OUT*/DetailLevelByOp* dlbo,
                                       DetailLevel dl )
{
   dlbo->dl_Add32           = dl;
   dlbo->dl_Add64           = dl;
   dlbo->dl_Sub32           = dl;
   dlbo->dl_Sub64           = dl;
   dlbo->dl_CmpEQ64_CmpNE64 = dl;
   dlbo->dl_CmpEQ32_CmpNE32 = dl;
   dlbo->dl_CmpEQ16_CmpNE16 = dl;
   dlbo->dl_CmpEQ8_CmpNE8   = dl;
}

static void DetailLevelByOp__check_sanity ( const DetailLevelByOp* dlbo )
{
   tl_assert(dlbo->dl_Add32 >= DLcheap && dlbo->dl_Add32 <= DLexpensive);
   tl_assert(dlbo->dl_Add64 >= DLcheap && dlbo->dl_Add64 <= DLexpensive);
   tl_assert(dlbo->dl_Sub32 >= DLcheap && dlbo->dl_Sub32 <= DLexpensive);
   tl_assert(dlbo->dl_Sub64 >= DLcheap && dlbo->dl_Sub64 <= DLexpensive);
   tl_assert(dlbo->dl_CmpEQ64_CmpNE64 == DLcheap
             || dlbo->dl_CmpEQ64_CmpNE64 == DLexpensive);
   tl_assert(dlbo->dl_CmpEQ32_CmpNE32 == DLcheap
             || dlbo->dl_CmpEQ32_CmpNE32 == DLexpensive);
   tl_assert(dlbo->dl_CmpEQ16_CmpNE16 == DLcheap
             || dlbo->dl_CmpEQ16_CmpNE16 == DLexpensive);
   tl_assert(dlbo->dl_CmpEQ8_CmpNE8 == DLcheap
             || dlbo->dl_CmpEQ8_CmpNE8 == DLexpensive);
}

static UInt DetailLevelByOp__count ( const DetailLevelByOp* dlbo,
                                     DetailLevel dl )
{
   UInt n = 0;
   n += (dlbo->dl_Add32 == dl ? 1 : 0);
   n += (dlbo->dl_Add64 == dl ? 1 : 0);
   n += (dlbo->dl_Sub32 == dl ? 1 : 0);
   n += (dlbo->dl_Sub64 == dl ? 1 : 0);
   n += (dlbo->dl_CmpEQ64_CmpNE64 == dl ? 1 : 0);
   n += (dlbo->dl_CmpEQ32_CmpNE32 == dl ? 1 : 0);
   n += (dlbo->dl_CmpEQ16_CmpNE16 == dl ? 1 : 0);
   n += (dlbo->dl_CmpEQ8_CmpNE8 == dl ? 1 : 0);
   return n;
}
/* Carries info about a particular tmp.  The tmp's number is not
   recorded, as this is implied by (equal to) its index in the tmpMap
   in MCEnv.  The tmp's type is also not recorded, as this is present
   in MCEnv.sb->tyenv.

   When .kind is Orig, .shadowV and .shadowB may give the identities
   of the temps currently holding the associated definedness (shadowV)
   and origin (shadowB) values, or these may be IRTemp_INVALID if code
   to compute such values has not yet been emitted.

   When .kind is VSh or BSh then the tmp holds a V- or B- value,
   and so .shadowV and .shadowB must be IRTemp_INVALID, since it is
   illogical for a shadow tmp itself to be shadowed.
*/
typedef
   enum { Orig=1, VSh=2, BSh=3 }
   TempKind;

typedef
   struct {
      TempKind kind;
      IRTemp   shadowV;
      IRTemp   shadowB;
   }
   TempMapEnt;
/* A |HowUsed| value carries analysis results about how values are used,
   pertaining to whether we need to instrument integer adds expensively or
   not.  The running state carries a (readonly) mapping from original tmp to
   a HowUsed value for it.  A usage value can be one of three values,
   forming a 3-point chain lattice.

      HuOth  ("Other") used in some arbitrary way
       |
      HuPCa  ("PCast") used *only* in effectively a PCast, in which all
       |     we care about is the all-defined vs not-all-defined distinction
       |
      HuUnU  ("Unused") not used at all.

   The "safe" (don't-know) end of the lattice is "HuOth".  See comments
   below in |preInstrumentationAnalysis| for further details.
*/
/* DECLARED ABOVE:
   typedef
      enum __attribute__((packed)) { HuUnU=0, HuPCa=1, HuOth=2 }
      HowUsed;
*/

// Not actually necessary, but we don't want to waste D1 space.
STATIC_ASSERT(sizeof(HowUsed) == 1);
/* Carries around state during memcheck instrumentation. */
typedef
   struct _MCEnv {
      /* MODIFIED: the superblock being constructed.  IRStmts are
         added. */
      IRSB* sb;
      Bool  trace;

      /* MODIFIED: a table [0 .. #temps_in_sb-1] which gives the
         current kind and possibly shadow temps for each temp in the
         IRSB being constructed.  Note that it does not contain the
         type of each tmp.  If you want to know the type, look at the
         relevant entry in sb->tyenv.  It follows that at all times
         during the instrumentation process, the valid indices for
         tmpMap and sb->tyenv are identical, being 0 .. N-1 where N is
         the total number of Orig, V- and B- temps allocated so far.

         The reason for this strange split (types in one place, all
         other info in another) is that we need the types to be
         attached to sb so as to make it possible to do
         "typeOfIRExpr(mce->bb->tyenv, ...)" at various places in the
         instrumentation process. */
      XArray* /* of TempMapEnt */ tmpMap;

      /* READONLY: contains details of which ops should be expensively
         instrumented. */
      DetailLevelByOp dlbo;

      /* READONLY: for each original tmp, how the tmp is used.  This is
         computed by |preInstrumentationAnalysis|.  Valid indices are
         0 .. #temps_in_sb-1 (same as for tmpMap). */
      HowUsed* tmpHowUsed;

      /* READONLY: the guest layout.  This indicates which parts of
         the guest state should be regarded as 'always defined'. */
      const VexGuestLayout* layout;

      /* READONLY: the host word type.  Needed for constructing
         arguments of type 'HWord' to be passed to helper functions.
         Ity_I32 or Ity_I64 only. */
      IRType hWordTy;
   }
   MCEnv;
/* SHADOW TMP MANAGEMENT.  Shadow tmps are allocated lazily (on
   demand), as they are encountered.  This is for two reasons.

   (1) (less important reason): Many original tmps are unused due to
   initial IR optimisation, and we do not want to waste space in
   tables tracking them.

   Shadow IRTemps are therefore allocated on demand.  mce.tmpMap is a
   table indexed [0 .. n_types-1], which gives the current shadow for
   each original tmp, or INVALID_IRTEMP if none is so far assigned.
   It is necessary to support making multiple assignments to a shadow
   -- specifically, after testing a shadow for definedness, it needs
   to be made defined.  But IR's SSA property disallows this.

   (2) (more important reason): Therefore, when a shadow needs to get
   a new value, a new temporary is created, the value is assigned to
   that, and the tmpMap is updated to reflect the new binding.

   A corollary is that if the tmpMap maps a given tmp to
   IRTemp_INVALID and we are hoping to read that shadow tmp, it means
   there's a read-before-write error in the original tmps.  The IR
   sanity checker should catch all such anomalies, however.
*/
/* Create a new IRTemp of type 'ty' and kind 'kind', and add it to
   both the table in mce->sb and to our auxiliary mapping.  Note that
   newTemp may cause mce->tmpMap to resize, hence previous results
   from VG_(indexXA)(mce->tmpMap) are invalidated. */
static IRTemp newTemp ( MCEnv* mce, IRType ty, TempKind kind )
{
   Word       newIx;
   TempMapEnt ent;
   IRTemp     tmp = newIRTemp(mce->sb->tyenv, ty);
   ent.kind    = kind;
   ent.shadowV = IRTemp_INVALID;
   ent.shadowB = IRTemp_INVALID;
   newIx = VG_(addToXA)( mce->tmpMap, &ent );
   tl_assert(newIx == (Word)tmp);
   return tmp;
}
/* Find the tmp currently shadowing the given original tmp.  If none
   so far exists, allocate one.  */
static IRTemp findShadowTmpV ( MCEnv* mce, IRTemp orig )
{
   TempMapEnt* ent;
   /* VG_(indexXA) range-checks 'orig', hence no need to check
      here. */
   ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
   tl_assert(ent->kind == Orig);
   if (ent->shadowV == IRTemp_INVALID) {
      IRTemp tmpV
        = newTemp( mce, shadowTypeV(mce->sb->tyenv->types[orig]), VSh );
      /* newTemp may cause mce->tmpMap to resize, hence previous results
         from VG_(indexXA) are invalid. */
      ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
      tl_assert(ent->kind == Orig);
      tl_assert(ent->shadowV == IRTemp_INVALID);
      ent->shadowV = tmpV;
   }
   return ent->shadowV;
}
/* Allocate a new shadow for the given original tmp.  This means any
   previous shadow is abandoned.  This is needed because it is
   necessary to give a new value to a shadow once it has been tested
   for undefinedness, but unfortunately IR's SSA property disallows
   this.  Instead we must abandon the old shadow, allocate a new one
   and use that instead.

   This is the same as findShadowTmpV, except we don't bother to see
   if a shadow temp already existed -- we simply allocate a new one
   regardless. */
static void newShadowTmpV ( MCEnv* mce, IRTemp orig )
{
   TempMapEnt* ent;
   /* VG_(indexXA) range-checks 'orig', hence no need to check
      here. */
   ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
   tl_assert(ent->kind == Orig);
   if (1) {
      IRTemp tmpV
        = newTemp( mce, shadowTypeV(mce->sb->tyenv->types[orig]), VSh );
      /* newTemp may cause mce->tmpMap to resize, hence previous results
         from VG_(indexXA) are invalid. */
      ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
      tl_assert(ent->kind == Orig);
      ent->shadowV = tmpV;
   }
}
/*------------------------------------------------------------*/
/*--- IRAtoms -- a subset of IRExprs                       ---*/
/*------------------------------------------------------------*/

/* An atom is either an IRExpr_Const or an IRExpr_Tmp, as defined by
   isIRAtom() in libvex_ir.h.  Because this instrumenter expects flat
   input, most of this code deals in atoms.  Usefully, a value atom
   always has a V-value which is also an atom: constants are shadowed
   by constants, and temps are shadowed by the corresponding shadow
   temporary. */

typedef  IRExpr  IRAtom;

/* (used for sanity checks only): is this an atom which looks
   like it's from original code? */
static Bool isOriginalAtom ( MCEnv* mce, IRAtom* a1 )
{
   if (a1->tag == Iex_Const)
      return True;
   if (a1->tag == Iex_RdTmp) {
      TempMapEnt* ent = VG_(indexXA)( mce->tmpMap, a1->Iex.RdTmp.tmp );
      return ent->kind == Orig;
   }
   return False;
}

/* (used for sanity checks only): is this an atom which looks
   like it's from shadow code? */
static Bool isShadowAtom ( MCEnv* mce, IRAtom* a1 )
{
   if (a1->tag == Iex_Const)
      return True;
   if (a1->tag == Iex_RdTmp) {
      TempMapEnt* ent = VG_(indexXA)( mce->tmpMap, a1->Iex.RdTmp.tmp );
      return ent->kind == VSh || ent->kind == BSh;
   }
   return False;
}

/* (used for sanity checks only): check that both args are atoms and
   are identically-kinded. */
static Bool sameKindedAtoms ( IRAtom* a1, IRAtom* a2 )
{
   if (a1->tag == Iex_RdTmp && a2->tag == Iex_RdTmp)
      return True;
   if (a1->tag == Iex_Const && a2->tag == Iex_Const)
      return True;
   return False;
}
/*------------------------------------------------------------*/
/*--- Type management                                      ---*/
/*------------------------------------------------------------*/

/* Shadow state is always accessed using integer types.  This returns
   an integer type with the same size (as per sizeofIRType) as the
   given type.  The only valid shadow types are Bit, I8, I16, I32,
   I64, I128, V128, V256. */

static IRType shadowTypeV ( IRType ty )
{
   switch (ty) {
      case Ity_I1:
      case Ity_I8:
      case Ity_I16:
      case Ity_I32:
      case Ity_I64:
      case Ity_I128: return ty;
      case Ity_F16:  return Ity_I16;
      case Ity_F32:  return Ity_I32;
      case Ity_D32:  return Ity_I32;
      case Ity_F64:  return Ity_I64;
      case Ity_D64:  return Ity_I64;
      case Ity_F128: return Ity_I128;
      case Ity_D128: return Ity_I128;
      case Ity_V128: return Ity_V128;
      case Ity_V256: return Ity_V256;
      default: ppIRType(ty);
               VG_(tool_panic)("memcheck:shadowTypeV");
   }
}

/* Produce a 'defined' value of the given shadow type.  Should only be
   supplied shadow types (Bit/I8/I16/I32/I64/I128/V128/V256). */
static IRExpr* definedOfType ( IRType ty ) {
   switch (ty) {
      case Ity_I1:   return IRExpr_Const(IRConst_U1(False));
      case Ity_I8:   return IRExpr_Const(IRConst_U8(0));
      case Ity_I16:  return IRExpr_Const(IRConst_U16(0));
      case Ity_I32:  return IRExpr_Const(IRConst_U32(0));
      case Ity_I64:  return IRExpr_Const(IRConst_U64(0));
      case Ity_I128: return i128_const_zero();
      case Ity_V128: return IRExpr_Const(IRConst_V128(0x0000));
      case Ity_V256: return IRExpr_Const(IRConst_V256(0x00000000));
      default:       VG_(tool_panic)("memcheck:definedOfType");
   }
}
/*------------------------------------------------------------*/
/*--- Constructing IR fragments                            ---*/
/*------------------------------------------------------------*/

/* add stmt to a bb */
static inline void stmt ( HChar cat, MCEnv* mce, IRStmt* st ) {
   if (mce->trace) {
      VG_(printf)("  %c: ", cat);
      ppIRStmt(st);
      VG_(printf)("\n");
   }
   addStmtToIRSB(mce->sb, st);
}

/* assign value to tmp */
static inline
void assign ( HChar cat, MCEnv* mce, IRTemp tmp, IRExpr* expr ) {
   stmt(cat, mce, IRStmt_WrTmp(tmp,expr));
}

/* build various kinds of expressions */
#define triop(_op, _arg1, _arg2, _arg3) \
                                 IRExpr_Triop((_op),(_arg1),(_arg2),(_arg3))
#define binop(_op, _arg1, _arg2) IRExpr_Binop((_op),(_arg1),(_arg2))
#define unop(_op, _arg)          IRExpr_Unop((_op),(_arg))
#define mkU1(_n)                 IRExpr_Const(IRConst_U1(_n))
#define mkU8(_n)                 IRExpr_Const(IRConst_U8(_n))
#define mkU16(_n)                IRExpr_Const(IRConst_U16(_n))
#define mkU32(_n)                IRExpr_Const(IRConst_U32(_n))
#define mkU64(_n)                IRExpr_Const(IRConst_U64(_n))
#define mkV128(_n)               IRExpr_Const(IRConst_V128(_n))
#define mkexpr(_tmp)             IRExpr_RdTmp((_tmp))

/* Bind the given expression to a new temporary, and return the
   temporary.  This effectively converts an arbitrary expression into
   an atom.

   'ty' is the type of 'e' and hence the type that the new temporary
   needs to be.  But passing it in is redundant, since we can deduce
   the type merely by inspecting 'e'.  So at least use that fact to
   assert that the two types agree. */
static IRAtom* assignNew ( HChar cat, MCEnv* mce, IRType ty, IRExpr* e )
{
   TempKind k;
   IRTemp   t;
   IRType   tyE = typeOfIRExpr(mce->sb->tyenv, e);

   tl_assert(tyE == ty); /* so 'ty' is redundant (!) */
   switch (cat) {
      case 'V': k = VSh;  break;
      case 'B': k = BSh;  break;
      case 'C': k = Orig; break;
                /* happens when we are making up new "orig"
                   expressions, for IRCAS handling */
      default: tl_assert(0);
   }
   t = newTemp(mce, ty, k);
   assign(cat, mce, t, e);
   return mkexpr(t);
}
/*------------------------------------------------------------*/
/*--- Helper functions for 128-bit ops                     ---*/
/*------------------------------------------------------------*/

static IRExpr *i128_const_zero(void)
{
   IRAtom* z64 = IRExpr_Const(IRConst_U64(0));
   return binop(Iop_64HLto128, z64, z64);
}

/* There are no I128-bit loads and/or stores [as generated by any
   current front ends].  So we do not need to worry about that in
   expr2vbits_Load */
/*------------------------------------------------------------*/
/*--- Constructing definedness primitive ops               ---*/
/*------------------------------------------------------------*/

/* --------- Defined-if-either-defined --------- */

static IRAtom* mkDifD8 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
   tl_assert(isShadowAtom(mce,a1));
   tl_assert(isShadowAtom(mce,a2));
   return assignNew('V', mce, Ity_I8, binop(Iop_And8, a1, a2));
}

static IRAtom* mkDifD16 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
   tl_assert(isShadowAtom(mce,a1));
   tl_assert(isShadowAtom(mce,a2));
   return assignNew('V', mce, Ity_I16, binop(Iop_And16, a1, a2));
}

static IRAtom* mkDifD32 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
   tl_assert(isShadowAtom(mce,a1));
   tl_assert(isShadowAtom(mce,a2));
   return assignNew('V', mce, Ity_I32, binop(Iop_And32, a1, a2));
}

static IRAtom* mkDifD64 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
   tl_assert(isShadowAtom(mce,a1));
   tl_assert(isShadowAtom(mce,a2));
   return assignNew('V', mce, Ity_I64, binop(Iop_And64, a1, a2));
}

static IRAtom* mkDifDV128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
   tl_assert(isShadowAtom(mce,a1));
   tl_assert(isShadowAtom(mce,a2));
   return assignNew('V', mce, Ity_V128, binop(Iop_AndV128, a1, a2));
}

static IRAtom* mkDifDV256 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
   tl_assert(isShadowAtom(mce,a1));
   tl_assert(isShadowAtom(mce,a2));
   return assignNew('V', mce, Ity_V256, binop(Iop_AndV256, a1, a2));
}
/* --------- Undefined-if-either-undefined --------- */

static IRAtom* mkUifU8 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
   tl_assert(isShadowAtom(mce,a1));
   tl_assert(isShadowAtom(mce,a2));
   return assignNew('V', mce, Ity_I8, binop(Iop_Or8, a1, a2));
}

static IRAtom* mkUifU16 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
   tl_assert(isShadowAtom(mce,a1));
   tl_assert(isShadowAtom(mce,a2));
   return assignNew('V', mce, Ity_I16, binop(Iop_Or16, a1, a2));
}

static IRAtom* mkUifU32 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
   tl_assert(isShadowAtom(mce,a1));
   tl_assert(isShadowAtom(mce,a2));
   return assignNew('V', mce, Ity_I32, binop(Iop_Or32, a1, a2));
}

static IRAtom* mkUifU64 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
   tl_assert(isShadowAtom(mce,a1));
   tl_assert(isShadowAtom(mce,a2));
   return assignNew('V', mce, Ity_I64, binop(Iop_Or64, a1, a2));
}

static IRAtom* mkUifU128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
   IRAtom *tmp1, *tmp2, *tmp3, *tmp4, *tmp5, *tmp6;
   tl_assert(isShadowAtom(mce,a1));
   tl_assert(isShadowAtom(mce,a2));
   tmp1 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, a1));
   tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, a1));
   tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, a2));
   tmp4 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, a2));
   tmp5 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp1, tmp3));
   tmp6 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp4));

   return assignNew('V', mce, Ity_I128, binop(Iop_64HLto128, tmp6, tmp5));
}

static IRAtom* mkUifUV128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
   tl_assert(isShadowAtom(mce,a1));
   tl_assert(isShadowAtom(mce,a2));
   return assignNew('V', mce, Ity_V128, binop(Iop_OrV128, a1, a2));
}

static IRAtom* mkUifUV256 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
   tl_assert(isShadowAtom(mce,a1));
   tl_assert(isShadowAtom(mce,a2));
   return assignNew('V', mce, Ity_V256, binop(Iop_OrV256, a1, a2));
}

static IRAtom* mkUifU ( MCEnv* mce, IRType vty, IRAtom* a1, IRAtom* a2 ) {
   switch (vty) {
      case Ity_I8:   return mkUifU8(mce, a1, a2);
      case Ity_I16:  return mkUifU16(mce, a1, a2);
      case Ity_I32:  return mkUifU32(mce, a1, a2);
      case Ity_I64:  return mkUifU64(mce, a1, a2);
      case Ity_I128: return mkUifU128(mce, a1, a2);
      case Ity_V128: return mkUifUV128(mce, a1, a2);
      case Ity_V256: return mkUifUV256(mce, a1, a2);
      default:
         VG_(printf)("\n"); ppIRType(vty); VG_(printf)("\n");
         VG_(tool_panic)("memcheck:mkUifU");
   }
}
/* --------- The Left-family of operations. --------- */

static IRAtom* mkLeft8 ( MCEnv* mce, IRAtom* a1 ) {
   tl_assert(isShadowAtom(mce,a1));
   return assignNew('V', mce, Ity_I8, unop(Iop_Left8, a1));
}

static IRAtom* mkLeft16 ( MCEnv* mce, IRAtom* a1 ) {
   tl_assert(isShadowAtom(mce,a1));
   return assignNew('V', mce, Ity_I16, unop(Iop_Left16, a1));
}

static IRAtom* mkLeft32 ( MCEnv* mce, IRAtom* a1 ) {
   tl_assert(isShadowAtom(mce,a1));
   return assignNew('V', mce, Ity_I32, unop(Iop_Left32, a1));
}

static IRAtom* mkLeft64 ( MCEnv* mce, IRAtom* a1 ) {
   tl_assert(isShadowAtom(mce,a1));
   return assignNew('V', mce, Ity_I64, unop(Iop_Left64, a1));
}
/* --------- 'Improvement' functions for AND/OR. --------- */

/* ImproveAND(data, vbits) = data OR vbits.  Defined (0) data 0s give
   defined (0); all other -> undefined (1).
*/
static IRAtom* mkImproveAND8 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
{
   tl_assert(isOriginalAtom(mce, data));
   tl_assert(isShadowAtom(mce, vbits));
   tl_assert(sameKindedAtoms(data, vbits));
   return assignNew('V', mce, Ity_I8, binop(Iop_Or8, data, vbits));
}

static IRAtom* mkImproveAND16 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
{
   tl_assert(isOriginalAtom(mce, data));
   tl_assert(isShadowAtom(mce, vbits));
   tl_assert(sameKindedAtoms(data, vbits));
   return assignNew('V', mce, Ity_I16, binop(Iop_Or16, data, vbits));
}

static IRAtom* mkImproveAND32 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
{
   tl_assert(isOriginalAtom(mce, data));
   tl_assert(isShadowAtom(mce, vbits));
   tl_assert(sameKindedAtoms(data, vbits));
   return assignNew('V', mce, Ity_I32, binop(Iop_Or32, data, vbits));
}

static IRAtom* mkImproveAND64 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
{
   tl_assert(isOriginalAtom(mce, data));
   tl_assert(isShadowAtom(mce, vbits));
   tl_assert(sameKindedAtoms(data, vbits));
   return assignNew('V', mce, Ity_I64, binop(Iop_Or64, data, vbits));
}

static IRAtom* mkImproveANDV128 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
{
   tl_assert(isOriginalAtom(mce, data));
   tl_assert(isShadowAtom(mce, vbits));
   tl_assert(sameKindedAtoms(data, vbits));
   return assignNew('V', mce, Ity_V128, binop(Iop_OrV128, data, vbits));
}

static IRAtom* mkImproveANDV256 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
{
   tl_assert(isOriginalAtom(mce, data));
   tl_assert(isShadowAtom(mce, vbits));
   tl_assert(sameKindedAtoms(data, vbits));
   return assignNew('V', mce, Ity_V256, binop(Iop_OrV256, data, vbits));
}
/* ImproveOR(data, vbits) = ~data OR vbits.  Defined (0) data 1s give
   defined (0); all other -> undefined (1).
*/
static IRAtom* mkImproveOR8 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
{
   tl_assert(isOriginalAtom(mce, data));
   tl_assert(isShadowAtom(mce, vbits));
   tl_assert(sameKindedAtoms(data, vbits));
   return assignNew(
             'V', mce, Ity_I8,
             binop(Iop_Or8,
                   assignNew('V', mce, Ity_I8, unop(Iop_Not8, data)),
                   vbits) );
}

static IRAtom* mkImproveOR16 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
{
   tl_assert(isOriginalAtom(mce, data));
   tl_assert(isShadowAtom(mce, vbits));
   tl_assert(sameKindedAtoms(data, vbits));
   return assignNew(
             'V', mce, Ity_I16,
             binop(Iop_Or16,
                   assignNew('V', mce, Ity_I16, unop(Iop_Not16, data)),
                   vbits) );
}

static IRAtom* mkImproveOR32 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
{
   tl_assert(isOriginalAtom(mce, data));
   tl_assert(isShadowAtom(mce, vbits));
   tl_assert(sameKindedAtoms(data, vbits));
   return assignNew(
             'V', mce, Ity_I32,
             binop(Iop_Or32,
                   assignNew('V', mce, Ity_I32, unop(Iop_Not32, data)),
                   vbits) );
}

static IRAtom* mkImproveOR64 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
{
   tl_assert(isOriginalAtom(mce, data));
   tl_assert(isShadowAtom(mce, vbits));
   tl_assert(sameKindedAtoms(data, vbits));
   return assignNew(
             'V', mce, Ity_I64,
             binop(Iop_Or64,
                   assignNew('V', mce, Ity_I64, unop(Iop_Not64, data)),
                   vbits) );
}

static IRAtom* mkImproveORV128 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
{
   tl_assert(isOriginalAtom(mce, data));
   tl_assert(isShadowAtom(mce, vbits));
   tl_assert(sameKindedAtoms(data, vbits));
   return assignNew(
             'V', mce, Ity_V128,
             binop(Iop_OrV128,
                   assignNew('V', mce, Ity_V128, unop(Iop_NotV128, data)),
                   vbits) );
}

static IRAtom* mkImproveORV256 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
{
   tl_assert(isOriginalAtom(mce, data));
   tl_assert(isShadowAtom(mce, vbits));
   tl_assert(sameKindedAtoms(data, vbits));
   return assignNew(
             'V', mce, Ity_V256,
             binop(Iop_OrV256,
                   assignNew('V', mce, Ity_V256, unop(Iop_NotV256, data)),
                   vbits) );
}
/* --------- Pessimising casts. --------- */

/* The function returns an expression of type DST_TY.  If any of the VBITS
   is undefined (value == 1) the resulting expression has all bits set to
   1.  Otherwise, all bits are 0. */
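/* For example: a pessimising cast I8 -> I8 maps the vbits 0x00 (all
   defined) to 0x00, but maps 0x04 (just one undefined bit) to 0xFF; a
   widening cast such as I32 -> I64 likewise yields either 64 zero bits
   or 64 one bits, never a mixture.  (Illustrative note only.) */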
static IRAtom* mkPCastTo( MCEnv* mce, IRType dst_ty, IRAtom* vbits )
{
   IRType  src_ty;
   IRAtom* tmp1;

   /* Note, dst_ty is a shadow type, not an original type. */
   tl_assert(isShadowAtom(mce,vbits));
   src_ty = typeOfIRExpr(mce->sb->tyenv, vbits);

   /* Fast-track some common cases */
   if (src_ty == Ity_I32 && dst_ty == Ity_I32)
      return assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));

   if (src_ty == Ity_I64 && dst_ty == Ity_I64)
      return assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, vbits));

   if (src_ty == Ity_I32 && dst_ty == Ity_I64) {
      /* PCast the arg, then clone it. */
      IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
      return assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
   }

   if (src_ty == Ity_I32 && dst_ty == Ity_V128) {
      /* PCast the arg, then clone it 4 times. */
      IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
      tmp = assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
      return assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp, tmp));
   }

   if (src_ty == Ity_I32 && dst_ty == Ity_V256) {
      /* PCast the arg, then clone it 8 times. */
      IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
      tmp = assignNew('V', mce, Ity_I64,  binop(Iop_32HLto64, tmp, tmp));
      tmp = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp, tmp));
      return assignNew('V', mce, Ity_V256, binop(Iop_V128HLtoV256, tmp, tmp));
   }

   if (src_ty == Ity_I64 && dst_ty == Ity_I32) {
      /* PCast the arg.  This gives all 0s or all 1s.  Then throw away
         the top half. */
      IRAtom* tmp = assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, vbits));
      return assignNew('V', mce, Ity_I32, unop(Iop_64to32, tmp));
   }

   if (src_ty == Ity_V128 && dst_ty == Ity_I64) {
      /* Use InterleaveHI64x2 to copy the top half of the vector into
         the bottom half.  Then we can UifU it with the original, throw
         away the upper half of the result, and PCast-I64-to-I64
         the lower half. */
      // Generates vbits[127:64] : vbits[127:64]
      IRAtom* hi64hi64
         = assignNew('V', mce, Ity_V128,
                     binop(Iop_InterleaveHI64x2, vbits, vbits));
      // Generates
      //   UifU(vbits[127:64],vbits[127:64]) : UifU(vbits[127:64],vbits[63:0])
      //   == vbits[127:64] : UifU(vbits[127:64],vbits[63:0])
      IRAtom* lohi64
         = mkUifUV128(mce, hi64hi64, vbits);
      // Generates UifU(vbits[127:64],vbits[63:0])
      IRAtom* lo64
         = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, lohi64));
      // Generates
      //   PCast-to-I64( UifU(vbits[127:64], vbits[63:0] )
      //   == PCast-to-I64( vbits[127:0] )
      IRAtom* res
         = assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, lo64));
      return res;
   }

   /* Else do it the slow way .. */
   /* First of all, collapse vbits down to a single bit. */
   tmp1 = NULL;
   switch (src_ty) {
      case Ity_I1:
         tmp1 = vbits;
         break;
      case Ity_I8:
         tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ8, vbits));
         break;
      case Ity_I16:
         tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ16, vbits));
         break;
      case Ity_I32:
         tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ32, vbits));
         break;
      case Ity_I64:
         tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ64, vbits));
         break;
      case Ity_I128: {
         /* Gah.  Chop it in half, OR the halves together, and compare
            that with zero. */
         IRAtom* tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, vbits));
         IRAtom* tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, vbits));
         IRAtom* tmp4 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp3));
         tmp1         = assignNew('V', mce, Ity_I1,
                                       unop(Iop_CmpNEZ64, tmp4));
         break;
      }
      case Ity_V128: {
         /* Chop it in half, OR the halves together, and compare that
          * with zero.
          */
         IRAtom* tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_V128HIto64, vbits));
         IRAtom* tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vbits));
         IRAtom* tmp4 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp3));
         tmp1         = assignNew('V', mce, Ity_I1,
                                       unop(Iop_CmpNEZ64, tmp4));
         break;
      }
      default:
         ppIRType(src_ty);
         VG_(tool_panic)("mkPCastTo(1)");
   }
   tl_assert(tmp1);
   /* Now widen up to the dst type. */
   switch (dst_ty) {
      case Ity_I1:
         return tmp1;
      case Ity_I8:
         return assignNew('V', mce, Ity_I8, unop(Iop_1Sto8, tmp1));
      case Ity_I16:
         return assignNew('V', mce, Ity_I16, unop(Iop_1Sto16, tmp1));
      case Ity_I32:
         return assignNew('V', mce, Ity_I32, unop(Iop_1Sto32, tmp1));
      case Ity_I64:
         return assignNew('V', mce, Ity_I64, unop(Iop_1Sto64, tmp1));
      case Ity_V128:
         tmp1 = assignNew('V', mce, Ity_I64,  unop(Iop_1Sto64, tmp1));
         tmp1 = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp1, tmp1));
         return tmp1;
      case Ity_I128:
         tmp1 = assignNew('V', mce, Ity_I64,  unop(Iop_1Sto64, tmp1));
         tmp1 = assignNew('V', mce, Ity_I128, binop(Iop_64HLto128, tmp1, tmp1));
         return tmp1;
      case Ity_V256:
         tmp1 = assignNew('V', mce, Ity_I64,  unop(Iop_1Sto64, tmp1));
         tmp1 = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128,
                                                    tmp1, tmp1));
         tmp1 = assignNew('V', mce, Ity_V256, binop(Iop_V128HLtoV256,
                                                    tmp1, tmp1));
         return tmp1;
      default:
         ppIRType(dst_ty);
         VG_(tool_panic)("mkPCastTo(2)");
   }
}
/* This is a minor variant.  It takes an arg of some type and returns
   a value of the same type.  The result consists entirely of Defined
   (zero) bits except its least significant bit, which is a PCast of
   the entire argument down to a single bit. */
static IRAtom* mkPCastXXtoXXlsb ( MCEnv* mce, IRAtom* varg, IRType ty )
{
   if (ty == Ity_V128) {
      /* --- Case for V128 --- */
      IRAtom* varg128 = varg;
      // generates: PCast-to-I64(varg128)
      IRAtom* pcdTo64 = mkPCastTo(mce, Ity_I64, varg128);
      // Now introduce zeros (defined bits) in the top 63 places
      // generates: Def--(63)--Def PCast-to-I1(varg128)
      IRAtom* d63pc
         = assignNew('V', mce, Ity_I64, binop(Iop_And64, pcdTo64, mkU64(1)));
      // generates: Def--(64)--Def
      IRAtom* d64
         = definedOfType(Ity_I64);
      // generates: Def--(127)--Def PCast-to-I1(varg128)
      IRAtom* res
         = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, d64, d63pc));
      return res;
   }
   if (ty == Ity_I64) {
      /* --- Case for I64 --- */
      // PCast to 64
      IRAtom* pcd = mkPCastTo(mce, Ity_I64, varg);
      // Zero (Def) out the top 63 bits
      IRAtom* res
         = assignNew('V', mce, Ity_I64, binop(Iop_And64, pcd, mkU64(1)));
      return res;
   }
   /*NOTREACHED*/
   tl_assert(0);
}
/* --------- Optimistic casts. --------- */

/* The function takes and returns an expression of type TY. If any of the
   VBITS indicate defined (value == 0) the resulting expression has all bits
   set to 0. Otherwise, all bits are 1.  In words, if any bits are defined
   then all bits are made to be defined.

   In short we compute (vbits - (vbits >>u 1)) >>s (bitsize(vbits)-1).
*/
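/* Worked example of the identity above, using 8-bit lanes (illustrative
   only; the instrumenter emits the equivalent IR in mkOCastAt below):

      static inline unsigned char ocast8_ref ( unsigned char vbits )
      {
         // (vbits - (vbits >>u 1)) >>s 7
         return (unsigned char)( (signed char)(vbits - (vbits >> 1)) >> 7 );
      }

   ocast8_ref(0xFF) == 0xFF   -- no bit is defined, so the result stays
                                 all-undefined
   ocast8_ref(0xFE) == 0x00   -- bit 0 is defined, so the whole result
                                 becomes defined
*/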
static IRAtom* mkOCastAt( MCEnv* mce, IRType ty, IRAtom* vbits )
{
   IROp opSUB, opSHR, opSAR;
   UInt sh;

   switch (ty) {
      case Ity_I64:
         opSUB = Iop_Sub64; opSHR = Iop_Shr64; opSAR = Iop_Sar64; sh = 63;
         break;
      case Ity_I32:
         opSUB = Iop_Sub32; opSHR = Iop_Shr32; opSAR = Iop_Sar32; sh = 31;
         break;
      case Ity_I16:
         opSUB = Iop_Sub16; opSHR = Iop_Shr16; opSAR = Iop_Sar16; sh = 15;
         break;
      case Ity_I8:
         opSUB = Iop_Sub8; opSHR = Iop_Shr8; opSAR = Iop_Sar8; sh = 7;
         break;
      default:
         ppIRType(ty);
         VG_(tool_panic)("mkOCastTo");
   }

   IRAtom *shr1, *at;
   shr1 = assignNew('V', mce,ty, binop(opSHR, vbits, mkU8(1)));
   at   = assignNew('V', mce,ty, binop(opSUB, vbits, shr1));
   at   = assignNew('V', mce,ty, binop(opSAR, at, mkU8(sh)));
   return at;
}
/* --------- Accurate interpretation of CmpEQ/CmpNE. --------- */
/*
   Normally, we can do CmpEQ/CmpNE by doing UifU on the arguments, and
   PCasting to Ity_U1.  However, sometimes it is necessary to be more
   accurate.  The insight is that the result is defined if two
   corresponding bits can be found, one from each argument, so that
   both bits are defined but are different -- that makes EQ say "No"
   and NE say "Yes".  Hence, we compute an improvement term and DifD
   it onto the "normal" (UifU) result.

   The result is:

   PCastTo<1> (
      -- naive version
      UifU<sz>(vxx, vyy)

      `DifD<sz>`

      -- improvement term
      OCast<sz>(vec)
   )

   where
     vec contains 0 (defined) bits where the corresponding arg bits
     are defined but different, and 1 bits otherwise.

     vec = Or<sz>( vxx,   // 0 iff bit defined
                   vyy,   // 0 iff bit defined
                   Not<sz>(Xor<sz>( xx, yy ))  // 0 iff bits different
                 )

     If any bit of vec is 0, the result is defined and so the
     improvement term should produce 0...0, else it should produce
     1...1.

     Hence require for the improvement term:

        OCast(vec) = if vec == 1...1 then 1...1 else 0...0

     which you can think of as an "optimistic cast" (OCast), the opposite
     of the normal "pessimistic cast" (PCast) family.  An OCast says all
     bits are defined if any bit is defined.

     It is possible to show that

        if vec == 1...1 then 1...1 else 0...0

     can be implemented in straight-line code as

        (vec - (vec >>u 1)) >>s (word-size-in-bits - 1)

   We note that vec contains the sub-term Or<sz>(vxx, vyy).  Since UifU is
   implemented with Or (since 1 signifies undefinedness), this is a
   duplicate of the UifU<sz>(vxx, vyy) term and so we can CSE it out, giving
   a final version of:

   let naive = UifU<sz>(vxx, vyy)
       vec   = Or<sz>(naive, Not<sz>(Xor<sz>(xx, yy)))

   PCastTo<1>( DifD<sz>(naive, OCast<sz>(vec)) )

   This was extensively re-analysed and checked on 6 July 05 and again
   in July 2017.
*/
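/* A small worked example of the scheme above, for sz = 8 (added for
   illustration only).  Let

      xx = 0x0F, vxx = 0x01   -- bit 0 of xx is undefined
      yy = 0xF0, vyy = 0x00   -- yy is fully defined

   Bits 4..7 of xx and yy are defined in both and differ, so the
   CmpEQ/CmpNE result is knowable despite xx's undefined bit 0.  The
   naive term alone would still flag it, but the improvement term
   rescues it:

      naive    = UifU<8>(vxx, vyy)                  = 0x01
      vec      = Or<8>(naive, Not<8>(0x0F ^ 0xF0))  = Or<8>(0x01, 0x00) = 0x01
      OCast<8>(vec)                                 = 0x00   (vec != 0xFF)
      improved = DifD<8>(naive, 0x00)               = 0x00
      PCastTo<1>(improved)                          = 0      -- i.e. defined
*/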
static IRAtom* expensiveCmpEQorNE ( MCEnv*  mce,
                                    IRType  ty,
                                    IRAtom* vxx, IRAtom* vyy,
                                    IRAtom* xx,  IRAtom* yy )
{
   IRAtom *naive, *vec, *improved, *final_cast;
   IROp   opDIFD, opUIFU, opOR, opXOR, opNOT;

   tl_assert(isShadowAtom(mce,vxx));
   tl_assert(isShadowAtom(mce,vyy));
   tl_assert(isOriginalAtom(mce,xx));
   tl_assert(isOriginalAtom(mce,yy));
   tl_assert(sameKindedAtoms(vxx,xx));
   tl_assert(sameKindedAtoms(vyy,yy));

   switch (ty) {
      case Ity_I8:
         opDIFD = Iop_And8;
         opUIFU = Iop_Or8;
         opOR   = Iop_Or8;
         opXOR  = Iop_Xor8;
         opNOT  = Iop_Not8;
         break;
      case Ity_I16:
         opDIFD = Iop_And16;
         opUIFU = Iop_Or16;
         opOR   = Iop_Or16;
         opXOR  = Iop_Xor16;
         opNOT  = Iop_Not16;
         break;
      case Ity_I32:
         opDIFD = Iop_And32;
         opUIFU = Iop_Or32;
         opOR   = Iop_Or32;
         opXOR  = Iop_Xor32;
         opNOT  = Iop_Not32;
         break;
      case Ity_I64:
         opDIFD = Iop_And64;
         opUIFU = Iop_Or64;
         opOR   = Iop_Or64;
         opXOR  = Iop_Xor64;
         opNOT  = Iop_Not64;
         break;
      default:
         VG_(tool_panic)("expensiveCmpEQorNE");
   }

   naive
      = assignNew('V', mce, ty, binop(opUIFU, vxx, vyy));

   vec
      = assignNew(
           'V', mce,ty,
           binop( opOR,
                  naive,
                  assignNew(
                     'V', mce,ty,
                     unop(opNOT,
                          assignNew('V', mce,ty, binop(opXOR, xx, yy))))));

   improved
      = assignNew( 'V', mce,ty,
                   binop(opDIFD, naive, mkOCastAt(mce, ty, vec)));

   final_cast
      = mkPCastTo( mce, Ity_I1, improved );

   return final_cast;
}
/* --------- Semi-accurate interpretation of CmpORD. --------- */

/* CmpORD32{S,U} does PowerPC-style 3-way comparisons:

      CmpORD32S(x,y) = 1<<3   if  x <s y
                     = 1<<2   if  x >s y
                     = 1<<1   if  x == y

   and similarly the unsigned variant.  The default interpretation is:

      CmpORD32{S,U}#(x,y,x#,y#) = PCast(x# `UifU` y#)
                                  & (7<<1)

   The "& (7<<1)" reflects the fact that all result bits except 3,2,1
   are zero and therefore defined (viz, zero).

   Also deal with a special case better:

      CmpORD32S(x,0)

   Here, bit 3 (LT) of the result is a copy of the top bit of x and
   will be defined even if the rest of x isn't.  In which case we do:

      CmpORD32S#(x,x#,0,{impliedly 0}#)
         = PCast(x#) & (3<<1)      -- standard interp for GT#,EQ#
           | (x# >>u 31) << 3      -- LT# = x#[31]

   Analogous handling for CmpORD64{S,U}.
*/
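/* Illustration of the special case (added for clarity): suppose
   x# = 0x7FFFFFFF, i.e. only the sign bit of x is defined.  Then

      PCast(x#) & (3<<1)  = 0xFFFFFFFF & 0x6 = 0x6   -- GT#,EQ# undefined
      (x# >>u 31) << 3    = 0 << 3           = 0x0   -- LT# defined

   so CmpORD32S(x,0) gets a defined LT bit, which is exactly what the
   underlying comparison can actually deliver. */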
static Bool isZeroU32 ( IRAtom* e )
{
   return
      toBool( e->tag == Iex_Const
              && e->Iex.Const.con->tag == Ico_U32
              && e->Iex.Const.con->Ico.U32 == 0 );
}

static Bool isZeroU64 ( IRAtom* e )
{
   return
      toBool( e->tag == Iex_Const
              && e->Iex.Const.con->tag == Ico_U64
              && e->Iex.Const.con->Ico.U64 == 0 );
}

static IRAtom* doCmpORD ( MCEnv*  mce,
                          IROp    cmp_op,
                          IRAtom* xxhash, IRAtom* yyhash,
                          IRAtom* xx,     IRAtom* yy )
{
   Bool   m64    = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD64U;
   Bool   syned  = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD32S;
   IROp   opOR   = m64 ? Iop_Or64   : Iop_Or32;
   IROp   opAND  = m64 ? Iop_And64  : Iop_And32;
   IROp   opSHL  = m64 ? Iop_Shl64  : Iop_Shl32;
   IROp   opSHR  = m64 ? Iop_Shr64  : Iop_Shr32;
   IRType ty     = m64 ? Ity_I64    : Ity_I32;
   Int    width  = m64 ? 64         : 32;

   Bool (*isZero)(IRAtom*) = m64 ? isZeroU64 : isZeroU32;

   IRAtom* threeLeft1 = NULL;
   IRAtom* sevenLeft1 = NULL;

   tl_assert(isShadowAtom(mce,xxhash));
   tl_assert(isShadowAtom(mce,yyhash));
   tl_assert(isOriginalAtom(mce,xx));
   tl_assert(isOriginalAtom(mce,yy));
   tl_assert(sameKindedAtoms(xxhash,xx));
   tl_assert(sameKindedAtoms(yyhash,yy));
   tl_assert(cmp_op == Iop_CmpORD32S || cmp_op == Iop_CmpORD32U
             || cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD64U);

   if (0) {
      ppIROp(cmp_op); VG_(printf)(" ");
      ppIRExpr(xx); VG_(printf)(" "); ppIRExpr( yy ); VG_(printf)("\n");
   }

   if (syned && isZero(yy)) {
      /* fancy interpretation */
      /* if yy is zero, then it must be fully defined (zero#). */
      tl_assert(isZero(yyhash));
      threeLeft1 = m64 ? mkU64(3<<1) : mkU32(3<<1);
      return
         binop(
            opOR,
            assignNew(
               'V', mce,ty,
               binop(
                  opAND,
                  mkPCastTo(mce,ty, xxhash),
                  threeLeft1
               )),
            assignNew(
               'V', mce,ty,
               binop(
                  opSHL,
                  assignNew(
                     'V', mce,ty,
                     binop(opSHR, xxhash, mkU8(width-1))),
                  mkU8(3)
               ))
         );
   } else {
      /* standard interpretation */
      sevenLeft1 = m64 ? mkU64(7<<1) : mkU32(7<<1);
      return
         binop(
            opAND,
            mkPCastTo( mce,ty,
                       mkUifU(mce,ty, xxhash,yyhash)),
            sevenLeft1
         );
   }
}
/*------------------------------------------------------------*/
/*--- Emit a test and complaint if something is undefined. ---*/
/*------------------------------------------------------------*/

static IRAtom* schemeE ( MCEnv* mce, IRExpr* e ); /* fwds */


/* Set the annotations on a dirty helper to indicate that the stack
   pointer and instruction pointers might be read.  This is the
   behaviour of all 'emit-a-complaint' style functions we might
   call. */

static void setHelperAnns ( MCEnv* mce, IRDirty* di ) {
   di->nFxState = 2;
   di->fxState[0].fx        = Ifx_Read;
   di->fxState[0].offset    = mce->layout->offset_SP;
   di->fxState[0].size      = mce->layout->sizeof_SP;
   di->fxState[0].nRepeats  = 0;
   di->fxState[0].repeatLen = 0;
   di->fxState[1].fx        = Ifx_Read;
   di->fxState[1].offset    = mce->layout->offset_IP;
   di->fxState[1].size      = mce->layout->sizeof_IP;
   di->fxState[1].nRepeats  = 0;
   di->fxState[1].repeatLen = 0;
}
/* Check the supplied *original* |atom| for undefinedness, and emit a
   complaint if so.  Once that happens, mark it as defined.  This is
   possible because the atom is either a tmp or literal.  If it's a
   tmp, it will be shadowed by a tmp, and so we can set the shadow to
   be defined.  In fact as mentioned above, we will have to allocate a
   new tmp to carry the new 'defined' shadow value, and update the
   original->tmp mapping accordingly; we cannot simply assign a new
   value to an existing shadow tmp as this breaks SSAness.

   The checks are performed, any resulting complaint emitted, and
   |atom|'s shadow temp set to 'defined', ONLY in the case that
   |guard| evaluates to True at run-time.  If it evaluates to False
   then no action is performed.  If |guard| is NULL (the usual case)
   then it is assumed to be always-true, and hence these actions are
   performed unconditionally.

   This routine does not generate code to check the definedness of
   |guard|.  The caller is assumed to have taken care of that already.
*/
static void complainIfUndefined ( MCEnv* mce, IRAtom* atom, IRExpr *guard )
{
   IRAtom*  vatom;
   IRType   ty;
   Int      sz;
   IRDirty* di;
   IRAtom*  cond;
   IRAtom*  origin;
   void*    fn;
   const HChar* nm;
   IRExpr** args;
   Int      nargs;

   // Don't do V bit tests if we're not reporting undefined value errors.
   if (MC_(clo_mc_level) == 1)
      return;

   if (guard)
      tl_assert(isOriginalAtom(mce, guard));

   /* Since the original expression is atomic, there's no duplicated
      work generated by making multiple V-expressions for it.  So we
      don't really care about the possibility that someone else may
      also create a V-interpretion for it. */
   tl_assert(isOriginalAtom(mce, atom));
   vatom = expr2vbits( mce, atom, HuOth );
   tl_assert(isShadowAtom(mce, vatom));
   tl_assert(sameKindedAtoms(atom, vatom));

   ty = typeOfIRExpr(mce->sb->tyenv, vatom);

   /* sz is only used for constructing the error message */
   sz = ty==Ity_I1 ? 0 : sizeofIRType(ty);

   cond = mkPCastTo( mce, Ity_I1, vatom );
   /* cond will be 0 if all defined, and 1 if any not defined. */

   /* Get the origin info for the value we are about to check.  At
      least, if we are doing origin tracking.  If not, use a dummy
      zero origin. */
   if (MC_(clo_mc_level) == 3) {
      origin = schemeE( mce, atom );
      if (mce->hWordTy == Ity_I64) {
         origin = assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, origin) );
      }
   } else {
      origin = NULL;
   }

   fn    = NULL;
   nm    = NULL;
   args  = NULL;
   nargs = -1;

   switch (sz) {
      case 0:
         if (origin) {
            fn    = &MC_(helperc_value_check0_fail_w_o);
            nm    = "MC_(helperc_value_check0_fail_w_o)";
            args  = mkIRExprVec_1(origin);
            nargs = 1;
         } else {
            fn    = &MC_(helperc_value_check0_fail_no_o);
            nm    = "MC_(helperc_value_check0_fail_no_o)";
            args  = mkIRExprVec_0();
            nargs = 0;
         }
         break;
      case 1:
         if (origin) {
            fn    = &MC_(helperc_value_check1_fail_w_o);
            nm    = "MC_(helperc_value_check1_fail_w_o)";
            args  = mkIRExprVec_1(origin);
            nargs = 1;
         } else {
            fn    = &MC_(helperc_value_check1_fail_no_o);
            nm    = "MC_(helperc_value_check1_fail_no_o)";
            args  = mkIRExprVec_0();
            nargs = 0;
         }
         break;
      case 4:
         if (origin) {
            fn    = &MC_(helperc_value_check4_fail_w_o);
            nm    = "MC_(helperc_value_check4_fail_w_o)";
            args  = mkIRExprVec_1(origin);
            nargs = 1;
         } else {
            fn    = &MC_(helperc_value_check4_fail_no_o);
            nm    = "MC_(helperc_value_check4_fail_no_o)";
            args  = mkIRExprVec_0();
            nargs = 0;
         }
         break;
      case 8:
         if (origin) {
            fn    = &MC_(helperc_value_check8_fail_w_o);
            nm    = "MC_(helperc_value_check8_fail_w_o)";
            args  = mkIRExprVec_1(origin);
            nargs = 1;
         } else {
            fn    = &MC_(helperc_value_check8_fail_no_o);
            nm    = "MC_(helperc_value_check8_fail_no_o)";
            args  = mkIRExprVec_0();
            nargs = 0;
         }
         break;
      case 2:
      case 16:
         if (origin) {
            fn    = &MC_(helperc_value_checkN_fail_w_o);
            nm    = "MC_(helperc_value_checkN_fail_w_o)";
            args  = mkIRExprVec_2( mkIRExpr_HWord( sz ), origin);
            nargs = 2;
         } else {
            fn    = &MC_(helperc_value_checkN_fail_no_o);
            nm    = "MC_(helperc_value_checkN_fail_no_o)";
            args  = mkIRExprVec_1( mkIRExpr_HWord( sz ) );
            nargs = 1;
         }
         break;
      default:
         VG_(tool_panic)("unexpected szB");
   }

   tl_assert(fn);
   tl_assert(nm);
   tl_assert(args);
   tl_assert(nargs >= 0 && nargs <= 2);
   tl_assert( (MC_(clo_mc_level) == 3 && origin != NULL)
              || (MC_(clo_mc_level) == 2 && origin == NULL) );

   di = unsafeIRDirty_0_N( nargs/*regparms*/, nm,
                           VG_(fnptr_to_fnentry)( fn ), args );
   di->guard = cond; // and cond is PCast-to-1(atom#)

   /* If the complaint is to be issued under a guard condition, AND
      that into the guard condition for the helper call. */
   if (guard) {
      IRAtom *g1 = assignNew('V', mce, Ity_I32, unop(Iop_1Uto32, di->guard));
      IRAtom *g2 = assignNew('V', mce, Ity_I32, unop(Iop_1Uto32, guard));
      IRAtom *e  = assignNew('V', mce, Ity_I32, binop(Iop_And32, g1, g2));
      di->guard  = assignNew('V', mce, Ity_I1,  unop(Iop_32to1, e));
   }

   setHelperAnns( mce, di );
   stmt( 'V', mce, IRStmt_Dirty(di));

   /* If |atom| is shadowed by an IRTemp, set the shadow tmp to be
      defined -- but only in the case where the guard evaluates to
      True at run-time.  Do the update by setting the orig->shadow
      mapping for tmp to reflect the fact that this shadow is getting
      a new value. */
   tl_assert(isIRAtom(vatom));
   /* sameKindedAtoms ... */
   if (vatom->tag == Iex_RdTmp) {
      tl_assert(atom->tag == Iex_RdTmp);
      if (guard == NULL) {
         // guard is 'always True', hence update unconditionally
         newShadowTmpV(mce, atom->Iex.RdTmp.tmp);
         assign('V', mce, findShadowTmpV(mce, atom->Iex.RdTmp.tmp),
                          definedOfType(ty));
      } else {
         // update the temp only conditionally.  Do this by copying
         // its old value when the guard is False.
         // The old value ..
         IRTemp old_tmpV = findShadowTmpV(mce, atom->Iex.RdTmp.tmp);
         newShadowTmpV(mce, atom->Iex.RdTmp.tmp);
         IRAtom* new_tmpV
            = assignNew('V', mce, shadowTypeV(ty),
                        IRExpr_ITE(guard, definedOfType(ty),
                                          mkexpr(old_tmpV)));
         assign('V', mce, findShadowTmpV(mce, atom->Iex.RdTmp.tmp), new_tmpV);
      }
   }
}
/*------------------------------------------------------------*/
/*--- Shadowing PUTs/GETs, and indexed variants thereof    ---*/
/*------------------------------------------------------------*/

/* Examine the always-defined sections declared in layout to see if
   the (offset,size) section is within one.  Note, it is an error to
   partially fall into such a region: (offset,size) should either be
   completely in such a region or completely not-in such a region.
*/
static Bool isAlwaysDefd ( MCEnv* mce, Int offset, Int size )
{
   Int minoffD, maxoffD, i;
   Int minoff = offset;
   Int maxoff = minoff + size - 1;
   tl_assert((minoff & ~0xFFFF) == 0);
   tl_assert((maxoff & ~0xFFFF) == 0);

   for (i = 0; i < mce->layout->n_alwaysDefd; i++) {
      minoffD = mce->layout->alwaysDefd[i].offset;
      maxoffD = minoffD + mce->layout->alwaysDefd[i].size - 1;
      tl_assert((minoffD & ~0xFFFF) == 0);
      tl_assert((maxoffD & ~0xFFFF) == 0);

      if (maxoff < minoffD || maxoffD < minoff)
         continue; /* no overlap */
      if (minoff >= minoffD && maxoff <= maxoffD)
         return True; /* completely contained in an always-defd section */

      VG_(tool_panic)("memcheck:isAlwaysDefd:partial overlap");
   }
   return False; /* could not find any containing section */
}
/* Generate into bb suitable actions to shadow this Put.  If the state
   slice is marked 'always defined', do nothing.  Otherwise, write the
   supplied V bits to the shadow state.  We can pass in either an
   original atom or a V-atom, but not both.  In the former case the
   relevant V-bits are then generated from the original.
   We assume here, that the definedness of GUARD has already been checked.
*/
static
void do_shadow_PUT ( MCEnv* mce,  Int offset,
                     IRAtom* atom, IRAtom* vatom, IRExpr *guard )
{
   IRType ty;

   // Don't do shadow PUTs if we're not doing undefined value checking.
   // Their absence lets Vex's optimiser remove all the shadow computation
   // that they depend on, which includes GETs of the shadow registers.
   if (MC_(clo_mc_level) == 1)
      return;

   if (atom) {
      tl_assert(!vatom);
      tl_assert(isOriginalAtom(mce, atom));
      vatom = expr2vbits( mce, atom, HuOth );
   } else {
      tl_assert(vatom);
      tl_assert(isShadowAtom(mce, vatom));
   }

   ty = typeOfIRExpr(mce->sb->tyenv, vatom);
   tl_assert(ty != Ity_I1);
   if (isAlwaysDefd(mce, offset, sizeofIRType(ty))) {
      /* later: no ... */
      /* emit code to emit a complaint if any of the vbits are 1. */
      /* complainIfUndefined(mce, atom); */
   } else {
      /* Do a plain shadow Put. */
      if (guard) {
         /* If the guard expression evaluates to false we simply Put the value
            that is already stored in the guest state slot */
         IRAtom *cond, *iffalse;

         cond    = assignNew('V', mce, Ity_I1, guard);
         iffalse = assignNew('V', mce, ty,
                             IRExpr_Get(offset + mce->layout->total_sizeB, ty));
         vatom   = assignNew('V', mce, ty, IRExpr_ITE(cond, vatom, iffalse));
      }
      stmt( 'V', mce, IRStmt_Put( offset + mce->layout->total_sizeB, vatom ));
   }
}
/* Generate into bb suitable actions to shadow this PUTI (passed in
   in pieces).
*/
static
void do_shadow_PUTI ( MCEnv* mce, IRPutI *puti)
{
   IRAtom*     vatom;
   IRType      ty, tyS;
   Int         arrSize;
   IRRegArray* descr = puti->descr;
   IRAtom*     ix    = puti->ix;
   Int         bias  = puti->bias;
   IRAtom*     atom  = puti->data;

   // Don't do shadow PUTIs if we're not doing undefined value checking.
   // Their absence lets Vex's optimiser remove all the shadow computation
   // that they depend on, which includes GETIs of the shadow registers.
   if (MC_(clo_mc_level) == 1)
      return;

   tl_assert(isOriginalAtom(mce,atom));
   vatom = expr2vbits( mce, atom, HuOth );
   tl_assert(sameKindedAtoms(atom, vatom));
   ty   = descr->elemTy;
   tyS  = shadowTypeV(ty);
   arrSize = descr->nElems * sizeofIRType(ty);
   tl_assert(ty != Ity_I1);
   tl_assert(isOriginalAtom(mce,ix));
   complainIfUndefined(mce, ix, NULL);
   if (isAlwaysDefd(mce, descr->base, arrSize)) {
      /* later: no ... */
      /* emit code to emit a complaint if any of the vbits are 1. */
      /* complainIfUndefined(mce, atom); */
   } else {
      /* Do a cloned version of the Put that refers to the shadow
         area. */
      IRRegArray* new_descr
         = mkIRRegArray( descr->base + mce->layout->total_sizeB,
                         tyS, descr->nElems);
      stmt( 'V', mce, IRStmt_PutI( mkIRPutI(new_descr, ix, bias, vatom) ));
   }
}
1703 /* Return an expression which contains the V bits corresponding to the
1704 given GET (passed in in pieces).
1706 static
1707 IRExpr* shadow_GET ( MCEnv* mce, Int offset, IRType ty )
1709 IRType tyS = shadowTypeV(ty);
1710 tl_assert(ty != Ity_I1);
1711 tl_assert(ty != Ity_I128);
1712 if (isAlwaysDefd(mce, offset, sizeofIRType(ty))) {
1713 /* Always defined, return all zeroes of the relevant type */
1714 return definedOfType(tyS);
1715 } else {
1716 /* return a cloned version of the Get that refers to the shadow
1717 area. */
1718 /* FIXME: this isn't an atom! */
1719 return IRExpr_Get( offset + mce->layout->total_sizeB, tyS );
1724 /* Return an expression which contains the V bits corresponding to the
1725 given GETI (passed in in pieces).
1727 static
1728 IRExpr* shadow_GETI ( MCEnv* mce,
1729 IRRegArray* descr, IRAtom* ix, Int bias )
1731 IRType ty = descr->elemTy;
1732 IRType tyS = shadowTypeV(ty);
1733 Int arrSize = descr->nElems * sizeofIRType(ty);
1734 tl_assert(ty != Ity_I1);
1735 tl_assert(isOriginalAtom(mce,ix));
1736 complainIfUndefined(mce, ix, NULL);
1737 if (isAlwaysDefd(mce, descr->base, arrSize)) {
1738 /* Always defined, return all zeroes of the relevant type */
1739 return definedOfType(tyS);
1740 } else {
1741 /* return a cloned version of the Get that refers to the shadow
1742 area. */
1743 IRRegArray* new_descr
1744 = mkIRRegArray( descr->base + mce->layout->total_sizeB,
1745 tyS, descr->nElems);
1746 return IRExpr_GetI( new_descr, ix, bias );
1751 /*------------------------------------------------------------*/
1752 /*--- Generating approximations for unknown operations, ---*/
1753 /*--- using lazy-propagate semantics ---*/
1754 /*------------------------------------------------------------*/
1756 /* Lazy propagation of undefinedness from two values, resulting in the
1757 specified shadow type.
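A value-level sketch of what the cases below compute, with 1-bits in a
shadow meaning "undefined" (the shadow words are invented for
illustration only):

finalVty == Ity_I32, va1 == 0x00000000, va2 == 0x00000000
-> result 0x00000000  (both args fully defined)
finalVty == Ity_I32, va1 == 0x00000000, va2 == 0x00000100
-> result 0xFFFFFFFF  (one doubtful input bit makes every bit of
the result doubtful)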
1759 static
1760 IRAtom* mkLazy2 ( MCEnv* mce, IRType finalVty, IRAtom* va1, IRAtom* va2 )
1762 IRAtom* at;
1763 IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
1764 IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
1765 tl_assert(isShadowAtom(mce,va1));
1766 tl_assert(isShadowAtom(mce,va2));
1768 /* The general case is inefficient because PCast is an expensive
1769 operation. Here are some special cases which use PCast only
1770 once rather than twice. */
1772 /* I64 x I64 -> I64 */
1773 if (t1 == Ity_I64 && t2 == Ity_I64 && finalVty == Ity_I64) {
1774 if (0) VG_(printf)("mkLazy2: I64 x I64 -> I64\n");
1775 at = mkUifU(mce, Ity_I64, va1, va2);
1776 at = mkPCastTo(mce, Ity_I64, at);
1777 return at;
1780 /* I64 x I64 -> I32 */
1781 if (t1 == Ity_I64 && t2 == Ity_I64 && finalVty == Ity_I32) {
1782 if (0) VG_(printf)("mkLazy2: I64 x I64 -> I32\n");
1783 at = mkUifU(mce, Ity_I64, va1, va2);
1784 at = mkPCastTo(mce, Ity_I32, at);
1785 return at;
1788 /* I32 x I32 -> I32 */
1789 if (t1 == Ity_I32 && t2 == Ity_I32 && finalVty == Ity_I32) {
1790 if (0) VG_(printf)("mkLazy2: I32 x I32 -> I32\n");
1791 at = mkUifU(mce, Ity_I32, va1, va2);
1792 at = mkPCastTo(mce, Ity_I32, at);
1793 return at;
1796 if (0) {
1797 VG_(printf)("mkLazy2 ");
1798 ppIRType(t1);
1799 VG_(printf)("_");
1800 ppIRType(t2);
1801 VG_(printf)("_");
1802 ppIRType(finalVty);
1803 VG_(printf)("\n");
1806 /* General case: force everything via 32-bit intermediaries. */
1807 at = mkPCastTo(mce, Ity_I32, va1);
1808 at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va2));
1809 at = mkPCastTo(mce, finalVty, at);
1810 return at;
1814 /* 3-arg version of the above. */
1815 static
1816 IRAtom* mkLazy3 ( MCEnv* mce, IRType finalVty,
1817 IRAtom* va1, IRAtom* va2, IRAtom* va3 )
1819 IRAtom* at;
1820 IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
1821 IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
1822 IRType t3 = typeOfIRExpr(mce->sb->tyenv, va3);
1823 tl_assert(isShadowAtom(mce,va1));
1824 tl_assert(isShadowAtom(mce,va2));
1825 tl_assert(isShadowAtom(mce,va3));
1827 /* The general case is inefficient because PCast is an expensive
1828 operation. Here are some special cases which use PCast only
1829 twice rather than three times. */
1831 /* I32 x I64 x I64 -> I64 */
1832 /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
1833 if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64
1834 && finalVty == Ity_I64) {
1835 if (0) VG_(printf)("mkLazy3: I32 x I64 x I64 -> I64\n");
1836 /* Widen 1st arg to I64. Since 1st arg is typically a rounding
1837 mode indication which is fully defined, this should get
1838 folded out later. */
1839 at = mkPCastTo(mce, Ity_I64, va1);
1840 /* Now fold in 2nd and 3rd args. */
1841 at = mkUifU(mce, Ity_I64, at, va2);
1842 at = mkUifU(mce, Ity_I64, at, va3);
1843 /* and PCast once again. */
1844 at = mkPCastTo(mce, Ity_I64, at);
1845 return at;
1848 /* I32 x I8 x I64 -> I64 */
1849 if (t1 == Ity_I32 && t2 == Ity_I8 && t3 == Ity_I64
1850 && finalVty == Ity_I64) {
1851 if (0) VG_(printf)("mkLazy3: I32 x I8 x I64 -> I64\n");
1852 /* Widen 1st and 2nd args to I64. Since 1st arg is typically a
1853 * rounding mode indication which is fully defined, this should
1854 * get folded out later.
1856 IRAtom* at1 = mkPCastTo(mce, Ity_I64, va1);
1857 IRAtom* at2 = mkPCastTo(mce, Ity_I64, va2);
1858 at = mkUifU(mce, Ity_I64, at1, at2); // UifU(PCast(va1), PCast(va2))
1859 at = mkUifU(mce, Ity_I64, at, va3);
1860 /* and PCast once again. */
1861 at = mkPCastTo(mce, Ity_I64, at);
1862 return at;
1865 /* I32 x I64 x I64 -> I32 */
1866 if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64
1867 && finalVty == Ity_I32) {
1868 if (0) VG_(printf)("mkLazy3: I32 x I64 x I64 -> I32\n");
1869 at = mkPCastTo(mce, Ity_I64, va1);
1870 at = mkUifU(mce, Ity_I64, at, va2);
1871 at = mkUifU(mce, Ity_I64, at, va3);
1872 at = mkPCastTo(mce, Ity_I32, at);
1873 return at;
1876 /* I32 x I32 x I32 -> I32 */
1877 /* 32-bit FP idiom, as (eg) happens on ARM */
1878 if (t1 == Ity_I32 && t2 == Ity_I32 && t3 == Ity_I32
1879 && finalVty == Ity_I32) {
1880 if (0) VG_(printf)("mkLazy3: I32 x I32 x I32 -> I32\n");
1881 at = va1;
1882 at = mkUifU(mce, Ity_I32, at, va2);
1883 at = mkUifU(mce, Ity_I32, at, va3);
1884 at = mkPCastTo(mce, Ity_I32, at);
1885 return at;
1888 /* I32 x I128 x I128 -> I128 */
1889 /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
1890 if (t1 == Ity_I32 && t2 == Ity_I128 && t3 == Ity_I128
1891 && finalVty == Ity_I128) {
1892 if (0) VG_(printf)("mkLazy3: I32 x I128 x I128 -> I128\n");
1893 /* Widen 1st arg to I128. Since 1st arg is typically a rounding
1894 mode indication which is fully defined, this should get
1895 folded out later. */
1896 at = mkPCastTo(mce, Ity_I128, va1);
1897 /* Now fold in 2nd and 3rd args. */
1898 at = mkUifU(mce, Ity_I128, at, va2);
1899 at = mkUifU(mce, Ity_I128, at, va3);
1900 /* and PCast once again. */
1901 at = mkPCastTo(mce, Ity_I128, at);
1902 return at;
1905 /* I32 x I8 x I128 -> I128 */
1906 /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
1907 if (t1 == Ity_I32 && t2 == Ity_I8 && t3 == Ity_I128
1908 && finalVty == Ity_I128) {
1909 if (0) VG_(printf)("mkLazy3: I32 x I8 x I128 -> I128\n");
1910 /* Use I64 as an intermediate type, which means PCasting all 3
1911 args to I64 to start with. 1st arg is typically a rounding
1912 mode indication which is fully defined, so we hope that it
1913 will get folded out later. */
1914 IRAtom* at1 = mkPCastTo(mce, Ity_I64, va1);
1915 IRAtom* at2 = mkPCastTo(mce, Ity_I64, va2);
1916 IRAtom* at3 = mkPCastTo(mce, Ity_I64, va3);
1917 /* Now UifU all three together. */
1918 at = mkUifU(mce, Ity_I64, at1, at2); // UifU(PCast(va1), PCast(va2))
1919 at = mkUifU(mce, Ity_I64, at, at3); // ... `UifU` PCast(va3)
1920 /* and PCast once again. */
1921 at = mkPCastTo(mce, Ity_I128, at);
1922 return at;
1924 if (1) {
1925 VG_(printf)("mkLazy3: ");
1926 ppIRType(t1);
1927 VG_(printf)(" x ");
1928 ppIRType(t2);
1929 VG_(printf)(" x ");
1930 ppIRType(t3);
1931 VG_(printf)(" -> ");
1932 ppIRType(finalVty);
1933 VG_(printf)("\n");
1936 tl_assert(0);
1937 /* General case: force everything via 32-bit intermediaries. */
1939 at = mkPCastTo(mce, Ity_I32, va1);
1940 at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va2));
1941 at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va3));
1942 at = mkPCastTo(mce, finalVty, at);
1943 return at;
1948 /* 4-arg version of the above. */
1949 static
1950 IRAtom* mkLazy4 ( MCEnv* mce, IRType finalVty,
1951 IRAtom* va1, IRAtom* va2, IRAtom* va3, IRAtom* va4 )
1953 IRAtom* at;
1954 IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
1955 IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
1956 IRType t3 = typeOfIRExpr(mce->sb->tyenv, va3);
1957 IRType t4 = typeOfIRExpr(mce->sb->tyenv, va4);
1958 tl_assert(isShadowAtom(mce,va1));
1959 tl_assert(isShadowAtom(mce,va2));
1960 tl_assert(isShadowAtom(mce,va3));
1961 tl_assert(isShadowAtom(mce,va4));
1963 /* The general case is inefficient because PCast is an expensive
1964 operation. Here are some special cases which use PCast only
1965 once or twice rather than once per argument. */
1967 /* Standard FP idiom: rm x FParg1 x FParg2 x FParg3 -> FPresult */
1969 if (t1 == Ity_I32 && t2 == Ity_I128 && t3 == Ity_I128 && t4 == Ity_I128
1970 && finalVty == Ity_I128) {
1971 if (0) VG_(printf)("mkLazy4: I32 x I128 x I128 x I128 -> I128\n");
1972 /* Widen 1st arg to I128. Since 1st arg is typically a rounding
1973 mode indication which is fully defined, this should get
1974 folded out later. */
1975 at = mkPCastTo(mce, Ity_I128, va1);
1976 /* Now fold in 2nd, 3rd, 4th args. */
1977 at = mkUifU(mce, Ity_I128, at, va2);
1978 at = mkUifU(mce, Ity_I128, at, va3);
1979 at = mkUifU(mce, Ity_I128, at, va4);
1980 /* and PCast once again. */
1981 at = mkPCastTo(mce, Ity_I128, at);
1982 return at;
1985 /* I32 x I64 x I64 x I64 -> I64 */
1986 if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64 && t4 == Ity_I64
1987 && finalVty == Ity_I64) {
1988 if (0) VG_(printf)("mkLazy4: I32 x I64 x I64 x I64 -> I64\n");
1989 /* Widen 1st arg to I64. Since 1st arg is typically a rounding
1990 mode indication which is fully defined, this should get
1991 folded out later. */
1992 at = mkPCastTo(mce, Ity_I64, va1);
1993 /* Now fold in 2nd, 3rd, 4th args. */
1994 at = mkUifU(mce, Ity_I64, at, va2);
1995 at = mkUifU(mce, Ity_I64, at, va3);
1996 at = mkUifU(mce, Ity_I64, at, va4);
1997 /* and PCast once again. */
1998 at = mkPCastTo(mce, Ity_I64, at);
1999 return at;
2001 /* I32 x I32 x I32 x I32 -> I32 */
2002 /* Standard FP idiom: rm x FParg1 x FParg2 x FParg3 -> FPresult */
2003 if (t1 == Ity_I32 && t2 == Ity_I32 && t3 == Ity_I32 && t4 == Ity_I32
2004 && finalVty == Ity_I32) {
2005 if (0) VG_(printf)("mkLazy4: I32 x I32 x I32 x I32 -> I32\n");
2006 at = va1;
2007 /* Now fold in 2nd, 3rd, 4th args. */
2008 at = mkUifU(mce, Ity_I32, at, va2);
2009 at = mkUifU(mce, Ity_I32, at, va3);
2010 at = mkUifU(mce, Ity_I32, at, va4);
2011 at = mkPCastTo(mce, Ity_I32, at);
2012 return at;
2015 if (t1 == Ity_I32 && t2 == Ity_I8 && t3 == Ity_I8 && t4 == Ity_I8
2016 && finalVty == Ity_I32) {
2017 if (0) VG_(printf)("mkLazy4: I32 x I8 x I8 x I8 -> I32\n");
2018 at = mkPCastTo(mce, Ity_I8, va1);
2019 /* Now fold in 2nd, 3rd, 4th args. */
2020 at = mkUifU(mce, Ity_I8, at, va2);
2021 at = mkUifU(mce, Ity_I8, at, va3);
2022 at = mkUifU(mce, Ity_I8, at, va4);
2023 at = mkPCastTo(mce, Ity_I32, at);
2024 return at;
2027 if (t1 == Ity_I64 && t2 == Ity_I8 && t3 == Ity_I8 && t4 == Ity_I8
2028 && finalVty == Ity_I64) {
2029 if (0) VG_(printf)("mkLazy4: I64 x I8 x I8 x I8 -> I64\n");
2030 at = mkPCastTo(mce, Ity_I8, va1);
2031 /* Now fold in 2nd, 3rd, 4th args. */
2032 at = mkUifU(mce, Ity_I8, at, va2);
2033 at = mkUifU(mce, Ity_I8, at, va3);
2034 at = mkUifU(mce, Ity_I8, at, va4);
2035 at = mkPCastTo(mce, Ity_I64, at);
2036 return at;
2039 if (1) {
2040 VG_(printf)("mkLazy4: ");
2041 ppIRType(t1);
2042 VG_(printf)(" x ");
2043 ppIRType(t2);
2044 VG_(printf)(" x ");
2045 ppIRType(t3);
2046 VG_(printf)(" x ");
2047 ppIRType(t4);
2048 VG_(printf)(" -> ");
2049 ppIRType(finalVty);
2050 VG_(printf)("\n");
2053 tl_assert(0);
2057 /* Do the lazy propagation game from a null-terminated vector of
2058 atoms. These are presumably the arguments to a helper call, so the
2059 IRCallee info is also supplied in order that we can know which
2060 arguments should be ignored (via the .mcx_mask field).
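For example (the mask value is invented for illustration): a callee with
mcx_mask == 5 (binary 101) has args 0 and 2 skipped entirely, while the
definedness of every other arg is PCast-ed to the merge type and UifU-ed
into the final result.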
2062 static
2063 IRAtom* mkLazyN ( MCEnv* mce,
2064 IRAtom** exprvec, IRType finalVtype, IRCallee* cee )
2066 Int i;
2067 IRAtom* here;
2068 IRAtom* curr;
2069 IRType mergeTy;
2070 Bool mergeTy64 = True;
2072 /* Decide on the type of the merge intermediary. If all relevant
2073 args are I64, then it's I64. In all other circumstances, use
2074 I32. */
2075 for (i = 0; exprvec[i]; i++) {
2076 tl_assert(i < 32);
2077 tl_assert(isOriginalAtom(mce, exprvec[i]));
2078 if (cee->mcx_mask & (1<<i))
2079 continue;
2080 if (typeOfIRExpr(mce->sb->tyenv, exprvec[i]) != Ity_I64)
2081 mergeTy64 = False;
2084 mergeTy = mergeTy64 ? Ity_I64 : Ity_I32;
2085 curr = definedOfType(mergeTy);
2087 for (i = 0; exprvec[i]; i++) {
2088 tl_assert(i < 32);
2089 tl_assert(isOriginalAtom(mce, exprvec[i]));
2090 /* Only take notice of this arg if the callee's mc-exclusion
2091 mask does not say it is to be excluded. */
2092 if (cee->mcx_mask & (1<<i)) {
2093 /* the arg is to be excluded from definedness checking. Do
2094 nothing. */
2095 if (0) VG_(printf)("excluding %s(%d)\n", cee->name, i);
2096 } else {
2097 /* calculate the arg's definedness, and pessimistically merge
2098 it in. */
2099 here = mkPCastTo( mce, mergeTy, expr2vbits(mce, exprvec[i], HuOth) );
2100 curr = mergeTy64
2101 ? mkUifU64(mce, here, curr)
2102 : mkUifU32(mce, here, curr);
2105 return mkPCastTo(mce, finalVtype, curr );
2109 /*------------------------------------------------------------*/
2110 /*--- Generating expensive sequences for exact carry-chain ---*/
2111 /*--- propagation in add/sub and related operations. ---*/
2112 /*------------------------------------------------------------*/
2114 static
2115 IRAtom* expensiveAddSub ( MCEnv* mce,
2116 Bool add,
2117 IRType ty,
2118 IRAtom* qaa, IRAtom* qbb,
2119 IRAtom* aa, IRAtom* bb )
2121 IRAtom *a_min, *b_min, *a_max, *b_max;
2122 IROp opAND, opOR, opXOR, opNOT, opADD, opSUB;
2124 tl_assert(isShadowAtom(mce,qaa));
2125 tl_assert(isShadowAtom(mce,qbb));
2126 tl_assert(isOriginalAtom(mce,aa));
2127 tl_assert(isOriginalAtom(mce,bb));
2128 tl_assert(sameKindedAtoms(qaa,aa));
2129 tl_assert(sameKindedAtoms(qbb,bb));
2131 switch (ty) {
2132 case Ity_I32:
2133 opAND = Iop_And32;
2134 opOR = Iop_Or32;
2135 opXOR = Iop_Xor32;
2136 opNOT = Iop_Not32;
2137 opADD = Iop_Add32;
2138 opSUB = Iop_Sub32;
2139 break;
2140 case Ity_I64:
2141 opAND = Iop_And64;
2142 opOR = Iop_Or64;
2143 opXOR = Iop_Xor64;
2144 opNOT = Iop_Not64;
2145 opADD = Iop_Add64;
2146 opSUB = Iop_Sub64;
2147 break;
2148 default:
2149 VG_(tool_panic)("expensiveAddSub");
2152 // a_min = aa & ~qaa
2153 a_min = assignNew('V', mce,ty,
2154 binop(opAND, aa,
2155 assignNew('V', mce,ty, unop(opNOT, qaa))));
2157 // b_min = bb & ~qbb
2158 b_min = assignNew('V', mce,ty,
2159 binop(opAND, bb,
2160 assignNew('V', mce,ty, unop(opNOT, qbb))));
2162 // a_max = aa | qaa
2163 a_max = assignNew('V', mce,ty, binop(opOR, aa, qaa));
2165 // b_max = bb | qbb
2166 b_max = assignNew('V', mce,ty, binop(opOR, bb, qbb));
2168 if (add) {
2169 // result = (qaa | qbb) | ((a_min + b_min) ^ (a_max + b_max))
2170 return
2171 assignNew('V', mce,ty,
2172 binop( opOR,
2173 assignNew('V', mce,ty, binop(opOR, qaa, qbb)),
2174 assignNew('V', mce,ty,
2175 binop( opXOR,
2176 assignNew('V', mce,ty, binop(opADD, a_min, b_min)),
2177 assignNew('V', mce,ty, binop(opADD, a_max, b_max))
2182 } else {
2183 // result = (qaa | qbb) | ((a_min - b_max) ^ (a_max - b_min))
2184 return
2185 assignNew('V', mce,ty,
2186 binop( opOR,
2187 assignNew('V', mce,ty, binop(opOR, qaa, qbb)),
2188 assignNew('V', mce,ty,
2189 binop( opXOR,
2190 assignNew('V', mce,ty, binop(opSUB, a_min, b_max)),
2191 assignNew('V', mce,ty, binop(opSUB, a_max, b_min))
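// A worked 4-bit example of the 'add' case (the values are invented for
// illustration; 1-bits in qaa/qbb mean "undefined"):
//
//    aa = 0011, qaa = 0100  (bit 2 unknown, so aa is really 0011 or 0111)
//    bb = 0001, qbb = 0000  (fully defined)
//
//    a_min = 0011, a_max = 0111, b_min = b_max = 0001
//    (a_min + b_min) ^ (a_max + b_max) = 0100 ^ 1000 = 1100
//    result = (qaa | qbb) | 1100 = 1100
//
// That is, bits 2 and 3 of the sum are reported as undefined -- exactly
// the bits in which the two possible true sums (0100 and 1000) differ --
// while bits 0 and 1 remain defined.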
2201 static
2202 IRAtom* expensiveCountTrailingZeroes ( MCEnv* mce, IROp czop,
2203 IRAtom* atom, IRAtom* vatom )
2205 IRType ty;
2206 IROp xorOp, subOp, andOp;
2207 IRExpr *one;
2208 IRAtom *improver, *improved;
2209 tl_assert(isShadowAtom(mce,vatom));
2210 tl_assert(isOriginalAtom(mce,atom));
2211 tl_assert(sameKindedAtoms(atom,vatom));
2213 switch (czop) {
2214 case Iop_Ctz32:
2215 ty = Ity_I32;
2216 xorOp = Iop_Xor32;
2217 subOp = Iop_Sub32;
2218 andOp = Iop_And32;
2219 one = mkU32(1);
2220 break;
2221 case Iop_Ctz64:
2222 ty = Ity_I64;
2223 xorOp = Iop_Xor64;
2224 subOp = Iop_Sub64;
2225 andOp = Iop_And64;
2226 one = mkU64(1);
2227 break;
2228 default:
2229 ppIROp(czop);
2230 VG_(tool_panic)("memcheck:expensiveCountTrailingZeroes");
2233 // improver = atom ^ (atom - 1)
2235 // That is, improver has its low ctz(atom)+1 bits equal to one;
2236 // higher bits (if any) equal to zero.
2237 improver = assignNew('V', mce,ty,
2238 binop(xorOp,
2239 atom,
2240 assignNew('V', mce, ty,
2241 binop(subOp, atom, one))));
2243 // improved = vatom & improver
2245 // That is, treat any V bits above bit position ctz(atom) as
2246 // "defined" -- bits above the lowest set bit cannot change the count.
2247 improved = assignNew('V', mce, ty,
2248 binop(andOp, vatom, improver));
2250 // Return pessimizing cast of improved.
2251 return mkPCastTo(mce, ty, improved);
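// A worked example (the operand values are invented for illustration):
// if atom == 0b1100 then atom - 1 == 0b1011 and improver == 0b0111, so
// doubtful bits at positions 3 and above are masked out of vatom.  If
// the only doubtful bit is bit 3, the two possible true operands 0b0100
// and 0b1100 both have two trailing zeroes, and the result is correctly
// reported as fully defined.  A doubtful bit at position 2 or below
// survives the mask and, via the PCast, makes the whole result
// undefined.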
2255 /*------------------------------------------------------------*/
2256 /*--- Scalar shifts. ---*/
2257 /*------------------------------------------------------------*/
2259 /* Produce an interpretation for (aa << bb) (or >>s, >>u). The basic
2260 idea is to shift the definedness bits by the original shift amount.
2261 This introduces 0s ("defined") in new positions for left shifts and
2262 unsigned right shifts, and copies the top definedness bit for
2263 signed right shifts. So, conveniently, applying the original shift
2264 operator to the definedness bits for the left arg is exactly the
2265 right thing to do:
2267 (qaa << bb)
2269 However if the shift amount is undefined then the whole result
2270 is undefined. Hence need:
2272 (qaa << bb) `UifU` PCast(qbb)
2274 If the shift amount bb is a literal, then qbb will say 'all defined'
2275 and the UifU and PCast will get folded out by post-instrumentation
2276 optimisation.
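A small worked example (8-bit values, invented for illustration; 1-bits
in a shadow mean "undefined"):

aa = 00001011, qaa = 00000010 (bit 1 doubtful), bb = 2 (defined)

qaa << 2      = 00001000
PCast(qbb)    = 00000000
shadow result = 00001000

so only bit 3 of the shifted result is doubtful, which is exactly where
the doubtful source bit lands.  If bb itself had any doubtful bits,
PCast(qbb) would be all ones and the whole result would be reported
undefined.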
2278 static IRAtom* scalarShift ( MCEnv* mce,
2279 IRType ty,
2280 IROp original_op,
2281 IRAtom* qaa, IRAtom* qbb,
2282 IRAtom* aa, IRAtom* bb )
2284 tl_assert(isShadowAtom(mce,qaa));
2285 tl_assert(isShadowAtom(mce,qbb));
2286 tl_assert(isOriginalAtom(mce,aa));
2287 tl_assert(isOriginalAtom(mce,bb));
2288 tl_assert(sameKindedAtoms(qaa,aa));
2289 tl_assert(sameKindedAtoms(qbb,bb));
2290 return
2291 assignNew(
2292 'V', mce, ty,
2293 mkUifU( mce, ty,
2294 assignNew('V', mce, ty, binop(original_op, qaa, bb)),
2295 mkPCastTo(mce, ty, qbb)
2301 /*------------------------------------------------------------*/
2302 /*--- Helpers for dealing with vector primops. ---*/
2303 /*------------------------------------------------------------*/
2305 /* Vector pessimisation -- pessimise within each lane individually. */
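// For example (the shadow values are invented for illustration):
// feeding a V128 shadow whose 32-bit lanes are
// { 0, 0x00000400, 0, 0xFFFFFFFF } through mkPCast32x4 yields
// { 0, 0xFFFFFFFF, 0, 0xFFFFFFFF } -- each lane becomes either "all
// defined" or "all undefined", and lanes never contaminate each other.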
2307 static IRAtom* mkPCast8x16 ( MCEnv* mce, IRAtom* at )
2309 return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ8x16, at));
2312 static IRAtom* mkPCast16x8 ( MCEnv* mce, IRAtom* at )
2314 return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ16x8, at));
2317 static IRAtom* mkPCast32x4 ( MCEnv* mce, IRAtom* at )
2319 return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ32x4, at));
2322 static IRAtom* mkPCast64x2 ( MCEnv* mce, IRAtom* at )
2324 return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ64x2, at));
2327 static IRAtom* mkPCast128x1 ( MCEnv* mce, IRAtom* at )
2329 return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ128x1, at));
2332 static IRAtom* mkPCast64x4 ( MCEnv* mce, IRAtom* at )
2334 return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ64x4, at));
2337 static IRAtom* mkPCast32x8 ( MCEnv* mce, IRAtom* at )
2339 return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ32x8, at));
2342 static IRAtom* mkPCast32x2 ( MCEnv* mce, IRAtom* at )
2344 return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ32x2, at));
2347 static IRAtom* mkPCast16x16 ( MCEnv* mce, IRAtom* at )
2349 return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ16x16, at));
2352 static IRAtom* mkPCast16x4 ( MCEnv* mce, IRAtom* at )
2354 return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ16x4, at));
2357 static IRAtom* mkPCast8x32 ( MCEnv* mce, IRAtom* at )
2359 return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ8x32, at));
2362 static IRAtom* mkPCast8x8 ( MCEnv* mce, IRAtom* at )
2364 return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ8x8, at));
2367 static IRAtom* mkPCast16x2 ( MCEnv* mce, IRAtom* at )
2369 return assignNew('V', mce, Ity_I32, unop(Iop_CmpNEZ16x2, at));
2372 static IRAtom* mkPCast8x4 ( MCEnv* mce, IRAtom* at )
2374 return assignNew('V', mce, Ity_I32, unop(Iop_CmpNEZ8x4, at));
2378 /* Here's a simple scheme capable of handling ops derived from SSE1
2379 code while only generating ops that can be efficiently
2380 implemented in SSE1. */
2382 /* All-lanes versions are straightforward:
2384 binary32Fx4(x,y) ==> PCast32x4(UifUV128(x#,y#))
2386 unary32Fx4(x,y) ==> PCast32x4(x#)
2388 Lowest-lane-only versions are more complex:
2390 binary32F0x4(x,y) ==> SetV128lo32(
2391 x#,
2392 PCast32(V128to32(UifUV128(x#,y#)))
2395 This is perhaps not so obvious. In particular, it's faster to
2396 do a V128-bit UifU and then take the bottom 32 bits than the more
2397 obvious scheme of taking the bottom 32 bits of each operand
2398 and doing a 32-bit UifU, basically because UifU is fast and
2399 chopping lanes off vector values is slow.
2401 Finally:
2403 unary32F0x4(x) ==> SetV128lo32(
2404 x#,
2405 PCast32(V128to32(x#))
2408 Where:
2410 PCast32(v#) = 1Sto32(CmpNE32(v#,0))
2411 PCast32x4(v#) = CmpNEZ32x4(v#)
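A lane-level sketch (the shadow values are invented for illustration):
suppose x# is fully defined and y# has doubtful bits only in lane 2.
Then binary32Fx4 marks exactly lane 2 of the result as fully undefined
and the other three lanes as defined.  binary32F0x4 on the same inputs
gives a fully defined lane 0 (lane 0 of the UifU is zero) and simply
copies lanes 1..3 of the shadow from x#, mirroring the way the
underlying op copies the upper lanes of its first operand.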
2414 static
2415 IRAtom* binary32Fx4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2417 IRAtom* at;
2418 tl_assert(isShadowAtom(mce, vatomX));
2419 tl_assert(isShadowAtom(mce, vatomY));
2420 at = mkUifUV128(mce, vatomX, vatomY);
2421 at = assignNew('V', mce, Ity_V128, mkPCast32x4(mce, at));
2422 return at;
2425 static
2426 IRAtom* unary32Fx4 ( MCEnv* mce, IRAtom* vatomX )
2428 IRAtom* at;
2429 tl_assert(isShadowAtom(mce, vatomX));
2430 at = assignNew('V', mce, Ity_V128, mkPCast32x4(mce, vatomX));
2431 return at;
2434 static
2435 IRAtom* binary32F0x4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2437 IRAtom* at;
2438 tl_assert(isShadowAtom(mce, vatomX));
2439 tl_assert(isShadowAtom(mce, vatomY));
2440 at = mkUifUV128(mce, vatomX, vatomY);
2441 at = assignNew('V', mce, Ity_I32, unop(Iop_V128to32, at));
2442 at = mkPCastTo(mce, Ity_I32, at);
2443 at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo32, vatomX, at));
2444 return at;
2447 static
2448 IRAtom* unary32F0x4 ( MCEnv* mce, IRAtom* vatomX )
2450 IRAtom* at;
2451 tl_assert(isShadowAtom(mce, vatomX));
2452 at = assignNew('V', mce, Ity_I32, unop(Iop_V128to32, vatomX));
2453 at = mkPCastTo(mce, Ity_I32, at);
2454 at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo32, vatomX, at));
2455 return at;
2458 /* --- ... and ... 64Fx2 versions of the same ... --- */
2460 static
2461 IRAtom* binary64Fx2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2463 IRAtom* at;
2464 tl_assert(isShadowAtom(mce, vatomX));
2465 tl_assert(isShadowAtom(mce, vatomY));
2466 at = mkUifUV128(mce, vatomX, vatomY);
2467 at = assignNew('V', mce, Ity_V128, mkPCast64x2(mce, at));
2468 return at;
2471 static
2472 IRAtom* unary64Fx2 ( MCEnv* mce, IRAtom* vatomX )
2474 IRAtom* at;
2475 tl_assert(isShadowAtom(mce, vatomX));
2476 at = assignNew('V', mce, Ity_V128, mkPCast64x2(mce, vatomX));
2477 return at;
2480 static
2481 IRAtom* binary64F0x2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2483 IRAtom* at;
2484 tl_assert(isShadowAtom(mce, vatomX));
2485 tl_assert(isShadowAtom(mce, vatomY));
2486 at = mkUifUV128(mce, vatomX, vatomY);
2487 at = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, at));
2488 at = mkPCastTo(mce, Ity_I64, at);
2489 at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo64, vatomX, at));
2490 return at;
2493 static
2494 IRAtom* unary64F0x2 ( MCEnv* mce, IRAtom* vatomX )
2496 IRAtom* at;
2497 tl_assert(isShadowAtom(mce, vatomX));
2498 at = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vatomX));
2499 at = mkPCastTo(mce, Ity_I64, at);
2500 at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo64, vatomX, at));
2501 return at;
2504 /* --- --- ... and ... 32Fx2 versions of the same --- --- */
2506 static
2507 IRAtom* binary32Fx2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2509 IRAtom* at;
2510 tl_assert(isShadowAtom(mce, vatomX));
2511 tl_assert(isShadowAtom(mce, vatomY));
2512 at = mkUifU64(mce, vatomX, vatomY);
2513 at = assignNew('V', mce, Ity_I64, mkPCast32x2(mce, at));
2514 return at;
2517 static
2518 IRAtom* unary32Fx2 ( MCEnv* mce, IRAtom* vatomX )
2520 IRAtom* at;
2521 tl_assert(isShadowAtom(mce, vatomX));
2522 at = assignNew('V', mce, Ity_I64, mkPCast32x2(mce, vatomX));
2523 return at;
2526 /* --- ... and ... 64Fx4 versions of the same ... --- */
2528 static
2529 IRAtom* binary64Fx4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2531 IRAtom* at;
2532 tl_assert(isShadowAtom(mce, vatomX));
2533 tl_assert(isShadowAtom(mce, vatomY));
2534 at = mkUifUV256(mce, vatomX, vatomY);
2535 at = assignNew('V', mce, Ity_V256, mkPCast64x4(mce, at));
2536 return at;
2539 static
2540 IRAtom* unary64Fx4 ( MCEnv* mce, IRAtom* vatomX )
2542 IRAtom* at;
2543 tl_assert(isShadowAtom(mce, vatomX));
2544 at = assignNew('V', mce, Ity_V256, mkPCast64x4(mce, vatomX));
2545 return at;
2548 /* --- ... and ... 32Fx8 versions of the same ... --- */
2550 static
2551 IRAtom* binary32Fx8 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2553 IRAtom* at;
2554 tl_assert(isShadowAtom(mce, vatomX));
2555 tl_assert(isShadowAtom(mce, vatomY));
2556 at = mkUifUV256(mce, vatomX, vatomY);
2557 at = assignNew('V', mce, Ity_V256, mkPCast32x8(mce, at));
2558 return at;
2561 static
2562 IRAtom* unary32Fx8 ( MCEnv* mce, IRAtom* vatomX )
2564 IRAtom* at;
2565 tl_assert(isShadowAtom(mce, vatomX));
2566 at = assignNew('V', mce, Ity_V256, mkPCast32x8(mce, vatomX));
2567 return at;
2570 /* --- 64Fx2 binary FP ops, with rounding mode --- */
2572 static
2573 IRAtom* binary64Fx2_w_rm ( MCEnv* mce, IRAtom* vRM,
2574 IRAtom* vatomX, IRAtom* vatomY )
2576 /* This is the same as binary64Fx2, except that we subsequently
2577 pessimise vRM (definedness of the rounding mode), widen to 128
2578 bits and UifU it into the result. As with the scalar cases, if
2579 the RM is a constant then it is defined and so this extra bit
2580 will get constant-folded out later. */
2581 // "do" the vector args
2582 IRAtom* t1 = binary64Fx2(mce, vatomX, vatomY);
2583 // PCast the RM, and widen it to 128 bits
2584 IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2585 // Roll it into the result
2586 t1 = mkUifUV128(mce, t1, t2);
2587 return t1;
2590 /* --- ... and ... 32Fx4 versions of the same --- */
2592 static
2593 IRAtom* binary32Fx4_w_rm ( MCEnv* mce, IRAtom* vRM,
2594 IRAtom* vatomX, IRAtom* vatomY )
2596 IRAtom* t1 = binary32Fx4(mce, vatomX, vatomY);
2597 // PCast the RM, and widen it to 128 bits
2598 IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2599 // Roll it into the result
2600 t1 = mkUifUV128(mce, t1, t2);
2601 return t1;
2604 /* --- ... and ... 64Fx4 versions of the same --- */
2606 static
2607 IRAtom* binary64Fx4_w_rm ( MCEnv* mce, IRAtom* vRM,
2608 IRAtom* vatomX, IRAtom* vatomY )
2610 IRAtom* t1 = binary64Fx4(mce, vatomX, vatomY);
2611 // PCast the RM, and widen it to 256 bits
2612 IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM);
2613 // Roll it into the result
2614 t1 = mkUifUV256(mce, t1, t2);
2615 return t1;
2618 /* --- ... and ... 32Fx8 versions of the same --- */
2620 static
2621 IRAtom* binary32Fx8_w_rm ( MCEnv* mce, IRAtom* vRM,
2622 IRAtom* vatomX, IRAtom* vatomY )
2624 IRAtom* t1 = binary32Fx8(mce, vatomX, vatomY);
2625 // PCast the RM, and widen it to 256 bits
2626 IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM);
2627 // Roll it into the result
2628 t1 = mkUifUV256(mce, t1, t2);
2629 return t1;
2632 /* --- 64Fx2 unary FP ops, with rounding mode --- */
2634 static
2635 IRAtom* unary64Fx2_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
2637 /* Same scheme as binary64Fx2_w_rm. */
2638 // "do" the vector arg
2639 IRAtom* t1 = unary64Fx2(mce, vatomX);
2640 // PCast the RM, and widen it to 128 bits
2641 IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2642 // Roll it into the result
2643 t1 = mkUifUV128(mce, t1, t2);
2644 return t1;
2647 /* --- ... and ... 32Fx4 versions of the same --- */
2649 static
2650 IRAtom* unary32Fx4_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
2652 /* Same scheme as unary64Fx2_w_rm. */
2653 IRAtom* t1 = unary32Fx4(mce, vatomX);
2654 // PCast the RM, and widen it to 128 bits
2655 IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2656 // Roll it into the result
2657 t1 = mkUifUV128(mce, t1, t2);
2658 return t1;
2662 /* --- --- Vector saturated narrowing --- --- */
2664 /* We used to do something very clever here, but on closer inspection
2665 (2011-Jun-15), and in particular bug #279698, it turns out to be
2666 wrong. Part of the problem came from the fact that for a long
2667 time, the IR primops to do with saturated narrowing were
2668 underspecified and managed to confuse multiple cases which needed
2669 to be separate: the op names had a signedness qualifier, but in
2670 fact the source and destination signednesses needed to be specified
2671 independently, so the op names really need two independent
2672 signedness specifiers.
2674 As of 2011-Jun-15 (ish) the underspecification was sorted out
2675 properly. The incorrect instrumentation remained, though. That
2676 has now (2011-Oct-22) been fixed.
2678 What we now do is simple:
2680 Let the original narrowing op be QNarrowBinXtoYxZ, where Z is a
2681 number of lanes, X is the source lane width and signedness, and Y
2682 is the destination lane width and signedness. In all cases the
2683 destination lane width is half the source lane width, so the names
2684 have a bit of redundancy, but are at least easy to read.
2686 For example, Iop_QNarrowBin32Sto16Ux8 narrows 8 lanes of signed 32s
2687 to unsigned 16s.
2689 Let Vanilla(OP) be a function that takes OP, one of these
2690 saturating narrowing ops, and produces the same "shaped" narrowing
2691 op which is not saturating, but merely dumps the most significant
2692 bits. "same shape" means that the lane numbers and widths are the
2693 same as with OP.
2695 For example, Vanilla(Iop_QNarrowBin32Sto16Ux8)
2696 = Iop_NarrowBin32to16x8,
2697 that is, narrow 8 lanes of 32 bits to 8 lanes of 16 bits, by
2698 dumping the top half of each lane.
2700 So, with that in place, the scheme is simple, and it is simple to
2701 pessimise each lane individually and then apply Vanilla(OP) so as
2702 to get the result in the right "shape". If the original OP is
2703 QNarrowBinXtoYxZ then we produce
2705 Vanilla(OP)( PCast-X-to-X-x-Z(vatom1), PCast-X-to-X-x-Z(vatom2) )
2707 or for the case when OP is unary (Iop_QNarrowUn*)
2709 Vanilla(OP)( PCast-X-to-X-x-Z(vatom) )
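A concrete sketch (the op is one of those handled below): for
Iop_QNarrowBin32Sto16Sx8, Vanilla(OP) is Iop_NarrowBin32to16x8.  A
doubtful bit anywhere in a 32-bit source lane can change whether that
lane saturates, and hence can affect every bit of the corresponding
16-bit result lane; so each operand's shadow is first pessimised with
PCast32x4 and only then narrowed with the vanilla op, leaving the
affected result lane fully undefined and the others untouched.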
2711 static
2712 IROp vanillaNarrowingOpOfShape ( IROp qnarrowOp )
2714 switch (qnarrowOp) {
2715 /* Binary: (128, 128) -> 128 */
2716 case Iop_QNarrowBin16Sto8Ux16:
2717 case Iop_QNarrowBin16Sto8Sx16:
2718 case Iop_QNarrowBin16Uto8Ux16:
2719 case Iop_QNarrowBin64Sto32Sx4:
2720 case Iop_QNarrowBin64Uto32Ux4:
2721 return Iop_NarrowBin16to8x16;
2722 case Iop_QNarrowBin32Sto16Ux8:
2723 case Iop_QNarrowBin32Sto16Sx8:
2724 case Iop_QNarrowBin32Uto16Ux8:
2725 return Iop_NarrowBin32to16x8;
2726 /* Binary: (64, 64) -> 64 */
2727 case Iop_QNarrowBin32Sto16Sx4:
2728 return Iop_NarrowBin32to16x4;
2729 case Iop_QNarrowBin16Sto8Ux8:
2730 case Iop_QNarrowBin16Sto8Sx8:
2731 return Iop_NarrowBin16to8x8;
2732 /* Unary: 128 -> 64 */
2733 case Iop_QNarrowUn64Uto32Ux2:
2734 case Iop_QNarrowUn64Sto32Sx2:
2735 case Iop_QNarrowUn64Sto32Ux2:
2736 return Iop_NarrowUn64to32x2;
2737 case Iop_QNarrowUn32Uto16Ux4:
2738 case Iop_QNarrowUn32Sto16Sx4:
2739 case Iop_QNarrowUn32Sto16Ux4:
2740 case Iop_F32toF16x4:
2741 return Iop_NarrowUn32to16x4;
2742 case Iop_QNarrowUn16Uto8Ux8:
2743 case Iop_QNarrowUn16Sto8Sx8:
2744 case Iop_QNarrowUn16Sto8Ux8:
2745 return Iop_NarrowUn16to8x8;
2746 default:
2747 ppIROp(qnarrowOp);
2748 VG_(tool_panic)("vanillaNarrowOpOfShape");
2752 static
2753 IRAtom* vectorNarrowBinV128 ( MCEnv* mce, IROp narrow_op,
2754 IRAtom* vatom1, IRAtom* vatom2)
2756 IRAtom *at1, *at2, *at3;
2757 IRAtom* (*pcast)( MCEnv*, IRAtom* );
2758 switch (narrow_op) {
2759 case Iop_QNarrowBin64Sto32Sx4: pcast = mkPCast32x4; break;
2760 case Iop_QNarrowBin64Uto32Ux4: pcast = mkPCast32x4; break;
2761 case Iop_QNarrowBin32Sto16Sx8: pcast = mkPCast32x4; break;
2762 case Iop_QNarrowBin32Uto16Ux8: pcast = mkPCast32x4; break;
2763 case Iop_QNarrowBin32Sto16Ux8: pcast = mkPCast32x4; break;
2764 case Iop_QNarrowBin16Sto8Sx16: pcast = mkPCast16x8; break;
2765 case Iop_QNarrowBin16Uto8Ux16: pcast = mkPCast16x8; break;
2766 case Iop_QNarrowBin16Sto8Ux16: pcast = mkPCast16x8; break;
2767 default: VG_(tool_panic)("vectorNarrowBinV128");
2769 IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
2770 tl_assert(isShadowAtom(mce,vatom1));
2771 tl_assert(isShadowAtom(mce,vatom2));
2772 at1 = assignNew('V', mce, Ity_V128, pcast(mce, vatom1));
2773 at2 = assignNew('V', mce, Ity_V128, pcast(mce, vatom2));
2774 at3 = assignNew('V', mce, Ity_V128, binop(vanilla_narrow, at1, at2));
2775 return at3;
2778 static
2779 IRAtom* vectorNarrowBin64 ( MCEnv* mce, IROp narrow_op,
2780 IRAtom* vatom1, IRAtom* vatom2)
2782 IRAtom *at1, *at2, *at3;
2783 IRAtom* (*pcast)( MCEnv*, IRAtom* );
2784 switch (narrow_op) {
2785 case Iop_QNarrowBin32Sto16Sx4: pcast = mkPCast32x2; break;
2786 case Iop_QNarrowBin16Sto8Sx8: pcast = mkPCast16x4; break;
2787 case Iop_QNarrowBin16Sto8Ux8: pcast = mkPCast16x4; break;
2788 default: VG_(tool_panic)("vectorNarrowBin64");
2790 IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
2791 tl_assert(isShadowAtom(mce,vatom1));
2792 tl_assert(isShadowAtom(mce,vatom2));
2793 at1 = assignNew('V', mce, Ity_I64, pcast(mce, vatom1));
2794 at2 = assignNew('V', mce, Ity_I64, pcast(mce, vatom2));
2795 at3 = assignNew('V', mce, Ity_I64, binop(vanilla_narrow, at1, at2));
2796 return at3;
2799 static
2800 IRAtom* vectorNarrowUnV128 ( MCEnv* mce, IROp narrow_op,
2801 IRAtom* vatom1)
2803 IRAtom *at1, *at2;
2804 IRAtom* (*pcast)( MCEnv*, IRAtom* );
2805 tl_assert(isShadowAtom(mce,vatom1));
2806 /* For vanilla narrowing (non-saturating), we can just apply
2807 the op directly to the V bits. */
2808 switch (narrow_op) {
2809 case Iop_NarrowUn16to8x8:
2810 case Iop_NarrowUn32to16x4:
2811 case Iop_NarrowUn64to32x2:
2812 case Iop_F32toF16x4:
2813 at1 = assignNew('V', mce, Ity_I64, unop(narrow_op, vatom1));
2814 return at1;
2815 default:
2816 break; /* Do Plan B */
2818 /* Plan B: for ops that involve a saturation operation on the args,
2819 we must PCast before the vanilla narrow. */
2820 switch (narrow_op) {
2821 case Iop_QNarrowUn16Sto8Sx8: pcast = mkPCast16x8; break;
2822 case Iop_QNarrowUn16Sto8Ux8: pcast = mkPCast16x8; break;
2823 case Iop_QNarrowUn16Uto8Ux8: pcast = mkPCast16x8; break;
2824 case Iop_QNarrowUn32Sto16Sx4: pcast = mkPCast32x4; break;
2825 case Iop_QNarrowUn32Sto16Ux4: pcast = mkPCast32x4; break;
2826 case Iop_QNarrowUn32Uto16Ux4: pcast = mkPCast32x4; break;
2827 case Iop_QNarrowUn64Sto32Sx2: pcast = mkPCast64x2; break;
2828 case Iop_QNarrowUn64Sto32Ux2: pcast = mkPCast64x2; break;
2829 case Iop_QNarrowUn64Uto32Ux2: pcast = mkPCast64x2; break;
2830 default: VG_(tool_panic)("vectorNarrowUnV128");
2832 IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
2833 at1 = assignNew('V', mce, Ity_V128, pcast(mce, vatom1));
2834 at2 = assignNew('V', mce, Ity_I64, unop(vanilla_narrow, at1));
2835 return at2;
2838 static
2839 IRAtom* vectorWidenI64 ( MCEnv* mce, IROp longen_op,
2840 IRAtom* vatom1)
2842 IRAtom *at1, *at2;
2843 IRAtom* (*pcast)( MCEnv*, IRAtom* );
2844 switch (longen_op) {
2845 case Iop_Widen8Uto16x8: pcast = mkPCast16x8; break;
2846 case Iop_Widen8Sto16x8: pcast = mkPCast16x8; break;
2847 case Iop_Widen16Uto32x4: pcast = mkPCast32x4; break;
2848 case Iop_Widen16Sto32x4: pcast = mkPCast32x4; break;
2849 case Iop_Widen32Uto64x2: pcast = mkPCast64x2; break;
2850 case Iop_Widen32Sto64x2: pcast = mkPCast64x2; break;
2851 case Iop_F16toF32x4: pcast = mkPCast32x4; break;
2852 default: VG_(tool_panic)("vectorWidenI64");
2854 tl_assert(isShadowAtom(mce,vatom1));
2855 at1 = assignNew('V', mce, Ity_V128, unop(longen_op, vatom1));
2856 at2 = assignNew('V', mce, Ity_V128, pcast(mce, at1));
2857 return at2;
2861 /* --- --- Vector integer arithmetic --- --- */
2863 /* Simple ... UifU the args and per-lane pessimise the results. */
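// For example (the values are invented for illustration): an Iop_Add32x4
// whose first operand is fully defined and whose second operand has a
// single doubtful bit in lane 1 gets, via binary32Ix4 below, a result
// shadow in which lane 1 is entirely undefined and lanes 0, 2 and 3 are
// defined -- no attempt is made to track which bits of the doubtful lane
// could really be affected.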
2865 /* --- V256-bit versions --- */
2867 static
2868 IRAtom* binary8Ix32 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2870 IRAtom* at;
2871 at = mkUifUV256(mce, vatom1, vatom2);
2872 at = mkPCast8x32(mce, at);
2873 return at;
2876 static
2877 IRAtom* binary16Ix16 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2879 IRAtom* at;
2880 at = mkUifUV256(mce, vatom1, vatom2);
2881 at = mkPCast16x16(mce, at);
2882 return at;
2885 static
2886 IRAtom* binary32Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2888 IRAtom* at;
2889 at = mkUifUV256(mce, vatom1, vatom2);
2890 at = mkPCast32x8(mce, at);
2891 return at;
2894 static
2895 IRAtom* binary64Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2897 IRAtom* at;
2898 at = mkUifUV256(mce, vatom1, vatom2);
2899 at = mkPCast64x4(mce, at);
2900 return at;
2903 /* --- V128-bit versions --- */
2905 static
2906 IRAtom* binary8Ix16 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2908 IRAtom* at;
2909 at = mkUifUV128(mce, vatom1, vatom2);
2910 at = mkPCast8x16(mce, at);
2911 return at;
2914 static
2915 IRAtom* binary16Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2917 IRAtom* at;
2918 at = mkUifUV128(mce, vatom1, vatom2);
2919 at = mkPCast16x8(mce, at);
2920 return at;
2923 static
2924 IRAtom* binary32Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2926 IRAtom* at;
2927 at = mkUifUV128(mce, vatom1, vatom2);
2928 at = mkPCast32x4(mce, at);
2929 return at;
2932 static
2933 IRAtom* binary64Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2935 IRAtom* at;
2936 at = mkUifUV128(mce, vatom1, vatom2);
2937 at = mkPCast64x2(mce, at);
2938 return at;
2941 static
2942 IRAtom* binary128Ix1 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2944 IRAtom* at;
2945 at = mkUifUV128(mce, vatom1, vatom2);
2946 at = mkPCast128x1(mce, at);
2947 return at;
2950 /* --- 64-bit versions --- */
2952 static
2953 IRAtom* binary8Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2955 IRAtom* at;
2956 at = mkUifU64(mce, vatom1, vatom2);
2957 at = mkPCast8x8(mce, at);
2958 return at;
2961 static
2962 IRAtom* binary16Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2964 IRAtom* at;
2965 at = mkUifU64(mce, vatom1, vatom2);
2966 at = mkPCast16x4(mce, at);
2967 return at;
2970 static
2971 IRAtom* binary32Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2973 IRAtom* at;
2974 at = mkUifU64(mce, vatom1, vatom2);
2975 at = mkPCast32x2(mce, at);
2976 return at;
2979 static
2980 IRAtom* binary64Ix1 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2982 IRAtom* at;
2983 at = mkUifU64(mce, vatom1, vatom2);
2984 at = mkPCastTo(mce, Ity_I64, at);
2985 return at;
2988 /* --- 32-bit versions --- */
2990 static
2991 IRAtom* binary8Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2993 IRAtom* at;
2994 at = mkUifU32(mce, vatom1, vatom2);
2995 at = mkPCast8x4(mce, at);
2996 return at;
2999 static
3000 IRAtom* binary16Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3002 IRAtom* at;
3003 at = mkUifU32(mce, vatom1, vatom2);
3004 at = mkPCast16x2(mce, at);
3005 return at;
3009 /*------------------------------------------------------------*/
3010 /*--- Generate shadow values from all kinds of IRExprs. ---*/
3011 /*------------------------------------------------------------*/
3013 static
3014 IRAtom* expr2vbits_Qop ( MCEnv* mce,
3015 IROp op,
3016 IRAtom* atom1, IRAtom* atom2,
3017 IRAtom* atom3, IRAtom* atom4 )
3019 IRAtom* vatom1 = expr2vbits( mce, atom1, HuOth );
3020 IRAtom* vatom2 = expr2vbits( mce, atom2, HuOth );
3021 IRAtom* vatom3 = expr2vbits( mce, atom3, HuOth );
3022 IRAtom* vatom4 = expr2vbits( mce, atom4, HuOth );
3024 tl_assert(isOriginalAtom(mce,atom1));
3025 tl_assert(isOriginalAtom(mce,atom2));
3026 tl_assert(isOriginalAtom(mce,atom3));
3027 tl_assert(isOriginalAtom(mce,atom4));
3028 tl_assert(isShadowAtom(mce,vatom1));
3029 tl_assert(isShadowAtom(mce,vatom2));
3030 tl_assert(isShadowAtom(mce,vatom3));
3031 tl_assert(isShadowAtom(mce,vatom4));
3032 tl_assert(sameKindedAtoms(atom1,vatom1));
3033 tl_assert(sameKindedAtoms(atom2,vatom2));
3034 tl_assert(sameKindedAtoms(atom3,vatom3));
3035 tl_assert(sameKindedAtoms(atom4,vatom4));
3036 switch (op) {
3037 case Iop_MAddF64:
3038 case Iop_MAddF64r32:
3039 case Iop_MSubF64:
3040 case Iop_MSubF64r32:
3041 /* I32(rm) x F64 x F64 x F64 -> F64 */
3042 return mkLazy4(mce, Ity_I64, vatom1, vatom2, vatom3, vatom4);
3044 case Iop_MAddF32:
3045 case Iop_MSubF32:
3046 /* I32(rm) x F32 x F32 x F32 -> F32 */
3047 return mkLazy4(mce, Ity_I32, vatom1, vatom2, vatom3, vatom4);
3049 case Iop_MAddF128:
3050 case Iop_MSubF128:
3051 case Iop_NegMAddF128:
3052 case Iop_NegMSubF128:
3053 /* I32(rm) x F128 x F128 x F128 -> F128 */
3054 return mkLazy4(mce, Ity_I128, vatom1, vatom2, vatom3, vatom4);
3056 /* V256-bit data-steering */
3057 case Iop_64x4toV256:
3058 return assignNew('V', mce, Ity_V256,
3059 IRExpr_Qop(op, vatom1, vatom2, vatom3, vatom4));
3061 /* I32/I64 x I8 x I8 x I8 -> I32/I64 */
3062 case Iop_Rotx32:
3063 return mkLazy4(mce, Ity_I32, vatom1, vatom2, vatom3, vatom4);
3064 case Iop_Rotx64:
3065 return mkLazy4(mce, Ity_I64, vatom1, vatom2, vatom3, vatom4);
3066 default:
3067 ppIROp(op);
3068 VG_(tool_panic)("memcheck:expr2vbits_Qop");
3073 static
3074 IRAtom* expr2vbits_Triop ( MCEnv* mce,
3075 IROp op,
3076 IRAtom* atom1, IRAtom* atom2, IRAtom* atom3 )
3078 IRAtom* vatom1 = expr2vbits( mce, atom1, HuOth );
3079 IRAtom* vatom2 = expr2vbits( mce, atom2, HuOth );
3080 IRAtom* vatom3 = expr2vbits( mce, atom3, HuOth );
3082 tl_assert(isOriginalAtom(mce,atom1));
3083 tl_assert(isOriginalAtom(mce,atom2));
3084 tl_assert(isOriginalAtom(mce,atom3));
3085 tl_assert(isShadowAtom(mce,vatom1));
3086 tl_assert(isShadowAtom(mce,vatom2));
3087 tl_assert(isShadowAtom(mce,vatom3));
3088 tl_assert(sameKindedAtoms(atom1,vatom1));
3089 tl_assert(sameKindedAtoms(atom2,vatom2));
3090 tl_assert(sameKindedAtoms(atom3,vatom3));
3091 switch (op) {
3092 case Iop_AddF128:
3093 case Iop_SubF128:
3094 case Iop_MulF128:
3095 case Iop_DivF128:
3096 case Iop_AddD128:
3097 case Iop_SubD128:
3098 case Iop_MulD128:
3099 case Iop_DivD128:
3100 case Iop_QuantizeD128:
3101 /* I32(rm) x F128/D128 x F128/D128 -> F128/D128 */
3102 return mkLazy3(mce, Ity_I128, vatom1, vatom2, vatom3);
3103 case Iop_AddF64:
3104 case Iop_AddD64:
3105 case Iop_AddF64r32:
3106 case Iop_SubF64:
3107 case Iop_SubD64:
3108 case Iop_SubF64r32:
3109 case Iop_MulF64:
3110 case Iop_MulD64:
3111 case Iop_MulF64r32:
3112 case Iop_DivF64:
3113 case Iop_DivD64:
3114 case Iop_DivF64r32:
3115 case Iop_ScaleF64:
3116 case Iop_Yl2xF64:
3117 case Iop_Yl2xp1F64:
3118 case Iop_AtanF64:
3119 case Iop_PRemF64:
3120 case Iop_PRem1F64:
3121 case Iop_QuantizeD64:
3122 /* I32(rm) x F64/D64 x F64/D64 -> F64/D64 */
3123 return mkLazy3(mce, Ity_I64, vatom1, vatom2, vatom3);
3124 case Iop_PRemC3210F64:
3125 case Iop_PRem1C3210F64:
3126 /* I32(rm) x F64 x F64 -> I32 */
3127 return mkLazy3(mce, Ity_I32, vatom1, vatom2, vatom3);
3128 case Iop_AddF32:
3129 case Iop_SubF32:
3130 case Iop_MulF32:
3131 case Iop_DivF32:
3132 /* I32(rm) x F32 x F32 -> F32 */
3133 return mkLazy3(mce, Ity_I32, vatom1, vatom2, vatom3);
3134 case Iop_SignificanceRoundD64:
3135 /* IRRoundingMode(I32) x I8 x D64 -> D64 */
3136 return mkLazy3(mce, Ity_I64, vatom1, vatom2, vatom3);
3137 case Iop_SignificanceRoundD128:
3138 /* IRRoundingMode(I32) x I8 x D128 -> D128 */
3139 return mkLazy3(mce, Ity_I128, vatom1, vatom2, vatom3);
3140 case Iop_SliceV128:
3141 /* (V128, V128, I8) -> V128 */
3142 complainIfUndefined(mce, atom3, NULL);
3143 return assignNew('V', mce, Ity_V128, triop(op, vatom1, vatom2, atom3));
3144 case Iop_Slice64:
3145 /* (I64, I64, I8) -> I64 */
3146 complainIfUndefined(mce, atom3, NULL);
3147 return assignNew('V', mce, Ity_I64, triop(op, vatom1, vatom2, atom3));
3148 case Iop_SetElem8x8:
3149 case Iop_SetElem16x4:
3150 case Iop_SetElem32x2:
3151 complainIfUndefined(mce, atom2, NULL);
3152 return assignNew('V', mce, Ity_I64, triop(op, vatom1, atom2, vatom3));
3154 case Iop_SetElem8x16:
3155 case Iop_SetElem16x8:
3156 case Iop_SetElem32x4:
3157 case Iop_SetElem64x2:
3158 complainIfUndefined(mce, atom2, NULL);
3159 return assignNew('V', mce, Ity_V128, triop(op, vatom1, atom2, vatom3));
3161 case Iop_Perm8x16x2:
3162 /* (V128, V128, V128) -> V128 */
3163 complainIfUndefined(mce, atom3, NULL);
3164 return mkUifUV128(
3165 mce,
3166 assignNew('V', mce, Ity_V128, triop(op, vatom1, vatom2, atom3)),
3167 mkPCast8x16(mce, vatom3)
3170 /* Vector FP with rounding mode as the first arg */
3171 case Iop_Add64Fx2:
3172 case Iop_Sub64Fx2:
3173 case Iop_Mul64Fx2:
3174 case Iop_Div64Fx2:
3175 case Iop_Scale2_64Fx2:
3176 return binary64Fx2_w_rm(mce, vatom1, vatom2, vatom3);
3178 case Iop_Add32Fx4:
3179 case Iop_Sub32Fx4:
3180 case Iop_Mul32Fx4:
3181 case Iop_Div32Fx4:
3182 case Iop_Scale2_32Fx4:
3183 return binary32Fx4_w_rm(mce, vatom1, vatom2, vatom3);
3185 case Iop_Add64Fx4:
3186 case Iop_Sub64Fx4:
3187 case Iop_Mul64Fx4:
3188 case Iop_Div64Fx4:
3189 return binary64Fx4_w_rm(mce, vatom1, vatom2, vatom3);
3191 case Iop_Add32Fx8:
3192 case Iop_Sub32Fx8:
3193 case Iop_Mul32Fx8:
3194 case Iop_Div32Fx8:
3195 return binary32Fx8_w_rm(mce, vatom1, vatom2, vatom3);
3197 case Iop_F32x4_2toQ16x8:
3198 return assignNew('V', mce, Ity_V128,
3199 binop(Iop_PackEvenLanes16x8,
3200 unary32Fx4_w_rm(mce, vatom1, vatom2),
3201 unary32Fx4_w_rm(mce, vatom1, vatom3)));
3202 case Iop_F64x2_2toQ32x4:
3203 return assignNew('V', mce, Ity_V128,
3204 binop(Iop_PackEvenLanes32x4,
3205 unary64Fx2_w_rm(mce, vatom1, vatom2),
3206 unary64Fx2_w_rm(mce, vatom1, vatom3)));
3209 default:
3210 ppIROp(op);
3211 VG_(tool_panic)("memcheck:expr2vbits_Triop");
3216 static
3217 IRAtom* expr2vbits_Binop ( MCEnv* mce,
3218 IROp op,
3219 IRAtom* atom1, IRAtom* atom2,
3220 HowUsed hu/*use HuOth if unknown*/ )
3222 IRType and_or_ty;
3223 IRAtom* (*uifu) (MCEnv*, IRAtom*, IRAtom*);
3224 IRAtom* (*difd) (MCEnv*, IRAtom*, IRAtom*);
3225 IRAtom* (*improve) (MCEnv*, IRAtom*, IRAtom*);
3227 IRAtom* vatom1 = expr2vbits( mce, atom1, HuOth );
3228 IRAtom* vatom2 = expr2vbits( mce, atom2, HuOth );
3230 tl_assert(isOriginalAtom(mce,atom1));
3231 tl_assert(isOriginalAtom(mce,atom2));
3232 tl_assert(isShadowAtom(mce,vatom1));
3233 tl_assert(isShadowAtom(mce,vatom2));
3234 tl_assert(sameKindedAtoms(atom1,vatom1));
3235 tl_assert(sameKindedAtoms(atom2,vatom2));
3236 switch (op) {
3238 /* 32-bit SIMD */
3240 case Iop_Add16x2:
3241 case Iop_HAdd16Ux2:
3242 case Iop_HAdd16Sx2:
3243 case Iop_Sub16x2:
3244 case Iop_HSub16Ux2:
3245 case Iop_HSub16Sx2:
3246 case Iop_QAdd16Sx2:
3247 case Iop_QSub16Sx2:
3248 case Iop_QSub16Ux2:
3249 case Iop_QAdd16Ux2:
3250 return binary16Ix2(mce, vatom1, vatom2);
3252 case Iop_Add8x4:
3253 case Iop_HAdd8Ux4:
3254 case Iop_HAdd8Sx4:
3255 case Iop_Sub8x4:
3256 case Iop_HSub8Ux4:
3257 case Iop_HSub8Sx4:
3258 case Iop_QSub8Ux4:
3259 case Iop_QAdd8Ux4:
3260 case Iop_QSub8Sx4:
3261 case Iop_QAdd8Sx4:
3262 return binary8Ix4(mce, vatom1, vatom2);
3264 /* 64-bit SIMD */
3266 case Iop_ShrN8x8:
3267 case Iop_ShrN16x4:
3268 case Iop_ShrN32x2:
3269 case Iop_SarN8x8:
3270 case Iop_SarN16x4:
3271 case Iop_SarN32x2:
3272 case Iop_ShlN16x4:
3273 case Iop_ShlN32x2:
3274 case Iop_ShlN8x8:
3275 /* Same scheme as with all other shifts. */
3276 complainIfUndefined(mce, atom2, NULL);
3277 return assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2));
3279 case Iop_QNarrowBin32Sto16Sx4:
3280 case Iop_QNarrowBin16Sto8Sx8:
3281 case Iop_QNarrowBin16Sto8Ux8:
3282 return vectorNarrowBin64(mce, op, vatom1, vatom2);
3284 case Iop_Min8Ux8:
3285 case Iop_Min8Sx8:
3286 case Iop_Max8Ux8:
3287 case Iop_Max8Sx8:
3288 case Iop_Avg8Ux8:
3289 case Iop_QSub8Sx8:
3290 case Iop_QSub8Ux8:
3291 case Iop_Sub8x8:
3292 case Iop_CmpGT8Sx8:
3293 case Iop_CmpGT8Ux8:
3294 case Iop_CmpEQ8x8:
3295 case Iop_QAdd8Sx8:
3296 case Iop_QAdd8Ux8:
3297 case Iop_QSal8x8:
3298 case Iop_QShl8x8:
3299 case Iop_Add8x8:
3300 case Iop_Mul8x8:
3301 case Iop_PolynomialMul8x8:
3302 return binary8Ix8(mce, vatom1, vatom2);
3304 case Iop_Min16Sx4:
3305 case Iop_Min16Ux4:
3306 case Iop_Max16Sx4:
3307 case Iop_Max16Ux4:
3308 case Iop_Avg16Ux4:
3309 case Iop_QSub16Ux4:
3310 case Iop_QSub16Sx4:
3311 case Iop_Sub16x4:
3312 case Iop_Mul16x4:
3313 case Iop_MulHi16Sx4:
3314 case Iop_MulHi16Ux4:
3315 case Iop_CmpGT16Sx4:
3316 case Iop_CmpGT16Ux4:
3317 case Iop_CmpEQ16x4:
3318 case Iop_QAdd16Sx4:
3319 case Iop_QAdd16Ux4:
3320 case Iop_QSal16x4:
3321 case Iop_QShl16x4:
3322 case Iop_Add16x4:
3323 case Iop_QDMulHi16Sx4:
3324 case Iop_QRDMulHi16Sx4:
3325 return binary16Ix4(mce, vatom1, vatom2);
3327 case Iop_Sub32x2:
3328 case Iop_Mul32x2:
3329 case Iop_Max32Sx2:
3330 case Iop_Max32Ux2:
3331 case Iop_Min32Sx2:
3332 case Iop_Min32Ux2:
3333 case Iop_CmpGT32Sx2:
3334 case Iop_CmpGT32Ux2:
3335 case Iop_CmpEQ32x2:
3336 case Iop_Add32x2:
3337 case Iop_QAdd32Ux2:
3338 case Iop_QAdd32Sx2:
3339 case Iop_QSub32Ux2:
3340 case Iop_QSub32Sx2:
3341 case Iop_QSal32x2:
3342 case Iop_QShl32x2:
3343 case Iop_QDMulHi32Sx2:
3344 case Iop_QRDMulHi32Sx2:
3345 return binary32Ix2(mce, vatom1, vatom2);
3347 case Iop_QSub64Ux1:
3348 case Iop_QSub64Sx1:
3349 case Iop_QAdd64Ux1:
3350 case Iop_QAdd64Sx1:
3351 case Iop_QSal64x1:
3352 case Iop_QShl64x1:
3353 case Iop_Sal64x1:
3354 return binary64Ix1(mce, vatom1, vatom2);
3356 case Iop_QShlNsatSU8x8:
3357 case Iop_QShlNsatUU8x8:
3358 case Iop_QShlNsatSS8x8:
3359 complainIfUndefined(mce, atom2, NULL);
3360 return mkPCast8x8(mce, vatom1);
3362 case Iop_QShlNsatSU16x4:
3363 case Iop_QShlNsatUU16x4:
3364 case Iop_QShlNsatSS16x4:
3365 complainIfUndefined(mce, atom2, NULL);
3366 return mkPCast16x4(mce, vatom1);
3368 case Iop_QShlNsatSU32x2:
3369 case Iop_QShlNsatUU32x2:
3370 case Iop_QShlNsatSS32x2:
3371 complainIfUndefined(mce, atom2, NULL);
3372 return mkPCast32x2(mce, vatom1);
3374 case Iop_QShlNsatSU64x1:
3375 case Iop_QShlNsatUU64x1:
3376 case Iop_QShlNsatSS64x1:
3377 complainIfUndefined(mce, atom2, NULL);
3378 return mkPCast32x2(mce, vatom1);
3380 case Iop_PwMax32Sx2:
3381 case Iop_PwMax32Ux2:
3382 case Iop_PwMin32Sx2:
3383 case Iop_PwMin32Ux2:
3384 case Iop_PwMax32Fx2:
3385 case Iop_PwMin32Fx2:
3386 return assignNew('V', mce, Ity_I64,
3387 binop(Iop_PwMax32Ux2,
3388 mkPCast32x2(mce, vatom1),
3389 mkPCast32x2(mce, vatom2)));
3391 case Iop_PwMax16Sx4:
3392 case Iop_PwMax16Ux4:
3393 case Iop_PwMin16Sx4:
3394 case Iop_PwMin16Ux4:
3395 return assignNew('V', mce, Ity_I64,
3396 binop(Iop_PwMax16Ux4,
3397 mkPCast16x4(mce, vatom1),
3398 mkPCast16x4(mce, vatom2)));
3400 case Iop_PwMax8Sx8:
3401 case Iop_PwMax8Ux8:
3402 case Iop_PwMin8Sx8:
3403 case Iop_PwMin8Ux8:
3404 return assignNew('V', mce, Ity_I64,
3405 binop(Iop_PwMax8Ux8,
3406 mkPCast8x8(mce, vatom1),
3407 mkPCast8x8(mce, vatom2)));
3409 case Iop_PwAdd32x2:
3410 case Iop_PwAdd32Fx2:
3411 return mkPCast32x2(mce,
3412 assignNew('V', mce, Ity_I64,
3413 binop(Iop_PwAdd32x2,
3414 mkPCast32x2(mce, vatom1),
3415 mkPCast32x2(mce, vatom2))));
3417 case Iop_PwAdd16x4:
3418 return mkPCast16x4(mce,
3419 assignNew('V', mce, Ity_I64,
3420 binop(op, mkPCast16x4(mce, vatom1),
3421 mkPCast16x4(mce, vatom2))));
3423 case Iop_PwAdd8x8:
3424 return mkPCast8x8(mce,
3425 assignNew('V', mce, Ity_I64,
3426 binop(op, mkPCast8x8(mce, vatom1),
3427 mkPCast8x8(mce, vatom2))));
3429 case Iop_Shl8x8:
3430 case Iop_Shr8x8:
3431 case Iop_Sar8x8:
3432 case Iop_Sal8x8:
3433 return mkUifU64(mce,
3434 assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3435 mkPCast8x8(mce,vatom2)
3438 case Iop_Shl16x4:
3439 case Iop_Shr16x4:
3440 case Iop_Sar16x4:
3441 case Iop_Sal16x4:
3442 return mkUifU64(mce,
3443 assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3444 mkPCast16x4(mce,vatom2)
3447 case Iop_Shl32x2:
3448 case Iop_Shr32x2:
3449 case Iop_Sar32x2:
3450 case Iop_Sal32x2:
3451 return mkUifU64(mce,
3452 assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3453 mkPCast32x2(mce,vatom2)
3456 /* 64-bit data-steering */
3457 case Iop_InterleaveLO32x2:
3458 case Iop_InterleaveLO16x4:
3459 case Iop_InterleaveLO8x8:
3460 case Iop_InterleaveHI32x2:
3461 case Iop_InterleaveHI16x4:
3462 case Iop_InterleaveHI8x8:
3463 case Iop_CatOddLanes8x8:
3464 case Iop_CatEvenLanes8x8:
3465 case Iop_CatOddLanes16x4:
3466 case Iop_CatEvenLanes16x4:
3467 case Iop_InterleaveOddLanes8x8:
3468 case Iop_InterleaveEvenLanes8x8:
3469 case Iop_InterleaveOddLanes16x4:
3470 case Iop_InterleaveEvenLanes16x4:
3471 return assignNew('V', mce, Ity_I64, binop(op, vatom1, vatom2));
3473 case Iop_GetElem8x8:
3474 complainIfUndefined(mce, atom2, NULL);
3475 return assignNew('V', mce, Ity_I8, binop(op, vatom1, atom2));
3476 case Iop_GetElem16x4:
3477 complainIfUndefined(mce, atom2, NULL);
3478 return assignNew('V', mce, Ity_I16, binop(op, vatom1, atom2));
3479 case Iop_GetElem32x2:
3480 complainIfUndefined(mce, atom2, NULL);
3481 return assignNew('V', mce, Ity_I32, binop(op, vatom1, atom2));
3483 /* Perm8x8: rearrange values in left arg using steering values
3484 from right arg. So rearrange the vbits in the same way but
3485 pessimise wrt steering values. */
3486 case Iop_Perm8x8:
3487 return mkUifU64(
3488 mce,
3489 assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3490 mkPCast8x8(mce, vatom2)
3493 /* V128-bit SIMD */
3495 case Iop_Sqrt32Fx4:
3496 return unary32Fx4_w_rm(mce, vatom1, vatom2);
3497 case Iop_Sqrt64Fx2:
3498 return unary64Fx2_w_rm(mce, vatom1, vatom2);
3500 case Iop_ShrN8x16:
3501 case Iop_ShrN16x8:
3502 case Iop_ShrN32x4:
3503 case Iop_ShrN64x2:
3504 case Iop_SarN8x16:
3505 case Iop_SarN16x8:
3506 case Iop_SarN32x4:
3507 case Iop_SarN64x2:
3508 case Iop_ShlN8x16:
3509 case Iop_ShlN16x8:
3510 case Iop_ShlN32x4:
3511 case Iop_ShlN64x2:
3512 /* Same scheme as with all other shifts. Note: 22 Oct 05:
3513 this is wrong now; scalar shifts are done properly lazily.
3514 Vector shifts should be fixed too. */
3515 complainIfUndefined(mce, atom2, NULL);
3516 return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
3518 /* V x V shifts/rotates are done using the standard lazy scheme. */
3519 /* For the non-rounding variants of bi-di vector x vector
3520 shifts (the Iop_Sh.. ops, that is) we use the lazy scheme.
3521 But note that this is overly pessimistic, because in fact only
3522 the bottom 8 bits of each lane of the second argument are taken
3523 into account when shifting. So really we ought to ignore
3524 undefinedness in bits 8 and above of each lane in the
3525 second argument. */
3526 case Iop_Shl8x16:
3527 case Iop_Shr8x16:
3528 case Iop_Sar8x16:
3529 case Iop_Sal8x16:
3530 case Iop_Rol8x16:
3531 case Iop_Sh8Sx16:
3532 case Iop_Sh8Ux16:
3533 return mkUifUV128(mce,
3534 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3535 mkPCast8x16(mce,vatom2)
3538 case Iop_Shl16x8:
3539 case Iop_Shr16x8:
3540 case Iop_Sar16x8:
3541 case Iop_Sal16x8:
3542 case Iop_Rol16x8:
3543 case Iop_Sh16Sx8:
3544 case Iop_Sh16Ux8:
3545 return mkUifUV128(mce,
3546 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3547 mkPCast16x8(mce,vatom2)
3550 case Iop_Shl32x4:
3551 case Iop_Shr32x4:
3552 case Iop_Sar32x4:
3553 case Iop_Sal32x4:
3554 case Iop_Rol32x4:
3555 case Iop_Sh32Sx4:
3556 case Iop_Sh32Ux4:
3557 return mkUifUV128(mce,
3558 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3559 mkPCast32x4(mce,vatom2)
3562 case Iop_Shl64x2:
3563 case Iop_Shr64x2:
3564 case Iop_Sar64x2:
3565 case Iop_Sal64x2:
3566 case Iop_Rol64x2:
3567 case Iop_Sh64Sx2:
3568 case Iop_Sh64Ux2:
3569 return mkUifUV128(mce,
3570 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3571 mkPCast64x2(mce,vatom2)
3574 /* For the rounding variants of bi-di vector x vector shifts, the
3575 rounding adjustment can cause undefinedness to propagate through
3576 the entire lane, in the worst case. Too complex to handle
3577 properly .. just UifU the arguments and then PCast them.
3578 Suboptimal but safe. */
3579 case Iop_Rsh8Sx16:
3580 case Iop_Rsh8Ux16:
3581 return binary8Ix16(mce, vatom1, vatom2);
3582 case Iop_Rsh16Sx8:
3583 case Iop_Rsh16Ux8:
3584 return binary16Ix8(mce, vatom1, vatom2);
3585 case Iop_Rsh32Sx4:
3586 case Iop_Rsh32Ux4:
3587 return binary32Ix4(mce, vatom1, vatom2);
3588 case Iop_Rsh64Sx2:
3589 case Iop_Rsh64Ux2:
3590 return binary64Ix2(mce, vatom1, vatom2);
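/* Editor's sketch (illustrative only, not compiled): what binary32Ix4 and
   friends amount to for a single lane -- UifU the two inputs, then smear any
   undefinedness across the whole lane.  Name invented. */
#if 0
#include <stdint.h>
static uint32_t example_binary32I_lane_vbits ( uint32_t v1, uint32_t v2 )
{
   uint32_t uifu = v1 | v2;                  /* undefined if either input is */
   return (uifu != 0) ? 0xFFFFFFFFu : 0u;    /* PCast across the lane        */
}
#endif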
3592 case Iop_F32ToFixed32Ux4_RZ:
3593 case Iop_F32ToFixed32Sx4_RZ:
3594 case Iop_Fixed32UToF32x4_RN:
3595 case Iop_Fixed32SToF32x4_RN:
3596 complainIfUndefined(mce, atom2, NULL);
3597 return mkPCast32x4(mce, vatom1);
3599 case Iop_F32ToFixed32Ux2_RZ:
3600 case Iop_F32ToFixed32Sx2_RZ:
3601 case Iop_Fixed32UToF32x2_RN:
3602 case Iop_Fixed32SToF32x2_RN:
3603 complainIfUndefined(mce, atom2, NULL);
3604 return mkPCast32x2(mce, vatom1);
3606 case Iop_QSub8Ux16:
3607 case Iop_QSub8Sx16:
3608 case Iop_Sub8x16:
3609 case Iop_Min8Ux16:
3610 case Iop_Min8Sx16:
3611 case Iop_Max8Ux16:
3612 case Iop_Max8Sx16:
3613 case Iop_CmpGT8Sx16:
3614 case Iop_CmpGT8Ux16:
3615 case Iop_CmpEQ8x16:
3616 case Iop_Avg8Ux16:
3617 case Iop_Avg8Sx16:
3618 case Iop_QAdd8Ux16:
3619 case Iop_QAdd8Sx16:
3620 case Iop_QAddExtUSsatSS8x16:
3621 case Iop_QAddExtSUsatUU8x16:
3622 case Iop_QSal8x16:
3623 case Iop_QShl8x16:
3624 case Iop_Add8x16:
3625 case Iop_Mul8x16:
3626 case Iop_MulHi8Sx16:
3627 case Iop_MulHi8Ux16:
3628 case Iop_PolynomialMul8x16:
3629 case Iop_PolynomialMulAdd8x16:
3630 return binary8Ix16(mce, vatom1, vatom2);
3632 case Iop_QSub16Ux8:
3633 case Iop_QSub16Sx8:
3634 case Iop_Sub16x8:
3635 case Iop_Mul16x8:
3636 case Iop_MulHi16Sx8:
3637 case Iop_MulHi16Ux8:
3638 case Iop_Min16Sx8:
3639 case Iop_Min16Ux8:
3640 case Iop_Max16Sx8:
3641 case Iop_Max16Ux8:
3642 case Iop_CmpGT16Sx8:
3643 case Iop_CmpGT16Ux8:
3644 case Iop_CmpEQ16x8:
3645 case Iop_Avg16Ux8:
3646 case Iop_Avg16Sx8:
3647 case Iop_QAdd16Ux8:
3648 case Iop_QAdd16Sx8:
3649 case Iop_QAddExtUSsatSS16x8:
3650 case Iop_QAddExtSUsatUU16x8:
3651 case Iop_QSal16x8:
3652 case Iop_QShl16x8:
3653 case Iop_Add16x8:
3654 case Iop_QDMulHi16Sx8:
3655 case Iop_QRDMulHi16Sx8:
3656 case Iop_PolynomialMulAdd16x8:
3657 return binary16Ix8(mce, vatom1, vatom2);
3659 case Iop_Sub32x4:
3660 case Iop_CmpGT32Sx4:
3661 case Iop_CmpGT32Ux4:
3662 case Iop_CmpEQ32x4:
3663 case Iop_QAdd32Sx4:
3664 case Iop_QAdd32Ux4:
3665 case Iop_QSub32Sx4:
3666 case Iop_QSub32Ux4:
3667 case Iop_QAddExtUSsatSS32x4:
3668 case Iop_QAddExtSUsatUU32x4:
3669 case Iop_QSal32x4:
3670 case Iop_QShl32x4:
3671 case Iop_Avg32Ux4:
3672 case Iop_Avg32Sx4:
3673 case Iop_Add32x4:
3674 case Iop_Max32Ux4:
3675 case Iop_Max32Sx4:
3676 case Iop_Min32Ux4:
3677 case Iop_Min32Sx4:
3678 case Iop_Mul32x4:
3679 case Iop_MulHi32Sx4:
3680 case Iop_MulHi32Ux4:
3681 case Iop_QDMulHi32Sx4:
3682 case Iop_QRDMulHi32Sx4:
3683 case Iop_PolynomialMulAdd32x4:
3684 return binary32Ix4(mce, vatom1, vatom2);
3686 case Iop_Sub64x2:
3687 case Iop_Add64x2:
3688 case Iop_Avg64Ux2:
3689 case Iop_Avg64Sx2:
3690 case Iop_Max64Sx2:
3691 case Iop_Max64Ux2:
3692 case Iop_Min64Sx2:
3693 case Iop_Min64Ux2:
3694 case Iop_CmpEQ64x2:
3695 case Iop_CmpGT64Sx2:
3696 case Iop_CmpGT64Ux2:
3697 case Iop_QSal64x2:
3698 case Iop_QShl64x2:
3699 case Iop_QAdd64Ux2:
3700 case Iop_QAdd64Sx2:
3701 case Iop_QSub64Ux2:
3702 case Iop_QSub64Sx2:
3703 case Iop_QAddExtUSsatSS64x2:
3704 case Iop_QAddExtSUsatUU64x2:
3705 case Iop_PolynomialMulAdd64x2:
3706 case Iop_CipherV128:
3707 case Iop_CipherLV128:
3708 case Iop_NCipherV128:
3709 case Iop_NCipherLV128:
3710 case Iop_MulI128by10E:
3711 case Iop_MulI128by10ECarry:
3712 return binary64Ix2(mce, vatom1, vatom2);
3714 case Iop_Add128x1:
3715 case Iop_Sub128x1:
3716 case Iop_CmpNEZ128x1:
3717 return binary128Ix1(mce, vatom1, vatom2);
3719 case Iop_QNarrowBin64Sto32Sx4:
3720 case Iop_QNarrowBin64Uto32Ux4:
3721 case Iop_QNarrowBin32Sto16Sx8:
3722 case Iop_QNarrowBin32Uto16Ux8:
3723 case Iop_QNarrowBin32Sto16Ux8:
3724 case Iop_QNarrowBin16Sto8Sx16:
3725 case Iop_QNarrowBin16Uto8Ux16:
3726 case Iop_QNarrowBin16Sto8Ux16:
3727 return vectorNarrowBinV128(mce, op, vatom1, vatom2);
3729 case Iop_Min64Fx2:
3730 case Iop_Max64Fx2:
3731 case Iop_CmpLT64Fx2:
3732 case Iop_CmpLE64Fx2:
3733 case Iop_CmpEQ64Fx2:
3734 case Iop_CmpUN64Fx2:
3735 case Iop_RecipStep64Fx2:
3736 case Iop_RSqrtStep64Fx2:
3737 return binary64Fx2(mce, vatom1, vatom2);
3739 case Iop_Sub64F0x2:
3740 case Iop_Mul64F0x2:
3741 case Iop_Min64F0x2:
3742 case Iop_Max64F0x2:
3743 case Iop_Div64F0x2:
3744 case Iop_CmpLT64F0x2:
3745 case Iop_CmpLE64F0x2:
3746 case Iop_CmpEQ64F0x2:
3747 case Iop_CmpUN64F0x2:
3748 case Iop_Add64F0x2:
3749 return binary64F0x2(mce, vatom1, vatom2);
3751 case Iop_Min32Fx4:
3752 case Iop_Max32Fx4:
3753 case Iop_CmpLT32Fx4:
3754 case Iop_CmpLE32Fx4:
3755 case Iop_CmpEQ32Fx4:
3756 case Iop_CmpUN32Fx4:
3757 case Iop_CmpGT32Fx4:
3758 case Iop_CmpGE32Fx4:
3759 case Iop_RecipStep32Fx4:
3760 case Iop_RSqrtStep32Fx4:
3761 return binary32Fx4(mce, vatom1, vatom2);
3763 case Iop_Sub32Fx2:
3764 case Iop_Mul32Fx2:
3765 case Iop_Min32Fx2:
3766 case Iop_Max32Fx2:
3767 case Iop_CmpEQ32Fx2:
3768 case Iop_CmpGT32Fx2:
3769 case Iop_CmpGE32Fx2:
3770 case Iop_Add32Fx2:
3771 case Iop_RecipStep32Fx2:
3772 case Iop_RSqrtStep32Fx2:
3773 return binary32Fx2(mce, vatom1, vatom2);
3775 case Iop_Sub32F0x4:
3776 case Iop_Mul32F0x4:
3777 case Iop_Min32F0x4:
3778 case Iop_Max32F0x4:
3779 case Iop_Div32F0x4:
3780 case Iop_CmpLT32F0x4:
3781 case Iop_CmpLE32F0x4:
3782 case Iop_CmpEQ32F0x4:
3783 case Iop_CmpUN32F0x4:
3784 case Iop_Add32F0x4:
3785 return binary32F0x4(mce, vatom1, vatom2);
3787 case Iop_QShlNsatSU8x16:
3788 case Iop_QShlNsatUU8x16:
3789 case Iop_QShlNsatSS8x16:
3790 complainIfUndefined(mce, atom2, NULL);
3791 return mkPCast8x16(mce, vatom1);
3793 case Iop_QShlNsatSU16x8:
3794 case Iop_QShlNsatUU16x8:
3795 case Iop_QShlNsatSS16x8:
3796 complainIfUndefined(mce, atom2, NULL);
3797 return mkPCast16x8(mce, vatom1);
3799 case Iop_QShlNsatSU32x4:
3800 case Iop_QShlNsatUU32x4:
3801 case Iop_QShlNsatSS32x4:
3802 complainIfUndefined(mce, atom2, NULL);
3803 return mkPCast32x4(mce, vatom1);
3805 case Iop_QShlNsatSU64x2:
3806 case Iop_QShlNsatUU64x2:
3807 case Iop_QShlNsatSS64x2:
3808 complainIfUndefined(mce, atom2, NULL);
3809 return mkPCast64x2(mce, vatom1);
3811 /* Q-and-Qshift-by-imm-and-narrow of the form (V128, I8) -> V128.
3812 To make this simpler, do the following:
3813 * complain if the shift amount (the I8) is undefined
3814 * pcast each lane at the wide width
3815 * truncate each lane to half width
3816 * pcast the resulting 64-bit value to a single bit and use
3817 that as the least significant bit of the upper half of the
3818 result. */
3819 case Iop_QandQShrNnarrow64Uto32Ux2:
3820 case Iop_QandQSarNnarrow64Sto32Sx2:
3821 case Iop_QandQSarNnarrow64Sto32Ux2:
3822 case Iop_QandQRShrNnarrow64Uto32Ux2:
3823 case Iop_QandQRSarNnarrow64Sto32Sx2:
3824 case Iop_QandQRSarNnarrow64Sto32Ux2:
3825 case Iop_QandQShrNnarrow32Uto16Ux4:
3826 case Iop_QandQSarNnarrow32Sto16Sx4:
3827 case Iop_QandQSarNnarrow32Sto16Ux4:
3828 case Iop_QandQRShrNnarrow32Uto16Ux4:
3829 case Iop_QandQRSarNnarrow32Sto16Sx4:
3830 case Iop_QandQRSarNnarrow32Sto16Ux4:
3831 case Iop_QandQShrNnarrow16Uto8Ux8:
3832 case Iop_QandQSarNnarrow16Sto8Sx8:
3833 case Iop_QandQSarNnarrow16Sto8Ux8:
3834 case Iop_QandQRShrNnarrow16Uto8Ux8:
3835 case Iop_QandQRSarNnarrow16Sto8Sx8:
3836 case Iop_QandQRSarNnarrow16Sto8Ux8:
3838 IRAtom* (*fnPessim) (MCEnv*, IRAtom*) = NULL;
3839 IROp opNarrow = Iop_INVALID;
3840 switch (op) {
3841 case Iop_QandQShrNnarrow64Uto32Ux2:
3842 case Iop_QandQSarNnarrow64Sto32Sx2:
3843 case Iop_QandQSarNnarrow64Sto32Ux2:
3844 case Iop_QandQRShrNnarrow64Uto32Ux2:
3845 case Iop_QandQRSarNnarrow64Sto32Sx2:
3846 case Iop_QandQRSarNnarrow64Sto32Ux2:
3847 fnPessim = mkPCast64x2;
3848 opNarrow = Iop_NarrowUn64to32x2;
3849 break;
3850 case Iop_QandQShrNnarrow32Uto16Ux4:
3851 case Iop_QandQSarNnarrow32Sto16Sx4:
3852 case Iop_QandQSarNnarrow32Sto16Ux4:
3853 case Iop_QandQRShrNnarrow32Uto16Ux4:
3854 case Iop_QandQRSarNnarrow32Sto16Sx4:
3855 case Iop_QandQRSarNnarrow32Sto16Ux4:
3856 fnPessim = mkPCast32x4;
3857 opNarrow = Iop_NarrowUn32to16x4;
3858 break;
3859 case Iop_QandQShrNnarrow16Uto8Ux8:
3860 case Iop_QandQSarNnarrow16Sto8Sx8:
3861 case Iop_QandQSarNnarrow16Sto8Ux8:
3862 case Iop_QandQRShrNnarrow16Uto8Ux8:
3863 case Iop_QandQRSarNnarrow16Sto8Sx8:
3864 case Iop_QandQRSarNnarrow16Sto8Ux8:
3865 fnPessim = mkPCast16x8;
3866 opNarrow = Iop_NarrowUn16to8x8;
3867 break;
3868 default:
3869 tl_assert(0);
3871 complainIfUndefined(mce, atom2, NULL);
3872 // Pessimised shift result
3873 IRAtom* shV
3874 = fnPessim(mce, vatom1);
3875 // Narrowed, pessimised shift result
3876 IRAtom* shVnarrowed
3877 = assignNew('V', mce, Ity_I64, unop(opNarrow, shV));
3878 // Generates: Def--(63)--Def PCast-to-I1(narrowed)
3879 IRAtom* qV = mkPCastXXtoXXlsb(mce, shVnarrowed, Ity_I64);
3880 // and assemble the result
3881 return assignNew('V', mce, Ity_V128,
3882 binop(Iop_64HLtoV128, qV, shVnarrowed));
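/* Editor's sketch (illustrative only, not compiled): how the final V128
   shadow is assembled just above, with its two 64-bit halves modelled as
   separate integers.  Names invented. */
#if 0
#include <stdint.h>
static void example_QandQNarrow_vbits ( uint64_t narrowedV, /* pessimised, narrowed lanes */
                                        uint64_t* hi64,     /* upper half of result V128  */
                                        uint64_t* lo64 )    /* lower half of result V128  */
{
   *lo64 = narrowedV;                          /* narrowed, pessimised shift result */
   *hi64 = (narrowedV != 0) ? 1ULL : 0ULL;     /* Def--(63)--Def PCast-to-I1 in lsb */
}
#endif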
3885 case Iop_Mull32Sx2:
3886 case Iop_Mull32Ux2:
3887 case Iop_QDMull32Sx2:
3888 return vectorWidenI64(mce, Iop_Widen32Sto64x2,
3889 mkUifU64(mce, vatom1, vatom2));
3891 case Iop_Mull16Sx4:
3892 case Iop_Mull16Ux4:
3893 case Iop_QDMull16Sx4:
3894 return vectorWidenI64(mce, Iop_Widen16Sto32x4,
3895 mkUifU64(mce, vatom1, vatom2));
3897 case Iop_Mull8Sx8:
3898 case Iop_Mull8Ux8:
3899 case Iop_PolynomialMull8x8:
3900 return vectorWidenI64(mce, Iop_Widen8Sto16x8,
3901 mkUifU64(mce, vatom1, vatom2));
3903 case Iop_PwAdd32x4:
3904 return mkPCast32x4(mce,
3905 assignNew('V', mce, Ity_V128, binop(op, mkPCast32x4(mce, vatom1),
3906 mkPCast32x4(mce, vatom2))));
3908 case Iop_PwAdd16x8:
3909 return mkPCast16x8(mce,
3910 assignNew('V', mce, Ity_V128, binop(op, mkPCast16x8(mce, vatom1),
3911 mkPCast16x8(mce, vatom2))));
3913 case Iop_PwAdd8x16:
3914 return mkPCast8x16(mce,
3915 assignNew('V', mce, Ity_V128, binop(op, mkPCast8x16(mce, vatom1),
3916 mkPCast8x16(mce, vatom2))));
3918 /* V128-bit data-steering */
3919 case Iop_SetV128lo32:
3920 case Iop_SetV128lo64:
3921 case Iop_64HLtoV128:
3922 case Iop_InterleaveLO64x2:
3923 case Iop_InterleaveLO32x4:
3924 case Iop_InterleaveLO16x8:
3925 case Iop_InterleaveLO8x16:
3926 case Iop_InterleaveHI64x2:
3927 case Iop_InterleaveHI32x4:
3928 case Iop_InterleaveHI16x8:
3929 case Iop_InterleaveHI8x16:
3930 case Iop_CatOddLanes8x16:
3931 case Iop_CatOddLanes16x8:
3932 case Iop_CatOddLanes32x4:
3933 case Iop_CatEvenLanes8x16:
3934 case Iop_CatEvenLanes16x8:
3935 case Iop_CatEvenLanes32x4:
3936 case Iop_InterleaveOddLanes8x16:
3937 case Iop_InterleaveOddLanes16x8:
3938 case Iop_InterleaveOddLanes32x4:
3939 case Iop_InterleaveEvenLanes8x16:
3940 case Iop_InterleaveEvenLanes16x8:
3941 case Iop_InterleaveEvenLanes32x4:
3942 case Iop_PackOddLanes8x16:
3943 case Iop_PackOddLanes16x8:
3944 case Iop_PackOddLanes32x4:
3945 case Iop_PackEvenLanes8x16:
3946 case Iop_PackEvenLanes16x8:
3947 case Iop_PackEvenLanes32x4:
3948 return assignNew('V', mce, Ity_V128, binop(op, vatom1, vatom2));
3950 case Iop_GetElem8x16:
3951 complainIfUndefined(mce, atom2, NULL);
3952 return assignNew('V', mce, Ity_I8, binop(op, vatom1, atom2));
3953 case Iop_GetElem16x8:
3954 complainIfUndefined(mce, atom2, NULL);
3955 return assignNew('V', mce, Ity_I16, binop(op, vatom1, atom2));
3956 case Iop_GetElem32x4:
3957 complainIfUndefined(mce, atom2, NULL);
3958 return assignNew('V', mce, Ity_I32, binop(op, vatom1, atom2));
3959 case Iop_GetElem64x2:
3960 complainIfUndefined(mce, atom2, NULL);
3961 return assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2));
3963 /* Perm8x16: rearrange values in left arg using steering values
3964 from right arg. So rearrange the vbits in the same way but
3965 pessimise wrt steering values. Perm32x4 ditto. */
3966 case Iop_Perm8x16:
3967 return mkUifUV128(
3968 mce,
3969 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3970 mkPCast8x16(mce, vatom2)
3972 case Iop_Perm32x4:
3973 return mkUifUV128(
3974 mce,
3975 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3976 mkPCast32x4(mce, vatom2)
3979 /* These two take the lower half of each 16-bit lane, sign/zero
3980 extend it to 32, and multiply together, producing a 32x4
3981 result (and implicitly ignoring half the operand bits). So
3982 treat it as a bunch of independent 16x8 operations, but then
3983 do 32-bit shifts left-right to copy the lower half results
3984 (which are all 0s or all 1s due to PCasting in binary16Ix8)
3985 into the upper half of each result lane. */
3986 case Iop_MullEven16Ux8:
3987 case Iop_MullEven16Sx8: {
3988 IRAtom* at;
3989 at = binary16Ix8(mce,vatom1,vatom2);
3990 at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN32x4, at, mkU8(16)));
3991 at = assignNew('V', mce, Ity_V128, binop(Iop_SarN32x4, at, mkU8(16)));
3992 return at;
3995 /* Same deal as Iop_MullEven16{S,U}x8 */
3996 case Iop_MullEven8Ux16:
3997 case Iop_MullEven8Sx16: {
3998 IRAtom* at;
3999 at = binary8Ix16(mce,vatom1,vatom2);
4000 at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN16x8, at, mkU8(8)));
4001 at = assignNew('V', mce, Ity_V128, binop(Iop_SarN16x8, at, mkU8(8)));
4002 return at;
4005 /* Same deal as Iop_MullEven16{S,U}x8 */
4006 case Iop_MullEven32Ux4:
4007 case Iop_MullEven32Sx4: {
4008 IRAtom* at;
4009 at = binary32Ix4(mce,vatom1,vatom2);
4010 at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN64x2, at, mkU8(32)));
4011 at = assignNew('V', mce, Ity_V128, binop(Iop_SarN64x2, at, mkU8(32)));
4012 return at;
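/* Editor's sketch (illustrative only, not compiled): the shift trick used by
   the MullEven cases above, shown on one 32-bit result lane.  After
   binary16Ix8 the low 16 bits are already all-0s or all-1s; the shl/sar pair
   copies that value into the upper 16 bits as well.  Name invented. */
#if 0
#include <stdint.h>
static uint32_t example_mulleven_lane_vbits ( uint32_t lane )
{
   uint32_t shl = lane << 16;               /* ShlN32x4 by 16 */
   return (uint32_t)((int32_t)shl >> 16);   /* SarN32x4 by 16 */
}
#endif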
4015 /* narrow 2xV128 into 1xV128, hi half from left arg, in a 2 x
4016 32x4 -> 16x8 laneage, discarding the upper half of each lane.
4017 Simply apply the same op to the V bits, since this is really no more
4018 than a data steering operation. */
4019 case Iop_NarrowBin32to16x8:
4020 case Iop_NarrowBin16to8x16:
4021 case Iop_NarrowBin64to32x4:
4022 return assignNew('V', mce, Ity_V128,
4023 binop(op, vatom1, vatom2));
4025 case Iop_ShrV128:
4026 case Iop_SarV128:
4027 case Iop_ShlV128:
4028 case Iop_I128StoBCD128:
4029 /* Same scheme as with all other shifts. Note: 10 Nov 05:
4030 this is wrong now, scalar shifts are done properly lazily.
4031 Vector shifts should be fixed too. */
4032 complainIfUndefined(mce, atom2, NULL);
4033 return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
4035 case Iop_BCDAdd:
4036 case Iop_BCDSub:
4037 return mkLazy2(mce, Ity_V128, vatom1, vatom2);
4039 /* SHA Iops */
4040 case Iop_SHA256:
4041 case Iop_SHA512:
4042 complainIfUndefined(mce, atom2, NULL);
4043 return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
4045 /* I128-bit data-steering */
4046 case Iop_64HLto128:
4047 return assignNew('V', mce, Ity_I128, binop(op, vatom1, vatom2));
4049 /* V256-bit SIMD */
4051 case Iop_Max64Fx4:
4052 case Iop_Min64Fx4:
4053 return binary64Fx4(mce, vatom1, vatom2);
4055 case Iop_Max32Fx8:
4056 case Iop_Min32Fx8:
4057 return binary32Fx8(mce, vatom1, vatom2);
4059 /* V256-bit data-steering */
4060 case Iop_V128HLtoV256:
4061 return assignNew('V', mce, Ity_V256, binop(op, vatom1, vatom2));
4063 /* Scalar floating point */
4065 case Iop_F32toI64S:
4066 case Iop_F32toI64U:
4067 /* I32(rm) x F32 -> I64 */
4068 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4070 case Iop_I64StoF32:
4071 /* I32(rm) x I64 -> F32 */
4072 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4074 case Iop_RoundF64toInt:
4075 case Iop_RoundF64toF32:
4076 case Iop_F64toI64S:
4077 case Iop_F64toI64U:
4078 case Iop_I64StoF64:
4079 case Iop_I64UtoF64:
4080 case Iop_SinF64:
4081 case Iop_CosF64:
4082 case Iop_TanF64:
4083 case Iop_2xm1F64:
4084 case Iop_SqrtF64:
4085 case Iop_RecpExpF64:
4086 /* I32(rm) x I64/F64 -> I64/F64 */
4087 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4089 case Iop_ShlD64:
4090 case Iop_ShrD64:
4091 case Iop_RoundD64toInt:
4092 /* I32(rm) x D64 -> D64 */
4093 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4095 case Iop_ShlD128:
4096 case Iop_ShrD128:
4097 case Iop_RoundD128toInt:
4098 /* I32(rm) x D128 -> D128 */
4099 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4101 case Iop_RoundF128toInt:
4102 /* I32(rm) x F128 -> F128 */
4103 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4105 case Iop_D64toI64S:
4106 case Iop_D64toI64U:
4107 case Iop_I64StoD64:
4108 case Iop_I64UtoD64:
4109 /* I32(rm) x I64/D64 -> D64/I64 */
4110 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4112 case Iop_F32toD32:
4113 case Iop_F64toD32:
4114 case Iop_F128toD32:
4115 case Iop_D32toF32:
4116 case Iop_D64toF32:
4117 case Iop_D128toF32:
4118 /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D32/F32 */
4119 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4121 case Iop_F32toD64:
4122 case Iop_F64toD64:
4123 case Iop_F128toD64:
4124 case Iop_D32toF64:
4125 case Iop_D64toF64:
4126 case Iop_D128toF64:
4127 /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D64/F64 */
4128 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4130 case Iop_F32toD128:
4131 case Iop_F64toD128:
4132 case Iop_F128toD128:
4133 case Iop_D32toF128:
4134 case Iop_D64toF128:
4135 case Iop_D128toF128:
4136 /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D128/F128 */
4137 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4139 case Iop_RoundF32toInt:
4140 case Iop_SqrtF32:
4141 case Iop_RecpExpF32:
4142 /* I32(rm) x I32/F32 -> I32/F32 */
4143 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4145 case Iop_SqrtF128:
4146 /* I32(rm) x F128 -> F128 */
4147 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4149 case Iop_I32StoF32:
4150 case Iop_I32UtoF32:
4151 case Iop_F32toI32S:
4152 case Iop_F32toI32U:
4153 /* First arg is I32 (rounding mode), second is F32/I32 (data). */
4154 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4156 case Iop_F64toF16:
4157 case Iop_F32toF16:
4158 /* First arg is I32 (rounding mode), second is F64/F32 (data). */
4159 return mkLazy2(mce, Ity_I16, vatom1, vatom2);
4161 case Iop_F128toI32S: /* IRRoundingMode(I32) x F128 -> signed I32 */
4162 case Iop_F128toI32U: /* IRRoundingMode(I32) x F128 -> unsigned I32 */
4163 case Iop_F128toF32: /* IRRoundingMode(I32) x F128 -> F32 */
4164 case Iop_D128toI32S: /* IRRoundingMode(I32) x D128 -> signed I32 */
4165 case Iop_D128toI32U: /* IRRoundingMode(I32) x D128 -> unsigned I32 */
4166 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4168 case Iop_F128toI128S: /* IRRoundingMode(I32) x F128 -> signed I128 */
4169 case Iop_RndF128: /* IRRoundingMode(I32) x F128 -> F128 */
4170 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4172 case Iop_F128toI64S: /* IRRoundingMode(I32) x F128 -> signed I64 */
4173 case Iop_F128toI64U: /* IRRoundingMode(I32) x F128 -> unsigned I64 */
4174 case Iop_F128toF64: /* IRRoundingMode(I32) x F128 -> F64 */
4175 case Iop_D128toD64: /* IRRoundingMode(I64) x D128 -> D64 */
4176 case Iop_D128toI64S: /* IRRoundingMode(I64) x D128 -> signed I64 */
4177 case Iop_D128toI64U: /* IRRoundingMode(I32) x D128 -> unsigned I64 */
4178 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4180 case Iop_F64HLtoF128:
4181 case Iop_D64HLtoD128:
4182 return assignNew('V', mce, Ity_I128,
4183 binop(Iop_64HLto128, vatom1, vatom2));
4185 case Iop_F64toI32U:
4186 case Iop_F64toI32S:
4187 case Iop_F64toF32:
4188 case Iop_I64UtoF32:
4189 case Iop_D64toI32U:
4190 case Iop_D64toI32S:
4191 /* First arg is I32 (rounding mode), second is F64/I64/D64 (data). */
4192 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4194 case Iop_D64toD32:
4195 /* First arg is I32 (rounding mode), second is D64 (data). */
4196 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4198 case Iop_F64toI16S:
4199 /* First arg is I32 (rounding mode), second is F64 (data). */
4200 return mkLazy2(mce, Ity_I16, vatom1, vatom2);
4202 case Iop_InsertExpD64:
4203 /* I64 x I64 -> D64 */
4204 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4206 case Iop_InsertExpD128:
4207 /* I64 x I128 -> D128 */
4208 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4210 case Iop_CmpF32:
4211 case Iop_CmpF64:
4212 case Iop_CmpF128:
4213 case Iop_CmpD64:
4214 case Iop_CmpD128:
4215 case Iop_CmpExpD64:
4216 case Iop_CmpExpD128:
4217 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4219 case Iop_MaxNumF32:
4220 case Iop_MinNumF32:
4221 /* F32 x F32 -> F32 */
4222 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4224 case Iop_MaxNumF64:
4225 case Iop_MinNumF64:
4226 /* F64 x F64 -> F64 */
4227 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4229 /* non-FP after here */
4231 case Iop_DivModU64to32:
4232 case Iop_DivModS64to32:
4233 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4235 case Iop_DivModU128to64:
4236 case Iop_DivModS128to64:
4237 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4239 case Iop_8HLto16:
4240 return assignNew('V', mce, Ity_I16, binop(op, vatom1, vatom2));
4241 case Iop_16HLto32:
4242 return assignNew('V', mce, Ity_I32, binop(op, vatom1, vatom2));
4243 case Iop_32HLto64:
4244 return assignNew('V', mce, Ity_I64, binop(op, vatom1, vatom2));
4246 case Iop_DivModU64to64:
4247 case Iop_DivModS64to64: {
4248 IRAtom* vTmp64 = mkLazy2(mce, Ity_I64, vatom1, vatom2);
4249 return assignNew('V', mce, Ity_I128,
4250 binop(Iop_64HLto128, vTmp64, vTmp64));
4253 case Iop_MullS64:
4254 case Iop_MullU64: {
4255 IRAtom* vLo64 = mkLeft64(mce, mkUifU64(mce, vatom1,vatom2));
4256 IRAtom* vHi64 = mkPCastTo(mce, Ity_I64, vLo64);
4257 return assignNew('V', mce, Ity_I128,
4258 binop(Iop_64HLto128, vHi64, vLo64));
4261 case Iop_DivModU32to32:
4262 case Iop_DivModS32to32: {
4263 IRAtom* vTmp32 = mkLazy2(mce, Ity_I32, vatom1, vatom2);
4264 return assignNew('V', mce, Ity_I64,
4265 binop(Iop_32HLto64, vTmp32, vTmp32));
4268 case Iop_MullS32:
4269 case Iop_MullU32: {
4270 IRAtom* vLo32 = mkLeft32(mce, mkUifU32(mce, vatom1,vatom2));
4271 IRAtom* vHi32 = mkPCastTo(mce, Ity_I32, vLo32);
4272 return assignNew('V', mce, Ity_I64,
4273 binop(Iop_32HLto64, vHi32, vLo32));
4276 case Iop_MullS16:
4277 case Iop_MullU16: {
4278 IRAtom* vLo16 = mkLeft16(mce, mkUifU16(mce, vatom1,vatom2));
4279 IRAtom* vHi16 = mkPCastTo(mce, Ity_I16, vLo16);
4280 return assignNew('V', mce, Ity_I32,
4281 binop(Iop_16HLto32, vHi16, vLo16));
4284 case Iop_MullS8:
4285 case Iop_MullU8: {
4286 IRAtom* vLo8 = mkLeft8(mce, mkUifU8(mce, vatom1,vatom2));
4287 IRAtom* vHi8 = mkPCastTo(mce, Ity_I8, vLo8);
4288 return assignNew('V', mce, Ity_I16, binop(Iop_8HLto16, vHi8, vLo8));
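/* Editor's sketch (illustrative only, not compiled): the MullU32/MullS32
   rule above in scalar terms.  mkLeft smears undefinedness from the lowest
   undefined bit upwards (x | -x), and the high half of the product is then
   wholly defined or wholly undefined.  Name invented. */
#if 0
#include <stdint.h>
static uint64_t example_mull32_vbits ( uint32_t v1, uint32_t v2 )
{
   uint32_t uifu = v1 | v2;                        /* UifU32       */
   uint32_t vLo  = uifu | (uint32_t)(0u - uifu);   /* mkLeft32     */
   uint32_t vHi  = (vLo != 0) ? 0xFFFFFFFFu : 0u;  /* PCast to I32 */
   return ((uint64_t)vHi << 32) | vLo;             /* Iop_32HLto64 */
}
#endif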
4291 case Iop_Sad8Ux4: /* maybe we could do better? ftm, do mkLazy2. */
4292 case Iop_DivS32:
4293 case Iop_DivU32:
4294 case Iop_DivU32E:
4295 case Iop_DivS32E:
4296 case Iop_QAdd32S: /* could probably do better */
4297 case Iop_QSub32S: /* could probably do better */
4298 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4300 case Iop_DivS64:
4301 case Iop_DivU64:
4302 case Iop_DivS64E:
4303 case Iop_DivU64E:
4304 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4306 case Iop_Add32:
4307 if (mce->dlbo.dl_Add32 == DLexpensive
4308 || (mce->dlbo.dl_Add32 == DLauto && hu == HuOth)) {
4309 return expensiveAddSub(mce,True,Ity_I32,
4310 vatom1,vatom2, atom1,atom2);
4311 } else {
4312 goto cheap_AddSub32;
4314 case Iop_Sub32:
4315 if (mce->dlbo.dl_Sub32 == DLexpensive
4316 || (mce->dlbo.dl_Sub32 == DLauto && hu == HuOth)) {
4317 return expensiveAddSub(mce,False,Ity_I32,
4318 vatom1,vatom2, atom1,atom2);
4319 } else {
4320 goto cheap_AddSub32;
4323 cheap_AddSub32:
4324 case Iop_Mul32:
4325 return mkLeft32(mce, mkUifU32(mce, vatom1,vatom2));
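/* Editor's sketch (illustrative only, not compiled): the "cheap" rule used
   above for Add32/Sub32/Mul32 -- an undefined input bit can, via carries,
   disturb its own and all higher result bits, which is what mkLeft (x | -x)
   expresses.  Name invented. */
#if 0
#include <stdint.h>
static uint32_t example_cheap_add32_vbits ( uint32_t v1, uint32_t v2 )
{
   uint32_t uifu = v1 | v2;               /* UifU32   */
   return uifu | (uint32_t)(0u - uifu);   /* mkLeft32 */
}
#endif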
4327 case Iop_CmpORD32S:
4328 case Iop_CmpORD32U:
4329 case Iop_CmpORD64S:
4330 case Iop_CmpORD64U:
4331 return doCmpORD(mce, op, vatom1,vatom2, atom1,atom2);
4333 case Iop_Add64:
4334 if (mce->dlbo.dl_Add64 == DLexpensive
4335 || (mce->dlbo.dl_Add64 == DLauto && hu == HuOth)) {
4336 return expensiveAddSub(mce,True,Ity_I64,
4337 vatom1,vatom2, atom1,atom2);
4338 } else {
4339 goto cheap_AddSub64;
4341 case Iop_Sub64:
4342 if (mce->dlbo.dl_Sub64 == DLexpensive
4343 || (mce->dlbo.dl_Sub64 == DLauto && hu == HuOth)) {
4344 return expensiveAddSub(mce,False,Ity_I64,
4345 vatom1,vatom2, atom1,atom2);
4346 } else {
4347 goto cheap_AddSub64;
4350 cheap_AddSub64:
4351 case Iop_Mul64:
4352 return mkLeft64(mce, mkUifU64(mce, vatom1,vatom2));
4354 case Iop_Mul16:
4355 case Iop_Add16:
4356 case Iop_Sub16:
4357 return mkLeft16(mce, mkUifU16(mce, vatom1,vatom2));
4359 case Iop_Mul8:
4360 case Iop_Sub8:
4361 case Iop_Add8:
4362 return mkLeft8(mce, mkUifU8(mce, vatom1,vatom2));
4364 ////---- CmpXX64
4365 case Iop_CmpEQ64: case Iop_CmpNE64:
4366 if (mce->dlbo.dl_CmpEQ64_CmpNE64 == DLexpensive)
4367 goto expensive_cmp64;
4368 else
4369 goto cheap_cmp64;
4371 expensive_cmp64:
4372 case Iop_ExpCmpNE64:
4373 return expensiveCmpEQorNE(mce,Ity_I64, vatom1,vatom2, atom1,atom2 );
4375 cheap_cmp64:
4376 case Iop_CmpLE64S: case Iop_CmpLE64U:
4377 case Iop_CmpLT64U: case Iop_CmpLT64S:
4378 return mkPCastTo(mce, Ity_I1, mkUifU64(mce, vatom1,vatom2));
4380 ////---- CmpXX32
4381 case Iop_CmpEQ32: case Iop_CmpNE32:
4382 if (mce->dlbo.dl_CmpEQ32_CmpNE32 == DLexpensive)
4383 goto expensive_cmp32;
4384 else
4385 goto cheap_cmp32;
4387 expensive_cmp32:
4388 case Iop_ExpCmpNE32:
4389 return expensiveCmpEQorNE(mce,Ity_I32, vatom1,vatom2, atom1,atom2 );
4391 cheap_cmp32:
4392 case Iop_CmpLE32S: case Iop_CmpLE32U:
4393 case Iop_CmpLT32U: case Iop_CmpLT32S:
4394 return mkPCastTo(mce, Ity_I1, mkUifU32(mce, vatom1,vatom2));
4396 ////---- CmpXX16
4397 case Iop_CmpEQ16: case Iop_CmpNE16:
4398 if (mce->dlbo.dl_CmpEQ16_CmpNE16 == DLexpensive)
4399 goto expensive_cmp16;
4400 else
4401 goto cheap_cmp16;
4403 expensive_cmp16:
4404 case Iop_ExpCmpNE16:
4405 return expensiveCmpEQorNE(mce,Ity_I16, vatom1,vatom2, atom1,atom2 );
4407 cheap_cmp16:
4408 return mkPCastTo(mce, Ity_I1, mkUifU16(mce, vatom1,vatom2));
4410 ////---- CmpXX8
4411 case Iop_CmpEQ8: case Iop_CmpNE8:
4412 if (mce->dlbo.dl_CmpEQ8_CmpNE8 == DLexpensive)
4413 goto expensive_cmp8;
4414 else
4415 goto cheap_cmp8;
4417 expensive_cmp8:
4418 return expensiveCmpEQorNE(mce,Ity_I8, vatom1,vatom2, atom1,atom2 );
4420 cheap_cmp8:
4421 return mkPCastTo(mce, Ity_I1, mkUifU8(mce, vatom1,vatom2));
4423 ////---- end CmpXX{64,32,16,8}
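/* Editor's sketch (illustrative only, not compiled): the "cheap" comparison
   rule used above -- the 1-bit result is undefined if any bit of either
   operand is undefined.  Name invented. */
#if 0
#include <stdint.h>
static int example_cheap_cmp_vbit ( uint64_t v1, uint64_t v2 )
{
   return ((v1 | v2) != 0) ? 1 : 0;   /* PCast-to-I1 of UifU */
}
#endif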
4425 case Iop_CasCmpEQ8: case Iop_CasCmpNE8:
4426 case Iop_CasCmpEQ16: case Iop_CasCmpNE16:
4427 case Iop_CasCmpEQ32: case Iop_CasCmpNE32:
4428 case Iop_CasCmpEQ64: case Iop_CasCmpNE64:
4429 /* Just say these all produce a defined result, regardless
4430 of their arguments. See COMMENT_ON_CasCmpEQ in this file. */
4431 return assignNew('V', mce, Ity_I1, definedOfType(Ity_I1));
4433 case Iop_Shl64: case Iop_Shr64: case Iop_Sar64:
4434 return scalarShift( mce, Ity_I64, op, vatom1,vatom2, atom1,atom2 );
4436 case Iop_Shl32: case Iop_Shr32: case Iop_Sar32:
4437 return scalarShift( mce, Ity_I32, op, vatom1,vatom2, atom1,atom2 );
4439 case Iop_Shl16: case Iop_Shr16: case Iop_Sar16:
4440 return scalarShift( mce, Ity_I16, op, vatom1,vatom2, atom1,atom2 );
4442 case Iop_Shl8: case Iop_Shr8: case Iop_Sar8:
4443 return scalarShift( mce, Ity_I8, op, vatom1,vatom2, atom1,atom2 );
4445 case Iop_AndV256:
4446 uifu = mkUifUV256; difd = mkDifDV256;
4447 and_or_ty = Ity_V256; improve = mkImproveANDV256; goto do_And_Or;
4448 case Iop_AndV128:
4449 uifu = mkUifUV128; difd = mkDifDV128;
4450 and_or_ty = Ity_V128; improve = mkImproveANDV128; goto do_And_Or;
4451 case Iop_And64:
4452 uifu = mkUifU64; difd = mkDifD64;
4453 and_or_ty = Ity_I64; improve = mkImproveAND64; goto do_And_Or;
4454 case Iop_And32:
4455 uifu = mkUifU32; difd = mkDifD32;
4456 and_or_ty = Ity_I32; improve = mkImproveAND32; goto do_And_Or;
4457 case Iop_And16:
4458 uifu = mkUifU16; difd = mkDifD16;
4459 and_or_ty = Ity_I16; improve = mkImproveAND16; goto do_And_Or;
4460 case Iop_And8:
4461 uifu = mkUifU8; difd = mkDifD8;
4462 and_or_ty = Ity_I8; improve = mkImproveAND8; goto do_And_Or;
4464 case Iop_OrV256:
4465 uifu = mkUifUV256; difd = mkDifDV256;
4466 and_or_ty = Ity_V256; improve = mkImproveORV256; goto do_And_Or;
4467 case Iop_OrV128:
4468 uifu = mkUifUV128; difd = mkDifDV128;
4469 and_or_ty = Ity_V128; improve = mkImproveORV128; goto do_And_Or;
4470 case Iop_Or64:
4471 uifu = mkUifU64; difd = mkDifD64;
4472 and_or_ty = Ity_I64; improve = mkImproveOR64; goto do_And_Or;
4473 case Iop_Or32:
4474 uifu = mkUifU32; difd = mkDifD32;
4475 and_or_ty = Ity_I32; improve = mkImproveOR32; goto do_And_Or;
4476 case Iop_Or16:
4477 uifu = mkUifU16; difd = mkDifD16;
4478 and_or_ty = Ity_I16; improve = mkImproveOR16; goto do_And_Or;
4479 case Iop_Or8:
4480 uifu = mkUifU8; difd = mkDifD8;
4481 and_or_ty = Ity_I8; improve = mkImproveOR8; goto do_And_Or;
4483 do_And_Or:
4484 return
4485 assignNew(
4486 'V', mce,
4487 and_or_ty,
4488 difd(mce, uifu(mce, vatom1, vatom2),
4489 difd(mce, improve(mce, atom1, vatom1),
4490 improve(mce, atom2, vatom2) ) ) );
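/* Editor's sketch (illustrative only, not compiled): the And case of the
   expression assembled above, in scalar terms.  A defined 0 in either
   operand forces the corresponding And result bit to be defined, which is
   what the 'improve' terms capture; DifD is a bitwise And of V bits.  Name
   invented. */
#if 0
#include <stdint.h>
static uint32_t example_and32_vbits ( uint32_t a1, uint32_t v1,
                                      uint32_t a2, uint32_t v2 )
{
   uint32_t uifu = v1 | v2;      /* worst case: undefined if either is   */
   uint32_t imp1 = a1 | v1;      /* 0 exactly where a1 is a defined 0    */
   uint32_t imp2 = a2 | v2;      /* 0 exactly where a2 is a defined 0    */
   return uifu & imp1 & imp2;    /* DifD(UifU, DifD(improve1, improve2)) */
}
#endif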
4492 case Iop_Xor8:
4493 return mkUifU8(mce, vatom1, vatom2);
4494 case Iop_Xor16:
4495 return mkUifU16(mce, vatom1, vatom2);
4496 case Iop_Xor32:
4497 return mkUifU32(mce, vatom1, vatom2);
4498 case Iop_Xor64:
4499 return mkUifU64(mce, vatom1, vatom2);
4500 case Iop_XorV128:
4501 return mkUifUV128(mce, vatom1, vatom2);
4502 case Iop_XorV256:
4503 return mkUifUV256(mce, vatom1, vatom2);
4505 /* V256-bit SIMD */
4507 case Iop_ShrN16x16:
4508 case Iop_ShrN32x8:
4509 case Iop_ShrN64x4:
4510 case Iop_SarN16x16:
4511 case Iop_SarN32x8:
4512 case Iop_ShlN16x16:
4513 case Iop_ShlN32x8:
4514 case Iop_ShlN64x4:
4515 /* Same scheme as with all other shifts. Note: 22 Oct 05:
4516 this is wrong now, scalar shifts are done properly lazily.
4517 Vector shifts should be fixed too. */
4518 complainIfUndefined(mce, atom2, NULL);
4519 return assignNew('V', mce, Ity_V256, binop(op, vatom1, atom2));
4521 case Iop_QSub8Ux32:
4522 case Iop_QSub8Sx32:
4523 case Iop_Sub8x32:
4524 case Iop_Min8Ux32:
4525 case Iop_Min8Sx32:
4526 case Iop_Max8Ux32:
4527 case Iop_Max8Sx32:
4528 case Iop_CmpGT8Sx32:
4529 case Iop_CmpEQ8x32:
4530 case Iop_Avg8Ux32:
4531 case Iop_QAdd8Ux32:
4532 case Iop_QAdd8Sx32:
4533 case Iop_Add8x32:
4534 return binary8Ix32(mce, vatom1, vatom2);
4536 case Iop_QSub16Ux16:
4537 case Iop_QSub16Sx16:
4538 case Iop_Sub16x16:
4539 case Iop_Mul16x16:
4540 case Iop_MulHi16Sx16:
4541 case Iop_MulHi16Ux16:
4542 case Iop_Min16Sx16:
4543 case Iop_Min16Ux16:
4544 case Iop_Max16Sx16:
4545 case Iop_Max16Ux16:
4546 case Iop_CmpGT16Sx16:
4547 case Iop_CmpEQ16x16:
4548 case Iop_Avg16Ux16:
4549 case Iop_QAdd16Ux16:
4550 case Iop_QAdd16Sx16:
4551 case Iop_Add16x16:
4552 return binary16Ix16(mce, vatom1, vatom2);
4554 case Iop_Sub32x8:
4555 case Iop_CmpGT32Sx8:
4556 case Iop_CmpEQ32x8:
4557 case Iop_Add32x8:
4558 case Iop_Max32Ux8:
4559 case Iop_Max32Sx8:
4560 case Iop_Min32Ux8:
4561 case Iop_Min32Sx8:
4562 case Iop_Mul32x8:
4563 return binary32Ix8(mce, vatom1, vatom2);
4565 case Iop_Sub64x4:
4566 case Iop_Add64x4:
4567 case Iop_CmpEQ64x4:
4568 case Iop_CmpGT64Sx4:
4569 return binary64Ix4(mce, vatom1, vatom2);
4571 /* Perm32x8: rearrange values in left arg using steering values
4572 from right arg. So rearrange the vbits in the same way but
4573 pessimise wrt steering values. */
4574 case Iop_Perm32x8:
4575 return mkUifUV256(
4576 mce,
4577 assignNew('V', mce, Ity_V256, binop(op, vatom1, atom2)),
4578 mkPCast32x8(mce, vatom2)
4581 /* Q-and-Qshift-by-vector of the form (V128, V128) -> V256.
4582 Handle the shifted results in the same way that other
4583 binary Q ops are handled, eg QSub: UifU the two args,
4584 then pessimise -- which is binaryNIxM. But for the upper
4585 V128, we require to generate just 1 bit which is the
4586 pessimised shift result, with 127 defined zeroes above it.
4588 Note that this is overly pessimistic in that in fact only the
4589 bottom 8 bits of each lane of the second arg determine the shift
4590 amount. Really we ought to ignore any undefinedness in the
4591 rest of the lanes of the second arg. */
4592 case Iop_QandSQsh64x2: case Iop_QandUQsh64x2:
4593 case Iop_QandSQRsh64x2: case Iop_QandUQRsh64x2:
4594 case Iop_QandSQsh32x4: case Iop_QandUQsh32x4:
4595 case Iop_QandSQRsh32x4: case Iop_QandUQRsh32x4:
4596 case Iop_QandSQsh16x8: case Iop_QandUQsh16x8:
4597 case Iop_QandSQRsh16x8: case Iop_QandUQRsh16x8:
4598 case Iop_QandSQsh8x16: case Iop_QandUQsh8x16:
4599 case Iop_QandSQRsh8x16: case Iop_QandUQRsh8x16:
4601 // The function to generate the pessimised shift result
4602 IRAtom* (*binaryNIxM)(MCEnv*,IRAtom*,IRAtom*) = NULL;
4603 switch (op) {
4604 case Iop_QandSQsh64x2:
4605 case Iop_QandUQsh64x2:
4606 case Iop_QandSQRsh64x2:
4607 case Iop_QandUQRsh64x2:
4608 binaryNIxM = binary64Ix2;
4609 break;
4610 case Iop_QandSQsh32x4:
4611 case Iop_QandUQsh32x4:
4612 case Iop_QandSQRsh32x4:
4613 case Iop_QandUQRsh32x4:
4614 binaryNIxM = binary32Ix4;
4615 break;
4616 case Iop_QandSQsh16x8:
4617 case Iop_QandUQsh16x8:
4618 case Iop_QandSQRsh16x8:
4619 case Iop_QandUQRsh16x8:
4620 binaryNIxM = binary16Ix8;
4621 break;
4622 case Iop_QandSQsh8x16:
4623 case Iop_QandUQsh8x16:
4624 case Iop_QandSQRsh8x16:
4625 case Iop_QandUQRsh8x16:
4626 binaryNIxM = binary8Ix16;
4627 break;
4628 default:
4629 tl_assert(0);
4631 tl_assert(binaryNIxM);
4632 // Pessimised shift result, shV[127:0]
4633 IRAtom* shV = binaryNIxM(mce, vatom1, vatom2);
4634 // Generates: Def--(127)--Def PCast-to-I1(shV)
4635 IRAtom* qV = mkPCastXXtoXXlsb(mce, shV, Ity_V128);
4636 // and assemble the result
4637 return assignNew('V', mce, Ity_V256,
4638 binop(Iop_V128HLtoV256, qV, shV));
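/* Editor's sketch (illustrative only, not compiled): how the V256 shadow is
   assembled just above, with each V128 half modelled as a pair of 64-bit
   words.  Type and function names are invented. */
#if 0
#include <stdint.h>
typedef struct { uint64_t hi, lo; } ExampleV128;
static void example_QandQsh_vbits ( ExampleV128 shV,        /* pessimised shift result */
                                    ExampleV128* upperOut,  /* upper V128 of result    */
                                    ExampleV128* lowerOut ) /* lower V128 of result    */
{
   *lowerOut = shV;                                        /* shifted lanes, pessimised */
   upperOut->hi = 0;
   upperOut->lo = ((shV.hi | shV.lo) != 0) ? 1ULL : 0ULL;  /* Def--(127)--Def lsb       */
}
#endif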
4641 default:
4642 ppIROp(op);
4643 VG_(tool_panic)("memcheck:expr2vbits_Binop");
4648 static
4649 IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
4651 /* For the widening operations {8,16,32}{U,S}to{16,32,64}, the
4652 selection of shadow operation implicitly duplicates the logic in
4653 do_shadow_LoadG and should be kept in sync (in the very unlikely
4654 event that the interpretation of such widening ops changes in
4655 future). See comment in do_shadow_LoadG. */
4656 IRAtom* vatom = expr2vbits( mce, atom, HuOth );
4657 tl_assert(isOriginalAtom(mce,atom));
4658 switch (op) {
4660 case Iop_Abs64Fx2:
4661 case Iop_Neg64Fx2:
4662 case Iop_RSqrtEst64Fx2:
4663 case Iop_RecipEst64Fx2:
4664 case Iop_Log2_64Fx2:
4665 return unary64Fx2(mce, vatom);
4667 case Iop_Sqrt64F0x2:
4668 return unary64F0x2(mce, vatom);
4670 case Iop_Sqrt32Fx8:
4671 case Iop_RSqrtEst32Fx8:
4672 case Iop_RecipEst32Fx8:
4673 return unary32Fx8(mce, vatom);
4675 case Iop_Sqrt64Fx4:
4676 return unary64Fx4(mce, vatom);
4678 case Iop_RecipEst32Fx4:
4679 case Iop_I32UtoFx4:
4680 case Iop_I32StoFx4:
4681 case Iop_QFtoI32Ux4_RZ:
4682 case Iop_QFtoI32Sx4_RZ:
4683 case Iop_RoundF32x4_RM:
4684 case Iop_RoundF32x4_RP:
4685 case Iop_RoundF32x4_RN:
4686 case Iop_RoundF32x4_RZ:
4687 case Iop_RecipEst32Ux4:
4688 case Iop_Abs32Fx4:
4689 case Iop_Neg32Fx4:
4690 case Iop_RSqrtEst32Fx4:
4691 case Iop_Log2_32Fx4:
4692 return unary32Fx4(mce, vatom);
4694 case Iop_I32UtoFx2:
4695 case Iop_I32StoFx2:
4696 case Iop_RecipEst32Fx2:
4697 case Iop_RecipEst32Ux2:
4698 case Iop_Abs32Fx2:
4699 case Iop_Neg32Fx2:
4700 case Iop_RSqrtEst32Fx2:
4701 return unary32Fx2(mce, vatom);
4703 case Iop_Sqrt32F0x4:
4704 case Iop_RSqrtEst32F0x4:
4705 case Iop_RecipEst32F0x4:
4706 return unary32F0x4(mce, vatom);
4708 case Iop_32UtoV128:
4709 case Iop_64UtoV128:
4710 case Iop_Dup8x16:
4711 case Iop_Dup16x8:
4712 case Iop_Dup32x4:
4713 case Iop_Reverse1sIn8_x16:
4714 case Iop_Reverse8sIn16_x8:
4715 case Iop_Reverse8sIn32_x4:
4716 case Iop_Reverse16sIn32_x4:
4717 case Iop_Reverse8sIn64_x2:
4718 case Iop_Reverse16sIn64_x2:
4719 case Iop_Reverse32sIn64_x2:
4720 case Iop_V256toV128_1: case Iop_V256toV128_0:
4721 case Iop_ZeroHI64ofV128:
4722 case Iop_ZeroHI96ofV128:
4723 case Iop_ZeroHI112ofV128:
4724 case Iop_ZeroHI120ofV128:
4725 return assignNew('V', mce, Ity_V128, unop(op, vatom));
4727 case Iop_F128HItoF64: /* F128 -> high half of F128 */
4728 case Iop_D128HItoD64: /* D128 -> high half of D128 */
4729 return assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, vatom));
4730 case Iop_F128LOtoF64: /* F128 -> low half of F128 */
4731 case Iop_D128LOtoD64: /* D128 -> low half of D128 */
4732 return assignNew('V', mce, Ity_I64, unop(Iop_128to64, vatom));
4734 case Iop_NegF128:
4735 case Iop_AbsF128:
4736 case Iop_RndF128:
4737 case Iop_TruncF128toI64S: /* F128 -> I64S */
4738 case Iop_TruncF128toI32S: /* F128 -> I32S (result stored in 64-bits) */
4739 case Iop_TruncF128toI64U: /* F128 -> I64U */
4740 case Iop_TruncF128toI32U: /* F128 -> I32U (result stored in 64-bits) */
4741 return mkPCastTo(mce, Ity_I128, vatom);
4743 case Iop_BCD128toI128S:
4744 case Iop_MulI128by10:
4745 case Iop_MulI128by10Carry:
4746 case Iop_F16toF64x2:
4747 case Iop_F64toF16x2:
4748 return vatom;
4750 case Iop_I32StoF128: /* signed I32 -> F128 */
4751 case Iop_I64StoF128: /* signed I64 -> F128 */
4752 case Iop_I32UtoF128: /* unsigned I32 -> F128 */
4753 case Iop_I64UtoF128: /* unsigned I64 -> F128 */
4754 case Iop_F32toF128: /* F32 -> F128 */
4755 case Iop_F64toF128: /* F64 -> F128 */
4756 case Iop_I32StoD128: /* signed I32 -> D128 */
4757 case Iop_I64StoD128: /* signed I64 -> D128 */
4758 case Iop_I32UtoD128: /* unsigned I32 -> D128 */
4759 case Iop_I64UtoD128: /* unsigned I64 -> D128 */
4760 return mkPCastTo(mce, Ity_I128, vatom);
4762 case Iop_F16toF64:
4763 case Iop_F32toF64:
4764 case Iop_I32StoF64:
4765 case Iop_I32UtoF64:
4766 case Iop_NegF64:
4767 case Iop_AbsF64:
4768 case Iop_RSqrtEst5GoodF64:
4769 case Iop_RoundF64toF64_NEAREST:
4770 case Iop_RoundF64toF64_NegINF:
4771 case Iop_RoundF64toF64_PosINF:
4772 case Iop_RoundF64toF64_ZERO:
4773 case Iop_Clz64:
4774 case Iop_D32toD64:
4775 case Iop_I32StoD64:
4776 case Iop_I32UtoD64:
4777 case Iop_ExtractExpD64: /* D64 -> I64 */
4778 case Iop_ExtractExpD128: /* D128 -> I64 */
4779 case Iop_ExtractSigD64: /* D64 -> I64 */
4780 case Iop_ExtractSigD128: /* D128 -> I64 */
4781 case Iop_DPBtoBCD:
4782 case Iop_BCDtoDPB:
4783 return mkPCastTo(mce, Ity_I64, vatom);
4785 case Iop_D64toD128:
4786 return mkPCastTo(mce, Ity_I128, vatom);
4788 case Iop_Clz32:
4789 case Iop_TruncF64asF32:
4790 case Iop_NegF32:
4791 case Iop_AbsF32:
4792 case Iop_F16toF32:
4793 return mkPCastTo(mce, Ity_I32, vatom);
4795 case Iop_Ctz32:
4796 case Iop_Ctz64:
4797 return expensiveCountTrailingZeroes(mce, op, atom, vatom);
4799 case Iop_1Uto64:
4800 case Iop_1Sto64:
4801 case Iop_8Uto64:
4802 case Iop_8Sto64:
4803 case Iop_16Uto64:
4804 case Iop_16Sto64:
4805 case Iop_32Sto64:
4806 case Iop_32Uto64:
4807 case Iop_V128to64:
4808 case Iop_V128HIto64:
4809 case Iop_128HIto64:
4810 case Iop_128to64:
4811 case Iop_Dup8x8:
4812 case Iop_Dup16x4:
4813 case Iop_Dup32x2:
4814 case Iop_Reverse8sIn16_x4:
4815 case Iop_Reverse8sIn32_x2:
4816 case Iop_Reverse16sIn32_x2:
4817 case Iop_Reverse8sIn64_x1:
4818 case Iop_Reverse16sIn64_x1:
4819 case Iop_Reverse32sIn64_x1:
4820 case Iop_V256to64_0: case Iop_V256to64_1:
4821 case Iop_V256to64_2: case Iop_V256to64_3:
4822 return assignNew('V', mce, Ity_I64, unop(op, vatom));
4824 case Iop_64to32:
4825 case Iop_64HIto32:
4826 case Iop_1Uto32:
4827 case Iop_1Sto32:
4828 case Iop_8Uto32:
4829 case Iop_16Uto32:
4830 case Iop_16Sto32:
4831 case Iop_8Sto32:
4832 case Iop_V128to32:
4833 return assignNew('V', mce, Ity_I32, unop(op, vatom));
4835 case Iop_8Sto16:
4836 case Iop_8Uto16:
4837 case Iop_32to16:
4838 case Iop_32HIto16:
4839 case Iop_64to16:
4840 case Iop_GetMSBs8x16:
4841 return assignNew('V', mce, Ity_I16, unop(op, vatom));
4843 case Iop_1Uto8:
4844 case Iop_1Sto8:
4845 case Iop_16to8:
4846 case Iop_16HIto8:
4847 case Iop_32to8:
4848 case Iop_64to8:
4849 case Iop_GetMSBs8x8:
4850 return assignNew('V', mce, Ity_I8, unop(op, vatom));
4852 case Iop_32to1:
4853 return assignNew('V', mce, Ity_I1, unop(Iop_32to1, vatom));
4855 case Iop_64to1:
4856 return assignNew('V', mce, Ity_I1, unop(Iop_64to1, vatom));
4858 case Iop_ReinterpF64asI64:
4859 case Iop_ReinterpI64asF64:
4860 case Iop_ReinterpI32asF32:
4861 case Iop_ReinterpF32asI32:
4862 case Iop_ReinterpI64asD64:
4863 case Iop_ReinterpD64asI64:
4864 case Iop_NotV256:
4865 case Iop_NotV128:
4866 case Iop_Not64:
4867 case Iop_Not32:
4868 case Iop_Not16:
4869 case Iop_Not8:
4870 case Iop_Not1:
4871 return vatom;
4873 case Iop_CmpNEZ8x8:
4874 case Iop_Cnt8x8:
4875 case Iop_Clz8x8:
4876 case Iop_Cls8x8:
4877 case Iop_Abs8x8:
4878 return mkPCast8x8(mce, vatom);
4880 case Iop_CmpNEZ8x16:
4881 case Iop_Cnt8x16:
4882 case Iop_Clz8x16:
4883 case Iop_Cls8x16:
4884 case Iop_Abs8x16:
4885 case Iop_Ctz8x16:
4886 return mkPCast8x16(mce, vatom);
4888 case Iop_CmpNEZ16x4:
4889 case Iop_Clz16x4:
4890 case Iop_Cls16x4:
4891 case Iop_Abs16x4:
4892 return mkPCast16x4(mce, vatom);
4894 case Iop_CmpNEZ16x8:
4895 case Iop_Clz16x8:
4896 case Iop_Cls16x8:
4897 case Iop_Abs16x8:
4898 case Iop_Ctz16x8:
4899 return mkPCast16x8(mce, vatom);
4901 case Iop_CmpNEZ32x2:
4902 case Iop_Clz32x2:
4903 case Iop_Cls32x2:
4904 case Iop_FtoI32Ux2_RZ:
4905 case Iop_FtoI32Sx2_RZ:
4906 case Iop_Abs32x2:
4907 return mkPCast32x2(mce, vatom);
4909 case Iop_CmpNEZ32x4:
4910 case Iop_Clz32x4:
4911 case Iop_Cls32x4:
4912 case Iop_FtoI32Ux4_RZ:
4913 case Iop_FtoI32Sx4_RZ:
4914 case Iop_Abs32x4:
4915 case Iop_RSqrtEst32Ux4:
4916 case Iop_Ctz32x4:
4917 return mkPCast32x4(mce, vatom);
4919 case Iop_CmpwNEZ32:
4920 return mkPCastTo(mce, Ity_I32, vatom);
4922 case Iop_CmpwNEZ64:
4923 return mkPCastTo(mce, Ity_I64, vatom);
4925 case Iop_CmpNEZ64x2:
4926 case Iop_CipherSV128:
4927 case Iop_Clz64x2:
4928 case Iop_Abs64x2:
4929 case Iop_Ctz64x2:
4930 return mkPCast64x2(mce, vatom);
4932 case Iop_PwBitMtxXpose64x2:
4933 return assignNew('V', mce, Ity_V128, unop(op, vatom));
4935 case Iop_NarrowUn16to8x8:
4936 case Iop_NarrowUn32to16x4:
4937 case Iop_NarrowUn64to32x2:
4938 case Iop_QNarrowUn16Sto8Sx8:
4939 case Iop_QNarrowUn16Sto8Ux8:
4940 case Iop_QNarrowUn16Uto8Ux8:
4941 case Iop_QNarrowUn32Sto16Sx4:
4942 case Iop_QNarrowUn32Sto16Ux4:
4943 case Iop_QNarrowUn32Uto16Ux4:
4944 case Iop_QNarrowUn64Sto32Sx2:
4945 case Iop_QNarrowUn64Sto32Ux2:
4946 case Iop_QNarrowUn64Uto32Ux2:
4947 case Iop_F32toF16x4:
4948 return vectorNarrowUnV128(mce, op, vatom);
4950 case Iop_Widen8Sto16x8:
4951 case Iop_Widen8Uto16x8:
4952 case Iop_Widen16Sto32x4:
4953 case Iop_Widen16Uto32x4:
4954 case Iop_Widen32Sto64x2:
4955 case Iop_Widen32Uto64x2:
4956 case Iop_F16toF32x4:
4957 return vectorWidenI64(mce, op, vatom);
4959 case Iop_PwAddL32Ux2:
4960 case Iop_PwAddL32Sx2:
4961 return mkPCastTo(mce, Ity_I64,
4962 assignNew('V', mce, Ity_I64, unop(op, mkPCast32x2(mce, vatom))));
4964 case Iop_PwAddL16Ux4:
4965 case Iop_PwAddL16Sx4:
4966 return mkPCast32x2(mce,
4967 assignNew('V', mce, Ity_I64, unop(op, mkPCast16x4(mce, vatom))));
4969 case Iop_PwAddL8Ux8:
4970 case Iop_PwAddL8Sx8:
4971 return mkPCast16x4(mce,
4972 assignNew('V', mce, Ity_I64, unop(op, mkPCast8x8(mce, vatom))));
4974 case Iop_PwAddL32Ux4:
4975 case Iop_PwAddL32Sx4:
4976 return mkPCast64x2(mce,
4977 assignNew('V', mce, Ity_V128, unop(op, mkPCast32x4(mce, vatom))));
4979 case Iop_PwAddL64Ux2:
4980 return mkPCast128x1(mce,
4981 assignNew('V', mce, Ity_V128, unop(op, mkPCast64x2(mce, vatom))));
4983 case Iop_PwAddL16Ux8:
4984 case Iop_PwAddL16Sx8:
4985 return mkPCast32x4(mce,
4986 assignNew('V', mce, Ity_V128, unop(op, mkPCast16x8(mce, vatom))));
4988 case Iop_PwAddL8Ux16:
4989 case Iop_PwAddL8Sx16:
4990 return mkPCast16x8(mce,
4991 assignNew('V', mce, Ity_V128, unop(op, mkPCast8x16(mce, vatom))));
4993 case Iop_I64UtoF32:
4994 default:
4995 ppIROp(op);
4996 VG_(tool_panic)("memcheck:expr2vbits_Unop");
5001 /* Worker function -- do not call directly. See comments on
5002 expr2vbits_Load for the meaning of |guard|.
5004 Generates IR to (1) perform a definedness test of |addr|, (2)
5005 perform a validity test of |addr|, and (3) return the Vbits for the
5006 location indicated by |addr|. All of this only happens when
5007 |guard| is NULL or |guard| evaluates to True at run time.
5009 If |guard| evaluates to False at run time, the returned value is
5010 the IR-mandated 0x55..55 value, and no checks nor shadow loads are
5011 performed.
5013 The definedness of |guard| itself is not checked. That is assumed
5014 to have been done before this point, by the caller. */
5015 static
5016 IRAtom* expr2vbits_Load_WRK ( MCEnv* mce,
5017 IREndness end, IRType ty,
5018 IRAtom* addr, UInt bias, IRAtom* guard )
5020 tl_assert(isOriginalAtom(mce,addr));
5021 tl_assert(end == Iend_LE || end == Iend_BE);
5023 /* First, emit a definedness test for the address. This also sets
5024 the address (shadow) to 'defined' following the test. */
5025 complainIfUndefined( mce, addr, guard );
5027 /* Now cook up a call to the relevant helper function, to read the
5028 data V bits from shadow memory. */
5029 ty = shadowTypeV(ty);
5031 void* helper = NULL;
5032 const HChar* hname = NULL;
5033 Bool ret_via_outparam = False;
5035 if (end == Iend_LE) {
5036 switch (ty) {
5037 case Ity_V256: helper = &MC_(helperc_LOADV256le);
5038 hname = "MC_(helperc_LOADV256le)";
5039 ret_via_outparam = True;
5040 break;
5041 case Ity_V128: helper = &MC_(helperc_LOADV128le);
5042 hname = "MC_(helperc_LOADV128le)";
5043 ret_via_outparam = True;
5044 break;
5045 case Ity_I64: helper = &MC_(helperc_LOADV64le);
5046 hname = "MC_(helperc_LOADV64le)";
5047 break;
5048 case Ity_I32: helper = &MC_(helperc_LOADV32le);
5049 hname = "MC_(helperc_LOADV32le)";
5050 break;
5051 case Ity_I16: helper = &MC_(helperc_LOADV16le);
5052 hname = "MC_(helperc_LOADV16le)";
5053 break;
5054 case Ity_I8: helper = &MC_(helperc_LOADV8);
5055 hname = "MC_(helperc_LOADV8)";
5056 break;
5057 default: ppIRType(ty);
5058 VG_(tool_panic)("memcheck:expr2vbits_Load_WRK(LE)");
5060 } else {
5061 switch (ty) {
5062 case Ity_V256: helper = &MC_(helperc_LOADV256be);
5063 hname = "MC_(helperc_LOADV256be)";
5064 ret_via_outparam = True;
5065 break;
5066 case Ity_V128: helper = &MC_(helperc_LOADV128be);
5067 hname = "MC_(helperc_LOADV128be)";
5068 ret_via_outparam = True;
5069 break;
5070 case Ity_I64: helper = &MC_(helperc_LOADV64be);
5071 hname = "MC_(helperc_LOADV64be)";
5072 break;
5073 case Ity_I32: helper = &MC_(helperc_LOADV32be);
5074 hname = "MC_(helperc_LOADV32be)";
5075 break;
5076 case Ity_I16: helper = &MC_(helperc_LOADV16be);
5077 hname = "MC_(helperc_LOADV16be)";
5078 break;
5079 case Ity_I8: helper = &MC_(helperc_LOADV8);
5080 hname = "MC_(helperc_LOADV8)";
5081 break;
5082 default: ppIRType(ty);
5083 VG_(tool_panic)("memcheck:expr2vbits_Load_WRK(BE)");
5087 tl_assert(helper);
5088 tl_assert(hname);
5090 /* Generate the actual address into addrAct. */
5091 IRAtom* addrAct;
5092 if (bias == 0) {
5093 addrAct = addr;
5094 } else {
5095 IROp mkAdd;
5096 IRAtom* eBias;
5097 IRType tyAddr = mce->hWordTy;
5098 tl_assert( tyAddr == Ity_I32 || tyAddr == Ity_I64 );
5099 mkAdd = tyAddr==Ity_I32 ? Iop_Add32 : Iop_Add64;
5100 eBias = tyAddr==Ity_I32 ? mkU32(bias) : mkU64(bias);
5101 addrAct = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBias) );
5104 /* We need to have a place to park the V bits we're just about to
5105 read. */
5106 IRTemp datavbits = newTemp(mce, ty, VSh);
5108 /* Here's the call. */
5109 IRDirty* di;
5110 if (ret_via_outparam) {
5111 di = unsafeIRDirty_1_N( datavbits,
5112 2/*regparms*/,
5113 hname, VG_(fnptr_to_fnentry)( helper ),
5114 mkIRExprVec_2( IRExpr_VECRET(), addrAct ) );
5115 } else {
5116 di = unsafeIRDirty_1_N( datavbits,
5117 1/*regparms*/,
5118 hname, VG_(fnptr_to_fnentry)( helper ),
5119 mkIRExprVec_1( addrAct ) );
5122 setHelperAnns( mce, di );
5123 if (guard) {
5124 di->guard = guard;
5125 /* Ideally the didn't-happen return value here would be all-ones
5126 (all-undefined), so it'd be obvious if it got used
5127 inadvertently. We can get by with the IR-mandated default
5128 value (0b01 repeating, 0x55 etc) as that'll still look pretty
5129 undefined if it ever leaks out. */
5131 stmt( 'V', mce, IRStmt_Dirty(di) );
5133 return mkexpr(datavbits);
5137 /* Generate IR to do a shadow load. The helper is expected to check
5138 the validity of the address and return the V bits for that address.
5139 This can optionally be controlled by a guard, which is assumed to
5140 be True if NULL. In the case where the guard is False at runtime,
5141 the helper will return the didn't-do-the-call value of 0x55..55.
5142 Since that means "completely undefined result", the caller of
5143 this function will need to fix up the result somehow in that
5144 case.
5146 Caller of this function is also expected to have checked the
5147 definedness of |guard| before this point.
5149 static
5150 IRAtom* expr2vbits_Load ( MCEnv* mce,
5151 IREndness end, IRType ty,
5152 IRAtom* addr, UInt bias,
5153 IRAtom* guard )
5155 tl_assert(end == Iend_LE || end == Iend_BE);
5156 switch (shadowTypeV(ty)) {
5157 case Ity_I8:
5158 case Ity_I16:
5159 case Ity_I32:
5160 case Ity_I64:
5161 case Ity_V128:
5162 case Ity_V256:
5163 return expr2vbits_Load_WRK(mce, end, ty, addr, bias, guard);
5164 default:
5165 VG_(tool_panic)("expr2vbits_Load");
5170 /* The most general handler for guarded loads. Assumes the
5171 definedness of GUARD has already been checked by the caller. A
5172 GUARD of NULL is assumed to mean "always True". Generates code to
5173 check the definedness and validity of ADDR.
5175 Generate IR to do a shadow load from ADDR and return the V bits.
5176 The loaded type is TY. The loaded data is then (shadow) widened by
5177 using VWIDEN, which can be Iop_INVALID to denote a no-op. If GUARD
5178 evaluates to False at run time then the returned Vbits are simply
5179 VALT instead. Note therefore that the argument type of VWIDEN must
5180 be TY and the result type of VWIDEN must equal the type of VALT.
5182 static
5183 IRAtom* expr2vbits_Load_guarded_General ( MCEnv* mce,
5184 IREndness end, IRType ty,
5185 IRAtom* addr, UInt bias,
5186 IRAtom* guard,
5187 IROp vwiden, IRAtom* valt )
5189 /* Sanity check the conversion operation, and also set TYWIDE. */
5190 IRType tyWide = Ity_INVALID;
5191 switch (vwiden) {
5192 case Iop_INVALID:
5193 tyWide = ty;
5194 break;
5195 case Iop_16Uto32: case Iop_16Sto32: case Iop_8Uto32: case Iop_8Sto32:
5196 tyWide = Ity_I32;
5197 break;
5198 default:
5199 VG_(tool_panic)("memcheck:expr2vbits_Load_guarded_General");
5202 /* If the guard evaluates to True, this will hold the loaded V bits
5203 at TY. If the guard evaluates to False, this will be the IR-mandated
5204 default value (0x55..55, hence mostly undefined), in which case we
5205 will have to replace it using an ITE below. */
5206 IRAtom* iftrue1
5207 = assignNew('V', mce, ty,
5208 expr2vbits_Load(mce, end, ty, addr, bias, guard));
5209 /* Now (shadow-) widen the loaded V bits to the desired width. In
5210 the guard-is-False case, the allowable widening operators will
5211 in the worst case (unsigned widening) at least leave the
5212 pre-widened part as being marked all-undefined, and in the best
5213 case (signed widening) mark the whole widened result as
5214 undefined. Anyway, it doesn't matter really, since in this case
5215 we will replace said value with the default value |valt| using an
5216 ITE. */
5217 IRAtom* iftrue2
5218 = vwiden == Iop_INVALID
5219 ? iftrue1
5220 : assignNew('V', mce, tyWide, unop(vwiden, iftrue1));
5221 /* These are the V bits we will return if the load doesn't take
5222 place. */
5223 IRAtom* iffalse
5224 = valt;
5225 /* Prepare the cond for the ITE. Convert a NULL cond into
5226 something that iropt knows how to fold out later. */
5227 IRAtom* cond
5228 = guard == NULL ? mkU1(1) : guard;
5229 /* And assemble the final result. */
5230 return assignNew('V', mce, tyWide, IRExpr_ITE(cond, iftrue2, iffalse));
5234 /* A simpler handler for guarded loads, in which there is no
5235 conversion operation, and the default V bit return (when the guard
5236 evaluates to False at runtime) is "all defined". If there is no
5237 guard expression or the guard is always TRUE this function behaves
5238 like expr2vbits_Load. It is assumed that definedness of GUARD has
5239 already been checked at the call site. */
5240 static
5241 IRAtom* expr2vbits_Load_guarded_Simple ( MCEnv* mce,
5242 IREndness end, IRType ty,
5243 IRAtom* addr, UInt bias,
5244 IRAtom *guard )
5246 return expr2vbits_Load_guarded_General(
5247 mce, end, ty, addr, bias, guard, Iop_INVALID, definedOfType(ty)
5252 static
5253 IRAtom* expr2vbits_ITE ( MCEnv* mce,
5254 IRAtom* cond, IRAtom* iftrue, IRAtom* iffalse )
5256 IRAtom *vbitsC, *vbits0, *vbits1;
5257 IRType ty;
5258 /* Given ITE(cond, iftrue, iffalse), generate
5259 ITE(cond, iftrue#, iffalse#) `UifU` PCast(cond#)
5260 That is, steer the V bits like the originals, but trash the
5261 result if the steering value is undefined. This gives
5262 lazy propagation. */
5263 tl_assert(isOriginalAtom(mce, cond));
5264 tl_assert(isOriginalAtom(mce, iftrue));
5265 tl_assert(isOriginalAtom(mce, iffalse));
5267 vbitsC = expr2vbits(mce, cond, HuOth); // could we use HuPCa here?
5268 vbits1 = expr2vbits(mce, iftrue, HuOth);
5269 vbits0 = expr2vbits(mce, iffalse, HuOth);
5270 ty = typeOfIRExpr(mce->sb->tyenv, vbits0);
5272 return
5273 mkUifU(mce, ty, assignNew('V', mce, ty,
5274 IRExpr_ITE(cond, vbits1, vbits0)),
5275 mkPCastTo(mce, ty, vbitsC) );
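/* Editor's sketch (illustrative only, not compiled): the ITE rule above in
   scalar terms -- steer the shadow values like the originals, then trash the
   whole result if the condition itself is undefined.  Name invented. */
#if 0
#include <stdint.h>
static uint32_t example_ite_vbits ( int cond, uint32_t vcond,
                                    uint32_t vtrue, uint32_t vfalse )
{
   uint32_t steered = cond ? vtrue : vfalse;           /* ITE on the shadows */
   uint32_t pcast   = (vcond != 0) ? 0xFFFFFFFFu : 0u; /* PCast(cond#) to ty */
   return steered | pcast;                             /* UifU               */
}
#endif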
5278 /* --------- This is the main expression-handling function. --------- */
5280 static
5281 IRExpr* expr2vbits ( MCEnv* mce, IRExpr* e,
5282 HowUsed hu/*use HuOth if unknown*/ )
5284 switch (e->tag) {
5286 case Iex_Get:
5287 return shadow_GET( mce, e->Iex.Get.offset, e->Iex.Get.ty );
5289 case Iex_GetI:
5290 return shadow_GETI( mce, e->Iex.GetI.descr,
5291 e->Iex.GetI.ix, e->Iex.GetI.bias );
5293 case Iex_RdTmp:
5294 return IRExpr_RdTmp( findShadowTmpV(mce, e->Iex.RdTmp.tmp) );
5296 case Iex_Const:
5297 return definedOfType(shadowTypeV(typeOfIRExpr(mce->sb->tyenv, e)));
5299 case Iex_Qop:
5300 return expr2vbits_Qop(
5301 mce,
5302 e->Iex.Qop.details->op,
5303 e->Iex.Qop.details->arg1, e->Iex.Qop.details->arg2,
5304 e->Iex.Qop.details->arg3, e->Iex.Qop.details->arg4
5307 case Iex_Triop:
5308 return expr2vbits_Triop(
5309 mce,
5310 e->Iex.Triop.details->op,
5311 e->Iex.Triop.details->arg1, e->Iex.Triop.details->arg2,
5312 e->Iex.Triop.details->arg3
5315 case Iex_Binop:
5316 return expr2vbits_Binop(
5317 mce,
5318 e->Iex.Binop.op,
5319 e->Iex.Binop.arg1, e->Iex.Binop.arg2,
5323 case Iex_Unop:
5324 return expr2vbits_Unop( mce, e->Iex.Unop.op, e->Iex.Unop.arg );
5326 case Iex_Load:
5327 return expr2vbits_Load( mce, e->Iex.Load.end,
5328 e->Iex.Load.ty,
5329 e->Iex.Load.addr, 0/*addr bias*/,
5330 NULL/* guard == "always True"*/ );
5332 case Iex_CCall:
5333 return mkLazyN( mce, e->Iex.CCall.args,
5334 e->Iex.CCall.retty,
5335 e->Iex.CCall.cee );
5337 case Iex_ITE:
5338 return expr2vbits_ITE( mce, e->Iex.ITE.cond, e->Iex.ITE.iftrue,
5339 e->Iex.ITE.iffalse);
5341 default:
5342 VG_(printf)("\n");
5343 ppIRExpr(e);
5344 VG_(printf)("\n");
5345 VG_(tool_panic)("memcheck: expr2vbits");
5350 /*------------------------------------------------------------*/
5351 /*--- Generate shadow stmts from all kinds of IRStmts. ---*/
5352 /*------------------------------------------------------------*/
5354 /* Widen a value to the host word size. */
5356 static
5357 IRExpr* zwidenToHostWord ( MCEnv* mce, IRAtom* vatom )
5359 IRType ty, tyH;
5361 /* vatom is vbits-value and as such can only have a shadow type. */
5362 tl_assert(isShadowAtom(mce,vatom));
5364 ty = typeOfIRExpr(mce->sb->tyenv, vatom);
5365 tyH = mce->hWordTy;
5367 if (tyH == Ity_I32) {
5368 switch (ty) {
5369 case Ity_I32:
5370 return vatom;
5371 case Ity_I16:
5372 return assignNew('V', mce, tyH, unop(Iop_16Uto32, vatom));
5373 case Ity_I8:
5374 return assignNew('V', mce, tyH, unop(Iop_8Uto32, vatom));
5375 default:
5376 goto unhandled;
5378 } else
5379 if (tyH == Ity_I64) {
5380 switch (ty) {
5381 case Ity_I32:
5382 return assignNew('V', mce, tyH, unop(Iop_32Uto64, vatom));
5383 case Ity_I16:
5384 return assignNew('V', mce, tyH, unop(Iop_32Uto64,
5385 assignNew('V', mce, Ity_I32, unop(Iop_16Uto32, vatom))));
5386 case Ity_I8:
5387 return assignNew('V', mce, tyH, unop(Iop_32Uto64,
5388 assignNew('V', mce, Ity_I32, unop(Iop_8Uto32, vatom))));
5389 default:
5390 goto unhandled;
5392 } else {
5393 goto unhandled;
5395 unhandled:
5396 VG_(printf)("\nty = "); ppIRType(ty); VG_(printf)("\n");
5397 VG_(tool_panic)("zwidenToHostWord");
5401 /* Generate a shadow store. |addr| is always the original address
5402 atom. You can pass in either originals or V-bits for the data
5403 atom, but obviously not both. This function generates a check for
5404 the definedness and (indirectly) the validity of |addr|, but only
5405 when |guard| evaluates to True at run time (or is NULL).
5407 |guard| :: Ity_I1 controls whether the store really happens; NULL
5408 means it unconditionally does. Note that |guard| itself is not
5409 checked for definedness; the caller of this function must do that
5410 if necessary.
5412 static
5413 void do_shadow_Store ( MCEnv* mce,
5414 IREndness end,
5415 IRAtom* addr, UInt bias,
5416 IRAtom* data, IRAtom* vdata,
5417 IRAtom* guard )
5419 IROp mkAdd;
5420 IRType ty, tyAddr;
5421 void* helper = NULL;
5422 const HChar* hname = NULL;
5423 IRConst* c;
5425 tyAddr = mce->hWordTy;
5426 mkAdd = tyAddr==Ity_I32 ? Iop_Add32 : Iop_Add64;
5427 tl_assert( tyAddr == Ity_I32 || tyAddr == Ity_I64 );
5428 tl_assert( end == Iend_LE || end == Iend_BE );
5430 if (data) {
5431 tl_assert(!vdata);
5432 tl_assert(isOriginalAtom(mce, data));
5433 tl_assert(bias == 0);
5434 vdata = expr2vbits( mce, data, HuOth );
5435 } else {
5436 tl_assert(vdata);
5439 tl_assert(isOriginalAtom(mce,addr));
5440 tl_assert(isShadowAtom(mce,vdata));
5442 if (guard) {
5443 tl_assert(isOriginalAtom(mce, guard));
5444 tl_assert(typeOfIRExpr(mce->sb->tyenv, guard) == Ity_I1);
5447 ty = typeOfIRExpr(mce->sb->tyenv, vdata);
5449 // If we're not doing undefined value checking, pretend that this value
5450 // is "all valid". That lets Vex's optimiser remove some of the V bit
5451 // shadow computation ops that precede it.
5452 if (MC_(clo_mc_level) == 1) {
5453 switch (ty) {
5454 case Ity_V256: // V256 weirdness -- used four times
5455 c = IRConst_V256(V_BITS32_DEFINED); break;
5456 case Ity_V128: // V128 weirdness -- used twice
5457 c = IRConst_V128(V_BITS16_DEFINED); break;
5458 case Ity_I64: c = IRConst_U64 (V_BITS64_DEFINED); break;
5459 case Ity_I32: c = IRConst_U32 (V_BITS32_DEFINED); break;
5460 case Ity_I16: c = IRConst_U16 (V_BITS16_DEFINED); break;
5461 case Ity_I8: c = IRConst_U8 (V_BITS8_DEFINED); break;
5462 default: VG_(tool_panic)("memcheck:do_shadow_Store(LE)");
5464 vdata = IRExpr_Const( c );
5467 /* First, emit a definedness test for the address. This also sets
5468 the address (shadow) to 'defined' following the test. Both of
5469 those actions are gated on |guard|. */
5470 complainIfUndefined( mce, addr, guard );
5472 /* Now decide which helper function to call to write the data V
5473 bits into shadow memory. */
5474 if (end == Iend_LE) {
5475 switch (ty) {
5476 case Ity_V256: /* we'll use the helper four times */
5477 case Ity_V128: /* we'll use the helper twice */
5478 case Ity_I64: helper = &MC_(helperc_STOREV64le);
5479 hname = "MC_(helperc_STOREV64le)";
5480 break;
5481 case Ity_I32: helper = &MC_(helperc_STOREV32le);
5482 hname = "MC_(helperc_STOREV32le)";
5483 break;
5484 case Ity_I16: helper = &MC_(helperc_STOREV16le);
5485 hname = "MC_(helperc_STOREV16le)";
5486 break;
5487 case Ity_I8: helper = &MC_(helperc_STOREV8);
5488 hname = "MC_(helperc_STOREV8)";
5489 break;
5490 default: VG_(tool_panic)("memcheck:do_shadow_Store(LE)");
5492 } else {
5493 switch (ty) {
5494 case Ity_V128: /* we'll use the helper twice */
5495 case Ity_I64: helper = &MC_(helperc_STOREV64be);
5496 hname = "MC_(helperc_STOREV64be)";
5497 break;
5498 case Ity_I32: helper = &MC_(helperc_STOREV32be);
5499 hname = "MC_(helperc_STOREV32be)";
5500 break;
5501 case Ity_I16: helper = &MC_(helperc_STOREV16be);
5502 hname = "MC_(helperc_STOREV16be)";
5503 break;
5504 case Ity_I8: helper = &MC_(helperc_STOREV8);
5505 hname = "MC_(helperc_STOREV8)";
5506 break;
5507 /* Note, no V256 case here, because no big-endian target that
5508 we support has 256-bit vectors. */
5509 default: VG_(tool_panic)("memcheck:do_shadow_Store(BE)");
5513 if (UNLIKELY(ty == Ity_V256)) {
5515 /* V256-bit case -- phrased in terms of 64 bit units (Qs), with
5516 Q3 being the most significant lane. */
5517 /* These are the offsets of the Qs in memory. */
5518 Int offQ0, offQ1, offQ2, offQ3;
5520 /* Various bits for constructing the 4 lane helper calls */
5521 IRDirty *diQ0, *diQ1, *diQ2, *diQ3;
5522 IRAtom *addrQ0, *addrQ1, *addrQ2, *addrQ3;
5523 IRAtom *vdataQ0, *vdataQ1, *vdataQ2, *vdataQ3;
5524 IRAtom *eBiasQ0, *eBiasQ1, *eBiasQ2, *eBiasQ3;
5526 if (end == Iend_LE) {
5527 offQ0 = 0; offQ1 = 8; offQ2 = 16; offQ3 = 24;
5528 } else {
5529 offQ3 = 0; offQ2 = 8; offQ1 = 16; offQ0 = 24;
5532 eBiasQ0 = tyAddr==Ity_I32 ? mkU32(bias+offQ0) : mkU64(bias+offQ0);
5533 addrQ0 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ0) );
5534 vdataQ0 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_0, vdata));
5535 diQ0 = unsafeIRDirty_0_N(
5536 1/*regparms*/,
5537 hname, VG_(fnptr_to_fnentry)( helper ),
5538 mkIRExprVec_2( addrQ0, vdataQ0 )
5541 eBiasQ1 = tyAddr==Ity_I32 ? mkU32(bias+offQ1) : mkU64(bias+offQ1);
5542 addrQ1 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ1) );
5543 vdataQ1 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_1, vdata));
5544 diQ1 = unsafeIRDirty_0_N(
5545 1/*regparms*/,
5546 hname, VG_(fnptr_to_fnentry)( helper ),
5547 mkIRExprVec_2( addrQ1, vdataQ1 )
5550 eBiasQ2 = tyAddr==Ity_I32 ? mkU32(bias+offQ2) : mkU64(bias+offQ2);
5551 addrQ2 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ2) );
5552 vdataQ2 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_2, vdata));
5553 diQ2 = unsafeIRDirty_0_N(
5554 1/*regparms*/,
5555 hname, VG_(fnptr_to_fnentry)( helper ),
5556 mkIRExprVec_2( addrQ2, vdataQ2 )
5559 eBiasQ3 = tyAddr==Ity_I32 ? mkU32(bias+offQ3) : mkU64(bias+offQ3);
5560 addrQ3 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ3) );
5561 vdataQ3 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_3, vdata));
5562 diQ3 = unsafeIRDirty_0_N(
5563 1/*regparms*/,
5564 hname, VG_(fnptr_to_fnentry)( helper ),
5565 mkIRExprVec_2( addrQ3, vdataQ3 )
5568 if (guard)
5569 diQ0->guard = diQ1->guard = diQ2->guard = diQ3->guard = guard;
5571 setHelperAnns( mce, diQ0 );
5572 setHelperAnns( mce, diQ1 );
5573 setHelperAnns( mce, diQ2 );
5574 setHelperAnns( mce, diQ3 );
5575 stmt( 'V', mce, IRStmt_Dirty(diQ0) );
5576 stmt( 'V', mce, IRStmt_Dirty(diQ1) );
5577 stmt( 'V', mce, IRStmt_Dirty(diQ2) );
5578 stmt( 'V', mce, IRStmt_Dirty(diQ3) );
5581 else if (UNLIKELY(ty == Ity_V128)) {
5583 /* V128-bit case */
5584 /* See comment in next clause re 64-bit regparms */
5585 /* also, need to be careful about endianness */
5587 Int offLo64, offHi64;
5588 IRDirty *diLo64, *diHi64;
5589 IRAtom *addrLo64, *addrHi64;
5590 IRAtom *vdataLo64, *vdataHi64;
5591 IRAtom *eBiasLo64, *eBiasHi64;
5593 if (end == Iend_LE) {
5594 offLo64 = 0;
5595 offHi64 = 8;
5596 } else {
5597 offLo64 = 8;
5598 offHi64 = 0;
5601 eBiasLo64 = tyAddr==Ity_I32 ? mkU32(bias+offLo64) : mkU64(bias+offLo64);
5602 addrLo64 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasLo64) );
5603 vdataLo64 = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vdata));
5604 diLo64 = unsafeIRDirty_0_N(
5605 1/*regparms*/,
5606 hname, VG_(fnptr_to_fnentry)( helper ),
5607 mkIRExprVec_2( addrLo64, vdataLo64 )
5609 eBiasHi64 = tyAddr==Ity_I32 ? mkU32(bias+offHi64) : mkU64(bias+offHi64);
5610 addrHi64 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasHi64) );
5611 vdataHi64 = assignNew('V', mce, Ity_I64, unop(Iop_V128HIto64, vdata));
5612 diHi64 = unsafeIRDirty_0_N(
5613 1/*regparms*/,
5614 hname, VG_(fnptr_to_fnentry)( helper ),
5615 mkIRExprVec_2( addrHi64, vdataHi64 )
5617 if (guard) diLo64->guard = guard;
5618 if (guard) diHi64->guard = guard;
5619 setHelperAnns( mce, diLo64 );
5620 setHelperAnns( mce, diHi64 );
5621 stmt( 'V', mce, IRStmt_Dirty(diLo64) );
5622 stmt( 'V', mce, IRStmt_Dirty(diHi64) );
5624 } else {
5626 IRDirty *di;
5627 IRAtom *addrAct;
5629 /* 8/16/32/64-bit cases */
5630 /* Generate the actual address into addrAct. */
5631 if (bias == 0) {
5632 addrAct = addr;
5633 } else {
5634 IRAtom* eBias = tyAddr==Ity_I32 ? mkU32(bias) : mkU64(bias);
5635 addrAct = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBias));
5638 if (ty == Ity_I64) {
5639 /* We can't do this with regparm 2 on 32-bit platforms, since
5640 the back ends aren't clever enough to handle 64-bit
5641 regparm args. Therefore be different. */
5642 di = unsafeIRDirty_0_N(
5643 1/*regparms*/,
5644 hname, VG_(fnptr_to_fnentry)( helper ),
5645 mkIRExprVec_2( addrAct, vdata )
5647 } else {
5648 di = unsafeIRDirty_0_N(
5649 2/*regparms*/,
5650 hname, VG_(fnptr_to_fnentry)( helper ),
5651 mkIRExprVec_2( addrAct,
5652 zwidenToHostWord( mce, vdata ))
5655 if (guard) di->guard = guard;
5656 setHelperAnns( mce, di );
5657 stmt( 'V', mce, IRStmt_Dirty(di) );
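/* Rough sketch of the net effect in a simple case (illustrative
   only): for a 32-bit little-endian store with bias 0 and no guard,
   the code above reduces to a single helper call along the lines of

      di = unsafeIRDirty_0_N(
              2/*regparms*/, "MC_(helperc_STOREV32le)",
              VG_(fnptr_to_fnentry)( &MC_(helperc_STOREV32le) ),
              mkIRExprVec_2( addr, zwidenToHostWord(mce, vdata) ) );
      setHelperAnns( mce, di );
      stmt( 'V', mce, IRStmt_Dirty(di) );

   that is, one call which writes the V bits for the four addressed
   bytes into shadow memory. */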
5663 /* Do lazy pessimistic propagation through a dirty helper call, by
5664 looking at the annotations on it. This is the most complex part of
5665 Memcheck. */
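/* Outline of the scheme used below (added commentary, not original):
   every input to the dirty call -- its guard, its arguments, any
   guest state it reads and any memory it reads -- is PCast-ed down to
   a single Ity_I32 definedness summary and UifU-ed into |curr|, e.g.

      here = mkPCastTo( mce, Ity_I32, expr2vbits(mce, arg, HuOth) );
      curr = mkUifU32 ( mce, here, curr );

   That one pessimistic summary is then PCast-ed back up to the width
   of every output: the destination temporary, any guest state written
   and any memory written.  Hence if any input is even partially
   undefined, all outputs are treated as wholly undefined. */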
5667 static IRType szToITy ( Int n )
5669 switch (n) {
5670 case 1: return Ity_I8;
5671 case 2: return Ity_I16;
5672 case 4: return Ity_I32;
5673 case 8: return Ity_I64;
5674 default: VG_(tool_panic)("szToITy(memcheck)");
5678 static
5679 void do_shadow_Dirty ( MCEnv* mce, IRDirty* d )
5681 Int i, k, n, toDo, gSz, gOff;
5682 IRAtom *src, *here, *curr;
5683 IRType tySrc, tyDst;
5684 IRTemp dst;
5685 IREndness end;
5687 /* What's the native endianness? We need to know this. */
5688 # if defined(VG_BIGENDIAN)
5689 end = Iend_BE;
5690 # elif defined(VG_LITTLEENDIAN)
5691 end = Iend_LE;
5692 # else
5693 # error "Unknown endianness"
5694 # endif
5696 /* First check the guard. */
5697 complainIfUndefined(mce, d->guard, NULL);
5699 /* Now round up all inputs and PCast over them. */
5700 curr = definedOfType(Ity_I32);
5702 /* Inputs: unmasked args
5703 Note: arguments are evaluated REGARDLESS of the guard expression */
5704 for (i = 0; d->args[i]; i++) {
5705 IRAtom* arg = d->args[i];
5706 if ( (d->cee->mcx_mask & (1<<i))
5707 || UNLIKELY(is_IRExpr_VECRET_or_GSPTR(arg)) ) {
5708 /* ignore this arg */
5709 } else {
5710 here = mkPCastTo( mce, Ity_I32, expr2vbits(mce, arg, HuOth) );
5711 curr = mkUifU32(mce, here, curr);
5715 /* Inputs: guest state that we read. */
5716 for (i = 0; i < d->nFxState; i++) {
5717 tl_assert(d->fxState[i].fx != Ifx_None);
5718 if (d->fxState[i].fx == Ifx_Write)
5719 continue;
5721 /* Enumerate the described state segments */
5722 for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
5723 gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
5724 gSz = d->fxState[i].size;
5726 /* Ignore any sections marked as 'always defined'. */
5727 if (isAlwaysDefd(mce, gOff, gSz)) {
5728 if (0)
5729 VG_(printf)("memcheck: Dirty gst: ignored off %d, sz %d\n",
5730 gOff, gSz);
5731 continue;
5734 /* This state element is read or modified. So we need to
5735 consider it. If larger than 8 bytes, deal with it in
5736 8-byte chunks. */
5737 while (True) {
5738 tl_assert(gSz >= 0);
5739 if (gSz == 0) break;
5740 n = gSz <= 8 ? gSz : 8;
5741 /* update 'curr' with UifU of the state slice
5742 gOff .. gOff+n-1 */
5743 tySrc = szToITy( n );
5745 /* Observe the guard expression. If it is false use an
5746 all-bits-defined bit pattern */
5747 IRAtom *cond, *iffalse, *iftrue;
5749 cond = assignNew('V', mce, Ity_I1, d->guard);
5750 iftrue = assignNew('V', mce, tySrc, shadow_GET(mce, gOff, tySrc));
5751 iffalse = assignNew('V', mce, tySrc, definedOfType(tySrc));
5752 src = assignNew('V', mce, tySrc,
5753 IRExpr_ITE(cond, iftrue, iffalse));
5755 here = mkPCastTo( mce, Ity_I32, src );
5756 curr = mkUifU32(mce, here, curr);
5757 gSz -= n;
5758 gOff += n;
5763 /* Inputs: memory. First set up some info needed regardless of
5764 whether we're doing reads or writes. */
5766 if (d->mFx != Ifx_None) {
5767 /* Because we may do multiple shadow loads/stores from the same
5768 base address, it's best to do a single test of its
5769 definedness right now. Post-instrumentation optimisation
5770 should remove all but this test. */
5771 IRType tyAddr;
5772 tl_assert(d->mAddr);
5773 complainIfUndefined(mce, d->mAddr, d->guard);
5775 tyAddr = typeOfIRExpr(mce->sb->tyenv, d->mAddr);
5776 tl_assert(tyAddr == Ity_I32 || tyAddr == Ity_I64);
5777 tl_assert(tyAddr == mce->hWordTy); /* not really right */
5780 /* Deal with memory inputs (reads or modifies) */
5781 if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify) {
5782 toDo = d->mSize;
5783 /* chew off 32-bit chunks. We don't care about the endianness
5784 since it's all going to be condensed down to a single bit,
5785 but nevertheless choose an endianness which is hopefully
5786 native to the platform. */
5787 while (toDo >= 4) {
5788 here = mkPCastTo(
5789 mce, Ity_I32,
5790 expr2vbits_Load_guarded_Simple(
5791 mce, end, Ity_I32, d->mAddr, d->mSize - toDo, d->guard )
5793 curr = mkUifU32(mce, here, curr);
5794 toDo -= 4;
5796 /* chew off 16-bit chunks */
5797 while (toDo >= 2) {
5798 here = mkPCastTo(
5799 mce, Ity_I32,
5800 expr2vbits_Load_guarded_Simple(
5801 mce, end, Ity_I16, d->mAddr, d->mSize - toDo, d->guard )
5803 curr = mkUifU32(mce, here, curr);
5804 toDo -= 2;
5806 /* chew off the remaining 8-bit chunk, if any */
5807 if (toDo == 1) {
5808 here = mkPCastTo(
5809 mce, Ity_I32,
5810 expr2vbits_Load_guarded_Simple(
5811 mce, end, Ity_I8, d->mAddr, d->mSize - toDo, d->guard )
5813 curr = mkUifU32(mce, here, curr);
5814 toDo -= 1;
5816 tl_assert(toDo == 0);
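/* Worked example (added commentary): for d->mSize == 7, the loops
   above issue a guarded 4-byte load at offset 0, a 2-byte load at
   offset 4 and a 1-byte load at offset 6, PCast each result to
   Ity_I32 and UifU it into |curr|, leaving toDo == 0 as asserted. */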
5819 /* Whew! So curr is a 32-bit V-value summarising pessimistically
5820 all the inputs to the helper. Now we need to re-distribute the
5821 results to all destinations. */
5823 /* Outputs: the destination temporary, if there is one. */
5824 if (d->tmp != IRTemp_INVALID) {
5825 dst = findShadowTmpV(mce, d->tmp);
5826 tyDst = typeOfIRTemp(mce->sb->tyenv, d->tmp);
5827 assign( 'V', mce, dst, mkPCastTo( mce, tyDst, curr) );
5830 /* Outputs: guest state that we write or modify. */
5831 for (i = 0; i < d->nFxState; i++) {
5832 tl_assert(d->fxState[i].fx != Ifx_None);
5833 if (d->fxState[i].fx == Ifx_Read)
5834 continue;
5836 /* Enumerate the described state segments */
5837 for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
5838 gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
5839 gSz = d->fxState[i].size;
5841 /* Ignore any sections marked as 'always defined'. */
5842 if (isAlwaysDefd(mce, gOff, gSz))
5843 continue;
5845 /* This state element is written or modified. So we need to
5846 consider it. If larger than 8 bytes, deal with it in
5847 8-byte chunks. */
5848 while (True) {
5849 tl_assert(gSz >= 0);
5850 if (gSz == 0) break;
5851 n = gSz <= 8 ? gSz : 8;
5852 /* Write suitably-casted 'curr' to the state slice
5853 gOff .. gOff+n-1 */
5854 tyDst = szToITy( n );
5855 do_shadow_PUT( mce, gOff,
5856 NULL, /* original atom */
5857 mkPCastTo( mce, tyDst, curr ), d->guard );
5858 gSz -= n;
5859 gOff += n;
5864 /* Outputs: memory that we write or modify. Same comments about
5865 endianness as above apply. */
5866 if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify) {
5867 toDo = d->mSize;
5868 /* chew off 32-bit chunks */
5869 while (toDo >= 4) {
5870 do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
5871 NULL, /* original data */
5872 mkPCastTo( mce, Ity_I32, curr ),
5873 d->guard );
5874 toDo -= 4;
5876 /* chew off 16-bit chunks */
5877 while (toDo >= 2) {
5878 do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
5879 NULL, /* original data */
5880 mkPCastTo( mce, Ity_I16, curr ),
5881 d->guard );
5882 toDo -= 2;
5884 /* chew off the remaining 8-bit chunk, if any */
5885 if (toDo == 1) {
5886 do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
5887 NULL, /* original data */
5888 mkPCastTo( mce, Ity_I8, curr ),
5889 d->guard );
5890 toDo -= 1;
5892 tl_assert(toDo == 0);
5898 /* We have an ABI hint telling us that [base .. base+len-1] is to
5899 become undefined ("writable"). Generate code to call a helper to
5900 notify the A/V bit machinery of this fact.
5902 We call
5903 void MC_(helperc_MAKE_STACK_UNINIT) ( Addr base, UWord len,
5904 Addr nia );
5906 static
5907 void do_AbiHint ( MCEnv* mce, IRExpr* base, Int len, IRExpr* nia )
5909 IRDirty* di;
5911 if (MC_(clo_mc_level) == 3) {
5912 di = unsafeIRDirty_0_N(
5913 3/*regparms*/,
5914 "MC_(helperc_MAKE_STACK_UNINIT_w_o)",
5915 VG_(fnptr_to_fnentry)( &MC_(helperc_MAKE_STACK_UNINIT_w_o) ),
5916 mkIRExprVec_3( base, mkIRExpr_HWord( (UInt)len), nia )
5918 } else {
5919 /* We ignore the supplied nia, since it is irrelevant. */
5920 tl_assert(MC_(clo_mc_level) == 2 || MC_(clo_mc_level) == 1);
5921 /* Special-case the len==128 case, since that is for amd64-ELF,
5922 which is a very common target. */
5923 if (len == 128) {
5924 di = unsafeIRDirty_0_N(
5925 1/*regparms*/,
5926 "MC_(helperc_MAKE_STACK_UNINIT_128_no_o)",
5927 VG_(fnptr_to_fnentry)( &MC_(helperc_MAKE_STACK_UNINIT_128_no_o)),
5928 mkIRExprVec_1( base )
5930 } else {
5931 di = unsafeIRDirty_0_N(
5932 2/*regparms*/,
5933 "MC_(helperc_MAKE_STACK_UNINIT_no_o)",
5934 VG_(fnptr_to_fnentry)( &MC_(helperc_MAKE_STACK_UNINIT_no_o) ),
5935 mkIRExprVec_2( base, mkIRExpr_HWord( (UInt)len) )
5940 stmt( 'V', mce, IRStmt_Dirty(di) );
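/* Added note: the len == 128 fast path presumably pays off because an
   AbiHint of exactly that size corresponds to the amd64 ELF ABI's
   128-byte stack redzone, which the amd64 front end reports around
   calls and returns, making it by far the most common case and worth
   its own one-argument helper. */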
5944 /* ------ Dealing with IRCAS (big and complex) ------ */
5946 /* FWDS */
5947 static IRAtom* gen_load_b ( MCEnv* mce, Int szB,
5948 IRAtom* baseaddr, Int offset );
5949 static IRAtom* gen_maxU32 ( MCEnv* mce, IRAtom* b1, IRAtom* b2 );
5950 static void gen_store_b ( MCEnv* mce, Int szB,
5951 IRAtom* baseaddr, Int offset, IRAtom* dataB,
5952 IRAtom* guard );
5954 static void do_shadow_CAS_single ( MCEnv* mce, IRCAS* cas );
5955 static void do_shadow_CAS_double ( MCEnv* mce, IRCAS* cas );
5958 /* Either ORIG and SHADOW are both IRExpr.RdTmps, or they are both
5959 IRExpr.Consts, else this asserts. If they are both Consts, it
5960 doesn't do anything. So that just leaves the RdTmp case.
5962 In which case: this assigns the shadow value SHADOW to the IR
5963 shadow temporary associated with ORIG. That is, ORIG, being an
5964 original temporary, will have a shadow temporary associated with
5965 it. However, in the case envisaged here, there will so far have
5966 been no IR emitted to actually write a shadow value into that
5967 temporary. What this routine does is to (emit IR to) copy the
5968 value in SHADOW into said temporary, so that after this call,
5969 IRExpr.RdTmps of ORIG's shadow temp will correctly pick up the
5970 value in SHADOW.
5972 Point is to allow callers to compute "by hand" a shadow value for
5973 ORIG, and force it to be associated with ORIG.
5975 How do we know that the shadow associated with ORIG has not so far
5976 been assigned to? Well, we don't per se know that, but supposing
5977 it had. Then this routine would create a second assignment to it,
5978 and later the IR sanity checker would barf. But that never
5979 happens. QED.
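/* Illustrative use (taken from the CAS handling below, not new
   functionality): after computing the loaded V bits |voldLo| by hand,
   do_shadow_CAS_single associates them with the CAS's .oldLo
   temporary via

      bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldLo), voldLo);

   and, when origin tracking is enabled, does the same with 'B' for
   the origin (B) shadow. */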
5981 static void bind_shadow_tmp_to_orig ( UChar how,
5982 MCEnv* mce,
5983 IRAtom* orig, IRAtom* shadow )
5985 tl_assert(isOriginalAtom(mce, orig));
5986 tl_assert(isShadowAtom(mce, shadow));
5987 switch (orig->tag) {
5988 case Iex_Const:
5989 tl_assert(shadow->tag == Iex_Const);
5990 break;
5991 case Iex_RdTmp:
5992 tl_assert(shadow->tag == Iex_RdTmp);
5993 if (how == 'V') {
5994 assign('V', mce, findShadowTmpV(mce,orig->Iex.RdTmp.tmp),
5995 shadow);
5996 } else {
5997 tl_assert(how == 'B');
5998 assign('B', mce, findShadowTmpB(mce,orig->Iex.RdTmp.tmp),
5999 shadow);
6001 break;
6002 default:
6003 tl_assert(0);
6008 static
6009 void do_shadow_CAS ( MCEnv* mce, IRCAS* cas )
6011 /* Scheme is (both single- and double- cases):
6013 1. fetch data#,dataB (the proposed new value)
6015 2. fetch expd#,expdB (what we expect to see at the address)
6017 3. check definedness of address
6019 4. load old#,oldB from shadow memory; this also checks
6020 addressibility of the address
6022 5. the CAS itself
6024 6. compute "expected == old". See COMMENT_ON_CasCmpEQ below.
6026 7. if "expected == old" (as computed by (6))
6027 store data#,dataB to shadow memory
6029 Note that 5 reads 'old' but 4 reads 'old#'. Similarly, 5 stores
6030 'data' but 7 stores 'data#'. Hence it is possible for the
6031 shadow data to be incorrectly checked and/or updated:
6033 * 7 is at least gated correctly, since the 'expected == old'
6034 condition is derived from outputs of 5. However, the shadow
6035 write could happen too late: imagine after 5 we are
6036 descheduled, a different thread runs, writes a different
6037 (shadow) value at the address, and then we resume, hence
6038 overwriting the shadow value written by the other thread.
6040 Because the original memory access is atomic, there's no way to
6041 make both the original and shadow accesses into a single atomic
6042 thing, hence this is unavoidable.
6044 At least as Valgrind stands, I don't think it's a problem, since
6045 we're single threaded *and* we guarantee that there are no
6046 context switches during the execution of any specific superblock
6047 -- context switches can only happen at superblock boundaries.
6049 If Valgrind ever becomes MT in the future, then it might be more
6050 of a problem. A possible kludge would be to artificially
6051 associate with the location, a lock, which we must acquire and
6052 release around the transaction as a whole. Hmm, that probably
6053 wouldn't work properly, since it only guards us against other
6054 threads doing CASs on the same location, not against other
6055 threads doing normal reads and writes.
6057 ------------------------------------------------------------
6059 COMMENT_ON_CasCmpEQ:
6061 Note two things. Firstly, in the sequence above, we compute
6062 "expected == old", but we don't check definedness of it. Why
6063 not? Also, the x86 and amd64 front ends use
6064 Iop_CasCmp{EQ,NE}{8,16,32,64} comparisons to make the equivalent
6065 determination (expected == old ?) for themselves, and we also
6066 don't check definedness for those primops; we just say that the
6067 result is defined. Why? Details follow.
6069 x86/amd64 contains various forms of locked insns:
6070 * lock prefix before all basic arithmetic insn;
6071 eg lock xorl %reg1,(%reg2)
6072 * atomic exchange reg-mem
6073 * compare-and-swaps
6075 Rather than attempt to represent them all, which would be a
6076 royal PITA, I used a result from Maurice Herlihy
6077 (http://en.wikipedia.org/wiki/Maurice_Herlihy), in which he
6078 demonstrates that compare-and-swap is a primitive more general
6079 than the other two, and so can be used to represent all of them.
6080 So the translation scheme for (eg) lock incl (%reg) is as
6081 follows:
6083 again:
6084 old = * %reg
6085 new = old + 1
6086 atomically { if (* %reg == old) { * %reg = new } else { goto again } }
6088 The "atomically" is the CAS bit. The scheme is always the same:
6089 get old value from memory, compute new value, atomically stuff
6090 new value back in memory iff the old value has not changed (iow,
6091 no other thread modified it in the meantime). If it has changed
6092 then we've been out-raced and we have to start over.
6094 Now that's all very neat, but it has the bad side effect of
6095 introducing an explicit equality test into the translation.
6096 Consider the behaviour of said code on a memory location which
6097 is uninitialised. We will wind up doing a comparison on
6098 uninitialised data, and mc duly complains.
6100 What's difficult about this is, the common case is that the
6101 location is uncontended, and so we're usually comparing the same
6102 value (* %reg) with itself. So we shouldn't complain even if it
6103 is undefined. But mc doesn't know that.
6105 My solution is to mark the == in the IR specially, so as to tell
6106 mc that it almost certainly compares a value with itself, and we
6107 should just regard the result as always defined. Rather than
6108 add a bit to all IROps, I just cloned Iop_CmpEQ{8,16,32,64} into
6109 Iop_CasCmpEQ{8,16,32,64} so as not to disturb anything else.
6111 So there's always the question of, can this give a false
6112 negative? eg, imagine that initially, * %reg is defined; and we
6113 read that; but then in the gap between the read and the CAS, a
6114 different thread writes an undefined (and different) value at
6115 the location. Then the CAS in this thread will fail and we will
6116 go back to "again:", but without knowing that the trip back
6117 there was based on an undefined comparison. No matter; at least
6118 the other thread won the race and the location is correctly
6119 marked as undefined. What if it wrote an uninitialised version
6120 of the same value that was there originally, though?
6122 etc etc. Seems like there's a small corner case in which we
6123 might lose the fact that something's defined -- we're out-raced
6124 in between the "old = * reg" and the "atomically {", _and_ the
6125 other thread is writing in an undefined version of what's
6126 already there. Well, that seems pretty unlikely.
6130 If we ever need to reinstate it .. code which generates a
6131 definedness test for "expected == old" was removed at r10432 of
6132 this file.
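/* Added worked sketch, single CAS on Ity_I32, little-endian
   (illustrative only) -- steps 1..7 above come out roughly as:

      vdataLo     = V bits of cas->dataLo                     (step 1)
      vexpdLo     = V bits of cas->expdLo                     (step 2)
      voldLo      = shadow load from cas->addr         (steps 3 and 4)
      bind voldLo to the shadow temp of cas->oldLo
      IRStmt_CAS(cas)                                         (step 5)
      expd_eq_old = CasCmpEQ32(cas->expdLo, mkexpr(cas->oldLo))    (6)
      shadow store of vdataLo to cas->addr,
         guarded by expd_eq_old                               (step 7)

   which is exactly what do_shadow_CAS_single below emits. */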
6134 if (cas->oldHi == IRTemp_INVALID) {
6135 do_shadow_CAS_single( mce, cas );
6136 } else {
6137 do_shadow_CAS_double( mce, cas );
6142 static void do_shadow_CAS_single ( MCEnv* mce, IRCAS* cas )
6144 IRAtom *vdataLo = NULL, *bdataLo = NULL;
6145 IRAtom *vexpdLo = NULL, *bexpdLo = NULL;
6146 IRAtom *voldLo = NULL, *boldLo = NULL;
6147 IRAtom *expd_eq_old = NULL;
6148 IROp opCasCmpEQ;
6149 Int elemSzB;
6150 IRType elemTy;
6151 Bool otrak = MC_(clo_mc_level) >= 3; /* a shorthand */
6153 /* single CAS */
6154 tl_assert(cas->oldHi == IRTemp_INVALID);
6155 tl_assert(cas->expdHi == NULL);
6156 tl_assert(cas->dataHi == NULL);
6158 elemTy = typeOfIRExpr(mce->sb->tyenv, cas->expdLo);
6159 switch (elemTy) {
6160 case Ity_I8: elemSzB = 1; opCasCmpEQ = Iop_CasCmpEQ8; break;
6161 case Ity_I16: elemSzB = 2; opCasCmpEQ = Iop_CasCmpEQ16; break;
6162 case Ity_I32: elemSzB = 4; opCasCmpEQ = Iop_CasCmpEQ32; break;
6163 case Ity_I64: elemSzB = 8; opCasCmpEQ = Iop_CasCmpEQ64; break;
6164 default: tl_assert(0); /* IR defn disallows any other types */
6167 /* 1. fetch data# (the proposed new value) */
6168 tl_assert(isOriginalAtom(mce, cas->dataLo));
6169 vdataLo
6170 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataLo, HuOth));
6171 tl_assert(isShadowAtom(mce, vdataLo));
6172 if (otrak) {
6173 bdataLo
6174 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataLo));
6175 tl_assert(isShadowAtom(mce, bdataLo));
6178 /* 2. fetch expected# (what we expect to see at the address) */
6179 tl_assert(isOriginalAtom(mce, cas->expdLo));
6180 vexpdLo
6181 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdLo, HuOth));
6182 tl_assert(isShadowAtom(mce, vexpdLo));
6183 if (otrak) {
6184 bexpdLo
6185 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdLo));
6186 tl_assert(isShadowAtom(mce, bexpdLo));
6189 /* 3. check definedness of address */
6190 /* 4. fetch old# from shadow memory; this also checks
6191 addressability of the address */
6192 voldLo
6193 = assignNew(
6194 'V', mce, elemTy,
6195 expr2vbits_Load(
6196 mce,
6197 cas->end, elemTy, cas->addr, 0/*Addr bias*/,
6198 NULL/*always happens*/
6200 bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldLo), voldLo);
6201 if (otrak) {
6202 boldLo
6203 = assignNew('B', mce, Ity_I32,
6204 gen_load_b(mce, elemSzB, cas->addr, 0/*addr bias*/));
6205 bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldLo), boldLo);
6208 /* 5. the CAS itself */
6209 stmt( 'C', mce, IRStmt_CAS(cas) );
6211 /* 6. compute "expected == old" */
6212 /* See COMMENT_ON_CasCmpEQ in this file for background/rationale. */
6213 /* Note that 'C' is kinda faking it; it is indeed a non-shadow
6214 tree, but it's not copied from the input block. */
6215 expd_eq_old
6216 = assignNew('C', mce, Ity_I1,
6217 binop(opCasCmpEQ, cas->expdLo, mkexpr(cas->oldLo)));
6219 /* 7. if "expected == old"
6220 store data# to shadow memory */
6221 do_shadow_Store( mce, cas->end, cas->addr, 0/*bias*/,
6222 NULL/*data*/, vdataLo/*vdata*/,
6223 expd_eq_old/*guard for store*/ );
6224 if (otrak) {
6225 gen_store_b( mce, elemSzB, cas->addr, 0/*offset*/,
6226 bdataLo/*bdata*/,
6227 expd_eq_old/*guard for store*/ );
6232 static void do_shadow_CAS_double ( MCEnv* mce, IRCAS* cas )
6234 IRAtom *vdataHi = NULL, *bdataHi = NULL;
6235 IRAtom *vdataLo = NULL, *bdataLo = NULL;
6236 IRAtom *vexpdHi = NULL, *bexpdHi = NULL;
6237 IRAtom *vexpdLo = NULL, *bexpdLo = NULL;
6238 IRAtom *voldHi = NULL, *boldHi = NULL;
6239 IRAtom *voldLo = NULL, *boldLo = NULL;
6240 IRAtom *xHi = NULL, *xLo = NULL, *xHL = NULL;
6241 IRAtom *expd_eq_old = NULL, *zero = NULL;
6242 IROp opCasCmpEQ, opOr, opXor;
6243 Int elemSzB, memOffsLo, memOffsHi;
6244 IRType elemTy;
6245 Bool otrak = MC_(clo_mc_level) >= 3; /* a shorthand */
6247 /* double CAS */
6248 tl_assert(cas->oldHi != IRTemp_INVALID);
6249 tl_assert(cas->expdHi != NULL);
6250 tl_assert(cas->dataHi != NULL);
6252 elemTy = typeOfIRExpr(mce->sb->tyenv, cas->expdLo);
6253 switch (elemTy) {
6254 case Ity_I8:
6255 opCasCmpEQ = Iop_CasCmpEQ8; opOr = Iop_Or8; opXor = Iop_Xor8;
6256 elemSzB = 1; zero = mkU8(0);
6257 break;
6258 case Ity_I16:
6259 opCasCmpEQ = Iop_CasCmpEQ16; opOr = Iop_Or16; opXor = Iop_Xor16;
6260 elemSzB = 2; zero = mkU16(0);
6261 break;
6262 case Ity_I32:
6263 opCasCmpEQ = Iop_CasCmpEQ32; opOr = Iop_Or32; opXor = Iop_Xor32;
6264 elemSzB = 4; zero = mkU32(0);
6265 break;
6266 case Ity_I64:
6267 opCasCmpEQ = Iop_CasCmpEQ64; opOr = Iop_Or64; opXor = Iop_Xor64;
6268 elemSzB = 8; zero = mkU64(0);
6269 break;
6270 default:
6271 tl_assert(0); /* IR defn disallows any other types */
6274 /* 1. fetch data# (the proposed new value) */
6275 tl_assert(isOriginalAtom(mce, cas->dataHi));
6276 tl_assert(isOriginalAtom(mce, cas->dataLo));
6277 vdataHi
6278 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataHi, HuOth));
6279 vdataLo
6280 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataLo, HuOth));
6281 tl_assert(isShadowAtom(mce, vdataHi));
6282 tl_assert(isShadowAtom(mce, vdataLo));
6283 if (otrak) {
6284 bdataHi
6285 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataHi));
6286 bdataLo
6287 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataLo));
6288 tl_assert(isShadowAtom(mce, bdataHi));
6289 tl_assert(isShadowAtom(mce, bdataLo));
6292 /* 2. fetch expected# (what we expect to see at the address) */
6293 tl_assert(isOriginalAtom(mce, cas->expdHi));
6294 tl_assert(isOriginalAtom(mce, cas->expdLo));
6295 vexpdHi
6296 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdHi, HuOth));
6297 vexpdLo
6298 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdLo, HuOth));
6299 tl_assert(isShadowAtom(mce, vexpdHi));
6300 tl_assert(isShadowAtom(mce, vexpdLo));
6301 if (otrak) {
6302 bexpdHi
6303 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdHi));
6304 bexpdLo
6305 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdLo));
6306 tl_assert(isShadowAtom(mce, bexpdHi));
6307 tl_assert(isShadowAtom(mce, bexpdLo));
6310 /* 3. check definedness of address */
6311 /* 4. fetch old# from shadow memory; this also checks
6312 addressability of the address */
6313 if (cas->end == Iend_LE) {
6314 memOffsLo = 0;
6315 memOffsHi = elemSzB;
6316 } else {
6317 tl_assert(cas->end == Iend_BE);
6318 memOffsLo = elemSzB;
6319 memOffsHi = 0;
6321 voldHi
6322 = assignNew(
6323 'V', mce, elemTy,
6324 expr2vbits_Load(
6325 mce,
6326 cas->end, elemTy, cas->addr, memOffsHi/*Addr bias*/,
6327 NULL/*always happens*/
6329 voldLo
6330 = assignNew(
6331 'V', mce, elemTy,
6332 expr2vbits_Load(
6333 mce,
6334 cas->end, elemTy, cas->addr, memOffsLo/*Addr bias*/,
6335 NULL/*always happens*/
6337 bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldHi), voldHi);
6338 bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldLo), voldLo);
6339 if (otrak) {
6340 boldHi
6341 = assignNew('B', mce, Ity_I32,
6342 gen_load_b(mce, elemSzB, cas->addr,
6343 memOffsHi/*addr bias*/));
6344 boldLo
6345 = assignNew('B', mce, Ity_I32,
6346 gen_load_b(mce, elemSzB, cas->addr,
6347 memOffsLo/*addr bias*/));
6348 bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldHi), boldHi);
6349 bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldLo), boldLo);
6352 /* 5. the CAS itself */
6353 stmt( 'C', mce, IRStmt_CAS(cas) );
6355 /* 6. compute "expected == old" */
6356 /* See COMMENT_ON_CasCmpEQ in this file for background/rationale. */
6357 /* Note that 'C' is kinda faking it; it is indeed a non-shadow
6358 tree, but it's not copied from the input block. */
6359 /*
6360 xHi = oldHi ^ expdHi;
6361 xLo = oldLo ^ expdLo;
6362 xHL = xHi | xLo;
6363 expd_eq_old = xHL == 0;
6364 */
6365 xHi = assignNew('C', mce, elemTy,
6366 binop(opXor, cas->expdHi, mkexpr(cas->oldHi)));
6367 xLo = assignNew('C', mce, elemTy,
6368 binop(opXor, cas->expdLo, mkexpr(cas->oldLo)));
6369 xHL = assignNew('C', mce, elemTy,
6370 binop(opOr, xHi, xLo));
6371 expd_eq_old
6372 = assignNew('C', mce, Ity_I1,
6373 binop(opCasCmpEQ, xHL, zero));
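/* Added note: the XOR/OR reduction lets the double-width
   "expected == old" test be phrased as a single CasCmpEQ against
   zero.  For example, if expdHi == oldHi and expdLo == oldLo then
   xHi and xLo are both zero, so xHL is zero and expd_eq_old is 1;
   any mismatching bit in either half makes xHL nonzero and
   expd_eq_old 0.  Using CasCmpEQ rather than plain CmpEQ keeps the
   COMMENT_ON_CasCmpEQ treatment: the comparison result is regarded
   as defined. */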
6375 /* 7. if "expected == old"
6376 store data# to shadow memory */
6377 do_shadow_Store( mce, cas->end, cas->addr, memOffsHi/*bias*/,
6378 NULL/*data*/, vdataHi/*vdata*/,
6379 expd_eq_old/*guard for store*/ );
6380 do_shadow_Store( mce, cas->end, cas->addr, memOffsLo/*bias*/,
6381 NULL/*data*/, vdataLo/*vdata*/,
6382 expd_eq_old/*guard for store*/ );
6383 if (otrak) {
6384 gen_store_b( mce, elemSzB, cas->addr, memOffsHi/*offset*/,
6385 bdataHi/*bdata*/,
6386 expd_eq_old/*guard for store*/ );
6387 gen_store_b( mce, elemSzB, cas->addr, memOffsLo/*offset*/,
6388 bdataLo/*bdata*/,
6389 expd_eq_old/*guard for store*/ );
6394 /* ------ Dealing with LL/SC (not difficult) ------ */
6396 static void do_shadow_LLSC ( MCEnv* mce,
6397 IREndness stEnd,
6398 IRTemp stResult,
6399 IRExpr* stAddr,
6400 IRExpr* stStoredata )
6402 /* In short: treat a load-linked like a normal load followed by an
6403 assignment of the loaded (shadow) data to the result temporary.
6404 Treat a store-conditional like a normal store, and mark the
6405 result temporary as defined. */
6406 IRType resTy = typeOfIRTemp(mce->sb->tyenv, stResult);
6407 IRTemp resTmp = findShadowTmpV(mce, stResult);
6409 tl_assert(isIRAtom(stAddr));
6410 if (stStoredata)
6411 tl_assert(isIRAtom(stStoredata));
6413 if (stStoredata == NULL) {
6414 /* Load Linked */
6415 /* Just treat this as a normal load, followed by an assignment of
6416 the value to .result. */
6417 /* Stay sane */
6418 tl_assert(resTy == Ity_I64 || resTy == Ity_I32
6419 || resTy == Ity_I16 || resTy == Ity_I8);
6420 assign( 'V', mce, resTmp,
6421 expr2vbits_Load(
6422 mce, stEnd, resTy, stAddr, 0/*addr bias*/,
6423 NULL/*always happens*/) );
6424 } else {
6425 /* Store Conditional */
6426 /* Stay sane */
6427 IRType dataTy = typeOfIRExpr(mce->sb->tyenv,
6428 stStoredata);
6429 tl_assert(dataTy == Ity_I64 || dataTy == Ity_I32
6430 || dataTy == Ity_I16 || dataTy == Ity_I8);
6431 do_shadow_Store( mce, stEnd,
6432 stAddr, 0/* addr bias */,
6433 stStoredata,
6434 NULL /* shadow data */,
6435 NULL/*guard*/ );
6436 /* This is a store conditional, so it writes to .result a value
6437 indicating whether or not the store succeeded. Just claim
6438 this value is always defined. In the PowerPC interpretation
6439 of store-conditional, definedness of the success indication
6440 depends on whether the address of the store matches the
6441 reservation address. But we can't tell that here (and
6442 anyway, we're not being PowerPC-specific). At least we are
6443 guaranteed that the definedness of the store address, and its
6444 addressability, will be checked as per normal. So it seems
6445 pretty safe to just say that the success indication is always
6446 defined.
6448 In schemeS, for origin tracking, we must correspondingly set
6449 a no-origin value for the origin shadow of .result.
6451 tl_assert(resTy == Ity_I1);
6452 assign( 'V', mce, resTmp, definedOfType(resTy) );
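/* Added example (hedged): for a typical LL/SC pair such as ARM's
   ldrex/strex, the effect of the above is that the load-linked is
   instrumented exactly like an ordinary load whose V bits land in
   .result's shadow, while the store-conditional is instrumented like
   an ordinary store of the data's V bits, with the Ity_I1 success
   flag's shadow simply forced to "defined". */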
6457 /* ---- Dealing with LoadG/StoreG (not entirely simple) ---- */
6459 static void do_shadow_StoreG ( MCEnv* mce, IRStoreG* sg )
6461 complainIfUndefined(mce, sg->guard, NULL);
6462 /* do_shadow_Store will generate code to check the definedness and
6463 validity of sg->addr, in the case where sg->guard evaluates to
6464 True at run-time. */
6465 do_shadow_Store( mce, sg->end,
6466 sg->addr, 0/* addr bias */,
6467 sg->data,
6468 NULL /* shadow data */,
6469 sg->guard );
6472 static void do_shadow_LoadG ( MCEnv* mce, IRLoadG* lg )
6474 complainIfUndefined(mce, lg->guard, NULL);
6475 /* expr2vbits_Load_guarded_General will generate code to check the
6476 definedness and validity of lg->addr, in the case where
6477 lg->guard evaluates to True at run-time. */
6479 /* Look at the LoadG's built-in conversion operation, to determine
6480 the source (actual loaded data) type, and the equivalent IROp.
6481 NOTE that implicitly we are taking a widening operation to be
6482 applied to original atoms and producing one that applies to V
6483 bits. Since signed and unsigned widening are self-shadowing,
6484 this is a straight copy of the op (modulo swapping from the
6485 IRLoadGOp form to the IROp form). Note also therefore that this
6486 implicitly duplicates the logic to do with said widening ops in
6487 expr2vbits_Unop. See comment at the start of expr2vbits_Unop. */
6488 IROp vwiden = Iop_INVALID;
6489 IRType loadedTy = Ity_INVALID;
6490 switch (lg->cvt) {
6491 case ILGop_IdentV128: loadedTy = Ity_V128; vwiden = Iop_INVALID; break;
6492 case ILGop_Ident64: loadedTy = Ity_I64; vwiden = Iop_INVALID; break;
6493 case ILGop_Ident32: loadedTy = Ity_I32; vwiden = Iop_INVALID; break;
6494 case ILGop_16Uto32: loadedTy = Ity_I16; vwiden = Iop_16Uto32; break;
6495 case ILGop_16Sto32: loadedTy = Ity_I16; vwiden = Iop_16Sto32; break;
6496 case ILGop_8Uto32: loadedTy = Ity_I8; vwiden = Iop_8Uto32; break;
6497 case ILGop_8Sto32: loadedTy = Ity_I8; vwiden = Iop_8Sto32; break;
6498 default: VG_(tool_panic)("do_shadow_LoadG");
6501 IRAtom* vbits_alt
6502 = expr2vbits( mce, lg->alt, HuOth );
6503 IRAtom* vbits_final
6504 = expr2vbits_Load_guarded_General(mce, lg->end, loadedTy,
6505 lg->addr, 0/*addr bias*/,
6506 lg->guard, vwiden, vbits_alt );
6507 /* And finally, bind the V bits to the destination temporary. */
6508 assign( 'V', mce, findShadowTmpV(mce, lg->dst), vbits_final );
6512 /*------------------------------------------------------------*/
6513 /*--- Origin tracking stuff ---*/
6514 /*------------------------------------------------------------*/
6516 /* Almost identical to findShadowTmpV. */
6517 static IRTemp findShadowTmpB ( MCEnv* mce, IRTemp orig )
6519 TempMapEnt* ent;
6520 /* VG_(indexXA) range-checks 'orig', hence no need to check
6521 here. */
6522 ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
6523 tl_assert(ent->kind == Orig);
6524 if (ent->shadowB == IRTemp_INVALID) {
6525 IRTemp tmpB
6526 = newTemp( mce, Ity_I32, BSh );
6527 /* newTemp may cause mce->tmpMap to resize, hence previous results
6528 from VG_(indexXA) are invalid. */
6529 ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
6530 tl_assert(ent->kind == Orig);
6531 tl_assert(ent->shadowB == IRTemp_INVALID);
6532 ent->shadowB = tmpB;
6534 return ent->shadowB;
6537 static IRAtom* gen_maxU32 ( MCEnv* mce, IRAtom* b1, IRAtom* b2 )
6539 return assignNew( 'B', mce, Ity_I32, binop(Iop_Max32U, b1, b2) );
6543 /* Make a guarded origin load, with no special handling in the
6544 didn't-happen case. A GUARD of NULL is assumed to mean "always
6545 True".
6547 Generate IR to do a shadow origins load from BASEADDR+OFFSET and
6548 return the otag. The loaded size is SZB. If GUARD evaluates to
6549 False at run time then the returned otag is zero.
6551 static IRAtom* gen_guarded_load_b ( MCEnv* mce, Int szB,
6552 IRAtom* baseaddr,
6553 Int offset, IRExpr* guard )
6555 void* hFun;
6556 const HChar* hName;
6557 IRTemp bTmp;
6558 IRDirty* di;
6559 IRType aTy = typeOfIRExpr( mce->sb->tyenv, baseaddr );
6560 IROp opAdd = aTy == Ity_I32 ? Iop_Add32 : Iop_Add64;
6561 IRAtom* ea = baseaddr;
6562 if (offset != 0) {
6563 IRAtom* off = aTy == Ity_I32 ? mkU32( offset )
6564 : mkU64( (Long)(Int)offset );
6565 ea = assignNew( 'B', mce, aTy, binop(opAdd, ea, off));
6567 bTmp = newTemp(mce, mce->hWordTy, BSh);
6569 switch (szB) {
6570 case 1: hFun = (void*)&MC_(helperc_b_load1);
6571 hName = "MC_(helperc_b_load1)";
6572 break;
6573 case 2: hFun = (void*)&MC_(helperc_b_load2);
6574 hName = "MC_(helperc_b_load2)";
6575 break;
6576 case 4: hFun = (void*)&MC_(helperc_b_load4);
6577 hName = "MC_(helperc_b_load4)";
6578 break;
6579 case 8: hFun = (void*)&MC_(helperc_b_load8);
6580 hName = "MC_(helperc_b_load8)";
6581 break;
6582 case 16: hFun = (void*)&MC_(helperc_b_load16);
6583 hName = "MC_(helperc_b_load16)";
6584 break;
6585 case 32: hFun = (void*)&MC_(helperc_b_load32);
6586 hName = "MC_(helperc_b_load32)";
6587 break;
6588 default:
6589 VG_(printf)("mc_translate.c: gen_load_b: unhandled szB == %d\n", szB);
6590 tl_assert(0);
6592 di = unsafeIRDirty_1_N(
6593 bTmp, 1/*regparms*/, hName, VG_(fnptr_to_fnentry)( hFun ),
6594 mkIRExprVec_1( ea )
6596 if (guard) {
6597 di->guard = guard;
6598 /* Ideally the didn't-happen return value here would be
6599 all-zeroes (unknown-origin), so it'd be harmless if it got
6600 used inadvertently. We slum it out with the IR-mandated
6601 default value (0b01 repeating, 0x55 etc) as that'll probably
6602 trump all legitimate otags via Max32, and it's pretty
6603 obviously bogus. */
6605 /* no need to mess with any annotations. This call accesses
6606 neither guest state nor guest memory. */
6607 stmt( 'B', mce, IRStmt_Dirty(di) );
6608 if (mce->hWordTy == Ity_I64) {
6609 /* 64-bit host */
6610 IRTemp bTmp32 = newTemp(mce, Ity_I32, BSh);
6611 assign( 'B', mce, bTmp32, unop(Iop_64to32, mkexpr(bTmp)) );
6612 return mkexpr(bTmp32);
6613 } else {
6614 /* 32-bit host */
6615 return mkexpr(bTmp);
6620 /* Generate IR to do a shadow origins load from BASEADDR+OFFSET. The
6621 loaded size is SZB. The load is regarded as unconditional (always
6622 happens).
6624 static IRAtom* gen_load_b ( MCEnv* mce, Int szB, IRAtom* baseaddr,
6625 Int offset )
6627 return gen_guarded_load_b(mce, szB, baseaddr, offset, NULL/*guard*/);
6631 /* The most general handler for guarded origin loads. A GUARD of NULL
6632 is assumed to mean "always True".
6634 Generate IR to do a shadow origin load from ADDR+BIAS and return
6635 the B bits. The loaded type is TY. If GUARD evaluates to False at
6636 run time then the returned B bits are simply BALT instead.
6638 static
6639 IRAtom* expr2ori_Load_guarded_General ( MCEnv* mce,
6640 IRType ty,
6641 IRAtom* addr, UInt bias,
6642 IRAtom* guard, IRAtom* balt )
6644 /* If the guard evaluates to True, this will hold the loaded
6645 origin. If the guard evaluates to False, this will be zero,
6646 meaning "unknown origin", in which case we will have to replace
6647 it using an ITE below. */
6648 IRAtom* iftrue
6649 = assignNew('B', mce, Ity_I32,
6650 gen_guarded_load_b(mce, sizeofIRType(ty),
6651 addr, bias, guard));
6652 /* These are the bits we will return if the load doesn't take
6653 place. */
6654 IRAtom* iffalse
6655 = balt;
6656 /* Prepare the cond for the ITE. Convert a NULL cond into
6657 something that iropt knows how to fold out later. */
6658 IRAtom* cond
6659 = guard == NULL ? mkU1(1) : guard;
6660 /* And assemble the final result. */
6661 return assignNew('B', mce, Ity_I32, IRExpr_ITE(cond, iftrue, iffalse));
6665 /* Generate a shadow origins store. guard :: Ity_I1 controls whether
6666 the store really happens; NULL means it unconditionally does. */
6667 static void gen_store_b ( MCEnv* mce, Int szB,
6668 IRAtom* baseaddr, Int offset, IRAtom* dataB,
6669 IRAtom* guard )
6671 void* hFun;
6672 const HChar* hName;
6673 IRDirty* di;
6674 IRType aTy = typeOfIRExpr( mce->sb->tyenv, baseaddr );
6675 IROp opAdd = aTy == Ity_I32 ? Iop_Add32 : Iop_Add64;
6676 IRAtom* ea = baseaddr;
6677 if (guard) {
6678 tl_assert(isOriginalAtom(mce, guard));
6679 tl_assert(typeOfIRExpr(mce->sb->tyenv, guard) == Ity_I1);
6681 if (offset != 0) {
6682 IRAtom* off = aTy == Ity_I32 ? mkU32( offset )
6683 : mkU64( (Long)(Int)offset );
6684 ea = assignNew( 'B', mce, aTy, binop(opAdd, ea, off));
6686 if (mce->hWordTy == Ity_I64)
6687 dataB = assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, dataB));
6689 switch (szB) {
6690 case 1: hFun = (void*)&MC_(helperc_b_store1);
6691 hName = "MC_(helperc_b_store1)";
6692 break;
6693 case 2: hFun = (void*)&MC_(helperc_b_store2);
6694 hName = "MC_(helperc_b_store2)";
6695 break;
6696 case 4: hFun = (void*)&MC_(helperc_b_store4);
6697 hName = "MC_(helperc_b_store4)";
6698 break;
6699 case 8: hFun = (void*)&MC_(helperc_b_store8);
6700 hName = "MC_(helperc_b_store8)";
6701 break;
6702 case 16: hFun = (void*)&MC_(helperc_b_store16);
6703 hName = "MC_(helperc_b_store16)";
6704 break;
6705 case 32: hFun = (void*)&MC_(helperc_b_store32);
6706 hName = "MC_(helperc_b_store32)";
6707 break;
6708 default:
6709 tl_assert(0);
6711 di = unsafeIRDirty_0_N( 2/*regparms*/,
6712 hName, VG_(fnptr_to_fnentry)( hFun ),
6713 mkIRExprVec_2( ea, dataB )
6715 /* no need to mess with any annotations. This call accesses
6716 neither guest state nor guest memory. */
6717 if (guard) di->guard = guard;
6718 stmt( 'B', mce, IRStmt_Dirty(di) );
6721 static IRAtom* narrowTo32 ( MCEnv* mce, IRAtom* e ) {
6722 IRType eTy = typeOfIRExpr(mce->sb->tyenv, e);
6723 if (eTy == Ity_I64)
6724 return assignNew( 'B', mce, Ity_I32, unop(Iop_64to32, e) );
6725 if (eTy == Ity_I32)
6726 return e;
6727 tl_assert(0);
6730 static IRAtom* zWidenFrom32 ( MCEnv* mce, IRType dstTy, IRAtom* e ) {
6731 IRType eTy = typeOfIRExpr(mce->sb->tyenv, e);
6732 tl_assert(eTy == Ity_I32);
6733 if (dstTy == Ity_I64)
6734 return assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, e) );
6735 tl_assert(0);
6739 static IRAtom* schemeE ( MCEnv* mce, IRExpr* e )
6741 tl_assert(MC_(clo_mc_level) == 3);
6743 switch (e->tag) {
6745 case Iex_GetI: {
6746 IRRegArray* descr_b;
6747 IRAtom *t1, *t2, *t3, *t4;
6748 IRRegArray* descr = e->Iex.GetI.descr;
6749 IRType equivIntTy
6750 = MC_(get_otrack_reg_array_equiv_int_type)(descr);
6751 /* If this array is unshadowable for whatever reason, use the
6752 usual approximation. */
6753 if (equivIntTy == Ity_INVALID)
6754 return mkU32(0);
6755 tl_assert(sizeofIRType(equivIntTy) >= 4);
6756 tl_assert(sizeofIRType(equivIntTy) == sizeofIRType(descr->elemTy));
6757 descr_b = mkIRRegArray( descr->base + 2*mce->layout->total_sizeB,
6758 equivIntTy, descr->nElems );
6759 /* Do a shadow indexed get of the same size, giving t1. Take
6760 the bottom 32 bits of it, giving t2. Compute into t3 the
6761 origin for the index (almost certainly zero, but there's
6762 no harm in being completely general here, since iropt will
6763 remove any useless code), and fold it in, giving a final
6764 value t4. */
6765 t1 = assignNew( 'B', mce, equivIntTy,
6766 IRExpr_GetI( descr_b, e->Iex.GetI.ix,
6767 e->Iex.GetI.bias ));
6768 t2 = narrowTo32( mce, t1 );
6769 t3 = schemeE( mce, e->Iex.GetI.ix );
6770 t4 = gen_maxU32( mce, t2, t3 );
6771 return t4;
6773 case Iex_CCall: {
6774 Int i;
6775 IRAtom* here;
6776 IRExpr** args = e->Iex.CCall.args;
6777 IRAtom* curr = mkU32(0);
6778 for (i = 0; args[i]; i++) {
6779 tl_assert(i < 32);
6780 tl_assert(isOriginalAtom(mce, args[i]));
6781 /* Only take notice of this arg if the callee's
6782 mc-exclusion mask does not say it is to be excluded. */
6783 if (e->Iex.CCall.cee->mcx_mask & (1<<i)) {
6784 /* the arg is to be excluded from definedness checking.
6785 Do nothing. */
6786 if (0) VG_(printf)("excluding %s(%d)\n",
6787 e->Iex.CCall.cee->name, i);
6788 } else {
6789 /* calculate the arg's definedness, and pessimistically
6790 merge it in. */
6791 here = schemeE( mce, args[i] );
6792 curr = gen_maxU32( mce, curr, here );
6795 return curr;
6797 case Iex_Load: {
6798 Int dszB;
6799 dszB = sizeofIRType(e->Iex.Load.ty);
6800 /* assert that the B value for the address is already
6801 available (somewhere) */
6802 tl_assert(isIRAtom(e->Iex.Load.addr));
6803 tl_assert(mce->hWordTy == Ity_I32 || mce->hWordTy == Ity_I64);
6804 return gen_load_b( mce, dszB, e->Iex.Load.addr, 0 );
6806 case Iex_ITE: {
6807 IRAtom* b1 = schemeE( mce, e->Iex.ITE.cond );
6808 IRAtom* b3 = schemeE( mce, e->Iex.ITE.iftrue );
6809 IRAtom* b2 = schemeE( mce, e->Iex.ITE.iffalse );
6810 return gen_maxU32( mce, b1, gen_maxU32( mce, b2, b3 ));
6812 case Iex_Qop: {
6813 IRAtom* b1 = schemeE( mce, e->Iex.Qop.details->arg1 );
6814 IRAtom* b2 = schemeE( mce, e->Iex.Qop.details->arg2 );
6815 IRAtom* b3 = schemeE( mce, e->Iex.Qop.details->arg3 );
6816 IRAtom* b4 = schemeE( mce, e->Iex.Qop.details->arg4 );
6817 return gen_maxU32( mce, gen_maxU32( mce, b1, b2 ),
6818 gen_maxU32( mce, b3, b4 ) );
6820 case Iex_Triop: {
6821 IRAtom* b1 = schemeE( mce, e->Iex.Triop.details->arg1 );
6822 IRAtom* b2 = schemeE( mce, e->Iex.Triop.details->arg2 );
6823 IRAtom* b3 = schemeE( mce, e->Iex.Triop.details->arg3 );
6824 return gen_maxU32( mce, b1, gen_maxU32( mce, b2, b3 ) );
6826 case Iex_Binop: {
6827 switch (e->Iex.Binop.op) {
6828 case Iop_CasCmpEQ8: case Iop_CasCmpNE8:
6829 case Iop_CasCmpEQ16: case Iop_CasCmpNE16:
6830 case Iop_CasCmpEQ32: case Iop_CasCmpNE32:
6831 case Iop_CasCmpEQ64: case Iop_CasCmpNE64:
6832 /* Just say these all produce a defined result,
6833 regardless of their arguments. See
6834 COMMENT_ON_CasCmpEQ in this file. */
6835 return mkU32(0);
6836 default: {
6837 IRAtom* b1 = schemeE( mce, e->Iex.Binop.arg1 );
6838 IRAtom* b2 = schemeE( mce, e->Iex.Binop.arg2 );
6839 return gen_maxU32( mce, b1, b2 );
6842 tl_assert(0);
6843 /*NOTREACHED*/
6845 case Iex_Unop: {
6846 IRAtom* b1 = schemeE( mce, e->Iex.Unop.arg );
6847 return b1;
6849 case Iex_Const:
6850 return mkU32(0);
6851 case Iex_RdTmp:
6852 return mkexpr( findShadowTmpB( mce, e->Iex.RdTmp.tmp ));
6853 case Iex_Get: {
6854 Int b_offset = MC_(get_otrack_shadow_offset)(
6855 e->Iex.Get.offset,
6856 sizeofIRType(e->Iex.Get.ty)
6858 tl_assert(b_offset >= -1
6859 && b_offset <= mce->layout->total_sizeB -4);
6860 if (b_offset >= 0) {
6861 /* FIXME: this isn't an atom! */
6862 return IRExpr_Get( b_offset + 2*mce->layout->total_sizeB,
6863 Ity_I32 );
6865 return mkU32(0);
6867 default:
6868 VG_(printf)("mc_translate.c: schemeE: unhandled: ");
6869 ppIRExpr(e);
6870 VG_(tool_panic)("memcheck:schemeE");
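/* Added worked examples: for a flat binary expression such as
   Add32(t7, t9) the dispatch above produces, roughly,

      gen_maxU32( mce,
                  mkexpr( findShadowTmpB(mce, t7) ),
                  mkexpr( findShadowTmpB(mce, t9) ) )

   and for a 32-bit load expression LDle:I32(t9) (as constructed, for
   instance, by the LLSC handling in schemeS below) it produces

      gen_load_b( mce, 4, t9, 0 )

   i.e. the origin of a compound expression is the Max32U of the
   origins of its inputs, and a load contributes the otag held in
   shadow memory for the addressed bytes.  (t7 and t9 are hypothetical
   temporaries, used only for this illustration.) */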
6875 static void do_origins_Dirty ( MCEnv* mce, IRDirty* d )
6877 // This is a hacked version of do_shadow_Dirty
6878 Int i, k, n, toDo, gSz, gOff;
6879 IRAtom *here, *curr;
6880 IRTemp dst;
6882 /* First check the guard. */
6883 curr = schemeE( mce, d->guard );
6885 /* Now round up all inputs and maxU32 over them. */
6887 /* Inputs: unmasked args
6888 Note: arguments are evaluated REGARDLESS of the guard expression */
6889 for (i = 0; d->args[i]; i++) {
6890 IRAtom* arg = d->args[i];
6891 if ( (d->cee->mcx_mask & (1<<i))
6892 || UNLIKELY(is_IRExpr_VECRET_or_GSPTR(arg)) ) {
6893 /* ignore this arg */
6894 } else {
6895 here = schemeE( mce, arg );
6896 curr = gen_maxU32( mce, curr, here );
6900 /* Inputs: guest state that we read. */
6901 for (i = 0; i < d->nFxState; i++) {
6902 tl_assert(d->fxState[i].fx != Ifx_None);
6903 if (d->fxState[i].fx == Ifx_Write)
6904 continue;
6906 /* Enumerate the described state segments */
6907 for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
6908 gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
6909 gSz = d->fxState[i].size;
6911 /* Ignore any sections marked as 'always defined'. */
6912 if (isAlwaysDefd(mce, gOff, gSz)) {
6913 if (0)
6914 VG_(printf)("memcheck: Dirty gst: ignored off %d, sz %d\n",
6915 gOff, gSz);
6916 continue;
6919 /* This state element is read or modified. So we need to
6920 consider it. If larger than 4 bytes, deal with it in
6921 4-byte chunks. */
6922 while (True) {
6923 Int b_offset;
6924 tl_assert(gSz >= 0);
6925 if (gSz == 0) break;
6926 n = gSz <= 4 ? gSz : 4;
6927 /* update 'curr' with maxU32 of the state slice
6928 gOff .. gOff+n-1 */
6929 b_offset = MC_(get_otrack_shadow_offset)(gOff, 4);
6930 if (b_offset != -1) {
6931 /* Observe the guard expression. If it is false use 0, i.e.
6932 nothing is known about the origin */
6933 IRAtom *cond, *iffalse, *iftrue;
6935 cond = assignNew( 'B', mce, Ity_I1, d->guard);
6936 iffalse = mkU32(0);
6937 iftrue = assignNew( 'B', mce, Ity_I32,
6938 IRExpr_Get(b_offset
6939 + 2*mce->layout->total_sizeB,
6940 Ity_I32));
6941 here = assignNew( 'B', mce, Ity_I32,
6942 IRExpr_ITE(cond, iftrue, iffalse));
6943 curr = gen_maxU32( mce, curr, here );
6945 gSz -= n;
6946 gOff += n;
6951 /* Inputs: memory */
6953 if (d->mFx != Ifx_None) {
6954 /* Because we may do multiple shadow loads/stores from the same
6955 base address, it's best to do a single test of its
6956 definedness right now. Post-instrumentation optimisation
6957 should remove all but this test. */
6958 tl_assert(d->mAddr);
6959 here = schemeE( mce, d->mAddr );
6960 curr = gen_maxU32( mce, curr, here );
6963 /* Deal with memory inputs (reads or modifies) */
6964 if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify) {
6965 toDo = d->mSize;
6966 /* chew off 32-bit chunks. We don't care about the endianness
6967 since it's all going to be condensed down to a single bit,
6968 but nevertheless choose an endianness which is hopefully
6969 native to the platform. */
6970 while (toDo >= 4) {
6971 here = gen_guarded_load_b( mce, 4, d->mAddr, d->mSize - toDo,
6972 d->guard );
6973 curr = gen_maxU32( mce, curr, here );
6974 toDo -= 4;
6976 /* handle possible 16-bit excess */
6977 while (toDo >= 2) {
6978 here = gen_guarded_load_b( mce, 2, d->mAddr, d->mSize - toDo,
6979 d->guard );
6980 curr = gen_maxU32( mce, curr, here );
6981 toDo -= 2;
6983 /* chew off the remaining 8-bit chunk, if any */
6984 if (toDo == 1) {
6985 here = gen_guarded_load_b( mce, 1, d->mAddr, d->mSize - toDo,
6986 d->guard );
6987 curr = gen_maxU32( mce, curr, here );
6988 toDo -= 1;
6990 tl_assert(toDo == 0);
6993 /* Whew! So curr is a 32-bit B-value which should give an origin
6994 of some use if any of the inputs to the helper are undefined.
6995 Now we need to re-distribute the results to all destinations. */
6997 /* Outputs: the destination temporary, if there is one. */
6998 if (d->tmp != IRTemp_INVALID) {
6999 dst = findShadowTmpB(mce, d->tmp);
7000 assign( 'V', mce, dst, curr );
7003 /* Outputs: guest state that we write or modify. */
7004 for (i = 0; i < d->nFxState; i++) {
7005 tl_assert(d->fxState[i].fx != Ifx_None);
7006 if (d->fxState[i].fx == Ifx_Read)
7007 continue;
7009 /* Enumerate the described state segments */
7010 for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
7011 gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
7012 gSz = d->fxState[i].size;
7014 /* Ignore any sections marked as 'always defined'. */
7015 if (isAlwaysDefd(mce, gOff, gSz))
7016 continue;
7018 /* This state element is written or modified. So we need to
7019 consider it. If larger than 4 bytes, deal with it in
7020 4-byte chunks. */
7021 while (True) {
7022 Int b_offset;
7023 tl_assert(gSz >= 0);
7024 if (gSz == 0) break;
7025 n = gSz <= 4 ? gSz : 4;
7026 /* Write 'curr' to the state slice gOff .. gOff+n-1 */
7027 b_offset = MC_(get_otrack_shadow_offset)(gOff, 4);
7028 if (b_offset != -1) {
7030 /* If the guard expression evaluates to false we simply Put
7031 the value that is already stored in the guest state slot */
7032 IRAtom *cond, *iffalse;
7034 cond = assignNew('B', mce, Ity_I1,
7035 d->guard);
7036 iffalse = assignNew('B', mce, Ity_I32,
7037 IRExpr_Get(b_offset +
7038 2*mce->layout->total_sizeB,
7039 Ity_I32));
7040 curr = assignNew('V', mce, Ity_I32,
7041 IRExpr_ITE(cond, curr, iffalse));
7043 stmt( 'B', mce, IRStmt_Put(b_offset
7044 + 2*mce->layout->total_sizeB,
7045 curr ));
7047 gSz -= n;
7048 gOff += n;
7053 /* Outputs: memory that we write or modify. Same comments about
7054 endianness as above apply. */
7055 if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify) {
7056 toDo = d->mSize;
7057 /* chew off 32-bit chunks */
7058 while (toDo >= 4) {
7059 gen_store_b( mce, 4, d->mAddr, d->mSize - toDo, curr,
7060 d->guard );
7061 toDo -= 4;
7063 /* handle possible 16-bit excess */
7064 while (toDo >= 2) {
7065 gen_store_b( mce, 2, d->mAddr, d->mSize - toDo, curr,
7066 d->guard );
7067 toDo -= 2;
7069 /* chew off the remaining 8-bit chunk, if any */
7070 if (toDo == 1) {
7071 gen_store_b( mce, 1, d->mAddr, d->mSize - toDo, curr,
7072 d->guard );
7073 toDo -= 1;
7075 tl_assert(toDo == 0);
7080 /* Generate IR for origin shadowing for a general guarded store. */
7081 static void do_origins_Store_guarded ( MCEnv* mce,
7082 IREndness stEnd,
7083 IRExpr* stAddr,
7084 IRExpr* stData,
7085 IRExpr* guard )
7087 Int dszB;
7088 IRAtom* dataB;
7089 /* assert that the B value for the address is already available
7090 (somewhere), since the call to schemeE will want to see it.
7091 XXXX how does this actually ensure that?? */
7092 tl_assert(isIRAtom(stAddr));
7093 tl_assert(isIRAtom(stData));
7094 dszB = sizeofIRType( typeOfIRExpr(mce->sb->tyenv, stData ) );
7095 dataB = schemeE( mce, stData );
7096 gen_store_b( mce, dszB, stAddr, 0/*offset*/, dataB, guard );
7100 /* Generate IR for origin shadowing for a plain store. */
7101 static void do_origins_Store_plain ( MCEnv* mce,
7102 IREndness stEnd,
7103 IRExpr* stAddr,
7104 IRExpr* stData )
7106 do_origins_Store_guarded ( mce, stEnd, stAddr, stData,
7107 NULL/*guard*/ );
7111 /* ---- Dealing with LoadG/StoreG (not entirely simple) ---- */
7113 static void do_origins_StoreG ( MCEnv* mce, IRStoreG* sg )
7115 do_origins_Store_guarded( mce, sg->end, sg->addr,
7116 sg->data, sg->guard );
7119 static void do_origins_LoadG ( MCEnv* mce, IRLoadG* lg )
7121 IRType loadedTy = Ity_INVALID;
7122 switch (lg->cvt) {
7123 case ILGop_IdentV128: loadedTy = Ity_V128; break;
7124 case ILGop_Ident64: loadedTy = Ity_I64; break;
7125 case ILGop_Ident32: loadedTy = Ity_I32; break;
7126 case ILGop_16Uto32: loadedTy = Ity_I16; break;
7127 case ILGop_16Sto32: loadedTy = Ity_I16; break;
7128 case ILGop_8Uto32: loadedTy = Ity_I8; break;
7129 case ILGop_8Sto32: loadedTy = Ity_I8; break;
7130 default: VG_(tool_panic)("schemeS.IRLoadG");
7132 IRAtom* ori_alt
7133 = schemeE( mce,lg->alt );
7134 IRAtom* ori_final
7135 = expr2ori_Load_guarded_General(mce, loadedTy,
7136 lg->addr, 0/*addr bias*/,
7137 lg->guard, ori_alt );
7138 /* And finally, bind the origin to the destination temporary. */
7139 assign( 'B', mce, findShadowTmpB(mce, lg->dst), ori_final );
7143 static void schemeS ( MCEnv* mce, IRStmt* st )
7145 tl_assert(MC_(clo_mc_level) == 3);
7147 switch (st->tag) {
7149 case Ist_AbiHint:
7150 /* The value-check instrumenter handles this - by arranging
7151 to pass the address of the next instruction to
7152 MC_(helperc_MAKE_STACK_UNINIT). This is all that needs to
7153 happen for origin tracking w.r.t. AbiHints. So there is
7154 nothing to do here. */
7155 break;
7157 case Ist_PutI: {
7158 IRPutI *puti = st->Ist.PutI.details;
7159 IRRegArray* descr_b;
7160 IRAtom *t1, *t2, *t3, *t4;
7161 IRRegArray* descr = puti->descr;
7162 IRType equivIntTy
7163 = MC_(get_otrack_reg_array_equiv_int_type)(descr);
7164 /* If this array is unshadowable for whatever reason,
7165 generate no code. */
7166 if (equivIntTy == Ity_INVALID)
7167 break;
7168 tl_assert(sizeofIRType(equivIntTy) >= 4);
7169 tl_assert(sizeofIRType(equivIntTy) == sizeofIRType(descr->elemTy));
7170 descr_b
7171 = mkIRRegArray( descr->base + 2*mce->layout->total_sizeB,
7172 equivIntTy, descr->nElems );
7173 /* Compute a value to Put - the conjoinment of the origin for
7174 the data to be Put-ted (obviously) and of the index value
7175 (not so obviously). */
7176 t1 = schemeE( mce, puti->data );
7177 t2 = schemeE( mce, puti->ix );
7178 t3 = gen_maxU32( mce, t1, t2 );
7179 t4 = zWidenFrom32( mce, equivIntTy, t3 );
7180 stmt( 'B', mce, IRStmt_PutI( mkIRPutI(descr_b, puti->ix,
7181 puti->bias, t4) ));
7182 break;
7185 case Ist_Dirty:
7186 do_origins_Dirty( mce, st->Ist.Dirty.details );
7187 break;
7189 case Ist_Store:
7190 do_origins_Store_plain( mce, st->Ist.Store.end,
7191 st->Ist.Store.addr,
7192 st->Ist.Store.data );
7193 break;
7195 case Ist_StoreG:
7196 do_origins_StoreG( mce, st->Ist.StoreG.details );
7197 break;
7199 case Ist_LoadG:
7200 do_origins_LoadG( mce, st->Ist.LoadG.details );
7201 break;
7203 case Ist_LLSC: {
7204 /* In short: treat a load-linked like a normal load followed
7205 by an assignment of the loaded (shadow) data to the result
7206 temporary. Treat a store-conditional like a normal store,
7207 and mark the result temporary as defined. */
7208 if (st->Ist.LLSC.storedata == NULL) {
7209 /* Load Linked */
7210 IRType resTy
7211 = typeOfIRTemp(mce->sb->tyenv, st->Ist.LLSC.result);
7212 IRExpr* vanillaLoad
7213 = IRExpr_Load(st->Ist.LLSC.end, resTy, st->Ist.LLSC.addr);
7214 tl_assert(resTy == Ity_I64 || resTy == Ity_I32
7215 || resTy == Ity_I16 || resTy == Ity_I8);
7216 assign( 'B', mce, findShadowTmpB(mce, st->Ist.LLSC.result),
7217 schemeE(mce, vanillaLoad));
7218 } else {
7219 /* Store conditional */
7220 do_origins_Store_plain( mce, st->Ist.LLSC.end,
7221 st->Ist.LLSC.addr,
7222 st->Ist.LLSC.storedata );
7223 /* For the rationale behind this, see comments at the
7224 place where the V-shadow for .result is constructed, in
7225 do_shadow_LLSC. In short, we regard .result as
7226 always-defined. */
7227 assign( 'B', mce, findShadowTmpB(mce, st->Ist.LLSC.result),
7228 mkU32(0) );
7230 break;
7233 case Ist_Put: {
7234 Int b_offset
7235 = MC_(get_otrack_shadow_offset)(
7236 st->Ist.Put.offset,
7237 sizeofIRType(typeOfIRExpr(mce->sb->tyenv, st->Ist.Put.data))
7239 if (b_offset >= 0) {
7240 /* FIXME: this isn't an atom! */
7241 stmt( 'B', mce, IRStmt_Put(b_offset + 2*mce->layout->total_sizeB,
7242 schemeE( mce, st->Ist.Put.data )) );
7244 break;
7247 case Ist_WrTmp:
7248 assign( 'B', mce, findShadowTmpB(mce, st->Ist.WrTmp.tmp),
7249 schemeE(mce, st->Ist.WrTmp.data) );
7250 break;
7252 case Ist_MBE:
7253 case Ist_NoOp:
7254 case Ist_Exit:
7255 case Ist_IMark:
7256 break;
7258 default:
7259 VG_(printf)("mc_translate.c: schemeS: unhandled: ");
7260 ppIRStmt(st);
7261 VG_(tool_panic)("memcheck:schemeS");
7266 /*------------------------------------------------------------*/
7267 /*--- Post-tree-build final tidying ---*/
7268 /*------------------------------------------------------------*/
7270 /* This exploits the observation that Memcheck often produces
7271 repeated conditional calls of the form
7273 Dirty G MC_(helperc_value_check0/1/4/8_fail)(UInt otag)
7275 with the same guard expression G guarding the same helper call.
7276 The second and subsequent calls are redundant. This usually
7277 results from instrumentation of guest code containing multiple
7278 memory references at different constant offsets from the same base
7279 register. After optimisation of the instrumentation, you get a
7280 test for the definedness of the base register for each memory
7281 reference, which is kinda pointless. MC_(final_tidy) therefore
7282 looks for such repeated calls and removes all but the first. */
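/* Purely illustrative sketch of the redundancy being removed (not taken
   from any particular guest block).  After instrumenting two loads at
   constant offsets from the same base register, the IR can contain

      Dirty G MC_(helperc_value_check8_fail_no_o)()
      ... first memory reference ...
      Dirty G MC_(helperc_value_check8_fail_no_o)()   <-- redundant
      ... second memory reference ...

   with the same guard expression G.  MC_(final_tidy) keeps the first
   call and rewrites the later duplicate into an Ist_NoOp. */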
7285 /* With some testing on perf/bz2.c, on amd64 and x86, compiled with
7286 gcc-5.3.1 -O2, it appears that 16 entries in the array are enough to
7287 get almost all the benefits of this transformation whilst causing
7288 the slide-back case to occur just often enough to be verifiably
7289 correct. For posterity, the numbers are:
7291 bz2-32
7293 1 4,336 (112,212 -> 1,709,473; ratio 15.2)
7294 2 4,336 (112,194 -> 1,669,895; ratio 14.9)
7295 3 4,336 (112,194 -> 1,660,713; ratio 14.8)
7296 4 4,336 (112,194 -> 1,658,555; ratio 14.8)
7297 5 4,336 (112,194 -> 1,655,447; ratio 14.8)
7298 6 4,336 (112,194 -> 1,655,101; ratio 14.8)
7299 7 4,336 (112,194 -> 1,654,858; ratio 14.7)
7300 8 4,336 (112,194 -> 1,654,810; ratio 14.7)
7301 10 4,336 (112,194 -> 1,654,621; ratio 14.7)
7302 12 4,336 (112,194 -> 1,654,678; ratio 14.7)
7303 16 4,336 (112,194 -> 1,654,494; ratio 14.7)
7304 32 4,336 (112,194 -> 1,654,602; ratio 14.7)
7305 inf 4,336 (112,194 -> 1,654,602; ratio 14.7)
7307 bz2-64
7309 1 4,113 (107,329 -> 1,822,171; ratio 17.0)
7310 2 4,113 (107,329 -> 1,806,443; ratio 16.8)
7311 3 4,113 (107,329 -> 1,803,967; ratio 16.8)
7312 4 4,113 (107,329 -> 1,802,785; ratio 16.8)
7313 5 4,113 (107,329 -> 1,802,412; ratio 16.8)
7314 6 4,113 (107,329 -> 1,802,062; ratio 16.8)
7315 7 4,113 (107,329 -> 1,801,976; ratio 16.8)
7316 8 4,113 (107,329 -> 1,801,886; ratio 16.8)
7317 10 4,113 (107,329 -> 1,801,653; ratio 16.8)
7318 12 4,113 (107,329 -> 1,801,526; ratio 16.8)
7319 16 4,113 (107,329 -> 1,801,298; ratio 16.8)
7320 32 4,113 (107,329 -> 1,800,827; ratio 16.8)
7321 inf 4,113 (107,329 -> 1,800,827; ratio 16.8)
7324 /* Structs for recording which (helper, guard) pairs we have already
7325 seen. */
7327 #define N_TIDYING_PAIRS 16
7329 typedef
7330 struct { void* entry; IRExpr* guard; }
7331 Pair;
7333 typedef
7334 struct {
7335 Pair pairs[N_TIDYING_PAIRS +1/*for bounds checking*/];
7336 UInt pairsUsed;
7338 Pairs;
7341 /* Return True if e1 and e2 definitely denote the same value (used to
7342 compare guards). Return False if unknown; False is the safe
7343 answer. Since guest registers and guest memory do not have the
7344 SSA property we must return False if any Gets or Loads appear in
7345 the expression. This implicitly assumes that e1 and e2 have the
7346 same IR type, which is always true here -- the type is Ity_I1. */
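/* As an illustration (hypothetical tmp number): two guards both of the
   form CmpNE32(t7,0x0:I32) are judged to denote the same value here,
   since they are built from the same RdTmp and an equal constant.  By
   contrast a guard containing a Get or a Load is never judged equal to
   anything, even a textually identical guard, because the underlying
   register or memory may have changed in between. */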
7348 static Bool sameIRValue ( IRExpr* e1, IRExpr* e2 )
7350 if (e1->tag != e2->tag)
7351 return False;
7352 switch (e1->tag) {
7353 case Iex_Const:
7354 return eqIRConst( e1->Iex.Const.con, e2->Iex.Const.con );
7355 case Iex_Binop:
7356 return e1->Iex.Binop.op == e2->Iex.Binop.op
7357 && sameIRValue(e1->Iex.Binop.arg1, e2->Iex.Binop.arg1)
7358 && sameIRValue(e1->Iex.Binop.arg2, e2->Iex.Binop.arg2);
7359 case Iex_Unop:
7360 return e1->Iex.Unop.op == e2->Iex.Unop.op
7361 && sameIRValue(e1->Iex.Unop.arg, e2->Iex.Unop.arg);
7362 case Iex_RdTmp:
7363 return e1->Iex.RdTmp.tmp == e2->Iex.RdTmp.tmp;
7364 case Iex_ITE:
7365 return sameIRValue( e1->Iex.ITE.cond, e2->Iex.ITE.cond )
7366 && sameIRValue( e1->Iex.ITE.iftrue, e2->Iex.ITE.iftrue )
7367 && sameIRValue( e1->Iex.ITE.iffalse, e2->Iex.ITE.iffalse );
7368 case Iex_Qop:
7369 case Iex_Triop:
7370 case Iex_CCall:
7371 /* be lazy. Could define equality for these, but they never
7372 appear to be used. */
7373 return False;
7374 case Iex_Get:
7375 case Iex_GetI:
7376 case Iex_Load:
7377 /* be conservative - these may not give the same value each
7378 time */
7379 return False;
7380 case Iex_Binder:
7381 /* should never see this */
7382 /* fallthrough */
7383 default:
7384 VG_(printf)("mc_translate.c: sameIRValue: unhandled: ");
7385 ppIRExpr(e1);
7386 VG_(tool_panic)("memcheck:sameIRValue");
7387 return False;
7391 /* See if 'pairs' already has an entry for (entry, guard). Return
7392 True if so. If not, add an entry. */
7394 static
7395 Bool check_or_add ( Pairs* tidyingEnv, IRExpr* guard, void* entry )
7397 UInt i, n = tidyingEnv->pairsUsed;
7398 tl_assert(n <= N_TIDYING_PAIRS);
7399 for (i = 0; i < n; i++) {
7400 if (tidyingEnv->pairs[i].entry == entry
7401 && sameIRValue(tidyingEnv->pairs[i].guard, guard))
7402 return True;
7404 /* (guard, entry) wasn't found in the array. Add it at the end.
7405 If the array is already full, slide the entries one slot
7406 backwards. This means we will lose the ability to detect
7407 duplicates from the pair in slot zero, but that happens so
7408 rarely that it's unlikely to have much effect on overall code
7409 quality. Also, this strategy loses the check for the oldest
7410 tracked exit (memory reference, basically), which is (I'd
7411 guess) the one least likely to be re-used after this point. */
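/* Illustrative sketch of the slide (with N_TIDYING_PAIRS == 16, as
   defined above): when the array is full, pairs[1..15] are copied down
   to pairs[0..14] and the new (entry, guard) goes into pairs[15], so
   the previously oldest pair, in slot zero, is forgotten. */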
7412 tl_assert(i == n);
7413 if (n == N_TIDYING_PAIRS) {
7414 for (i = 1; i < N_TIDYING_PAIRS; i++) {
7415 tidyingEnv->pairs[i-1] = tidyingEnv->pairs[i];
7417 tidyingEnv->pairs[N_TIDYING_PAIRS-1].entry = entry;
7418 tidyingEnv->pairs[N_TIDYING_PAIRS-1].guard = guard;
7419 } else {
7420 tl_assert(n < N_TIDYING_PAIRS);
7421 tidyingEnv->pairs[n].entry = entry;
7422 tidyingEnv->pairs[n].guard = guard;
7423 n++;
7424 tidyingEnv->pairsUsed = n;
7426 return False;
7429 static Bool is_helperc_value_checkN_fail ( const HChar* name )
7431 /* This is expensive because it happens a lot. We are checking to
7432 see whether |name| is one of the following 8 strings:
7434 MC_(helperc_value_check8_fail_no_o)
7435 MC_(helperc_value_check4_fail_no_o)
7436 MC_(helperc_value_check0_fail_no_o)
7437 MC_(helperc_value_check1_fail_no_o)
7438 MC_(helperc_value_check8_fail_w_o)
7439 MC_(helperc_value_check0_fail_w_o)
7440 MC_(helperc_value_check1_fail_w_o)
7441 MC_(helperc_value_check4_fail_w_o)
7443 To speed it up, check the common prefix just once, rather than
7444 all 8 times.
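 
   For example (just tracing the code below): "MC_(helperc_b_load4)" is
   rejected inside the prefix loop as soon as its 'b' fails to match the
   'v' in the prefix, with no VG_(strcmp) calls at all, whereas a
   genuine target falls out of the loop and has its short suffix
   matched against the eight possibilities.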
7446 const HChar* prefix = "MC_(helperc_value_check";
7448 HChar n, p;
7449 while (True) {
7450 n = *name;
7451 p = *prefix;
7452 if (p == 0) break; /* ran off the end of the prefix */
7453 /* We still have some prefix to use */
7454 if (n == 0) return False; /* have prefix, but name ran out */
7455 if (n != p) return False; /* have both pfx and name, but no match */
7456 name++;
7457 prefix++;
7460 /* Check the part after the prefix. */
7461 tl_assert(*prefix == 0 && *name != 0);
7462 return 0==VG_(strcmp)(name, "8_fail_no_o)")
7463 || 0==VG_(strcmp)(name, "4_fail_no_o)")
7464 || 0==VG_(strcmp)(name, "0_fail_no_o)")
7465 || 0==VG_(strcmp)(name, "1_fail_no_o)")
7466 || 0==VG_(strcmp)(name, "8_fail_w_o)")
7467 || 0==VG_(strcmp)(name, "4_fail_w_o)")
7468 || 0==VG_(strcmp)(name, "0_fail_w_o)")
7469 || 0==VG_(strcmp)(name, "1_fail_w_o)");
7472 IRSB* MC_(final_tidy) ( IRSB* sb_in )
7474 Int i;
7475 IRStmt* st;
7476 IRDirty* di;
7477 IRExpr* guard;
7478 IRCallee* cee;
7479 Bool alreadyPresent;
7480 Pairs pairs;
7482 pairs.pairsUsed = 0;
7484 pairs.pairs[N_TIDYING_PAIRS].entry = (void*)0x123;
7485 pairs.pairs[N_TIDYING_PAIRS].guard = (IRExpr*)0x456;
7487 /* Scan forwards through the statements. Each time a call to one
7488 of the relevant helpers is seen, check if we have made a
7489 previous call to the same helper using the same guard
7490 expression, and if so, delete the call. */
7491 for (i = 0; i < sb_in->stmts_used; i++) {
7492 st = sb_in->stmts[i];
7493 tl_assert(st);
7494 if (st->tag != Ist_Dirty)
7495 continue;
7496 di = st->Ist.Dirty.details;
7497 guard = di->guard;
7498 tl_assert(guard);
7499 if (0) { ppIRExpr(guard); VG_(printf)("\n"); }
7500 cee = di->cee;
7501 if (!is_helperc_value_checkN_fail( cee->name ))
7502 continue;
7503 /* Ok, we have a call to helperc_value_check0/1/4/8_fail with
7504 guard 'guard'. Check if we have already seen a call to this
7505 function with the same guard. If so, delete it. If not,
7506 add it to the set of calls we do know about. */
7507 alreadyPresent = check_or_add( &pairs, guard, cee->addr );
7508 if (alreadyPresent) {
7509 sb_in->stmts[i] = IRStmt_NoOp();
7510 if (0) VG_(printf)("XX\n");
7514 tl_assert(pairs.pairs[N_TIDYING_PAIRS].entry == (void*)0x123);
7515 tl_assert(pairs.pairs[N_TIDYING_PAIRS].guard == (IRExpr*)0x456);
7517 return sb_in;
7520 #undef N_TIDYING_PAIRS
7523 /*------------------------------------------------------------*/
7524 /*--- Startup assertion checking ---*/
7525 /*------------------------------------------------------------*/
7527 void MC_(do_instrumentation_startup_checks)( void )
7529 /* Make a best-effort check to see that is_helperc_value_checkN_fail
7530 is working as we expect. */
7532 # define CHECK(_expected, _string) \
7533 tl_assert((_expected) == is_helperc_value_checkN_fail(_string))
7535 /* It should identify these 8, and no others, as targets. */
7536 CHECK(True, "MC_(helperc_value_check8_fail_no_o)");
7537 CHECK(True, "MC_(helperc_value_check4_fail_no_o)");
7538 CHECK(True, "MC_(helperc_value_check0_fail_no_o)");
7539 CHECK(True, "MC_(helperc_value_check1_fail_no_o)");
7540 CHECK(True, "MC_(helperc_value_check8_fail_w_o)");
7541 CHECK(True, "MC_(helperc_value_check0_fail_w_o)");
7542 CHECK(True, "MC_(helperc_value_check1_fail_w_o)");
7543 CHECK(True, "MC_(helperc_value_check4_fail_w_o)");
7545 /* Ad-hoc selection of other strings gathered via a quick test. */
7546 CHECK(False, "amd64g_dirtyhelper_CPUID_avx2");
7547 CHECK(False, "amd64g_dirtyhelper_RDTSC");
7548 CHECK(False, "MC_(helperc_b_load1)");
7549 CHECK(False, "MC_(helperc_b_load2)");
7550 CHECK(False, "MC_(helperc_b_load4)");
7551 CHECK(False, "MC_(helperc_b_load8)");
7552 CHECK(False, "MC_(helperc_b_load16)");
7553 CHECK(False, "MC_(helperc_b_load32)");
7554 CHECK(False, "MC_(helperc_b_store1)");
7555 CHECK(False, "MC_(helperc_b_store2)");
7556 CHECK(False, "MC_(helperc_b_store4)");
7557 CHECK(False, "MC_(helperc_b_store8)");
7558 CHECK(False, "MC_(helperc_b_store16)");
7559 CHECK(False, "MC_(helperc_b_store32)");
7560 CHECK(False, "MC_(helperc_LOADV8)");
7561 CHECK(False, "MC_(helperc_LOADV16le)");
7562 CHECK(False, "MC_(helperc_LOADV32le)");
7563 CHECK(False, "MC_(helperc_LOADV64le)");
7564 CHECK(False, "MC_(helperc_LOADV128le)");
7565 CHECK(False, "MC_(helperc_LOADV256le)");
7566 CHECK(False, "MC_(helperc_STOREV16le)");
7567 CHECK(False, "MC_(helperc_STOREV32le)");
7568 CHECK(False, "MC_(helperc_STOREV64le)");
7569 CHECK(False, "MC_(helperc_STOREV8)");
7570 CHECK(False, "track_die_mem_stack_8");
7571 CHECK(False, "track_new_mem_stack_8_w_ECU");
7572 CHECK(False, "MC_(helperc_MAKE_STACK_UNINIT_w_o)");
7573 CHECK(False, "VG_(unknown_SP_update_w_ECU)");
7575 # undef CHECK
7579 /*------------------------------------------------------------*/
7580 /*--- Memcheck main ---*/
7581 /*------------------------------------------------------------*/
7583 static Bool isBogusAtom ( IRAtom* at )
7585 if (at->tag == Iex_RdTmp)
7586 return False;
7587 tl_assert(at->tag == Iex_Const);
7589 ULong n = 0;
7590 IRConst* con = at->Iex.Const.con;
7591 switch (con->tag) {
7592 case Ico_U1: return False;
7593 case Ico_U8: n = (ULong)con->Ico.U8; break;
7594 case Ico_U16: n = (ULong)con->Ico.U16; break;
7595 case Ico_U32: n = (ULong)con->Ico.U32; break;
7596 case Ico_U64: n = (ULong)con->Ico.U64; break;
7597 case Ico_F32: return False;
7598 case Ico_F64: return False;
7599 case Ico_F32i: return False;
7600 case Ico_F64i: return False;
7601 case Ico_V128: return False;
7602 case Ico_V256: return False;
7603 default: ppIRExpr(at); tl_assert(0);
7605 /* VG_(printf)("%llx\n", n); */
7606 /* Shortcuts */
7607 if (LIKELY(n <= 0x0000000000001000ULL)) return False;
7608 if (LIKELY(n >= 0xFFFFFFFFFFFFF000ULL)) return False;
7609 /* The list of bogus atoms is: */
7610 return (/*32*/ n == 0xFEFEFEFFULL
7611 /*32*/ || n == 0x80808080ULL
7612 /*32*/ || n == 0x7F7F7F7FULL
7613 /*32*/ || n == 0x7EFEFEFFULL
7614 /*32*/ || n == 0x81010100ULL
7615 /*64*/ || n == 0xFFFFFFFFFEFEFEFFULL
7616 /*64*/ || n == 0xFEFEFEFEFEFEFEFFULL
7617 /*64*/ || n == 0x0000000000008080ULL
7618 /*64*/ || n == 0x8080808080808080ULL
7619 /*64*/ || n == 0x0101010101010101ULL
7624 /* Does 'st' mention any of the literals identified/listed in
7625 isBogusAtom()? */
7626 static inline Bool containsBogusLiterals ( /*FLAT*/ IRStmt* st )
7628 Int i;
7629 IRExpr* e;
7630 IRDirty* d;
7631 IRCAS* cas;
7632 switch (st->tag) {
7633 case Ist_WrTmp:
7634 e = st->Ist.WrTmp.data;
7635 switch (e->tag) {
7636 case Iex_Get:
7637 case Iex_RdTmp:
7638 return False;
7639 case Iex_Const:
7640 return isBogusAtom(e);
7641 case Iex_Unop:
7642 return isBogusAtom(e->Iex.Unop.arg)
7643 || e->Iex.Unop.op == Iop_GetMSBs8x16;
7644 case Iex_GetI:
7645 return isBogusAtom(e->Iex.GetI.ix);
7646 case Iex_Binop:
7647 return isBogusAtom(e->Iex.Binop.arg1)
7648 || isBogusAtom(e->Iex.Binop.arg2);
7649 case Iex_Triop:
7650 return isBogusAtom(e->Iex.Triop.details->arg1)
7651 || isBogusAtom(e->Iex.Triop.details->arg2)
7652 || isBogusAtom(e->Iex.Triop.details->arg3);
7653 case Iex_Qop:
7654 return isBogusAtom(e->Iex.Qop.details->arg1)
7655 || isBogusAtom(e->Iex.Qop.details->arg2)
7656 || isBogusAtom(e->Iex.Qop.details->arg3)
7657 || isBogusAtom(e->Iex.Qop.details->arg4);
7658 case Iex_ITE:
7659 return isBogusAtom(e->Iex.ITE.cond)
7660 || isBogusAtom(e->Iex.ITE.iftrue)
7661 || isBogusAtom(e->Iex.ITE.iffalse);
7662 case Iex_Load:
7663 return isBogusAtom(e->Iex.Load.addr);
7664 case Iex_CCall:
7665 for (i = 0; e->Iex.CCall.args[i]; i++)
7666 if (isBogusAtom(e->Iex.CCall.args[i]))
7667 return True;
7668 return False;
7669 default:
7670 goto unhandled;
7672 case Ist_Dirty:
7673 d = st->Ist.Dirty.details;
7674 for (i = 0; d->args[i]; i++) {
7675 IRAtom* atom = d->args[i];
7676 if (LIKELY(!is_IRExpr_VECRET_or_GSPTR(atom))) {
7677 if (isBogusAtom(atom))
7678 return True;
7681 if (isBogusAtom(d->guard))
7682 return True;
7683 if (d->mAddr && isBogusAtom(d->mAddr))
7684 return True;
7685 return False;
7686 case Ist_Put:
7687 return isBogusAtom(st->Ist.Put.data);
7688 case Ist_PutI:
7689 return isBogusAtom(st->Ist.PutI.details->ix)
7690 || isBogusAtom(st->Ist.PutI.details->data);
7691 case Ist_Store:
7692 return isBogusAtom(st->Ist.Store.addr)
7693 || isBogusAtom(st->Ist.Store.data);
7694 case Ist_StoreG: {
7695 IRStoreG* sg = st->Ist.StoreG.details;
7696 return isBogusAtom(sg->addr) || isBogusAtom(sg->data)
7697 || isBogusAtom(sg->guard);
7699 case Ist_LoadG: {
7700 IRLoadG* lg = st->Ist.LoadG.details;
7701 return isBogusAtom(lg->addr) || isBogusAtom(lg->alt)
7702 || isBogusAtom(lg->guard);
7704 case Ist_Exit:
7705 return isBogusAtom(st->Ist.Exit.guard);
7706 case Ist_AbiHint:
7707 return isBogusAtom(st->Ist.AbiHint.base)
7708 || isBogusAtom(st->Ist.AbiHint.nia);
7709 case Ist_NoOp:
7710 case Ist_IMark:
7711 case Ist_MBE:
7712 return False;
7713 case Ist_CAS:
7714 cas = st->Ist.CAS.details;
7715 return isBogusAtom(cas->addr)
7716 || (cas->expdHi ? isBogusAtom(cas->expdHi) : False)
7717 || isBogusAtom(cas->expdLo)
7718 || (cas->dataHi ? isBogusAtom(cas->dataHi) : False)
7719 || isBogusAtom(cas->dataLo);
7720 case Ist_LLSC:
7721 return isBogusAtom(st->Ist.LLSC.addr)
7722 || (st->Ist.LLSC.storedata
7723 ? isBogusAtom(st->Ist.LLSC.storedata)
7724 : False);
7725 default:
7726 unhandled:
7727 ppIRStmt(st);
7728 VG_(tool_panic)("hasBogusLiterals");
7733 /* This is the pre-instrumentation analysis. It does a backwards pass over
7734 the stmts in |sb_in| to determine a HowUsed value for each tmp defined in
7735 the block.
7737 Unrelatedly, it also checks all literals in the block with |isBogusAtom|,
7738 as a positive result from that is a strong indication that we need to
7739 expensively instrument add/sub in the block. We do both analyses in one
7740 pass, even though they are independent, so as to avoid the overhead of
7741 having to traverse the whole block twice.
7743 The usage pass proceeds as follows. Let max= be the max operation in the
7744 HowUsed lattice, hence
7746 X max= Y means X = max(X, Y)
7748 then
7750 for t in original tmps . useEnv[t] = HuUnU
7752 for t used in the block's .next field
7753 useEnv[t] max= HuPCa // because jmp targets are PCast-tested
7755 for st iterating *backwards* in the block
7757 match st
7759 case "t1 = load(t2)" // case 1
7760 useEnv[t2] max= HuPCa
7762 case "t1 = add(t2, t3)" // case 2
7763 useEnv[t2] max= useEnv[t1]
7764 useEnv[t3] max= useEnv[t1]
7766 other
7767 for t in st.usedTmps // case 3
7768 useEnv[t] max= HuOth
7769 // same as useEnv[t] = HuOth
7771 The general idea is that we accumulate, in useEnv[], information about
7772 how each tmp is used. That can be updated as we work further back
7773 through the block and find more uses of it, but its HowUsed value can
7774 only ascend the lattice, not descend.
7776 Initially we mark all tmps as unused. In case (1), if a tmp is seen to
7777 be used as a memory address, then its use is at least HuPCa. The point
7778 is that for a memory address we will add instrumentation to check if any
7779 bit of the address is undefined, which means that we won't need expensive
7780 V-bit propagation through an add expression that computed the address --
7781 cheap add instrumentation will be equivalent.
7783 Note in case (1) that if we have previously seen a non-memory-address use
7784 of the tmp, then its use will already be HuOth and will be unchanged by
7785 the max= operation. And if it turns out that the source of the tmp was
7786 an add, then we'll have to expensively instrument the add, because we
7787 can't prove that, for the previous non-memory-address use of the tmp,
7788 cheap and expensive instrumentation will be equivalent.
7790 In case 2, we propagate the usage-mode of the result of an add back
7791 through to its operands. Again, we use max= so as to take account of the
7792 fact that t2 or t3 might later in the block (viz, earlier in the
7793 iteration) have been used in a way that requires expensive add
7794 instrumentation.
7796 In case 3, we deal with all other tmp uses. We assume that we'll need a
7797 result that is as accurate as possible, so we max= HuOth into its use
7798 mode. Since HuOth is the top of the lattice, that's equivalent to just
7799 setting its use to HuOth.
7801 The net result of all this is that:
7803 tmps that are used either
7804 - only as a memory address, or
7805 - only as part of a tree of adds that computes a memory address,
7806 and has no other use
7807 are marked as HuPCa, and so we can instrument their generating Add
7808 nodes cheaply, which is the whole point of this analysis
7810 tmps that are used any other way at all are marked as HuOth
7812 tmps that are unused are marked as HuUnU. We don't expect to see any
7813 since we expect that the incoming IR has had all dead assignments
7814 removed by previous optimisation passes. Nevertheless the analysis is
7815 correct even in the presence of dead tmps.
7817 A final comment on dead tmps. In case 1 and case 2, we could actually
7818 conditionalise the updates thusly:
7820 if (useEnv[t1] > HuUnU) { useEnv[t2] max= HuPCa } // case 1
7822 if (useEnv[t1] > HuUnU) { useEnv[t2] max= useEnv[t1] } // case 2
7823 if (useEnv[t1] > HuUnU) { useEnv[t3] max= useEnv[t1] } // case 2
7825 In other words, if the assigned-to tmp |t1| is never used, then there's
7826 no point in propagating any use through to its operands. That won't
7827 change the final HuPCa-vs-HuOth results, which is what we care about.
7828 Given that we expect to get dead-code-free inputs, there's no point in
7829 adding this extra refinement.
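 
   A small worked example (illustrative only; the tmp numbers are made
   up).  For the flat IR fragment

      t3 = Add64(t1,t2)
      t4 = LDle:I64(t3)
      PUT(16) = t4

   the backwards pass sees the PUT first and marks t4 as HuOth (case 3),
   the load then marks t3 as HuPCa (case 1), and finally the Add64
   propagates useEnv[t3] == HuPCa into t1 and t2 (case 2).  Since t3 is
   used only as a memory address, the Add64 that defines it can be given
   the cheap instrumentation; t4 being HuOth costs nothing extra,
   because only the Add/Sub cases consult this information.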
7832 /* Helper for |preInstrumentationAnalysis|. */
7833 static inline void noteTmpUsesIn ( /*MOD*/HowUsed* useEnv,
7834 UInt tyenvUsed,
7835 HowUsed newUse, IRAtom* at )
7837 /* For the atom |at|, declare that for any tmp |t| in |at|, we will have
7838 seen a use of |newUse|. So, merge that info into |t|'s accumulated
7839 use info. */
7840 switch (at->tag) {
7841 case Iex_GSPTR:
7842 case Iex_Const:
7843 return;
7844 case Iex_RdTmp: {
7845 IRTemp t = at->Iex.RdTmp.tmp;
7846 tl_assert(t < tyenvUsed); // "is an original tmp"
7847 // The "max" operation in the lattice
7848 if (newUse > useEnv[t]) useEnv[t] = newUse;
7849 return;
7851 default:
7852 // We should never get here -- it implies non-flat IR
7853 ppIRExpr(at);
7854 VG_(tool_panic)("noteTmpUsesIn");
7856 /*NOTREACHED*/
7857 tl_assert(0);
7861 static void preInstrumentationAnalysis ( /*OUT*/HowUsed** useEnvP,
7862 /*OUT*/Bool* hasBogusLiteralsP,
7863 const IRSB* sb_in )
7865 const UInt nOrigTmps = (UInt)sb_in->tyenv->types_used;
7867 // We've seen no bogus literals so far.
7868 Bool bogus = False;
7870 // This is calloc'd, so implicitly all entries are initialised to HuUnU.
7871 HowUsed* useEnv = VG_(calloc)("mc.preInstrumentationAnalysis.1",
7872 nOrigTmps, sizeof(HowUsed));
7874 // Firstly, roll in contributions from the final dst address.
7875 bogus = isBogusAtom(sb_in->next);
7876 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, sb_in->next);
7878 // Now work backwards through the stmts.
7879 for (Int i = sb_in->stmts_used-1; i >= 0; i--) {
7880 IRStmt* st = sb_in->stmts[i];
7882 // Deal with literals.
7883 if (LIKELY(!bogus)) {
7884 bogus = containsBogusLiterals(st);
7887 // Deal with tmp uses.
7888 switch (st->tag) {
7889 case Ist_WrTmp: {
7890 IRTemp dst = st->Ist.WrTmp.tmp;
7891 IRExpr* rhs = st->Ist.WrTmp.data;
7892 // This is the one place where we have to consider all possible
7893 // tags for |rhs|, and can't just assume it is a tmp or a const.
7894 switch (rhs->tag) {
7895 case Iex_RdTmp:
7896 // just propagate demand for |dst| into this tmp use.
7897 noteTmpUsesIn(useEnv, nOrigTmps, useEnv[dst], rhs);
7898 break;
7899 case Iex_Unop:
7900 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, rhs->Iex.Unop.arg);
7901 break;
7902 case Iex_Binop:
7903 if (rhs->Iex.Binop.op == Iop_Add64
7904 || rhs->Iex.Binop.op == Iop_Add32) {
7905 // propagate demand for |dst| through to the operands.
7906 noteTmpUsesIn(useEnv, nOrigTmps,
7907 useEnv[dst], rhs->Iex.Binop.arg1);
7908 noteTmpUsesIn(useEnv, nOrigTmps,
7909 useEnv[dst], rhs->Iex.Binop.arg2);
7910 } else {
7911 // just say that the operands are used in some unknown way.
7912 noteTmpUsesIn(useEnv, nOrigTmps,
7913 HuOth, rhs->Iex.Binop.arg1);
7914 noteTmpUsesIn(useEnv, nOrigTmps,
7915 HuOth, rhs->Iex.Binop.arg2);
7917 break;
7918 case Iex_Triop: {
7919 // All operands are used in some unknown way.
7920 IRTriop* tri = rhs->Iex.Triop.details;
7921 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, tri->arg1);
7922 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, tri->arg2);
7923 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, tri->arg3);
7924 break;
7926 case Iex_Qop: {
7927 // All operands are used in some unknown way.
7928 IRQop* qop = rhs->Iex.Qop.details;
7929 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, qop->arg1);
7930 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, qop->arg2);
7931 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, qop->arg3);
7932 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, qop->arg4);
7933 break;
7935 case Iex_Load:
7936 // The address will be checked (== PCasted).
7937 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, rhs->Iex.Load.addr);
7938 break;
7939 case Iex_ITE:
7940 // The condition is PCasted, the then- and else-values
7941 // aren't.
7942 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, rhs->Iex.ITE.cond);
7943 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, rhs->Iex.ITE.iftrue);
7944 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, rhs->Iex.ITE.iffalse);
7945 break;
7946 case Iex_CCall:
7947 // The args are used in unknown ways.
7948 for (IRExpr** args = rhs->Iex.CCall.args; *args; args++) {
7949 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, *args);
7951 break;
7952 case Iex_GetI: {
7953 // The index will be checked/PCasted (see do_shadow_GETI)
7954 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, rhs->Iex.GetI.ix);
7955 break;
7957 case Iex_Const:
7958 case Iex_Get:
7959 break;
7960 default:
7961 ppIRExpr(rhs);
7962 VG_(tool_panic)("preInstrumentationAnalysis:"
7963 " unhandled IRExpr");
7965 break;
7967 case Ist_Store:
7968 // The address will be checked (== PCasted). The data will be
7969 // used in some unknown way.
7970 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, st->Ist.Store.addr);
7971 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.Store.data);
7972 break;
7973 case Ist_Exit:
7974 // The guard will be checked (== PCasted)
7975 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, st->Ist.Exit.guard);
7976 break;
7977 case Ist_Put:
7978 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.Put.data);
7979 break;
7980 case Ist_PutI: {
7981 IRPutI* putI = st->Ist.PutI.details;
7982 // The index will be checked/PCasted (see do_shadow_PUTI). The
7983 // data will be used in an unknown way.
7984 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, putI->ix);
7985 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, putI->data);
7986 break;
7988 case Ist_Dirty: {
7989 IRDirty* d = st->Ist.Dirty.details;
7990 // The guard will be checked (== PCasted)
7991 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, d->guard);
7992 // The args will be used in unknown ways.
7993 for (IRExpr** args = d->args; *args; args++) {
7994 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, *args);
7996 break;
7998 case Ist_CAS: {
7999 IRCAS* cas = st->Ist.CAS.details;
8000 // Address will be pcasted, everything else used as unknown
8001 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, cas->addr);
8002 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, cas->expdLo);
8003 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, cas->dataLo);
8004 if (cas->expdHi)
8005 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, cas->expdHi);
8006 if (cas->dataHi)
8007 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, cas->dataHi);
8008 break;
8010 case Ist_AbiHint:
8011 // Both exprs are used in unknown ways. TODO: can we safely
8012 // just ignore AbiHints?
8013 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.AbiHint.base);
8014 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.AbiHint.nia);
8015 break;
8016 case Ist_StoreG: {
8017 // We might be able to do better, and use HuPCa for the addr.
8018 // It's not immediately obvious that we can, because the address
8019 // is regarded as "used" only when the guard is true.
8020 IRStoreG* sg = st->Ist.StoreG.details;
8021 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, sg->addr);
8022 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, sg->data);
8023 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, sg->guard);
8024 break;
8026 case Ist_LoadG: {
8027 // Per similar comments to Ist_StoreG .. not sure whether this
8028 // is really optimal.
8029 IRLoadG* lg = st->Ist.LoadG.details;
8030 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, lg->addr);
8031 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, lg->alt);
8032 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, lg->guard);
8033 break;
8035 case Ist_LLSC: {
8036 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, st->Ist.LLSC.addr);
8037 if (st->Ist.LLSC.storedata)
8038 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.LLSC.storedata);
8039 break;
8041 case Ist_MBE:
8042 case Ist_IMark:
8043 case Ist_NoOp:
8044 break;
8045 default: {
8046 ppIRStmt(st);
8047 VG_(tool_panic)("preInstrumentationAnalysis: unhandled IRStmt");
8050 } // Now work backwards through the stmts.
8052 // Return the computed use env and the bogus-atom flag.
8053 tl_assert(*useEnvP == NULL);
8054 *useEnvP = useEnv;
8056 tl_assert(*hasBogusLiteralsP == False);
8057 *hasBogusLiteralsP = bogus;
8061 IRSB* MC_(instrument) ( VgCallbackClosure* closure,
8062 IRSB* sb_in,
8063 const VexGuestLayout* layout,
8064 const VexGuestExtents* vge,
8065 const VexArchInfo* archinfo_host,
8066 IRType gWordTy, IRType hWordTy )
8068 Bool verboze = 0||False;
8069 Int i, j, first_stmt;
8070 IRStmt* st;
8071 MCEnv mce;
8072 IRSB* sb_out;
8074 if (gWordTy != hWordTy) {
8075 /* We don't currently support this case. */
8076 VG_(tool_panic)("host/guest word size mismatch");
8079 /* Check we're not completely nuts */
8080 tl_assert(sizeof(UWord) == sizeof(void*));
8081 tl_assert(sizeof(Word) == sizeof(void*));
8082 tl_assert(sizeof(Addr) == sizeof(void*));
8083 tl_assert(sizeof(ULong) == 8);
8084 tl_assert(sizeof(Long) == 8);
8085 tl_assert(sizeof(UInt) == 4);
8086 tl_assert(sizeof(Int) == 4);
8088 tl_assert(MC_(clo_mc_level) >= 1 && MC_(clo_mc_level) <= 3);
8090 /* Set up SB */
8091 sb_out = deepCopyIRSBExceptStmts(sb_in);
8093 /* Set up the running environment. Both .sb and .tmpMap are
8094 modified as we go along. Note that tmps are added to both
8095 .sb->tyenv and .tmpMap together, so the valid index-set for
8096 those two arrays should always be identical. */
8097 VG_(memset)(&mce, 0, sizeof(mce));
8098 mce.sb = sb_out;
8099 mce.trace = verboze;
8100 mce.layout = layout;
8101 mce.hWordTy = hWordTy;
8102 mce.tmpHowUsed = NULL;
8104 /* BEGIN decide on expense levels for instrumentation. */
8106 /* Initially, select the cheap version of everything for which we have an
8107 option. */
8108 DetailLevelByOp__set_all( &mce.dlbo, DLcheap );
8110 /* Take account of the --expensive-definedness-checks= flag. */
8111 if (MC_(clo_expensive_definedness_checks) == EdcNO) {
8112 /* We just selected 'cheap for everything', so we don't need to do
8113 anything here. mce.tmpHowUsed remains NULL. */
8115 else if (MC_(clo_expensive_definedness_checks) == EdcYES) {
8116 /* Select 'expensive for everything'. mce.tmpHowUsed remains NULL. */
8117 DetailLevelByOp__set_all( &mce.dlbo, DLexpensive );
8119 else {
8120 tl_assert(MC_(clo_expensive_definedness_checks) == EdcAUTO);
8121 /* We'll make our own selection, based on known per-target constraints
8122 and also on analysis of the block to be instrumented. First, set
8123 up default values for detail levels.
8125 On x86 and amd64, we'll routinely encounter code optimised by LLVM
8126 5 and above. Enable accurate interpretation of the following.
8127 LLVM uses adds for some bitfield inserts, and we get a lot of false
8128 errors if the cheap interpretation is used, alas. Could solve this
8129 much better if we knew which of such adds came from x86/amd64 LEA
8130 instructions, since these are the only ones really needing the
8131 expensive interpretation, but that would require some way to tag
8132 them in the _toIR.c front ends, which is a lot of faffing around.
8133 So for now we use preInstrumentationAnalysis() to detect adds which
8134 are used only to construct memory addresses, which is an
8135 approximation to the above, and is self-contained.*/
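/* Concrete effect of the settings just below (a summary, not an extra
   policy): under EdcAUTO on amd64, Iop_Add64 is DLauto, so an Add64
   whose result tmp the analysis marked HuPCa is instrumented cheaply,
   while one marked HuOth gets the expensive interpretation; and
   CmpEQ32/CmpNE32 are unconditionally DLexpensive on both x86 and
   amd64.  On x86 it is Iop_Add32 that is DLauto. */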
8136 # if defined(VGA_x86)
8137 mce.dlbo.dl_Add32 = DLauto;
8138 mce.dlbo.dl_CmpEQ32_CmpNE32 = DLexpensive;
8139 # elif defined(VGA_amd64)
8140 mce.dlbo.dl_Add64 = DLauto;
8141 mce.dlbo.dl_CmpEQ32_CmpNE32 = DLexpensive;
8142 # endif
8144 /* preInstrumentationAnalysis() will allocate &mce.tmpHowUsed and then
8145 fill it in. */
8146 Bool hasBogusLiterals = False;
8147 preInstrumentationAnalysis( &mce.tmpHowUsed, &hasBogusLiterals, sb_in );
8149 if (hasBogusLiterals) {
8150 /* This happens very rarely. In this case just select expensive
8151 for everything, and throw away the tmp-use analysis results. */
8152 DetailLevelByOp__set_all( &mce.dlbo, DLexpensive );
8153 VG_(free)( mce.tmpHowUsed );
8154 mce.tmpHowUsed = NULL;
8155 } else {
8156 /* Nothing. mce.tmpHowUsed contains tmp-use analysis results,
8157 which will be used for some subset of Iop_{Add,Sub}{32,64},
8158 based on which ones are set to DLauto for this target. */
8162 DetailLevelByOp__check_sanity( &mce.dlbo );
8164 if (0) {
8165 // Debug printing: which tmps have been identified as PCast-only use
8166 if (mce.tmpHowUsed) {
8167 VG_(printf)("Cheapies: ");
8168 for (UInt q = 0; q < sb_in->tyenv->types_used; q++) {
8169 if (mce.tmpHowUsed[q] == HuPCa) {
8170 VG_(printf)("t%u ", q);
8173 VG_(printf)("\n");
8176 // Debug printing: number of ops by detail level
8177 UChar nCheap = DetailLevelByOp__count( &mce.dlbo, DLcheap );
8178 UChar nAuto = DetailLevelByOp__count( &mce.dlbo, DLauto );
8179 UChar nExpensive = DetailLevelByOp__count( &mce.dlbo, DLexpensive );
8180 tl_assert(nCheap + nAuto + nExpensive == 8);
8182 VG_(printf)("%u,%u,%u ", nCheap, nAuto, nExpensive);
8184 /* END decide on expense levels for instrumentation. */
8186 /* Initialise the running tmp environment. */
8188 mce.tmpMap = VG_(newXA)( VG_(malloc), "mc.MC_(instrument).1", VG_(free),
8189 sizeof(TempMapEnt));
8190 VG_(hintSizeXA) (mce.tmpMap, sb_in->tyenv->types_used);
8191 for (i = 0; i < sb_in->tyenv->types_used; i++) {
8192 TempMapEnt ent;
8193 ent.kind = Orig;
8194 ent.shadowV = IRTemp_INVALID;
8195 ent.shadowB = IRTemp_INVALID;
8196 VG_(addToXA)( mce.tmpMap, &ent );
8198 tl_assert( VG_(sizeXA)( mce.tmpMap ) == sb_in->tyenv->types_used );
8200 /* Finally, begin instrumentation. */
8201 /* Copy verbatim any IR preamble preceding the first IMark */
8203 tl_assert(mce.sb == sb_out);
8204 tl_assert(mce.sb != sb_in);
8206 i = 0;
8207 while (i < sb_in->stmts_used && sb_in->stmts[i]->tag != Ist_IMark) {
8209 st = sb_in->stmts[i];
8210 tl_assert(st);
8211 tl_assert(isFlatIRStmt(st));
8213 stmt( 'C', &mce, sb_in->stmts[i] );
8214 i++;
8217 /* Nasty problem. IR optimisation of the pre-instrumented IR may
8218 cause the IR following the preamble to contain references to IR
8219 temporaries defined in the preamble. Because the preamble isn't
8220 instrumented, these temporaries don't have any shadows.
8221 Nevertheless uses of them following the preamble will cause
8222 memcheck to generate references to their shadows. End effect is
8223 to cause IR sanity check failures, due to references to
8224 non-existent shadows. This is only evident for the complex
8225 preambles used for function wrapping on TOC-afflicted platforms
8226 (ppc64-linux).
8228 The following loop therefore scans the preamble looking for
8229 assignments to temporaries. For each one found it creates an
8230 assignment to the corresponding (V) shadow temp, marking it as
8231 'defined'. This is the same resulting IR as if the main
8232 instrumentation loop below had been applied to the statement
8233 'tmp = CONSTANT'.
8235 Similarly, if origin tracking is enabled, we must generate an
8236 assignment for the corresponding origin (B) shadow, claiming
8237 no-origin, as appropriate for a defined value.
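 
   As an illustration (hypothetical tmp and offset): for a preamble
   statement 't5 = GET:I64(OFFB_something)' the loop below emits a V
   shadow assignment 't5_v = <all defined>' and, when origin tracking
   is enabled, a B shadow assignment 't5_b = 0x0:I32' meaning
   no-origin.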
8239 for (j = 0; j < i; j++) {
8240 if (sb_in->stmts[j]->tag == Ist_WrTmp) {
8241 /* findShadowTmpV checks its arg is an original tmp;
8242 no need to assert that here. */
8243 IRTemp tmp_o = sb_in->stmts[j]->Ist.WrTmp.tmp;
8244 IRTemp tmp_v = findShadowTmpV(&mce, tmp_o);
8245 IRType ty_v = typeOfIRTemp(sb_out->tyenv, tmp_v);
8246 assign( 'V', &mce, tmp_v, definedOfType( ty_v ) );
8247 if (MC_(clo_mc_level) == 3) {
8248 IRTemp tmp_b = findShadowTmpB(&mce, tmp_o);
8249 tl_assert(typeOfIRTemp(sb_out->tyenv, tmp_b) == Ity_I32);
8250 assign( 'B', &mce, tmp_b, mkU32(0)/* UNKNOWN ORIGIN */);
8252 if (0) {
8253 VG_(printf)("create shadow tmp(s) for preamble tmp [%d] ty ", j);
8254 ppIRType( ty_v );
8255 VG_(printf)("\n");
8260 /* Iterate over the remaining stmts to generate instrumentation. */
8262 tl_assert(sb_in->stmts_used > 0);
8263 tl_assert(i >= 0);
8264 tl_assert(i < sb_in->stmts_used);
8265 tl_assert(sb_in->stmts[i]->tag == Ist_IMark);
8267 for (/* use current i*/; i < sb_in->stmts_used; i++) {
8269 st = sb_in->stmts[i];
8270 first_stmt = sb_out->stmts_used;
8272 if (verboze) {
8273 VG_(printf)("\n");
8274 ppIRStmt(st);
8275 VG_(printf)("\n");
8278 if (MC_(clo_mc_level) == 3) {
8279 /* See comments on case Ist_CAS below. */
8280 if (st->tag != Ist_CAS)
8281 schemeS( &mce, st );
8284 /* Generate instrumentation code for each stmt ... */
8286 switch (st->tag) {
8288 case Ist_WrTmp: {
8289 IRTemp dst = st->Ist.WrTmp.tmp;
8290 tl_assert(dst < (UInt)sb_in->tyenv->types_used);
8291 HowUsed hu = mce.tmpHowUsed ? mce.tmpHowUsed[dst]
8292 : HuOth/*we don't know, so play safe*/;
8293 assign( 'V', &mce, findShadowTmpV(&mce, st->Ist.WrTmp.tmp),
8294 expr2vbits( &mce, st->Ist.WrTmp.data, hu ));
8295 break;
8298 case Ist_Put:
8299 do_shadow_PUT( &mce,
8300 st->Ist.Put.offset,
8301 st->Ist.Put.data,
8302 NULL /* shadow atom */, NULL /* guard */ );
8303 break;
8305 case Ist_PutI:
8306 do_shadow_PUTI( &mce, st->Ist.PutI.details);
8307 break;
8309 case Ist_Store:
8310 do_shadow_Store( &mce, st->Ist.Store.end,
8311 st->Ist.Store.addr, 0/* addr bias */,
8312 st->Ist.Store.data,
8313 NULL /* shadow data */,
8314 NULL/*guard*/ );
8315 break;
8317 case Ist_StoreG:
8318 do_shadow_StoreG( &mce, st->Ist.StoreG.details );
8319 break;
8321 case Ist_LoadG:
8322 do_shadow_LoadG( &mce, st->Ist.LoadG.details );
8323 break;
8325 case Ist_Exit:
8326 complainIfUndefined( &mce, st->Ist.Exit.guard, NULL );
8327 break;
8329 case Ist_IMark:
8330 break;
8332 case Ist_NoOp:
8333 case Ist_MBE:
8334 break;
8336 case Ist_Dirty:
8337 do_shadow_Dirty( &mce, st->Ist.Dirty.details );
8338 break;
8340 case Ist_AbiHint:
8341 do_AbiHint( &mce, st->Ist.AbiHint.base,
8342 st->Ist.AbiHint.len,
8343 st->Ist.AbiHint.nia );
8344 break;
8346 case Ist_CAS:
8347 do_shadow_CAS( &mce, st->Ist.CAS.details );
8348 /* Note, do_shadow_CAS copies the CAS itself to the output
8349 block, because it needs to add instrumentation both
8350 before and after it. Hence skip the copy below. Also
8351 skip the origin-tracking stuff (call to schemeS) above,
8352 since that's all tangled up with it too; do_shadow_CAS
8353 does it all. */
8354 break;
8356 case Ist_LLSC:
8357 do_shadow_LLSC( &mce,
8358 st->Ist.LLSC.end,
8359 st->Ist.LLSC.result,
8360 st->Ist.LLSC.addr,
8361 st->Ist.LLSC.storedata );
8362 break;
8364 default:
8365 VG_(printf)("\n");
8366 ppIRStmt(st);
8367 VG_(printf)("\n");
8368 VG_(tool_panic)("memcheck: unhandled IRStmt");
8370 } /* switch (st->tag) */
8372 if (0 && verboze) {
8373 for (j = first_stmt; j < sb_out->stmts_used; j++) {
8374 VG_(printf)(" ");
8375 ppIRStmt(sb_out->stmts[j]);
8376 VG_(printf)("\n");
8378 VG_(printf)("\n");
8381 /* ... and finally copy the stmt itself to the output. Except,
8382 skip the copy of IRCASs; see comments on case Ist_CAS
8383 above. */
8384 if (st->tag != Ist_CAS)
8385 stmt('C', &mce, st);
8388 /* Now we need to complain if the jump target is undefined. */
8389 first_stmt = sb_out->stmts_used;
8391 if (verboze) {
8392 VG_(printf)("sb_in->next = ");
8393 ppIRExpr(sb_in->next);
8394 VG_(printf)("\n\n");
8397 complainIfUndefined( &mce, sb_in->next, NULL );
8399 if (0 && verboze) {
8400 for (j = first_stmt; j < sb_out->stmts_used; j++) {
8401 VG_(printf)(" ");
8402 ppIRStmt(sb_out->stmts[j]);
8403 VG_(printf)("\n");
8405 VG_(printf)("\n");
8408 /* If this fails, there's been some serious snafu with tmp management
8409 that should be investigated. */
8410 tl_assert( VG_(sizeXA)( mce.tmpMap ) == mce.sb->tyenv->types_used );
8411 VG_(deleteXA)( mce.tmpMap );
8413 if (mce.tmpHowUsed) {
8414 VG_(free)( mce.tmpHowUsed );
8417 tl_assert(mce.sb == sb_out);
8418 return sb_out;
8422 /*--------------------------------------------------------------------*/
8423 /*--- end mc_translate.c ---*/
8424 /*--------------------------------------------------------------------*/