memcheck/mc_translate.c

   1
   2 /*--------------------------------------------------------------------*/
   3 /*--- Instrument IR to perform memory checking operations.         ---*/
   4 /*---                                               mc_translate.c ---*/
   5 /*--------------------------------------------------------------------*/
   6
   7 /*
   8    This file is part of MemCheck, a heavyweight Valgrind tool for
   9    detecting memory errors.
  10
  11    Copyright (C) 2000-2017 Julian Seward
  12       jseward@acm.org
  13
  14    This program is free software; you can redistribute it and/or
  15    modify it under the terms of the GNU General Public License as
  16    published by the Free Software Foundation; either version 2 of the
  17    License, or (at your option) any later version.
  18
  19    This program is distributed in the hope that it will be useful, but
  20    WITHOUT ANY WARRANTY; without even the implied warranty of
  21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  22    General Public License for more details.
  23
  24    You should have received a copy of the GNU General Public License
  25    along with this program; if not, write to the Free Software
  26    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  27    02111-1307, USA.
  28
  29    The GNU General Public License is contained in the file COPYING.
  30 */
  31
  32 #include "pub_tool_basics.h"
  33 #include "pub_tool_poolalloc.h"     // For mc_include.h
  34 #include "pub_tool_hashtable.h"     // For mc_include.h
  35 #include "pub_tool_libcassert.h"
  36 #include "pub_tool_libcprint.h"
  37 #include "pub_tool_tooliface.h"
  38 #include "pub_tool_machine.h"     // VG_(fnptr_to_fnentry)
  39 #include "pub_tool_xarray.h"
  40 #include "pub_tool_mallocfree.h"
  41 #include "pub_tool_libcbase.h"
  42
  43 #include "mc_include.h"
  44
  45
  46 /* FIXMEs JRS 2011-June-16.
  47
  48    Check the interpretation for vector narrowing and widening ops,
  49    particularly the saturating ones.  I suspect they are either overly
  50    pessimistic and/or wrong.
  51
  52    Iop_QandSQsh64x2 and friends (vector-by-vector bidirectional
  53    saturating shifts): the interpretation is overly pessimistic.
  54    See comments on the relevant cases below for details.
  55
  56    Iop_Sh64Sx2 and friends (vector-by-vector bidirectional shifts,
  57    both rounding and non-rounding variants): ditto
  58 */
  59
  60 /* This file implements the Memcheck instrumentation, and in
  61    particular contains the core of its undefined value detection
  62    machinery.  For a comprehensive background of the terminology,
  63    algorithms and rationale used herein, read:
  64
  65      Using Valgrind to detect undefined value errors with
  66      bit-precision
  67
  68      Julian Seward and Nicholas Nethercote
  69
  70      2005 USENIX Annual Technical Conference (General Track),
  71      Anaheim, CA, USA, April 10-15, 2005.
  72
  73    ----
  74
  75    Here is as good a place as any to record exactly when V bits are and
  76    should be checked, why, and what function is responsible.
  77
  78
  79    Memcheck complains when an undefined value is used:
  80
  81    1. In the condition of a conditional branch.  Because it could cause
  82       incorrect control flow, and thus cause incorrect externally-visible
  83       behaviour.  [mc_translate.c:complainIfUndefined]
  84
  85    2. As an argument to a system call, or as the value that specifies
  86       the system call number.  Because it could cause an incorrect
  87       externally-visible side effect.  [mc_translate.c:mc_pre_reg_read]
  88
  89    3. As the address in a load or store.  Because it could cause an
  90       incorrect value to be used later, which could cause externally-visible
  91       behaviour (eg. via incorrect control flow or an incorrect system call
  92       argument)  [complainIfUndefined]
  93
  94    4. As the target address of a branch.  Because it could cause incorrect
  95       control flow.  [complainIfUndefined]
  96
  97    5. As an argument to setenv, unsetenv, or putenv.  Because it could put
  98       an incorrect value into the external environment.
  99       [mc_replace_strmem.c:VG_WRAP_FUNCTION_ZU(*, *env)]
 100
 101    6. As the index in a GETI or PUTI operation.  I'm not sure why... (njn).
 102       [complainIfUndefined]
 103
 104    7. As an argument to the VALGRIND_CHECK_MEM_IS_DEFINED and
 105       VALGRIND_CHECK_VALUE_IS_DEFINED client requests.  Because the user
 106       requested it.  [in memcheck.h]
 107
 108
 109    Memcheck also complains, but should not, when an undefined value is used:
 110
 111    8. As the shift value in certain SIMD shift operations (but not in the
 112       standard integer shift operations).  This inconsistency is due to
 113       historical reasons.)  [complainIfUndefined]
 114
 115
 116    Memcheck does not complain, but should, when an undefined value is used:
 117
 118    9. As an input to a client request.  Because the client request may
 119       affect the visible behaviour -- see bug #144362 for an example
 120       involving the malloc replacements in vg_replace_malloc.c and
 121       VALGRIND_NON_SIMD_CALL* requests, where an uninitialised argument
 122       isn't identified.  That bug report also has some info on how to solve
 123       the problem.  [valgrind.h:VALGRIND_DO_CLIENT_REQUEST]
 124
 125
 126    In practice, 1 and 2 account for the vast majority of cases.
 127 */
 128
 129 /* Generation of addr-definedness, addr-validity and
 130    guard-definedness checks pertaining to loads and stores (Iex_Load,
 131    Ist_Store, IRLoadG, IRStoreG, LLSC, CAS and Dirty memory
 132    loads/stores) was re-checked 11 May 2013. */
 133
 134
 135 /*------------------------------------------------------------*/
 136 /*--- Forward decls                                        ---*/
 137 /*------------------------------------------------------------*/
 138
 139 struct _MCEnv;
 140
 141 // See below for comments explaining what this is for.
 142 typedef
 143    enum __attribute__((packed)) { HuUnU=0, HuPCa=1, HuOth=2 }
 144    HowUsed;
 145
 146 static IRType  shadowTypeV ( IRType ty );
 147 static IRExpr* expr2vbits ( struct _MCEnv* mce, IRExpr* e,
 148                             HowUsed hu/*use HuOth if unknown*/ );
 149 static IRTemp  findShadowTmpB ( struct _MCEnv* mce, IRTemp orig );
 150
 151 static IRExpr *i128_const_zero(void);
 152
 153
 154 /*------------------------------------------------------------*/
 155 /*--- Memcheck running state, and tmp management.          ---*/
 156 /*------------------------------------------------------------*/
 157
 158 /* For a few (maybe 1%) IROps, we have both a cheaper, less exact vbit
 159    propagation scheme, and a more expensive, more precise vbit propagation
 160    scheme.  This enum describes, for such an IROp, which scheme to use. */
 161 typedef
 162    enum {
 163       // Use the cheaper, less-exact variant.
 164       DLcheap=4,
 165       // Choose between cheap and expensive based on analysis of the block
 166       // to be instrumented.  Note that the choice may be done on a
 167       // per-instance basis of the IROp that this DetailLevel describes.
 168       DLauto,
 169       // Use the more expensive, more-exact variant.
 170       DLexpensive
 171    }
 172    DetailLevel;
 173
 174
 175 /* A readonly part of the running state.  For IROps that have both a
 176    less-exact and more-exact interpretation, records which interpretation is
 177    to be used.  */
 178 typedef
 179    struct {
 180       // For Add32/64 and Sub32/64, all 3 settings are allowed.  For the
 181       // DLauto case, a per-instance decision is to be made by inspecting
 182       // the associated tmp's entry in MCEnv.tmpHowUsed.
 183       DetailLevel dl_Add32;
 184       DetailLevel dl_Add64;
 185       DetailLevel dl_Sub32;
 186       DetailLevel dl_Sub64;
 187       // For Cmp{EQ,NE}{64,32,16,8}, only DLcheap and DLexpensive are
 188       // allowed.
 189       DetailLevel dl_CmpEQ64_CmpNE64;
 190       DetailLevel dl_CmpEQ32_CmpNE32;
 191       DetailLevel dl_CmpEQ16_CmpNE16;
 192       DetailLevel dl_CmpEQ8_CmpNE8;
 193    }
 194    DetailLevelByOp;
 195
 196 static void DetailLevelByOp__set_all ( /*OUT*/DetailLevelByOp* dlbo,
 197                                        DetailLevel dl )
 198 {
 199    dlbo->dl_Add32           = dl;
 200    dlbo->dl_Add64           = dl;
 201    dlbo->dl_Sub32           = dl;
 202    dlbo->dl_Sub64           = dl;
 203    dlbo->dl_CmpEQ64_CmpNE64 = dl;
 204    dlbo->dl_CmpEQ32_CmpNE32 = dl;
 205    dlbo->dl_CmpEQ16_CmpNE16 = dl;
 206    dlbo->dl_CmpEQ8_CmpNE8   = dl;
 207 }
 208
 209 static void DetailLevelByOp__check_sanity ( const DetailLevelByOp* dlbo )
 210 {
 211    tl_assert(dlbo->dl_Add32 >= DLcheap && dlbo->dl_Add32 <= DLexpensive);
 212    tl_assert(dlbo->dl_Add64 >= DLcheap && dlbo->dl_Add64 <= DLexpensive);
 213    tl_assert(dlbo->dl_Sub32 >= DLcheap && dlbo->dl_Sub32 <= DLexpensive);
 214    tl_assert(dlbo->dl_Sub64 >= DLcheap && dlbo->dl_Sub64 <= DLexpensive);
 215    tl_assert(dlbo->dl_CmpEQ64_CmpNE64 == DLcheap
 216              || dlbo->dl_CmpEQ64_CmpNE64 == DLexpensive);
 217    tl_assert(dlbo->dl_CmpEQ32_CmpNE32 == DLcheap
 218              || dlbo->dl_CmpEQ32_CmpNE32 == DLexpensive);
 219    tl_assert(dlbo->dl_CmpEQ16_CmpNE16 == DLcheap
 220              || dlbo->dl_CmpEQ16_CmpNE16 == DLexpensive);
 221    tl_assert(dlbo->dl_CmpEQ8_CmpNE8 == DLcheap
 222              || dlbo->dl_CmpEQ8_CmpNE8 == DLexpensive);
 223 }
 224
 225 static UInt DetailLevelByOp__count ( const DetailLevelByOp* dlbo,
 226                                      DetailLevel dl )
 227 {
 228    UInt n = 0;
 229    n += (dlbo->dl_Add32 == dl            ? 1 : 0);
 230    n += (dlbo->dl_Add64 == dl            ? 1 : 0);
 231    n += (dlbo->dl_Sub32 == dl            ? 1 : 0);
 232    n += (dlbo->dl_Sub64 == dl            ? 1 : 0);
 233    n += (dlbo->dl_CmpEQ64_CmpNE64 == dl  ? 1 : 0);
 234    n += (dlbo->dl_CmpEQ32_CmpNE32 == dl  ? 1 : 0);
 235    n += (dlbo->dl_CmpEQ16_CmpNE16 == dl  ? 1 : 0);
 236    n += (dlbo->dl_CmpEQ8_CmpNE8 == dl    ? 1 : 0);
 237    return n;
 238 }
 239
 240
 241 /* Carries info about a particular tmp.  The tmp's number is not
 242    recorded, as this is implied by (equal to) its index in the tmpMap
 243    in MCEnv.  The tmp's type is also not recorded, as this is present
 244    in MCEnv.sb->tyenv.
 245
 246    When .kind is Orig, .shadowV and .shadowB may give the identities
 247    of the temps currently holding the associated definedness (shadowV)
 248    and origin (shadowB) values, or these may be IRTemp_INVALID if code
 249    to compute such values has not yet been emitted.
 250
 251    When .kind is VSh or BSh then the tmp is holds a V- or B- value,
 252    and so .shadowV and .shadowB must be IRTemp_INVALID, since it is
 253    illogical for a shadow tmp itself to be shadowed.
 254 */
 255 typedef
 256    enum { Orig=1, VSh=2, BSh=3 }
 257    TempKind;
 258
 259 typedef
 260    struct {
 261       TempKind kind;
 262       IRTemp   shadowV;
 263       IRTemp   shadowB;
 264    }
 265    TempMapEnt;
 266
 267
 268 /* A |HowUsed| value carries analysis results about how values are used,
 269    pertaining to whether we need to instrument integer adds expensively or
 270    not.  The running state carries a (readonly) mapping from original tmp to
 271    a HowUsed value for it.  A usage value can be one of three values,
 272    forming a 3-point chain lattice.
 273
 274       HuOth   ("Other") used in some arbitrary way
 275        |
 276       HuPCa   ("PCast") used *only* in effectively a PCast, in which all
 277        |      we care about is the all-defined vs not-all-defined distinction
 278        |
 279       HuUnU   ("Unused") not used at all.
 280
 281    The "safe" (don't-know) end of the lattice is "HuOth".  See comments
 282    below in |preInstrumentationAnalysis| for further details.
 283 */
 284 /* DECLARED ABOVE:
 285 typedef
 286    enum __attribute__((packed)) { HuUnU=0, HuPCa=1, HuOth=2 }
 287    HowUsed;
 288 */
 289
 290 // Not actually necessary, but we don't want to waste D1 space.
 291 STATIC_ASSERT(sizeof(HowUsed) == 1);
 292
 293
 294 /* Carries around state during memcheck instrumentation. */
 295 typedef
 296    struct _MCEnv {
 297       /* MODIFIED: the superblock being constructed.  IRStmts are
 298          added. */
 299       IRSB* sb;
 300       Bool  trace;
 301
 302       /* MODIFIED: a table [0 .. #temps_in_sb-1] which gives the
 303          current kind and possibly shadow temps for each temp in the
 304          IRSB being constructed.  Note that it does not contain the
 305          type of each tmp.  If you want to know the type, look at the
 306          relevant entry in sb->tyenv.  It follows that at all times
 307          during the instrumentation process, the valid indices for
 308          tmpMap and sb->tyenv are identical, being 0 .. N-1 where N is
 309          total number of Orig, V- and B- temps allocated so far.
 310
 311          The reason for this strange split (types in one place, all
 312          other info in another) is that we need the types to be
 313          attached to sb so as to make it possible to do
 314          "typeOfIRExpr(mce->bb->tyenv, ...)" at various places in the
 315          instrumentation process. */
 316       XArray* /* of TempMapEnt */ tmpMap;
 317
 318       /* READONLY: contains details of which ops should be expensively
 319          instrumented. */
 320       DetailLevelByOp dlbo;
 321
 322       /* READONLY: for each original tmp, how the tmp is used.  This is
 323          computed by |preInstrumentationAnalysis|.  Valid indices are
 324          0 .. #temps_in_sb-1 (same as for tmpMap). */
 325       HowUsed* tmpHowUsed;
 326
 327       /* READONLY: the guest layout.  This indicates which parts of
 328          the guest state should be regarded as 'always defined'. */
 329       const VexGuestLayout* layout;
 330
 331       /* READONLY: the host word type.  Needed for constructing
 332          arguments of type 'HWord' to be passed to helper functions.
 333          Ity_I32 or Ity_I64 only. */
 334       IRType hWordTy;
 335    }
 336    MCEnv;
 337
 338
 339 /* SHADOW TMP MANAGEMENT.  Shadow tmps are allocated lazily (on
 340    demand), as they are encountered.  This is for two reasons.
 341
 342    (1) (less important reason): Many original tmps are unused due to
 343    initial IR optimisation, and we do not want to spaces in tables
 344    tracking them.
 345
 346    Shadow IRTemps are therefore allocated on demand.  mce.tmpMap is a
 347    table indexed [0 .. n_types-1], which gives the current shadow for
 348    each original tmp, or INVALID_IRTEMP if none is so far assigned.
 349    It is necessary to support making multiple assignments to a shadow
 350    -- specifically, after testing a shadow for definedness, it needs
 351    to be made defined.  But IR's SSA property disallows this.
 352
 353    (2) (more important reason): Therefore, when a shadow needs to get
 354    a new value, a new temporary is created, the value is assigned to
 355    that, and the tmpMap is updated to reflect the new binding.
 356
 357    A corollary is that if the tmpMap maps a given tmp to
 358    IRTemp_INVALID and we are hoping to read that shadow tmp, it means
 359    there's a read-before-write error in the original tmps.  The IR
 360    sanity checker should catch all such anomalies, however.
 361 */
 362
 363 /* Create a new IRTemp of type 'ty' and kind 'kind', and add it to
 364    both the table in mce->sb and to our auxiliary mapping.  Note that
 365    newTemp may cause mce->tmpMap to resize, hence previous results
 366    from VG_(indexXA)(mce->tmpMap) are invalidated. */
 367 static IRTemp newTemp ( MCEnv* mce, IRType ty, TempKind kind )
 368 {
 369    Word       newIx;
 370    TempMapEnt ent;
 371    IRTemp     tmp = newIRTemp(mce->sb->tyenv, ty);
 372    ent.kind    = kind;
 373    ent.shadowV = IRTemp_INVALID;
 374    ent.shadowB = IRTemp_INVALID;
 375    newIx = VG_(addToXA)( mce->tmpMap, &ent );
 376    tl_assert(newIx == (Word)tmp);
 377    return tmp;
 378 }
 379
 380
 381 /* Find the tmp currently shadowing the given original tmp.  If none
 382    so far exists, allocate one.  */
 383 static IRTemp findShadowTmpV ( MCEnv* mce, IRTemp orig )
 384 {
 385    TempMapEnt* ent;
 386    /* VG_(indexXA) range-checks 'orig', hence no need to check
 387       here. */
 388    ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
 389    tl_assert(ent->kind == Orig);
 390    if (ent->shadowV == IRTemp_INVALID) {
 391       IRTemp tmpV
 392         = newTemp( mce, shadowTypeV(mce->sb->tyenv->types[orig]), VSh );
 393       /* newTemp may cause mce->tmpMap to resize, hence previous results
 394          from VG_(indexXA) are invalid. */
 395       ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
 396       tl_assert(ent->kind == Orig);
 397       tl_assert(ent->shadowV == IRTemp_INVALID);
 398       ent->shadowV = tmpV;
 399    }
 400    return ent->shadowV;
 401 }
 402
 403 /* Allocate a new shadow for the given original tmp.  This means any
 404    previous shadow is abandoned.  This is needed because it is
 405    necessary to give a new value to a shadow once it has been tested
 406    for undefinedness, but unfortunately IR's SSA property disallows
 407    this.  Instead we must abandon the old shadow, allocate a new one
 408    and use that instead.
 409
 410    This is the same as findShadowTmpV, except we don't bother to see
 411    if a shadow temp already existed -- we simply allocate a new one
 412    regardless. */
 413 static void newShadowTmpV ( MCEnv* mce, IRTemp orig )
 414 {
 415    TempMapEnt* ent;
 416    /* VG_(indexXA) range-checks 'orig', hence no need to check
 417       here. */
 418    ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
 419    tl_assert(ent->kind == Orig);
 420    if (1) {
 421       IRTemp tmpV
 422         = newTemp( mce, shadowTypeV(mce->sb->tyenv->types[orig]), VSh );
 423       /* newTemp may cause mce->tmpMap to resize, hence previous results
 424          from VG_(indexXA) are invalid. */
 425       ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
 426       tl_assert(ent->kind == Orig);
 427       ent->shadowV = tmpV;
 428    }
 429 }
 430
 431
 432 /*------------------------------------------------------------*/
 433 /*--- IRAtoms -- a subset of IRExprs                       ---*/
 434 /*------------------------------------------------------------*/
 435
 436 /* An atom is either an IRExpr_Const or an IRExpr_Tmp, as defined by
 437    isIRAtom() in libvex_ir.h.  Because this instrumenter expects flat
 438    input, most of this code deals in atoms.  Usefully, a value atom
 439    always has a V-value which is also an atom: constants are shadowed
 440    by constants, and temps are shadowed by the corresponding shadow
 441    temporary. */
 442
 443 typedef  IRExpr  IRAtom;
 444
 445 /* (used for sanity checks only): is this an atom which looks
 446    like it's from original code? */
 447 static Bool isOriginalAtom ( MCEnv* mce, IRAtom* a1 )
 448 {
 449    if (a1->tag == Iex_Const)
 450       return True;
 451    if (a1->tag == Iex_RdTmp) {
 452       TempMapEnt* ent = VG_(indexXA)( mce->tmpMap, a1->Iex.RdTmp.tmp );
 453       return ent->kind == Orig;
 454    }
 455    return False;
 456 }
 457
 458 /* (used for sanity checks only): is this an atom which looks
 459    like it's from shadow code? */
 460 static Bool isShadowAtom ( MCEnv* mce, IRAtom* a1 )
 461 {
 462    if (a1->tag == Iex_Const)
 463       return True;
 464    if (a1->tag == Iex_RdTmp) {
 465       TempMapEnt* ent = VG_(indexXA)( mce->tmpMap, a1->Iex.RdTmp.tmp );
 466       return ent->kind == VSh || ent->kind == BSh;
 467    }
 468    return False;
 469 }
 470
 471 /* (used for sanity checks only): check that both args are atoms and
 472    are identically-kinded. */
 473 static Bool sameKindedAtoms ( IRAtom* a1, IRAtom* a2 )
 474 {
 475    if (a1->tag == Iex_RdTmp && a2->tag == Iex_RdTmp)
 476       return True;
 477    if (a1->tag == Iex_Const && a2->tag == Iex_Const)
 478       return True;
 479    return False;
 480 }
 481
 482
 483 /*------------------------------------------------------------*/
 484 /*--- Type management                                      ---*/
 485 /*------------------------------------------------------------*/
 486
 487 /* Shadow state is always accessed using integer types.  This returns
 488    an integer type with the same size (as per sizeofIRType) as the
 489    given type.  The only valid shadow types are Bit, I8, I16, I32,
 490    I64, I128, V128, V256. */
 491
 492 static IRType shadowTypeV ( IRType ty )
 493 {
 494    switch (ty) {
 495       case Ity_I1:
 496       case Ity_I8:
 497       case Ity_I16:
 498       case Ity_I32:
 499       case Ity_I64:
 500       case Ity_I128: return ty;
 501       case Ity_F16:  return Ity_I16;
 502       case Ity_F32:  return Ity_I32;
 503       case Ity_D32:  return Ity_I32;
 504       case Ity_F64:  return Ity_I64;
 505       case Ity_D64:  return Ity_I64;
 506       case Ity_F128: return Ity_I128;
 507       case Ity_D128: return Ity_I128;
 508       case Ity_V128: return Ity_V128;
 509       case Ity_V256: return Ity_V256;
 510       default: ppIRType(ty);
 511                VG_(tool_panic)("memcheck:shadowTypeV");
 512    }
 513 }
 514
 515 /* Produce a 'defined' value of the given shadow type.  Should only be
 516    supplied shadow types (Bit/I8/I16/I32/UI64). */
 517 static IRExpr* definedOfType ( IRType ty ) {
 518    switch (ty) {
 519       case Ity_I1:   return IRExpr_Const(IRConst_U1(False));
 520       case Ity_I8:   return IRExpr_Const(IRConst_U8(0));
 521       case Ity_I16:  return IRExpr_Const(IRConst_U16(0));
 522       case Ity_I32:  return IRExpr_Const(IRConst_U32(0));
 523       case Ity_I64:  return IRExpr_Const(IRConst_U64(0));
 524       case Ity_I128: return i128_const_zero();
 525       case Ity_V128: return IRExpr_Const(IRConst_V128(0x0000));
 526       case Ity_V256: return IRExpr_Const(IRConst_V256(0x00000000));
 527       default:       VG_(tool_panic)("memcheck:definedOfType");
 528    }
 529 }
 530
 531
 532 /*------------------------------------------------------------*/
 533 /*--- Constructing IR fragments                            ---*/
 534 /*------------------------------------------------------------*/
 535
 536 /* add stmt to a bb */
 537 static inline void stmt ( HChar cat, MCEnv* mce, IRStmt* st ) {
 538    if (mce->trace) {
 539       VG_(printf)("  %c: ", cat);
 540       ppIRStmt(st);
 541       VG_(printf)("\n");
 542    }
 543    addStmtToIRSB(mce->sb, st);
 544 }
 545
 546 /* assign value to tmp */
 547 static inline
 548 void assign ( HChar cat, MCEnv* mce, IRTemp tmp, IRExpr* expr ) {
 549    stmt(cat, mce, IRStmt_WrTmp(tmp,expr));
 550 }
 551
 552 /* build various kinds of expressions */
 553 #define triop(_op, _arg1, _arg2, _arg3) \
 554                                  IRExpr_Triop((_op),(_arg1),(_arg2),(_arg3))
 555 #define binop(_op, _arg1, _arg2) IRExpr_Binop((_op),(_arg1),(_arg2))
 556 #define unop(_op, _arg)          IRExpr_Unop((_op),(_arg))
 557 #define mkU1(_n)                 IRExpr_Const(IRConst_U1(_n))
 558 #define mkU8(_n)                 IRExpr_Const(IRConst_U8(_n))
 559 #define mkU16(_n)                IRExpr_Const(IRConst_U16(_n))
 560 #define mkU32(_n)                IRExpr_Const(IRConst_U32(_n))
 561 #define mkU64(_n)                IRExpr_Const(IRConst_U64(_n))
 562 #define mkV128(_n)               IRExpr_Const(IRConst_V128(_n))
 563 #define mkexpr(_tmp)             IRExpr_RdTmp((_tmp))
 564
 565 /* Bind the given expression to a new temporary, and return the
 566    temporary.  This effectively converts an arbitrary expression into
 567    an atom.
 568
 569    'ty' is the type of 'e' and hence the type that the new temporary
 570    needs to be.  But passing it in is redundant, since we can deduce
 571    the type merely by inspecting 'e'.  So at least use that fact to
 572    assert that the two types agree. */
 573 static IRAtom* assignNew ( HChar cat, MCEnv* mce, IRType ty, IRExpr* e )
 574 {
 575    TempKind k;
 576    IRTemp   t;
 577    IRType   tyE = typeOfIRExpr(mce->sb->tyenv, e);
 578
 579    tl_assert(tyE == ty); /* so 'ty' is redundant (!) */
 580    switch (cat) {
 581       case 'V': k = VSh;  break;
 582       case 'B': k = BSh;  break;
 583       case 'C': k = Orig; break;
 584                 /* happens when we are making up new "orig"
 585                    expressions, for IRCAS handling */
 586       default: tl_assert(0);
 587    }
 588    t = newTemp(mce, ty, k);
 589    assign(cat, mce, t, e);
 590    return mkexpr(t);
 591 }
 592
 593
 594 /*------------------------------------------------------------*/
 595 /*--- Helper functions for 128-bit ops                     ---*/
 596 /*------------------------------------------------------------*/
 597
 598 static IRExpr *i128_const_zero(void)
 599 {
 600    IRAtom* z64 = IRExpr_Const(IRConst_U64(0));
 601    return binop(Iop_64HLto128, z64, z64);
 602 }
 603
 604 /* There are no I128-bit loads and/or stores [as generated by any
 605    current front ends].  So we do not need to worry about that in
 606    expr2vbits_Load */
 607
 608
 609 /*------------------------------------------------------------*/
 610 /*--- Constructing definedness primitive ops               ---*/
 611 /*------------------------------------------------------------*/
 612
 613 /* --------- Defined-if-either-defined --------- */
 614
 615 static IRAtom* mkDifD8 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
 616    tl_assert(isShadowAtom(mce,a1));
 617    tl_assert(isShadowAtom(mce,a2));
 618    return assignNew('V', mce, Ity_I8, binop(Iop_And8, a1, a2));
 619 }
 620
 621 static IRAtom* mkDifD16 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
 622    tl_assert(isShadowAtom(mce,a1));
 623    tl_assert(isShadowAtom(mce,a2));
 624    return assignNew('V', mce, Ity_I16, binop(Iop_And16, a1, a2));
 625 }
 626
 627 static IRAtom* mkDifD32 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
 628    tl_assert(isShadowAtom(mce,a1));
 629    tl_assert(isShadowAtom(mce,a2));
 630    return assignNew('V', mce, Ity_I32, binop(Iop_And32, a1, a2));
 631 }
 632
 633 static IRAtom* mkDifD64 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
 634    tl_assert(isShadowAtom(mce,a1));
 635    tl_assert(isShadowAtom(mce,a2));
 636    return assignNew('V', mce, Ity_I64, binop(Iop_And64, a1, a2));
 637 }
 638
 639 static IRAtom* mkDifDV128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
 640    tl_assert(isShadowAtom(mce,a1));
 641    tl_assert(isShadowAtom(mce,a2));
 642    return assignNew('V', mce, Ity_V128, binop(Iop_AndV128, a1, a2));
 643 }
 644
 645 static IRAtom* mkDifDV256 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
 646    tl_assert(isShadowAtom(mce,a1));
 647    tl_assert(isShadowAtom(mce,a2));
 648    return assignNew('V', mce, Ity_V256, binop(Iop_AndV256, a1, a2));
 649 }
 650
 651 /* --------- Undefined-if-either-undefined --------- */
 652
 653 static IRAtom* mkUifU8 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
 654    tl_assert(isShadowAtom(mce,a1));
 655    tl_assert(isShadowAtom(mce,a2));
 656    return assignNew('V', mce, Ity_I8, binop(Iop_Or8, a1, a2));
 657 }
 658
 659 static IRAtom* mkUifU16 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
 660    tl_assert(isShadowAtom(mce,a1));
 661    tl_assert(isShadowAtom(mce,a2));
 662    return assignNew('V', mce, Ity_I16, binop(Iop_Or16, a1, a2));
 663 }
 664
 665 static IRAtom* mkUifU32 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
 666    tl_assert(isShadowAtom(mce,a1));
 667    tl_assert(isShadowAtom(mce,a2));
 668    return assignNew('V', mce, Ity_I32, binop(Iop_Or32, a1, a2));
 669 }
 670
 671 static IRAtom* mkUifU64 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
 672    tl_assert(isShadowAtom(mce,a1));
 673    tl_assert(isShadowAtom(mce,a2));
 674    return assignNew('V', mce, Ity_I64, binop(Iop_Or64, a1, a2));
 675 }
 676
 677 static IRAtom* mkUifU128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
 678    IRAtom *tmp1, *tmp2, *tmp3, *tmp4, *tmp5, *tmp6;
 679    tl_assert(isShadowAtom(mce,a1));
 680    tl_assert(isShadowAtom(mce,a2));
 681    tmp1 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, a1));
 682    tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, a1));
 683    tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, a2));
 684    tmp4 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, a2));
 685    tmp5 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp1, tmp3));
 686    tmp6 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp4));
 687
 688    return assignNew('V', mce, Ity_I128, binop(Iop_64HLto128, tmp6, tmp5));
 689 }
 690
 691 static IRAtom* mkUifUV128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
 692    tl_assert(isShadowAtom(mce,a1));
 693    tl_assert(isShadowAtom(mce,a2));
 694    return assignNew('V', mce, Ity_V128, binop(Iop_OrV128, a1, a2));
 695 }
 696
 697 static IRAtom* mkUifUV256 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
 698    tl_assert(isShadowAtom(mce,a1));
 699    tl_assert(isShadowAtom(mce,a2));
 700    return assignNew('V', mce, Ity_V256, binop(Iop_OrV256, a1, a2));
 701 }
 702
 703 static IRAtom* mkUifU ( MCEnv* mce, IRType vty, IRAtom* a1, IRAtom* a2 ) {
 704    switch (vty) {
 705       case Ity_I8:   return mkUifU8(mce, a1, a2);
 706       case Ity_I16:  return mkUifU16(mce, a1, a2);
 707       case Ity_I32:  return mkUifU32(mce, a1, a2);
 708       case Ity_I64:  return mkUifU64(mce, a1, a2);
 709       case Ity_I128: return mkUifU128(mce, a1, a2);
 710       case Ity_V128: return mkUifUV128(mce, a1, a2);
 711       case Ity_V256: return mkUifUV256(mce, a1, a2);
 712       default:
 713          VG_(printf)("\n"); ppIRType(vty); VG_(printf)("\n");
 714          VG_(tool_panic)("memcheck:mkUifU");
 715    }
 716 }
 717
 718 /* --------- The Left-family of operations. --------- */
 719
 720 static IRAtom* mkLeft8 ( MCEnv* mce, IRAtom* a1 ) {
 721    tl_assert(isShadowAtom(mce,a1));
 722    return assignNew('V', mce, Ity_I8, unop(Iop_Left8, a1));
 723 }
 724
 725 static IRAtom* mkLeft16 ( MCEnv* mce, IRAtom* a1 ) {
 726    tl_assert(isShadowAtom(mce,a1));
 727    return assignNew('V', mce, Ity_I16, unop(Iop_Left16, a1));
 728 }
 729
 730 static IRAtom* mkLeft32 ( MCEnv* mce, IRAtom* a1 ) {
 731    tl_assert(isShadowAtom(mce,a1));
 732    return assignNew('V', mce, Ity_I32, unop(Iop_Left32, a1));
 733 }
 734
 735 static IRAtom* mkLeft64 ( MCEnv* mce, IRAtom* a1 ) {
 736    tl_assert(isShadowAtom(mce,a1));
 737    return assignNew('V', mce, Ity_I64, unop(Iop_Left64, a1));
 738 }
 739
 740 /* --------- 'Improvement' functions for AND/OR. --------- */
 741
 742 /* ImproveAND(data, vbits) = data OR vbits.  Defined (0) data 0s give
 743    defined (0); all other -> undefined (1).
 744 */
 745 static IRAtom* mkImproveAND8 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
 746 {
 747    tl_assert(isOriginalAtom(mce, data));
 748    tl_assert(isShadowAtom(mce, vbits));
 749    tl_assert(sameKindedAtoms(data, vbits));
 750    return assignNew('V', mce, Ity_I8, binop(Iop_Or8, data, vbits));
 751 }
 752
 753 static IRAtom* mkImproveAND16 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
 754 {
 755    tl_assert(isOriginalAtom(mce, data));
 756    tl_assert(isShadowAtom(mce, vbits));
 757    tl_assert(sameKindedAtoms(data, vbits));
 758    return assignNew('V', mce, Ity_I16, binop(Iop_Or16, data, vbits));
 759 }
 760
 761 static IRAtom* mkImproveAND32 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
 762 {
 763    tl_assert(isOriginalAtom(mce, data));
 764    tl_assert(isShadowAtom(mce, vbits));
 765    tl_assert(sameKindedAtoms(data, vbits));
 766    return assignNew('V', mce, Ity_I32, binop(Iop_Or32, data, vbits));
 767 }
 768
 769 static IRAtom* mkImproveAND64 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
 770 {
 771    tl_assert(isOriginalAtom(mce, data));
 772    tl_assert(isShadowAtom(mce, vbits));
 773    tl_assert(sameKindedAtoms(data, vbits));
 774    return assignNew('V', mce, Ity_I64, binop(Iop_Or64, data, vbits));
 775 }
 776
 777 static IRAtom* mkImproveANDV128 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
 778 {
 779    tl_assert(isOriginalAtom(mce, data));
 780    tl_assert(isShadowAtom(mce, vbits));
 781    tl_assert(sameKindedAtoms(data, vbits));
 782    return assignNew('V', mce, Ity_V128, binop(Iop_OrV128, data, vbits));
 783 }
 784
 785 static IRAtom* mkImproveANDV256 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
 786 {
 787    tl_assert(isOriginalAtom(mce, data));
 788    tl_assert(isShadowAtom(mce, vbits));
 789    tl_assert(sameKindedAtoms(data, vbits));
 790    return assignNew('V', mce, Ity_V256, binop(Iop_OrV256, data, vbits));
 791 }
 792
 793 /* ImproveOR(data, vbits) = ~data OR vbits.  Defined (0) data 1s give
 794    defined (0); all other -> undefined (1).
 795 */
 796 static IRAtom* mkImproveOR8 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
 797 {
 798    tl_assert(isOriginalAtom(mce, data));
 799    tl_assert(isShadowAtom(mce, vbits));
 800    tl_assert(sameKindedAtoms(data, vbits));
 801    return assignNew(
 802              'V', mce, Ity_I8,
 803              binop(Iop_Or8,
 804                    assignNew('V', mce, Ity_I8, unop(Iop_Not8, data)),
 805                    vbits) );
 806 }
 807
 808 static IRAtom* mkImproveOR16 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
 809 {
 810    tl_assert(isOriginalAtom(mce, data));
 811    tl_assert(isShadowAtom(mce, vbits));
 812    tl_assert(sameKindedAtoms(data, vbits));
 813    return assignNew(
 814              'V', mce, Ity_I16,
 815              binop(Iop_Or16,
 816                    assignNew('V', mce, Ity_I16, unop(Iop_Not16, data)),
 817                    vbits) );
 818 }
 819
 820 static IRAtom* mkImproveOR32 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
 821 {
 822    tl_assert(isOriginalAtom(mce, data));
 823    tl_assert(isShadowAtom(mce, vbits));
 824    tl_assert(sameKindedAtoms(data, vbits));
 825    return assignNew(
 826              'V', mce, Ity_I32,
 827              binop(Iop_Or32,
 828                    assignNew('V', mce, Ity_I32, unop(Iop_Not32, data)),
 829                    vbits) );
 830 }
 831
 832 static IRAtom* mkImproveOR64 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
 833 {
 834    tl_assert(isOriginalAtom(mce, data));
 835    tl_assert(isShadowAtom(mce, vbits));
 836    tl_assert(sameKindedAtoms(data, vbits));
 837    return assignNew(
 838              'V', mce, Ity_I64,
 839              binop(Iop_Or64,
 840                    assignNew('V', mce, Ity_I64, unop(Iop_Not64, data)),
 841                    vbits) );
 842 }
 843
 844 static IRAtom* mkImproveORV128 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
 845 {
 846    tl_assert(isOriginalAtom(mce, data));
 847    tl_assert(isShadowAtom(mce, vbits));
 848    tl_assert(sameKindedAtoms(data, vbits));
 849    return assignNew(
 850              'V', mce, Ity_V128,
 851              binop(Iop_OrV128,
 852                    assignNew('V', mce, Ity_V128, unop(Iop_NotV128, data)),
 853                    vbits) );
 854 }
 855
 856 static IRAtom* mkImproveORV256 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
 857 {
 858    tl_assert(isOriginalAtom(mce, data));
 859    tl_assert(isShadowAtom(mce, vbits));
 860    tl_assert(sameKindedAtoms(data, vbits));
 861    return assignNew(
 862              'V', mce, Ity_V256,
 863              binop(Iop_OrV256,
 864                    assignNew('V', mce, Ity_V256, unop(Iop_NotV256, data)),
 865                    vbits) );
 866 }
 867
 868 /* --------- Pessimising casts. --------- */
 869
 870 /* The function returns an expression of type DST_TY. If any of the VBITS
 871    is undefined (value == 1) the resulting expression has all bits set to
 872    1. Otherwise, all bits are 0. */
 873
 874 static IRAtom* mkPCastTo( MCEnv* mce, IRType dst_ty, IRAtom* vbits )
 875 {
 876    IRType  src_ty;
 877    IRAtom* tmp1;
 878
 879    /* Note, dst_ty is a shadow type, not an original type. */
 880    tl_assert(isShadowAtom(mce,vbits));
 881    src_ty = typeOfIRExpr(mce->sb->tyenv, vbits);
 882
 883    /* Fast-track some common cases */
 884    if (src_ty == Ity_I32 && dst_ty == Ity_I32)
 885       return assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
 886
 887    if (src_ty == Ity_I64 && dst_ty == Ity_I64)
 888       return assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, vbits));
 889
 890    if (src_ty == Ity_I32 && dst_ty == Ity_I64) {
 891       /* PCast the arg, then clone it. */
 892       IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
 893       return assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
 894    }
 895
 896    if (src_ty == Ity_I32 && dst_ty == Ity_V128) {
 897       /* PCast the arg, then clone it 4 times. */
 898       IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
 899       tmp = assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
 900       return assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp, tmp));
 901    }
 902
 903    if (src_ty == Ity_I32 && dst_ty == Ity_V256) {
 904       /* PCast the arg, then clone it 8 times. */
 905       IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
 906       tmp = assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
 907       tmp = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp, tmp));
 908       return assignNew('V', mce, Ity_V256, binop(Iop_V128HLtoV256, tmp, tmp));
 909    }
 910
 911    if (src_ty == Ity_I64 && dst_ty == Ity_I32) {
 912       /* PCast the arg.  This gives all 0s or all 1s.  Then throw away
 913          the top half. */
 914       IRAtom* tmp = assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, vbits));
 915       return assignNew('V', mce, Ity_I32, unop(Iop_64to32, tmp));
 916    }
 917
 918    if (src_ty == Ity_V128 && dst_ty == Ity_I64) {
 919       /* Use InterleaveHI64x2 to copy the top half of the vector into
 920          the bottom half.  Then we can UifU it with the original, throw
 921          away the upper half of the result, and PCast-I64-to-I64
 922          the lower half. */
 923       // Generates vbits[127:64] : vbits[127:64]
 924       IRAtom* hi64hi64
 925          = assignNew('V', mce, Ity_V128,
 926                      binop(Iop_InterleaveHI64x2, vbits, vbits));
 927       // Generates
 928       //   UifU(vbits[127:64],vbits[127:64]) : UifU(vbits[127:64],vbits[63:0])
 929       //   == vbits[127:64] : UifU(vbits[127:64],vbits[63:0])
 930       IRAtom* lohi64
 931          = mkUifUV128(mce, hi64hi64, vbits);
 932       // Generates UifU(vbits[127:64],vbits[63:0])
 933       IRAtom* lo64
 934          = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, lohi64));
 935       // Generates
 936       //   PCast-to-I64( UifU(vbits[127:64], vbits[63:0] )
 937       //   == PCast-to-I64( vbits[127:0] )
 938       IRAtom* res
 939          = assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, lo64));
 940       return res;
 941    }
 942
 943    /* Else do it the slow way .. */
 944    /* First of all, collapse vbits down to a single bit. */
 945    tmp1   = NULL;
 946    switch (src_ty) {
 947       case Ity_I1:
 948          tmp1 = vbits;
 949          break;
 950       case Ity_I8:
 951          tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ8, vbits));
 952          break;
 953       case Ity_I16:
 954          tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ16, vbits));
 955          break;
 956       case Ity_I32:
 957          tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ32, vbits));
 958          break;
 959       case Ity_I64:
 960          tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ64, vbits));
 961          break;
 962       case Ity_I128: {
 963          /* Gah.  Chop it in half, OR the halves together, and compare
 964             that with zero. */
 965          IRAtom* tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, vbits));
 966          IRAtom* tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, vbits));
 967          IRAtom* tmp4 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp3));
 968          tmp1         = assignNew('V', mce, Ity_I1,
 969                                        unop(Iop_CmpNEZ64, tmp4));
 970          break;
 971       }
 972       case Ity_V128: {
 973          /* Chop it in half, OR the halves together, and compare that
 974           * with zero.
 975           */
 976          IRAtom* tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_V128HIto64, vbits));
 977          IRAtom* tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vbits));
 978          IRAtom* tmp4 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp3));
 979          tmp1         = assignNew('V', mce, Ity_I1,
 980                                        unop(Iop_CmpNEZ64, tmp4));
 981          break;
 982       }
 983       default:
 984          ppIRType(src_ty);
 985          VG_(tool_panic)("mkPCastTo(1)");
 986    }
 987    tl_assert(tmp1);
 988    /* Now widen up to the dst type. */
 989    switch (dst_ty) {
 990       case Ity_I1:
 991          return tmp1;
 992       case Ity_I8:
 993          return assignNew('V', mce, Ity_I8, unop(Iop_1Sto8, tmp1));
 994       case Ity_I16:
 995          return assignNew('V', mce, Ity_I16, unop(Iop_1Sto16, tmp1));
 996       case Ity_I32:
 997          return assignNew('V', mce, Ity_I32, unop(Iop_1Sto32, tmp1));
 998       case Ity_I64:
 999          return assignNew('V', mce, Ity_I64, unop(Iop_1Sto64, tmp1));
1000       case Ity_V128:
1001          tmp1 = assignNew('V', mce, Ity_I64,  unop(Iop_1Sto64, tmp1));
1002          tmp1 = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp1, tmp1));
1003          return tmp1;
1004       case Ity_I128:
1005          tmp1 = assignNew('V', mce, Ity_I64,  unop(Iop_1Sto64, tmp1));
1006          tmp1 = assignNew('V', mce, Ity_I128, binop(Iop_64HLto128, tmp1, tmp1));
1007          return tmp1;
1008       case Ity_V256:
1009          tmp1 = assignNew('V', mce, Ity_I64,  unop(Iop_1Sto64, tmp1));
1010          tmp1 = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128,
1011                                                     tmp1, tmp1));
1012          tmp1 = assignNew('V', mce, Ity_V256, binop(Iop_V128HLtoV256,
1013                                                     tmp1, tmp1));
1014          return tmp1;
1015       default:
1016          ppIRType(dst_ty);
1017          VG_(tool_panic)("mkPCastTo(2)");
1018    }
1019 }
1020
1021 /* This is a minor variant.  It takes an arg of some type and returns
1022    a value of the same type.  The result consists entirely of Defined
1023    (zero) bits except its least significant bit, which is a PCast of
1024    the entire argument down to a single bit. */
1025 static IRAtom* mkPCastXXtoXXlsb ( MCEnv* mce, IRAtom* varg, IRType ty )
1026 {
1027    if (ty == Ity_V128) {
1028       /* --- Case for V128 --- */
1029       IRAtom* varg128 = varg;
1030       // generates: PCast-to-I64(varg128)
1031       IRAtom* pcdTo64 = mkPCastTo(mce, Ity_I64, varg128);
1032       // Now introduce zeros (defined bits) in the top 63 places
1033       // generates: Def--(63)--Def PCast-to-I1(varg128)
1034       IRAtom* d63pc
1035          = assignNew('V', mce, Ity_I64, binop(Iop_And64, pcdTo64, mkU64(1)));
1036       // generates: Def--(64)--Def
1037       IRAtom* d64
1038          = definedOfType(Ity_I64);
1039       // generates: Def--(127)--Def PCast-to-I1(varg128)
1040       IRAtom* res
1041          = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, d64, d63pc));
1042       return res;
1043    }
1044    if (ty == Ity_I64) {
1045       /* --- Case for I64 --- */
1046       // PCast to 64
1047       IRAtom* pcd = mkPCastTo(mce, Ity_I64, varg);
1048       // Zero (Def) out the top 63 bits
1049       IRAtom* res
1050          = assignNew('V', mce, Ity_I64, binop(Iop_And64, pcd, mkU64(1)));
1051       return res;
1052    }
1053    /*NOTREACHED*/
1054    tl_assert(0);
1055 }
1056
1057 /* --------- Optimistic casts. --------- */
1058
1059 /* The function takes and returns an expression of type TY. If any of the
1060    VBITS indicate defined (value == 0) the resulting expression has all bits
1061    set to 0. Otherwise, all bits are 1.  In words, if any bits are defined
1062    then all bits are made to be defined.
1063
1064    In short we compute (vbits - (vbits >>u 1)) >>s (bitsize(vbits)-1).
1065 */
1066 static IRAtom* mkOCastAt( MCEnv* mce, IRType ty, IRAtom* vbits )
1067 {
1068    IROp opSUB, opSHR, opSAR;
1069    UInt sh;
1070
1071    switch (ty) {
1072       case Ity_I64:
1073          opSUB = Iop_Sub64; opSHR = Iop_Shr64; opSAR = Iop_Sar64; sh = 63;
1074          break;
1075       case Ity_I32:
1076          opSUB = Iop_Sub32; opSHR = Iop_Shr32; opSAR = Iop_Sar32; sh = 31;
1077          break;
1078       case Ity_I16:
1079          opSUB = Iop_Sub16; opSHR = Iop_Shr16; opSAR = Iop_Sar16; sh = 15;
1080          break;
1081       case Ity_I8:
1082          opSUB = Iop_Sub8; opSHR = Iop_Shr8; opSAR = Iop_Sar8; sh = 7;
1083          break;
1084       default:
1085          ppIRType(ty);
1086          VG_(tool_panic)("mkOCastTo");
1087    }
1088
1089    IRAtom *shr1, *at;
1090    shr1 = assignNew('V', mce,ty, binop(opSHR, vbits, mkU8(1)));
1091    at   = assignNew('V', mce,ty, binop(opSUB, vbits, shr1));
1092    at   = assignNew('V', mce,ty, binop(opSAR, at, mkU8(sh)));
1093    return at;
1094 }
1095
1096
1097 /* --------- Accurate interpretation of CmpEQ/CmpNE. --------- */
1098 /*
1099    Normally, we can do CmpEQ/CmpNE by doing UifU on the arguments, and
1100    PCasting to Ity_U1.  However, sometimes it is necessary to be more
1101    accurate.  The insight is that the result is defined if two
1102    corresponding bits can be found, one from each argument, so that
1103    both bits are defined but are different -- that makes EQ say "No"
1104    and NE say "Yes".  Hence, we compute an improvement term and DifD
1105    it onto the "normal" (UifU) result.
1106
1107    The result is:
1108
1109    PCastTo<1> (
1110       -- naive version
1111       UifU<sz>(vxx, vyy)
1112
1113       `DifD<sz>`
1114
1115       -- improvement term
1116       OCast<sz>(vec)
1117    )
1118
1119    where
1120      vec contains 0 (defined) bits where the corresponding arg bits
1121      are defined but different, and 1 bits otherwise.
1122
1123      vec = Or<sz>( vxx,   // 0 iff bit defined
1124                    vyy,   // 0 iff bit defined
1125                    Not<sz>(Xor<sz>( xx, yy )) // 0 iff bits different
1126                  )
1127
1128      If any bit of vec is 0, the result is defined and so the
1129      improvement term should produce 0...0, else it should produce
1130      1...1.
1131
1132      Hence require for the improvement term:
1133
1134         OCast(vec) = if vec == 1...1 then 1...1 else 0...0
1135
1136      which you can think of as an "optimistic cast" (OCast, the opposite of
1137      the normal "pessimistic cast" (PCast) family.  An OCast says all bits
1138      are defined if any bit is defined.
1139
1140      It is possible to show that
1141
1142          if vec == 1...1 then 1...1 else 0...0
1143
1144      can be implemented in straight-line code as
1145
1146          (vec - (vec >>u 1)) >>s (word-size-in-bits - 1)
1147
1148    We note that vec contains the sub-term Or<sz>(vxx, vyy).  Since UifU is
1149    implemented with Or (since 1 signifies undefinedness), this is a
1150    duplicate of the UifU<sz>(vxx, vyy) term and so we can CSE it out, giving
1151    a final version of:
1152
1153    let naive = UifU<sz>(vxx, vyy)
1154        vec   = Or<sz>(naive, Not<sz>(Xor<sz)(xx, yy))
1155    in
1156        PCastTo<1>( DifD<sz>(naive, OCast<sz>(vec)) )
1157
1158    This was extensively re-analysed and checked on 6 July 05 and again
1159    in July 2017.
1160 */
1161 static IRAtom* expensiveCmpEQorNE ( MCEnv*  mce,
1162                                     IRType  ty,
1163                                     IRAtom* vxx, IRAtom* vyy,
1164                                     IRAtom* xx,  IRAtom* yy )
1165 {
1166    IRAtom *naive, *vec, *improved, *final_cast;
1167    IROp   opDIFD, opUIFU, opOR, opXOR, opNOT;
1168
1169    tl_assert(isShadowAtom(mce,vxx));
1170    tl_assert(isShadowAtom(mce,vyy));
1171    tl_assert(isOriginalAtom(mce,xx));
1172    tl_assert(isOriginalAtom(mce,yy));
1173    tl_assert(sameKindedAtoms(vxx,xx));
1174    tl_assert(sameKindedAtoms(vyy,yy));
1175
1176    switch (ty) {
1177       case Ity_I8:
1178          opDIFD = Iop_And8;
1179          opUIFU = Iop_Or8;
1180          opOR   = Iop_Or8;
1181          opXOR  = Iop_Xor8;
1182          opNOT  = Iop_Not8;
1183          break;
1184       case Ity_I16:
1185          opDIFD = Iop_And16;
1186          opUIFU = Iop_Or16;
1187          opOR   = Iop_Or16;
1188          opXOR  = Iop_Xor16;
1189          opNOT  = Iop_Not16;
1190          break;
1191       case Ity_I32:
1192          opDIFD = Iop_And32;
1193          opUIFU = Iop_Or32;
1194          opOR   = Iop_Or32;
1195          opXOR  = Iop_Xor32;
1196          opNOT  = Iop_Not32;
1197          break;
1198       case Ity_I64:
1199          opDIFD = Iop_And64;
1200          opUIFU = Iop_Or64;
1201          opOR   = Iop_Or64;
1202          opXOR  = Iop_Xor64;
1203          opNOT  = Iop_Not64;
1204          break;
1205       default:
1206          VG_(tool_panic)("expensiveCmpEQorNE");
1207    }
1208
1209    naive
1210       = assignNew('V', mce, ty, binop(opUIFU, vxx, vyy));
1211
1212    vec
1213       = assignNew(
1214            'V', mce,ty,
1215            binop( opOR,
1216                   naive,
1217                   assignNew(
1218                      'V', mce,ty,
1219                      unop(opNOT,
1220                           assignNew('V', mce,ty, binop(opXOR, xx, yy))))));
1221
1222    improved
1223       = assignNew( 'V', mce,ty,
1224                    binop(opDIFD, naive, mkOCastAt(mce, ty, vec)));
1225
1226    final_cast
1227       = mkPCastTo( mce, Ity_I1, improved );
1228
1229    return final_cast;
1230 }
1231
1232
1233 /* --------- Semi-accurate interpretation of CmpORD. --------- */
1234
1235 /* CmpORD32{S,U} does PowerPC-style 3-way comparisons:
1236
1237       CmpORD32S(x,y) = 1<<3   if  x <s y
1238                      = 1<<2   if  x >s y
1239                      = 1<<1   if  x == y
1240
1241    and similarly the unsigned variant.  The default interpretation is:
1242
1243       CmpORD32{S,U}#(x,y,x#,y#) = PCast(x# `UifU` y#)
1244                                   & (7<<1)
1245
1246    The "& (7<<1)" reflects the fact that all result bits except 3,2,1
1247    are zero and therefore defined (viz, zero).
1248
1249    Also deal with a special case better:
1250
1251       CmpORD32S(x,0)
1252
1253    Here, bit 3 (LT) of the result is a copy of the top bit of x and
1254    will be defined even if the rest of x isn't.  In which case we do:
1255
1256       CmpORD32S#(x,x#,0,{impliedly 0}#)
1257          = PCast(x#) & (3<<1)      -- standard interp for GT#,EQ#
1258            | (x# >>u 31) << 3      -- LT# = x#[31]
1259
1260    Analogous handling for CmpORD64{S,U}.
1261 */
1262 static Bool isZeroU32 ( IRAtom* e )
1263 {
1264    return
1265       toBool( e->tag == Iex_Const
1266               && e->Iex.Const.con->tag == Ico_U32
1267               && e->Iex.Const.con->Ico.U32 == 0 );
1268 }
1269
1270 static Bool isZeroU64 ( IRAtom* e )
1271 {
1272    return
1273       toBool( e->tag == Iex_Const
1274               && e->Iex.Const.con->tag == Ico_U64
1275               && e->Iex.Const.con->Ico.U64 == 0 );
1276 }
1277
1278 static IRAtom* doCmpORD ( MCEnv*  mce,
1279                           IROp    cmp_op,
1280                           IRAtom* xxhash, IRAtom* yyhash,
1281                           IRAtom* xx,     IRAtom* yy )
1282 {
1283    Bool   m64    = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD64U;
1284    Bool   syned  = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD32S;
1285    IROp   opOR   = m64 ? Iop_Or64  : Iop_Or32;
1286    IROp   opAND  = m64 ? Iop_And64 : Iop_And32;
1287    IROp   opSHL  = m64 ? Iop_Shl64 : Iop_Shl32;
1288    IROp   opSHR  = m64 ? Iop_Shr64 : Iop_Shr32;
1289    IRType ty     = m64 ? Ity_I64   : Ity_I32;
1290    Int    width  = m64 ? 64        : 32;
1291
1292    Bool (*isZero)(IRAtom*) = m64 ? isZeroU64 : isZeroU32;
1293
1294    IRAtom* threeLeft1 = NULL;
1295    IRAtom* sevenLeft1 = NULL;
1296
1297    tl_assert(isShadowAtom(mce,xxhash));
1298    tl_assert(isShadowAtom(mce,yyhash));
1299    tl_assert(isOriginalAtom(mce,xx));
1300    tl_assert(isOriginalAtom(mce,yy));
1301    tl_assert(sameKindedAtoms(xxhash,xx));
1302    tl_assert(sameKindedAtoms(yyhash,yy));
1303    tl_assert(cmp_op == Iop_CmpORD32S || cmp_op == Iop_CmpORD32U
1304              || cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD64U);
1305
1306    if (0) {
1307       ppIROp(cmp_op); VG_(printf)(" ");
1308       ppIRExpr(xx); VG_(printf)(" "); ppIRExpr( yy ); VG_(printf)("\n");
1309    }
1310
1311    if (syned && isZero(yy)) {
1312       /* fancy interpretation */
1313       /* if yy is zero, then it must be fully defined (zero#). */
1314       tl_assert(isZero(yyhash));
1315       threeLeft1 = m64 ? mkU64(3<<1) : mkU32(3<<1);
1316       return
1317          binop(
1318             opOR,
1319             assignNew(
1320                'V', mce,ty,
1321                binop(
1322                   opAND,
1323                   mkPCastTo(mce,ty, xxhash),
1324                   threeLeft1
1325                )),
1326             assignNew(
1327                'V', mce,ty,
1328                binop(
1329                   opSHL,
1330                   assignNew(
1331                      'V', mce,ty,
1332                      binop(opSHR, xxhash, mkU8(width-1))),
1333                   mkU8(3)
1334                ))
1335          );
1336    } else {
1337       /* standard interpretation */
1338       sevenLeft1 = m64 ? mkU64(7<<1) : mkU32(7<<1);
1339       return
1340          binop(
1341             opAND,
1342             mkPCastTo( mce,ty,
1343                        mkUifU(mce,ty, xxhash,yyhash)),
1344             sevenLeft1
1345          );
1346    }
1347 }
1348
1349
1350 /*------------------------------------------------------------*/
1351 /*--- Emit a test and complaint if something is undefined. ---*/
1352 /*------------------------------------------------------------*/
1353
1354 static IRAtom* schemeE ( MCEnv* mce, IRExpr* e ); /* fwds */
1355
1356
1357 /* Set the annotations on a dirty helper to indicate that the stack
1358    pointer and instruction pointers might be read.  This is the
1359    behaviour of all 'emit-a-complaint' style functions we might
1360    call. */
1361
1362 static void setHelperAnns ( MCEnv* mce, IRDirty* di ) {
1363    di->nFxState = 2;
1364    di->fxState[0].fx        = Ifx_Read;
1365    di->fxState[0].offset    = mce->layout->offset_SP;
1366    di->fxState[0].size      = mce->layout->sizeof_SP;
1367    di->fxState[0].nRepeats  = 0;
1368    di->fxState[0].repeatLen = 0;
1369    di->fxState[1].fx        = Ifx_Read;
1370    di->fxState[1].offset    = mce->layout->offset_IP;
1371    di->fxState[1].size      = mce->layout->sizeof_IP;
1372    di->fxState[1].nRepeats  = 0;
1373    di->fxState[1].repeatLen = 0;
1374 }
1375
1376
1377 /* Check the supplied *original* |atom| for undefinedness, and emit a
1378    complaint if so.  Once that happens, mark it as defined.  This is
1379    possible because the atom is either a tmp or literal.  If it's a
1380    tmp, it will be shadowed by a tmp, and so we can set the shadow to
1381    be defined.  In fact as mentioned above, we will have to allocate a
1382    new tmp to carry the new 'defined' shadow value, and update the
1383    original->tmp mapping accordingly; we cannot simply assign a new
1384    value to an existing shadow tmp as this breaks SSAness.
1385
1386    The checks are performed, any resulting complaint emitted, and
1387    |atom|'s shadow temp set to 'defined', ONLY in the case that
1388    |guard| evaluates to True at run-time.  If it evaluates to False
1389    then no action is performed.  If |guard| is NULL (the usual case)
1390    then it is assumed to be always-true, and hence these actions are
1391    performed unconditionally.
1392
1393    This routine does not generate code to check the definedness of
1394    |guard|.  The caller is assumed to have taken care of that already.
1395 */
1396 static void complainIfUndefined ( MCEnv* mce, IRAtom* atom, IRExpr *guard )
1397 {
1398    IRAtom*  vatom;
1399    IRType   ty;
1400    Int      sz;
1401    IRDirty* di;
1402    IRAtom*  cond;
1403    IRAtom*  origin;
1404    void*    fn;
1405    const HChar* nm;
1406    IRExpr** args;
1407    Int      nargs;
1408
1409    // Don't do V bit tests if we're not reporting undefined value errors.
1410    if (MC_(clo_mc_level) == 1)
1411       return;
1412
1413    if (guard)
1414       tl_assert(isOriginalAtom(mce, guard));
1415
1416    /* Since the original expression is atomic, there's no duplicated
1417       work generated by making multiple V-expressions for it.  So we
1418       don't really care about the possibility that someone else may
1419       also create a V-interpretion for it. */
1420    tl_assert(isOriginalAtom(mce, atom));
1421    vatom = expr2vbits( mce, atom, HuOth );
1422    tl_assert(isShadowAtom(mce, vatom));
1423    tl_assert(sameKindedAtoms(atom, vatom));
1424
1425    ty = typeOfIRExpr(mce->sb->tyenv, vatom);
1426
1427    /* sz is only used for constructing the error message */
1428    sz = ty==Ity_I1 ? 0 : sizeofIRType(ty);
1429
1430    cond = mkPCastTo( mce, Ity_I1, vatom );
1431    /* cond will be 0 if all defined, and 1 if any not defined. */
1432
1433    /* Get the origin info for the value we are about to check.  At
1434       least, if we are doing origin tracking.  If not, use a dummy
1435       zero origin. */
1436    if (MC_(clo_mc_level) == 3) {
1437       origin = schemeE( mce, atom );
1438       if (mce->hWordTy == Ity_I64) {
1439          origin = assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, origin) );
1440       }
1441    } else {
1442       origin = NULL;
1443    }
1444
1445    fn    = NULL;
1446    nm    = NULL;
1447    args  = NULL;
1448    nargs = -1;
1449
1450    switch (sz) {
1451       case 0:
1452          if (origin) {
1453             fn    = &MC_(helperc_value_check0_fail_w_o);
1454             nm    = "MC_(helperc_value_check0_fail_w_o)";
1455             args  = mkIRExprVec_1(origin);
1456             nargs = 1;
1457          } else {
1458             fn    = &MC_(helperc_value_check0_fail_no_o);
1459             nm    = "MC_(helperc_value_check0_fail_no_o)";
1460             args  = mkIRExprVec_0();
1461             nargs = 0;
1462          }
1463          break;
1464       case 1:
1465          if (origin) {
1466             fn    = &MC_(helperc_value_check1_fail_w_o);
1467             nm    = "MC_(helperc_value_check1_fail_w_o)";
1468             args  = mkIRExprVec_1(origin);
1469             nargs = 1;
1470          } else {
1471             fn    = &MC_(helperc_value_check1_fail_no_o);
1472             nm    = "MC_(helperc_value_check1_fail_no_o)";
1473             args  = mkIRExprVec_0();
1474             nargs = 0;
1475          }
1476          break;
1477       case 4:
1478          if (origin) {
1479             fn    = &MC_(helperc_value_check4_fail_w_o);
1480             nm    = "MC_(helperc_value_check4_fail_w_o)";
1481             args  = mkIRExprVec_1(origin);
1482             nargs = 1;
1483          } else {
1484             fn    = &MC_(helperc_value_check4_fail_no_o);
1485             nm    = "MC_(helperc_value_check4_fail_no_o)";
1486             args  = mkIRExprVec_0();
1487             nargs = 0;
1488          }
1489          break;
1490       case 8:
1491          if (origin) {
1492             fn    = &MC_(helperc_value_check8_fail_w_o);
1493             nm    = "MC_(helperc_value_check8_fail_w_o)";
1494             args  = mkIRExprVec_1(origin);
1495             nargs = 1;
1496          } else {
1497             fn    = &MC_(helperc_value_check8_fail_no_o);
1498             nm    = "MC_(helperc_value_check8_fail_no_o)";
1499             args  = mkIRExprVec_0();
1500             nargs = 0;
1501          }
1502          break;
1503       case 2:
1504       case 16:
1505          if (origin) {
1506             fn    = &MC_(helperc_value_checkN_fail_w_o);
1507             nm    = "MC_(helperc_value_checkN_fail_w_o)";
1508             args  = mkIRExprVec_2( mkIRExpr_HWord( sz ), origin);
1509             nargs = 2;
1510          } else {
1511             fn    = &MC_(helperc_value_checkN_fail_no_o);
1512             nm    = "MC_(helperc_value_checkN_fail_no_o)";
1513             args  = mkIRExprVec_1( mkIRExpr_HWord( sz ) );
1514             nargs = 1;
1515          }
1516          break;
1517       default:
1518          VG_(tool_panic)("unexpected szB");
1519    }
1520
1521    tl_assert(fn);
1522    tl_assert(nm);
1523    tl_assert(args);
1524    tl_assert(nargs >= 0 && nargs <= 2);
1525    tl_assert( (MC_(clo_mc_level) == 3 && origin != NULL)
1526               || (MC_(clo_mc_level) == 2 && origin == NULL) );
1527
1528    di = unsafeIRDirty_0_N( nargs/*regparms*/, nm,
1529                            VG_(fnptr_to_fnentry)( fn ), args );
1530    di->guard = cond; // and cond is PCast-to-1(atom#)
1531
1532    /* If the complaint is to be issued under a guard condition, AND
1533       that into the guard condition for the helper call. */
1534    if (guard) {
1535       IRAtom *g1 = assignNew('V', mce, Ity_I32, unop(Iop_1Uto32, di->guard));
1536       IRAtom *g2 = assignNew('V', mce, Ity_I32, unop(Iop_1Uto32, guard));
1537       IRAtom *e  = assignNew('V', mce, Ity_I32, binop(Iop_And32, g1, g2));
1538       di->guard  = assignNew('V', mce, Ity_I1,  unop(Iop_32to1, e));
1539    }
1540
1541    setHelperAnns( mce, di );
1542    stmt( 'V', mce, IRStmt_Dirty(di));
1543
1544    /* If |atom| is shadowed by an IRTemp, set the shadow tmp to be
1545       defined -- but only in the case where the guard evaluates to
1546       True at run-time.  Do the update by setting the orig->shadow
1547       mapping for tmp to reflect the fact that this shadow is getting
1548       a new value. */
1549    tl_assert(isIRAtom(vatom));
1550    /* sameKindedAtoms ... */
1551    if (vatom->tag == Iex_RdTmp) {
1552       tl_assert(atom->tag == Iex_RdTmp);
1553       if (guard == NULL) {
1554          // guard is 'always True', hence update unconditionally
1555          newShadowTmpV(mce, atom->Iex.RdTmp.tmp);
1556          assign('V', mce, findShadowTmpV(mce, atom->Iex.RdTmp.tmp),
1557                           definedOfType(ty));
1558       } else {
1559          // update the temp only conditionally.  Do this by copying
1560          // its old value when the guard is False.
1561          // The old value ..
1562          IRTemp old_tmpV = findShadowTmpV(mce, atom->Iex.RdTmp.tmp);
1563          newShadowTmpV(mce, atom->Iex.RdTmp.tmp);
1564          IRAtom* new_tmpV
1565             = assignNew('V', mce, shadowTypeV(ty),
1566                         IRExpr_ITE(guard, definedOfType(ty),
1567                                           mkexpr(old_tmpV)));
1568          assign('V', mce, findShadowTmpV(mce, atom->Iex.RdTmp.tmp), new_tmpV);
1569       }
1570    }
1571 }
1572
1573
1574 /*------------------------------------------------------------*/
1575 /*--- Shadowing PUTs/GETs, and indexed variants thereof    ---*/
1576 /*------------------------------------------------------------*/
1577
1578 /* Examine the always-defined sections declared in layout to see if
1579    the (offset,size) section is within one.  Note, is is an error to
1580    partially fall into such a region: (offset,size) should either be
1581    completely in such a region or completely not-in such a region.
1582 */
1583 static Bool isAlwaysDefd ( MCEnv* mce, Int offset, Int size )
1584 {
1585    Int minoffD, maxoffD, i;
1586    Int minoff = offset;
1587    Int maxoff = minoff + size - 1;
1588    tl_assert((minoff & ~0xFFFF) == 0);
1589    tl_assert((maxoff & ~0xFFFF) == 0);
1590
1591    for (i = 0; i < mce->layout->n_alwaysDefd; i++) {
1592       minoffD = mce->layout->alwaysDefd[i].offset;
1593       maxoffD = minoffD + mce->layout->alwaysDefd[i].size - 1;
1594       tl_assert((minoffD & ~0xFFFF) == 0);
1595       tl_assert((maxoffD & ~0xFFFF) == 0);
1596
1597       if (maxoff < minoffD || maxoffD < minoff)
1598          continue; /* no overlap */
1599       if (minoff >= minoffD && maxoff <= maxoffD)
1600          return True; /* completely contained in an always-defd section */
1601
1602       VG_(tool_panic)("memcheck:isAlwaysDefd:partial overlap");
1603    }
1604    return False; /* could not find any containing section */
1605 }
1606
1607
1608 /* Generate into bb suitable actions to shadow this Put.  If the state
1609    slice is marked 'always defined', do nothing.  Otherwise, write the
1610    supplied V bits to the shadow state.  We can pass in either an
1611    original atom or a V-atom, but not both.  In the former case the
1612    relevant V-bits are then generated from the original.
1613    We assume here, that the definedness of GUARD has already been checked.
1614 */
1615 static
1616 void do_shadow_PUT ( MCEnv* mce,  Int offset,
1617                      IRAtom* atom, IRAtom* vatom, IRExpr *guard )
1618 {
1619    IRType ty;
1620
1621    // Don't do shadow PUTs if we're not doing undefined value checking.
1622    // Their absence lets Vex's optimiser remove all the shadow computation
1623    // that they depend on, which includes GETs of the shadow registers.
1624    if (MC_(clo_mc_level) == 1)
1625       return;
1626
1627    if (atom) {
1628       tl_assert(!vatom);
1629       tl_assert(isOriginalAtom(mce, atom));
1630       vatom = expr2vbits( mce, atom, HuOth );
1631    } else {
1632       tl_assert(vatom);
1633       tl_assert(isShadowAtom(mce, vatom));
1634    }
1635
1636    ty = typeOfIRExpr(mce->sb->tyenv, vatom);
1637    tl_assert(ty != Ity_I1);
1638    if (isAlwaysDefd(mce, offset, sizeofIRType(ty))) {
1639       /* later: no ... */
1640       /* emit code to emit a complaint if any of the vbits are 1. */
1641       /* complainIfUndefined(mce, atom); */
1642    } else {
1643       /* Do a plain shadow Put. */
1644       if (guard) {
1645          /* If the guard expression evaluates to false we simply Put the value
1646             that is already stored in the guest state slot */
1647          IRAtom *cond, *iffalse;
1648
1649          cond    = assignNew('V', mce, Ity_I1, guard);
1650          iffalse = assignNew('V', mce, ty,
1651                              IRExpr_Get(offset + mce->layout->total_sizeB, ty));
1652          vatom   = assignNew('V', mce, ty, IRExpr_ITE(cond, vatom, iffalse));
1653       }
1654       stmt( 'V', mce, IRStmt_Put( offset + mce->layout->total_sizeB, vatom ));
1655    }
1656 }
1657
1658
1659 /* Return an expression which contains the V bits corresponding to the
1660    given GETI (passed in in pieces).
1661 */
1662 static
1663 void do_shadow_PUTI ( MCEnv* mce, IRPutI *puti)
1664 {
1665    IRAtom* vatom;
1666    IRType  ty, tyS;
1667    Int     arrSize;;
1668    IRRegArray* descr = puti->descr;
1669    IRAtom*     ix    = puti->ix;
1670    Int         bias  = puti->bias;
1671    IRAtom*     atom  = puti->data;
1672
1673    // Don't do shadow PUTIs if we're not doing undefined value checking.
1674    // Their absence lets Vex's optimiser remove all the shadow computation
1675    // that they depend on, which includes GETIs of the shadow registers.
1676    if (MC_(clo_mc_level) == 1)
1677       return;
1678
1679    tl_assert(isOriginalAtom(mce,atom));
1680    vatom = expr2vbits( mce, atom, HuOth );
1681    tl_assert(sameKindedAtoms(atom, vatom));
1682    ty   = descr->elemTy;
1683    tyS  = shadowTypeV(ty);
1684    arrSize = descr->nElems * sizeofIRType(ty);
1685    tl_assert(ty != Ity_I1);
1686    tl_assert(isOriginalAtom(mce,ix));
1687    complainIfUndefined(mce, ix, NULL);
1688    if (isAlwaysDefd(mce, descr->base, arrSize)) {
1689       /* later: no ... */
1690       /* emit code to emit a complaint if any of the vbits are 1. */
1691       /* complainIfUndefined(mce, atom); */
1692    } else {
1693       /* Do a cloned version of the Put that refers to the shadow
1694          area. */
1695       IRRegArray* new_descr
1696          = mkIRRegArray( descr->base + mce->layout->total_sizeB,
1697                          tyS, descr->nElems);
1698       stmt( 'V', mce, IRStmt_PutI( mkIRPutI(new_descr, ix, bias, vatom) ));
1699    }
1700 }
1701
1702
1703 /* Return an expression which contains the V bits corresponding to the
1704    given GET (passed in in pieces).
1705 */
1706 static
1707 IRExpr* shadow_GET ( MCEnv* mce, Int offset, IRType ty )
1708 {
1709    IRType tyS = shadowTypeV(ty);
1710    tl_assert(ty != Ity_I1);
1711    tl_assert(ty != Ity_I128);
1712    if (isAlwaysDefd(mce, offset, sizeofIRType(ty))) {
1713       /* Always defined, return all zeroes of the relevant type */
1714       return definedOfType(tyS);
1715    } else {
1716       /* return a cloned version of the Get that refers to the shadow
1717          area. */
1718       /* FIXME: this isn't an atom! */
1719       return IRExpr_Get( offset + mce->layout->total_sizeB, tyS );
1720    }
1721 }
1722
1723
1724 /* Return an expression which contains the V bits corresponding to the
1725    given GETI (passed in in pieces).
1726 */
1727 static
1728 IRExpr* shadow_GETI ( MCEnv* mce,
1729                       IRRegArray* descr, IRAtom* ix, Int bias )
1730 {
1731    IRType ty   = descr->elemTy;
1732    IRType tyS  = shadowTypeV(ty);
1733    Int arrSize = descr->nElems * sizeofIRType(ty);
1734    tl_assert(ty != Ity_I1);
1735    tl_assert(isOriginalAtom(mce,ix));
1736    complainIfUndefined(mce, ix, NULL);
1737    if (isAlwaysDefd(mce, descr->base, arrSize)) {
1738       /* Always defined, return all zeroes of the relevant type */
1739       return definedOfType(tyS);
1740    } else {
1741       /* return a cloned version of the Get that refers to the shadow
1742          area. */
1743       IRRegArray* new_descr
1744          = mkIRRegArray( descr->base + mce->layout->total_sizeB,
1745                          tyS, descr->nElems);
1746       return IRExpr_GetI( new_descr, ix, bias );
1747    }
1748 }
1749
1750
1751 /*------------------------------------------------------------*/
1752 /*--- Generating approximations for unknown operations,    ---*/
1753 /*--- using lazy-propagate semantics                       ---*/
1754 /*------------------------------------------------------------*/
1755
1756 /* Lazy propagation of undefinedness from two values, resulting in the
1757    specified shadow type.
1758 */
1759 static
1760 IRAtom* mkLazy2 ( MCEnv* mce, IRType finalVty, IRAtom* va1, IRAtom* va2 )
1761 {
1762    IRAtom* at;
1763    IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
1764    IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
1765    tl_assert(isShadowAtom(mce,va1));
1766    tl_assert(isShadowAtom(mce,va2));
1767
1768    /* The general case is inefficient because PCast is an expensive
1769       operation.  Here are some special cases which use PCast only
1770       once rather than twice. */
1771
1772    /* I64 x I64 -> I64 */
1773    if (t1 == Ity_I64 && t2 == Ity_I64 && finalVty == Ity_I64) {
1774       if (0) VG_(printf)("mkLazy2: I64 x I64 -> I64\n");
1775       at = mkUifU(mce, Ity_I64, va1, va2);
1776       at = mkPCastTo(mce, Ity_I64, at);
1777       return at;
1778    }
1779
1780    /* I64 x I64 -> I32 */
1781    if (t1 == Ity_I64 && t2 == Ity_I64 && finalVty == Ity_I32) {
1782       if (0) VG_(printf)("mkLazy2: I64 x I64 -> I32\n");
1783       at = mkUifU(mce, Ity_I64, va1, va2);
1784       at = mkPCastTo(mce, Ity_I32, at);
1785       return at;
1786    }
1787
1788    /* I32 x I32 -> I32 */
1789    if (t1 == Ity_I32 && t2 == Ity_I32 && finalVty == Ity_I32) {
1790       if (0) VG_(printf)("mkLazy2: I32 x I32 -> I32\n");
1791       at = mkUifU(mce, Ity_I32, va1, va2);
1792       at = mkPCastTo(mce, Ity_I32, at);
1793       return at;
1794    }
1795
1796    if (0) {
1797       VG_(printf)("mkLazy2 ");
1798       ppIRType(t1);
1799       VG_(printf)("_");
1800       ppIRType(t2);
1801       VG_(printf)("_");
1802       ppIRType(finalVty);
1803       VG_(printf)("\n");
1804    }
1805
1806    /* General case: force everything via 32-bit intermediaries. */
1807    at = mkPCastTo(mce, Ity_I32, va1);
1808    at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va2));
1809    at = mkPCastTo(mce, finalVty, at);
1810    return at;
1811 }
1812
1813
1814 /* 3-arg version of the above. */
1815 static
1816 IRAtom* mkLazy3 ( MCEnv* mce, IRType finalVty,
1817                   IRAtom* va1, IRAtom* va2, IRAtom* va3 )
1818 {
1819    IRAtom* at;
1820    IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
1821    IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
1822    IRType t3 = typeOfIRExpr(mce->sb->tyenv, va3);
1823    tl_assert(isShadowAtom(mce,va1));
1824    tl_assert(isShadowAtom(mce,va2));
1825    tl_assert(isShadowAtom(mce,va3));
1826
1827    /* The general case is inefficient because PCast is an expensive
1828       operation.  Here are some special cases which use PCast only
1829       twice rather than three times. */
1830
1831    /* I32 x I64 x I64 -> I64 */
1832    /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
1833    if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64
1834        && finalVty == Ity_I64) {
1835       if (0) VG_(printf)("mkLazy3: I32 x I64 x I64 -> I64\n");
1836       /* Widen 1st arg to I64.  Since 1st arg is typically a rounding
1837          mode indication which is fully defined, this should get
1838          folded out later. */
1839       at = mkPCastTo(mce, Ity_I64, va1);
1840       /* Now fold in 2nd and 3rd args. */
1841       at = mkUifU(mce, Ity_I64, at, va2);
1842       at = mkUifU(mce, Ity_I64, at, va3);
1843       /* and PCast once again. */
1844       at = mkPCastTo(mce, Ity_I64, at);
1845       return at;
1846    }
1847
1848    /* I32 x I8 x I64 -> I64 */
1849    if (t1 == Ity_I32 && t2 == Ity_I8 && t3 == Ity_I64
1850        && finalVty == Ity_I64) {
1851       if (0) VG_(printf)("mkLazy3: I32 x I8 x I64 -> I64\n");
1852       /* Widen 1st and 2nd args to I64.  Since 1st arg is typically a
1853        * rounding mode indication which is fully defined, this should
1854        * get folded out later.
1855       */
1856       IRAtom* at1 = mkPCastTo(mce, Ity_I64, va1);
1857       IRAtom* at2 = mkPCastTo(mce, Ity_I64, va2);
1858       at = mkUifU(mce, Ity_I64, at1, at2);  // UifU(PCast(va1), PCast(va2))
1859       at = mkUifU(mce, Ity_I64, at, va3);
1860       /* and PCast once again. */
1861       at = mkPCastTo(mce, Ity_I64, at);
1862       return at;
1863    }
1864
1865    /* I32 x I64 x I64 -> I32 */
1866    if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64
1867        && finalVty == Ity_I32) {
1868       if (0) VG_(printf)("mkLazy3: I32 x I64 x I64 -> I32\n");
1869       at = mkPCastTo(mce, Ity_I64, va1);
1870       at = mkUifU(mce, Ity_I64, at, va2);
1871       at = mkUifU(mce, Ity_I64, at, va3);
1872       at = mkPCastTo(mce, Ity_I32, at);
1873       return at;
1874    }
1875
1876    /* I32 x I32 x I32 -> I32 */
1877    /* 32-bit FP idiom, as (eg) happens on ARM */
1878    if (t1 == Ity_I32 && t2 == Ity_I32 && t3 == Ity_I32
1879        && finalVty == Ity_I32) {
1880       if (0) VG_(printf)("mkLazy3: I32 x I32 x I32 -> I32\n");
1881       at = va1;
1882       at = mkUifU(mce, Ity_I32, at, va2);
1883       at = mkUifU(mce, Ity_I32, at, va3);
1884       at = mkPCastTo(mce, Ity_I32, at);
1885       return at;
1886    }
1887
1888    /* I32 x I128 x I128 -> I128 */
1889    /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
1890    if (t1 == Ity_I32 && t2 == Ity_I128 && t3 == Ity_I128
1891        && finalVty == Ity_I128) {
1892       if (0) VG_(printf)("mkLazy3: I32 x I128 x I128 -> I128\n");
1893       /* Widen 1st arg to I128.  Since 1st arg is typically a rounding
1894          mode indication which is fully defined, this should get
1895          folded out later. */
1896       at = mkPCastTo(mce, Ity_I128, va1);
1897       /* Now fold in 2nd and 3rd args. */
1898       at = mkUifU(mce, Ity_I128, at, va2);
1899       at = mkUifU(mce, Ity_I128, at, va3);
1900       /* and PCast once again. */
1901       at = mkPCastTo(mce, Ity_I128, at);
1902       return at;
1903    }
1904
1905    /* I32 x I8 x I128 -> I128 */
1906    /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
1907    if (t1 == Ity_I32 && t2 == Ity_I8 && t3 == Ity_I128
1908        && finalVty == Ity_I128) {
1909       if (0) VG_(printf)("mkLazy3: I32 x I8 x I128 -> I128\n");
1910       /* Use I64 as an intermediate type, which means PCasting all 3
1911          args to I64 to start with. 1st arg is typically a rounding
1912          mode indication which is fully defined, so we hope that it
1913          will get folded out later. */
1914       IRAtom* at1 = mkPCastTo(mce, Ity_I64, va1);
1915       IRAtom* at2 = mkPCastTo(mce, Ity_I64, va2);
1916       IRAtom* at3 = mkPCastTo(mce, Ity_I64, va3);
1917       /* Now UifU all three together. */
1918       at = mkUifU(mce, Ity_I64, at1, at2);  // UifU(PCast(va1), PCast(va2))
1919       at = mkUifU(mce, Ity_I64, at, at3);   // ... `UifU` PCast(va3)
1920       /* and PCast once again. */
1921       at = mkPCastTo(mce, Ity_I128, at);
1922       return at;
1923    }
1924    if (1) {
1925       VG_(printf)("mkLazy3: ");
1926       ppIRType(t1);
1927       VG_(printf)(" x ");
1928       ppIRType(t2);
1929       VG_(printf)(" x ");
1930       ppIRType(t3);
1931       VG_(printf)(" -> ");
1932       ppIRType(finalVty);
1933       VG_(printf)("\n");
1934    }
1935
1936    tl_assert(0);
1937    /* General case: force everything via 32-bit intermediaries. */
1938    /*
1939    at = mkPCastTo(mce, Ity_I32, va1);
1940    at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va2));
1941    at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va3));
1942    at = mkPCastTo(mce, finalVty, at);
1943    return at;
1944    */
1945 }
1946
1947
1948 /* 4-arg version of the above. */
1949 static
1950 IRAtom* mkLazy4 ( MCEnv* mce, IRType finalVty,
1951                   IRAtom* va1, IRAtom* va2, IRAtom* va3, IRAtom* va4 )
1952 {
1953    IRAtom* at;
1954    IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
1955    IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
1956    IRType t3 = typeOfIRExpr(mce->sb->tyenv, va3);
1957    IRType t4 = typeOfIRExpr(mce->sb->tyenv, va4);
1958    tl_assert(isShadowAtom(mce,va1));
1959    tl_assert(isShadowAtom(mce,va2));
1960    tl_assert(isShadowAtom(mce,va3));
1961    tl_assert(isShadowAtom(mce,va4));
1962
1963    /* The general case is inefficient because PCast is an expensive
1964       operation.  Here are some special cases which use PCast only
1965       twice rather than three times. */
1966
1967    /* Standard FP idiom: rm x FParg1 x FParg2 x FParg3 -> FPresult */
1968
1969    if (t1 == Ity_I32 && t2 == Ity_I128 && t3 == Ity_I128 && t4 == Ity_I128
1970        && finalVty == Ity_I128) {
1971       if (0) VG_(printf)("mkLazy4: I32 x I128 x I128 x I128 -> I128\n");
1972       /* Widen 1st arg to I128.  Since 1st arg is typically a rounding
1973          mode indication which is fully defined, this should get
1974          folded out later. */
1975       at = mkPCastTo(mce, Ity_I128, va1);
1976       /* Now fold in 2nd, 3rd, 4th args. */
1977       at = mkUifU(mce, Ity_I128, at, va2);
1978       at = mkUifU(mce, Ity_I128, at, va3);
1979       at = mkUifU(mce, Ity_I128, at, va4);
1980       /* and PCast once again. */
1981       at = mkPCastTo(mce, Ity_I128, at);
1982       return at;
1983    }
1984
1985    /* I32 x I64 x I64 x I64 -> I64 */
1986    if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64 && t4 == Ity_I64
1987        && finalVty == Ity_I64) {
1988       if (0) VG_(printf)("mkLazy4: I32 x I64 x I64 x I64 -> I64\n");
1989       /* Widen 1st arg to I64.  Since 1st arg is typically a rounding
1990          mode indication which is fully defined, this should get
1991          folded out later. */
1992       at = mkPCastTo(mce, Ity_I64, va1);
1993       /* Now fold in 2nd, 3rd, 4th args. */
1994       at = mkUifU(mce, Ity_I64, at, va2);
1995       at = mkUifU(mce, Ity_I64, at, va3);
1996       at = mkUifU(mce, Ity_I64, at, va4);
1997       /* and PCast once again. */
1998       at = mkPCastTo(mce, Ity_I64, at);
1999       return at;
2000    }
2001    /* I32 x I32 x I32 x I32 -> I32 */
2002    /* Standard FP idiom: rm x FParg1 x FParg2 x FParg3 -> FPresult */
2003    if (t1 == Ity_I32 && t2 == Ity_I32 && t3 == Ity_I32 && t4 == Ity_I32
2004        && finalVty == Ity_I32) {
2005       if (0) VG_(printf)("mkLazy4: I32 x I32 x I32 x I32 -> I32\n");
2006       at = va1;
2007       /* Now fold in 2nd, 3rd, 4th args. */
2008       at = mkUifU(mce, Ity_I32, at, va2);
2009       at = mkUifU(mce, Ity_I32, at, va3);
2010       at = mkUifU(mce, Ity_I32, at, va4);
2011       at = mkPCastTo(mce, Ity_I32, at);
2012       return at;
2013    }
2014
2015    if (t1 == Ity_I32 && t2 == Ity_I8 && t3 == Ity_I8 && t4 == Ity_I8
2016        && finalVty == Ity_I32) {
2017       if (0) VG_(printf)("mkLazy4: I32 x I8 x I8 x I8 -> I32\n");
2018       at = mkPCastTo(mce, Ity_I8, va1);
2019       /* Now fold in 2nd, 3rd, 4th args. */
2020       at = mkUifU(mce, Ity_I8, at, va2);
2021       at = mkUifU(mce, Ity_I8, at, va3);
2022       at = mkUifU(mce, Ity_I8, at, va4);
2023       at = mkPCastTo(mce, Ity_I32, at);
2024       return at;
2025    }
2026
2027    if (t1 == Ity_I64 && t2 == Ity_I8 && t3 == Ity_I8 && t4 == Ity_I8
2028        && finalVty == Ity_I64) {
2029       if (0) VG_(printf)("mkLazy4: I64 x I8 x I8 x I8 -> I64\n");
2030       at = mkPCastTo(mce, Ity_I8, va1);
2031       /* Now fold in 2nd, 3rd, 4th args. */
2032       at = mkUifU(mce, Ity_I8, at, va2);
2033       at = mkUifU(mce, Ity_I8, at, va3);
2034       at = mkUifU(mce, Ity_I8, at, va4);
2035       at = mkPCastTo(mce, Ity_I64, at);
2036       return at;
2037    }
2038
2039    if (1) {
2040       VG_(printf)("mkLazy4: ");
2041       ppIRType(t1);
2042       VG_(printf)(" x ");
2043       ppIRType(t2);
2044       VG_(printf)(" x ");
2045       ppIRType(t3);
2046       VG_(printf)(" x ");
2047       ppIRType(t4);
2048       VG_(printf)(" -> ");
2049       ppIRType(finalVty);
2050       VG_(printf)("\n");
2051    }
2052
2053    tl_assert(0);
2054 }
2055
2056
2057 /* Do the lazy propagation game from a null-terminated vector of
2058    atoms.  This is presumably the arguments to a helper call, so the
2059    IRCallee info is also supplied in order that we can know which
2060    arguments should be ignored (via the .mcx_mask field).
2061 */
2062 static
2063 IRAtom* mkLazyN ( MCEnv* mce,
2064                   IRAtom** exprvec, IRType finalVtype, IRCallee* cee )
2065 {
2066    Int     i;
2067    IRAtom* here;
2068    IRAtom* curr;
2069    IRType  mergeTy;
2070    Bool    mergeTy64 = True;
2071
2072    /* Decide on the type of the merge intermediary.  If all relevant
2073       args are I64, then it's I64.  In all other circumstances, use
2074       I32. */
2075    for (i = 0; exprvec[i]; i++) {
2076       tl_assert(i < 32);
2077       tl_assert(isOriginalAtom(mce, exprvec[i]));
2078       if (cee->mcx_mask & (1<<i))
2079          continue;
2080       if (typeOfIRExpr(mce->sb->tyenv, exprvec[i]) != Ity_I64)
2081          mergeTy64 = False;
2082    }
2083
2084    mergeTy = mergeTy64  ? Ity_I64  : Ity_I32;
2085    curr    = definedOfType(mergeTy);
2086
2087    for (i = 0; exprvec[i]; i++) {
2088       tl_assert(i < 32);
2089       tl_assert(isOriginalAtom(mce, exprvec[i]));
2090       /* Only take notice of this arg if the callee's mc-exclusion
2091          mask does not say it is to be excluded. */
2092       if (cee->mcx_mask & (1<<i)) {
2093          /* the arg is to be excluded from definedness checking.  Do
2094             nothing. */
2095          if (0) VG_(printf)("excluding %s(%d)\n", cee->name, i);
2096       } else {
2097          /* calculate the arg's definedness, and pessimistically merge
2098             it in. */
2099          here = mkPCastTo( mce, mergeTy, expr2vbits(mce, exprvec[i], HuOth) );
2100          curr = mergeTy64
2101                    ? mkUifU64(mce, here, curr)
2102                    : mkUifU32(mce, here, curr);
2103       }
2104    }
2105    return mkPCastTo(mce, finalVtype, curr );
2106 }
2107
2108
2109 /*------------------------------------------------------------*/
2110 /*--- Generating expensive sequences for exact carry-chain ---*/
2111 /*--- propagation in add/sub and related operations.       ---*/
2112 /*------------------------------------------------------------*/
2113
2114 static
2115 IRAtom* expensiveAddSub ( MCEnv*  mce,
2116                           Bool    add,
2117                           IRType  ty,
2118                           IRAtom* qaa, IRAtom* qbb,
2119                           IRAtom* aa,  IRAtom* bb )
2120 {
2121    IRAtom *a_min, *b_min, *a_max, *b_max;
2122    IROp   opAND, opOR, opXOR, opNOT, opADD, opSUB;
2123
2124    tl_assert(isShadowAtom(mce,qaa));
2125    tl_assert(isShadowAtom(mce,qbb));
2126    tl_assert(isOriginalAtom(mce,aa));
2127    tl_assert(isOriginalAtom(mce,bb));
2128    tl_assert(sameKindedAtoms(qaa,aa));
2129    tl_assert(sameKindedAtoms(qbb,bb));
2130
2131    switch (ty) {
2132       case Ity_I32:
2133          opAND = Iop_And32;
2134          opOR  = Iop_Or32;
2135          opXOR = Iop_Xor32;
2136          opNOT = Iop_Not32;
2137          opADD = Iop_Add32;
2138          opSUB = Iop_Sub32;
2139          break;
2140       case Ity_I64:
2141          opAND = Iop_And64;
2142          opOR  = Iop_Or64;
2143          opXOR = Iop_Xor64;
2144          opNOT = Iop_Not64;
2145          opADD = Iop_Add64;
2146          opSUB = Iop_Sub64;
2147          break;
2148       default:
2149          VG_(tool_panic)("expensiveAddSub");
2150    }
2151
2152    // a_min = aa & ~qaa
2153    a_min = assignNew('V', mce,ty,
2154                      binop(opAND, aa,
2155                                   assignNew('V', mce,ty, unop(opNOT, qaa))));
2156
2157    // b_min = bb & ~qbb
2158    b_min = assignNew('V', mce,ty,
2159                      binop(opAND, bb,
2160                                   assignNew('V', mce,ty, unop(opNOT, qbb))));
2161
2162    // a_max = aa | qaa
2163    a_max = assignNew('V', mce,ty, binop(opOR, aa, qaa));
2164
2165    // b_max = bb | qbb
2166    b_max = assignNew('V', mce,ty, binop(opOR, bb, qbb));
2167
2168    if (add) {
2169       // result = (qaa | qbb) | ((a_min + b_min) ^ (a_max + b_max))
2170       return
2171       assignNew('V', mce,ty,
2172          binop( opOR,
2173                 assignNew('V', mce,ty, binop(opOR, qaa, qbb)),
2174                 assignNew('V', mce,ty,
2175                    binop( opXOR,
2176                           assignNew('V', mce,ty, binop(opADD, a_min, b_min)),
2177                           assignNew('V', mce,ty, binop(opADD, a_max, b_max))
2178                    )
2179                 )
2180          )
2181       );
2182    } else {
2183       // result = (qaa | qbb) | ((a_min - b_max) ^ (a_max - b_min))
2184       return
2185       assignNew('V', mce,ty,
2186          binop( opOR,
2187                 assignNew('V', mce,ty, binop(opOR, qaa, qbb)),
2188                 assignNew('V', mce,ty,
2189                    binop( opXOR,
2190                           assignNew('V', mce,ty, binop(opSUB, a_min, b_max)),
2191                           assignNew('V', mce,ty, binop(opSUB, a_max, b_min))
2192                    )
2193                 )
2194          )
2195       );
2196    }
2197
2198 }
2199
2200
2201 static
2202 IRAtom* expensiveCountTrailingZeroes ( MCEnv* mce, IROp czop,
2203                                        IRAtom* atom, IRAtom* vatom )
2204 {
2205    IRType ty;
2206    IROp xorOp, subOp, andOp;
2207    IRExpr *one;
2208    IRAtom *improver, *improved;
2209    tl_assert(isShadowAtom(mce,vatom));
2210    tl_assert(isOriginalAtom(mce,atom));
2211    tl_assert(sameKindedAtoms(atom,vatom));
2212
2213    switch (czop) {
2214       case Iop_Ctz32:
2215          ty = Ity_I32;
2216          xorOp = Iop_Xor32;
2217          subOp = Iop_Sub32;
2218          andOp = Iop_And32;
2219          one = mkU32(1);
2220          break;
2221       case Iop_Ctz64:
2222          ty = Ity_I64;
2223          xorOp = Iop_Xor64;
2224          subOp = Iop_Sub64;
2225          andOp = Iop_And64;
2226          one = mkU64(1);
2227          break;
2228       default:
2229          ppIROp(czop);
2230          VG_(tool_panic)("memcheck:expensiveCountTrailingZeroes");
2231    }
2232
2233    // improver = atom ^ (atom - 1)
2234    //
2235    // That is, improver has its low ctz(atom) bits equal to one;
2236    // higher bits (if any) equal to zero.
2237    improver = assignNew('V', mce,ty,
2238                         binop(xorOp,
2239                               atom,
2240                               assignNew('V', mce, ty,
2241                                         binop(subOp, atom, one))));
2242
2243    // improved = vatom & improver
2244    //
2245    // That is, treat any V bits above the first ctz(atom) bits as
2246    // "defined".
2247    improved = assignNew('V', mce, ty,
2248                         binop(andOp, vatom, improver));
2249
2250    // Return pessimizing cast of improved.
2251    return mkPCastTo(mce, ty, improved);
2252 }
2253
2254
2255 /*------------------------------------------------------------*/
2256 /*--- Scalar shifts.                                       ---*/
2257 /*------------------------------------------------------------*/
2258
2259 /* Produce an interpretation for (aa << bb) (or >>s, >>u).  The basic
2260    idea is to shift the definedness bits by the original shift amount.
2261    This introduces 0s ("defined") in new positions for left shifts and
2262    unsigned right shifts, and copies the top definedness bit for
2263    signed right shifts.  So, conveniently, applying the original shift
2264    operator to the definedness bits for the left arg is exactly the
2265    right thing to do:
2266
2267       (qaa << bb)
2268
2269    However if the shift amount is undefined then the whole result
2270    is undefined.  Hence need:
2271
2272       (qaa << bb) `UifU` PCast(qbb)
2273
2274    If the shift amount bb is a literal than qbb will say 'all defined'
2275    and the UifU and PCast will get folded out by post-instrumentation
2276    optimisation.
2277 */
2278 static IRAtom* scalarShift ( MCEnv*  mce,
2279                              IRType  ty,
2280                              IROp    original_op,
2281                              IRAtom* qaa, IRAtom* qbb,
2282                              IRAtom* aa,  IRAtom* bb )
2283 {
2284    tl_assert(isShadowAtom(mce,qaa));
2285    tl_assert(isShadowAtom(mce,qbb));
2286    tl_assert(isOriginalAtom(mce,aa));
2287    tl_assert(isOriginalAtom(mce,bb));
2288    tl_assert(sameKindedAtoms(qaa,aa));
2289    tl_assert(sameKindedAtoms(qbb,bb));
2290    return
2291       assignNew(
2292          'V', mce, ty,
2293          mkUifU( mce, ty,
2294                  assignNew('V', mce, ty, binop(original_op, qaa, bb)),
2295                  mkPCastTo(mce, ty, qbb)
2296          )
2297    );
2298 }
2299
2300
2301 /*------------------------------------------------------------*/
2302 /*--- Helpers for dealing with vector primops.             ---*/
2303 /*------------------------------------------------------------*/
2304
2305 /* Vector pessimisation -- pessimise within each lane individually. */
2306
2307 static IRAtom* mkPCast8x16 ( MCEnv* mce, IRAtom* at )
2308 {
2309    return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ8x16, at));
2310 }
2311
2312 static IRAtom* mkPCast16x8 ( MCEnv* mce, IRAtom* at )
2313 {
2314    return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ16x8, at));
2315 }
2316
2317 static IRAtom* mkPCast32x4 ( MCEnv* mce, IRAtom* at )
2318 {
2319    return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ32x4, at));
2320 }
2321
2322 static IRAtom* mkPCast64x2 ( MCEnv* mce, IRAtom* at )
2323 {
2324    return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ64x2, at));
2325 }
2326
2327 static IRAtom* mkPCast64x4 ( MCEnv* mce, IRAtom* at )
2328 {
2329    return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ64x4, at));
2330 }
2331
2332 static IRAtom* mkPCast32x8 ( MCEnv* mce, IRAtom* at )
2333 {
2334    return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ32x8, at));
2335 }
2336
2337 static IRAtom* mkPCast32x2 ( MCEnv* mce, IRAtom* at )
2338 {
2339    return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ32x2, at));
2340 }
2341
2342 static IRAtom* mkPCast16x16 ( MCEnv* mce, IRAtom* at )
2343 {
2344    return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ16x16, at));
2345 }
2346
2347 static IRAtom* mkPCast16x4 ( MCEnv* mce, IRAtom* at )
2348 {
2349    return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ16x4, at));
2350 }
2351
2352 static IRAtom* mkPCast8x32 ( MCEnv* mce, IRAtom* at )
2353 {
2354    return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ8x32, at));
2355 }
2356
2357 static IRAtom* mkPCast8x8 ( MCEnv* mce, IRAtom* at )
2358 {
2359    return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ8x8, at));
2360 }
2361
2362 static IRAtom* mkPCast16x2 ( MCEnv* mce, IRAtom* at )
2363 {
2364    return assignNew('V', mce, Ity_I32, unop(Iop_CmpNEZ16x2, at));
2365 }
2366
2367 static IRAtom* mkPCast8x4 ( MCEnv* mce, IRAtom* at )
2368 {
2369    return assignNew('V', mce, Ity_I32, unop(Iop_CmpNEZ8x4, at));
2370 }
2371
2372
2373 /* Here's a simple scheme capable of handling ops derived from SSE1
2374    code and while only generating ops that can be efficiently
2375    implemented in SSE1. */
2376
2377 /* All-lanes versions are straightforward:
2378
2379    binary32Fx4(x,y)   ==> PCast32x4(UifUV128(x#,y#))
2380
2381    unary32Fx4(x,y)    ==> PCast32x4(x#)
2382
2383    Lowest-lane-only versions are more complex:
2384
2385    binary32F0x4(x,y)  ==> SetV128lo32(
2386                              x#,
2387                              PCast32(V128to32(UifUV128(x#,y#)))
2388                           )
2389
2390    This is perhaps not so obvious.  In particular, it's faster to
2391    do a V128-bit UifU and then take the bottom 32 bits than the more
2392    obvious scheme of taking the bottom 32 bits of each operand
2393    and doing a 32-bit UifU.  Basically since UifU is fast and
2394    chopping lanes off vector values is slow.
2395
2396    Finally:
2397
2398    unary32F0x4(x)     ==> SetV128lo32(
2399                              x#,
2400                              PCast32(V128to32(x#))
2401                           )
2402
2403    Where:
2404
2405    PCast32(v#)   = 1Sto32(CmpNE32(v#,0))
2406    PCast32x4(v#) = CmpNEZ32x4(v#)
2407 */
2408
2409 static
2410 IRAtom* binary32Fx4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2411 {
2412    IRAtom* at;
2413    tl_assert(isShadowAtom(mce, vatomX));
2414    tl_assert(isShadowAtom(mce, vatomY));
2415    at = mkUifUV128(mce, vatomX, vatomY);
2416    at = assignNew('V', mce, Ity_V128, mkPCast32x4(mce, at));
2417    return at;
2418 }
2419
2420 static
2421 IRAtom* unary32Fx4 ( MCEnv* mce, IRAtom* vatomX )
2422 {
2423    IRAtom* at;
2424    tl_assert(isShadowAtom(mce, vatomX));
2425    at = assignNew('V', mce, Ity_V128, mkPCast32x4(mce, vatomX));
2426    return at;
2427 }
2428
2429 static
2430 IRAtom* binary32F0x4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2431 {
2432    IRAtom* at;
2433    tl_assert(isShadowAtom(mce, vatomX));
2434    tl_assert(isShadowAtom(mce, vatomY));
2435    at = mkUifUV128(mce, vatomX, vatomY);
2436    at = assignNew('V', mce, Ity_I32, unop(Iop_V128to32, at));
2437    at = mkPCastTo(mce, Ity_I32, at);
2438    at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo32, vatomX, at));
2439    return at;
2440 }
2441
2442 static
2443 IRAtom* unary32F0x4 ( MCEnv* mce, IRAtom* vatomX )
2444 {
2445    IRAtom* at;
2446    tl_assert(isShadowAtom(mce, vatomX));
2447    at = assignNew('V', mce, Ity_I32, unop(Iop_V128to32, vatomX));
2448    at = mkPCastTo(mce, Ity_I32, at);
2449    at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo32, vatomX, at));
2450    return at;
2451 }
2452
2453 /* --- ... and ... 64Fx2 versions of the same ... --- */
2454
2455 static
2456 IRAtom* binary64Fx2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2457 {
2458    IRAtom* at;
2459    tl_assert(isShadowAtom(mce, vatomX));
2460    tl_assert(isShadowAtom(mce, vatomY));
2461    at = mkUifUV128(mce, vatomX, vatomY);
2462    at = assignNew('V', mce, Ity_V128, mkPCast64x2(mce, at));
2463    return at;
2464 }
2465
2466 static
2467 IRAtom* unary64Fx2 ( MCEnv* mce, IRAtom* vatomX )
2468 {
2469    IRAtom* at;
2470    tl_assert(isShadowAtom(mce, vatomX));
2471    at = assignNew('V', mce, Ity_V128, mkPCast64x2(mce, vatomX));
2472    return at;
2473 }
2474
2475 static
2476 IRAtom* binary64F0x2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2477 {
2478    IRAtom* at;
2479    tl_assert(isShadowAtom(mce, vatomX));
2480    tl_assert(isShadowAtom(mce, vatomY));
2481    at = mkUifUV128(mce, vatomX, vatomY);
2482    at = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, at));
2483    at = mkPCastTo(mce, Ity_I64, at);
2484    at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo64, vatomX, at));
2485    return at;
2486 }
2487
2488 static
2489 IRAtom* unary64F0x2 ( MCEnv* mce, IRAtom* vatomX )
2490 {
2491    IRAtom* at;
2492    tl_assert(isShadowAtom(mce, vatomX));
2493    at = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vatomX));
2494    at = mkPCastTo(mce, Ity_I64, at);
2495    at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo64, vatomX, at));
2496    return at;
2497 }
2498
2499 /* --- --- ... and ... 32Fx2 versions of the same --- --- */
2500
2501 static
2502 IRAtom* binary32Fx2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2503 {
2504    IRAtom* at;
2505    tl_assert(isShadowAtom(mce, vatomX));
2506    tl_assert(isShadowAtom(mce, vatomY));
2507    at = mkUifU64(mce, vatomX, vatomY);
2508    at = assignNew('V', mce, Ity_I64, mkPCast32x2(mce, at));
2509    return at;
2510 }
2511
2512 static
2513 IRAtom* unary32Fx2 ( MCEnv* mce, IRAtom* vatomX )
2514 {
2515    IRAtom* at;
2516    tl_assert(isShadowAtom(mce, vatomX));
2517    at = assignNew('V', mce, Ity_I64, mkPCast32x2(mce, vatomX));
2518    return at;
2519 }
2520
2521 /* --- ... and ... 64Fx4 versions of the same ... --- */
2522
2523 static
2524 IRAtom* binary64Fx4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2525 {
2526    IRAtom* at;
2527    tl_assert(isShadowAtom(mce, vatomX));
2528    tl_assert(isShadowAtom(mce, vatomY));
2529    at = mkUifUV256(mce, vatomX, vatomY);
2530    at = assignNew('V', mce, Ity_V256, mkPCast64x4(mce, at));
2531    return at;
2532 }
2533
2534 static
2535 IRAtom* unary64Fx4 ( MCEnv* mce, IRAtom* vatomX )
2536 {
2537    IRAtom* at;
2538    tl_assert(isShadowAtom(mce, vatomX));
2539    at = assignNew('V', mce, Ity_V256, mkPCast64x4(mce, vatomX));
2540    return at;
2541 }
2542
2543 /* --- ... and ... 32Fx8 versions of the same ... --- */
2544
2545 static
2546 IRAtom* binary32Fx8 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2547 {
2548    IRAtom* at;
2549    tl_assert(isShadowAtom(mce, vatomX));
2550    tl_assert(isShadowAtom(mce, vatomY));
2551    at = mkUifUV256(mce, vatomX, vatomY);
2552    at = assignNew('V', mce, Ity_V256, mkPCast32x8(mce, at));
2553    return at;
2554 }
2555
2556 static
2557 IRAtom* unary32Fx8 ( MCEnv* mce, IRAtom* vatomX )
2558 {
2559    IRAtom* at;
2560    tl_assert(isShadowAtom(mce, vatomX));
2561    at = assignNew('V', mce, Ity_V256, mkPCast32x8(mce, vatomX));
2562    return at;
2563 }
2564
2565 /* --- 64Fx2 binary FP ops, with rounding mode --- */
2566
2567 static
2568 IRAtom* binary64Fx2_w_rm ( MCEnv* mce, IRAtom* vRM,
2569                                        IRAtom* vatomX, IRAtom* vatomY )
2570 {
2571    /* This is the same as binary64Fx2, except that we subsequently
2572       pessimise vRM (definedness of the rounding mode), widen to 128
2573       bits and UifU it into the result.  As with the scalar cases, if
2574       the RM is a constant then it is defined and so this extra bit
2575       will get constant-folded out later. */
2576    // "do" the vector args
2577    IRAtom* t1 = binary64Fx2(mce, vatomX, vatomY);
2578    // PCast the RM, and widen it to 128 bits
2579    IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2580    // Roll it into the result
2581    t1 = mkUifUV128(mce, t1, t2);
2582    return t1;
2583 }
2584
2585 /* --- ... and ... 32Fx4 versions of the same --- */
2586
2587 static
2588 IRAtom* binary32Fx4_w_rm ( MCEnv* mce, IRAtom* vRM,
2589                                        IRAtom* vatomX, IRAtom* vatomY )
2590 {
2591    IRAtom* t1 = binary32Fx4(mce, vatomX, vatomY);
2592    // PCast the RM, and widen it to 128 bits
2593    IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2594    // Roll it into the result
2595    t1 = mkUifUV128(mce, t1, t2);
2596    return t1;
2597 }
2598
2599 /* --- ... and ... 64Fx4 versions of the same --- */
2600
2601 static
2602 IRAtom* binary64Fx4_w_rm ( MCEnv* mce, IRAtom* vRM,
2603                                        IRAtom* vatomX, IRAtom* vatomY )
2604 {
2605    IRAtom* t1 = binary64Fx4(mce, vatomX, vatomY);
2606    // PCast the RM, and widen it to 256 bits
2607    IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM);
2608    // Roll it into the result
2609    t1 = mkUifUV256(mce, t1, t2);
2610    return t1;
2611 }
2612
2613 /* --- ... and ... 32Fx8 versions of the same --- */
2614
2615 static
2616 IRAtom* binary32Fx8_w_rm ( MCEnv* mce, IRAtom* vRM,
2617                                        IRAtom* vatomX, IRAtom* vatomY )
2618 {
2619    IRAtom* t1 = binary32Fx8(mce, vatomX, vatomY);
2620    // PCast the RM, and widen it to 256 bits
2621    IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM);
2622    // Roll it into the result
2623    t1 = mkUifUV256(mce, t1, t2);
2624    return t1;
2625 }
2626
2627 /* --- 64Fx2 unary FP ops, with rounding mode --- */
2628
2629 static
2630 IRAtom* unary64Fx2_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
2631 {
2632    /* Same scheme as binary64Fx2_w_rm. */
2633    // "do" the vector arg
2634    IRAtom* t1 = unary64Fx2(mce, vatomX);
2635    // PCast the RM, and widen it to 128 bits
2636    IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2637    // Roll it into the result
2638    t1 = mkUifUV128(mce, t1, t2);
2639    return t1;
2640 }
2641
2642 /* --- ... and ... 32Fx4 versions of the same --- */
2643
2644 static
2645 IRAtom* unary32Fx4_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
2646 {
2647    /* Same scheme as unary32Fx4_w_rm. */
2648    IRAtom* t1 = unary32Fx4(mce, vatomX);
2649    // PCast the RM, and widen it to 128 bits
2650    IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2651    // Roll it into the result
2652    t1 = mkUifUV128(mce, t1, t2);
2653    return t1;
2654 }
2655
2656
2657 /* --- --- Vector saturated narrowing --- --- */
2658
2659 /* We used to do something very clever here, but on closer inspection
2660    (2011-Jun-15), and in particular bug #279698, it turns out to be
2661    wrong.  Part of the problem came from the fact that for a long
2662    time, the IR primops to do with saturated narrowing were
2663    underspecified and managed to confuse multiple cases which needed
2664    to be separate: the op names had a signedness qualifier, but in
2665    fact the source and destination signednesses needed to be specified
2666    independently, so the op names really need two independent
2667    signedness specifiers.
2668
2669    As of 2011-Jun-15 (ish) the underspecification was sorted out
2670    properly.  The incorrect instrumentation remained, though.  That
2671    has now (2011-Oct-22) been fixed.
2672
2673    What we now do is simple:
2674
2675    Let the original narrowing op be QNarrowBinXtoYxZ, where Z is a
2676    number of lanes, X is the source lane width and signedness, and Y
2677    is the destination lane width and signedness.  In all cases the
2678    destination lane width is half the source lane width, so the names
2679    have a bit of redundancy, but are at least easy to read.
2680
2681    For example, Iop_QNarrowBin32Sto16Ux8 narrows 8 lanes of signed 32s
2682    to unsigned 16s.
2683
2684    Let Vanilla(OP) be a function that takes OP, one of these
2685    saturating narrowing ops, and produces the same "shaped" narrowing
2686    op which is not saturating, but merely dumps the most significant
2687    bits.  "same shape" means that the lane numbers and widths are the
2688    same as with OP.
2689
2690    For example, Vanilla(Iop_QNarrowBin32Sto16Ux8)
2691                   = Iop_NarrowBin32to16x8,
2692    that is, narrow 8 lanes of 32 bits to 8 lanes of 16 bits, by
2693    dumping the top half of each lane.
2694
2695    So, with that in place, the scheme is simple, and it is simple to
2696    pessimise each lane individually and then apply Vanilla(OP) so as
2697    to get the result in the right "shape".  If the original OP is
2698    QNarrowBinXtoYxZ then we produce
2699
2700    Vanilla(OP)( PCast-X-to-X-x-Z(vatom1), PCast-X-to-X-x-Z(vatom2) )
2701
2702    or for the case when OP is unary (Iop_QNarrowUn*)
2703
2704    Vanilla(OP)( PCast-X-to-X-x-Z(vatom) )
2705 */
2706 static
2707 IROp vanillaNarrowingOpOfShape ( IROp qnarrowOp )
2708 {
2709    switch (qnarrowOp) {
2710       /* Binary: (128, 128) -> 128 */
2711       case Iop_QNarrowBin16Sto8Ux16:
2712       case Iop_QNarrowBin16Sto8Sx16:
2713       case Iop_QNarrowBin16Uto8Ux16:
2714       case Iop_QNarrowBin64Sto32Sx4:
2715       case Iop_QNarrowBin64Uto32Ux4:
2716          return Iop_NarrowBin16to8x16;
2717       case Iop_QNarrowBin32Sto16Ux8:
2718       case Iop_QNarrowBin32Sto16Sx8:
2719       case Iop_QNarrowBin32Uto16Ux8:
2720          return Iop_NarrowBin32to16x8;
2721       /* Binary: (64, 64) -> 64 */
2722       case Iop_QNarrowBin32Sto16Sx4:
2723          return Iop_NarrowBin32to16x4;
2724       case Iop_QNarrowBin16Sto8Ux8:
2725       case Iop_QNarrowBin16Sto8Sx8:
2726          return Iop_NarrowBin16to8x8;
2727       /* Unary: 128 -> 64 */
2728       case Iop_QNarrowUn64Uto32Ux2:
2729       case Iop_QNarrowUn64Sto32Sx2:
2730       case Iop_QNarrowUn64Sto32Ux2:
2731          return Iop_NarrowUn64to32x2;
2732       case Iop_QNarrowUn32Uto16Ux4:
2733       case Iop_QNarrowUn32Sto16Sx4:
2734       case Iop_QNarrowUn32Sto16Ux4:
2735       case Iop_F32toF16x4:
2736          return Iop_NarrowUn32to16x4;
2737       case Iop_QNarrowUn16Uto8Ux8:
2738       case Iop_QNarrowUn16Sto8Sx8:
2739       case Iop_QNarrowUn16Sto8Ux8:
2740          return Iop_NarrowUn16to8x8;
2741       default:
2742          ppIROp(qnarrowOp);
2743          VG_(tool_panic)("vanillaNarrowOpOfShape");
2744    }
2745 }
2746
2747 static
2748 IRAtom* vectorNarrowBinV128 ( MCEnv* mce, IROp narrow_op,
2749                               IRAtom* vatom1, IRAtom* vatom2)
2750 {
2751    IRAtom *at1, *at2, *at3;
2752    IRAtom* (*pcast)( MCEnv*, IRAtom* );
2753    switch (narrow_op) {
2754       case Iop_QNarrowBin64Sto32Sx4: pcast = mkPCast32x4; break;
2755       case Iop_QNarrowBin64Uto32Ux4: pcast = mkPCast32x4; break;
2756       case Iop_QNarrowBin32Sto16Sx8: pcast = mkPCast32x4; break;
2757       case Iop_QNarrowBin32Uto16Ux8: pcast = mkPCast32x4; break;
2758       case Iop_QNarrowBin32Sto16Ux8: pcast = mkPCast32x4; break;
2759       case Iop_QNarrowBin16Sto8Sx16: pcast = mkPCast16x8; break;
2760       case Iop_QNarrowBin16Uto8Ux16: pcast = mkPCast16x8; break;
2761       case Iop_QNarrowBin16Sto8Ux16: pcast = mkPCast16x8; break;
2762       default: VG_(tool_panic)("vectorNarrowBinV128");
2763    }
2764    IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
2765    tl_assert(isShadowAtom(mce,vatom1));
2766    tl_assert(isShadowAtom(mce,vatom2));
2767    at1 = assignNew('V', mce, Ity_V128, pcast(mce, vatom1));
2768    at2 = assignNew('V', mce, Ity_V128, pcast(mce, vatom2));
2769    at3 = assignNew('V', mce, Ity_V128, binop(vanilla_narrow, at1, at2));
2770    return at3;
2771 }
2772
2773 static
2774 IRAtom* vectorNarrowBin64 ( MCEnv* mce, IROp narrow_op,
2775                             IRAtom* vatom1, IRAtom* vatom2)
2776 {
2777    IRAtom *at1, *at2, *at3;
2778    IRAtom* (*pcast)( MCEnv*, IRAtom* );
2779    switch (narrow_op) {
2780       case Iop_QNarrowBin32Sto16Sx4: pcast = mkPCast32x2; break;
2781       case Iop_QNarrowBin16Sto8Sx8:  pcast = mkPCast16x4; break;
2782       case Iop_QNarrowBin16Sto8Ux8:  pcast = mkPCast16x4; break;
2783       default: VG_(tool_panic)("vectorNarrowBin64");
2784    }
2785    IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
2786    tl_assert(isShadowAtom(mce,vatom1));
2787    tl_assert(isShadowAtom(mce,vatom2));
2788    at1 = assignNew('V', mce, Ity_I64, pcast(mce, vatom1));
2789    at2 = assignNew('V', mce, Ity_I64, pcast(mce, vatom2));
2790    at3 = assignNew('V', mce, Ity_I64, binop(vanilla_narrow, at1, at2));
2791    return at3;
2792 }
2793
2794 static
2795 IRAtom* vectorNarrowUnV128 ( MCEnv* mce, IROp narrow_op,
2796                              IRAtom* vatom1)
2797 {
2798    IRAtom *at1, *at2;
2799    IRAtom* (*pcast)( MCEnv*, IRAtom* );
2800    tl_assert(isShadowAtom(mce,vatom1));
2801    /* For vanilla narrowing (non-saturating), we can just apply
2802       the op directly to the V bits. */
2803    switch (narrow_op) {
2804       case Iop_NarrowUn16to8x8:
2805       case Iop_NarrowUn32to16x4:
2806       case Iop_NarrowUn64to32x2:
2807       case Iop_F32toF16x4:
2808          at1 = assignNew('V', mce, Ity_I64, unop(narrow_op, vatom1));
2809          return at1;
2810       default:
2811          break; /* Do Plan B */
2812    }
2813    /* Plan B: for ops that involve a saturation operation on the args,
2814       we must PCast before the vanilla narrow. */
2815    switch (narrow_op) {
2816       case Iop_QNarrowUn16Sto8Sx8:  pcast = mkPCast16x8; break;
2817       case Iop_QNarrowUn16Sto8Ux8:  pcast = mkPCast16x8; break;
2818       case Iop_QNarrowUn16Uto8Ux8:  pcast = mkPCast16x8; break;
2819       case Iop_QNarrowUn32Sto16Sx4: pcast = mkPCast32x4; break;
2820       case Iop_QNarrowUn32Sto16Ux4: pcast = mkPCast32x4; break;
2821       case Iop_QNarrowUn32Uto16Ux4: pcast = mkPCast32x4; break;
2822       case Iop_QNarrowUn64Sto32Sx2: pcast = mkPCast64x2; break;
2823       case Iop_QNarrowUn64Sto32Ux2: pcast = mkPCast64x2; break;
2824       case Iop_QNarrowUn64Uto32Ux2: pcast = mkPCast64x2; break;
2825       default: VG_(tool_panic)("vectorNarrowUnV128");
2826    }
2827    IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
2828    at1 = assignNew('V', mce, Ity_V128, pcast(mce, vatom1));
2829    at2 = assignNew('V', mce, Ity_I64, unop(vanilla_narrow, at1));
2830    return at2;
2831 }
2832
2833 static
2834 IRAtom* vectorWidenI64 ( MCEnv* mce, IROp longen_op,
2835                          IRAtom* vatom1)
2836 {
2837    IRAtom *at1, *at2;
2838    IRAtom* (*pcast)( MCEnv*, IRAtom* );
2839    switch (longen_op) {
2840       case Iop_Widen8Uto16x8:  pcast = mkPCast16x8; break;
2841       case Iop_Widen8Sto16x8:  pcast = mkPCast16x8; break;
2842       case Iop_Widen16Uto32x4: pcast = mkPCast32x4; break;
2843       case Iop_Widen16Sto32x4: pcast = mkPCast32x4; break;
2844       case Iop_Widen32Uto64x2: pcast = mkPCast64x2; break;
2845       case Iop_Widen32Sto64x2: pcast = mkPCast64x2; break;
2846       case Iop_F16toF32x4:     pcast = mkPCast32x4; break;
2847       default: VG_(tool_panic)("vectorWidenI64");
2848    }
2849    tl_assert(isShadowAtom(mce,vatom1));
2850    at1 = assignNew('V', mce, Ity_V128, unop(longen_op, vatom1));
2851    at2 = assignNew('V', mce, Ity_V128, pcast(mce, at1));
2852    return at2;
2853 }
2854
2855
2856 /* --- --- Vector integer arithmetic --- --- */
2857
2858 /* Simple ... UifU the args and per-lane pessimise the results. */
2859
2860 /* --- V256-bit versions --- */
2861
2862 static
2863 IRAtom* binary8Ix32 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2864 {
2865    IRAtom* at;
2866    at = mkUifUV256(mce, vatom1, vatom2);
2867    at = mkPCast8x32(mce, at);
2868    return at;
2869 }
2870
2871 static
2872 IRAtom* binary16Ix16 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2873 {
2874    IRAtom* at;
2875    at = mkUifUV256(mce, vatom1, vatom2);
2876    at = mkPCast16x16(mce, at);
2877    return at;
2878 }
2879
2880 static
2881 IRAtom* binary32Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2882 {
2883    IRAtom* at;
2884    at = mkUifUV256(mce, vatom1, vatom2);
2885    at = mkPCast32x8(mce, at);
2886    return at;
2887 }
2888
2889 static
2890 IRAtom* binary64Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2891 {
2892    IRAtom* at;
2893    at = mkUifUV256(mce, vatom1, vatom2);
2894    at = mkPCast64x4(mce, at);
2895    return at;
2896 }
2897
2898 /* --- V128-bit versions --- */
2899
2900 static
2901 IRAtom* binary8Ix16 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2902 {
2903    IRAtom* at;
2904    at = mkUifUV128(mce, vatom1, vatom2);
2905    at = mkPCast8x16(mce, at);
2906    return at;
2907 }
2908
2909 static
2910 IRAtom* binary16Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2911 {
2912    IRAtom* at;
2913    at = mkUifUV128(mce, vatom1, vatom2);
2914    at = mkPCast16x8(mce, at);
2915    return at;
2916 }
2917
2918 static
2919 IRAtom* binary32Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2920 {
2921    IRAtom* at;
2922    at = mkUifUV128(mce, vatom1, vatom2);
2923    at = mkPCast32x4(mce, at);
2924    return at;
2925 }
2926
2927 static
2928 IRAtom* binary64Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2929 {
2930    IRAtom* at;
2931    at = mkUifUV128(mce, vatom1, vatom2);
2932    at = mkPCast64x2(mce, at);
2933    return at;
2934 }
2935
2936 /* --- 64-bit versions --- */
2937
2938 static
2939 IRAtom* binary8Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2940 {
2941    IRAtom* at;
2942    at = mkUifU64(mce, vatom1, vatom2);
2943    at = mkPCast8x8(mce, at);
2944    return at;
2945 }
2946
2947 static
2948 IRAtom* binary16Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2949 {
2950    IRAtom* at;
2951    at = mkUifU64(mce, vatom1, vatom2);
2952    at = mkPCast16x4(mce, at);
2953    return at;
2954 }
2955
2956 static
2957 IRAtom* binary32Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2958 {
2959    IRAtom* at;
2960    at = mkUifU64(mce, vatom1, vatom2);
2961    at = mkPCast32x2(mce, at);
2962    return at;
2963 }
2964
2965 static
2966 IRAtom* binary64Ix1 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2967 {
2968    IRAtom* at;
2969    at = mkUifU64(mce, vatom1, vatom2);
2970    at = mkPCastTo(mce, Ity_I64, at);
2971    return at;
2972 }
2973
2974 /* --- 32-bit versions --- */
2975
2976 static
2977 IRAtom* binary8Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2978 {
2979    IRAtom* at;
2980    at = mkUifU32(mce, vatom1, vatom2);
2981    at = mkPCast8x4(mce, at);
2982    return at;
2983 }
2984
2985 static
2986 IRAtom* binary16Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
2987 {
2988    IRAtom* at;
2989    at = mkUifU32(mce, vatom1, vatom2);
2990    at = mkPCast16x2(mce, at);
2991    return at;
2992 }
2993
2994
2995 /*------------------------------------------------------------*/
2996 /*--- Generate shadow values from all kinds of IRExprs.    ---*/
2997 /*------------------------------------------------------------*/
2998
2999 static
3000 IRAtom* expr2vbits_Qop ( MCEnv* mce,
3001                          IROp op,
3002                          IRAtom* atom1, IRAtom* atom2,
3003                          IRAtom* atom3, IRAtom* atom4 )
3004 {
3005    IRAtom* vatom1 = expr2vbits( mce, atom1, HuOth );
3006    IRAtom* vatom2 = expr2vbits( mce, atom2, HuOth );
3007    IRAtom* vatom3 = expr2vbits( mce, atom3, HuOth );
3008    IRAtom* vatom4 = expr2vbits( mce, atom4, HuOth );
3009
3010    tl_assert(isOriginalAtom(mce,atom1));
3011    tl_assert(isOriginalAtom(mce,atom2));
3012    tl_assert(isOriginalAtom(mce,atom3));
3013    tl_assert(isOriginalAtom(mce,atom4));
3014    tl_assert(isShadowAtom(mce,vatom1));
3015    tl_assert(isShadowAtom(mce,vatom2));
3016    tl_assert(isShadowAtom(mce,vatom3));
3017    tl_assert(isShadowAtom(mce,vatom4));
3018    tl_assert(sameKindedAtoms(atom1,vatom1));
3019    tl_assert(sameKindedAtoms(atom2,vatom2));
3020    tl_assert(sameKindedAtoms(atom3,vatom3));
3021    tl_assert(sameKindedAtoms(atom4,vatom4));
3022    switch (op) {
3023       case Iop_MAddF64:
3024       case Iop_MAddF64r32:
3025       case Iop_MSubF64:
3026       case Iop_MSubF64r32:
3027          /* I32(rm) x F64 x F64 x F64 -> F64 */
3028          return mkLazy4(mce, Ity_I64, vatom1, vatom2, vatom3, vatom4);
3029
3030       case Iop_MAddF32:
3031       case Iop_MSubF32:
3032          /* I32(rm) x F32 x F32 x F32 -> F32 */
3033          return mkLazy4(mce, Ity_I32, vatom1, vatom2, vatom3, vatom4);
3034
3035       case Iop_MAddF128:
3036       case Iop_MSubF128:
3037       case Iop_NegMAddF128:
3038       case Iop_NegMSubF128:
3039          /* I32(rm) x F128 x F128 x F128 -> F128 */
3040          return mkLazy4(mce, Ity_I128, vatom1, vatom2, vatom3, vatom4);
3041
3042       /* V256-bit data-steering */
3043       case Iop_64x4toV256:
3044          return assignNew('V', mce, Ity_V256,
3045                           IRExpr_Qop(op, vatom1, vatom2, vatom3, vatom4));
3046
3047       /* I32/I64 x I8 x I8 x I8 -> I32/I64 */
3048       case Iop_Rotx32:
3049          return mkLazy4(mce, Ity_I32, vatom1, vatom2, vatom3, vatom4);
3050       case Iop_Rotx64:
3051          return mkLazy4(mce, Ity_I64, vatom1, vatom2, vatom3, vatom4);
3052       default:
3053          ppIROp(op);
3054          VG_(tool_panic)("memcheck:expr2vbits_Qop");
3055    }
3056 }
3057
3058
3059 static
3060 IRAtom* expr2vbits_Triop ( MCEnv* mce,
3061                            IROp op,
3062                            IRAtom* atom1, IRAtom* atom2, IRAtom* atom3 )
3063 {
3064    IRAtom* vatom1 = expr2vbits( mce, atom1, HuOth );
3065    IRAtom* vatom2 = expr2vbits( mce, atom2, HuOth );
3066    IRAtom* vatom3 = expr2vbits( mce, atom3, HuOth );
3067
3068    tl_assert(isOriginalAtom(mce,atom1));
3069    tl_assert(isOriginalAtom(mce,atom2));
3070    tl_assert(isOriginalAtom(mce,atom3));
3071    tl_assert(isShadowAtom(mce,vatom1));
3072    tl_assert(isShadowAtom(mce,vatom2));
3073    tl_assert(isShadowAtom(mce,vatom3));
3074    tl_assert(sameKindedAtoms(atom1,vatom1));
3075    tl_assert(sameKindedAtoms(atom2,vatom2));
3076    tl_assert(sameKindedAtoms(atom3,vatom3));
3077    switch (op) {
3078       case Iop_AddF128:
3079       case Iop_SubF128:
3080       case Iop_MulF128:
3081       case Iop_DivF128:
3082       case Iop_AddD128:
3083       case Iop_SubD128:
3084       case Iop_MulD128:
3085       case Iop_DivD128:
3086       case Iop_QuantizeD128:
3087          /* I32(rm) x F128/D128 x F128/D128 -> F128/D128 */
3088          return mkLazy3(mce, Ity_I128, vatom1, vatom2, vatom3);
3089       case Iop_AddF64:
3090       case Iop_AddD64:
3091       case Iop_AddF64r32:
3092       case Iop_SubF64:
3093       case Iop_SubD64:
3094       case Iop_SubF64r32:
3095       case Iop_MulF64:
3096       case Iop_MulD64:
3097       case Iop_MulF64r32:
3098       case Iop_DivF64:
3099       case Iop_DivD64:
3100       case Iop_DivF64r32:
3101       case Iop_ScaleF64:
3102       case Iop_Yl2xF64:
3103       case Iop_Yl2xp1F64:
3104       case Iop_AtanF64:
3105       case Iop_PRemF64:
3106       case Iop_PRem1F64:
3107       case Iop_QuantizeD64:
3108          /* I32(rm) x F64/D64 x F64/D64 -> F64/D64 */
3109          return mkLazy3(mce, Ity_I64, vatom1, vatom2, vatom3);
3110       case Iop_PRemC3210F64:
3111       case Iop_PRem1C3210F64:
3112          /* I32(rm) x F64 x F64 -> I32 */
3113          return mkLazy3(mce, Ity_I32, vatom1, vatom2, vatom3);
3114       case Iop_AddF32:
3115       case Iop_SubF32:
3116       case Iop_MulF32:
3117       case Iop_DivF32:
3118          /* I32(rm) x F32 x F32 -> I32 */
3119          return mkLazy3(mce, Ity_I32, vatom1, vatom2, vatom3);
3120       case Iop_SignificanceRoundD64:
3121          /* IRRoundingMode(I32) x I8 x D64 -> D64 */
3122          return mkLazy3(mce, Ity_I64, vatom1, vatom2, vatom3);
3123       case Iop_SignificanceRoundD128:
3124          /* IRRoundingMode(I32) x I8 x D128 -> D128 */
3125          return mkLazy3(mce, Ity_I128, vatom1, vatom2, vatom3);
3126       case Iop_SliceV128:
3127          /* (V128, V128, I8) -> V128 */
3128          complainIfUndefined(mce, atom3, NULL);
3129          return assignNew('V', mce, Ity_V128, triop(op, vatom1, vatom2, atom3));
3130       case Iop_Slice64:
3131          /* (I64, I64, I8) -> I64 */
3132          complainIfUndefined(mce, atom3, NULL);
3133          return assignNew('V', mce, Ity_I64, triop(op, vatom1, vatom2, atom3));
3134       case Iop_SetElem8x8:
3135       case Iop_SetElem16x4:
3136       case Iop_SetElem32x2:
3137          complainIfUndefined(mce, atom2, NULL);
3138          return assignNew('V', mce, Ity_I64, triop(op, vatom1, atom2, vatom3));
3139
3140       case Iop_SetElem8x16:
3141       case Iop_SetElem16x8:
3142       case Iop_SetElem32x4:
3143       case Iop_SetElem64x2:
3144          complainIfUndefined(mce, atom2, NULL);
3145          return assignNew('V', mce, Ity_V128, triop(op, vatom1, atom2, vatom3));
3146
3147       case Iop_Perm8x16x2:
3148          /* (V128, V128, V128) -> V128 */
3149             complainIfUndefined(mce, atom3, NULL);
3150             return mkUifUV128(
3151                    mce,
3152                    assignNew('V', mce, Ity_V128, triop(op, vatom1, vatom2, atom3)),
3153                    mkPCast8x16(mce, vatom3)
3154                 );
3155
3156       /* Vector FP with rounding mode as the first arg */
3157       case Iop_Add64Fx2:
3158       case Iop_Sub64Fx2:
3159       case Iop_Mul64Fx2:
3160       case Iop_Div64Fx2:
3161       case Iop_Scale2_64Fx2:
3162          return binary64Fx2_w_rm(mce, vatom1, vatom2, vatom3);
3163
3164       case Iop_Add32Fx4:
3165       case Iop_Sub32Fx4:
3166       case Iop_Mul32Fx4:
3167       case Iop_Div32Fx4:
3168       case Iop_Scale2_32Fx4:
3169         return binary32Fx4_w_rm(mce, vatom1, vatom2, vatom3);
3170
3171       case Iop_Add64Fx4:
3172       case Iop_Sub64Fx4:
3173       case Iop_Mul64Fx4:
3174       case Iop_Div64Fx4:
3175          return binary64Fx4_w_rm(mce, vatom1, vatom2, vatom3);
3176
3177       case Iop_Add32Fx8:
3178       case Iop_Sub32Fx8:
3179       case Iop_Mul32Fx8:
3180       case Iop_Div32Fx8:
3181          return binary32Fx8_w_rm(mce, vatom1, vatom2, vatom3);
3182
3183       case Iop_F32x4_2toQ16x8:
3184          return assignNew('V', mce, Ity_V128,
3185                           binop(Iop_PackEvenLanes16x8,
3186                                 unary32Fx4_w_rm(mce, vatom1, vatom2),
3187                                 unary32Fx4_w_rm(mce, vatom1, vatom3)));
3188       case Iop_F64x2_2toQ32x4:
3189          return assignNew('V', mce, Ity_V128,
3190                           binop(Iop_PackEvenLanes32x4,
3191                                 unary64Fx2_w_rm(mce, vatom1, vatom2),
3192                                 unary64Fx2_w_rm(mce, vatom1, vatom3)));
3193
3194
3195       default:
3196          ppIROp(op);
3197          VG_(tool_panic)("memcheck:expr2vbits_Triop");
3198    }
3199 }
3200
3201
3202 static
3203 IRAtom* expr2vbits_Binop ( MCEnv* mce,
3204                            IROp op,
3205                            IRAtom* atom1, IRAtom* atom2,
3206                            HowUsed hu/*use HuOth if unknown*/ )
3207 {
3208    IRType  and_or_ty;
3209    IRAtom* (*uifu)    (MCEnv*, IRAtom*, IRAtom*);
3210    IRAtom* (*difd)    (MCEnv*, IRAtom*, IRAtom*);
3211    IRAtom* (*improve) (MCEnv*, IRAtom*, IRAtom*);
3212
3213    IRAtom* vatom1 = expr2vbits( mce, atom1, HuOth );
3214    IRAtom* vatom2 = expr2vbits( mce, atom2, HuOth );
3215
3216    tl_assert(isOriginalAtom(mce,atom1));
3217    tl_assert(isOriginalAtom(mce,atom2));
3218    tl_assert(isShadowAtom(mce,vatom1));
3219    tl_assert(isShadowAtom(mce,vatom2));
3220    tl_assert(sameKindedAtoms(atom1,vatom1));
3221    tl_assert(sameKindedAtoms(atom2,vatom2));
3222    switch (op) {
3223
3224       /* 32-bit SIMD */
3225
3226       case Iop_Add16x2:
3227       case Iop_HAdd16Ux2:
3228       case Iop_HAdd16Sx2:
3229       case Iop_Sub16x2:
3230       case Iop_HSub16Ux2:
3231       case Iop_HSub16Sx2:
3232       case Iop_QAdd16Sx2:
3233       case Iop_QSub16Sx2:
3234       case Iop_QSub16Ux2:
3235       case Iop_QAdd16Ux2:
3236          return binary16Ix2(mce, vatom1, vatom2);
3237
3238       case Iop_Add8x4:
3239       case Iop_HAdd8Ux4:
3240       case Iop_HAdd8Sx4:
3241       case Iop_Sub8x4:
3242       case Iop_HSub8Ux4:
3243       case Iop_HSub8Sx4:
3244       case Iop_QSub8Ux4:
3245       case Iop_QAdd8Ux4:
3246       case Iop_QSub8Sx4:
3247       case Iop_QAdd8Sx4:
3248          return binary8Ix4(mce, vatom1, vatom2);
3249
3250       /* 64-bit SIMD */
3251
3252       case Iop_ShrN8x8:
3253       case Iop_ShrN16x4:
3254       case Iop_ShrN32x2:
3255       case Iop_SarN8x8:
3256       case Iop_SarN16x4:
3257       case Iop_SarN32x2:
3258       case Iop_ShlN16x4:
3259       case Iop_ShlN32x2:
3260       case Iop_ShlN8x8:
3261          /* Same scheme as with all other shifts. */
3262          complainIfUndefined(mce, atom2, NULL);
3263          return assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2));
3264
3265       case Iop_QNarrowBin32Sto16Sx4:
3266       case Iop_QNarrowBin16Sto8Sx8:
3267       case Iop_QNarrowBin16Sto8Ux8:
3268          return vectorNarrowBin64(mce, op, vatom1, vatom2);
3269
3270       case Iop_Min8Ux8:
3271       case Iop_Min8Sx8:
3272       case Iop_Max8Ux8:
3273       case Iop_Max8Sx8:
3274       case Iop_Avg8Ux8:
3275       case Iop_QSub8Sx8:
3276       case Iop_QSub8Ux8:
3277       case Iop_Sub8x8:
3278       case Iop_CmpGT8Sx8:
3279       case Iop_CmpGT8Ux8:
3280       case Iop_CmpEQ8x8:
3281       case Iop_QAdd8Sx8:
3282       case Iop_QAdd8Ux8:
3283       case Iop_QSal8x8:
3284       case Iop_QShl8x8:
3285       case Iop_Add8x8:
3286       case Iop_Mul8x8:
3287       case Iop_PolynomialMul8x8:
3288          return binary8Ix8(mce, vatom1, vatom2);
3289
3290       case Iop_Min16Sx4:
3291       case Iop_Min16Ux4:
3292       case Iop_Max16Sx4:
3293       case Iop_Max16Ux4:
3294       case Iop_Avg16Ux4:
3295       case Iop_QSub16Ux4:
3296       case Iop_QSub16Sx4:
3297       case Iop_Sub16x4:
3298       case Iop_Mul16x4:
3299       case Iop_MulHi16Sx4:
3300       case Iop_MulHi16Ux4:
3301       case Iop_CmpGT16Sx4:
3302       case Iop_CmpGT16Ux4:
3303       case Iop_CmpEQ16x4:
3304       case Iop_QAdd16Sx4:
3305       case Iop_QAdd16Ux4:
3306       case Iop_QSal16x4:
3307       case Iop_QShl16x4:
3308       case Iop_Add16x4:
3309       case Iop_QDMulHi16Sx4:
3310       case Iop_QRDMulHi16Sx4:
3311          return binary16Ix4(mce, vatom1, vatom2);
3312
3313       case Iop_Sub32x2:
3314       case Iop_Mul32x2:
3315       case Iop_Max32Sx2:
3316       case Iop_Max32Ux2:
3317       case Iop_Min32Sx2:
3318       case Iop_Min32Ux2:
3319       case Iop_CmpGT32Sx2:
3320       case Iop_CmpGT32Ux2:
3321       case Iop_CmpEQ32x2:
3322       case Iop_Add32x2:
3323       case Iop_QAdd32Ux2:
3324       case Iop_QAdd32Sx2:
3325       case Iop_QSub32Ux2:
3326       case Iop_QSub32Sx2:
3327       case Iop_QSal32x2:
3328       case Iop_QShl32x2:
3329       case Iop_QDMulHi32Sx2:
3330       case Iop_QRDMulHi32Sx2:
3331          return binary32Ix2(mce, vatom1, vatom2);
3332
3333       case Iop_QSub64Ux1:
3334       case Iop_QSub64Sx1:
3335       case Iop_QAdd64Ux1:
3336       case Iop_QAdd64Sx1:
3337       case Iop_QSal64x1:
3338       case Iop_QShl64x1:
3339       case Iop_Sal64x1:
3340          return binary64Ix1(mce, vatom1, vatom2);
3341
3342       case Iop_QShlNsatSU8x8:
3343       case Iop_QShlNsatUU8x8:
3344       case Iop_QShlNsatSS8x8:
3345          complainIfUndefined(mce, atom2, NULL);
3346          return mkPCast8x8(mce, vatom1);
3347
3348       case Iop_QShlNsatSU16x4:
3349       case Iop_QShlNsatUU16x4:
3350       case Iop_QShlNsatSS16x4:
3351          complainIfUndefined(mce, atom2, NULL);
3352          return mkPCast16x4(mce, vatom1);
3353
3354       case Iop_QShlNsatSU32x2:
3355       case Iop_QShlNsatUU32x2:
3356       case Iop_QShlNsatSS32x2:
3357          complainIfUndefined(mce, atom2, NULL);
3358          return mkPCast32x2(mce, vatom1);
3359
3360       case Iop_QShlNsatSU64x1:
3361       case Iop_QShlNsatUU64x1:
3362       case Iop_QShlNsatSS64x1:
3363          complainIfUndefined(mce, atom2, NULL);
3364          return mkPCast32x2(mce, vatom1);
3365
3366       case Iop_PwMax32Sx2:
3367       case Iop_PwMax32Ux2:
3368       case Iop_PwMin32Sx2:
3369       case Iop_PwMin32Ux2:
3370       case Iop_PwMax32Fx2:
3371       case Iop_PwMin32Fx2:
3372          return assignNew('V', mce, Ity_I64,
3373                           binop(Iop_PwMax32Ux2,
3374                                 mkPCast32x2(mce, vatom1),
3375                                 mkPCast32x2(mce, vatom2)));
3376
3377       case Iop_PwMax16Sx4:
3378       case Iop_PwMax16Ux4:
3379       case Iop_PwMin16Sx4:
3380       case Iop_PwMin16Ux4:
3381          return assignNew('V', mce, Ity_I64,
3382                           binop(Iop_PwMax16Ux4,
3383                                 mkPCast16x4(mce, vatom1),
3384                                 mkPCast16x4(mce, vatom2)));
3385
3386       case Iop_PwMax8Sx8:
3387       case Iop_PwMax8Ux8:
3388       case Iop_PwMin8Sx8:
3389       case Iop_PwMin8Ux8:
3390          return assignNew('V', mce, Ity_I64,
3391                           binop(Iop_PwMax8Ux8,
3392                                 mkPCast8x8(mce, vatom1),
3393                                 mkPCast8x8(mce, vatom2)));
3394
3395       case Iop_PwAdd32x2:
3396       case Iop_PwAdd32Fx2:
3397          return mkPCast32x2(mce,
3398                assignNew('V', mce, Ity_I64,
3399                          binop(Iop_PwAdd32x2,
3400                                mkPCast32x2(mce, vatom1),
3401                                mkPCast32x2(mce, vatom2))));
3402
3403       case Iop_PwAdd16x4:
3404          return mkPCast16x4(mce,
3405                assignNew('V', mce, Ity_I64,
3406                          binop(op, mkPCast16x4(mce, vatom1),
3407                                    mkPCast16x4(mce, vatom2))));
3408
3409       case Iop_PwAdd8x8:
3410          return mkPCast8x8(mce,
3411                assignNew('V', mce, Ity_I64,
3412                          binop(op, mkPCast8x8(mce, vatom1),
3413                                    mkPCast8x8(mce, vatom2))));
3414
3415       case Iop_Shl8x8:
3416       case Iop_Shr8x8:
3417       case Iop_Sar8x8:
3418       case Iop_Sal8x8:
3419          return mkUifU64(mce,
3420                    assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3421                    mkPCast8x8(mce,vatom2)
3422                 );
3423
3424       case Iop_Shl16x4:
3425       case Iop_Shr16x4:
3426       case Iop_Sar16x4:
3427       case Iop_Sal16x4:
3428          return mkUifU64(mce,
3429                    assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3430                    mkPCast16x4(mce,vatom2)
3431                 );
3432
3433       case Iop_Shl32x2:
3434       case Iop_Shr32x2:
3435       case Iop_Sar32x2:
3436       case Iop_Sal32x2:
3437          return mkUifU64(mce,
3438                    assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3439                    mkPCast32x2(mce,vatom2)
3440                 );
3441
3442       /* 64-bit data-steering */
3443       case Iop_InterleaveLO32x2:
3444       case Iop_InterleaveLO16x4:
3445       case Iop_InterleaveLO8x8:
3446       case Iop_InterleaveHI32x2:
3447       case Iop_InterleaveHI16x4:
3448       case Iop_InterleaveHI8x8:
3449       case Iop_CatOddLanes8x8:
3450       case Iop_CatEvenLanes8x8:
3451       case Iop_CatOddLanes16x4:
3452       case Iop_CatEvenLanes16x4:
3453       case Iop_InterleaveOddLanes8x8:
3454       case Iop_InterleaveEvenLanes8x8:
3455       case Iop_InterleaveOddLanes16x4:
3456       case Iop_InterleaveEvenLanes16x4:
3457          return assignNew('V', mce, Ity_I64, binop(op, vatom1, vatom2));
3458
3459       case Iop_GetElem8x8:
3460          complainIfUndefined(mce, atom2, NULL);
3461          return assignNew('V', mce, Ity_I8, binop(op, vatom1, atom2));
3462       case Iop_GetElem16x4:
3463          complainIfUndefined(mce, atom2, NULL);
3464          return assignNew('V', mce, Ity_I16, binop(op, vatom1, atom2));
3465       case Iop_GetElem32x2:
3466          complainIfUndefined(mce, atom2, NULL);
3467          return assignNew('V', mce, Ity_I32, binop(op, vatom1, atom2));
3468
3469       /* Perm8x8: rearrange values in left arg using steering values
3470         from right arg.  So rearrange the vbits in the same way but
3471         pessimise wrt steering values. */
3472       case Iop_Perm8x8:
3473          return mkUifU64(
3474                    mce,
3475                    assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3476                    mkPCast8x8(mce, vatom2)
3477                 );
3478
3479       /* V128-bit SIMD */
3480
3481       case Iop_Sqrt32Fx4:
3482          return unary32Fx4_w_rm(mce, vatom1, vatom2);
3483       case Iop_Sqrt64Fx2:
3484          return unary64Fx2_w_rm(mce, vatom1, vatom2);
3485
3486       case Iop_ShrN8x16:
3487       case Iop_ShrN16x8:
3488       case Iop_ShrN32x4:
3489       case Iop_ShrN64x2:
3490       case Iop_SarN8x16:
3491       case Iop_SarN16x8:
3492       case Iop_SarN32x4:
3493       case Iop_SarN64x2:
3494       case Iop_ShlN8x16:
3495       case Iop_ShlN16x8:
3496       case Iop_ShlN32x4:
3497       case Iop_ShlN64x2:
3498          /* Same scheme as with all other shifts.  Note: 22 Oct 05:
3499             this is wrong now, scalar shifts are done properly lazily.
3500             Vector shifts should be fixed too. */
3501          complainIfUndefined(mce, atom2, NULL);
3502          return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
3503
3504       /* V x V shifts/rotates are done using the standard lazy scheme. */
3505       /* For the non-rounding variants of bi-di vector x vector
3506          shifts (the Iop_Sh.. ops, that is) we use the lazy scheme.
3507          But note that this is overly pessimistic, because in fact only
3508          the bottom 8 bits of each lane of the second argument are taken
3509          into account when shifting.  So really we ought to ignore
3510          undefinedness in bits 8 and above of each lane in the
3511          second argument. */
3512       case Iop_Shl8x16:
3513       case Iop_Shr8x16:
3514       case Iop_Sar8x16:
3515       case Iop_Sal8x16:
3516       case Iop_Rol8x16:
3517       case Iop_Sh8Sx16:
3518       case Iop_Sh8Ux16:
3519          return mkUifUV128(mce,
3520                    assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3521                    mkPCast8x16(mce,vatom2)
3522                 );
3523
3524       case Iop_Shl16x8:
3525       case Iop_Shr16x8:
3526       case Iop_Sar16x8:
3527       case Iop_Sal16x8:
3528       case Iop_Rol16x8:
3529       case Iop_Sh16Sx8:
3530       case Iop_Sh16Ux8:
3531          return mkUifUV128(mce,
3532                    assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3533                    mkPCast16x8(mce,vatom2)
3534                 );
3535
3536       case Iop_Shl32x4:
3537       case Iop_Shr32x4:
3538       case Iop_Sar32x4:
3539       case Iop_Sal32x4:
3540       case Iop_Rol32x4:
3541       case Iop_Sh32Sx4:
3542       case Iop_Sh32Ux4:
3543          return mkUifUV128(mce,
3544                    assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3545                    mkPCast32x4(mce,vatom2)
3546                 );
3547
3548       case Iop_Shl64x2:
3549       case Iop_Shr64x2:
3550       case Iop_Sar64x2:
3551       case Iop_Sal64x2:
3552       case Iop_Rol64x2:
3553       case Iop_Sh64Sx2:
3554       case Iop_Sh64Ux2:
3555          return mkUifUV128(mce,
3556                    assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3557                    mkPCast64x2(mce,vatom2)
3558                 );
3559
3560       /* For the rounding variants of bi-di vector x vector shifts, the
3561          rounding adjustment can cause undefinedness to propagate through
3562          the entire lane, in the worst case.  Too complex to handle
3563          properly .. just UifU the arguments and then PCast them.
3564          Suboptimal but safe. */
3565       case Iop_Rsh8Sx16:
3566       case Iop_Rsh8Ux16:
3567          return binary8Ix16(mce, vatom1, vatom2);
3568       case Iop_Rsh16Sx8:
3569       case Iop_Rsh16Ux8:
3570          return binary16Ix8(mce, vatom1, vatom2);
3571       case Iop_Rsh32Sx4:
3572       case Iop_Rsh32Ux4:
3573          return binary32Ix4(mce, vatom1, vatom2);
3574       case Iop_Rsh64Sx2:
3575       case Iop_Rsh64Ux2:
3576          return binary64Ix2(mce, vatom1, vatom2);
3577
3578       case Iop_F32ToFixed32Ux4_RZ:
3579       case Iop_F32ToFixed32Sx4_RZ:
3580       case Iop_Fixed32UToF32x4_RN:
3581       case Iop_Fixed32SToF32x4_RN:
3582          complainIfUndefined(mce, atom2, NULL);
3583          return mkPCast32x4(mce, vatom1);
3584
3585       case Iop_F32ToFixed32Ux2_RZ:
3586       case Iop_F32ToFixed32Sx2_RZ:
3587       case Iop_Fixed32UToF32x2_RN:
3588       case Iop_Fixed32SToF32x2_RN:
3589          complainIfUndefined(mce, atom2, NULL);
3590          return mkPCast32x2(mce, vatom1);
3591
3592       case Iop_QSub8Ux16:
3593       case Iop_QSub8Sx16:
3594       case Iop_Sub8x16:
3595       case Iop_Min8Ux16:
3596       case Iop_Min8Sx16:
3597       case Iop_Max8Ux16:
3598       case Iop_Max8Sx16:
3599       case Iop_CmpGT8Sx16:
3600       case Iop_CmpGT8Ux16:
3601       case Iop_CmpEQ8x16:
3602       case Iop_Avg8Ux16:
3603       case Iop_Avg8Sx16:
3604       case Iop_QAdd8Ux16:
3605       case Iop_QAdd8Sx16:
3606       case Iop_QAddExtUSsatSS8x16:
3607       case Iop_QAddExtSUsatUU8x16:
3608       case Iop_QSal8x16:
3609       case Iop_QShl8x16:
3610       case Iop_Add8x16:
3611       case Iop_Mul8x16:
3612       case Iop_PolynomialMul8x16:
3613       case Iop_PolynomialMulAdd8x16:
3614          return binary8Ix16(mce, vatom1, vatom2);
3615
3616       case Iop_QSub16Ux8:
3617       case Iop_QSub16Sx8:
3618       case Iop_Sub16x8:
3619       case Iop_Mul16x8:
3620       case Iop_MulHi16Sx8:
3621       case Iop_MulHi16Ux8:
3622       case Iop_Min16Sx8:
3623       case Iop_Min16Ux8:
3624       case Iop_Max16Sx8:
3625       case Iop_Max16Ux8:
3626       case Iop_CmpGT16Sx8:
3627       case Iop_CmpGT16Ux8:
3628       case Iop_CmpEQ16x8:
3629       case Iop_Avg16Ux8:
3630       case Iop_Avg16Sx8:
3631       case Iop_QAdd16Ux8:
3632       case Iop_QAdd16Sx8:
3633       case Iop_QAddExtUSsatSS16x8:
3634       case Iop_QAddExtSUsatUU16x8:
3635       case Iop_QSal16x8:
3636       case Iop_QShl16x8:
3637       case Iop_Add16x8:
3638       case Iop_QDMulHi16Sx8:
3639       case Iop_QRDMulHi16Sx8:
3640       case Iop_PolynomialMulAdd16x8:
3641          return binary16Ix8(mce, vatom1, vatom2);
3642
3643       case Iop_Sub32x4:
3644       case Iop_CmpGT32Sx4:
3645       case Iop_CmpGT32Ux4:
3646       case Iop_CmpEQ32x4:
3647       case Iop_QAdd32Sx4:
3648       case Iop_QAdd32Ux4:
3649       case Iop_QSub32Sx4:
3650       case Iop_QSub32Ux4:
3651       case Iop_QAddExtUSsatSS32x4:
3652       case Iop_QAddExtSUsatUU32x4:
3653       case Iop_QSal32x4:
3654       case Iop_QShl32x4:
3655       case Iop_Avg32Ux4:
3656       case Iop_Avg32Sx4:
3657       case Iop_Add32x4:
3658       case Iop_Max32Ux4:
3659       case Iop_Max32Sx4:
3660       case Iop_Min32Ux4:
3661       case Iop_Min32Sx4:
3662       case Iop_Mul32x4:
3663       case Iop_QDMulHi32Sx4:
3664       case Iop_QRDMulHi32Sx4:
3665       case Iop_PolynomialMulAdd32x4:
3666          return binary32Ix4(mce, vatom1, vatom2);
3667
3668       case Iop_Sub64x2:
3669       case Iop_Add64x2:
3670       case Iop_Max64Sx2:
3671       case Iop_Max64Ux2:
3672       case Iop_Min64Sx2:
3673       case Iop_Min64Ux2:
3674       case Iop_CmpEQ64x2:
3675       case Iop_CmpGT64Sx2:
3676       case Iop_CmpGT64Ux2:
3677       case Iop_QSal64x2:
3678       case Iop_QShl64x2:
3679       case Iop_QAdd64Ux2:
3680       case Iop_QAdd64Sx2:
3681       case Iop_QSub64Ux2:
3682       case Iop_QSub64Sx2:
3683       case Iop_QAddExtUSsatSS64x2:
3684       case Iop_QAddExtSUsatUU64x2:
3685       case Iop_PolynomialMulAdd64x2:
3686       case Iop_CipherV128:
3687       case Iop_CipherLV128:
3688       case Iop_NCipherV128:
3689       case Iop_NCipherLV128:
3690       case Iop_MulI128by10E:
3691       case Iop_MulI128by10ECarry:
3692         return binary64Ix2(mce, vatom1, vatom2);
3693
3694       case Iop_QNarrowBin64Sto32Sx4:
3695       case Iop_QNarrowBin64Uto32Ux4:
3696       case Iop_QNarrowBin32Sto16Sx8:
3697       case Iop_QNarrowBin32Uto16Ux8:
3698       case Iop_QNarrowBin32Sto16Ux8:
3699       case Iop_QNarrowBin16Sto8Sx16:
3700       case Iop_QNarrowBin16Uto8Ux16:
3701       case Iop_QNarrowBin16Sto8Ux16:
3702          return vectorNarrowBinV128(mce, op, vatom1, vatom2);
3703
3704       case Iop_Min64Fx2:
3705       case Iop_Max64Fx2:
3706       case Iop_CmpLT64Fx2:
3707       case Iop_CmpLE64Fx2:
3708       case Iop_CmpEQ64Fx2:
3709       case Iop_CmpUN64Fx2:
3710       case Iop_RecipStep64Fx2:
3711       case Iop_RSqrtStep64Fx2:
3712          return binary64Fx2(mce, vatom1, vatom2);
3713
3714       case Iop_Sub64F0x2:
3715       case Iop_Mul64F0x2:
3716       case Iop_Min64F0x2:
3717       case Iop_Max64F0x2:
3718       case Iop_Div64F0x2:
3719       case Iop_CmpLT64F0x2:
3720       case Iop_CmpLE64F0x2:
3721       case Iop_CmpEQ64F0x2:
3722       case Iop_CmpUN64F0x2:
3723       case Iop_Add64F0x2:
3724          return binary64F0x2(mce, vatom1, vatom2);
3725
3726       case Iop_Min32Fx4:
3727       case Iop_Max32Fx4:
3728       case Iop_CmpLT32Fx4:
3729       case Iop_CmpLE32Fx4:
3730       case Iop_CmpEQ32Fx4:
3731       case Iop_CmpUN32Fx4:
3732       case Iop_CmpGT32Fx4:
3733       case Iop_CmpGE32Fx4:
3734       case Iop_RecipStep32Fx4:
3735       case Iop_RSqrtStep32Fx4:
3736          return binary32Fx4(mce, vatom1, vatom2);
3737
3738       case Iop_Sub32Fx2:
3739       case Iop_Mul32Fx2:
3740       case Iop_Min32Fx2:
3741       case Iop_Max32Fx2:
3742       case Iop_CmpEQ32Fx2:
3743       case Iop_CmpGT32Fx2:
3744       case Iop_CmpGE32Fx2:
3745       case Iop_Add32Fx2:
3746       case Iop_RecipStep32Fx2:
3747       case Iop_RSqrtStep32Fx2:
3748          return binary32Fx2(mce, vatom1, vatom2);
3749
3750       case Iop_Sub32F0x4:
3751       case Iop_Mul32F0x4:
3752       case Iop_Min32F0x4:
3753       case Iop_Max32F0x4:
3754       case Iop_Div32F0x4:
3755       case Iop_CmpLT32F0x4:
3756       case Iop_CmpLE32F0x4:
3757       case Iop_CmpEQ32F0x4:
3758       case Iop_CmpUN32F0x4:
3759       case Iop_Add32F0x4:
3760          return binary32F0x4(mce, vatom1, vatom2);
3761
3762       case Iop_QShlNsatSU8x16:
3763       case Iop_QShlNsatUU8x16:
3764       case Iop_QShlNsatSS8x16:
3765          complainIfUndefined(mce, atom2, NULL);
3766          return mkPCast8x16(mce, vatom1);
3767
3768       case Iop_QShlNsatSU16x8:
3769       case Iop_QShlNsatUU16x8:
3770       case Iop_QShlNsatSS16x8:
3771          complainIfUndefined(mce, atom2, NULL);
3772          return mkPCast16x8(mce, vatom1);
3773
3774       case Iop_QShlNsatSU32x4:
3775       case Iop_QShlNsatUU32x4:
3776       case Iop_QShlNsatSS32x4:
3777          complainIfUndefined(mce, atom2, NULL);
3778          return mkPCast32x4(mce, vatom1);
3779
3780       case Iop_QShlNsatSU64x2:
3781       case Iop_QShlNsatUU64x2:
3782       case Iop_QShlNsatSS64x2:
3783          complainIfUndefined(mce, atom2, NULL);
3784          return mkPCast32x4(mce, vatom1);
3785
3786       /* Q-and-Qshift-by-imm-and-narrow of the form (V128, I8) -> V128.
3787          To make this simpler, do the following:
3788          * complain if the shift amount (the I8) is undefined
3789          * pcast each lane at the wide width
3790          * truncate each lane to half width
3791          * pcast the resulting 64-bit value to a single bit and use
3792            that as the least significant bit of the upper half of the
3793            result. */
3794       case Iop_QandQShrNnarrow64Uto32Ux2:
3795       case Iop_QandQSarNnarrow64Sto32Sx2:
3796       case Iop_QandQSarNnarrow64Sto32Ux2:
3797       case Iop_QandQRShrNnarrow64Uto32Ux2:
3798       case Iop_QandQRSarNnarrow64Sto32Sx2:
3799       case Iop_QandQRSarNnarrow64Sto32Ux2:
3800       case Iop_QandQShrNnarrow32Uto16Ux4:
3801       case Iop_QandQSarNnarrow32Sto16Sx4:
3802       case Iop_QandQSarNnarrow32Sto16Ux4:
3803       case Iop_QandQRShrNnarrow32Uto16Ux4:
3804       case Iop_QandQRSarNnarrow32Sto16Sx4:
3805       case Iop_QandQRSarNnarrow32Sto16Ux4:
3806       case Iop_QandQShrNnarrow16Uto8Ux8:
3807       case Iop_QandQSarNnarrow16Sto8Sx8:
3808       case Iop_QandQSarNnarrow16Sto8Ux8:
3809       case Iop_QandQRShrNnarrow16Uto8Ux8:
3810       case Iop_QandQRSarNnarrow16Sto8Sx8:
3811       case Iop_QandQRSarNnarrow16Sto8Ux8:
3812       {
3813          IRAtom* (*fnPessim) (MCEnv*, IRAtom*) = NULL;
3814          IROp opNarrow = Iop_INVALID;
3815          switch (op) {
3816             case Iop_QandQShrNnarrow64Uto32Ux2:
3817             case Iop_QandQSarNnarrow64Sto32Sx2:
3818             case Iop_QandQSarNnarrow64Sto32Ux2:
3819             case Iop_QandQRShrNnarrow64Uto32Ux2:
3820             case Iop_QandQRSarNnarrow64Sto32Sx2:
3821             case Iop_QandQRSarNnarrow64Sto32Ux2:
3822                fnPessim = mkPCast64x2;
3823                opNarrow = Iop_NarrowUn64to32x2;
3824                break;
3825             case Iop_QandQShrNnarrow32Uto16Ux4:
3826             case Iop_QandQSarNnarrow32Sto16Sx4:
3827             case Iop_QandQSarNnarrow32Sto16Ux4:
3828             case Iop_QandQRShrNnarrow32Uto16Ux4:
3829             case Iop_QandQRSarNnarrow32Sto16Sx4:
3830             case Iop_QandQRSarNnarrow32Sto16Ux4:
3831                fnPessim = mkPCast32x4;
3832                opNarrow = Iop_NarrowUn32to16x4;
3833                break;
3834             case Iop_QandQShrNnarrow16Uto8Ux8:
3835             case Iop_QandQSarNnarrow16Sto8Sx8:
3836             case Iop_QandQSarNnarrow16Sto8Ux8:
3837             case Iop_QandQRShrNnarrow16Uto8Ux8:
3838             case Iop_QandQRSarNnarrow16Sto8Sx8:
3839             case Iop_QandQRSarNnarrow16Sto8Ux8:
3840                fnPessim = mkPCast16x8;
3841                opNarrow = Iop_NarrowUn16to8x8;
3842                break;
3843             default:
3844                tl_assert(0);
3845          }
3846          complainIfUndefined(mce, atom2, NULL);
3847          // Pessimised shift result
3848          IRAtom* shV
3849             = fnPessim(mce, vatom1);
3850          // Narrowed, pessimised shift result
3851          IRAtom* shVnarrowed
3852             = assignNew('V', mce, Ity_I64, unop(opNarrow, shV));
3853          // Generates: Def--(63)--Def PCast-to-I1(narrowed)
3854          IRAtom* qV = mkPCastXXtoXXlsb(mce, shVnarrowed, Ity_I64);
3855          // and assemble the result
3856          return assignNew('V', mce, Ity_V128,
3857                           binop(Iop_64HLtoV128, qV, shVnarrowed));
3858       }
3859
3860       case Iop_Mull32Sx2:
3861       case Iop_Mull32Ux2:
3862       case Iop_QDMull32Sx2:
3863          return vectorWidenI64(mce, Iop_Widen32Sto64x2,
3864                                     mkUifU64(mce, vatom1, vatom2));
3865
3866       case Iop_Mull16Sx4:
3867       case Iop_Mull16Ux4:
3868       case Iop_QDMull16Sx4:
3869          return vectorWidenI64(mce, Iop_Widen16Sto32x4,
3870                                     mkUifU64(mce, vatom1, vatom2));
3871
3872       case Iop_Mull8Sx8:
3873       case Iop_Mull8Ux8:
3874       case Iop_PolynomialMull8x8:
3875          return vectorWidenI64(mce, Iop_Widen8Sto16x8,
3876                                     mkUifU64(mce, vatom1, vatom2));
3877
3878       case Iop_PwAdd32x4:
3879          return mkPCast32x4(mce,
3880                assignNew('V', mce, Ity_V128, binop(op, mkPCast32x4(mce, vatom1),
3881                      mkPCast32x4(mce, vatom2))));
3882
3883       case Iop_PwAdd16x8:
3884          return mkPCast16x8(mce,
3885                assignNew('V', mce, Ity_V128, binop(op, mkPCast16x8(mce, vatom1),
3886                      mkPCast16x8(mce, vatom2))));
3887
3888       case Iop_PwAdd8x16:
3889          return mkPCast8x16(mce,
3890                assignNew('V', mce, Ity_V128, binop(op, mkPCast8x16(mce, vatom1),
3891                      mkPCast8x16(mce, vatom2))));
3892
3893       /* V128-bit data-steering */
3894       case Iop_SetV128lo32:
3895       case Iop_SetV128lo64:
3896       case Iop_64HLtoV128:
3897       case Iop_InterleaveLO64x2:
3898       case Iop_InterleaveLO32x4:
3899       case Iop_InterleaveLO16x8:
3900       case Iop_InterleaveLO8x16:
3901       case Iop_InterleaveHI64x2:
3902       case Iop_InterleaveHI32x4:
3903       case Iop_InterleaveHI16x8:
3904       case Iop_InterleaveHI8x16:
3905       case Iop_CatOddLanes8x16:
3906       case Iop_CatOddLanes16x8:
3907       case Iop_CatOddLanes32x4:
3908       case Iop_CatEvenLanes8x16:
3909       case Iop_CatEvenLanes16x8:
3910       case Iop_CatEvenLanes32x4:
3911       case Iop_InterleaveOddLanes8x16:
3912       case Iop_InterleaveOddLanes16x8:
3913       case Iop_InterleaveOddLanes32x4:
3914       case Iop_InterleaveEvenLanes8x16:
3915       case Iop_InterleaveEvenLanes16x8:
3916       case Iop_InterleaveEvenLanes32x4:
3917       case Iop_PackOddLanes8x16:
3918       case Iop_PackOddLanes16x8:
3919       case Iop_PackOddLanes32x4:
3920       case Iop_PackEvenLanes8x16:
3921       case Iop_PackEvenLanes16x8:
3922       case Iop_PackEvenLanes32x4:
3923          return assignNew('V', mce, Ity_V128, binop(op, vatom1, vatom2));
3924
3925       case Iop_GetElem8x16:
3926          complainIfUndefined(mce, atom2, NULL);
3927          return assignNew('V', mce, Ity_I8, binop(op, vatom1, atom2));
3928       case Iop_GetElem16x8:
3929          complainIfUndefined(mce, atom2, NULL);
3930          return assignNew('V', mce, Ity_I16, binop(op, vatom1, atom2));
3931       case Iop_GetElem32x4:
3932          complainIfUndefined(mce, atom2, NULL);
3933          return assignNew('V', mce, Ity_I32, binop(op, vatom1, atom2));
3934       case Iop_GetElem64x2:
3935          complainIfUndefined(mce, atom2, NULL);
3936          return assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2));
3937
3938      /* Perm8x16: rearrange values in left arg using steering values
3939         from right arg.  So rearrange the vbits in the same way but
3940         pessimise wrt steering values.  Perm32x4 ditto. */
3941       case Iop_Perm8x16:
3942          return mkUifUV128(
3943                    mce,
3944                    assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3945                    mkPCast8x16(mce, vatom2)
3946                 );
3947       case Iop_Perm32x4:
3948          return mkUifUV128(
3949                    mce,
3950                    assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3951                    mkPCast32x4(mce, vatom2)
3952                 );
3953
3954      /* These two take the lower half of each 16-bit lane, sign/zero
3955         extend it to 32, and multiply together, producing a 32x4
3956         result (and implicitly ignoring half the operand bits).  So
3957         treat it as a bunch of independent 16x8 operations, but then
3958         do 32-bit shifts left-right to copy the lower half results
3959         (which are all 0s or all 1s due to PCasting in binary16Ix8)
3960         into the upper half of each result lane. */
3961       case Iop_MullEven16Ux8:
3962       case Iop_MullEven16Sx8: {
3963          IRAtom* at;
3964          at = binary16Ix8(mce,vatom1,vatom2);
3965          at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN32x4, at, mkU8(16)));
3966          at = assignNew('V', mce, Ity_V128, binop(Iop_SarN32x4, at, mkU8(16)));
3967          return at;
3968       }
3969
3970       /* Same deal as Iop_MullEven16{S,U}x8 */
3971       case Iop_MullEven8Ux16:
3972       case Iop_MullEven8Sx16: {
3973          IRAtom* at;
3974          at = binary8Ix16(mce,vatom1,vatom2);
3975          at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN16x8, at, mkU8(8)));
3976          at = assignNew('V', mce, Ity_V128, binop(Iop_SarN16x8, at, mkU8(8)));
3977          return at;
3978       }
3979
3980       /* Same deal as Iop_MullEven16{S,U}x8 */
3981       case Iop_MullEven32Ux4:
3982       case Iop_MullEven32Sx4: {
3983          IRAtom* at;
3984          at = binary32Ix4(mce,vatom1,vatom2);
3985          at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN64x2, at, mkU8(32)));
3986          at = assignNew('V', mce, Ity_V128, binop(Iop_SarN64x2, at, mkU8(32)));
3987          return at;
3988       }
3989
3990       /* narrow 2xV128 into 1xV128, hi half from left arg, in a 2 x
3991          32x4 -> 16x8 laneage, discarding the upper half of each lane.
3992          Simply apply same op to the V bits, since this really no more
3993          than a data steering operation. */
3994       case Iop_NarrowBin32to16x8:
3995       case Iop_NarrowBin16to8x16:
3996       case Iop_NarrowBin64to32x4:
3997          return assignNew('V', mce, Ity_V128,
3998                                     binop(op, vatom1, vatom2));
3999
4000       case Iop_ShrV128:
4001       case Iop_ShlV128:
4002       case Iop_I128StoBCD128:
4003          /* Same scheme as with all other shifts.  Note: 10 Nov 05:
4004             this is wrong now, scalar shifts are done properly lazily.
4005             Vector shifts should be fixed too. */
4006          complainIfUndefined(mce, atom2, NULL);
4007          return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
4008
4009       case Iop_BCDAdd:
4010       case Iop_BCDSub:
4011          return mkLazy2(mce, Ity_V128, vatom1, vatom2);
4012
4013       /* SHA Iops */
4014       case Iop_SHA256:
4015       case Iop_SHA512:
4016          complainIfUndefined(mce, atom2, NULL);
4017          return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
4018
4019       /* I128-bit data-steering */
4020       case Iop_64HLto128:
4021          return assignNew('V', mce, Ity_I128, binop(op, vatom1, vatom2));
4022
4023       /* V256-bit SIMD */
4024
4025       case Iop_Max64Fx4:
4026       case Iop_Min64Fx4:
4027          return binary64Fx4(mce, vatom1, vatom2);
4028
4029       case Iop_Max32Fx8:
4030       case Iop_Min32Fx8:
4031          return binary32Fx8(mce, vatom1, vatom2);
4032
4033       /* V256-bit data-steering */
4034       case Iop_V128HLtoV256:
4035          return assignNew('V', mce, Ity_V256, binop(op, vatom1, vatom2));
4036
4037       /* Scalar floating point */
4038
4039       case Iop_F32toI64S:
4040       case Iop_F32toI64U:
4041          /* I32(rm) x F32 -> I64 */
4042          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4043
4044       case Iop_I64StoF32:
4045          /* I32(rm) x I64 -> F32 */
4046          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4047
4048       case Iop_RoundF64toInt:
4049       case Iop_RoundF64toF32:
4050       case Iop_F64toI64S:
4051       case Iop_F64toI64U:
4052       case Iop_I64StoF64:
4053       case Iop_I64UtoF64:
4054       case Iop_SinF64:
4055       case Iop_CosF64:
4056       case Iop_TanF64:
4057       case Iop_2xm1F64:
4058       case Iop_SqrtF64:
4059       case Iop_RecpExpF64:
4060          /* I32(rm) x I64/F64 -> I64/F64 */
4061          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4062
4063       case Iop_ShlD64:
4064       case Iop_ShrD64:
4065       case Iop_RoundD64toInt:
4066          /* I32(rm) x D64 -> D64 */
4067          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4068
4069       case Iop_ShlD128:
4070       case Iop_ShrD128:
4071       case Iop_RoundD128toInt:
4072          /* I32(rm) x D128 -> D128 */
4073          return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4074
4075       case Iop_RoundF128toInt:
4076          /* I32(rm) x F128 -> F128 */
4077          return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4078
4079       case Iop_D64toI64S:
4080       case Iop_D64toI64U:
4081       case Iop_I64StoD64:
4082       case Iop_I64UtoD64:
4083          /* I32(rm) x I64/D64 -> D64/I64 */
4084          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4085
4086       case Iop_F32toD32:
4087       case Iop_F64toD32:
4088       case Iop_F128toD32:
4089       case Iop_D32toF32:
4090       case Iop_D64toF32:
4091       case Iop_D128toF32:
4092          /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D32/F32 */
4093          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4094
4095       case Iop_F32toD64:
4096       case Iop_F64toD64:
4097       case Iop_F128toD64:
4098       case Iop_D32toF64:
4099       case Iop_D64toF64:
4100       case Iop_D128toF64:
4101          /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D64/F64 */
4102          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4103
4104       case Iop_F32toD128:
4105       case Iop_F64toD128:
4106       case Iop_F128toD128:
4107       case Iop_D32toF128:
4108       case Iop_D64toF128:
4109       case Iop_D128toF128:
4110          /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D128/F128 */
4111          return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4112
4113       case Iop_RoundF32toInt:
4114       case Iop_SqrtF32:
4115       case Iop_RecpExpF32:
4116          /* I32(rm) x I32/F32 -> I32/F32 */
4117          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4118
4119       case Iop_SqrtF128:
4120          /* I32(rm) x F128 -> F128 */
4121          return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4122
4123       case Iop_I32StoF32:
4124       case Iop_I32UtoF32:
4125       case Iop_F32toI32S:
4126       case Iop_F32toI32U:
4127          /* First arg is I32 (rounding mode), second is F32/I32 (data). */
4128          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4129
4130       case Iop_F64toF16:
4131       case Iop_F32toF16:
4132          /* First arg is I32 (rounding mode), second is F64/F32 (data). */
4133          return mkLazy2(mce, Ity_I16, vatom1, vatom2);
4134
4135       case Iop_F128toI32S: /* IRRoundingMode(I32) x F128 -> signed I32  */
4136       case Iop_F128toI32U: /* IRRoundingMode(I32) x F128 -> unsigned I32  */
4137       case Iop_F128toF32:  /* IRRoundingMode(I32) x F128 -> F32         */
4138       case Iop_D128toI32S: /* IRRoundingMode(I32) x D128 -> signed I32  */
4139       case Iop_D128toI32U: /* IRRoundingMode(I32) x D128 -> unsigned I32  */
4140          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4141
4142       case Iop_F128toI128S:   /* IRRoundingMode(I32) x F128 -> signed I128 */
4143       case Iop_RndF128:       /* IRRoundingMode(I32) x F128 -> F128 */
4144          return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4145
4146       case Iop_F128toI64S: /* IRRoundingMode(I32) x F128 -> signed I64  */
4147       case Iop_F128toI64U: /* IRRoundingMode(I32) x F128 -> unsigned I64  */
4148       case Iop_F128toF64:  /* IRRoundingMode(I32) x F128 -> F64         */
4149       case Iop_D128toD64:  /* IRRoundingMode(I64) x D128 -> D64 */
4150       case Iop_D128toI64S: /* IRRoundingMode(I64) x D128 -> signed I64  */
4151       case Iop_D128toI64U: /* IRRoundingMode(I32) x D128 -> unsigned I64  */
4152          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4153
4154       case Iop_F64HLtoF128:
4155       case Iop_D64HLtoD128:
4156          return assignNew('V', mce, Ity_I128,
4157                           binop(Iop_64HLto128, vatom1, vatom2));
4158
4159       case Iop_F64toI32U:
4160       case Iop_F64toI32S:
4161       case Iop_F64toF32:
4162       case Iop_I64UtoF32:
4163       case Iop_D64toI32U:
4164       case Iop_D64toI32S:
4165          /* First arg is I32 (rounding mode), second is F64/D64 (data). */
4166          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4167
4168       case Iop_D64toD32:
4169          /* First arg is I32 (rounding mode), second is D64 (data). */
4170          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4171
4172       case Iop_F64toI16S:
4173          /* First arg is I32 (rounding mode), second is F64 (data). */
4174          return mkLazy2(mce, Ity_I16, vatom1, vatom2);
4175
4176       case Iop_InsertExpD64:
4177          /*  I64 x I64 -> D64 */
4178          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4179
4180       case Iop_InsertExpD128:
4181          /*  I64 x I128 -> D128 */
4182          return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4183
4184       case Iop_CmpF32:
4185       case Iop_CmpF64:
4186       case Iop_CmpF128:
4187       case Iop_CmpD64:
4188       case Iop_CmpD128:
4189       case Iop_CmpExpD64:
4190       case Iop_CmpExpD128:
4191          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4192
4193       case Iop_MaxNumF32:
4194       case Iop_MinNumF32:
4195          /* F32 x F32 -> F32 */
4196          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4197
4198       case Iop_MaxNumF64:
4199       case Iop_MinNumF64:
4200          /* F64 x F64 -> F64 */
4201          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4202
4203       /* non-FP after here */
4204
4205       case Iop_DivModU64to32:
4206       case Iop_DivModS64to32:
4207          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4208
4209       case Iop_DivModU128to64:
4210       case Iop_DivModS128to64:
4211          return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4212
4213       case Iop_8HLto16:
4214          return assignNew('V', mce, Ity_I16, binop(op, vatom1, vatom2));
4215       case Iop_16HLto32:
4216          return assignNew('V', mce, Ity_I32, binop(op, vatom1, vatom2));
4217       case Iop_32HLto64:
4218          return assignNew('V', mce, Ity_I64, binop(op, vatom1, vatom2));
4219
4220       case Iop_DivModU64to64:
4221       case Iop_DivModS64to64: {
4222          IRAtom* vTmp64 = mkLazy2(mce, Ity_I64, vatom1, vatom2);
4223          return assignNew('V', mce, Ity_I128,
4224                           binop(Iop_64HLto128, vTmp64, vTmp64));
4225       }
4226
4227       case Iop_MullS64:
4228       case Iop_MullU64: {
4229          IRAtom* vLo64 = mkLeft64(mce, mkUifU64(mce, vatom1,vatom2));
4230          IRAtom* vHi64 = mkPCastTo(mce, Ity_I64, vLo64);
4231          return assignNew('V', mce, Ity_I128,
4232                           binop(Iop_64HLto128, vHi64, vLo64));
4233       }
4234
4235       case Iop_DivModU32to32:
4236       case Iop_DivModS32to32: {
4237          IRAtom* vTmp32 = mkLazy2(mce, Ity_I32, vatom1, vatom2);
4238          return assignNew('V', mce, Ity_I64,
4239                           binop(Iop_32HLto64, vTmp32, vTmp32));
4240       }
4241
4242       case Iop_MullS32:
4243       case Iop_MullU32: {
4244          IRAtom* vLo32 = mkLeft32(mce, mkUifU32(mce, vatom1,vatom2));
4245          IRAtom* vHi32 = mkPCastTo(mce, Ity_I32, vLo32);
4246          return assignNew('V', mce, Ity_I64,
4247                           binop(Iop_32HLto64, vHi32, vLo32));
4248       }
4249
4250       case Iop_MullS16:
4251       case Iop_MullU16: {
4252          IRAtom* vLo16 = mkLeft16(mce, mkUifU16(mce, vatom1,vatom2));
4253          IRAtom* vHi16 = mkPCastTo(mce, Ity_I16, vLo16);
4254          return assignNew('V', mce, Ity_I32,
4255                           binop(Iop_16HLto32, vHi16, vLo16));
4256       }
4257
4258       case Iop_MullS8:
4259       case Iop_MullU8: {
4260          IRAtom* vLo8 = mkLeft8(mce, mkUifU8(mce, vatom1,vatom2));
4261          IRAtom* vHi8 = mkPCastTo(mce, Ity_I8, vLo8);
4262          return assignNew('V', mce, Ity_I16, binop(Iop_8HLto16, vHi8, vLo8));
4263       }
4264
4265       case Iop_Sad8Ux4: /* maybe we could do better?  ftm, do mkLazy2. */
4266       case Iop_DivS32:
4267       case Iop_DivU32:
4268       case Iop_DivU32E:
4269       case Iop_DivS32E:
4270       case Iop_QAdd32S: /* could probably do better */
4271       case Iop_QSub32S: /* could probably do better */
4272          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4273
4274       case Iop_DivS64:
4275       case Iop_DivU64:
4276       case Iop_DivS64E:
4277       case Iop_DivU64E:
4278          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4279
4280       case Iop_Add32:
4281          if (mce->dlbo.dl_Add32 == DLexpensive
4282              || (mce->dlbo.dl_Add32 == DLauto && hu == HuOth)) {
4283              return expensiveAddSub(mce,True,Ity_I32,
4284                                     vatom1,vatom2, atom1,atom2);
4285          } else {
4286              goto cheap_AddSub32;
4287          }
4288       case Iop_Sub32:
4289          if (mce->dlbo.dl_Sub32 == DLexpensive
4290              || (mce->dlbo.dl_Sub32 == DLauto && hu == HuOth)) {
4291              return expensiveAddSub(mce,False,Ity_I32,
4292                                     vatom1,vatom2, atom1,atom2);
4293          } else {
4294              goto cheap_AddSub32;
4295          }
4296
4297       cheap_AddSub32:
4298       case Iop_Mul32:
4299          return mkLeft32(mce, mkUifU32(mce, vatom1,vatom2));
4300
4301       case Iop_CmpORD32S:
4302       case Iop_CmpORD32U:
4303       case Iop_CmpORD64S:
4304       case Iop_CmpORD64U:
4305          return doCmpORD(mce, op, vatom1,vatom2, atom1,atom2);
4306
4307       case Iop_Add64:
4308          if (mce->dlbo.dl_Add64 == DLexpensive
4309              || (mce->dlbo.dl_Add64 == DLauto && hu == HuOth)) {
4310              return expensiveAddSub(mce,True,Ity_I64,
4311                                     vatom1,vatom2, atom1,atom2);
4312          } else {
4313              goto cheap_AddSub64;
4314          }
4315       case Iop_Sub64:
4316          if (mce->dlbo.dl_Sub64 == DLexpensive
4317              || (mce->dlbo.dl_Sub64 == DLauto && hu == HuOth)) {
4318              return expensiveAddSub(mce,False,Ity_I64,
4319                                     vatom1,vatom2, atom1,atom2);
4320          } else {
4321              goto cheap_AddSub64;
4322          }
4323
4324       cheap_AddSub64:
4325       case Iop_Mul64:
4326          return mkLeft64(mce, mkUifU64(mce, vatom1,vatom2));
4327
4328       case Iop_Mul16:
4329       case Iop_Add16:
4330       case Iop_Sub16:
4331          return mkLeft16(mce, mkUifU16(mce, vatom1,vatom2));
4332
4333       case Iop_Mul8:
4334       case Iop_Sub8:
4335       case Iop_Add8:
4336          return mkLeft8(mce, mkUifU8(mce, vatom1,vatom2));
4337
4338       ////---- CmpXX64
4339       case Iop_CmpEQ64: case Iop_CmpNE64:
4340          if (mce->dlbo.dl_CmpEQ64_CmpNE64 == DLexpensive)
4341             goto expensive_cmp64;
4342          else
4343             goto cheap_cmp64;
4344
4345       expensive_cmp64:
4346       case Iop_ExpCmpNE64:
4347          return expensiveCmpEQorNE(mce,Ity_I64, vatom1,vatom2, atom1,atom2 );
4348
4349       cheap_cmp64:
4350       case Iop_CmpLE64S: case Iop_CmpLE64U:
4351       case Iop_CmpLT64U: case Iop_CmpLT64S:
4352          return mkPCastTo(mce, Ity_I1, mkUifU64(mce, vatom1,vatom2));
4353
4354       ////---- CmpXX32
4355       case Iop_CmpEQ32: case Iop_CmpNE32:
4356          if (mce->dlbo.dl_CmpEQ32_CmpNE32 == DLexpensive)
4357             goto expensive_cmp32;
4358          else
4359             goto cheap_cmp32;
4360
4361       expensive_cmp32:
4362       case Iop_ExpCmpNE32:
4363          return expensiveCmpEQorNE(mce,Ity_I32, vatom1,vatom2, atom1,atom2 );
4364
4365       cheap_cmp32:
4366       case Iop_CmpLE32S: case Iop_CmpLE32U:
4367       case Iop_CmpLT32U: case Iop_CmpLT32S:
4368          return mkPCastTo(mce, Ity_I1, mkUifU32(mce, vatom1,vatom2));
4369
4370       ////---- CmpXX16
4371       case Iop_CmpEQ16: case Iop_CmpNE16:
4372          if (mce->dlbo.dl_CmpEQ16_CmpNE16 == DLexpensive)
4373             goto expensive_cmp16;
4374          else
4375             goto cheap_cmp16;
4376
4377       expensive_cmp16:
4378       case Iop_ExpCmpNE16:
4379          return expensiveCmpEQorNE(mce,Ity_I16, vatom1,vatom2, atom1,atom2 );
4380
4381       cheap_cmp16:
4382          return mkPCastTo(mce, Ity_I1, mkUifU16(mce, vatom1,vatom2));
4383
4384       ////---- CmpXX8
4385       case Iop_CmpEQ8: case Iop_CmpNE8:
4386          if (mce->dlbo.dl_CmpEQ8_CmpNE8 == DLexpensive)
4387             goto expensive_cmp8;
4388          else
4389             goto cheap_cmp8;
4390
4391       expensive_cmp8:
4392          return expensiveCmpEQorNE(mce,Ity_I8, vatom1,vatom2, atom1,atom2 );
4393
4394       cheap_cmp8:
4395          return mkPCastTo(mce, Ity_I1, mkUifU8(mce, vatom1,vatom2));
4396
4397       ////---- end CmpXX{64,32,16,8}
4398
4399       case Iop_CasCmpEQ8:  case Iop_CasCmpNE8:
4400       case Iop_CasCmpEQ16: case Iop_CasCmpNE16:
4401       case Iop_CasCmpEQ32: case Iop_CasCmpNE32:
4402       case Iop_CasCmpEQ64: case Iop_CasCmpNE64:
4403          /* Just say these all produce a defined result, regardless
4404             of their arguments.  See COMMENT_ON_CasCmpEQ in this file. */
4405          return assignNew('V', mce, Ity_I1, definedOfType(Ity_I1));
4406
4407       case Iop_Shl64: case Iop_Shr64: case Iop_Sar64:
4408          return scalarShift( mce, Ity_I64, op, vatom1,vatom2, atom1,atom2 );
4409
4410       case Iop_Shl32: case Iop_Shr32: case Iop_Sar32:
4411          return scalarShift( mce, Ity_I32, op, vatom1,vatom2, atom1,atom2 );
4412
4413       case Iop_Shl16: case Iop_Shr16: case Iop_Sar16:
4414          return scalarShift( mce, Ity_I16, op, vatom1,vatom2, atom1,atom2 );
4415
4416       case Iop_Shl8: case Iop_Shr8: case Iop_Sar8:
4417          return scalarShift( mce, Ity_I8, op, vatom1,vatom2, atom1,atom2 );
4418
4419       case Iop_AndV256:
4420          uifu = mkUifUV256; difd = mkDifDV256;
4421          and_or_ty = Ity_V256; improve = mkImproveANDV256; goto do_And_Or;
4422       case Iop_AndV128:
4423          uifu = mkUifUV128; difd = mkDifDV128;
4424          and_or_ty = Ity_V128; improve = mkImproveANDV128; goto do_And_Or;
4425       case Iop_And64:
4426          uifu = mkUifU64; difd = mkDifD64;
4427          and_or_ty = Ity_I64; improve = mkImproveAND64; goto do_And_Or;
4428       case Iop_And32:
4429          uifu = mkUifU32; difd = mkDifD32;
4430          and_or_ty = Ity_I32; improve = mkImproveAND32; goto do_And_Or;
4431       case Iop_And16:
4432          uifu = mkUifU16; difd = mkDifD16;
4433          and_or_ty = Ity_I16; improve = mkImproveAND16; goto do_And_Or;
4434       case Iop_And8:
4435          uifu = mkUifU8; difd = mkDifD8;
4436          and_or_ty = Ity_I8; improve = mkImproveAND8; goto do_And_Or;
4437
4438       case Iop_OrV256:
4439          uifu = mkUifUV256; difd = mkDifDV256;
4440          and_or_ty = Ity_V256; improve = mkImproveORV256; goto do_And_Or;
4441       case Iop_OrV128:
4442          uifu = mkUifUV128; difd = mkDifDV128;
4443          and_or_ty = Ity_V128; improve = mkImproveORV128; goto do_And_Or;
4444       case Iop_Or64:
4445          uifu = mkUifU64; difd = mkDifD64;
4446          and_or_ty = Ity_I64; improve = mkImproveOR64; goto do_And_Or;
4447       case Iop_Or32:
4448          uifu = mkUifU32; difd = mkDifD32;
4449          and_or_ty = Ity_I32; improve = mkImproveOR32; goto do_And_Or;
4450       case Iop_Or16:
4451          uifu = mkUifU16; difd = mkDifD16;
4452          and_or_ty = Ity_I16; improve = mkImproveOR16; goto do_And_Or;
4453       case Iop_Or8:
4454          uifu = mkUifU8; difd = mkDifD8;
4455          and_or_ty = Ity_I8; improve = mkImproveOR8; goto do_And_Or;
4456
4457       do_And_Or:
4458          return
4459          assignNew(
4460             'V', mce,
4461             and_or_ty,
4462             difd(mce, uifu(mce, vatom1, vatom2),
4463                       difd(mce, improve(mce, atom1, vatom1),
4464                                 improve(mce, atom2, vatom2) ) ) );
4465
4466       case Iop_Xor8:
4467          return mkUifU8(mce, vatom1, vatom2);
4468       case Iop_Xor16:
4469          return mkUifU16(mce, vatom1, vatom2);
4470       case Iop_Xor32:
4471          return mkUifU32(mce, vatom1, vatom2);
4472       case Iop_Xor64:
4473          return mkUifU64(mce, vatom1, vatom2);
4474       case Iop_XorV128:
4475          return mkUifUV128(mce, vatom1, vatom2);
4476       case Iop_XorV256:
4477          return mkUifUV256(mce, vatom1, vatom2);
4478
4479       /* V256-bit SIMD */
4480
4481       case Iop_ShrN16x16:
4482       case Iop_ShrN32x8:
4483       case Iop_ShrN64x4:
4484       case Iop_SarN16x16:
4485       case Iop_SarN32x8:
4486       case Iop_ShlN16x16:
4487       case Iop_ShlN32x8:
4488       case Iop_ShlN64x4:
4489          /* Same scheme as with all other shifts.  Note: 22 Oct 05:
4490             this is wrong now, scalar shifts are done properly lazily.
4491             Vector shifts should be fixed too. */
4492          complainIfUndefined(mce, atom2, NULL);
4493          return assignNew('V', mce, Ity_V256, binop(op, vatom1, atom2));
4494
4495       case Iop_QSub8Ux32:
4496       case Iop_QSub8Sx32:
4497       case Iop_Sub8x32:
4498       case Iop_Min8Ux32:
4499       case Iop_Min8Sx32:
4500       case Iop_Max8Ux32:
4501       case Iop_Max8Sx32:
4502       case Iop_CmpGT8Sx32:
4503       case Iop_CmpEQ8x32:
4504       case Iop_Avg8Ux32:
4505       case Iop_QAdd8Ux32:
4506       case Iop_QAdd8Sx32:
4507       case Iop_Add8x32:
4508          return binary8Ix32(mce, vatom1, vatom2);
4509
4510       case Iop_QSub16Ux16:
4511       case Iop_QSub16Sx16:
4512       case Iop_Sub16x16:
4513       case Iop_Mul16x16:
4514       case Iop_MulHi16Sx16:
4515       case Iop_MulHi16Ux16:
4516       case Iop_Min16Sx16:
4517       case Iop_Min16Ux16:
4518       case Iop_Max16Sx16:
4519       case Iop_Max16Ux16:
4520       case Iop_CmpGT16Sx16:
4521       case Iop_CmpEQ16x16:
4522       case Iop_Avg16Ux16:
4523       case Iop_QAdd16Ux16:
4524       case Iop_QAdd16Sx16:
4525       case Iop_Add16x16:
4526          return binary16Ix16(mce, vatom1, vatom2);
4527
4528       case Iop_Sub32x8:
4529       case Iop_CmpGT32Sx8:
4530       case Iop_CmpEQ32x8:
4531       case Iop_Add32x8:
4532       case Iop_Max32Ux8:
4533       case Iop_Max32Sx8:
4534       case Iop_Min32Ux8:
4535       case Iop_Min32Sx8:
4536       case Iop_Mul32x8:
4537          return binary32Ix8(mce, vatom1, vatom2);
4538
4539       case Iop_Sub64x4:
4540       case Iop_Add64x4:
4541       case Iop_CmpEQ64x4:
4542       case Iop_CmpGT64Sx4:
4543          return binary64Ix4(mce, vatom1, vatom2);
4544
4545      /* Perm32x8: rearrange values in left arg using steering values
4546         from right arg.  So rearrange the vbits in the same way but
4547         pessimise wrt steering values. */
4548       case Iop_Perm32x8:
4549          return mkUifUV256(
4550                    mce,
4551                    assignNew('V', mce, Ity_V256, binop(op, vatom1, atom2)),
4552                    mkPCast32x8(mce, vatom2)
4553                 );
4554
4555       /* Q-and-Qshift-by-vector of the form (V128, V128) -> V256.
4556          Handle the shifted results in the same way that other
4557          binary Q ops are handled, eg QSub: UifU the two args,
4558          then pessimise -- which is binaryNIxM.  But for the upper
4559          V128, we require to generate just 1 bit which is the
4560          pessimised shift result, with 127 defined zeroes above it.
4561
4562          Note that this overly pessimistic in that in fact only the
4563          bottom 8 bits of each lane of the second arg determine the shift
4564          amount.  Really we ought to ignore any undefinedness in the
4565          rest of the lanes of the second arg. */
4566       case Iop_QandSQsh64x2:  case Iop_QandUQsh64x2:
4567       case Iop_QandSQRsh64x2: case Iop_QandUQRsh64x2:
4568       case Iop_QandSQsh32x4:  case Iop_QandUQsh32x4:
4569       case Iop_QandSQRsh32x4: case Iop_QandUQRsh32x4:
4570       case Iop_QandSQsh16x8:  case Iop_QandUQsh16x8:
4571       case Iop_QandSQRsh16x8: case Iop_QandUQRsh16x8:
4572       case Iop_QandSQsh8x16:  case Iop_QandUQsh8x16:
4573       case Iop_QandSQRsh8x16: case Iop_QandUQRsh8x16:
4574       {
4575          // The function to generate the pessimised shift result
4576          IRAtom* (*binaryNIxM)(MCEnv*,IRAtom*,IRAtom*) = NULL;
4577          switch (op) {
4578             case Iop_QandSQsh64x2:
4579             case Iop_QandUQsh64x2:
4580             case Iop_QandSQRsh64x2:
4581             case Iop_QandUQRsh64x2:
4582                binaryNIxM = binary64Ix2;
4583                break;
4584             case Iop_QandSQsh32x4:
4585             case Iop_QandUQsh32x4:
4586             case Iop_QandSQRsh32x4:
4587             case Iop_QandUQRsh32x4:
4588                binaryNIxM = binary32Ix4;
4589                break;
4590             case Iop_QandSQsh16x8:
4591             case Iop_QandUQsh16x8:
4592             case Iop_QandSQRsh16x8:
4593             case Iop_QandUQRsh16x8:
4594                binaryNIxM = binary16Ix8;
4595                break;
4596             case Iop_QandSQsh8x16:
4597             case Iop_QandUQsh8x16:
4598             case Iop_QandSQRsh8x16:
4599             case Iop_QandUQRsh8x16:
4600                binaryNIxM = binary8Ix16;
4601                break;
4602             default:
4603                tl_assert(0);
4604          }
4605          tl_assert(binaryNIxM);
4606          // Pessimised shift result, shV[127:0]
4607          IRAtom* shV = binaryNIxM(mce, vatom1, vatom2);
4608          // Generates: Def--(127)--Def PCast-to-I1(shV)
4609          IRAtom* qV = mkPCastXXtoXXlsb(mce, shV, Ity_V128);
4610          // and assemble the result
4611          return assignNew('V', mce, Ity_V256,
4612                           binop(Iop_V128HLtoV256, qV, shV));
4613       }
4614
4615       default:
4616          ppIROp(op);
4617          VG_(tool_panic)("memcheck:expr2vbits_Binop");
4618    }
4619 }
4620
4621
4622 static
4623 IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
4624 {
4625    /* For the widening operations {8,16,32}{U,S}to{16,32,64}, the
4626       selection of shadow operation implicitly duplicates the logic in
4627       do_shadow_LoadG and should be kept in sync (in the very unlikely
4628       event that the interpretation of such widening ops changes in
4629       future).  See comment in do_shadow_LoadG. */
4630    IRAtom* vatom = expr2vbits( mce, atom, HuOth );
4631    tl_assert(isOriginalAtom(mce,atom));
4632    switch (op) {
4633
4634       case Iop_Abs64Fx2:
4635       case Iop_Neg64Fx2:
4636       case Iop_RSqrtEst64Fx2:
4637       case Iop_RecipEst64Fx2:
4638       case Iop_Log2_64Fx2:
4639          return unary64Fx2(mce, vatom);
4640
4641       case Iop_Sqrt64F0x2:
4642          return unary64F0x2(mce, vatom);
4643
4644       case Iop_Sqrt32Fx8:
4645       case Iop_RSqrtEst32Fx8:
4646       case Iop_RecipEst32Fx8:
4647          return unary32Fx8(mce, vatom);
4648
4649       case Iop_Sqrt64Fx4:
4650          return unary64Fx4(mce, vatom);
4651
4652       case Iop_RecipEst32Fx4:
4653       case Iop_I32UtoFx4:
4654       case Iop_I32StoFx4:
4655       case Iop_QFtoI32Ux4_RZ:
4656       case Iop_QFtoI32Sx4_RZ:
4657       case Iop_RoundF32x4_RM:
4658       case Iop_RoundF32x4_RP:
4659       case Iop_RoundF32x4_RN:
4660       case Iop_RoundF32x4_RZ:
4661       case Iop_RecipEst32Ux4:
4662       case Iop_Abs32Fx4:
4663       case Iop_Neg32Fx4:
4664       case Iop_RSqrtEst32Fx4:
4665       case Iop_Log2_32Fx4:
4666          return unary32Fx4(mce, vatom);
4667
4668       case Iop_I32UtoFx2:
4669       case Iop_I32StoFx2:
4670       case Iop_RecipEst32Fx2:
4671       case Iop_RecipEst32Ux2:
4672       case Iop_Abs32Fx2:
4673       case Iop_Neg32Fx2:
4674       case Iop_RSqrtEst32Fx2:
4675          return unary32Fx2(mce, vatom);
4676
4677       case Iop_Sqrt32F0x4:
4678       case Iop_RSqrtEst32F0x4:
4679       case Iop_RecipEst32F0x4:
4680          return unary32F0x4(mce, vatom);
4681
4682       case Iop_32UtoV128:
4683       case Iop_64UtoV128:
4684       case Iop_Dup8x16:
4685       case Iop_Dup16x8:
4686       case Iop_Dup32x4:
4687       case Iop_Reverse1sIn8_x16:
4688       case Iop_Reverse8sIn16_x8:
4689       case Iop_Reverse8sIn32_x4:
4690       case Iop_Reverse16sIn32_x4:
4691       case Iop_Reverse8sIn64_x2:
4692       case Iop_Reverse16sIn64_x2:
4693       case Iop_Reverse32sIn64_x2:
4694       case Iop_V256toV128_1: case Iop_V256toV128_0:
4695       case Iop_ZeroHI64ofV128:
4696       case Iop_ZeroHI96ofV128:
4697       case Iop_ZeroHI112ofV128:
4698       case Iop_ZeroHI120ofV128:
4699          return assignNew('V', mce, Ity_V128, unop(op, vatom));
4700
4701       case Iop_F128HItoF64:  /* F128 -> high half of F128 */
4702       case Iop_D128HItoD64:  /* D128 -> high half of D128 */
4703          return assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, vatom));
4704       case Iop_F128LOtoF64:  /* F128 -> low  half of F128 */
4705       case Iop_D128LOtoD64:  /* D128 -> low  half of D128 */
4706          return assignNew('V', mce, Ity_I64, unop(Iop_128to64, vatom));
4707
4708       case Iop_NegF128:
4709       case Iop_AbsF128:
4710       case Iop_RndF128:
4711       case Iop_TruncF128toI64S: /* F128 -> I64S */
4712       case Iop_TruncF128toI32S: /* F128 -> I32S (result stored in 64-bits) */
4713       case Iop_TruncF128toI64U: /* F128 -> I64U */
4714       case Iop_TruncF128toI32U: /* F128 -> I32U (result stored in 64-bits) */
4715          return mkPCastTo(mce, Ity_I128, vatom);
4716
4717       case Iop_BCD128toI128S:
4718       case Iop_MulI128by10:
4719       case Iop_MulI128by10Carry:
4720       case Iop_F16toF64x2:
4721       case Iop_F64toF16x2:
4722          return vatom;
4723
4724       case Iop_I32StoF128: /* signed I32 -> F128 */
4725       case Iop_I64StoF128: /* signed I64 -> F128 */
4726       case Iop_I32UtoF128: /* unsigned I32 -> F128 */
4727       case Iop_I64UtoF128: /* unsigned I64 -> F128 */
4728       case Iop_F32toF128:  /* F32 -> F128 */
4729       case Iop_F64toF128:  /* F64 -> F128 */
4730       case Iop_I32StoD128: /* signed I64 -> D128 */
4731       case Iop_I64StoD128: /* signed I64 -> D128 */
4732       case Iop_I32UtoD128: /* unsigned I32 -> D128 */
4733       case Iop_I64UtoD128: /* unsigned I64 -> D128 */
4734          return mkPCastTo(mce, Ity_I128, vatom);
4735
4736       case Iop_F16toF64:
4737       case Iop_F32toF64:
4738       case Iop_I32StoF64:
4739       case Iop_I32UtoF64:
4740       case Iop_NegF64:
4741       case Iop_AbsF64:
4742       case Iop_RSqrtEst5GoodF64:
4743       case Iop_RoundF64toF64_NEAREST:
4744       case Iop_RoundF64toF64_NegINF:
4745       case Iop_RoundF64toF64_PosINF:
4746       case Iop_RoundF64toF64_ZERO:
4747       case Iop_Clz64:
4748       case Iop_D32toD64:
4749       case Iop_I32StoD64:
4750       case Iop_I32UtoD64:
4751       case Iop_ExtractExpD64:    /* D64  -> I64 */
4752       case Iop_ExtractExpD128:   /* D128 -> I64 */
4753       case Iop_ExtractSigD64:    /* D64  -> I64 */
4754       case Iop_ExtractSigD128:   /* D128 -> I64 */
4755       case Iop_DPBtoBCD:
4756       case Iop_BCDtoDPB:
4757          return mkPCastTo(mce, Ity_I64, vatom);
4758
4759       case Iop_D64toD128:
4760          return mkPCastTo(mce, Ity_I128, vatom);
4761
4762       case Iop_Clz32:
4763       case Iop_TruncF64asF32:
4764       case Iop_NegF32:
4765       case Iop_AbsF32:
4766       case Iop_F16toF32:
4767          return mkPCastTo(mce, Ity_I32, vatom);
4768
4769       case Iop_Ctz32:
4770       case Iop_Ctz64:
4771          return expensiveCountTrailingZeroes(mce, op, atom, vatom);
4772
4773       case Iop_1Uto64:
4774       case Iop_1Sto64:
4775       case Iop_8Uto64:
4776       case Iop_8Sto64:
4777       case Iop_16Uto64:
4778       case Iop_16Sto64:
4779       case Iop_32Sto64:
4780       case Iop_32Uto64:
4781       case Iop_V128to64:
4782       case Iop_V128HIto64:
4783       case Iop_128HIto64:
4784       case Iop_128to64:
4785       case Iop_Dup8x8:
4786       case Iop_Dup16x4:
4787       case Iop_Dup32x2:
4788       case Iop_Reverse8sIn16_x4:
4789       case Iop_Reverse8sIn32_x2:
4790       case Iop_Reverse16sIn32_x2:
4791       case Iop_Reverse8sIn64_x1:
4792       case Iop_Reverse16sIn64_x1:
4793       case Iop_Reverse32sIn64_x1:
4794       case Iop_V256to64_0: case Iop_V256to64_1:
4795       case Iop_V256to64_2: case Iop_V256to64_3:
4796          return assignNew('V', mce, Ity_I64, unop(op, vatom));
4797
4798       case Iop_64to32:
4799       case Iop_64HIto32:
4800       case Iop_1Uto32:
4801       case Iop_1Sto32:
4802       case Iop_8Uto32:
4803       case Iop_16Uto32:
4804       case Iop_16Sto32:
4805       case Iop_8Sto32:
4806       case Iop_V128to32:
4807          return assignNew('V', mce, Ity_I32, unop(op, vatom));
4808
4809       case Iop_8Sto16:
4810       case Iop_8Uto16:
4811       case Iop_32to16:
4812       case Iop_32HIto16:
4813       case Iop_64to16:
4814       case Iop_GetMSBs8x16:
4815          return assignNew('V', mce, Ity_I16, unop(op, vatom));
4816
4817       case Iop_1Uto8:
4818       case Iop_1Sto8:
4819       case Iop_16to8:
4820       case Iop_16HIto8:
4821       case Iop_32to8:
4822       case Iop_64to8:
4823       case Iop_GetMSBs8x8:
4824          return assignNew('V', mce, Ity_I8, unop(op, vatom));
4825
4826       case Iop_32to1:
4827          return assignNew('V', mce, Ity_I1, unop(Iop_32to1, vatom));
4828
4829       case Iop_64to1:
4830          return assignNew('V', mce, Ity_I1, unop(Iop_64to1, vatom));
4831
4832       case Iop_ReinterpF64asI64:
4833       case Iop_ReinterpI64asF64:
4834       case Iop_ReinterpI32asF32:
4835       case Iop_ReinterpF32asI32:
4836       case Iop_ReinterpI64asD64:
4837       case Iop_ReinterpD64asI64:
4838       case Iop_NotV256:
4839       case Iop_NotV128:
4840       case Iop_Not64:
4841       case Iop_Not32:
4842       case Iop_Not16:
4843       case Iop_Not8:
4844       case Iop_Not1:
4845          return vatom;
4846
4847       case Iop_CmpNEZ8x8:
4848       case Iop_Cnt8x8:
4849       case Iop_Clz8x8:
4850       case Iop_Cls8x8:
4851       case Iop_Abs8x8:
4852          return mkPCast8x8(mce, vatom);
4853
4854       case Iop_CmpNEZ8x16:
4855       case Iop_Cnt8x16:
4856       case Iop_Clz8x16:
4857       case Iop_Cls8x16:
4858       case Iop_Abs8x16:
4859       case Iop_Ctz8x16:
4860          return mkPCast8x16(mce, vatom);
4861
4862       case Iop_CmpNEZ16x4:
4863       case Iop_Clz16x4:
4864       case Iop_Cls16x4:
4865       case Iop_Abs16x4:
4866          return mkPCast16x4(mce, vatom);
4867
4868       case Iop_CmpNEZ16x8:
4869       case Iop_Clz16x8:
4870       case Iop_Cls16x8:
4871       case Iop_Abs16x8:
4872       case Iop_Ctz16x8:
4873          return mkPCast16x8(mce, vatom);
4874
4875       case Iop_CmpNEZ32x2:
4876       case Iop_Clz32x2:
4877       case Iop_Cls32x2:
4878       case Iop_FtoI32Ux2_RZ:
4879       case Iop_FtoI32Sx2_RZ:
4880       case Iop_Abs32x2:
4881          return mkPCast32x2(mce, vatom);
4882
4883       case Iop_CmpNEZ32x4:
4884       case Iop_Clz32x4:
4885       case Iop_Cls32x4:
4886       case Iop_FtoI32Ux4_RZ:
4887       case Iop_FtoI32Sx4_RZ:
4888       case Iop_Abs32x4:
4889       case Iop_RSqrtEst32Ux4:
4890       case Iop_Ctz32x4:
4891          return mkPCast32x4(mce, vatom);
4892
4893       case Iop_CmpwNEZ32:
4894          return mkPCastTo(mce, Ity_I32, vatom);
4895
4896       case Iop_CmpwNEZ64:
4897          return mkPCastTo(mce, Ity_I64, vatom);
4898
4899       case Iop_CmpNEZ64x2:
4900       case Iop_CipherSV128:
4901       case Iop_Clz64x2:
4902       case Iop_Abs64x2:
4903       case Iop_Ctz64x2:
4904          return mkPCast64x2(mce, vatom);
4905
4906       case Iop_PwBitMtxXpose64x2:
4907          return assignNew('V', mce, Ity_V128, unop(op, vatom));
4908
4909       case Iop_NarrowUn16to8x8:
4910       case Iop_NarrowUn32to16x4:
4911       case Iop_NarrowUn64to32x2:
4912       case Iop_QNarrowUn16Sto8Sx8:
4913       case Iop_QNarrowUn16Sto8Ux8:
4914       case Iop_QNarrowUn16Uto8Ux8:
4915       case Iop_QNarrowUn32Sto16Sx4:
4916       case Iop_QNarrowUn32Sto16Ux4:
4917       case Iop_QNarrowUn32Uto16Ux4:
4918       case Iop_QNarrowUn64Sto32Sx2:
4919       case Iop_QNarrowUn64Sto32Ux2:
4920       case Iop_QNarrowUn64Uto32Ux2:
4921       case Iop_F32toF16x4:
4922          return vectorNarrowUnV128(mce, op, vatom);
4923
4924       case Iop_Widen8Sto16x8:
4925       case Iop_Widen8Uto16x8:
4926       case Iop_Widen16Sto32x4:
4927       case Iop_Widen16Uto32x4:
4928       case Iop_Widen32Sto64x2:
4929       case Iop_Widen32Uto64x2:
4930       case Iop_F16toF32x4:
4931          return vectorWidenI64(mce, op, vatom);
4932
4933       case Iop_PwAddL32Ux2:
4934       case Iop_PwAddL32Sx2:
4935          return mkPCastTo(mce, Ity_I64,
4936                assignNew('V', mce, Ity_I64, unop(op, mkPCast32x2(mce, vatom))));
4937
4938       case Iop_PwAddL16Ux4:
4939       case Iop_PwAddL16Sx4:
4940          return mkPCast32x2(mce,
4941                assignNew('V', mce, Ity_I64, unop(op, mkPCast16x4(mce, vatom))));
4942
4943       case Iop_PwAddL8Ux8:
4944       case Iop_PwAddL8Sx8:
4945          return mkPCast16x4(mce,
4946                assignNew('V', mce, Ity_I64, unop(op, mkPCast8x8(mce, vatom))));
4947
4948       case Iop_PwAddL32Ux4:
4949       case Iop_PwAddL32Sx4:
4950          return mkPCast64x2(mce,
4951                assignNew('V', mce, Ity_V128, unop(op, mkPCast32x4(mce, vatom))));
4952
4953       case Iop_PwAddL16Ux8:
4954       case Iop_PwAddL16Sx8:
4955          return mkPCast32x4(mce,
4956                assignNew('V', mce, Ity_V128, unop(op, mkPCast16x8(mce, vatom))));
4957
4958       case Iop_PwAddL8Ux16:
4959       case Iop_PwAddL8Sx16:
4960          return mkPCast16x8(mce,
4961                assignNew('V', mce, Ity_V128, unop(op, mkPCast8x16(mce, vatom))));
4962
4963       case Iop_I64UtoF32:
4964       default:
4965          ppIROp(op);
4966          VG_(tool_panic)("memcheck:expr2vbits_Unop");
4967    }
4968 }
4969
4970
4971 /* Worker function -- do not call directly.  See comments on
4972    expr2vbits_Load for the meaning of |guard|.
4973
4974    Generates IR to (1) perform a definedness test of |addr|, (2)
4975    perform a validity test of |addr|, and (3) return the Vbits for the
4976    location indicated by |addr|.  All of this only happens when
4977    |guard| is NULL or |guard| evaluates to True at run time.
4978
4979    If |guard| evaluates to False at run time, the returned value is
4980    the IR-mandated 0x55..55 value, and no checks nor shadow loads are
4981    performed.
4982
4983    The definedness of |guard| itself is not checked.  That is assumed
4984    to have been done before this point, by the caller. */
4985 static
4986 IRAtom* expr2vbits_Load_WRK ( MCEnv* mce,
4987                               IREndness end, IRType ty,
4988                               IRAtom* addr, UInt bias, IRAtom* guard )
4989 {
4990    tl_assert(isOriginalAtom(mce,addr));
4991    tl_assert(end == Iend_LE || end == Iend_BE);
4992
4993    /* First, emit a definedness test for the address.  This also sets
4994       the address (shadow) to 'defined' following the test. */
4995    complainIfUndefined( mce, addr, guard );
4996
4997    /* Now cook up a call to the relevant helper function, to read the
4998       data V bits from shadow memory. */
4999    ty = shadowTypeV(ty);
5000
5001    void*        helper           = NULL;
5002    const HChar* hname            = NULL;
5003    Bool         ret_via_outparam = False;
5004
5005    if (end == Iend_LE) {
5006       switch (ty) {
5007          case Ity_V256: helper = &MC_(helperc_LOADV256le);
5008                         hname = "MC_(helperc_LOADV256le)";
5009                         ret_via_outparam = True;
5010                         break;
5011          case Ity_V128: helper = &MC_(helperc_LOADV128le);
5012                         hname = "MC_(helperc_LOADV128le)";
5013                         ret_via_outparam = True;
5014                         break;
5015          case Ity_I64:  helper = &MC_(helperc_LOADV64le);
5016                         hname = "MC_(helperc_LOADV64le)";
5017                         break;
5018          case Ity_I32:  helper = &MC_(helperc_LOADV32le);
5019                         hname = "MC_(helperc_LOADV32le)";
5020                         break;
5021          case Ity_I16:  helper = &MC_(helperc_LOADV16le);
5022                         hname = "MC_(helperc_LOADV16le)";
5023                         break;
5024          case Ity_I8:   helper = &MC_(helperc_LOADV8);
5025                         hname = "MC_(helperc_LOADV8)";
5026                         break;
5027          default:       ppIRType(ty);
5028                         VG_(tool_panic)("memcheck:expr2vbits_Load_WRK(LE)");
5029       }
5030    } else {
5031       switch (ty) {
5032          case Ity_V256: helper = &MC_(helperc_LOADV256be);
5033                         hname = "MC_(helperc_LOADV256be)";
5034                         ret_via_outparam = True;
5035                         break;
5036          case Ity_V128: helper = &MC_(helperc_LOADV128be);
5037                         hname = "MC_(helperc_LOADV128be)";
5038                         ret_via_outparam = True;
5039                         break;
5040          case Ity_I64:  helper = &MC_(helperc_LOADV64be);
5041                         hname = "MC_(helperc_LOADV64be)";
5042                         break;
5043          case Ity_I32:  helper = &MC_(helperc_LOADV32be);
5044                         hname = "MC_(helperc_LOADV32be)";
5045                         break;
5046          case Ity_I16:  helper = &MC_(helperc_LOADV16be);
5047                         hname = "MC_(helperc_LOADV16be)";
5048                         break;
5049          case Ity_I8:   helper = &MC_(helperc_LOADV8);
5050                         hname = "MC_(helperc_LOADV8)";
5051                         break;
5052          default:       ppIRType(ty);
5053                         VG_(tool_panic)("memcheck:expr2vbits_Load_WRK(BE)");
5054       }
5055    }
5056
5057    tl_assert(helper);
5058    tl_assert(hname);
5059
5060    /* Generate the actual address into addrAct. */
5061    IRAtom* addrAct;
5062    if (bias == 0) {
5063       addrAct = addr;
5064    } else {
5065       IROp    mkAdd;
5066       IRAtom* eBias;
5067       IRType  tyAddr  = mce->hWordTy;
5068       tl_assert( tyAddr == Ity_I32 || tyAddr == Ity_I64 );
5069       mkAdd   = tyAddr==Ity_I32 ? Iop_Add32 : Iop_Add64;
5070       eBias   = tyAddr==Ity_I32 ? mkU32(bias) : mkU64(bias);
5071       addrAct = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBias) );
5072    }
5073
5074    /* We need to have a place to park the V bits we're just about to
5075       read. */
5076    IRTemp datavbits = newTemp(mce, ty, VSh);
5077
5078    /* Here's the call. */
5079    IRDirty* di;
5080    if (ret_via_outparam) {
5081       di = unsafeIRDirty_1_N( datavbits,
5082                               2/*regparms*/,
5083                               hname, VG_(fnptr_to_fnentry)( helper ),
5084                               mkIRExprVec_2( IRExpr_VECRET(), addrAct ) );
5085    } else {
5086       di = unsafeIRDirty_1_N( datavbits,
5087                               1/*regparms*/,
5088                               hname, VG_(fnptr_to_fnentry)( helper ),
5089                               mkIRExprVec_1( addrAct ) );
5090    }
5091
5092    setHelperAnns( mce, di );
5093    if (guard) {
5094       di->guard = guard;
5095       /* Ideally the didn't-happen return value here would be all-ones
5096          (all-undefined), so it'd be obvious if it got used
5097          inadvertently.  We can get by with the IR-mandated default
5098          value (0b01 repeating, 0x55 etc) as that'll still look pretty
5099          undefined if it ever leaks out. */
5100    }
5101    stmt( 'V', mce, IRStmt_Dirty(di) );
5102
5103    return mkexpr(datavbits);
5104 }
5105
5106
5107 /* Generate IR to do a shadow load.  The helper is expected to check
5108    the validity of the address and return the V bits for that address.
5109    This can optionally be controlled by a guard, which is assumed to
5110    be True if NULL.  In the case where the guard is False at runtime,
5111    the helper will return the didn't-do-the-call value of 0x55..55.
5112    Since that means "completely undefined result", the caller of
5113    this function will need to fix up the result somehow in that
5114    case.
5115
5116    Caller of this function is also expected to have checked the
5117    definedness of |guard| before this point.
5118 */
5119 static
5120 IRAtom* expr2vbits_Load ( MCEnv* mce,
5121                           IREndness end, IRType ty,
5122                           IRAtom* addr, UInt bias,
5123                           IRAtom* guard )
5124 {
5125    tl_assert(end == Iend_LE || end == Iend_BE);
5126    switch (shadowTypeV(ty)) {
5127       case Ity_I8:
5128       case Ity_I16:
5129       case Ity_I32:
5130       case Ity_I64:
5131       case Ity_V128:
5132       case Ity_V256:
5133          return expr2vbits_Load_WRK(mce, end, ty, addr, bias, guard);
5134       default:
5135          VG_(tool_panic)("expr2vbits_Load");
5136    }
5137 }
5138
5139
5140 /* The most general handler for guarded loads.  Assumes the
5141    definedness of GUARD has already been checked by the caller.  A
5142    GUARD of NULL is assumed to mean "always True".  Generates code to
5143    check the definedness and validity of ADDR.
5144
5145    Generate IR to do a shadow load from ADDR and return the V bits.
5146    The loaded type is TY.  The loaded data is then (shadow) widened by
5147    using VWIDEN, which can be Iop_INVALID to denote a no-op.  If GUARD
5148    evaluates to False at run time then the returned Vbits are simply
5149    VALT instead.  Note therefore that the argument type of VWIDEN must
5150    be TY and the result type of VWIDEN must equal the type of VALT.
5151 */
5152 static
5153 IRAtom* expr2vbits_Load_guarded_General ( MCEnv* mce,
5154                                           IREndness end, IRType ty,
5155                                           IRAtom* addr, UInt bias,
5156                                           IRAtom* guard,
5157                                           IROp vwiden, IRAtom* valt )
5158 {
5159    /* Sanity check the conversion operation, and also set TYWIDE. */
5160    IRType tyWide = Ity_INVALID;
5161    switch (vwiden) {
5162       case Iop_INVALID:
5163          tyWide = ty;
5164          break;
5165       case Iop_16Uto32: case Iop_16Sto32: case Iop_8Uto32: case Iop_8Sto32:
5166          tyWide = Ity_I32;
5167          break;
5168       default:
5169          VG_(tool_panic)("memcheck:expr2vbits_Load_guarded_General");
5170    }
5171
5172    /* If the guard evaluates to True, this will hold the loaded V bits
5173       at TY.  If the guard evaluates to False, this will be all
5174       ones, meaning "all undefined", in which case we will have to
5175       replace it using an ITE below. */
5176    IRAtom* iftrue1
5177       = assignNew('V', mce, ty,
5178                   expr2vbits_Load(mce, end, ty, addr, bias, guard));
5179    /* Now (shadow-) widen the loaded V bits to the desired width.  In
5180       the guard-is-False case, the allowable widening operators will
5181       in the worst case (unsigned widening) at least leave the
5182       pre-widened part as being marked all-undefined, and in the best
5183       case (signed widening) mark the whole widened result as
5184       undefined.  Anyway, it doesn't matter really, since in this case
5185       we will replace said value with the default value |valt| using an
5186       ITE. */
5187    IRAtom* iftrue2
5188       = vwiden == Iop_INVALID
5189            ? iftrue1
5190            : assignNew('V', mce, tyWide, unop(vwiden, iftrue1));
5191    /* These are the V bits we will return if the load doesn't take
5192       place. */
5193    IRAtom* iffalse
5194       = valt;
5195    /* Prepare the cond for the ITE.  Convert a NULL cond into
5196       something that iropt knows how to fold out later. */
5197    IRAtom* cond
5198       = guard == NULL  ? mkU1(1)  : guard;
5199    /* And assemble the final result. */
5200    return assignNew('V', mce, tyWide, IRExpr_ITE(cond, iftrue2, iffalse));
5201 }
5202
5203
5204 /* A simpler handler for guarded loads, in which there is no
5205    conversion operation, and the default V bit return (when the guard
5206    evaluates to False at runtime) is "all defined".  If there is no
5207    guard expression or the guard is always TRUE this function behaves
5208    like expr2vbits_Load.  It is assumed that definedness of GUARD has
5209    already been checked at the call site. */
5210 static
5211 IRAtom* expr2vbits_Load_guarded_Simple ( MCEnv* mce,
5212                                          IREndness end, IRType ty,
5213                                          IRAtom* addr, UInt bias,
5214                                          IRAtom *guard )
5215 {
5216    return expr2vbits_Load_guarded_General(
5217              mce, end, ty, addr, bias, guard, Iop_INVALID, definedOfType(ty)
5218           );
5219 }
5220
5221
5222 static
5223 IRAtom* expr2vbits_ITE ( MCEnv* mce,
5224                          IRAtom* cond, IRAtom* iftrue, IRAtom* iffalse )
5225 {
5226    IRAtom *vbitsC, *vbits0, *vbits1;
5227    IRType ty;
5228    /* Given ITE(cond, iftrue,  iffalse),  generate
5229             ITE(cond, iftrue#, iffalse#) `UifU` PCast(cond#)
5230       That is, steer the V bits like the originals, but trash the
5231       result if the steering value is undefined.  This gives
5232       lazy propagation. */
5233    tl_assert(isOriginalAtom(mce, cond));
5234    tl_assert(isOriginalAtom(mce, iftrue));
5235    tl_assert(isOriginalAtom(mce, iffalse));
5236
5237    vbitsC = expr2vbits(mce, cond, HuOth); // could we use HuPCa here?
5238    vbits1 = expr2vbits(mce, iftrue, HuOth);
5239    vbits0 = expr2vbits(mce, iffalse, HuOth);
5240    ty = typeOfIRExpr(mce->sb->tyenv, vbits0);
5241
5242    return
5243       mkUifU(mce, ty, assignNew('V', mce, ty,
5244                                      IRExpr_ITE(cond, vbits1, vbits0)),
5245                       mkPCastTo(mce, ty, vbitsC) );
5246 }
5247
5248 /* --------- This is the main expression-handling function. --------- */
5249
5250 static
5251 IRExpr* expr2vbits ( MCEnv* mce, IRExpr* e,
5252                      HowUsed hu/*use HuOth if unknown*/ )
5253 {
5254    switch (e->tag) {
5255
5256       case Iex_Get:
5257          return shadow_GET( mce, e->Iex.Get.offset, e->Iex.Get.ty );
5258
5259       case Iex_GetI:
5260          return shadow_GETI( mce, e->Iex.GetI.descr,
5261                                   e->Iex.GetI.ix, e->Iex.GetI.bias );
5262
5263       case Iex_RdTmp:
5264          return IRExpr_RdTmp( findShadowTmpV(mce, e->Iex.RdTmp.tmp) );
5265
5266       case Iex_Const:
5267          return definedOfType(shadowTypeV(typeOfIRExpr(mce->sb->tyenv, e)));
5268
5269       case Iex_Qop:
5270          return expr2vbits_Qop(
5271                    mce,
5272                    e->Iex.Qop.details->op,
5273                    e->Iex.Qop.details->arg1, e->Iex.Qop.details->arg2,
5274                    e->Iex.Qop.details->arg3, e->Iex.Qop.details->arg4
5275                 );
5276
5277       case Iex_Triop:
5278          return expr2vbits_Triop(
5279                    mce,
5280                    e->Iex.Triop.details->op,
5281                    e->Iex.Triop.details->arg1, e->Iex.Triop.details->arg2,
5282                    e->Iex.Triop.details->arg3
5283                 );
5284
5285       case Iex_Binop:
5286          return expr2vbits_Binop(
5287                    mce,
5288                    e->Iex.Binop.op,
5289                    e->Iex.Binop.arg1, e->Iex.Binop.arg2,
5290                    hu
5291                 );
5292
5293       case Iex_Unop:
5294          return expr2vbits_Unop( mce, e->Iex.Unop.op, e->Iex.Unop.arg );
5295
5296       case Iex_Load:
5297          return expr2vbits_Load( mce, e->Iex.Load.end,
5298                                       e->Iex.Load.ty,
5299                                       e->Iex.Load.addr, 0/*addr bias*/,
5300                                       NULL/* guard == "always True"*/ );
5301
5302       case Iex_CCall:
5303          return mkLazyN( mce, e->Iex.CCall.args,
5304                               e->Iex.CCall.retty,
5305                               e->Iex.CCall.cee );
5306
5307       case Iex_ITE:
5308          return expr2vbits_ITE( mce, e->Iex.ITE.cond, e->Iex.ITE.iftrue,
5309                                      e->Iex.ITE.iffalse);
5310
5311       default:
5312          VG_(printf)("\n");
5313          ppIRExpr(e);
5314          VG_(printf)("\n");
5315          VG_(tool_panic)("memcheck: expr2vbits");
5316    }
5317 }
5318
5319
5320 /*------------------------------------------------------------*/
5321 /*--- Generate shadow stmts from all kinds of IRStmts.     ---*/
5322 /*------------------------------------------------------------*/
5323
5324 /* Widen a value to the host word size. */
5325
5326 static
5327 IRExpr* zwidenToHostWord ( MCEnv* mce, IRAtom* vatom )
5328 {
5329    IRType ty, tyH;
5330
5331    /* vatom is vbits-value and as such can only have a shadow type. */
5332    tl_assert(isShadowAtom(mce,vatom));
5333
5334    ty  = typeOfIRExpr(mce->sb->tyenv, vatom);
5335    tyH = mce->hWordTy;
5336
5337    if (tyH == Ity_I32) {
5338       switch (ty) {
5339          case Ity_I32:
5340             return vatom;
5341          case Ity_I16:
5342             return assignNew('V', mce, tyH, unop(Iop_16Uto32, vatom));
5343          case Ity_I8:
5344             return assignNew('V', mce, tyH, unop(Iop_8Uto32, vatom));
5345          default:
5346             goto unhandled;
5347       }
5348    } else
5349    if (tyH == Ity_I64) {
5350       switch (ty) {
5351          case Ity_I32:
5352             return assignNew('V', mce, tyH, unop(Iop_32Uto64, vatom));
5353          case Ity_I16:
5354             return assignNew('V', mce, tyH, unop(Iop_32Uto64,
5355                    assignNew('V', mce, Ity_I32, unop(Iop_16Uto32, vatom))));
5356          case Ity_I8:
5357             return assignNew('V', mce, tyH, unop(Iop_32Uto64,
5358                    assignNew('V', mce, Ity_I32, unop(Iop_8Uto32, vatom))));
5359          default:
5360             goto unhandled;
5361       }
5362    } else {
5363       goto unhandled;
5364    }
5365   unhandled:
5366    VG_(printf)("\nty = "); ppIRType(ty); VG_(printf)("\n");
5367    VG_(tool_panic)("zwidenToHostWord");
5368 }
5369
5370
5371 /* Generate a shadow store.  |addr| is always the original address
5372    atom.  You can pass in either originals or V-bits for the data
5373    atom, but obviously not both.  This function generates a check for
5374    the definedness and (indirectly) the validity of |addr|, but only
5375    when |guard| evaluates to True at run time (or is NULL).
5376
5377    |guard| :: Ity_I1 controls whether the store really happens; NULL
5378    means it unconditionally does.  Note that |guard| itself is not
5379    checked for definedness; the caller of this function must do that
5380    if necessary.
5381 */
5382 static
5383 void do_shadow_Store ( MCEnv* mce,
5384                        IREndness end,
5385                        IRAtom* addr, UInt bias,
5386                        IRAtom* data, IRAtom* vdata,
5387                        IRAtom* guard )
5388 {
5389    IROp     mkAdd;
5390    IRType   ty, tyAddr;
5391    void*    helper = NULL;
5392    const HChar* hname = NULL;
5393    IRConst* c;
5394
5395    tyAddr = mce->hWordTy;
5396    mkAdd  = tyAddr==Ity_I32 ? Iop_Add32 : Iop_Add64;
5397    tl_assert( tyAddr == Ity_I32 || tyAddr == Ity_I64 );
5398    tl_assert( end == Iend_LE || end == Iend_BE );
5399
5400    if (data) {
5401       tl_assert(!vdata);
5402       tl_assert(isOriginalAtom(mce, data));
5403       tl_assert(bias == 0);
5404       vdata = expr2vbits( mce, data, HuOth );
5405    } else {
5406       tl_assert(vdata);
5407    }
5408
5409    tl_assert(isOriginalAtom(mce,addr));
5410    tl_assert(isShadowAtom(mce,vdata));
5411
5412    if (guard) {
5413       tl_assert(isOriginalAtom(mce, guard));
5414       tl_assert(typeOfIRExpr(mce->sb->tyenv, guard) == Ity_I1);
5415    }
5416
5417    ty = typeOfIRExpr(mce->sb->tyenv, vdata);
5418
5419    // If we're not doing undefined value checking, pretend that this value
5420    // is "all valid".  That lets Vex's optimiser remove some of the V bit
5421    // shadow computation ops that precede it.
5422    if (MC_(clo_mc_level) == 1) {
5423       switch (ty) {
5424          case Ity_V256: // V256 weirdness -- used four times
5425                         c = IRConst_V256(V_BITS32_DEFINED); break;
5426          case Ity_V128: // V128 weirdness -- used twice
5427                         c = IRConst_V128(V_BITS16_DEFINED); break;
5428          case Ity_I64:  c = IRConst_U64 (V_BITS64_DEFINED); break;
5429          case Ity_I32:  c = IRConst_U32 (V_BITS32_DEFINED); break;
5430          case Ity_I16:  c = IRConst_U16 (V_BITS16_DEFINED); break;
5431          case Ity_I8:   c = IRConst_U8  (V_BITS8_DEFINED);  break;
5432          default:       VG_(tool_panic)("memcheck:do_shadow_Store(LE)");
5433       }
5434       vdata = IRExpr_Const( c );
5435    }
5436
5437    /* First, emit a definedness test for the address.  This also sets
5438       the address (shadow) to 'defined' following the test.  Both of
5439       those actions are gated on |guard|. */
5440    complainIfUndefined( mce, addr, guard );
5441
5442    /* Now decide which helper function to call to write the data V
5443       bits into shadow memory. */
5444    if (end == Iend_LE) {
5445       switch (ty) {
5446          case Ity_V256: /* we'll use the helper four times */
5447          case Ity_V128: /* we'll use the helper twice */
5448          case Ity_I64: helper = &MC_(helperc_STOREV64le);
5449                        hname = "MC_(helperc_STOREV64le)";
5450                        break;
5451          case Ity_I32: helper = &MC_(helperc_STOREV32le);
5452                        hname = "MC_(helperc_STOREV32le)";
5453                        break;
5454          case Ity_I16: helper = &MC_(helperc_STOREV16le);
5455                        hname = "MC_(helperc_STOREV16le)";
5456                        break;
5457          case Ity_I8:  helper = &MC_(helperc_STOREV8);
5458                        hname = "MC_(helperc_STOREV8)";
5459                        break;
5460          default:      VG_(tool_panic)("memcheck:do_shadow_Store(LE)");
5461       }
5462    } else {
5463       switch (ty) {
5464          case Ity_V128: /* we'll use the helper twice */
5465          case Ity_I64: helper = &MC_(helperc_STOREV64be);
5466                        hname = "MC_(helperc_STOREV64be)";
5467                        break;
5468          case Ity_I32: helper = &MC_(helperc_STOREV32be);
5469                        hname = "MC_(helperc_STOREV32be)";
5470                        break;
5471          case Ity_I16: helper = &MC_(helperc_STOREV16be);
5472                        hname = "MC_(helperc_STOREV16be)";
5473                        break;
5474          case Ity_I8:  helper = &MC_(helperc_STOREV8);
5475                        hname = "MC_(helperc_STOREV8)";
5476                        break;
5477          /* Note, no V256 case here, because no big-endian target that
5478             we support, has 256 vectors. */
5479          default:      VG_(tool_panic)("memcheck:do_shadow_Store(BE)");
5480       }
5481    }
5482
5483    if (UNLIKELY(ty == Ity_V256)) {
5484
5485       /* V256-bit case -- phrased in terms of 64 bit units (Qs), with
5486          Q3 being the most significant lane. */
5487       /* These are the offsets of the Qs in memory. */
5488       Int     offQ0, offQ1, offQ2, offQ3;
5489
5490       /* Various bits for constructing the 4 lane helper calls */
5491       IRDirty *diQ0,    *diQ1,    *diQ2,    *diQ3;
5492       IRAtom  *addrQ0,  *addrQ1,  *addrQ2,  *addrQ3;
5493       IRAtom  *vdataQ0, *vdataQ1, *vdataQ2, *vdataQ3;
5494       IRAtom  *eBiasQ0, *eBiasQ1, *eBiasQ2, *eBiasQ3;
5495
5496       if (end == Iend_LE) {
5497          offQ0 = 0; offQ1 = 8; offQ2 = 16; offQ3 = 24;
5498       } else {
5499          offQ3 = 0; offQ2 = 8; offQ1 = 16; offQ0 = 24;
5500       }
5501
5502       eBiasQ0 = tyAddr==Ity_I32 ? mkU32(bias+offQ0) : mkU64(bias+offQ0);
5503       addrQ0  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ0) );
5504       vdataQ0 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_0, vdata));
5505       diQ0    = unsafeIRDirty_0_N(
5506                    1/*regparms*/,
5507                    hname, VG_(fnptr_to_fnentry)( helper ),
5508                    mkIRExprVec_2( addrQ0, vdataQ0 )
5509                 );
5510
5511       eBiasQ1 = tyAddr==Ity_I32 ? mkU32(bias+offQ1) : mkU64(bias+offQ1);
5512       addrQ1  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ1) );
5513       vdataQ1 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_1, vdata));
5514       diQ1    = unsafeIRDirty_0_N(
5515                    1/*regparms*/,
5516                    hname, VG_(fnptr_to_fnentry)( helper ),
5517                    mkIRExprVec_2( addrQ1, vdataQ1 )
5518                 );
5519
5520       eBiasQ2 = tyAddr==Ity_I32 ? mkU32(bias+offQ2) : mkU64(bias+offQ2);
5521       addrQ2  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ2) );
5522       vdataQ2 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_2, vdata));
5523       diQ2    = unsafeIRDirty_0_N(
5524                    1/*regparms*/,
5525                    hname, VG_(fnptr_to_fnentry)( helper ),
5526                    mkIRExprVec_2( addrQ2, vdataQ2 )
5527                 );
5528
5529       eBiasQ3 = tyAddr==Ity_I32 ? mkU32(bias+offQ3) : mkU64(bias+offQ3);
5530       addrQ3  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ3) );
5531       vdataQ3 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_3, vdata));
5532       diQ3    = unsafeIRDirty_0_N(
5533                    1/*regparms*/,
5534                    hname, VG_(fnptr_to_fnentry)( helper ),
5535                    mkIRExprVec_2( addrQ3, vdataQ3 )
5536                 );
5537
5538       if (guard)
5539          diQ0->guard = diQ1->guard = diQ2->guard = diQ3->guard = guard;
5540
5541       setHelperAnns( mce, diQ0 );
5542       setHelperAnns( mce, diQ1 );
5543       setHelperAnns( mce, diQ2 );
5544       setHelperAnns( mce, diQ3 );
5545       stmt( 'V', mce, IRStmt_Dirty(diQ0) );
5546       stmt( 'V', mce, IRStmt_Dirty(diQ1) );
5547       stmt( 'V', mce, IRStmt_Dirty(diQ2) );
5548       stmt( 'V', mce, IRStmt_Dirty(diQ3) );
5549
5550    }
5551    else if (UNLIKELY(ty == Ity_V128)) {
5552
5553       /* V128-bit case */
5554       /* See comment in next clause re 64-bit regparms */
5555       /* also, need to be careful about endianness */
5556
5557       Int     offLo64, offHi64;
5558       IRDirty *diLo64, *diHi64;
5559       IRAtom  *addrLo64, *addrHi64;
5560       IRAtom  *vdataLo64, *vdataHi64;
5561       IRAtom  *eBiasLo64, *eBiasHi64;
5562
5563       if (end == Iend_LE) {
5564          offLo64 = 0;
5565          offHi64 = 8;
5566       } else {
5567          offLo64 = 8;
5568          offHi64 = 0;
5569       }
5570
5571       eBiasLo64 = tyAddr==Ity_I32 ? mkU32(bias+offLo64) : mkU64(bias+offLo64);
5572       addrLo64  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasLo64) );
5573       vdataLo64 = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vdata));
5574       diLo64    = unsafeIRDirty_0_N(
5575                      1/*regparms*/,
5576                      hname, VG_(fnptr_to_fnentry)( helper ),
5577                      mkIRExprVec_2( addrLo64, vdataLo64 )
5578                   );
5579       eBiasHi64 = tyAddr==Ity_I32 ? mkU32(bias+offHi64) : mkU64(bias+offHi64);
5580       addrHi64  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasHi64) );
5581       vdataHi64 = assignNew('V', mce, Ity_I64, unop(Iop_V128HIto64, vdata));
5582       diHi64    = unsafeIRDirty_0_N(
5583                      1/*regparms*/,
5584                      hname, VG_(fnptr_to_fnentry)( helper ),
5585                      mkIRExprVec_2( addrHi64, vdataHi64 )
5586                   );
5587       if (guard) diLo64->guard = guard;
5588       if (guard) diHi64->guard = guard;
5589       setHelperAnns( mce, diLo64 );
5590       setHelperAnns( mce, diHi64 );
5591       stmt( 'V', mce, IRStmt_Dirty(diLo64) );
5592       stmt( 'V', mce, IRStmt_Dirty(diHi64) );
5593
5594    } else {
5595
5596       IRDirty *di;
5597       IRAtom  *addrAct;
5598
5599       /* 8/16/32/64-bit cases */
5600       /* Generate the actual address into addrAct. */
5601       if (bias == 0) {
5602          addrAct = addr;
5603       } else {
5604          IRAtom* eBias   = tyAddr==Ity_I32 ? mkU32(bias) : mkU64(bias);
5605          addrAct = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBias));
5606       }
5607
5608       if (ty == Ity_I64) {
5609          /* We can't do this with regparm 2 on 32-bit platforms, since
5610             the back ends aren't clever enough to handle 64-bit
5611             regparm args.  Therefore be different. */
5612          di = unsafeIRDirty_0_N(
5613                  1/*regparms*/,
5614                  hname, VG_(fnptr_to_fnentry)( helper ),
5615                  mkIRExprVec_2( addrAct, vdata )
5616               );
5617       } else {
5618          di = unsafeIRDirty_0_N(
5619                  2/*regparms*/,
5620                  hname, VG_(fnptr_to_fnentry)( helper ),
5621                  mkIRExprVec_2( addrAct,
5622                                 zwidenToHostWord( mce, vdata ))
5623               );
5624       }
5625       if (guard) di->guard = guard;
5626       setHelperAnns( mce, di );
5627       stmt( 'V', mce, IRStmt_Dirty(di) );
5628    }
5629
5630 }
5631
5632
5633 /* Do lazy pessimistic propagation through a dirty helper call, by
5634    looking at the annotations on it.  This is the most complex part of
5635    Memcheck. */
5636
5637 static IRType szToITy ( Int n )
5638 {
5639    switch (n) {
5640       case 1: return Ity_I8;
5641       case 2: return Ity_I16;
5642       case 4: return Ity_I32;
5643       case 8: return Ity_I64;
5644       default: VG_(tool_panic)("szToITy(memcheck)");
5645    }
5646 }
5647
5648 static
5649 void do_shadow_Dirty ( MCEnv* mce, IRDirty* d )
5650 {
5651    Int       i, k, n, toDo, gSz, gOff;
5652    IRAtom    *src, *here, *curr;
5653    IRType    tySrc, tyDst;
5654    IRTemp    dst;
5655    IREndness end;
5656
5657    /* What's the native endianness?  We need to know this. */
5658 #  if defined(VG_BIGENDIAN)
5659    end = Iend_BE;
5660 #  elif defined(VG_LITTLEENDIAN)
5661    end = Iend_LE;
5662 #  else
5663 #    error "Unknown endianness"
5664 #  endif
5665
5666    /* First check the guard. */
5667    complainIfUndefined(mce, d->guard, NULL);
5668
5669    /* Now round up all inputs and PCast over them. */
5670    curr = definedOfType(Ity_I32);
5671
5672    /* Inputs: unmasked args
5673       Note: arguments are evaluated REGARDLESS of the guard expression */
5674    for (i = 0; d->args[i]; i++) {
5675       IRAtom* arg = d->args[i];
5676       if ( (d->cee->mcx_mask & (1<<i))
5677            || UNLIKELY(is_IRExpr_VECRET_or_GSPTR(arg)) ) {
5678          /* ignore this arg */
5679       } else {
5680          here = mkPCastTo( mce, Ity_I32, expr2vbits(mce, arg, HuOth) );
5681          curr = mkUifU32(mce, here, curr);
5682       }
5683    }
5684
5685    /* Inputs: guest state that we read. */
5686    for (i = 0; i < d->nFxState; i++) {
5687       tl_assert(d->fxState[i].fx != Ifx_None);
5688       if (d->fxState[i].fx == Ifx_Write)
5689          continue;
5690
5691       /* Enumerate the described state segments */
5692       for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
5693          gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
5694          gSz  = d->fxState[i].size;
5695
5696          /* Ignore any sections marked as 'always defined'. */
5697          if (isAlwaysDefd(mce, gOff, gSz)) {
5698             if (0)
5699             VG_(printf)("memcheck: Dirty gst: ignored off %d, sz %d\n",
5700                         gOff, gSz);
5701             continue;
5702          }
5703
5704          /* This state element is read or modified.  So we need to
5705             consider it.  If larger than 8 bytes, deal with it in
5706             8-byte chunks. */
5707          while (True) {
5708             tl_assert(gSz >= 0);
5709             if (gSz == 0) break;
5710             n = gSz <= 8 ? gSz : 8;
5711             /* update 'curr' with UifU of the state slice
5712                gOff .. gOff+n-1 */
5713             tySrc = szToITy( n );
5714
5715             /* Observe the guard expression. If it is false use an
5716                all-bits-defined bit pattern */
5717             IRAtom *cond, *iffalse, *iftrue;
5718
5719             cond    = assignNew('V', mce, Ity_I1, d->guard);
5720             iftrue  = assignNew('V', mce, tySrc, shadow_GET(mce, gOff, tySrc));
5721             iffalse = assignNew('V', mce, tySrc, definedOfType(tySrc));
5722             src     = assignNew('V', mce, tySrc,
5723                                 IRExpr_ITE(cond, iftrue, iffalse));
5724
5725             here = mkPCastTo( mce, Ity_I32, src );
5726             curr = mkUifU32(mce, here, curr);
5727             gSz -= n;
5728             gOff += n;
5729          }
5730       }
5731    }
5732
5733    /* Inputs: memory.  First set up some info needed regardless of
5734       whether we're doing reads or writes. */
5735
5736    if (d->mFx != Ifx_None) {
5737       /* Because we may do multiple shadow loads/stores from the same
5738          base address, it's best to do a single test of its
5739          definedness right now.  Post-instrumentation optimisation
5740          should remove all but this test. */
5741       IRType tyAddr;
5742       tl_assert(d->mAddr);
5743       complainIfUndefined(mce, d->mAddr, d->guard);
5744
5745       tyAddr = typeOfIRExpr(mce->sb->tyenv, d->mAddr);
5746       tl_assert(tyAddr == Ity_I32 || tyAddr == Ity_I64);
5747       tl_assert(tyAddr == mce->hWordTy); /* not really right */
5748    }
5749
5750    /* Deal with memory inputs (reads or modifies) */
5751    if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify) {
5752       toDo   = d->mSize;
5753       /* chew off 32-bit chunks.  We don't care about the endianness
5754          since it's all going to be condensed down to a single bit,
5755          but nevertheless choose an endianness which is hopefully
5756          native to the platform. */
5757       while (toDo >= 4) {
5758          here = mkPCastTo(
5759                    mce, Ity_I32,
5760                    expr2vbits_Load_guarded_Simple(
5761                       mce, end, Ity_I32, d->mAddr, d->mSize - toDo, d->guard )
5762                 );
5763          curr = mkUifU32(mce, here, curr);
5764          toDo -= 4;
5765       }
5766       /* chew off 16-bit chunks */
5767       while (toDo >= 2) {
5768          here = mkPCastTo(
5769                    mce, Ity_I32,
5770                    expr2vbits_Load_guarded_Simple(
5771                       mce, end, Ity_I16, d->mAddr, d->mSize - toDo, d->guard )
5772                 );
5773          curr = mkUifU32(mce, here, curr);
5774          toDo -= 2;
5775       }
5776       /* chew off the remaining 8-bit chunk, if any */
5777       if (toDo == 1) {
5778          here = mkPCastTo(
5779                    mce, Ity_I32,
5780                    expr2vbits_Load_guarded_Simple(
5781                       mce, end, Ity_I8, d->mAddr, d->mSize - toDo, d->guard )
5782                 );
5783          curr = mkUifU32(mce, here, curr);
5784          toDo -= 1;
5785       }
5786       tl_assert(toDo == 0);
5787    }
5788
5789    /* Whew!  So curr is a 32-bit V-value summarising pessimistically
5790       all the inputs to the helper.  Now we need to re-distribute the
5791       results to all destinations. */
5792
5793    /* Outputs: the destination temporary, if there is one. */
5794    if (d->tmp != IRTemp_INVALID) {
5795       dst   = findShadowTmpV(mce, d->tmp);
5796       tyDst = typeOfIRTemp(mce->sb->tyenv, d->tmp);
5797       assign( 'V', mce, dst, mkPCastTo( mce, tyDst, curr) );
5798    }
5799
5800    /* Outputs: guest state that we write or modify. */
5801    for (i = 0; i < d->nFxState; i++) {
5802       tl_assert(d->fxState[i].fx != Ifx_None);
5803       if (d->fxState[i].fx == Ifx_Read)
5804          continue;
5805
5806       /* Enumerate the described state segments */
5807       for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
5808          gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
5809          gSz  = d->fxState[i].size;
5810
5811          /* Ignore any sections marked as 'always defined'. */
5812          if (isAlwaysDefd(mce, gOff, gSz))
5813             continue;
5814
5815          /* This state element is written or modified.  So we need to
5816             consider it.  If larger than 8 bytes, deal with it in
5817             8-byte chunks. */
5818          while (True) {
5819             tl_assert(gSz >= 0);
5820             if (gSz == 0) break;
5821             n = gSz <= 8 ? gSz : 8;
5822             /* Write suitably-casted 'curr' to the state slice
5823                gOff .. gOff+n-1 */
5824             tyDst = szToITy( n );
5825             do_shadow_PUT( mce, gOff,
5826                                 NULL, /* original atom */
5827                                 mkPCastTo( mce, tyDst, curr ), d->guard );
5828             gSz -= n;
5829             gOff += n;
5830          }
5831       }
5832    }
5833
5834    /* Outputs: memory that we write or modify.  Same comments about
5835       endianness as above apply. */
5836    if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify) {
5837       toDo   = d->mSize;
5838       /* chew off 32-bit chunks */
5839       while (toDo >= 4) {
5840          do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
5841                           NULL, /* original data */
5842                           mkPCastTo( mce, Ity_I32, curr ),
5843                           d->guard );
5844          toDo -= 4;
5845       }
5846       /* chew off 16-bit chunks */
5847       while (toDo >= 2) {
5848          do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
5849                           NULL, /* original data */
5850                           mkPCastTo( mce, Ity_I16, curr ),
5851                           d->guard );
5852          toDo -= 2;
5853       }
5854       /* chew off the remaining 8-bit chunk, if any */
5855       if (toDo == 1) {
5856          do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
5857                           NULL, /* original data */
5858                           mkPCastTo( mce, Ity_I8, curr ),
5859                           d->guard );
5860          toDo -= 1;
5861       }
5862       tl_assert(toDo == 0);
5863    }
5864
5865 }
5866
5867
5868 /* We have an ABI hint telling us that [base .. base+len-1] is to
5869    become undefined ("writable").  Generate code to call a helper to
5870    notify the A/V bit machinery of this fact.
5871
5872    We call
5873    void MC_(helperc_MAKE_STACK_UNINIT) ( Addr base, UWord len,
5874                                                     Addr nia );
5875 */
5876 static
5877 void do_AbiHint ( MCEnv* mce, IRExpr* base, Int len, IRExpr* nia )
5878 {
5879    IRDirty* di;
5880
5881    if (MC_(clo_mc_level) == 3) {
5882       di = unsafeIRDirty_0_N(
5883               3/*regparms*/,
5884               "MC_(helperc_MAKE_STACK_UNINIT_w_o)",
5885               VG_(fnptr_to_fnentry)( &MC_(helperc_MAKE_STACK_UNINIT_w_o) ),
5886               mkIRExprVec_3( base, mkIRExpr_HWord( (UInt)len), nia )
5887            );
5888    } else {
5889       /* We ignore the supplied nia, since it is irrelevant. */
5890       tl_assert(MC_(clo_mc_level) == 2 || MC_(clo_mc_level) == 1);
5891       /* Special-case the len==128 case, since that is for amd64-ELF,
5892          which is a very common target. */
5893       if (len == 128) {
5894          di = unsafeIRDirty_0_N(
5895                  1/*regparms*/,
5896                  "MC_(helperc_MAKE_STACK_UNINIT_128_no_o)",
5897                  VG_(fnptr_to_fnentry)( &MC_(helperc_MAKE_STACK_UNINIT_128_no_o)),
5898                  mkIRExprVec_1( base )
5899               );
5900       } else {
5901          di = unsafeIRDirty_0_N(
5902                  2/*regparms*/,
5903                  "MC_(helperc_MAKE_STACK_UNINIT_no_o)",
5904                  VG_(fnptr_to_fnentry)( &MC_(helperc_MAKE_STACK_UNINIT_no_o) ),
5905                  mkIRExprVec_2( base, mkIRExpr_HWord( (UInt)len) )
5906               );
5907       }
5908    }
5909
5910    stmt( 'V', mce, IRStmt_Dirty(di) );
5911 }
5912
5913
5914 /* ------ Dealing with IRCAS (big and complex) ------ */
5915
5916 /* FWDS */
5917 static IRAtom* gen_load_b  ( MCEnv* mce, Int szB,
5918                              IRAtom* baseaddr, Int offset );
5919 static IRAtom* gen_maxU32  ( MCEnv* mce, IRAtom* b1, IRAtom* b2 );
5920 static void    gen_store_b ( MCEnv* mce, Int szB,
5921                              IRAtom* baseaddr, Int offset, IRAtom* dataB,
5922                              IRAtom* guard );
5923
5924 static void do_shadow_CAS_single ( MCEnv* mce, IRCAS* cas );
5925 static void do_shadow_CAS_double ( MCEnv* mce, IRCAS* cas );
5926
5927
5928 /* Either ORIG and SHADOW are both IRExpr.RdTmps, or they are both
5929    IRExpr.Consts, else this asserts.  If they are both Consts, it
5930    doesn't do anything.  So that just leaves the RdTmp case.
5931
5932    In which case: this assigns the shadow value SHADOW to the IR
5933    shadow temporary associated with ORIG.  That is, ORIG, being an
5934    original temporary, will have a shadow temporary associated with
5935    it.  However, in the case envisaged here, there will so far have
5936    been no IR emitted to actually write a shadow value into that
5937    temporary.  What this routine does is to (emit IR to) copy the
5938    value in SHADOW into said temporary, so that after this call,
5939    IRExpr.RdTmps of ORIG's shadow temp will correctly pick up the
5940    value in SHADOW.
5941
5942    Point is to allow callers to compute "by hand" a shadow value for
5943    ORIG, and force it to be associated with ORIG.
5944
5945    How do we know that that shadow associated with ORIG has not so far
5946    been assigned to?  Well, we don't per se know that, but supposing
5947    it had.  Then this routine would create a second assignment to it,
5948    and later the IR sanity checker would barf.  But that never
5949    happens.  QED.
5950 */
5951 static void bind_shadow_tmp_to_orig ( UChar how,
5952                                       MCEnv* mce,
5953                                       IRAtom* orig, IRAtom* shadow )
5954 {
5955    tl_assert(isOriginalAtom(mce, orig));
5956    tl_assert(isShadowAtom(mce, shadow));
5957    switch (orig->tag) {
5958       case Iex_Const:
5959          tl_assert(shadow->tag == Iex_Const);
5960          break;
5961       case Iex_RdTmp:
5962          tl_assert(shadow->tag == Iex_RdTmp);
5963          if (how == 'V') {
5964             assign('V', mce, findShadowTmpV(mce,orig->Iex.RdTmp.tmp),
5965                    shadow);
5966          } else {
5967             tl_assert(how == 'B');
5968             assign('B', mce, findShadowTmpB(mce,orig->Iex.RdTmp.tmp),
5969                    shadow);
5970          }
5971          break;
5972       default:
5973          tl_assert(0);
5974    }
5975 }
5976
5977
5978 static
5979 void do_shadow_CAS ( MCEnv* mce, IRCAS* cas )
5980 {
5981    /* Scheme is (both single- and double- cases):
5982
5983       1. fetch data#,dataB (the proposed new value)
5984
5985       2. fetch expd#,expdB (what we expect to see at the address)
5986
5987       3. check definedness of address
5988
5989       4. load old#,oldB from shadow memory; this also checks
5990          addressibility of the address
5991
5992       5. the CAS itself
5993
5994       6. compute "expected == old".  See COMMENT_ON_CasCmpEQ below.
5995
5996       7. if "expected == old" (as computed by (6))
5997             store data#,dataB to shadow memory
5998
5999       Note that 5 reads 'old' but 4 reads 'old#'.  Similarly, 5 stores
6000       'data' but 7 stores 'data#'.  Hence it is possible for the
6001       shadow data to be incorrectly checked and/or updated:
6002
6003       * 7 is at least gated correctly, since the 'expected == old'
6004         condition is derived from outputs of 5.  However, the shadow
6005         write could happen too late: imagine after 5 we are
6006         descheduled, a different thread runs, writes a different
6007         (shadow) value at the address, and then we resume, hence
6008         overwriting the shadow value written by the other thread.
6009
6010       Because the original memory access is atomic, there's no way to
6011       make both the original and shadow accesses into a single atomic
6012       thing, hence this is unavoidable.
6013
6014       At least as Valgrind stands, I don't think it's a problem, since
6015       we're single threaded *and* we guarantee that there are no
6016       context switches during the execution of any specific superblock
6017       -- context switches can only happen at superblock boundaries.
6018
6019       If Valgrind ever becomes MT in the future, then it might be more
6020       of a problem.  A possible kludge would be to artificially
6021       associate with the location, a lock, which we must acquire and
6022       release around the transaction as a whole.  Hmm, that probably
6023       would't work properly since it only guards us against other
6024       threads doing CASs on the same location, not against other
6025       threads doing normal reads and writes.
6026
6027       ------------------------------------------------------------
6028
6029       COMMENT_ON_CasCmpEQ:
6030
6031       Note two things.  Firstly, in the sequence above, we compute
6032       "expected == old", but we don't check definedness of it.  Why
6033       not?  Also, the x86 and amd64 front ends use
6034       Iop_CasCmp{EQ,NE}{8,16,32,64} comparisons to make the equivalent
6035       determination (expected == old ?) for themselves, and we also
6036       don't check definedness for those primops; we just say that the
6037       result is defined.  Why?  Details follow.
6038
6039       x86/amd64 contains various forms of locked insns:
6040       * lock prefix before all basic arithmetic insn;
6041         eg lock xorl %reg1,(%reg2)
6042       * atomic exchange reg-mem
6043       * compare-and-swaps
6044
6045       Rather than attempt to represent them all, which would be a
6046       royal PITA, I used a result from Maurice Herlihy
6047       (http://en.wikipedia.org/wiki/Maurice_Herlihy), in which he
6048       demonstrates that compare-and-swap is a primitive more general
6049       than the other two, and so can be used to represent all of them.
6050       So the translation scheme for (eg) lock incl (%reg) is as
6051       follows:
6052
6053         again:
6054          old = * %reg
6055          new = old + 1
6056          atomically { if (* %reg == old) { * %reg = new } else { goto again } }
6057
6058       The "atomically" is the CAS bit.  The scheme is always the same:
6059       get old value from memory, compute new value, atomically stuff
6060       new value back in memory iff the old value has not changed (iow,
6061       no other thread modified it in the meantime).  If it has changed
6062       then we've been out-raced and we have to start over.
6063
6064       Now that's all very neat, but it has the bad side effect of
6065       introducing an explicit equality test into the translation.
6066       Consider the behaviour of said code on a memory location which
6067       is uninitialised.  We will wind up doing a comparison on
6068       uninitialised data, and mc duly complains.
6069
6070       What's difficult about this is, the common case is that the
6071       location is uncontended, and so we're usually comparing the same
6072       value (* %reg) with itself.  So we shouldn't complain even if it
6073       is undefined.  But mc doesn't know that.
6074
6075       My solution is to mark the == in the IR specially, so as to tell
6076       mc that it almost certainly compares a value with itself, and we
6077       should just regard the result as always defined.  Rather than
6078       add a bit to all IROps, I just cloned Iop_CmpEQ{8,16,32,64} into
6079       Iop_CasCmpEQ{8,16,32,64} so as not to disturb anything else.
6080
6081       So there's always the question of, can this give a false
6082       negative?  eg, imagine that initially, * %reg is defined; and we
6083       read that; but then in the gap between the read and the CAS, a
6084       different thread writes an undefined (and different) value at
6085       the location.  Then the CAS in this thread will fail and we will
6086       go back to "again:", but without knowing that the trip back
6087       there was based on an undefined comparison.  No matter; at least
6088       the other thread won the race and the location is correctly
6089       marked as undefined.  What if it wrote an uninitialised version
6090       of the same value that was there originally, though?
6091
6092       etc etc.  Seems like there's a small corner case in which we
6093       might lose the fact that something's defined -- we're out-raced
6094       in between the "old = * reg" and the "atomically {", _and_ the
6095       other thread is writing in an undefined version of what's
6096       already there.  Well, that seems pretty unlikely.
6097
6098       ---
6099
6100       If we ever need to reinstate it .. code which generates a
6101       definedness test for "expected == old" was removed at r10432 of
6102       this file.
6103    */
6104    if (cas->oldHi == IRTemp_INVALID) {
6105       do_shadow_CAS_single( mce, cas );
6106    } else {
6107       do_shadow_CAS_double( mce, cas );
6108    }
6109 }
6110
6111
6112 static void do_shadow_CAS_single ( MCEnv* mce, IRCAS* cas )
6113 {
6114    IRAtom *vdataLo = NULL, *bdataLo = NULL;
6115    IRAtom *vexpdLo = NULL, *bexpdLo = NULL;
6116    IRAtom *voldLo  = NULL, *boldLo  = NULL;
6117    IRAtom *expd_eq_old = NULL;
6118    IROp   opCasCmpEQ;
6119    Int    elemSzB;
6120    IRType elemTy;
6121    Bool   otrak = MC_(clo_mc_level) >= 3; /* a shorthand */
6122
6123    /* single CAS */
6124    tl_assert(cas->oldHi == IRTemp_INVALID);
6125    tl_assert(cas->expdHi == NULL);
6126    tl_assert(cas->dataHi == NULL);
6127
6128    elemTy = typeOfIRExpr(mce->sb->tyenv, cas->expdLo);
6129    switch (elemTy) {
6130       case Ity_I8:  elemSzB = 1; opCasCmpEQ = Iop_CasCmpEQ8;  break;
6131       case Ity_I16: elemSzB = 2; opCasCmpEQ = Iop_CasCmpEQ16; break;
6132       case Ity_I32: elemSzB = 4; opCasCmpEQ = Iop_CasCmpEQ32; break;
6133       case Ity_I64: elemSzB = 8; opCasCmpEQ = Iop_CasCmpEQ64; break;
6134       default: tl_assert(0); /* IR defn disallows any other types */
6135    }
6136
6137    /* 1. fetch data# (the proposed new value) */
6138    tl_assert(isOriginalAtom(mce, cas->dataLo));
6139    vdataLo
6140       = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataLo, HuOth));
6141    tl_assert(isShadowAtom(mce, vdataLo));
6142    if (otrak) {
6143       bdataLo
6144          = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataLo));
6145       tl_assert(isShadowAtom(mce, bdataLo));
6146    }
6147
6148    /* 2. fetch expected# (what we expect to see at the address) */
6149    tl_assert(isOriginalAtom(mce, cas->expdLo));
6150    vexpdLo
6151       = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdLo, HuOth));
6152    tl_assert(isShadowAtom(mce, vexpdLo));
6153    if (otrak) {
6154       bexpdLo
6155          = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdLo));
6156       tl_assert(isShadowAtom(mce, bexpdLo));
6157    }
6158
6159    /* 3. check definedness of address */
6160    /* 4. fetch old# from shadow memory; this also checks
6161          addressibility of the address */
6162    voldLo
6163       = assignNew(
6164            'V', mce, elemTy,
6165            expr2vbits_Load(
6166               mce,
6167               cas->end, elemTy, cas->addr, 0/*Addr bias*/,
6168               NULL/*always happens*/
6169         ));
6170    bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldLo), voldLo);
6171    if (otrak) {
6172       boldLo
6173          = assignNew('B', mce, Ity_I32,
6174                      gen_load_b(mce, elemSzB, cas->addr, 0/*addr bias*/));
6175       bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldLo), boldLo);
6176    }
6177
6178    /* 5. the CAS itself */
6179    stmt( 'C', mce, IRStmt_CAS(cas) );
6180
6181    /* 6. compute "expected == old" */
6182    /* See COMMENT_ON_CasCmpEQ in this file background/rationale. */
6183    /* Note that 'C' is kinda faking it; it is indeed a non-shadow
6184       tree, but it's not copied from the input block. */
6185    expd_eq_old
6186       = assignNew('C', mce, Ity_I1,
6187                   binop(opCasCmpEQ, cas->expdLo, mkexpr(cas->oldLo)));
6188
6189    /* 7. if "expected == old"
6190             store data# to shadow memory */
6191    do_shadow_Store( mce, cas->end, cas->addr, 0/*bias*/,
6192                     NULL/*data*/, vdataLo/*vdata*/,
6193                     expd_eq_old/*guard for store*/ );
6194    if (otrak) {
6195       gen_store_b( mce, elemSzB, cas->addr, 0/*offset*/,
6196                    bdataLo/*bdata*/,
6197                    expd_eq_old/*guard for store*/ );
6198    }
6199 }
6200
6201
6202 static void do_shadow_CAS_double ( MCEnv* mce, IRCAS* cas )
6203 {
6204    IRAtom *vdataHi = NULL, *bdataHi = NULL;
6205    IRAtom *vdataLo = NULL, *bdataLo = NULL;
6206    IRAtom *vexpdHi = NULL, *bexpdHi = NULL;
6207    IRAtom *vexpdLo = NULL, *bexpdLo = NULL;
6208    IRAtom *voldHi  = NULL, *boldHi  = NULL;
6209    IRAtom *voldLo  = NULL, *boldLo  = NULL;
6210    IRAtom *xHi = NULL, *xLo = NULL, *xHL = NULL;
6211    IRAtom *expd_eq_old = NULL, *zero = NULL;
6212    IROp   opCasCmpEQ, opOr, opXor;
6213    Int    elemSzB, memOffsLo, memOffsHi;
6214    IRType elemTy;
6215    Bool   otrak = MC_(clo_mc_level) >= 3; /* a shorthand */
6216
6217    /* double CAS */
6218    tl_assert(cas->oldHi != IRTemp_INVALID);
6219    tl_assert(cas->expdHi != NULL);
6220    tl_assert(cas->dataHi != NULL);
6221
6222    elemTy = typeOfIRExpr(mce->sb->tyenv, cas->expdLo);
6223    switch (elemTy) {
6224       case Ity_I8:
6225          opCasCmpEQ = Iop_CasCmpEQ8; opOr = Iop_Or8; opXor = Iop_Xor8;
6226          elemSzB = 1; zero = mkU8(0);
6227          break;
6228       case Ity_I16:
6229          opCasCmpEQ = Iop_CasCmpEQ16; opOr = Iop_Or16; opXor = Iop_Xor16;
6230          elemSzB = 2; zero = mkU16(0);
6231          break;
6232       case Ity_I32:
6233          opCasCmpEQ = Iop_CasCmpEQ32; opOr = Iop_Or32; opXor = Iop_Xor32;
6234          elemSzB = 4; zero = mkU32(0);
6235          break;
6236       case Ity_I64:
6237          opCasCmpEQ = Iop_CasCmpEQ64; opOr = Iop_Or64; opXor = Iop_Xor64;
6238          elemSzB = 8; zero = mkU64(0);
6239          break;
6240       default:
6241          tl_assert(0); /* IR defn disallows any other types */
6242    }
6243
6244    /* 1. fetch data# (the proposed new value) */
6245    tl_assert(isOriginalAtom(mce, cas->dataHi));
6246    tl_assert(isOriginalAtom(mce, cas->dataLo));
6247    vdataHi
6248       = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataHi, HuOth));
6249    vdataLo
6250       = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataLo, HuOth));
6251    tl_assert(isShadowAtom(mce, vdataHi));
6252    tl_assert(isShadowAtom(mce, vdataLo));
6253    if (otrak) {
6254       bdataHi
6255          = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataHi));
6256       bdataLo
6257          = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataLo));
6258       tl_assert(isShadowAtom(mce, bdataHi));
6259       tl_assert(isShadowAtom(mce, bdataLo));
6260    }
6261
6262    /* 2. fetch expected# (what we expect to see at the address) */
6263    tl_assert(isOriginalAtom(mce, cas->expdHi));
6264    tl_assert(isOriginalAtom(mce, cas->expdLo));
6265    vexpdHi
6266       = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdHi, HuOth));
6267    vexpdLo
6268       = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdLo, HuOth));
6269    tl_assert(isShadowAtom(mce, vexpdHi));
6270    tl_assert(isShadowAtom(mce, vexpdLo));
6271    if (otrak) {
6272       bexpdHi
6273          = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdHi));
6274       bexpdLo
6275          = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdLo));
6276       tl_assert(isShadowAtom(mce, bexpdHi));
6277       tl_assert(isShadowAtom(mce, bexpdLo));
6278    }
6279
6280    /* 3. check definedness of address */
6281    /* 4. fetch old# from shadow memory; this also checks
6282          addressibility of the address */
6283    if (cas->end == Iend_LE) {
6284       memOffsLo = 0;
6285       memOffsHi = elemSzB;
6286    } else {
6287       tl_assert(cas->end == Iend_BE);
6288       memOffsLo = elemSzB;
6289       memOffsHi = 0;
6290    }
6291    voldHi
6292       = assignNew(
6293            'V', mce, elemTy,
6294            expr2vbits_Load(
6295               mce,
6296               cas->end, elemTy, cas->addr, memOffsHi/*Addr bias*/,
6297               NULL/*always happens*/
6298         ));
6299    voldLo
6300       = assignNew(
6301            'V', mce, elemTy,
6302            expr2vbits_Load(
6303               mce,
6304               cas->end, elemTy, cas->addr, memOffsLo/*Addr bias*/,
6305               NULL/*always happens*/
6306         ));
6307    bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldHi), voldHi);
6308    bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldLo), voldLo);
6309    if (otrak) {
6310       boldHi
6311          = assignNew('B', mce, Ity_I32,
6312                      gen_load_b(mce, elemSzB, cas->addr,
6313                                 memOffsHi/*addr bias*/));
6314       boldLo
6315          = assignNew('B', mce, Ity_I32,
6316                      gen_load_b(mce, elemSzB, cas->addr,
6317                                 memOffsLo/*addr bias*/));
6318       bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldHi), boldHi);
6319       bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldLo), boldLo);
6320    }
6321
6322    /* 5. the CAS itself */
6323    stmt( 'C', mce, IRStmt_CAS(cas) );
6324
6325    /* 6. compute "expected == old" */
6326    /* See COMMENT_ON_CasCmpEQ in this file background/rationale. */
6327    /* Note that 'C' is kinda faking it; it is indeed a non-shadow
6328       tree, but it's not copied from the input block. */
6329    /*
6330       xHi = oldHi ^ expdHi;
6331       xLo = oldLo ^ expdLo;
6332       xHL = xHi | xLo;
6333       expd_eq_old = xHL == 0;
6334    */
6335    xHi = assignNew('C', mce, elemTy,
6336                    binop(opXor, cas->expdHi, mkexpr(cas->oldHi)));
6337    xLo = assignNew('C', mce, elemTy,
6338                    binop(opXor, cas->expdLo, mkexpr(cas->oldLo)));
6339    xHL = assignNew('C', mce, elemTy,
6340                    binop(opOr, xHi, xLo));
6341    expd_eq_old
6342       = assignNew('C', mce, Ity_I1,
6343                   binop(opCasCmpEQ, xHL, zero));
6344
6345    /* 7. if "expected == old"
6346             store data# to shadow memory */
6347    do_shadow_Store( mce, cas->end, cas->addr, memOffsHi/*bias*/,
6348                     NULL/*data*/, vdataHi/*vdata*/,
6349                     expd_eq_old/*guard for store*/ );
6350    do_shadow_Store( mce, cas->end, cas->addr, memOffsLo/*bias*/,
6351                     NULL/*data*/, vdataLo/*vdata*/,
6352                     expd_eq_old/*guard for store*/ );
6353    if (otrak) {
6354       gen_store_b( mce, elemSzB, cas->addr, memOffsHi/*offset*/,
6355                    bdataHi/*bdata*/,
6356                    expd_eq_old/*guard for store*/ );
6357       gen_store_b( mce, elemSzB, cas->addr, memOffsLo/*offset*/,
6358                    bdataLo/*bdata*/,
6359                    expd_eq_old/*guard for store*/ );
6360    }
6361 }
6362
6363
6364 /* ------ Dealing with LL/SC (not difficult) ------ */
6365
6366 static void do_shadow_LLSC ( MCEnv*    mce,
6367                              IREndness stEnd,
6368                              IRTemp    stResult,
6369                              IRExpr*   stAddr,
6370                              IRExpr*   stStoredata )
6371 {
6372    /* In short: treat a load-linked like a normal load followed by an
6373       assignment of the loaded (shadow) data to the result temporary.
6374       Treat a store-conditional like a normal store, and mark the
6375       result temporary as defined. */
6376    IRType resTy  = typeOfIRTemp(mce->sb->tyenv, stResult);
6377    IRTemp resTmp = findShadowTmpV(mce, stResult);
6378
6379    tl_assert(isIRAtom(stAddr));
6380    if (stStoredata)
6381       tl_assert(isIRAtom(stStoredata));
6382
6383    if (stStoredata == NULL) {
6384       /* Load Linked */
6385       /* Just treat this as a normal load, followed by an assignment of
6386          the value to .result. */
6387       /* Stay sane */
6388       tl_assert(resTy == Ity_I64 || resTy == Ity_I32
6389                 || resTy == Ity_I16 || resTy == Ity_I8);
6390       assign( 'V', mce, resTmp,
6391                    expr2vbits_Load(
6392                       mce, stEnd, resTy, stAddr, 0/*addr bias*/,
6393                       NULL/*always happens*/) );
6394    } else {
6395       /* Store Conditional */
6396       /* Stay sane */
6397       IRType dataTy = typeOfIRExpr(mce->sb->tyenv,
6398                                    stStoredata);
6399       tl_assert(dataTy == Ity_I64 || dataTy == Ity_I32
6400                 || dataTy == Ity_I16 || dataTy == Ity_I8);
6401       do_shadow_Store( mce, stEnd,
6402                             stAddr, 0/* addr bias */,
6403                             stStoredata,
6404                             NULL /* shadow data */,
6405                             NULL/*guard*/ );
6406       /* This is a store conditional, so it writes to .result a value
6407          indicating whether or not the store succeeded.  Just claim
6408          this value is always defined.  In the PowerPC interpretation
6409          of store-conditional, definedness of the success indication
6410          depends on whether the address of the store matches the
6411          reservation address.  But we can't tell that here (and
6412          anyway, we're not being PowerPC-specific).  At least we are
6413          guaranteed that the definedness of the store address, and its
6414          addressibility, will be checked as per normal.  So it seems
6415          pretty safe to just say that the success indication is always
6416          defined.
6417
6418          In schemeS, for origin tracking, we must correspondingly set
6419          a no-origin value for the origin shadow of .result.
6420       */
6421       tl_assert(resTy == Ity_I1);
6422       assign( 'V', mce, resTmp, definedOfType(resTy) );
6423    }
6424 }
6425
6426
6427 /* ---- Dealing with LoadG/StoreG (not entirely simple) ---- */
6428
6429 static void do_shadow_StoreG ( MCEnv* mce, IRStoreG* sg )
6430 {
6431    complainIfUndefined(mce, sg->guard, NULL);
6432    /* do_shadow_Store will generate code to check the definedness and
6433       validity of sg->addr, in the case where sg->guard evaluates to
6434       True at run-time. */
6435    do_shadow_Store( mce, sg->end,
6436                     sg->addr, 0/* addr bias */,
6437                     sg->data,
6438                     NULL /* shadow data */,
6439                     sg->guard );
6440 }
6441
6442 static void do_shadow_LoadG ( MCEnv* mce, IRLoadG* lg )
6443 {
6444    complainIfUndefined(mce, lg->guard, NULL);
6445    /* expr2vbits_Load_guarded_General will generate code to check the
6446       definedness and validity of lg->addr, in the case where
6447       lg->guard evaluates to True at run-time. */
6448
6449    /* Look at the LoadG's built-in conversion operation, to determine
6450       the source (actual loaded data) type, and the equivalent IROp.
6451       NOTE that implicitly we are taking a widening operation to be
6452       applied to original atoms and producing one that applies to V
6453       bits.  Since signed and unsigned widening are self-shadowing,
6454       this is a straight copy of the op (modulo swapping from the
6455       IRLoadGOp form to the IROp form).  Note also therefore that this
6456       implicitly duplicates the logic to do with said widening ops in
6457       expr2vbits_Unop.  See comment at the start of expr2vbits_Unop. */
6458    IROp   vwiden   = Iop_INVALID;
6459    IRType loadedTy = Ity_INVALID;
6460    switch (lg->cvt) {
6461       case ILGop_IdentV128: loadedTy = Ity_V128; vwiden = Iop_INVALID; break;
6462       case ILGop_Ident64:   loadedTy = Ity_I64;  vwiden = Iop_INVALID; break;
6463       case ILGop_Ident32:   loadedTy = Ity_I32;  vwiden = Iop_INVALID; break;
6464       case ILGop_16Uto32:   loadedTy = Ity_I16;  vwiden = Iop_16Uto32; break;
6465       case ILGop_16Sto32:   loadedTy = Ity_I16;  vwiden = Iop_16Sto32; break;
6466       case ILGop_8Uto32:    loadedTy = Ity_I8;   vwiden = Iop_8Uto32;  break;
6467       case ILGop_8Sto32:    loadedTy = Ity_I8;   vwiden = Iop_8Sto32;  break;
6468       default: VG_(tool_panic)("do_shadow_LoadG");
6469    }
6470
6471    IRAtom* vbits_alt
6472       = expr2vbits( mce, lg->alt, HuOth );
6473    IRAtom* vbits_final
6474       = expr2vbits_Load_guarded_General(mce, lg->end, loadedTy,
6475                                         lg->addr, 0/*addr bias*/,
6476                                         lg->guard, vwiden, vbits_alt );
6477    /* And finally, bind the V bits to the destination temporary. */
6478    assign( 'V', mce, findShadowTmpV(mce, lg->dst), vbits_final );
6479 }
6480
6481
6482 /*------------------------------------------------------------*/
6483 /*--- Origin tracking stuff                                ---*/
6484 /*------------------------------------------------------------*/
6485
6486 /* Almost identical to findShadowTmpV. */
6487 static IRTemp findShadowTmpB ( MCEnv* mce, IRTemp orig )
6488 {
6489    TempMapEnt* ent;
6490    /* VG_(indexXA) range-checks 'orig', hence no need to check
6491       here. */
6492    ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
6493    tl_assert(ent->kind == Orig);
6494    if (ent->shadowB == IRTemp_INVALID) {
6495       IRTemp tmpB
6496         = newTemp( mce, Ity_I32, BSh );
6497       /* newTemp may cause mce->tmpMap to resize, hence previous results
6498          from VG_(indexXA) are invalid. */
6499       ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
6500       tl_assert(ent->kind == Orig);
6501       tl_assert(ent->shadowB == IRTemp_INVALID);
6502       ent->shadowB = tmpB;
6503    }
6504    return ent->shadowB;
6505 }
6506
6507 static IRAtom* gen_maxU32 ( MCEnv* mce, IRAtom* b1, IRAtom* b2 )
6508 {
6509    return assignNew( 'B', mce, Ity_I32, binop(Iop_Max32U, b1, b2) );
6510 }
6511
6512
6513 /* Make a guarded origin load, with no special handling in the
6514    didn't-happen case.  A GUARD of NULL is assumed to mean "always
6515    True".
6516
6517    Generate IR to do a shadow origins load from BASEADDR+OFFSET and
6518    return the otag.  The loaded size is SZB.  If GUARD evaluates to
6519    False at run time then the returned otag is zero.
6520 */
6521 static IRAtom* gen_guarded_load_b ( MCEnv* mce, Int szB,
6522                                     IRAtom* baseaddr,
6523                                     Int offset, IRExpr* guard )
6524 {
6525    void*    hFun;
6526    const HChar* hName;
6527    IRTemp   bTmp;
6528    IRDirty* di;
6529    IRType   aTy   = typeOfIRExpr( mce->sb->tyenv, baseaddr );
6530    IROp     opAdd = aTy == Ity_I32 ? Iop_Add32 : Iop_Add64;
6531    IRAtom*  ea    = baseaddr;
6532    if (offset != 0) {
6533       IRAtom* off = aTy == Ity_I32 ? mkU32( offset )
6534                                    : mkU64( (Long)(Int)offset );
6535       ea = assignNew( 'B', mce, aTy, binop(opAdd, ea, off));
6536    }
6537    bTmp = newTemp(mce, mce->hWordTy, BSh);
6538
6539    switch (szB) {
6540       case 1: hFun  = (void*)&MC_(helperc_b_load1);
6541               hName = "MC_(helperc_b_load1)";
6542               break;
6543       case 2: hFun  = (void*)&MC_(helperc_b_load2);
6544               hName = "MC_(helperc_b_load2)";
6545               break;
6546       case 4: hFun  = (void*)&MC_(helperc_b_load4);
6547               hName = "MC_(helperc_b_load4)";
6548               break;
6549       case 8: hFun  = (void*)&MC_(helperc_b_load8);
6550               hName = "MC_(helperc_b_load8)";
6551               break;
6552       case 16: hFun  = (void*)&MC_(helperc_b_load16);
6553                hName = "MC_(helperc_b_load16)";
6554                break;
6555       case 32: hFun  = (void*)&MC_(helperc_b_load32);
6556                hName = "MC_(helperc_b_load32)";
6557                break;
6558       default:
6559          VG_(printf)("mc_translate.c: gen_load_b: unhandled szB == %d\n", szB);
6560          tl_assert(0);
6561    }
6562    di = unsafeIRDirty_1_N(
6563            bTmp, 1/*regparms*/, hName, VG_(fnptr_to_fnentry)( hFun ),
6564            mkIRExprVec_1( ea )
6565         );
6566    if (guard) {
6567       di->guard = guard;
6568       /* Ideally the didn't-happen return value here would be
6569          all-zeroes (unknown-origin), so it'd be harmless if it got
6570          used inadvertently.  We slum it out with the IR-mandated
6571          default value (0b01 repeating, 0x55 etc) as that'll probably
6572          trump all legitimate otags via Max32, and it's pretty
6573          obviously bogus. */
6574    }
6575    /* no need to mess with any annotations.  This call accesses
6576       neither guest state nor guest memory. */
6577    stmt( 'B', mce, IRStmt_Dirty(di) );
6578    if (mce->hWordTy == Ity_I64) {
6579       /* 64-bit host */
6580       IRTemp bTmp32 = newTemp(mce, Ity_I32, BSh);
6581       assign( 'B', mce, bTmp32, unop(Iop_64to32, mkexpr(bTmp)) );
6582       return mkexpr(bTmp32);
6583    } else {
6584       /* 32-bit host */
6585       return mkexpr(bTmp);
6586    }
6587 }
6588
6589
6590 /* Generate IR to do a shadow origins load from BASEADDR+OFFSET.  The
6591    loaded size is SZB.  The load is regarded as unconditional (always
6592    happens).
6593 */
6594 static IRAtom* gen_load_b ( MCEnv* mce, Int szB, IRAtom* baseaddr,
6595                             Int offset )
6596 {
6597    return gen_guarded_load_b(mce, szB, baseaddr, offset, NULL/*guard*/);
6598 }
6599
6600
6601 /* The most general handler for guarded origin loads.  A GUARD of NULL
6602    is assumed to mean "always True".
6603
6604    Generate IR to do a shadow origin load from ADDR+BIAS and return
6605    the B bits.  The loaded type is TY.  If GUARD evaluates to False at
6606    run time then the returned B bits are simply BALT instead.
6607 */
6608 static
6609 IRAtom* expr2ori_Load_guarded_General ( MCEnv* mce,
6610                                         IRType ty,
6611                                         IRAtom* addr, UInt bias,
6612                                         IRAtom* guard, IRAtom* balt )
6613 {
6614    /* If the guard evaluates to True, this will hold the loaded
6615       origin.  If the guard evaluates to False, this will be zero,
6616       meaning "unknown origin", in which case we will have to replace
6617       it using an ITE below. */
6618    IRAtom* iftrue
6619       = assignNew('B', mce, Ity_I32,
6620                   gen_guarded_load_b(mce, sizeofIRType(ty),
6621                                      addr, bias, guard));
6622    /* These are the bits we will return if the load doesn't take
6623       place. */
6624    IRAtom* iffalse
6625       = balt;
6626    /* Prepare the cond for the ITE.  Convert a NULL cond into
6627       something that iropt knows how to fold out later. */
6628    IRAtom* cond
6629       = guard == NULL  ? mkU1(1)  : guard;
6630    /* And assemble the final result. */
6631    return assignNew('B', mce, Ity_I32, IRExpr_ITE(cond, iftrue, iffalse));
6632 }
6633
6634
6635 /* Generate a shadow origins store.  guard :: Ity_I1 controls whether
6636    the store really happens; NULL means it unconditionally does. */
6637 static void gen_store_b ( MCEnv* mce, Int szB,
6638                           IRAtom* baseaddr, Int offset, IRAtom* dataB,
6639                           IRAtom* guard )
6640 {
6641    void*    hFun;
6642    const HChar* hName;
6643    IRDirty* di;
6644    IRType   aTy   = typeOfIRExpr( mce->sb->tyenv, baseaddr );
6645    IROp     opAdd = aTy == Ity_I32 ? Iop_Add32 : Iop_Add64;
6646    IRAtom*  ea    = baseaddr;
6647    if (guard) {
6648       tl_assert(isOriginalAtom(mce, guard));
6649       tl_assert(typeOfIRExpr(mce->sb->tyenv, guard) == Ity_I1);
6650    }
6651    if (offset != 0) {
6652       IRAtom* off = aTy == Ity_I32 ? mkU32( offset )
6653                                    : mkU64( (Long)(Int)offset );
6654       ea = assignNew(  'B', mce, aTy, binop(opAdd, ea, off));
6655    }
6656    if (mce->hWordTy == Ity_I64)
6657       dataB = assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, dataB));
6658
6659    switch (szB) {
6660       case 1: hFun  = (void*)&MC_(helperc_b_store1);
6661               hName = "MC_(helperc_b_store1)";
6662               break;
6663       case 2: hFun  = (void*)&MC_(helperc_b_store2);
6664               hName = "MC_(helperc_b_store2)";
6665               break;
6666       case 4: hFun  = (void*)&MC_(helperc_b_store4);
6667               hName = "MC_(helperc_b_store4)";
6668               break;
6669       case 8: hFun  = (void*)&MC_(helperc_b_store8);
6670               hName = "MC_(helperc_b_store8)";
6671               break;
6672       case 16: hFun  = (void*)&MC_(helperc_b_store16);
6673                hName = "MC_(helperc_b_store16)";
6674                break;
6675       case 32: hFun  = (void*)&MC_(helperc_b_store32);
6676                hName = "MC_(helperc_b_store32)";
6677                break;
6678       default:
6679          tl_assert(0);
6680    }
6681    di = unsafeIRDirty_0_N( 2/*regparms*/,
6682            hName, VG_(fnptr_to_fnentry)( hFun ),
6683            mkIRExprVec_2( ea, dataB )
6684         );
6685    /* no need to mess with any annotations.  This call accesses
6686       neither guest state nor guest memory. */
6687    if (guard) di->guard = guard;
6688    stmt( 'B', mce, IRStmt_Dirty(di) );
6689 }
6690
6691 static IRAtom* narrowTo32 ( MCEnv* mce, IRAtom* e ) {
6692    IRType eTy = typeOfIRExpr(mce->sb->tyenv, e);
6693    if (eTy == Ity_I64)
6694       return assignNew( 'B', mce, Ity_I32, unop(Iop_64to32, e) );
6695    if (eTy == Ity_I32)
6696       return e;
6697    tl_assert(0);
6698 }
6699
6700 static IRAtom* zWidenFrom32 ( MCEnv* mce, IRType dstTy, IRAtom* e ) {
6701    IRType eTy = typeOfIRExpr(mce->sb->tyenv, e);
6702    tl_assert(eTy == Ity_I32);
6703    if (dstTy == Ity_I64)
6704       return assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, e) );
6705    tl_assert(0);
6706 }
6707
6708
6709 static IRAtom* schemeE ( MCEnv* mce, IRExpr* e )
6710 {
6711    tl_assert(MC_(clo_mc_level) == 3);
6712
6713    switch (e->tag) {
6714
6715       case Iex_GetI: {
6716          IRRegArray* descr_b;
6717          IRAtom      *t1, *t2, *t3, *t4;
6718          IRRegArray* descr      = e->Iex.GetI.descr;
6719          IRType equivIntTy
6720             = MC_(get_otrack_reg_array_equiv_int_type)(descr);
6721          /* If this array is unshadowable for whatever reason, use the
6722             usual approximation. */
6723          if (equivIntTy == Ity_INVALID)
6724             return mkU32(0);
6725          tl_assert(sizeofIRType(equivIntTy) >= 4);
6726          tl_assert(sizeofIRType(equivIntTy) == sizeofIRType(descr->elemTy));
6727          descr_b = mkIRRegArray( descr->base + 2*mce->layout->total_sizeB,
6728                                  equivIntTy, descr->nElems );
6729          /* Do a shadow indexed get of the same size, giving t1.  Take
6730             the bottom 32 bits of it, giving t2.  Compute into t3 the
6731             origin for the index (almost certainly zero, but there's
6732             no harm in being completely general here, since iropt will
6733             remove any useless code), and fold it in, giving a final
6734             value t4. */
6735          t1 = assignNew( 'B', mce, equivIntTy,
6736                           IRExpr_GetI( descr_b, e->Iex.GetI.ix,
6737                                                 e->Iex.GetI.bias ));
6738          t2 = narrowTo32( mce, t1 );
6739          t3 = schemeE( mce, e->Iex.GetI.ix );
6740          t4 = gen_maxU32( mce, t2, t3 );
6741          return t4;
6742       }
6743       case Iex_CCall: {
6744          Int i;
6745          IRAtom*  here;
6746          IRExpr** args = e->Iex.CCall.args;
6747          IRAtom*  curr = mkU32(0);
6748          for (i = 0; args[i]; i++) {
6749             tl_assert(i < 32);
6750             tl_assert(isOriginalAtom(mce, args[i]));
6751             /* Only take notice of this arg if the callee's
6752                mc-exclusion mask does not say it is to be excluded. */
6753             if (e->Iex.CCall.cee->mcx_mask & (1<<i)) {
6754                /* the arg is to be excluded from definedness checking.
6755                   Do nothing. */
6756                if (0) VG_(printf)("excluding %s(%d)\n",
6757                                   e->Iex.CCall.cee->name, i);
6758             } else {
6759                /* calculate the arg's definedness, and pessimistically
6760                   merge it in. */
6761                here = schemeE( mce, args[i] );
6762                curr = gen_maxU32( mce, curr, here );
6763             }
6764          }
6765          return curr;
6766       }
6767       case Iex_Load: {
6768          Int dszB;
6769          dszB = sizeofIRType(e->Iex.Load.ty);
6770          /* assert that the B value for the address is already
6771             available (somewhere) */
6772          tl_assert(isIRAtom(e->Iex.Load.addr));
6773          tl_assert(mce->hWordTy == Ity_I32 || mce->hWordTy == Ity_I64);
6774          return gen_load_b( mce, dszB, e->Iex.Load.addr, 0 );
6775       }
6776       case Iex_ITE: {
6777          IRAtom* b1 = schemeE( mce, e->Iex.ITE.cond );
6778          IRAtom* b3 = schemeE( mce, e->Iex.ITE.iftrue );
6779          IRAtom* b2 = schemeE( mce, e->Iex.ITE.iffalse );
6780          return gen_maxU32( mce, b1, gen_maxU32( mce, b2, b3 ));
6781       }
6782       case Iex_Qop: {
6783          IRAtom* b1 = schemeE( mce, e->Iex.Qop.details->arg1 );
6784          IRAtom* b2 = schemeE( mce, e->Iex.Qop.details->arg2 );
6785          IRAtom* b3 = schemeE( mce, e->Iex.Qop.details->arg3 );
6786          IRAtom* b4 = schemeE( mce, e->Iex.Qop.details->arg4 );
6787          return gen_maxU32( mce, gen_maxU32( mce, b1, b2 ),
6788                                  gen_maxU32( mce, b3, b4 ) );
6789       }
6790       case Iex_Triop: {
6791          IRAtom* b1 = schemeE( mce, e->Iex.Triop.details->arg1 );
6792          IRAtom* b2 = schemeE( mce, e->Iex.Triop.details->arg2 );
6793          IRAtom* b3 = schemeE( mce, e->Iex.Triop.details->arg3 );
6794          return gen_maxU32( mce, b1, gen_maxU32( mce, b2, b3 ) );
6795       }
6796       case Iex_Binop: {
6797          switch (e->Iex.Binop.op) {
6798             case Iop_CasCmpEQ8:  case Iop_CasCmpNE8:
6799             case Iop_CasCmpEQ16: case Iop_CasCmpNE16:
6800             case Iop_CasCmpEQ32: case Iop_CasCmpNE32:
6801             case Iop_CasCmpEQ64: case Iop_CasCmpNE64:
6802                /* Just say these all produce a defined result,
6803                   regardless of their arguments.  See
6804                   COMMENT_ON_CasCmpEQ in this file. */
6805                return mkU32(0);
6806             default: {
6807                IRAtom* b1 = schemeE( mce, e->Iex.Binop.arg1 );
6808                IRAtom* b2 = schemeE( mce, e->Iex.Binop.arg2 );
6809                return gen_maxU32( mce, b1, b2 );
6810             }
6811          }
6812          tl_assert(0);
6813          /*NOTREACHED*/
6814       }
6815       case Iex_Unop: {
6816          IRAtom* b1 = schemeE( mce, e->Iex.Unop.arg );
6817          return b1;
6818       }
6819       case Iex_Const:
6820          return mkU32(0);
6821       case Iex_RdTmp:
6822          return mkexpr( findShadowTmpB( mce, e->Iex.RdTmp.tmp ));
6823       case Iex_Get: {
6824          Int b_offset = MC_(get_otrack_shadow_offset)(
6825                            e->Iex.Get.offset,
6826                            sizeofIRType(e->Iex.Get.ty)
6827                         );
6828          tl_assert(b_offset >= -1
6829                    && b_offset <= mce->layout->total_sizeB -4);
6830          if (b_offset >= 0) {
6831             /* FIXME: this isn't an atom! */
6832             return IRExpr_Get( b_offset + 2*mce->layout->total_sizeB,
6833                                Ity_I32 );
6834          }
6835          return mkU32(0);
6836       }
6837       default:
6838          VG_(printf)("mc_translate.c: schemeE: unhandled: ");
6839          ppIRExpr(e);
6840          VG_(tool_panic)("memcheck:schemeE");
6841    }
6842 }
6843
6844
6845 static void do_origins_Dirty ( MCEnv* mce, IRDirty* d )
6846 {
6847    // This is a hacked version of do_shadow_Dirty
6848    Int       i, k, n, toDo, gSz, gOff;
6849    IRAtom    *here, *curr;
6850    IRTemp    dst;
6851
6852    /* First check the guard. */
6853    curr = schemeE( mce, d->guard );
6854
6855    /* Now round up all inputs and maxU32 over them. */
6856
6857    /* Inputs: unmasked args
6858       Note: arguments are evaluated REGARDLESS of the guard expression */
6859    for (i = 0; d->args[i]; i++) {
6860       IRAtom* arg = d->args[i];
6861       if ( (d->cee->mcx_mask & (1<<i))
6862            || UNLIKELY(is_IRExpr_VECRET_or_GSPTR(arg)) ) {
6863          /* ignore this arg */
6864       } else {
6865          here = schemeE( mce, arg );
6866          curr = gen_maxU32( mce, curr, here );
6867       }
6868    }
6869
6870    /* Inputs: guest state that we read. */
6871    for (i = 0; i < d->nFxState; i++) {
6872       tl_assert(d->fxState[i].fx != Ifx_None);
6873       if (d->fxState[i].fx == Ifx_Write)
6874          continue;
6875
6876       /* Enumerate the described state segments */
6877       for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
6878          gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
6879          gSz  = d->fxState[i].size;
6880
6881          /* Ignore any sections marked as 'always defined'. */
6882          if (isAlwaysDefd(mce, gOff, gSz)) {
6883             if (0)
6884             VG_(printf)("memcheck: Dirty gst: ignored off %d, sz %d\n",
6885                         gOff, gSz);
6886             continue;
6887          }
6888
6889          /* This state element is read or modified.  So we need to
6890             consider it.  If larger than 4 bytes, deal with it in
6891             4-byte chunks. */
6892          while (True) {
6893             Int b_offset;
6894             tl_assert(gSz >= 0);
6895             if (gSz == 0) break;
6896             n = gSz <= 4 ? gSz : 4;
6897             /* update 'curr' with maxU32 of the state slice
6898                gOff .. gOff+n-1 */
6899             b_offset = MC_(get_otrack_shadow_offset)(gOff, 4);
6900             if (b_offset != -1) {
6901                /* Observe the guard expression. If it is false use 0, i.e.
6902                   nothing is known about the origin */
6903                IRAtom *cond, *iffalse, *iftrue;
6904
6905                cond = assignNew( 'B', mce, Ity_I1, d->guard);
6906                iffalse = mkU32(0);
6907                iftrue  = assignNew( 'B', mce, Ity_I32,
6908                                     IRExpr_Get(b_offset
6909                                                  + 2*mce->layout->total_sizeB,
6910                                                Ity_I32));
6911                here = assignNew( 'B', mce, Ity_I32,
6912                                  IRExpr_ITE(cond, iftrue, iffalse));
6913                curr = gen_maxU32( mce, curr, here );
6914             }
6915             gSz -= n;
6916             gOff += n;
6917          }
6918       }
6919    }
6920
6921    /* Inputs: memory */
6922
6923    if (d->mFx != Ifx_None) {
6924       /* Because we may do multiple shadow loads/stores from the same
6925          base address, it's best to do a single test of its
6926          definedness right now.  Post-instrumentation optimisation
6927          should remove all but this test. */
6928       tl_assert(d->mAddr);
6929       here = schemeE( mce, d->mAddr );
6930       curr = gen_maxU32( mce, curr, here );
6931    }
6932
6933    /* Deal with memory inputs (reads or modifies) */
6934    if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify) {
6935       toDo   = d->mSize;
6936       /* chew off 32-bit chunks.  We don't care about the endianness
6937          since it's all going to be condensed down to a single bit,
6938          but nevertheless choose an endianness which is hopefully
6939          native to the platform. */
6940       while (toDo >= 4) {
6941          here = gen_guarded_load_b( mce, 4, d->mAddr, d->mSize - toDo,
6942                                     d->guard );
6943          curr = gen_maxU32( mce, curr, here );
6944          toDo -= 4;
6945       }
6946       /* handle possible 16-bit excess */
6947       while (toDo >= 2) {
6948          here = gen_guarded_load_b( mce, 2, d->mAddr, d->mSize - toDo,
6949                                     d->guard );
6950          curr = gen_maxU32( mce, curr, here );
6951          toDo -= 2;
6952       }
6953       /* chew off the remaining 8-bit chunk, if any */
6954       if (toDo == 1) {
6955          here = gen_guarded_load_b( mce, 1, d->mAddr, d->mSize - toDo,
6956                                     d->guard );
6957          curr = gen_maxU32( mce, curr, here );
6958          toDo -= 1;
6959       }
6960       tl_assert(toDo == 0);
6961    }
6962
6963    /* Whew!  So curr is a 32-bit B-value which should give an origin
6964       of some use if any of the inputs to the helper are undefined.
6965       Now we need to re-distribute the results to all destinations. */
6966
6967    /* Outputs: the destination temporary, if there is one. */
6968    if (d->tmp != IRTemp_INVALID) {
6969       dst   = findShadowTmpB(mce, d->tmp);
6970       assign( 'V', mce, dst, curr );
6971    }
6972
6973    /* Outputs: guest state that we write or modify. */
6974    for (i = 0; i < d->nFxState; i++) {
6975       tl_assert(d->fxState[i].fx != Ifx_None);
6976       if (d->fxState[i].fx == Ifx_Read)
6977          continue;
6978
6979       /* Enumerate the described state segments */
6980       for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
6981          gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
6982          gSz  = d->fxState[i].size;
6983
6984          /* Ignore any sections marked as 'always defined'. */
6985          if (isAlwaysDefd(mce, gOff, gSz))
6986             continue;
6987
6988          /* This state element is written or modified.  So we need to
6989             consider it.  If larger than 4 bytes, deal with it in
6990             4-byte chunks. */
6991          while (True) {
6992             Int b_offset;
6993             tl_assert(gSz >= 0);
6994             if (gSz == 0) break;
6995             n = gSz <= 4 ? gSz : 4;
6996             /* Write 'curr' to the state slice gOff .. gOff+n-1 */
6997             b_offset = MC_(get_otrack_shadow_offset)(gOff, 4);
6998             if (b_offset != -1) {
6999
7000                /* If the guard expression evaluates to false we simply Put
7001                   the value that is already stored in the guest state slot */
7002                IRAtom *cond, *iffalse;
7003
7004                cond    = assignNew('B', mce, Ity_I1,
7005                                    d->guard);
7006                iffalse = assignNew('B', mce, Ity_I32,
7007                                    IRExpr_Get(b_offset +
7008                                               2*mce->layout->total_sizeB,
7009                                               Ity_I32));
7010                curr = assignNew('V', mce, Ity_I32,
7011                                 IRExpr_ITE(cond, curr, iffalse));
7012
7013                stmt( 'B', mce, IRStmt_Put(b_offset
7014                                           + 2*mce->layout->total_sizeB,
7015                                           curr ));
7016             }
7017             gSz -= n;
7018             gOff += n;
7019          }
7020       }
7021    }
7022
7023    /* Outputs: memory that we write or modify.  Same comments about
7024       endianness as above apply. */
7025    if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify) {
7026       toDo   = d->mSize;
7027       /* chew off 32-bit chunks */
7028       while (toDo >= 4) {
7029          gen_store_b( mce, 4, d->mAddr, d->mSize - toDo, curr,
7030                       d->guard );
7031          toDo -= 4;
7032       }
7033       /* handle possible 16-bit excess */
7034       while (toDo >= 2) {
7035          gen_store_b( mce, 2, d->mAddr, d->mSize - toDo, curr,
7036                       d->guard );
7037          toDo -= 2;
7038       }
7039       /* chew off the remaining 8-bit chunk, if any */
7040       if (toDo == 1) {
7041          gen_store_b( mce, 1, d->mAddr, d->mSize - toDo, curr,
7042                       d->guard );
7043          toDo -= 1;
7044       }
7045       tl_assert(toDo == 0);
7046    }
7047 }
7048
7049
7050 /* Generate IR for origin shadowing for a general guarded store. */
7051 static void do_origins_Store_guarded ( MCEnv* mce,
7052                                        IREndness stEnd,
7053                                        IRExpr* stAddr,
7054                                        IRExpr* stData,
7055                                        IRExpr* guard )
7056 {
7057    Int     dszB;
7058    IRAtom* dataB;
7059    /* assert that the B value for the address is already available
7060       (somewhere), since the call to schemeE will want to see it.
7061       XXXX how does this actually ensure that?? */
7062    tl_assert(isIRAtom(stAddr));
7063    tl_assert(isIRAtom(stData));
7064    dszB  = sizeofIRType( typeOfIRExpr(mce->sb->tyenv, stData ) );
7065    dataB = schemeE( mce, stData );
7066    gen_store_b( mce, dszB, stAddr, 0/*offset*/, dataB, guard );
7067 }
7068
7069
7070 /* Generate IR for origin shadowing for a plain store. */
7071 static void do_origins_Store_plain ( MCEnv* mce,
7072                                      IREndness stEnd,
7073                                      IRExpr* stAddr,
7074                                      IRExpr* stData )
7075 {
7076    do_origins_Store_guarded ( mce, stEnd, stAddr, stData,
7077                               NULL/*guard*/ );
7078 }
7079
7080
7081 /* ---- Dealing with LoadG/StoreG (not entirely simple) ---- */
7082
7083 static void do_origins_StoreG ( MCEnv* mce, IRStoreG* sg )
7084 {
7085    do_origins_Store_guarded( mce, sg->end, sg->addr,
7086                              sg->data, sg->guard );
7087 }
7088
7089 static void do_origins_LoadG ( MCEnv* mce, IRLoadG* lg )
7090 {
7091    IRType loadedTy = Ity_INVALID;
7092    switch (lg->cvt) {
7093       case ILGop_IdentV128: loadedTy = Ity_V128; break;
7094       case ILGop_Ident64:   loadedTy = Ity_I64;  break;
7095       case ILGop_Ident32:   loadedTy = Ity_I32;  break;
7096       case ILGop_16Uto32:   loadedTy = Ity_I16;  break;
7097       case ILGop_16Sto32:   loadedTy = Ity_I16;  break;
7098       case ILGop_8Uto32:    loadedTy = Ity_I8;   break;
7099       case ILGop_8Sto32:    loadedTy = Ity_I8;   break;
7100       default: VG_(tool_panic)("schemeS.IRLoadG");
7101    }
7102    IRAtom* ori_alt
7103       = schemeE( mce,lg->alt );
7104    IRAtom* ori_final
7105       = expr2ori_Load_guarded_General(mce, loadedTy,
7106                                       lg->addr, 0/*addr bias*/,
7107                                       lg->guard, ori_alt );
7108    /* And finally, bind the origin to the destination temporary. */
7109    assign( 'B', mce, findShadowTmpB(mce, lg->dst), ori_final );
7110 }
7111
7112
7113 static void schemeS ( MCEnv* mce, IRStmt* st )
7114 {
7115    tl_assert(MC_(clo_mc_level) == 3);
7116
7117    switch (st->tag) {
7118
7119       case Ist_AbiHint:
7120          /* The value-check instrumenter handles this - by arranging
7121             to pass the address of the next instruction to
7122             MC_(helperc_MAKE_STACK_UNINIT).  This is all that needs to
7123             happen for origin tracking w.r.t. AbiHints.  So there is
7124             nothing to do here. */
7125          break;
7126
7127       case Ist_PutI: {
7128          IRPutI *puti = st->Ist.PutI.details;
7129          IRRegArray* descr_b;
7130          IRAtom      *t1, *t2, *t3, *t4;
7131          IRRegArray* descr = puti->descr;
7132          IRType equivIntTy
7133             = MC_(get_otrack_reg_array_equiv_int_type)(descr);
7134          /* If this array is unshadowable for whatever reason,
7135             generate no code. */
7136          if (equivIntTy == Ity_INVALID)
7137             break;
7138          tl_assert(sizeofIRType(equivIntTy) >= 4);
7139          tl_assert(sizeofIRType(equivIntTy) == sizeofIRType(descr->elemTy));
7140          descr_b
7141             = mkIRRegArray( descr->base + 2*mce->layout->total_sizeB,
7142                             equivIntTy, descr->nElems );
7143          /* Compute a value to Put - the conjoinment of the origin for
7144             the data to be Put-ted (obviously) and of the index value
7145             (not so obviously). */
7146          t1 = schemeE( mce, puti->data );
7147          t2 = schemeE( mce, puti->ix );
7148          t3 = gen_maxU32( mce, t1, t2 );
7149          t4 = zWidenFrom32( mce, equivIntTy, t3 );
7150          stmt( 'B', mce, IRStmt_PutI( mkIRPutI(descr_b, puti->ix,
7151                                                puti->bias, t4) ));
7152          break;
7153       }
7154
7155       case Ist_Dirty:
7156          do_origins_Dirty( mce, st->Ist.Dirty.details );
7157          break;
7158
7159       case Ist_Store:
7160          do_origins_Store_plain( mce, st->Ist.Store.end,
7161                                       st->Ist.Store.addr,
7162                                       st->Ist.Store.data );
7163          break;
7164
7165       case Ist_StoreG:
7166          do_origins_StoreG( mce, st->Ist.StoreG.details );
7167          break;
7168
7169       case Ist_LoadG:
7170          do_origins_LoadG( mce, st->Ist.LoadG.details );
7171          break;
7172
7173       case Ist_LLSC: {
7174          /* In short: treat a load-linked like a normal load followed
7175             by an assignment of the loaded (shadow) data the result
7176             temporary.  Treat a store-conditional like a normal store,
7177             and mark the result temporary as defined. */
7178          if (st->Ist.LLSC.storedata == NULL) {
7179             /* Load Linked */
7180             IRType resTy
7181                = typeOfIRTemp(mce->sb->tyenv, st->Ist.LLSC.result);
7182             IRExpr* vanillaLoad
7183                = IRExpr_Load(st->Ist.LLSC.end, resTy, st->Ist.LLSC.addr);
7184             tl_assert(resTy == Ity_I64 || resTy == Ity_I32
7185                       || resTy == Ity_I16 || resTy == Ity_I8);
7186             assign( 'B', mce, findShadowTmpB(mce, st->Ist.LLSC.result),
7187                               schemeE(mce, vanillaLoad));
7188          } else {
7189             /* Store conditional */
7190             do_origins_Store_plain( mce, st->Ist.LLSC.end,
7191                                     st->Ist.LLSC.addr,
7192                                     st->Ist.LLSC.storedata );
7193             /* For the rationale behind this, see comments at the
7194                place where the V-shadow for .result is constructed, in
7195                do_shadow_LLSC.  In short, we regard .result as
7196                always-defined. */
7197             assign( 'B', mce, findShadowTmpB(mce, st->Ist.LLSC.result),
7198                               mkU32(0) );
7199          }
7200          break;
7201       }
7202
7203       case Ist_Put: {
7204          Int b_offset
7205             = MC_(get_otrack_shadow_offset)(
7206                  st->Ist.Put.offset,
7207                  sizeofIRType(typeOfIRExpr(mce->sb->tyenv, st->Ist.Put.data))
7208               );
7209          if (b_offset >= 0) {
7210             /* FIXME: this isn't an atom! */
7211             stmt( 'B', mce, IRStmt_Put(b_offset + 2*mce->layout->total_sizeB,
7212                                        schemeE( mce, st->Ist.Put.data )) );
7213          }
7214          break;
7215       }
7216
7217       case Ist_WrTmp:
7218          assign( 'B', mce, findShadowTmpB(mce, st->Ist.WrTmp.tmp),
7219                            schemeE(mce, st->Ist.WrTmp.data) );
7220          break;
7221
7222       case Ist_MBE:
7223       case Ist_NoOp:
7224       case Ist_Exit:
7225       case Ist_IMark:
7226          break;
7227
7228       default:
7229          VG_(printf)("mc_translate.c: schemeS: unhandled: ");
7230          ppIRStmt(st);
7231          VG_(tool_panic)("memcheck:schemeS");
7232    }
7233 }
7234
7235
7236 /*------------------------------------------------------------*/
7237 /*--- Post-tree-build final tidying                        ---*/
7238 /*------------------------------------------------------------*/
7239
7240 /* This exploits the observation that Memcheck often produces
7241    repeated conditional calls of the form
7242
7243    Dirty G MC_(helperc_value_check0/1/4/8_fail)(UInt otag)
7244
7245    with the same guard expression G guarding the same helper call.
7246    The second and subsequent calls are redundant.  This usually
7247    results from instrumentation of guest code containing multiple
7248    memory references at different constant offsets from the same base
7249    register.  After optimisation of the instrumentation, you get a
7250    test for the definedness of the base register for each memory
7251    reference, which is kinda pointless.  MC_(final_tidy) therefore
7252    looks for such repeated calls and removes all but the first. */
7253
7254
7255 /* With some testing on perf/bz2.c, on amd64 and x86, compiled with
7256    gcc-5.3.1 -O2, it appears that 16 entries in the array are enough to
7257    get almost all the benefits of this transformation whilst causing
7258    the slide-back case to just often enough to be verifiably
7259    correct.  For posterity, the numbers are:
7260
7261    bz2-32
7262
7263    1   4,336 (112,212 -> 1,709,473; ratio 15.2)
7264    2   4,336 (112,194 -> 1,669,895; ratio 14.9)
7265    3   4,336 (112,194 -> 1,660,713; ratio 14.8)
7266    4   4,336 (112,194 -> 1,658,555; ratio 14.8)
7267    5   4,336 (112,194 -> 1,655,447; ratio 14.8)
7268    6   4,336 (112,194 -> 1,655,101; ratio 14.8)
7269    7   4,336 (112,194 -> 1,654,858; ratio 14.7)
7270    8   4,336 (112,194 -> 1,654,810; ratio 14.7)
7271    10  4,336 (112,194 -> 1,654,621; ratio 14.7)
7272    12  4,336 (112,194 -> 1,654,678; ratio 14.7)
7273    16  4,336 (112,194 -> 1,654,494; ratio 14.7)
7274    32  4,336 (112,194 -> 1,654,602; ratio 14.7)
7275    inf 4,336 (112,194 -> 1,654,602; ratio 14.7)
7276
7277    bz2-64
7278
7279    1   4,113 (107,329 -> 1,822,171; ratio 17.0)
7280    2   4,113 (107,329 -> 1,806,443; ratio 16.8)
7281    3   4,113 (107,329 -> 1,803,967; ratio 16.8)
7282    4   4,113 (107,329 -> 1,802,785; ratio 16.8)
7283    5   4,113 (107,329 -> 1,802,412; ratio 16.8)
7284    6   4,113 (107,329 -> 1,802,062; ratio 16.8)
7285    7   4,113 (107,329 -> 1,801,976; ratio 16.8)
7286    8   4,113 (107,329 -> 1,801,886; ratio 16.8)
7287    10  4,113 (107,329 -> 1,801,653; ratio 16.8)
7288    12  4,113 (107,329 -> 1,801,526; ratio 16.8)
7289    16  4,113 (107,329 -> 1,801,298; ratio 16.8)
7290    32  4,113 (107,329 -> 1,800,827; ratio 16.8)
7291    inf 4,113 (107,329 -> 1,800,827; ratio 16.8)
7292 */
7293
7294 /* Structs for recording which (helper, guard) pairs we have already
7295    seen. */
7296
7297 #define N_TIDYING_PAIRS 16
7298
7299 typedef
7300    struct { void* entry; IRExpr* guard; }
7301    Pair;
7302
7303 typedef
7304    struct {
7305       Pair pairs[N_TIDYING_PAIRS +1/*for bounds checking*/];
7306       UInt pairsUsed;
7307    }
7308    Pairs;
7309
7310
7311 /* Return True if e1 and e2 definitely denote the same value (used to
7312    compare guards).  Return False if unknown; False is the safe
7313    answer.  Since guest registers and guest memory do not have the
7314    SSA property we must return False if any Gets or Loads appear in
7315    the expression.  This implicitly assumes that e1 and e2 have the
7316    same IR type, which is always true here -- the type is Ity_I1. */
7317
7318 static Bool sameIRValue ( IRExpr* e1, IRExpr* e2 )
7319 {
7320    if (e1->tag != e2->tag)
7321       return False;
7322    switch (e1->tag) {
7323       case Iex_Const:
7324          return eqIRConst( e1->Iex.Const.con, e2->Iex.Const.con );
7325       case Iex_Binop:
7326          return e1->Iex.Binop.op == e2->Iex.Binop.op
7327                 && sameIRValue(e1->Iex.Binop.arg1, e2->Iex.Binop.arg1)
7328                 && sameIRValue(e1->Iex.Binop.arg2, e2->Iex.Binop.arg2);
7329       case Iex_Unop:
7330          return e1->Iex.Unop.op == e2->Iex.Unop.op
7331                 && sameIRValue(e1->Iex.Unop.arg, e2->Iex.Unop.arg);
7332       case Iex_RdTmp:
7333          return e1->Iex.RdTmp.tmp == e2->Iex.RdTmp.tmp;
7334       case Iex_ITE:
7335          return sameIRValue( e1->Iex.ITE.cond, e2->Iex.ITE.cond )
7336                 && sameIRValue( e1->Iex.ITE.iftrue,  e2->Iex.ITE.iftrue )
7337                 && sameIRValue( e1->Iex.ITE.iffalse, e2->Iex.ITE.iffalse );
7338       case Iex_Qop:
7339       case Iex_Triop:
7340       case Iex_CCall:
7341          /* be lazy.  Could define equality for these, but they never
7342             appear to be used. */
7343          return False;
7344       case Iex_Get:
7345       case Iex_GetI:
7346       case Iex_Load:
7347          /* be conservative - these may not give the same value each
7348             time */
7349          return False;
7350       case Iex_Binder:
7351          /* should never see this */
7352          /* fallthrough */
7353       default:
7354          VG_(printf)("mc_translate.c: sameIRValue: unhandled: ");
7355          ppIRExpr(e1);
7356          VG_(tool_panic)("memcheck:sameIRValue");
7357          return False;
7358    }
7359 }
7360
7361 /* See if 'pairs' already has an entry for (entry, guard).  Return
7362    True if so.  If not, add an entry. */
7363
7364 static
7365 Bool check_or_add ( Pairs* tidyingEnv, IRExpr* guard, void* entry )
7366 {
7367    UInt i, n = tidyingEnv->pairsUsed;
7368    tl_assert(n <= N_TIDYING_PAIRS);
7369    for (i = 0; i < n; i++) {
7370       if (tidyingEnv->pairs[i].entry == entry
7371           && sameIRValue(tidyingEnv->pairs[i].guard, guard))
7372          return True;
7373    }
7374    /* (guard, entry) wasn't found in the array.  Add it at the end.
7375       If the array is already full, slide the entries one slot
7376       backwards.  This means we will lose to ability to detect
7377       duplicates from the pair in slot zero, but that happens so
7378       rarely that it's unlikely to have much effect on overall code
7379       quality.  Also, this strategy loses the check for the oldest
7380       tracked exit (memory reference, basically) and so that is (I'd
7381       guess) least likely to be re-used after this point. */
7382    tl_assert(i == n);
7383    if (n == N_TIDYING_PAIRS) {
7384       for (i = 1; i < N_TIDYING_PAIRS; i++) {
7385          tidyingEnv->pairs[i-1] = tidyingEnv->pairs[i];
7386       }
7387       tidyingEnv->pairs[N_TIDYING_PAIRS-1].entry = entry;
7388       tidyingEnv->pairs[N_TIDYING_PAIRS-1].guard = guard;
7389    } else {
7390       tl_assert(n < N_TIDYING_PAIRS);
7391       tidyingEnv->pairs[n].entry = entry;
7392       tidyingEnv->pairs[n].guard = guard;
7393       n++;
7394       tidyingEnv->pairsUsed = n;
7395    }
7396    return False;
7397 }
7398
7399 static Bool is_helperc_value_checkN_fail ( const HChar* name )
7400 {
7401    /* This is expensive because it happens a lot.  We are checking to
7402       see whether |name| is one of the following 8 strings:
7403
7404          MC_(helperc_value_check8_fail_no_o)
7405          MC_(helperc_value_check4_fail_no_o)
7406          MC_(helperc_value_check0_fail_no_o)
7407          MC_(helperc_value_check1_fail_no_o)
7408          MC_(helperc_value_check8_fail_w_o)
7409          MC_(helperc_value_check0_fail_w_o)
7410          MC_(helperc_value_check1_fail_w_o)
7411          MC_(helperc_value_check4_fail_w_o)
7412
7413       To speed it up, check the common prefix just once, rather than
7414       all 8 times.
7415    */
7416    const HChar* prefix = "MC_(helperc_value_check";
7417
7418    HChar n, p;
7419    while (True) {
7420       n = *name;
7421       p = *prefix;
7422       if (p == 0) break; /* ran off the end of the prefix */
7423       /* We still have some prefix to use */
7424       if (n == 0) return False; /* have prefix, but name ran out */
7425       if (n != p) return False; /* have both pfx and name, but no match */
7426       name++;
7427       prefix++;
7428    }
7429
7430    /* Check the part after the prefix. */
7431    tl_assert(*prefix == 0 && *name != 0);
7432    return    0==VG_(strcmp)(name, "8_fail_no_o)")
7433           || 0==VG_(strcmp)(name, "4_fail_no_o)")
7434           || 0==VG_(strcmp)(name, "0_fail_no_o)")
7435           || 0==VG_(strcmp)(name, "1_fail_no_o)")
7436           || 0==VG_(strcmp)(name, "8_fail_w_o)")
7437           || 0==VG_(strcmp)(name, "4_fail_w_o)")
7438           || 0==VG_(strcmp)(name, "0_fail_w_o)")
7439           || 0==VG_(strcmp)(name, "1_fail_w_o)");
7440 }
7441
7442 IRSB* MC_(final_tidy) ( IRSB* sb_in )
7443 {
7444    Int       i;
7445    IRStmt*   st;
7446    IRDirty*  di;
7447    IRExpr*   guard;
7448    IRCallee* cee;
7449    Bool      alreadyPresent;
7450    Pairs     pairs;
7451
7452    pairs.pairsUsed = 0;
7453
7454    pairs.pairs[N_TIDYING_PAIRS].entry = (void*)0x123;
7455    pairs.pairs[N_TIDYING_PAIRS].guard = (IRExpr*)0x456;
7456
7457    /* Scan forwards through the statements.  Each time a call to one
7458       of the relevant helpers is seen, check if we have made a
7459       previous call to the same helper using the same guard
7460       expression, and if so, delete the call. */
7461    for (i = 0; i < sb_in->stmts_used; i++) {
7462       st = sb_in->stmts[i];
7463       tl_assert(st);
7464       if (st->tag != Ist_Dirty)
7465          continue;
7466       di = st->Ist.Dirty.details;
7467       guard = di->guard;
7468       tl_assert(guard);
7469       if (0) { ppIRExpr(guard); VG_(printf)("\n"); }
7470       cee = di->cee;
7471       if (!is_helperc_value_checkN_fail( cee->name ))
7472          continue;
7473        /* Ok, we have a call to helperc_value_check0/1/4/8_fail with
7474           guard 'guard'.  Check if we have already seen a call to this
7475           function with the same guard.  If so, delete it.  If not,
7476           add it to the set of calls we do know about. */
7477       alreadyPresent = check_or_add( &pairs, guard, cee->addr );
7478       if (alreadyPresent) {
7479          sb_in->stmts[i] = IRStmt_NoOp();
7480          if (0) VG_(printf)("XX\n");
7481       }
7482    }
7483
7484    tl_assert(pairs.pairs[N_TIDYING_PAIRS].entry == (void*)0x123);
7485    tl_assert(pairs.pairs[N_TIDYING_PAIRS].guard == (IRExpr*)0x456);
7486
7487    return sb_in;
7488 }
7489
7490 #undef N_TIDYING_PAIRS
7491
7492
7493 /*------------------------------------------------------------*/
7494 /*--- Startup assertion checking                           ---*/
7495 /*------------------------------------------------------------*/
7496
7497 void MC_(do_instrumentation_startup_checks)( void )
7498 {
7499    /* Make a best-effort check to see that is_helperc_value_checkN_fail
7500       is working as we expect. */
7501
7502 #  define CHECK(_expected, _string) \
7503       tl_assert((_expected) == is_helperc_value_checkN_fail(_string))
7504
7505    /* It should identify these 8, and no others, as targets. */
7506    CHECK(True, "MC_(helperc_value_check8_fail_no_o)");
7507    CHECK(True, "MC_(helperc_value_check4_fail_no_o)");
7508    CHECK(True, "MC_(helperc_value_check0_fail_no_o)");
7509    CHECK(True, "MC_(helperc_value_check1_fail_no_o)");
7510    CHECK(True, "MC_(helperc_value_check8_fail_w_o)");
7511    CHECK(True, "MC_(helperc_value_check0_fail_w_o)");
7512    CHECK(True, "MC_(helperc_value_check1_fail_w_o)");
7513    CHECK(True, "MC_(helperc_value_check4_fail_w_o)");
7514
7515    /* Ad-hoc selection of other strings gathered via a quick test. */
7516    CHECK(False, "amd64g_dirtyhelper_CPUID_avx2");
7517    CHECK(False, "amd64g_dirtyhelper_RDTSC");
7518    CHECK(False, "MC_(helperc_b_load1)");
7519    CHECK(False, "MC_(helperc_b_load2)");
7520    CHECK(False, "MC_(helperc_b_load4)");
7521    CHECK(False, "MC_(helperc_b_load8)");
7522    CHECK(False, "MC_(helperc_b_load16)");
7523    CHECK(False, "MC_(helperc_b_load32)");
7524    CHECK(False, "MC_(helperc_b_store1)");
7525    CHECK(False, "MC_(helperc_b_store2)");
7526    CHECK(False, "MC_(helperc_b_store4)");
7527    CHECK(False, "MC_(helperc_b_store8)");
7528    CHECK(False, "MC_(helperc_b_store16)");
7529    CHECK(False, "MC_(helperc_b_store32)");
7530    CHECK(False, "MC_(helperc_LOADV8)");
7531    CHECK(False, "MC_(helperc_LOADV16le)");
7532    CHECK(False, "MC_(helperc_LOADV32le)");
7533    CHECK(False, "MC_(helperc_LOADV64le)");
7534    CHECK(False, "MC_(helperc_LOADV128le)");
7535    CHECK(False, "MC_(helperc_LOADV256le)");
7536    CHECK(False, "MC_(helperc_STOREV16le)");
7537    CHECK(False, "MC_(helperc_STOREV32le)");
7538    CHECK(False, "MC_(helperc_STOREV64le)");
7539    CHECK(False, "MC_(helperc_STOREV8)");
7540    CHECK(False, "track_die_mem_stack_8");
7541    CHECK(False, "track_new_mem_stack_8_w_ECU");
7542    CHECK(False, "MC_(helperc_MAKE_STACK_UNINIT_w_o)");
7543    CHECK(False, "VG_(unknown_SP_update_w_ECU)");
7544
7545 #  undef CHECK
7546 }
7547
7548
7549 /*------------------------------------------------------------*/
7550 /*--- Memcheck main                                        ---*/
7551 /*------------------------------------------------------------*/
7552
7553 static Bool isBogusAtom ( IRAtom* at )
7554 {
7555    if (at->tag == Iex_RdTmp)
7556       return False;
7557    tl_assert(at->tag == Iex_Const);
7558
7559    ULong n = 0;
7560    IRConst* con = at->Iex.Const.con;
7561    switch (con->tag) {
7562       case Ico_U1:   return False;
7563       case Ico_U8:   n = (ULong)con->Ico.U8; break;
7564       case Ico_U16:  n = (ULong)con->Ico.U16; break;
7565       case Ico_U32:  n = (ULong)con->Ico.U32; break;
7566       case Ico_U64:  n = (ULong)con->Ico.U64; break;
7567       case Ico_F32:  return False;
7568       case Ico_F64:  return False;
7569       case Ico_F32i: return False;
7570       case Ico_F64i: return False;
7571       case Ico_V128: return False;
7572       case Ico_V256: return False;
7573       default: ppIRExpr(at); tl_assert(0);
7574    }
7575    /* VG_(printf)("%llx\n", n); */
7576    /* Shortcuts */
7577    if (LIKELY(n <= 0x0000000000001000ULL)) return False;
7578    if (LIKELY(n >= 0xFFFFFFFFFFFFF000ULL)) return False;
7579    /* The list of bogus atoms is: */
7580    return (/*32*/    n == 0xFEFEFEFFULL
7581            /*32*/ || n == 0x80808080ULL
7582            /*32*/ || n == 0x7F7F7F7FULL
7583            /*32*/ || n == 0x7EFEFEFFULL
7584            /*32*/ || n == 0x81010100ULL
7585            /*64*/ || n == 0xFFFFFFFFFEFEFEFFULL
7586            /*64*/ || n == 0xFEFEFEFEFEFEFEFFULL
7587            /*64*/ || n == 0x0000000000008080ULL
7588            /*64*/ || n == 0x8080808080808080ULL
7589            /*64*/ || n == 0x0101010101010101ULL
7590           );
7591 }
7592
7593
7594 /* Does 'st' mention any of the literals identified/listed in
7595    isBogusAtom()? */
7596 static inline Bool containsBogusLiterals ( /*FLAT*/ IRStmt* st )
7597 {
7598    Int      i;
7599    IRExpr*  e;
7600    IRDirty* d;
7601    IRCAS*   cas;
7602    switch (st->tag) {
7603       case Ist_WrTmp:
7604          e = st->Ist.WrTmp.data;
7605          switch (e->tag) {
7606             case Iex_Get:
7607             case Iex_RdTmp:
7608                return False;
7609             case Iex_Const:
7610                return isBogusAtom(e);
7611             case Iex_Unop:
7612                return isBogusAtom(e->Iex.Unop.arg)
7613                       || e->Iex.Unop.op == Iop_GetMSBs8x16;
7614             case Iex_GetI:
7615                return isBogusAtom(e->Iex.GetI.ix);
7616             case Iex_Binop:
7617                return isBogusAtom(e->Iex.Binop.arg1)
7618                       || isBogusAtom(e->Iex.Binop.arg2);
7619             case Iex_Triop:
7620                return isBogusAtom(e->Iex.Triop.details->arg1)
7621                       || isBogusAtom(e->Iex.Triop.details->arg2)
7622                       || isBogusAtom(e->Iex.Triop.details->arg3);
7623             case Iex_Qop:
7624                return isBogusAtom(e->Iex.Qop.details->arg1)
7625                       || isBogusAtom(e->Iex.Qop.details->arg2)
7626                       || isBogusAtom(e->Iex.Qop.details->arg3)
7627                       || isBogusAtom(e->Iex.Qop.details->arg4);
7628             case Iex_ITE:
7629                return isBogusAtom(e->Iex.ITE.cond)
7630                       || isBogusAtom(e->Iex.ITE.iftrue)
7631                       || isBogusAtom(e->Iex.ITE.iffalse);
7632             case Iex_Load:
7633                return isBogusAtom(e->Iex.Load.addr);
7634             case Iex_CCall:
7635                for (i = 0; e->Iex.CCall.args[i]; i++)
7636                   if (isBogusAtom(e->Iex.CCall.args[i]))
7637                      return True;
7638                return False;
7639             default:
7640                goto unhandled;
7641          }
7642       case Ist_Dirty:
7643          d = st->Ist.Dirty.details;
7644          for (i = 0; d->args[i]; i++) {
7645             IRAtom* atom = d->args[i];
7646             if (LIKELY(!is_IRExpr_VECRET_or_GSPTR(atom))) {
7647                if (isBogusAtom(atom))
7648                   return True;
7649             }
7650          }
7651          if (isBogusAtom(d->guard))
7652             return True;
7653          if (d->mAddr && isBogusAtom(d->mAddr))
7654             return True;
7655          return False;
7656       case Ist_Put:
7657          return isBogusAtom(st->Ist.Put.data);
7658       case Ist_PutI:
7659          return isBogusAtom(st->Ist.PutI.details->ix)
7660                 || isBogusAtom(st->Ist.PutI.details->data);
7661       case Ist_Store:
7662          return isBogusAtom(st->Ist.Store.addr)
7663                 || isBogusAtom(st->Ist.Store.data);
7664       case Ist_StoreG: {
7665          IRStoreG* sg = st->Ist.StoreG.details;
7666          return isBogusAtom(sg->addr) || isBogusAtom(sg->data)
7667                 || isBogusAtom(sg->guard);
7668       }
7669       case Ist_LoadG: {
7670          IRLoadG* lg = st->Ist.LoadG.details;
7671          return isBogusAtom(lg->addr) || isBogusAtom(lg->alt)
7672                 || isBogusAtom(lg->guard);
7673       }
7674       case Ist_Exit:
7675          return isBogusAtom(st->Ist.Exit.guard);
7676       case Ist_AbiHint:
7677          return isBogusAtom(st->Ist.AbiHint.base)
7678                 || isBogusAtom(st->Ist.AbiHint.nia);
7679       case Ist_NoOp:
7680       case Ist_IMark:
7681       case Ist_MBE:
7682          return False;
7683       case Ist_CAS:
7684          cas = st->Ist.CAS.details;
7685          return isBogusAtom(cas->addr)
7686                 || (cas->expdHi ? isBogusAtom(cas->expdHi) : False)
7687                 || isBogusAtom(cas->expdLo)
7688                 || (cas->dataHi ? isBogusAtom(cas->dataHi) : False)
7689                 || isBogusAtom(cas->dataLo);
7690       case Ist_LLSC:
7691          return isBogusAtom(st->Ist.LLSC.addr)
7692                 || (st->Ist.LLSC.storedata
7693                        ? isBogusAtom(st->Ist.LLSC.storedata)
7694                        : False);
7695       default:
7696       unhandled:
7697          ppIRStmt(st);
7698          VG_(tool_panic)("hasBogusLiterals");
7699    }
7700 }
7701
7702
7703 /* This is the pre-instrumentation analysis.  It does a backwards pass over
7704    the stmts in |sb_in| to determine a HowUsed value for each tmp defined in
7705    the block.
7706
7707    Unrelatedly, it also checks all literals in the block with |isBogusAtom|,
7708    as a positive result from that is a strong indication that we need to
7709    expensively instrument add/sub in the block.  We do both analyses in one
7710    pass, even though they are independent, so as to avoid the overhead of
7711    having to traverse the whole block twice.
7712
7713    The usage pass proceeds as follows.  Let max= be the max operation in the
7714    HowUsed lattice, hence
7715
7716      X max= Y   means   X = max(X, Y)
7717
7718    then
7719
7720      for t in original tmps . useEnv[t] = HuUnU
7721
7722      for t used in the block's . next field
7723         useEnv[t] max= HuPCa  // because jmp targets are PCast-tested
7724
7725      for st iterating *backwards* in the block
7726
7727         match st
7728
7729            case "t1 = load(t2)"          // case 1
7730               useEnv[t2] max= HuPCa
7731
7732            case "t1 = add(t2, t3)"       // case 2
7733               useEnv[t2] max= useEnv[t1]
7734               useEnv[t3] max= useEnv[t1]
7735
7736            other
7737               for t in st.usedTmps       // case 3
7738                  useEnv[t] max= HuOth
7739                  // same as useEnv[t] = HuOth
7740
7741    The general idea is that we accumulate, in useEnv[], information about
7742    how each tmp is used.  That can be updated as we work further back
7743    through the block and find more uses of it, but its HowUsed value can
7744    only ascend the lattice, not descend.
7745
7746    Initially we mark all tmps as unused.  In case (1), if a tmp is seen to
7747    be used as a memory address, then its use is at least HuPCa.  The point
7748    is that for a memory address we will add instrumentation to check if any
7749    bit of the address is undefined, which means that we won't need expensive
7750    V-bit propagation through an add expression that computed the address --
7751    cheap add instrumentation will be equivalent.
7752
7753    Note in case (1) that if we have previously seen a non-memory-address use
7754    of the tmp, then its use will already be HuOth and will be unchanged by
7755    the max= operation.  And if it turns out that the source of the tmp was
7756    an add, then we'll have to expensively instrument the add, because we
7757    can't prove that, for the previous non-memory-address use of the tmp,
7758    cheap and expensive instrumentation will be equivalent.
7759
7760    In case 2, we propagate the usage-mode of the result of an add back
7761    through to its operands.  Again, we use max= so as to take account of the
7762    fact that t2 or t3 might later in the block (viz, earlier in the
7763    iteration) have been used in a way that requires expensive add
7764    instrumentation.
7765
7766    In case 3, we deal with all other tmp uses.  We assume that we'll need a
7767    result that is as accurate as possible, so we max= HuOth into its use
7768    mode.  Since HuOth is the top of the lattice, that's equivalent to just
7769    setting its use to HuOth.
7770
7771    The net result of all this is that:
7772
7773      tmps that are used either
7774        - only as a memory address, or
7775        - only as part of a tree of adds that computes a memory address,
7776          and has no other use
7777      are marked as HuPCa, and so we can instrument their generating Add
7778      nodes cheaply, which is the whole point of this analysis
7779
7780      tmps that are used any other way at all are marked as HuOth
7781
7782      tmps that are unused are marked as HuUnU.  We don't expect to see any
7783      since we expect that the incoming IR has had all dead assignments
7784      removed by previous optimisation passes.  Nevertheless the analysis is
7785      correct even in the presence of dead tmps.
7786
7787    A final comment on dead tmps.  In case 1 and case 2, we could actually
7788    conditionalise the updates thusly:
7789
7790      if (useEnv[t1] > HuUnU) { useEnv[t2] max= HuPCa }  // case 1
7791
7792      if (useEnv[t1] > HuUnU) { useEnv[t2] max= useEnv[t1] }  // case 2
7793      if (useEnv[t1] > HuUnU) { useEnv[t3] max= useEnv[t1] }  // case 2
7794
7795    In other words, if the assigned-to tmp |t1| is never used, then there's
7796    no point in propagating any use through to its operands.  That won't
7797    change the final HuPCa-vs-HuOth results, which is what we care about.
7798    Given that we expect to get dead-code-free inputs, there's no point in
7799    adding this extra refinement.
7800 */
7801
7802 /* Helper for |preInstrumentationAnalysis|. */
7803 static inline void noteTmpUsesIn ( /*MOD*/HowUsed* useEnv,
7804                                    UInt tyenvUsed,
7805                                    HowUsed newUse, IRAtom* at )
7806 {
7807    /* For the atom |at|, declare that for any tmp |t| in |at|, we will have
7808       seen a use of |newUse|.  So, merge that info into |t|'s accumulated
7809       use info. */
7810    switch (at->tag) {
7811       case Iex_GSPTR:
7812       case Iex_Const:
7813          return;
7814       case Iex_RdTmp: {
7815          IRTemp t = at->Iex.RdTmp.tmp;
7816          tl_assert(t < tyenvUsed); // "is an original tmp"
7817          // The "max" operation in the lattice
7818          if (newUse > useEnv[t]) useEnv[t] = newUse;
7819          return;
7820       }
7821       default:
7822          // We should never get here -- it implies non-flat IR
7823          ppIRExpr(at);
7824          VG_(tool_panic)("noteTmpUsesIn");
7825    }
7826    /*NOTREACHED*/
7827    tl_assert(0);
7828 }
7829
7830
7831 static void preInstrumentationAnalysis ( /*OUT*/HowUsed** useEnvP,
7832                                          /*OUT*/Bool* hasBogusLiteralsP,
7833                                          const IRSB* sb_in )
7834 {
7835    const UInt nOrigTmps = (UInt)sb_in->tyenv->types_used;
7836
7837    // We've seen no bogus literals so far.
7838    Bool bogus = False;
7839
7840    // This is calloc'd, so implicitly all entries are initialised to HuUnU.
7841    HowUsed* useEnv = VG_(calloc)("mc.preInstrumentationAnalysis.1",
7842                                  nOrigTmps, sizeof(HowUsed));
7843
7844    // Firstly, roll in contributions from the final dst address.
7845    bogus = isBogusAtom(sb_in->next);
7846    noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, sb_in->next);
7847
7848    // Now work backwards through the stmts.
7849    for (Int i = sb_in->stmts_used-1; i >= 0; i--) {
7850       IRStmt* st = sb_in->stmts[i];
7851
7852       // Deal with literals.
7853       if (LIKELY(!bogus)) {
7854          bogus = containsBogusLiterals(st);
7855       }
7856
7857       // Deal with tmp uses.
7858       switch (st->tag) {
7859          case Ist_WrTmp: {
7860             IRTemp  dst = st->Ist.WrTmp.tmp;
7861             IRExpr* rhs = st->Ist.WrTmp.data;
7862             // This is the one place where we have to consider all possible
7863             // tags for |rhs|, and can't just assume it is a tmp or a const.
7864             switch (rhs->tag) {
7865                case Iex_RdTmp:
7866                   // just propagate demand for |dst| into this tmp use.
7867                   noteTmpUsesIn(useEnv, nOrigTmps, useEnv[dst], rhs);
7868                   break;
7869                case Iex_Unop:
7870                   noteTmpUsesIn(useEnv, nOrigTmps, HuOth, rhs->Iex.Unop.arg);
7871                   break;
7872                case Iex_Binop:
7873                   if (rhs->Iex.Binop.op == Iop_Add64
7874                       || rhs->Iex.Binop.op == Iop_Add32) {
7875                      // propagate demand for |dst| through to the operands.
7876                      noteTmpUsesIn(useEnv, nOrigTmps,
7877                                    useEnv[dst], rhs->Iex.Binop.arg1);
7878                      noteTmpUsesIn(useEnv, nOrigTmps,
7879                                    useEnv[dst], rhs->Iex.Binop.arg2);
7880                   } else {
7881                      // just say that the operands are used in some unknown way.
7882                      noteTmpUsesIn(useEnv, nOrigTmps,
7883                                    HuOth, rhs->Iex.Binop.arg1);
7884                      noteTmpUsesIn(useEnv, nOrigTmps,
7885                                    HuOth, rhs->Iex.Binop.arg2);
7886                   }
7887                   break;
7888                case Iex_Triop: {
7889                   // All operands are used in some unknown way.
7890                   IRTriop* tri = rhs->Iex.Triop.details;
7891                   noteTmpUsesIn(useEnv, nOrigTmps, HuOth, tri->arg1);
7892                   noteTmpUsesIn(useEnv, nOrigTmps, HuOth, tri->arg2);
7893                   noteTmpUsesIn(useEnv, nOrigTmps, HuOth, tri->arg3);
7894                   break;
7895                }
7896                case Iex_Qop: {
7897                   // All operands are used in some unknown way.
7898                   IRQop* qop = rhs->Iex.Qop.details;
7899                   noteTmpUsesIn(useEnv, nOrigTmps, HuOth, qop->arg1);
7900                   noteTmpUsesIn(useEnv, nOrigTmps, HuOth, qop->arg2);
7901                   noteTmpUsesIn(useEnv, nOrigTmps, HuOth, qop->arg3);
7902                   noteTmpUsesIn(useEnv, nOrigTmps, HuOth, qop->arg4);
7903                   break;
7904                }
7905                case Iex_Load:
7906                   // The address will be checked (== PCasted).
7907                   noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, rhs->Iex.Load.addr);
7908                   break;
7909                case Iex_ITE:
7910                   // The condition is PCasted, the then- and else-values
7911                   // aren't.
7912                   noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, rhs->Iex.ITE.cond);
7913                   noteTmpUsesIn(useEnv, nOrigTmps, HuOth, rhs->Iex.ITE.iftrue);
7914                   noteTmpUsesIn(useEnv, nOrigTmps, HuOth, rhs->Iex.ITE.iffalse);
7915                   break;
7916                case Iex_CCall:
7917                   // The args are used in unknown ways.
7918                   for (IRExpr** args = rhs->Iex.CCall.args; *args; args++) {
7919                      noteTmpUsesIn(useEnv, nOrigTmps, HuOth, *args);
7920                   }
7921                   break;
7922                case Iex_GetI: {
7923                   // The index will be checked/PCasted (see do_shadow_GETI)
7924                   noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, rhs->Iex.GetI.ix);
7925                   break;
7926                }
7927                case Iex_Const:
7928                case Iex_Get:
7929                   break;
7930                default:
7931                   ppIRExpr(rhs);
7932                   VG_(tool_panic)("preInstrumentationAnalysis:"
7933                                   " unhandled IRExpr");
7934             }
7935             break;
7936          }
7937          case Ist_Store:
7938             // The address will be checked (== PCasted).  The data will be
7939             // used in some unknown way.
7940             noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, st->Ist.Store.addr);
7941             noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.Store.data);
7942             break;
7943          case Ist_Exit:
7944             // The guard will be checked (== PCasted)
7945             noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, st->Ist.Exit.guard);
7946             break;
7947          case Ist_Put:
7948             noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.Put.data);
7949             break;
7950          case Ist_PutI: {
7951             IRPutI* putI = st->Ist.PutI.details;
7952             // The index will be checked/PCasted (see do_shadow_PUTI).  The
7953             // data will be used in an unknown way.
7954             noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, putI->ix);
7955             noteTmpUsesIn(useEnv, nOrigTmps, HuOth, putI->data);
7956             break;
7957          }
7958          case Ist_Dirty: {
7959             IRDirty* d = st->Ist.Dirty.details;
7960             // The guard will be checked (== PCasted)
7961             noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, d->guard);
7962             // The args will be used in unknown ways.
7963             for (IRExpr** args = d->args; *args; args++) {
7964                noteTmpUsesIn(useEnv, nOrigTmps, HuOth, *args);
7965             }
7966             break;
7967          }
7968          case Ist_CAS: {
7969             IRCAS* cas = st->Ist.CAS.details;
7970             // Address will be pcasted, everything else used as unknown
7971             noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, cas->addr);
7972             noteTmpUsesIn(useEnv, nOrigTmps, HuOth, cas->expdLo);
7973             noteTmpUsesIn(useEnv, nOrigTmps, HuOth, cas->dataLo);
7974             if (cas->expdHi)
7975                noteTmpUsesIn(useEnv, nOrigTmps, HuOth, cas->expdHi);
7976             if (cas->dataHi)
7977                noteTmpUsesIn(useEnv, nOrigTmps, HuOth, cas->dataHi);
7978             break;
7979          }
7980          case Ist_AbiHint:
7981             // Both exprs are used in unknown ways.  TODO: can we safely
7982             // just ignore AbiHints?
7983             noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.AbiHint.base);
7984             noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.AbiHint.nia);
7985             break;
7986          case Ist_StoreG: {
7987             // We might be able to do better, and use HuPCa for the addr.
7988             // It's not immediately obvious that we can, because the address
7989             // is regarded as "used" only when the guard is true.
7990             IRStoreG* sg = st->Ist.StoreG.details;
7991             noteTmpUsesIn(useEnv, nOrigTmps, HuOth, sg->addr);
7992             noteTmpUsesIn(useEnv, nOrigTmps, HuOth, sg->data);
7993             noteTmpUsesIn(useEnv, nOrigTmps, HuOth, sg->guard);
7994             break;
7995          }
7996          case Ist_LoadG: {
7997             // Per similar comments to Ist_StoreG .. not sure whether this
7998             // is really optimal.
7999             IRLoadG* lg = st->Ist.LoadG.details;
8000             noteTmpUsesIn(useEnv, nOrigTmps, HuOth, lg->addr);
8001             noteTmpUsesIn(useEnv, nOrigTmps, HuOth, lg->alt);
8002             noteTmpUsesIn(useEnv, nOrigTmps, HuOth, lg->guard);
8003             break;
8004          }
8005          case Ist_LLSC: {
8006             noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, st->Ist.LLSC.addr);
8007             if (st->Ist.LLSC.storedata)
8008                noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.LLSC.storedata);
8009             break;
8010          }
8011          case Ist_MBE:
8012          case Ist_IMark:
8013          case Ist_NoOp:
8014             break;
8015          default: {
8016             ppIRStmt(st);
8017             VG_(tool_panic)("preInstrumentationAnalysis: unhandled IRStmt");
8018          }
8019       }
8020    } // Now work backwards through the stmts.
8021
8022    // Return the computed use env and the bogus-atom flag.
8023    tl_assert(*useEnvP == NULL);
8024    *useEnvP = useEnv;
8025
8026    tl_assert(*hasBogusLiteralsP == False);
8027    *hasBogusLiteralsP = bogus;
8028 }
8029
8030
8031 IRSB* MC_(instrument) ( VgCallbackClosure* closure,
8032                         IRSB* sb_in,
8033                         const VexGuestLayout* layout,
8034                         const VexGuestExtents* vge,
8035                         const VexArchInfo* archinfo_host,
8036                         IRType gWordTy, IRType hWordTy )
8037 {
8038    Bool    verboze = 0||False;
8039    Int     i, j, first_stmt;
8040    IRStmt* st;
8041    MCEnv   mce;
8042    IRSB*   sb_out;
8043
8044    if (gWordTy != hWordTy) {
8045       /* We don't currently support this case. */
8046       VG_(tool_panic)("host/guest word size mismatch");
8047    }
8048
8049    /* Check we're not completely nuts */
8050    tl_assert(sizeof(UWord)  == sizeof(void*));
8051    tl_assert(sizeof(Word)   == sizeof(void*));
8052    tl_assert(sizeof(Addr)   == sizeof(void*));
8053    tl_assert(sizeof(ULong)  == 8);
8054    tl_assert(sizeof(Long)   == 8);
8055    tl_assert(sizeof(UInt)   == 4);
8056    tl_assert(sizeof(Int)    == 4);
8057
8058    tl_assert(MC_(clo_mc_level) >= 1 && MC_(clo_mc_level) <= 3);
8059
8060    /* Set up SB */
8061    sb_out = deepCopyIRSBExceptStmts(sb_in);
8062
8063    /* Set up the running environment.  Both .sb and .tmpMap are
8064       modified as we go along.  Note that tmps are added to both
8065       .sb->tyenv and .tmpMap together, so the valid index-set for
8066       those two arrays should always be identical. */
8067    VG_(memset)(&mce, 0, sizeof(mce));
8068    mce.sb             = sb_out;
8069    mce.trace          = verboze;
8070    mce.layout         = layout;
8071    mce.hWordTy        = hWordTy;
8072    mce.tmpHowUsed     = NULL;
8073
8074    /* BEGIN decide on expense levels for instrumentation. */
8075
8076    /* Initially, select the cheap version of everything for which we have an
8077       option. */
8078    DetailLevelByOp__set_all( &mce.dlbo, DLcheap );
8079
8080    /* Take account of the --expensive-definedness-checks= flag. */
8081    if (MC_(clo_expensive_definedness_checks) == EdcNO) {
8082       /* We just selected 'cheap for everything', so we don't need to do
8083          anything here.  mce.tmpHowUsed remains NULL. */
8084    }
8085    else if (MC_(clo_expensive_definedness_checks) == EdcYES) {
8086       /* Select 'expensive for everything'.  mce.tmpHowUsed remains NULL. */
8087       DetailLevelByOp__set_all( &mce.dlbo, DLexpensive );
8088    }
8089    else {
8090       tl_assert(MC_(clo_expensive_definedness_checks) == EdcAUTO);
8091       /* We'll make our own selection, based on known per-target constraints
8092          and also on analysis of the block to be instrumented.  First, set
8093          up default values for detail levels.
8094
8095          On x86 and amd64, we'll routinely encounter code optimised by LLVM
8096          5 and above.  Enable accurate interpretation of the following.
8097          LLVM uses adds for some bitfield inserts, and we get a lot of false
8098          errors if the cheap interpretation is used, alas.  Could solve this
8099          much better if we knew which of such adds came from x86/amd64 LEA
8100          instructions, since these are the only ones really needing the
8101          expensive interpretation, but that would require some way to tag
8102          them in the _toIR.c front ends, which is a lot of faffing around.
8103          So for now we use preInstrumentationAnalysis() to detect adds which
8104          are used only to construct memory addresses, which is an
8105          approximation to the above, and is self-contained.*/
8106 #     if defined(VGA_x86)
8107       mce.dlbo.dl_CmpEQ32_CmpNE32 = DLexpensive;
8108 #     elif defined(VGA_amd64)
8109       mce.dlbo.dl_Add64           = DLauto;
8110       mce.dlbo.dl_CmpEQ32_CmpNE32 = DLexpensive;
8111 #     endif
8112
8113       /* preInstrumentationAnalysis() will allocate &mce.tmpHowUsed and then
8114          fill it in. */
8115       Bool hasBogusLiterals = False;
8116       preInstrumentationAnalysis( &mce.tmpHowUsed, &hasBogusLiterals, sb_in );
8117
8118       if (hasBogusLiterals) {
8119          /* This happens very rarely.  In this case just select expensive
8120             for everything, and throw away the tmp-use analysis results. */
8121          DetailLevelByOp__set_all( &mce.dlbo, DLexpensive );
8122          VG_(free)( mce.tmpHowUsed );
8123          mce.tmpHowUsed = NULL;
8124       } else {
8125          /* Nothing.  mce.tmpHowUsed contains tmp-use analysis results,
8126             which will be used for some subset of Iop_{Add,Sub}{32,64},
8127             based on which ones are set to DLauto for this target. */
8128       }
8129    }
8130
8131    DetailLevelByOp__check_sanity( &mce.dlbo );
8132
8133    if (0) {
8134       // Debug printing: which tmps have been identified as PCast-only use
8135       if (mce.tmpHowUsed) {
8136          VG_(printf)("Cheapies: ");
8137          for (UInt q = 0; q < sb_in->tyenv->types_used; q++) {
8138             if (mce.tmpHowUsed[q] == HuPCa) {
8139                VG_(printf)("t%u ", q);
8140             }
8141          }
8142          VG_(printf)("\n");
8143       }
8144
8145       // Debug printing: number of ops by detail level
8146       UChar nCheap     = DetailLevelByOp__count( &mce.dlbo, DLcheap     );
8147       UChar nAuto      = DetailLevelByOp__count( &mce.dlbo, DLauto      );
8148       UChar nExpensive = DetailLevelByOp__count( &mce.dlbo, DLexpensive );
8149       tl_assert(nCheap + nAuto + nExpensive == 8);
8150
8151       VG_(printf)("%u,%u,%u ", nCheap, nAuto, nExpensive);
8152    }
8153    /* END decide on expense levels for instrumentation. */
8154
8155    /* Initialise the running the tmp environment. */
8156
8157    mce.tmpMap = VG_(newXA)( VG_(malloc), "mc.MC_(instrument).1", VG_(free),
8158                             sizeof(TempMapEnt));
8159    VG_(hintSizeXA) (mce.tmpMap, sb_in->tyenv->types_used);
8160    for (i = 0; i < sb_in->tyenv->types_used; i++) {
8161       TempMapEnt ent;
8162       ent.kind    = Orig;
8163       ent.shadowV = IRTemp_INVALID;
8164       ent.shadowB = IRTemp_INVALID;
8165       VG_(addToXA)( mce.tmpMap, &ent );
8166    }
8167    tl_assert( VG_(sizeXA)( mce.tmpMap ) == sb_in->tyenv->types_used );
8168
8169    /* Finally, begin instrumentation. */
8170    /* Copy verbatim any IR preamble preceding the first IMark */
8171
8172    tl_assert(mce.sb == sb_out);
8173    tl_assert(mce.sb != sb_in);
8174
8175    i = 0;
8176    while (i < sb_in->stmts_used && sb_in->stmts[i]->tag != Ist_IMark) {
8177
8178       st = sb_in->stmts[i];
8179       tl_assert(st);
8180       tl_assert(isFlatIRStmt(st));
8181
8182       stmt( 'C', &mce, sb_in->stmts[i] );
8183       i++;
8184    }
8185
8186    /* Nasty problem.  IR optimisation of the pre-instrumented IR may
8187       cause the IR following the preamble to contain references to IR
8188       temporaries defined in the preamble.  Because the preamble isn't
8189       instrumented, these temporaries don't have any shadows.
8190       Nevertheless uses of them following the preamble will cause
8191       memcheck to generate references to their shadows.  End effect is
8192       to cause IR sanity check failures, due to references to
8193       non-existent shadows.  This is only evident for the complex
8194       preambles used for function wrapping on TOC-afflicted platforms
8195       (ppc64-linux).
8196
8197       The following loop therefore scans the preamble looking for
8198       assignments to temporaries.  For each one found it creates an
8199       assignment to the corresponding (V) shadow temp, marking it as
8200       'defined'.  This is the same resulting IR as if the main
8201       instrumentation loop before had been applied to the statement
8202       'tmp = CONSTANT'.
8203
8204       Similarly, if origin tracking is enabled, we must generate an
8205       assignment for the corresponding origin (B) shadow, claiming
8206       no-origin, as appropriate for a defined value.
8207    */
8208    for (j = 0; j < i; j++) {
8209       if (sb_in->stmts[j]->tag == Ist_WrTmp) {
8210          /* findShadowTmpV checks its arg is an original tmp;
8211             no need to assert that here. */
8212          IRTemp tmp_o = sb_in->stmts[j]->Ist.WrTmp.tmp;
8213          IRTemp tmp_v = findShadowTmpV(&mce, tmp_o);
8214          IRType ty_v  = typeOfIRTemp(sb_out->tyenv, tmp_v);
8215          assign( 'V', &mce, tmp_v, definedOfType( ty_v ) );
8216          if (MC_(clo_mc_level) == 3) {
8217             IRTemp tmp_b = findShadowTmpB(&mce, tmp_o);
8218             tl_assert(typeOfIRTemp(sb_out->tyenv, tmp_b) == Ity_I32);
8219             assign( 'B', &mce, tmp_b, mkU32(0)/* UNKNOWN ORIGIN */);
8220          }
8221          if (0) {
8222             VG_(printf)("create shadow tmp(s) for preamble tmp [%d] ty ", j);
8223             ppIRType( ty_v );
8224             VG_(printf)("\n");
8225          }
8226       }
8227    }
8228
8229    /* Iterate over the remaining stmts to generate instrumentation. */
8230
8231    tl_assert(sb_in->stmts_used > 0);
8232    tl_assert(i >= 0);
8233    tl_assert(i < sb_in->stmts_used);
8234    tl_assert(sb_in->stmts[i]->tag == Ist_IMark);
8235
8236    for (/* use current i*/; i < sb_in->stmts_used; i++) {
8237
8238       st = sb_in->stmts[i];
8239       first_stmt = sb_out->stmts_used;
8240
8241       if (verboze) {
8242          VG_(printf)("\n");
8243          ppIRStmt(st);
8244          VG_(printf)("\n");
8245       }
8246
8247       if (MC_(clo_mc_level) == 3) {
8248          /* See comments on case Ist_CAS below. */
8249          if (st->tag != Ist_CAS)
8250             schemeS( &mce, st );
8251       }
8252
8253       /* Generate instrumentation code for each stmt ... */
8254
8255       switch (st->tag) {
8256
8257          case Ist_WrTmp: {
8258             IRTemp dst = st->Ist.WrTmp.tmp;
8259             tl_assert(dst < (UInt)sb_in->tyenv->types_used);
8260             HowUsed hu = mce.tmpHowUsed ? mce.tmpHowUsed[dst]
8261                                         : HuOth/*we don't know, so play safe*/;
8262             assign( 'V', &mce, findShadowTmpV(&mce, st->Ist.WrTmp.tmp),
8263                                expr2vbits( &mce, st->Ist.WrTmp.data, hu ));
8264             break;
8265          }
8266
8267          case Ist_Put:
8268             do_shadow_PUT( &mce,
8269                            st->Ist.Put.offset,
8270                            st->Ist.Put.data,
8271                            NULL /* shadow atom */, NULL /* guard */ );
8272             break;
8273
8274          case Ist_PutI:
8275             do_shadow_PUTI( &mce, st->Ist.PutI.details);
8276             break;
8277
8278          case Ist_Store:
8279             do_shadow_Store( &mce, st->Ist.Store.end,
8280                                    st->Ist.Store.addr, 0/* addr bias */,
8281                                    st->Ist.Store.data,
8282                                    NULL /* shadow data */,
8283                                    NULL/*guard*/ );
8284             break;
8285
8286          case Ist_StoreG:
8287             do_shadow_StoreG( &mce, st->Ist.StoreG.details );
8288             break;
8289
8290          case Ist_LoadG:
8291             do_shadow_LoadG( &mce, st->Ist.LoadG.details );
8292             break;
8293
8294          case Ist_Exit:
8295             complainIfUndefined( &mce, st->Ist.Exit.guard, NULL );
8296             break;
8297
8298          case Ist_IMark:
8299             break;
8300
8301          case Ist_NoOp:
8302          case Ist_MBE:
8303             break;
8304
8305          case Ist_Dirty:
8306             do_shadow_Dirty( &mce, st->Ist.Dirty.details );
8307             break;
8308
8309          case Ist_AbiHint:
8310             do_AbiHint( &mce, st->Ist.AbiHint.base,
8311                               st->Ist.AbiHint.len,
8312                               st->Ist.AbiHint.nia );
8313             break;
8314
8315          case Ist_CAS:
8316             do_shadow_CAS( &mce, st->Ist.CAS.details );
8317             /* Note, do_shadow_CAS copies the CAS itself to the output
8318                block, because it needs to add instrumentation both
8319                before and after it.  Hence skip the copy below.  Also
8320                skip the origin-tracking stuff (call to schemeS) above,
8321                since that's all tangled up with it too; do_shadow_CAS
8322                does it all. */
8323             break;
8324
8325          case Ist_LLSC:
8326             do_shadow_LLSC( &mce,
8327                             st->Ist.LLSC.end,
8328                             st->Ist.LLSC.result,
8329                             st->Ist.LLSC.addr,
8330                             st->Ist.LLSC.storedata );
8331             break;
8332
8333          default:
8334             VG_(printf)("\n");
8335             ppIRStmt(st);
8336             VG_(printf)("\n");
8337             VG_(tool_panic)("memcheck: unhandled IRStmt");
8338
8339       } /* switch (st->tag) */
8340
8341       if (0 && verboze) {
8342          for (j = first_stmt; j < sb_out->stmts_used; j++) {
8343             VG_(printf)("   ");
8344             ppIRStmt(sb_out->stmts[j]);
8345             VG_(printf)("\n");
8346          }
8347          VG_(printf)("\n");
8348       }
8349
8350       /* ... and finally copy the stmt itself to the output.  Except,
8351          skip the copy of IRCASs; see comments on case Ist_CAS
8352          above. */
8353       if (st->tag != Ist_CAS)
8354          stmt('C', &mce, st);
8355    }
8356
8357    /* Now we need to complain if the jump target is undefined. */
8358    first_stmt = sb_out->stmts_used;
8359
8360    if (verboze) {
8361       VG_(printf)("sb_in->next = ");
8362       ppIRExpr(sb_in->next);
8363       VG_(printf)("\n\n");
8364    }
8365
8366    complainIfUndefined( &mce, sb_in->next, NULL );
8367
8368    if (0 && verboze) {
8369       for (j = first_stmt; j < sb_out->stmts_used; j++) {
8370          VG_(printf)("   ");
8371          ppIRStmt(sb_out->stmts[j]);
8372          VG_(printf)("\n");
8373       }
8374       VG_(printf)("\n");
8375    }
8376
8377    /* If this fails, there's been some serious snafu with tmp management,
8378       that should be investigated. */
8379    tl_assert( VG_(sizeXA)( mce.tmpMap ) == mce.sb->tyenv->types_used );
8380    VG_(deleteXA)( mce.tmpMap );
8381
8382    if (mce.tmpHowUsed) {
8383       VG_(free)( mce.tmpHowUsed );
8384    }
8385
8386    tl_assert(mce.sb == sb_out);
8387    return sb_out;
8388 }
8389
8390
8391 /*--------------------------------------------------------------------*/
8392 /*--- end                                           mc_translate.c ---*/
8393 /*--------------------------------------------------------------------*/