Add missing zstd.h to coregrind Makefile.am noinst_HEADERS
[valgrind.git] / memcheck / mc_translate.c
blob05e6d59afa516e3cf9774c1f529f05427ab1a366
2 /*--------------------------------------------------------------------*/
3 /*--- Instrument IR to perform memory checking operations. ---*/
4 /*--- mc_translate.c ---*/
5 /*--------------------------------------------------------------------*/
7 /*
8 This file is part of MemCheck, a heavyweight Valgrind tool for
9 detecting memory errors.
11 Copyright (C) 2000-2017 Julian Seward
12 jseward@acm.org
14 This program is free software; you can redistribute it and/or
15 modify it under the terms of the GNU General Public License as
16 published by the Free Software Foundation; either version 2 of the
17 License, or (at your option) any later version.
19 This program is distributed in the hope that it will be useful, but
20 WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 General Public License for more details.
24 You should have received a copy of the GNU General Public License
25 along with this program; if not, see <http://www.gnu.org/licenses/>.
27 The GNU General Public License is contained in the file COPYING.
30 #include "pub_tool_basics.h"
31 #include "pub_tool_poolalloc.h" // For mc_include.h
32 #include "pub_tool_hashtable.h" // For mc_include.h
33 #include "pub_tool_libcassert.h"
34 #include "pub_tool_libcprint.h"
35 #include "pub_tool_tooliface.h"
36 #include "pub_tool_machine.h" // VG_(fnptr_to_fnentry)
37 #include "pub_tool_xarray.h"
38 #include "pub_tool_mallocfree.h"
39 #include "pub_tool_libcbase.h"
41 #include "mc_include.h"
44 /* FIXMEs JRS 2011-June-16.
46 Check the interpretation for vector narrowing and widening ops,
47 particularly the saturating ones. I suspect they are either overly
48 pessimistic and/or wrong.
50 Iop_QandSQsh64x2 and friends (vector-by-vector bidirectional
51 saturating shifts): the interpretation is overly pessimistic.
52 See comments on the relevant cases below for details.
54 Iop_Sh64Sx2 and friends (vector-by-vector bidirectional shifts,
55 both rounding and non-rounding variants): ditto
58 /* This file implements the Memcheck instrumentation, and in
59 particular contains the core of its undefined value detection
60 machinery. For a comprehensive background of the terminology,
61 algorithms and rationale used herein, read:
63 Using Valgrind to detect undefined value errors with
64 bit-precision
66 Julian Seward and Nicholas Nethercote
68 2005 USENIX Annual Technical Conference (General Track),
69 Anaheim, CA, USA, April 10-15, 2005.
71 ----
73 Here is as good a place as any to record exactly when V bits are and
74 should be checked, why, and what function is responsible.
77 Memcheck complains when an undefined value is used:
79 1. In the condition of a conditional branch. Because it could cause
80 incorrect control flow, and thus cause incorrect externally-visible
81 behaviour. [mc_translate.c:complainIfUndefined]
83 2. As an argument to a system call, or as the value that specifies
84 the system call number. Because it could cause an incorrect
85 externally-visible side effect. [mc_translate.c:mc_pre_reg_read]
87 3. As the address in a load or store. Because it could cause an
88 incorrect value to be used later, which could cause externally-visible
89 behaviour (eg. via incorrect control flow or an incorrect system call
90 argument) [complainIfUndefined]
92 4. As the target address of a branch. Because it could cause incorrect
93 control flow. [complainIfUndefined]
95 5. As an argument to setenv, unsetenv, or putenv. Because it could put
96 an incorrect value into the external environment.
97 [mc_replace_strmem.c:VG_WRAP_FUNCTION_ZU(*, *env)]
99 6. As the index in a GETI or PUTI operation. I'm not sure why... (njn).
100 [complainIfUndefined]
102 7. As an argument to the VALGRIND_CHECK_MEM_IS_DEFINED and
103 VALGRIND_CHECK_VALUE_IS_DEFINED client requests. Because the user
104 requested it. [in memcheck.h]
107 Memcheck also complains, but should not, when an undefined value is used:
109 8. As the shift value in certain SIMD shift operations (but not in the
110 standard integer shift operations). This inconsistency is due to
111 historical reasons. [complainIfUndefined]
114 Memcheck does not complain, but should, when an undefined value is used:
116 9. As an input to a client request. Because the client request may
117 affect the visible behaviour -- see bug #144362 for an example
118 involving the malloc replacements in vg_replace_malloc.c and
119 VALGRIND_NON_SIMD_CALL* requests, where an uninitialised argument
120 isn't identified. That bug report also has some info on how to solve
121 the problem. [valgrind.h:VALGRIND_DO_CLIENT_REQUEST]
124 In practice, 1 and 2 account for the vast majority of cases.
127 /* Generation of addr-definedness, addr-validity and
128 guard-definedness checks pertaining to loads and stores (Iex_Load,
129 Ist_Store, IRLoadG, IRStoreG, LLSC, CAS and Dirty memory
130 loads/stores) was re-checked 11 May 2013. */
133 /*------------------------------------------------------------*/
134 /*--- Forward decls ---*/
135 /*------------------------------------------------------------*/
/* Forward declaration of the instrumentation environment, so that the
   prototypes that follow can take a pointer to it. */
struct _MCEnv;

/* How a value is used: unused (HuUnU), used only in a PCast-like way
   (HuPCa), or used in some other way (HuOth).  The enum is packed; a
   static assertion later in this file requires it to occupy one byte.
   See the longer comment further down for what this is for. */
typedef
   enum __attribute__((packed)) { HuUnU=0, HuPCa=1, HuOth=2 }
   HowUsed;
144 static IRType shadowTypeV ( IRType ty );
145 static IRExpr* expr2vbits ( struct _MCEnv* mce, IRExpr* e,
146 HowUsed hu/*use HuOth if unknown*/ );
147 static IRTemp findShadowTmpB ( struct _MCEnv* mce, IRTemp orig );
149 static IRExpr *i128_const_zero(void);
152 /*------------------------------------------------------------*/
153 /*--- Memcheck running state, and tmp management. ---*/
154 /*------------------------------------------------------------*/
/* For a few (maybe 1%) IROps, we have both a cheaper, less exact vbit
   propagation scheme, and a more expensive, more precise vbit propagation
   scheme.  This enum describes, for such an IROp, which scheme to use.
   The enumerators are consecutive, starting at 4. */
typedef
   enum {
      // Use the cheaper, less-exact variant.
      DLcheap=4,
      // Choose between cheap and expensive based on analysis of the block
      // to be instrumented.  Note that the choice may be done on a
      // per-instance basis of the IROp that this DetailLevel describes.
      DLauto,
      // Use the more expensive, more-exact variant.
      DLexpensive
   }
   DetailLevel;
173 /* A readonly part of the running state. For IROps that have both a
174 less-exact and more-exact interpretation, records which interpretation is
175 to be used. */
176 typedef
177 struct {
178 // For Add32/64 and Sub32/64, all 3 settings are allowed. For the
179 // DLauto case, a per-instance decision is to be made by inspecting
180 // the associated tmp's entry in MCEnv.tmpHowUsed.
181 DetailLevel dl_Add32;
182 DetailLevel dl_Add64;
183 DetailLevel dl_Sub32;
184 DetailLevel dl_Sub64;
185 // For Cmp{EQ,NE}{64,32,16,8}, only DLcheap and DLexpensive are
186 // allowed.
187 DetailLevel dl_CmpEQ64_CmpNE64;
188 DetailLevel dl_CmpEQ32_CmpNE32;
189 DetailLevel dl_CmpEQ16_CmpNE16;
190 DetailLevel dl_CmpEQ8_CmpNE8;
192 DetailLevelByOp;
194 static void DetailLevelByOp__set_all ( /*OUT*/DetailLevelByOp* dlbo,
195 DetailLevel dl )
197 dlbo->dl_Add32 = dl;
198 dlbo->dl_Add64 = dl;
199 dlbo->dl_Sub32 = dl;
200 dlbo->dl_Sub64 = dl;
201 dlbo->dl_CmpEQ64_CmpNE64 = dl;
202 dlbo->dl_CmpEQ32_CmpNE32 = dl;
203 dlbo->dl_CmpEQ16_CmpNE16 = dl;
204 dlbo->dl_CmpEQ8_CmpNE8 = dl;
207 static void DetailLevelByOp__check_sanity ( const DetailLevelByOp* dlbo )
209 tl_assert(dlbo->dl_Add32 >= DLcheap && dlbo->dl_Add32 <= DLexpensive);
210 tl_assert(dlbo->dl_Add64 >= DLcheap && dlbo->dl_Add64 <= DLexpensive);
211 tl_assert(dlbo->dl_Sub32 >= DLcheap && dlbo->dl_Sub32 <= DLexpensive);
212 tl_assert(dlbo->dl_Sub64 >= DLcheap && dlbo->dl_Sub64 <= DLexpensive);
213 tl_assert(dlbo->dl_CmpEQ64_CmpNE64 == DLcheap
214 || dlbo->dl_CmpEQ64_CmpNE64 == DLexpensive);
215 tl_assert(dlbo->dl_CmpEQ32_CmpNE32 == DLcheap
216 || dlbo->dl_CmpEQ32_CmpNE32 == DLexpensive);
217 tl_assert(dlbo->dl_CmpEQ16_CmpNE16 == DLcheap
218 || dlbo->dl_CmpEQ16_CmpNE16 == DLexpensive);
219 tl_assert(dlbo->dl_CmpEQ8_CmpNE8 == DLcheap
220 || dlbo->dl_CmpEQ8_CmpNE8 == DLexpensive);
223 static UInt DetailLevelByOp__count ( const DetailLevelByOp* dlbo,
224 DetailLevel dl )
226 UInt n = 0;
227 n += (dlbo->dl_Add32 == dl ? 1 : 0);
228 n += (dlbo->dl_Add64 == dl ? 1 : 0);
229 n += (dlbo->dl_Sub32 == dl ? 1 : 0);
230 n += (dlbo->dl_Sub64 == dl ? 1 : 0);
231 n += (dlbo->dl_CmpEQ64_CmpNE64 == dl ? 1 : 0);
232 n += (dlbo->dl_CmpEQ32_CmpNE32 == dl ? 1 : 0);
233 n += (dlbo->dl_CmpEQ16_CmpNE16 == dl ? 1 : 0);
234 n += (dlbo->dl_CmpEQ8_CmpNE8 == dl ? 1 : 0);
235 return n;
239 /* Carries info about a particular tmp. The tmp's number is not
240 recorded, as this is implied by (equal to) its index in the tmpMap
241 in MCEnv. The tmp's type is also not recorded, as this is present
242 in MCEnv.sb->tyenv.
244 When .kind is Orig, .shadowV and .shadowB may give the identities
245 of the temps currently holding the associated definedness (shadowV)
246 and origin (shadowB) values, or these may be IRTemp_INVALID if code
247 to compute such values has not yet been emitted.
249 When .kind is VSh or BSh then the tmp is holds a V- or B- value,
250 and so .shadowV and .shadowB must be IRTemp_INVALID, since it is
251 illogical for a shadow tmp itself to be shadowed.
253 typedef
254 enum { Orig=1, VSh=2, BSh=3 }
255 TempKind;
257 typedef
258 struct {
259 TempKind kind;
260 IRTemp shadowV;
261 IRTemp shadowB;
263 TempMapEnt;
266 /* A |HowUsed| value carries analysis results about how values are used,
267 pertaining to whether we need to instrument integer adds expensively or
268 not. The running state carries a (readonly) mapping from original tmp to
269 a HowUsed value for it. A usage value can be one of three values,
270 forming a 3-point chain lattice.
272 HuOth ("Other") used in some arbitrary way
274 HuPCa ("PCast") used *only* in effectively a PCast, in which all
275 | we care about is the all-defined vs not-all-defined distinction
277 HuUnU ("Unused") not used at all.
279 The "safe" (don't-know) end of the lattice is "HuOth". See comments
280 below in |preInstrumentationAnalysis| for further details.
282 /* DECLARED ABOVE:
283 typedef
284 enum __attribute__((packed)) { HuUnU=0, HuPCa=1, HuOth=2 }
285 HowUsed;
288 // Not actually necessary, but we don't want to waste D1 space.
289 STATIC_ASSERT(sizeof(HowUsed) == 1);
292 /* Carries around state during memcheck instrumentation. */
293 typedef
294 struct _MCEnv {
295 /* MODIFIED: the superblock being constructed. IRStmts are
296 added. */
297 IRSB* sb;
298 Bool trace;
300 /* MODIFIED: a table [0 .. #temps_in_sb-1] which gives the
301 current kind and possibly shadow temps for each temp in the
302 IRSB being constructed. Note that it does not contain the
303 type of each tmp. If you want to know the type, look at the
304 relevant entry in sb->tyenv. It follows that at all times
305 during the instrumentation process, the valid indices for
306 tmpMap and sb->tyenv are identical, being 0 .. N-1 where N is
307 total number of Orig, V- and B- temps allocated so far.
309 The reason for this strange split (types in one place, all
310 other info in another) is that we need the types to be
311 attached to sb so as to make it possible to do
312 "typeOfIRExpr(mce->bb->tyenv, ...)" at various places in the
313 instrumentation process. */
314 XArray* /* of TempMapEnt */ tmpMap;
316 /* READONLY: contains details of which ops should be expensively
317 instrumented. */
318 DetailLevelByOp dlbo;
320 /* READONLY: for each original tmp, how the tmp is used. This is
321 computed by |preInstrumentationAnalysis|. Valid indices are
322 0 .. #temps_in_sb-1 (same as for tmpMap). */
323 HowUsed* tmpHowUsed;
325 /* READONLY: the guest layout. This indicates which parts of
326 the guest state should be regarded as 'always defined'. */
327 const VexGuestLayout* layout;
329 /* READONLY: the host word type. Needed for constructing
330 arguments of type 'HWord' to be passed to helper functions.
331 Ity_I32 or Ity_I64 only. */
332 IRType hWordTy;
334 MCEnv;
/* SHADOW TMP MANAGEMENT.  Shadow tmps are allocated lazily (on
   demand), as they are encountered.  This is for two reasons.

   (1) (less important reason): Many original tmps are unused due to
   initial IR optimisation, and we do not want to waste space in tables
   tracking them.

   Shadow IRTemps are therefore allocated on demand.  mce.tmpMap is a
   table indexed [0 .. n_types-1], which gives the current shadow for
   each original tmp, or IRTemp_INVALID if none is so far assigned.
   It is necessary to support making multiple assignments to a shadow
   -- specifically, after testing a shadow for definedness, it needs
   to be made defined.  But IR's SSA property disallows this.

   (2) (more important reason): Therefore, when a shadow needs to get
   a new value, a new temporary is created, the value is assigned to
   that, and the tmpMap is updated to reflect the new binding.

   A corollary is that if the tmpMap maps a given tmp to
   IRTemp_INVALID and we are hoping to read that shadow tmp, it means
   there's a read-before-write error in the original tmps.  The IR
   sanity checker should catch all such anomalies, however.
*/
361 /* Create a new IRTemp of type 'ty' and kind 'kind', and add it to
362 both the table in mce->sb and to our auxiliary mapping. Note that
363 newTemp may cause mce->tmpMap to resize, hence previous results
364 from VG_(indexXA)(mce->tmpMap) are invalidated. */
365 static IRTemp newTemp ( MCEnv* mce, IRType ty, TempKind kind )
367 Word newIx;
368 TempMapEnt ent;
369 IRTemp tmp = newIRTemp(mce->sb->tyenv, ty);
370 ent.kind = kind;
371 ent.shadowV = IRTemp_INVALID;
372 ent.shadowB = IRTemp_INVALID;
373 newIx = VG_(addToXA)( mce->tmpMap, &ent );
374 tl_assert(newIx == (Word)tmp);
375 return tmp;
379 /* Find the tmp currently shadowing the given original tmp. If none
380 so far exists, allocate one. */
381 static IRTemp findShadowTmpV ( MCEnv* mce, IRTemp orig )
383 TempMapEnt* ent;
384 /* VG_(indexXA) range-checks 'orig', hence no need to check
385 here. */
386 ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
387 tl_assert(ent->kind == Orig);
388 if (ent->shadowV == IRTemp_INVALID) {
389 IRTemp tmpV
390 = newTemp( mce, shadowTypeV(mce->sb->tyenv->types[orig]), VSh );
391 /* newTemp may cause mce->tmpMap to resize, hence previous results
392 from VG_(indexXA) are invalid. */
393 ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
394 tl_assert(ent->kind == Orig);
395 tl_assert(ent->shadowV == IRTemp_INVALID);
396 ent->shadowV = tmpV;
398 return ent->shadowV;
401 /* Allocate a new shadow for the given original tmp. This means any
402 previous shadow is abandoned. This is needed because it is
403 necessary to give a new value to a shadow once it has been tested
404 for undefinedness, but unfortunately IR's SSA property disallows
405 this. Instead we must abandon the old shadow, allocate a new one
406 and use that instead.
408 This is the same as findShadowTmpV, except we don't bother to see
409 if a shadow temp already existed -- we simply allocate a new one
410 regardless. */
411 static void newShadowTmpV ( MCEnv* mce, IRTemp orig )
413 TempMapEnt* ent;
414 /* VG_(indexXA) range-checks 'orig', hence no need to check
415 here. */
416 ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
417 tl_assert(ent->kind == Orig);
418 if (1) {
419 IRTemp tmpV
420 = newTemp( mce, shadowTypeV(mce->sb->tyenv->types[orig]), VSh );
421 /* newTemp may cause mce->tmpMap to resize, hence previous results
422 from VG_(indexXA) are invalid. */
423 ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
424 tl_assert(ent->kind == Orig);
425 ent->shadowV = tmpV;
430 /*------------------------------------------------------------*/
431 /*--- IRAtoms -- a subset of IRExprs ---*/
432 /*------------------------------------------------------------*/
434 /* An atom is either an IRExpr_Const or an IRExpr_Tmp, as defined by
435 isIRAtom() in libvex_ir.h. Because this instrumenter expects flat
436 input, most of this code deals in atoms. Usefully, a value atom
437 always has a V-value which is also an atom: constants are shadowed
438 by constants, and temps are shadowed by the corresponding shadow
439 temporary. */
441 typedef IRExpr IRAtom;
443 /* (used for sanity checks only): is this an atom which looks
444 like it's from original code? */
445 static Bool isOriginalAtom ( MCEnv* mce, IRAtom* a1 )
447 if (a1->tag == Iex_Const)
448 return True;
449 if (a1->tag == Iex_RdTmp) {
450 TempMapEnt* ent = VG_(indexXA)( mce->tmpMap, a1->Iex.RdTmp.tmp );
451 return ent->kind == Orig;
453 return False;
456 /* (used for sanity checks only): is this an atom which looks
457 like it's from shadow code? */
458 static Bool isShadowAtom ( MCEnv* mce, IRAtom* a1 )
460 if (a1->tag == Iex_Const)
461 return True;
462 if (a1->tag == Iex_RdTmp) {
463 TempMapEnt* ent = VG_(indexXA)( mce->tmpMap, a1->Iex.RdTmp.tmp );
464 return ent->kind == VSh || ent->kind == BSh;
466 return False;
469 /* (used for sanity checks only): check that both args are atoms and
470 are identically-kinded. */
471 static Bool sameKindedAtoms ( IRAtom* a1, IRAtom* a2 )
473 if (a1->tag == Iex_RdTmp && a2->tag == Iex_RdTmp)
474 return True;
475 if (a1->tag == Iex_Const && a2->tag == Iex_Const)
476 return True;
477 return False;
481 /*------------------------------------------------------------*/
482 /*--- Type management ---*/
483 /*------------------------------------------------------------*/
485 /* Shadow state is always accessed using integer types. This returns
486 an integer type with the same size (as per sizeofIRType) as the
487 given type. The only valid shadow types are Bit, I8, I16, I32,
488 I64, I128, V128, V256. */
490 static IRType shadowTypeV ( IRType ty )
492 switch (ty) {
493 case Ity_I1:
494 case Ity_I8:
495 case Ity_I16:
496 case Ity_I32:
497 case Ity_I64:
498 case Ity_I128: return ty;
499 case Ity_F16: return Ity_I16;
500 case Ity_F32: return Ity_I32;
501 case Ity_D32: return Ity_I32;
502 case Ity_F64: return Ity_I64;
503 case Ity_D64: return Ity_I64;
504 case Ity_F128: return Ity_I128;
505 case Ity_D128: return Ity_I128;
506 case Ity_V128: return Ity_V128;
507 case Ity_V256: return Ity_V256;
508 default: ppIRType(ty);
509 VG_(tool_panic)("memcheck:shadowTypeV");
513 /* Produce a 'defined' value of the given shadow type. Should only be
514 supplied shadow types (Bit/I8/I16/I32/UI64). */
515 static IRExpr* definedOfType ( IRType ty ) {
516 switch (ty) {
517 case Ity_I1: return IRExpr_Const(IRConst_U1(False));
518 case Ity_I8: return IRExpr_Const(IRConst_U8(0));
519 case Ity_I16: return IRExpr_Const(IRConst_U16(0));
520 case Ity_I32: return IRExpr_Const(IRConst_U32(0));
521 case Ity_I64: return IRExpr_Const(IRConst_U64(0));
522 case Ity_I128: return i128_const_zero();
523 case Ity_V128: return IRExpr_Const(IRConst_V128(0x0000));
524 case Ity_V256: return IRExpr_Const(IRConst_V256(0x00000000));
525 default: VG_(tool_panic)("memcheck:definedOfType");
530 /*------------------------------------------------------------*/
531 /*--- Constructing IR fragments ---*/
532 /*------------------------------------------------------------*/
534 /* add stmt to a bb */
535 static inline void stmt ( HChar cat, MCEnv* mce, IRStmt* st ) {
536 if (mce->trace) {
537 VG_(printf)(" %c: ", cat);
538 ppIRStmt(st);
539 VG_(printf)("\n");
541 addStmtToIRSB(mce->sb, st);
544 /* assign value to tmp */
545 static inline
546 void assign ( HChar cat, MCEnv* mce, IRTemp tmp, IRExpr* expr ) {
547 stmt(cat, mce, IRStmt_WrTmp(tmp,expr));
/* build various kinds of expressions: shorthands for the IRExpr
   constructors for operator applications, constants of each width,
   and tmp reads. */
#define triop(_op, _arg1, _arg2, _arg3) \
                                 IRExpr_Triop((_op),(_arg1),(_arg2),(_arg3))
#define binop(_op, _arg1, _arg2) IRExpr_Binop((_op),(_arg1),(_arg2))
#define unop(_op, _arg)          IRExpr_Unop((_op),(_arg))
#define mkU1(_n)                 IRExpr_Const(IRConst_U1(_n))
#define mkU8(_n)                 IRExpr_Const(IRConst_U8(_n))
#define mkU16(_n)                IRExpr_Const(IRConst_U16(_n))
#define mkU32(_n)                IRExpr_Const(IRConst_U32(_n))
#define mkU64(_n)                IRExpr_Const(IRConst_U64(_n))
#define mkV128(_n)               IRExpr_Const(IRConst_V128(_n))
#define mkexpr(_tmp)             IRExpr_RdTmp((_tmp))
563 /* Bind the given expression to a new temporary, and return the
564 temporary. This effectively converts an arbitrary expression into
565 an atom.
567 'ty' is the type of 'e' and hence the type that the new temporary
568 needs to be. But passing it in is redundant, since we can deduce
569 the type merely by inspecting 'e'. So at least use that fact to
570 assert that the two types agree. */
571 static IRAtom* assignNew ( HChar cat, MCEnv* mce, IRType ty, IRExpr* e )
573 TempKind k;
574 IRTemp t;
575 IRType tyE = typeOfIRExpr(mce->sb->tyenv, e);
577 tl_assert(tyE == ty); /* so 'ty' is redundant (!) */
578 switch (cat) {
579 case 'V': k = VSh; break;
580 case 'B': k = BSh; break;
581 case 'C': k = Orig; break;
582 /* happens when we are making up new "orig"
583 expressions, for IRCAS handling */
584 default: tl_assert(0);
586 t = newTemp(mce, ty, k);
587 assign(cat, mce, t, e);
588 return mkexpr(t);
592 /*------------------------------------------------------------*/
593 /*--- Helper functions for 128-bit ops ---*/
594 /*------------------------------------------------------------*/
596 static IRExpr *i128_const_zero(void)
598 IRAtom* z64 = IRExpr_Const(IRConst_U64(0));
599 return binop(Iop_64HLto128, z64, z64);
602 /* There are no I128-bit loads and/or stores [as generated by any
603 current front ends]. So we do not need to worry about that in
604 expr2vbits_Load */
607 /*------------------------------------------------------------*/
608 /*--- Constructing definedness primitive ops ---*/
609 /*------------------------------------------------------------*/
611 /* --------- Defined-if-either-defined --------- */
613 static IRAtom* mkDifD1 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
614 tl_assert(isShadowAtom(mce,a1));
615 tl_assert(isShadowAtom(mce,a2));
616 return assignNew('V', mce, Ity_I1, binop(Iop_And1, a1, a2));
619 static IRAtom* mkDifD8 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
620 tl_assert(isShadowAtom(mce,a1));
621 tl_assert(isShadowAtom(mce,a2));
622 return assignNew('V', mce, Ity_I8, binop(Iop_And8, a1, a2));
625 static IRAtom* mkDifD16 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
626 tl_assert(isShadowAtom(mce,a1));
627 tl_assert(isShadowAtom(mce,a2));
628 return assignNew('V', mce, Ity_I16, binop(Iop_And16, a1, a2));
631 static IRAtom* mkDifD32 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
632 tl_assert(isShadowAtom(mce,a1));
633 tl_assert(isShadowAtom(mce,a2));
634 return assignNew('V', mce, Ity_I32, binop(Iop_And32, a1, a2));
637 static IRAtom* mkDifD64 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
638 tl_assert(isShadowAtom(mce,a1));
639 tl_assert(isShadowAtom(mce,a2));
640 return assignNew('V', mce, Ity_I64, binop(Iop_And64, a1, a2));
643 static IRAtom* mkDifDV128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
644 tl_assert(isShadowAtom(mce,a1));
645 tl_assert(isShadowAtom(mce,a2));
646 return assignNew('V', mce, Ity_V128, binop(Iop_AndV128, a1, a2));
649 static IRAtom* mkDifDV256 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
650 tl_assert(isShadowAtom(mce,a1));
651 tl_assert(isShadowAtom(mce,a2));
652 return assignNew('V', mce, Ity_V256, binop(Iop_AndV256, a1, a2));
655 /* --------- Undefined-if-either-undefined --------- */
657 static IRAtom* mkUifU1 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
658 tl_assert(isShadowAtom(mce,a1));
659 tl_assert(isShadowAtom(mce,a2));
660 return assignNew('V', mce, Ity_I1, binop(Iop_Or1, a1, a2));
663 static IRAtom* mkUifU8 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
664 tl_assert(isShadowAtom(mce,a1));
665 tl_assert(isShadowAtom(mce,a2));
666 return assignNew('V', mce, Ity_I8, binop(Iop_Or8, a1, a2));
669 static IRAtom* mkUifU16 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
670 tl_assert(isShadowAtom(mce,a1));
671 tl_assert(isShadowAtom(mce,a2));
672 return assignNew('V', mce, Ity_I16, binop(Iop_Or16, a1, a2));
675 static IRAtom* mkUifU32 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
676 tl_assert(isShadowAtom(mce,a1));
677 tl_assert(isShadowAtom(mce,a2));
678 return assignNew('V', mce, Ity_I32, binop(Iop_Or32, a1, a2));
681 static IRAtom* mkUifU64 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
682 tl_assert(isShadowAtom(mce,a1));
683 tl_assert(isShadowAtom(mce,a2));
684 return assignNew('V', mce, Ity_I64, binop(Iop_Or64, a1, a2));
687 static IRAtom* mkUifU128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
688 IRAtom *tmp1, *tmp2, *tmp3, *tmp4, *tmp5, *tmp6;
689 tl_assert(isShadowAtom(mce,a1));
690 tl_assert(isShadowAtom(mce,a2));
691 tmp1 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, a1));
692 tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, a1));
693 tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, a2));
694 tmp4 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, a2));
695 tmp5 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp1, tmp3));
696 tmp6 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp4));
698 return assignNew('V', mce, Ity_I128, binop(Iop_64HLto128, tmp6, tmp5));
701 static IRAtom* mkUifUV128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
702 tl_assert(isShadowAtom(mce,a1));
703 tl_assert(isShadowAtom(mce,a2));
704 return assignNew('V', mce, Ity_V128, binop(Iop_OrV128, a1, a2));
707 static IRAtom* mkUifUV256 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
708 tl_assert(isShadowAtom(mce,a1));
709 tl_assert(isShadowAtom(mce,a2));
710 return assignNew('V', mce, Ity_V256, binop(Iop_OrV256, a1, a2));
713 static IRAtom* mkUifU ( MCEnv* mce, IRType vty, IRAtom* a1, IRAtom* a2 ) {
714 switch (vty) {
715 case Ity_I8: return mkUifU8(mce, a1, a2);
716 case Ity_I16: return mkUifU16(mce, a1, a2);
717 case Ity_I32: return mkUifU32(mce, a1, a2);
718 case Ity_I64: return mkUifU64(mce, a1, a2);
719 case Ity_I128: return mkUifU128(mce, a1, a2);
720 case Ity_V128: return mkUifUV128(mce, a1, a2);
721 case Ity_V256: return mkUifUV256(mce, a1, a2);
722 default:
723 VG_(printf)("\n"); ppIRType(vty); VG_(printf)("\n");
724 VG_(tool_panic)("memcheck:mkUifU");
728 /* --------- The Left-family of operations. --------- */
730 static IRAtom* mkLeft8 ( MCEnv* mce, IRAtom* a1 ) {
731 tl_assert(isShadowAtom(mce,a1));
732 return assignNew('V', mce, Ity_I8, unop(Iop_Left8, a1));
735 static IRAtom* mkLeft16 ( MCEnv* mce, IRAtom* a1 ) {
736 tl_assert(isShadowAtom(mce,a1));
737 return assignNew('V', mce, Ity_I16, unop(Iop_Left16, a1));
740 static IRAtom* mkLeft32 ( MCEnv* mce, IRAtom* a1 ) {
741 tl_assert(isShadowAtom(mce,a1));
742 return assignNew('V', mce, Ity_I32, unop(Iop_Left32, a1));
745 static IRAtom* mkLeft64 ( MCEnv* mce, IRAtom* a1 ) {
746 tl_assert(isShadowAtom(mce,a1));
747 return assignNew('V', mce, Ity_I64, unop(Iop_Left64, a1));
750 /* --------- The Right-family of operations. --------- */
752 /* Unfortunately these are a lot more expensive then their Left
753 counterparts. Fortunately they are only very rarely used -- only for
754 count-leading-zeroes instrumentation. */
756 static IRAtom* mkRight32 ( MCEnv* mce, IRAtom* a1 )
758 for (Int i = 1; i <= 16; i *= 2) {
759 // a1 |= (a1 >>u i)
760 IRAtom* tmp
761 = assignNew('V', mce, Ity_I32, binop(Iop_Shr32, a1, mkU8(i)));
762 a1 = assignNew('V', mce, Ity_I32, binop(Iop_Or32, a1, tmp));
764 return a1;
767 static IRAtom* mkRight64 ( MCEnv* mce, IRAtom* a1 )
769 for (Int i = 1; i <= 32; i *= 2) {
770 // a1 |= (a1 >>u i)
771 IRAtom* tmp
772 = assignNew('V', mce, Ity_I64, binop(Iop_Shr64, a1, mkU8(i)));
773 a1 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, a1, tmp));
775 return a1;
778 /* --------- 'Improvement' functions for AND/OR. --------- */
780 /* ImproveAND(data, vbits) = data OR vbits. Defined (0) data 0s give
781 defined (0); all other -> undefined (1).
783 static IRAtom* mkImproveAND1 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
785 tl_assert(isOriginalAtom(mce, data));
786 tl_assert(isShadowAtom(mce, vbits));
787 tl_assert(sameKindedAtoms(data, vbits));
788 return assignNew('V', mce, Ity_I1, binop(Iop_Or1, data, vbits));
791 static IRAtom* mkImproveAND8 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
793 tl_assert(isOriginalAtom(mce, data));
794 tl_assert(isShadowAtom(mce, vbits));
795 tl_assert(sameKindedAtoms(data, vbits));
796 return assignNew('V', mce, Ity_I8, binop(Iop_Or8, data, vbits));
799 static IRAtom* mkImproveAND16 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
801 tl_assert(isOriginalAtom(mce, data));
802 tl_assert(isShadowAtom(mce, vbits));
803 tl_assert(sameKindedAtoms(data, vbits));
804 return assignNew('V', mce, Ity_I16, binop(Iop_Or16, data, vbits));
807 static IRAtom* mkImproveAND32 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
809 tl_assert(isOriginalAtom(mce, data));
810 tl_assert(isShadowAtom(mce, vbits));
811 tl_assert(sameKindedAtoms(data, vbits));
812 return assignNew('V', mce, Ity_I32, binop(Iop_Or32, data, vbits));
815 static IRAtom* mkImproveAND64 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
817 tl_assert(isOriginalAtom(mce, data));
818 tl_assert(isShadowAtom(mce, vbits));
819 tl_assert(sameKindedAtoms(data, vbits));
820 return assignNew('V', mce, Ity_I64, binop(Iop_Or64, data, vbits));
823 static IRAtom* mkImproveANDV128 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
825 tl_assert(isOriginalAtom(mce, data));
826 tl_assert(isShadowAtom(mce, vbits));
827 tl_assert(sameKindedAtoms(data, vbits));
828 return assignNew('V', mce, Ity_V128, binop(Iop_OrV128, data, vbits));
831 static IRAtom* mkImproveANDV256 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
833 tl_assert(isOriginalAtom(mce, data));
834 tl_assert(isShadowAtom(mce, vbits));
835 tl_assert(sameKindedAtoms(data, vbits));
836 return assignNew('V', mce, Ity_V256, binop(Iop_OrV256, data, vbits));
839 /* ImproveOR(data, vbits) = ~data OR vbits. Defined (0) data 1s give
840 defined (0); all other -> undefined (1).
842 static IRAtom* mkImproveOR1 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
844 tl_assert(isOriginalAtom(mce, data));
845 tl_assert(isShadowAtom(mce, vbits));
846 tl_assert(sameKindedAtoms(data, vbits));
847 return assignNew(
848 'V', mce, Ity_I1,
849 binop(Iop_Or1,
850 assignNew('V', mce, Ity_I1, unop(Iop_Not1, data)),
851 vbits) );
854 static IRAtom* mkImproveOR8 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
856 tl_assert(isOriginalAtom(mce, data));
857 tl_assert(isShadowAtom(mce, vbits));
858 tl_assert(sameKindedAtoms(data, vbits));
859 return assignNew(
860 'V', mce, Ity_I8,
861 binop(Iop_Or8,
862 assignNew('V', mce, Ity_I8, unop(Iop_Not8, data)),
863 vbits) );
866 static IRAtom* mkImproveOR16 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
868 tl_assert(isOriginalAtom(mce, data));
869 tl_assert(isShadowAtom(mce, vbits));
870 tl_assert(sameKindedAtoms(data, vbits));
871 return assignNew(
872 'V', mce, Ity_I16,
873 binop(Iop_Or16,
874 assignNew('V', mce, Ity_I16, unop(Iop_Not16, data)),
875 vbits) );
878 static IRAtom* mkImproveOR32 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
880 tl_assert(isOriginalAtom(mce, data));
881 tl_assert(isShadowAtom(mce, vbits));
882 tl_assert(sameKindedAtoms(data, vbits));
883 return assignNew(
884 'V', mce, Ity_I32,
885 binop(Iop_Or32,
886 assignNew('V', mce, Ity_I32, unop(Iop_Not32, data)),
887 vbits) );
890 static IRAtom* mkImproveOR64 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
892 tl_assert(isOriginalAtom(mce, data));
893 tl_assert(isShadowAtom(mce, vbits));
894 tl_assert(sameKindedAtoms(data, vbits));
895 return assignNew(
896 'V', mce, Ity_I64,
897 binop(Iop_Or64,
898 assignNew('V', mce, Ity_I64, unop(Iop_Not64, data)),
899 vbits) );
902 static IRAtom* mkImproveORV128 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
904 tl_assert(isOriginalAtom(mce, data));
905 tl_assert(isShadowAtom(mce, vbits));
906 tl_assert(sameKindedAtoms(data, vbits));
907 return assignNew(
908 'V', mce, Ity_V128,
909 binop(Iop_OrV128,
910 assignNew('V', mce, Ity_V128, unop(Iop_NotV128, data)),
911 vbits) );
914 static IRAtom* mkImproveORV256 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
916 tl_assert(isOriginalAtom(mce, data));
917 tl_assert(isShadowAtom(mce, vbits));
918 tl_assert(sameKindedAtoms(data, vbits));
919 return assignNew(
920 'V', mce, Ity_V256,
921 binop(Iop_OrV256,
922 assignNew('V', mce, Ity_V256, unop(Iop_NotV256, data)),
923 vbits) );
926 /* --------- Pessimising casts. --------- */
928 /* The function returns an expression of type DST_TY. If any of the VBITS
929 is undefined (value == 1) the resulting expression has all bits set to
930 1. Otherwise, all bits are 0. */
932 static IRAtom* mkPCastTo( MCEnv* mce, IRType dst_ty, IRAtom* vbits )
934 IRType src_ty;
935 IRAtom* tmp1;
937 /* Note, dst_ty is a shadow type, not an original type. */
938 tl_assert(isShadowAtom(mce,vbits));
939 src_ty = typeOfIRExpr(mce->sb->tyenv, vbits);
941 /* Fast-track some common cases */
942 if (src_ty == Ity_I32 && dst_ty == Ity_I32)
943 return assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
945 if (src_ty == Ity_I64 && dst_ty == Ity_I64)
946 return assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, vbits));
948 if (src_ty == Ity_I32 && dst_ty == Ity_I64) {
949 /* PCast the arg, then clone it. */
950 IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
951 return assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
954 if (src_ty == Ity_I32 && dst_ty == Ity_V128) {
955 /* PCast the arg, then clone it 4 times. */
956 IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
957 tmp = assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
958 return assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp, tmp));
961 if (src_ty == Ity_I32 && dst_ty == Ity_V256) {
962 /* PCast the arg, then clone it 8 times. */
963 IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
964 tmp = assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
965 tmp = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp, tmp));
966 return assignNew('V', mce, Ity_V256, binop(Iop_V128HLtoV256, tmp, tmp));
969 if (src_ty == Ity_I64 && dst_ty == Ity_I32) {
970 /* PCast the arg. This gives all 0s or all 1s. Then throw away
971 the top half. */
972 IRAtom* tmp = assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, vbits));
973 return assignNew('V', mce, Ity_I32, unop(Iop_64to32, tmp));
976 if (src_ty == Ity_V128 && dst_ty == Ity_I64) {
977 /* Use InterleaveHI64x2 to copy the top half of the vector into
978 the bottom half. Then we can UifU it with the original, throw
979 away the upper half of the result, and PCast-I64-to-I64
980 the lower half. */
981 // Generates vbits[127:64] : vbits[127:64]
982 IRAtom* hi64hi64
983 = assignNew('V', mce, Ity_V128,
984 binop(Iop_InterleaveHI64x2, vbits, vbits));
985 // Generates
986 // UifU(vbits[127:64],vbits[127:64]) : UifU(vbits[127:64],vbits[63:0])
987 // == vbits[127:64] : UifU(vbits[127:64],vbits[63:0])
988 IRAtom* lohi64
989 = mkUifUV128(mce, hi64hi64, vbits);
990 // Generates UifU(vbits[127:64],vbits[63:0])
991 IRAtom* lo64
992 = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, lohi64));
993 // Generates
994 // PCast-to-I64( UifU(vbits[127:64], vbits[63:0] )
995 // == PCast-to-I64( vbits[127:0] )
996 IRAtom* res
997 = assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, lo64));
998 return res;
1001 /* Else do it the slow way .. */
1002 /* First of all, collapse vbits down to a single bit. */
1003 tmp1 = NULL;
1004 switch (src_ty) {
1005 case Ity_I1:
1006 tmp1 = vbits;
1007 break;
1008 case Ity_I8:
1009 tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ8, vbits));
1010 break;
1011 case Ity_I16:
1012 tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ16, vbits));
1013 break;
1014 case Ity_I32:
1015 tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ32, vbits));
1016 break;
1017 case Ity_I64:
1018 tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ64, vbits));
1019 break;
1020 case Ity_I128: {
1021 /* Gah. Chop it in half, OR the halves together, and compare
1022 that with zero. */
1023 IRAtom* tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, vbits));
1024 IRAtom* tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, vbits));
1025 IRAtom* tmp4 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp3));
1026 tmp1 = assignNew('V', mce, Ity_I1,
1027 unop(Iop_CmpNEZ64, tmp4));
1028 break;
1030 case Ity_V128: {
1031 /* Chop it in half, OR the halves together, and compare that
1032 * with zero.
1034 IRAtom* tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_V128HIto64, vbits));
1035 IRAtom* tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vbits));
1036 IRAtom* tmp4 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp3));
1037 tmp1 = assignNew('V', mce, Ity_I1,
1038 unop(Iop_CmpNEZ64, tmp4));
1039 break;
1041 default:
1042 ppIRType(src_ty);
1043 VG_(tool_panic)("mkPCastTo(1)");
1045 tl_assert(tmp1);
1046 /* Now widen up to the dst type. */
1047 switch (dst_ty) {
1048 case Ity_I1:
1049 return tmp1;
1050 case Ity_I8:
1051 return assignNew('V', mce, Ity_I8, unop(Iop_1Sto8, tmp1));
1052 case Ity_I16:
1053 return assignNew('V', mce, Ity_I16, unop(Iop_1Sto16, tmp1));
1054 case Ity_I32:
1055 return assignNew('V', mce, Ity_I32, unop(Iop_1Sto32, tmp1));
1056 case Ity_I64:
1057 return assignNew('V', mce, Ity_I64, unop(Iop_1Sto64, tmp1));
1058 case Ity_V128:
1059 tmp1 = assignNew('V', mce, Ity_I64, unop(Iop_1Sto64, tmp1));
1060 tmp1 = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp1, tmp1));
1061 return tmp1;
1062 case Ity_I128:
1063 tmp1 = assignNew('V', mce, Ity_I64, unop(Iop_1Sto64, tmp1));
1064 tmp1 = assignNew('V', mce, Ity_I128, binop(Iop_64HLto128, tmp1, tmp1));
1065 return tmp1;
1066 case Ity_V256:
1067 tmp1 = assignNew('V', mce, Ity_I64, unop(Iop_1Sto64, tmp1));
1068 tmp1 = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128,
1069 tmp1, tmp1));
1070 tmp1 = assignNew('V', mce, Ity_V256, binop(Iop_V128HLtoV256,
1071 tmp1, tmp1));
1072 return tmp1;
1073 default:
1074 ppIRType(dst_ty);
1075 VG_(tool_panic)("mkPCastTo(2)");
1079 /* This is a minor variant. It takes an arg of some type and returns
1080 a value of the same type. The result consists entirely of Defined
1081 (zero) bits except its least significant bit, which is a PCast of
1082 the entire argument down to a single bit. */
1083 static IRAtom* mkPCastXXtoXXlsb ( MCEnv* mce, IRAtom* varg, IRType ty )
1085 if (ty == Ity_V128) {
1086 /* --- Case for V128 --- */
1087 IRAtom* varg128 = varg;
1088 // generates: PCast-to-I64(varg128)
1089 IRAtom* pcdTo64 = mkPCastTo(mce, Ity_I64, varg128);
1090 // Now introduce zeros (defined bits) in the top 63 places
1091 // generates: Def--(63)--Def PCast-to-I1(varg128)
1092 IRAtom* d63pc
1093 = assignNew('V', mce, Ity_I64, binop(Iop_And64, pcdTo64, mkU64(1)));
1094 // generates: Def--(64)--Def
1095 IRAtom* d64
1096 = definedOfType(Ity_I64);
1097 // generates: Def--(127)--Def PCast-to-I1(varg128)
1098 IRAtom* res
1099 = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, d64, d63pc));
1100 return res;
1102 if (ty == Ity_I64) {
1103 /* --- Case for I64 --- */
1104 // PCast to 64
1105 IRAtom* pcd = mkPCastTo(mce, Ity_I64, varg);
1106 // Zero (Def) out the top 63 bits
1107 IRAtom* res
1108 = assignNew('V', mce, Ity_I64, binop(Iop_And64, pcd, mkU64(1)));
1109 return res;
1111 /*NOTREACHED*/
1112 tl_assert(0);
1115 /* --------- Optimistic casts. --------- */
1117 /* The function takes and returns an expression of type TY. If any of the
1118 VBITS indicate defined (value == 0) the resulting expression has all bits
1119 set to 0. Otherwise, all bits are 1. In words, if any bits are defined
1120 then all bits are made to be defined.
1122 In short we compute (vbits - (vbits >>u 1)) >>s (bitsize(vbits)-1).
1124 static IRAtom* mkOCastAt( MCEnv* mce, IRType ty, IRAtom* vbits )
1126 IROp opSUB, opSHR, opSAR;
1127 UInt sh;
1129 switch (ty) {
1130 case Ity_I64:
1131 opSUB = Iop_Sub64; opSHR = Iop_Shr64; opSAR = Iop_Sar64; sh = 63;
1132 break;
1133 case Ity_I32:
1134 opSUB = Iop_Sub32; opSHR = Iop_Shr32; opSAR = Iop_Sar32; sh = 31;
1135 break;
1136 case Ity_I16:
1137 opSUB = Iop_Sub16; opSHR = Iop_Shr16; opSAR = Iop_Sar16; sh = 15;
1138 break;
1139 case Ity_I8:
1140 opSUB = Iop_Sub8; opSHR = Iop_Shr8; opSAR = Iop_Sar8; sh = 7;
1141 break;
1142 default:
1143 ppIRType(ty);
1144 VG_(tool_panic)("mkOCastTo");
1147 IRAtom *shr1, *at;
1148 shr1 = assignNew('V', mce,ty, binop(opSHR, vbits, mkU8(1)));
1149 at = assignNew('V', mce,ty, binop(opSUB, vbits, shr1));
1150 at = assignNew('V', mce,ty, binop(opSAR, at, mkU8(sh)));
1151 return at;
1155 /* --------- Accurate interpretation of CmpEQ/CmpNE. --------- */
1157 Normally, we can do CmpEQ/CmpNE by doing UifU on the arguments, and
1158 PCasting to Ity_I1. However, sometimes it is necessary to be more
1159 accurate. The insight is that the result is defined if two
1160 corresponding bits can be found, one from each argument, so that
1161 both bits are defined but are different -- that makes EQ say "No"
1162 and NE say "Yes". Hence, we compute an improvement term and DifD
1163 it onto the "normal" (UifU) result.
1165 The result is:
1167 PCastTo<1> (
1168 -- naive version
1169 UifU<sz>(vxx, vyy)
1171 `DifD<sz>`
1173 -- improvement term
1174 OCast<sz>(vec)
1177 where
1178 vec contains 0 (defined) bits where the corresponding arg bits
1179 are defined but different, and 1 bits otherwise.
1181 vec = Or<sz>( vxx, // 0 iff bit defined
1182 vyy, // 0 iff bit defined
1183 Not<sz>(Xor<sz>( xx, yy )) // 0 iff bits different
1186 If any bit of vec is 0, the result is defined and so the
1187 improvement term should produce 0...0, else it should produce
1188 1...1.
1190 Hence require for the improvement term:
1192 OCast(vec) = if vec == 1...1 then 1...1 else 0...0
1194 which you can think of as an "optimistic cast" (OCast), the opposite of
1195 the normal "pessimistic cast" (PCast) family. An OCast says all bits
1196 are defined if any bit is defined.
1198 It is possible to show that
1200 if vec == 1...1 then 1...1 else 0...0
1202 can be implemented in straight-line code as
1204 (vec - (vec >>u 1)) >>s (word-size-in-bits - 1)
1206 We note that vec contains the sub-term Or<sz>(vxx, vyy). Since UifU is
1207 implemented with Or (since 1 signifies undefinedness), this is a
1208 duplicate of the UifU<sz>(vxx, vyy) term and so we can CSE it out, giving
1209 a final version of:
1211 let naive = UifU<sz>(vxx, vyy)
1212 vec = Or<sz>(naive, Not<sz>(Xor<sz>(xx, yy)))
1214 PCastTo<1>( DifD<sz>(naive, OCast<sz>(vec)) )
1216 This was extensively re-analysed and checked on 6 July 05 and again
1217 in July 2017.
1219 static IRAtom* expensiveCmpEQorNE ( MCEnv* mce,
1220 IRType ty,
1221 IRAtom* vxx, IRAtom* vyy,
1222 IRAtom* xx, IRAtom* yy )
1224 IRAtom *naive, *vec, *improved, *final_cast;
1225 IROp opDIFD, opUIFU, opOR, opXOR, opNOT;
1227 tl_assert(isShadowAtom(mce,vxx));
1228 tl_assert(isShadowAtom(mce,vyy));
1229 tl_assert(isOriginalAtom(mce,xx));
1230 tl_assert(isOriginalAtom(mce,yy));
1231 tl_assert(sameKindedAtoms(vxx,xx));
1232 tl_assert(sameKindedAtoms(vyy,yy));
1234 switch (ty) {
1235 case Ity_I8:
1236 opDIFD = Iop_And8;
1237 opUIFU = Iop_Or8;
1238 opOR = Iop_Or8;
1239 opXOR = Iop_Xor8;
1240 opNOT = Iop_Not8;
1241 break;
1242 case Ity_I16:
1243 opDIFD = Iop_And16;
1244 opUIFU = Iop_Or16;
1245 opOR = Iop_Or16;
1246 opXOR = Iop_Xor16;
1247 opNOT = Iop_Not16;
1248 break;
1249 case Ity_I32:
1250 opDIFD = Iop_And32;
1251 opUIFU = Iop_Or32;
1252 opOR = Iop_Or32;
1253 opXOR = Iop_Xor32;
1254 opNOT = Iop_Not32;
1255 break;
1256 case Ity_I64:
1257 opDIFD = Iop_And64;
1258 opUIFU = Iop_Or64;
1259 opOR = Iop_Or64;
1260 opXOR = Iop_Xor64;
1261 opNOT = Iop_Not64;
1262 break;
1263 default:
1264 VG_(tool_panic)("expensiveCmpEQorNE");
1267 naive
1268 = assignNew('V', mce, ty, binop(opUIFU, vxx, vyy));
1270 vec
1271 = assignNew(
1272 'V', mce,ty,
1273 binop( opOR,
1274 naive,
1275 assignNew(
1276 'V', mce,ty,
1277 unop(opNOT,
1278 assignNew('V', mce,ty, binop(opXOR, xx, yy))))));
1280 improved
1281 = assignNew( 'V', mce,ty,
1282 binop(opDIFD, naive, mkOCastAt(mce, ty, vec)));
1284 final_cast
1285 = mkPCastTo( mce, Ity_I1, improved );
1287 return final_cast;
1290 /* Check if we can know, despite the uncertain bits, that xx is greater than yy.
1291 Notice that it's xx > yy and not the other way around. This is Intel syntax
1292 with destination first. It will appear reversed in gdb disassembly (AT&T
1293 syntax).
1295 static IRAtom* expensiveCmpGT ( MCEnv* mce,
1296 IROp opGT,
1297 IRAtom* vxx, IRAtom* vyy,
1298 IRAtom* xx, IRAtom* yy )
1300 IROp opAND, opOR, opXOR, opNOT, opSHL;
1301 IRType ty;
1302 unsigned int word_size;
1303 Bool is_signed;
1305 tl_assert(isShadowAtom(mce,vxx));
1306 tl_assert(isShadowAtom(mce,vyy));
1307 tl_assert(isOriginalAtom(mce,xx));
1308 tl_assert(isOriginalAtom(mce,yy));
1309 tl_assert(sameKindedAtoms(vxx,xx));
1310 tl_assert(sameKindedAtoms(vyy,yy));
1312 switch (opGT) {
1313 case Iop_CmpGT64Sx2:
1314 case Iop_CmpGT64Ux2:
1315 opSHL = Iop_ShlN64x2;
1316 word_size = 64;
1317 break;
1318 case Iop_CmpGT32Sx4:
1319 case Iop_CmpGT32Ux4:
1320 opSHL = Iop_ShlN32x4;
1321 word_size = 32;
1322 break;
1323 case Iop_CmpGT16Sx8:
1324 case Iop_CmpGT16Ux8:
1325 opSHL = Iop_ShlN16x8;
1326 word_size = 16;
1327 break;
1328 case Iop_CmpGT8Sx16:
1329 case Iop_CmpGT8Ux16:
1330 opSHL = Iop_ShlN8x16;
1331 word_size = 8;
1332 break;
1333 default:
1334 VG_(tool_panic)("expensiveCmpGT");
1337 switch (opGT) {
1338 case Iop_CmpGT64Sx2:
1339 case Iop_CmpGT32Sx4:
1340 case Iop_CmpGT16Sx8:
1341 case Iop_CmpGT8Sx16:
1342 is_signed = True;
1343 break;
1344 case Iop_CmpGT64Ux2:
1345 case Iop_CmpGT32Ux4:
1346 case Iop_CmpGT16Ux8:
1347 case Iop_CmpGT8Ux16:
1348 is_signed = False;
1349 break;
1350 default:
1351 VG_(tool_panic)("expensiveCmpGT");
1354 ty = Ity_V128;
1355 opAND = Iop_AndV128;
1356 opOR = Iop_OrV128;
1357 opXOR = Iop_XorV128;
1358 opNOT = Iop_NotV128;
1360 IRAtom *MSBs;
1361 if (is_signed) {
1362 // For unsigned it's easy to make the min and max: Just set the unknown
1363 // bits all to 0s or 1s. For signed it's harder because having a 1 in the
1364 // MSB makes a number smaller, not larger! We can work around this by
1365 // flipping the MSB before and after computing the min and max values.
1366 IRAtom *all_ones = mkV128(0xffff);
1367 MSBs = assignNew('V', mce, ty, binop(opSHL, all_ones, mkU8(word_size-1)));
1368 xx = assignNew('V', mce, ty, binop(opXOR, xx, MSBs));
1369 yy = assignNew('V', mce, ty, binop(opXOR, yy, MSBs));
1370 // From here on out, we're dealing with MSB-flipped integers.
1372 // We can combine xx and vxx to create two values: the largest that xx could
1373 // possibly be and the smallest that xx could possibly be. Likewise, we can
1374 // do the same for yy. We'll call those max_xx and min_xx and max_yy and
1375 // min_yy.
1376 IRAtom *not_vxx = assignNew('V', mce, ty, unop(opNOT, vxx));
1377 IRAtom *not_vyy = assignNew('V', mce, ty, unop(opNOT, vyy));
1378 IRAtom *max_xx = assignNew('V', mce, ty, binop(opOR, xx, vxx));
1379 IRAtom *min_xx = assignNew('V', mce, ty, binop(opAND, xx, not_vxx));
1380 IRAtom *max_yy = assignNew('V', mce, ty, binop(opOR, yy, vyy));
1381 IRAtom *min_yy = assignNew('V', mce, ty, binop(opAND, yy, not_vyy));
1382 if (is_signed) {
1383 // Unflip the MSBs.
1384 max_xx = assignNew('V', mce, ty, binop(opXOR, max_xx, MSBs));
1385 min_xx = assignNew('V', mce, ty, binop(opXOR, min_xx, MSBs));
1386 max_yy = assignNew('V', mce, ty, binop(opXOR, max_yy, MSBs));
1387 min_yy = assignNew('V', mce, ty, binop(opXOR, min_yy, MSBs));
1389 IRAtom *min_xx_gt_max_yy = assignNew('V', mce, ty, binop(opGT, min_xx, max_yy));
1390 IRAtom *max_xx_gt_min_yy = assignNew('V', mce, ty, binop(opGT, max_xx, min_yy));
1391 // If min_xx is greater than max_yy then xx is surely greater than yy so we know
1392 // our answer for sure. If max_xx is not greater than min_yy then xx can't
1393 // possible be greater than yy so again we know the answer for sure. For all
1394 // other cases, we can't know.
1396 // So the result is defined if:
1398 // min_xx_gt_max_yy | ~max_xx_gt_min_yy
1400 // Because defined in vbits is 0s and not 1s, we need to invert that:
1402 // ~(min_xx_gt_max_yy | ~max_xx_gt_min_yy)
1404 // We can use DeMorgan's Law to simplify the above:
1406 // ~min_xx_gt_max_yy & max_xx_gt_min_yy
1407 IRAtom *not_min_xx_gt_max_yy = assignNew('V', mce, ty, unop(opNOT, min_xx_gt_max_yy));
1408 return assignNew('V', mce, ty, binop(opAND, not_min_xx_gt_max_yy, max_xx_gt_min_yy));
1411 /* --------- Semi-accurate interpretation of CmpORD. --------- */
1413 /* CmpORD32{S,U} does PowerPC-style 3-way comparisons:
1415 CmpORD32S(x,y) = 1<<3 if x <s y
1416 = 1<<2 if x >s y
1417 = 1<<1 if x == y
1419 and similarly the unsigned variant. The default interpretation is:
1421 CmpORD32{S,U}#(x,y,x#,y#) = PCast(x# `UifU` y#)
1422 & (7<<1)
1424 The "& (7<<1)" reflects the fact that all result bits except 3,2,1
1425 are zero and therefore defined (viz, zero).
1427 Also deal with a special case better:
1429 CmpORD32S(x,0)
1431 Here, bit 3 (LT) of the result is a copy of the top bit of x and
1432 will be defined even if the rest of x isn't. In which case we do:
1434 CmpORD32S#(x,x#,0,{impliedly 0}#)
1435 = PCast(x#) & (3<<1) -- standard interp for GT#,EQ#
1436 | (x# >>u 31) << 3 -- LT# = x#[31]
1438 Analogous handling for CmpORD64{S,U}.
1440 static Bool isZeroU32 ( IRAtom* e )
1442 return
1443 toBool( e->tag == Iex_Const
1444 && e->Iex.Const.con->tag == Ico_U32
1445 && e->Iex.Const.con->Ico.U32 == 0 );
1448 static Bool isZeroU64 ( IRAtom* e )
1450 return
1451 toBool( e->tag == Iex_Const
1452 && e->Iex.Const.con->tag == Ico_U64
1453 && e->Iex.Const.con->Ico.U64 == 0 );
1456 static IRAtom* doCmpORD ( MCEnv* mce,
1457 IROp cmp_op,
1458 IRAtom* xxhash, IRAtom* yyhash,
1459 IRAtom* xx, IRAtom* yy )
1461 Bool m64 = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD64U;
1462 Bool syned = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD32S;
1463 IROp opOR = m64 ? Iop_Or64 : Iop_Or32;
1464 IROp opAND = m64 ? Iop_And64 : Iop_And32;
1465 IROp opSHL = m64 ? Iop_Shl64 : Iop_Shl32;
1466 IROp opSHR = m64 ? Iop_Shr64 : Iop_Shr32;
1467 IROp op1UtoWS = m64 ? Iop_1Uto64 : Iop_1Uto32;
1468 IRType ty = m64 ? Ity_I64 : Ity_I32;
1469 Int width = m64 ? 64 : 32;
1471 Bool (*isZero)(IRAtom*) = m64 ? isZeroU64 : isZeroU32;
1473 tl_assert(isShadowAtom(mce,xxhash));
1474 tl_assert(isShadowAtom(mce,yyhash));
1475 tl_assert(isOriginalAtom(mce,xx));
1476 tl_assert(isOriginalAtom(mce,yy));
1477 tl_assert(sameKindedAtoms(xxhash,xx));
1478 tl_assert(sameKindedAtoms(yyhash,yy));
1479 tl_assert(cmp_op == Iop_CmpORD32S || cmp_op == Iop_CmpORD32U
1480 || cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD64U);
1482 if (0) {
1483 ppIROp(cmp_op); VG_(printf)(" ");
1484 ppIRExpr(xx); VG_(printf)(" "); ppIRExpr( yy ); VG_(printf)("\n");
1487 if (syned && isZero(yy)) {
1488 /* fancy interpretation */
1489 /* if yy is zero, then it must be fully defined (zero#). */
1490 tl_assert(isZero(yyhash));
1491 // This is still inaccurate, but I don't think it matters, since
1492 // nobody writes code of the form
1493 // "is <partially-undefined-value> signedly greater than zero?".
1494 // We therefore simply declare "x >s 0" to be undefined if any bit in
1495 // x is undefined. That's clearly suboptimal in some cases. Eg, if
1496 // the highest order bit is a defined 1 then x is negative so it
1497 // doesn't matter whether the remaining bits are defined or not.
1498 IRAtom* t_0_gt_0_0
1499 = assignNew(
1500 'V', mce,ty,
1501 binop(
1502 opAND,
1503 mkPCastTo(mce,ty, xxhash),
1504 m64 ? mkU64(1<<2) : mkU32(1<<2)
1506 // For "x <s 0", we can just copy the definedness of the top bit of x
1507 // and we have a precise result.
1508 IRAtom* t_lt_0_0_0
1509 = assignNew(
1510 'V', mce,ty,
1511 binop(
1512 opSHL,
1513 assignNew(
1514 'V', mce,ty,
1515 binop(opSHR, xxhash, mkU8(width-1))),
1516 mkU8(3)
1518 // For "x == 0" we can hand the problem off to expensiveCmpEQorNE.
1519 IRAtom* t_0_0_eq_0
1520 = assignNew(
1521 'V', mce,ty,
1522 binop(
1523 opSHL,
1524 assignNew('V', mce,ty,
1525 unop(
1526 op1UtoWS,
1527 expensiveCmpEQorNE(mce, ty, xxhash, yyhash, xx, yy))
1529 mkU8(1)
1531 return
1532 binop(
1533 opOR,
1534 assignNew('V', mce,ty, binop(opOR, t_lt_0_0_0, t_0_gt_0_0)),
1535 t_0_0_eq_0
1537 } else {
1538 /* standard interpretation */
1539 IRAtom* sevenLeft1 = m64 ? mkU64(7<<1) : mkU32(7<<1);
1540 return
1541 binop(
1542 opAND,
1543 mkPCastTo( mce,ty,
1544 mkUifU(mce,ty, xxhash,yyhash)),
1545 sevenLeft1
1551 /*------------------------------------------------------------*/
1552 /*--- Emit a test and complaint if something is undefined. ---*/
1553 /*------------------------------------------------------------*/
1555 static IRAtom* schemeE ( MCEnv* mce, IRExpr* e ); /* fwds */
1558 /* Set the annotations on a dirty helper to indicate that the stack
1559 pointer and instruction pointers might be read. This is the
1560 behaviour of all 'emit-a-complaint' style functions we might
1561 call. */
1563 static void setHelperAnns ( MCEnv* mce, IRDirty* di ) {
1564 di->nFxState = 2;
1565 di->fxState[0].fx = Ifx_Read;
1566 di->fxState[0].offset = mce->layout->offset_SP;
1567 di->fxState[0].size = mce->layout->sizeof_SP;
1568 di->fxState[0].nRepeats = 0;
1569 di->fxState[0].repeatLen = 0;
1570 di->fxState[1].fx = Ifx_Read;
1571 di->fxState[1].offset = mce->layout->offset_IP;
1572 di->fxState[1].size = mce->layout->sizeof_IP;
1573 di->fxState[1].nRepeats = 0;
1574 di->fxState[1].repeatLen = 0;
1578 /* Check the supplied *original* |atom| for undefinedness, and emit a
1579 complaint if so. Once that happens, mark it as defined. This is
1580 possible because the atom is either a tmp or literal. If it's a
1581 tmp, it will be shadowed by a tmp, and so we can set the shadow to
1582 be defined. In fact as mentioned above, we will have to allocate a
1583 new tmp to carry the new 'defined' shadow value, and update the
1584 original->tmp mapping accordingly; we cannot simply assign a new
1585 value to an existing shadow tmp as this breaks SSAness.
1587 The checks are performed, any resulting complaint emitted, and
1588 |atom|'s shadow temp set to 'defined', ONLY in the case that
1589 |guard| evaluates to True at run-time. If it evaluates to False
1590 then no action is performed. If |guard| is NULL (the usual case)
1591 then it is assumed to be always-true, and hence these actions are
1592 performed unconditionally.
1594 This routine does not generate code to check the definedness of
1595 |guard|. The caller is assumed to have taken care of that already.
1597 static void complainIfUndefined ( MCEnv* mce, IRAtom* atom, IRExpr *guard )
1599 IRAtom* vatom;
1600 IRType ty;
1601 Int sz;
1602 IRDirty* di;
1603 IRAtom* cond;
1604 IRAtom* origin;
1605 void* fn;
1606 const HChar* nm;
1607 IRExpr** args;
1608 Int nargs;
1610 // Don't do V bit tests if we're not reporting undefined value errors.
1611 if (MC_(clo_mc_level) == 1)
1612 return;
1614 if (guard)
1615 tl_assert(isOriginalAtom(mce, guard));
1617 /* Since the original expression is atomic, there's no duplicated
1618 work generated by making multiple V-expressions for it. So we
1619 don't really care about the possibility that someone else may
1620 also create a V-interpretion for it. */
1621 tl_assert(isOriginalAtom(mce, atom));
1622 vatom = expr2vbits( mce, atom, HuOth );
1623 tl_assert(isShadowAtom(mce, vatom));
1624 tl_assert(sameKindedAtoms(atom, vatom));
1626 ty = typeOfIRExpr(mce->sb->tyenv, vatom);
1628 /* sz is only used for constructing the error message */
1629 sz = ty==Ity_I1 ? 0 : sizeofIRType(ty);
1631 cond = mkPCastTo( mce, Ity_I1, vatom );
1632 /* cond will be 0 if all defined, and 1 if any not defined. */
1634 /* Get the origin info for the value we are about to check. At
1635 least, if we are doing origin tracking. If not, use a dummy
1636 zero origin. */
1637 if (MC_(clo_mc_level) == 3) {
1638 origin = schemeE( mce, atom );
1639 if (mce->hWordTy == Ity_I64) {
1640 origin = assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, origin) );
1642 } else {
1643 origin = NULL;
1646 fn = NULL;
1647 nm = NULL;
1648 args = NULL;
1649 nargs = -1;
1651 switch (sz) {
1652 case 0:
1653 if (origin) {
1654 fn = &MC_(helperc_value_check0_fail_w_o);
1655 nm = "MC_(helperc_value_check0_fail_w_o)";
1656 args = mkIRExprVec_1(origin);
1657 nargs = 1;
1658 } else {
1659 fn = &MC_(helperc_value_check0_fail_no_o);
1660 nm = "MC_(helperc_value_check0_fail_no_o)";
1661 args = mkIRExprVec_0();
1662 nargs = 0;
1664 break;
1665 case 1:
1666 if (origin) {
1667 fn = &MC_(helperc_value_check1_fail_w_o);
1668 nm = "MC_(helperc_value_check1_fail_w_o)";
1669 args = mkIRExprVec_1(origin);
1670 nargs = 1;
1671 } else {
1672 fn = &MC_(helperc_value_check1_fail_no_o);
1673 nm = "MC_(helperc_value_check1_fail_no_o)";
1674 args = mkIRExprVec_0();
1675 nargs = 0;
1677 break;
1678 case 4:
1679 if (origin) {
1680 fn = &MC_(helperc_value_check4_fail_w_o);
1681 nm = "MC_(helperc_value_check4_fail_w_o)";
1682 args = mkIRExprVec_1(origin);
1683 nargs = 1;
1684 } else {
1685 fn = &MC_(helperc_value_check4_fail_no_o);
1686 nm = "MC_(helperc_value_check4_fail_no_o)";
1687 args = mkIRExprVec_0();
1688 nargs = 0;
1690 break;
1691 case 8:
1692 if (origin) {
1693 fn = &MC_(helperc_value_check8_fail_w_o);
1694 nm = "MC_(helperc_value_check8_fail_w_o)";
1695 args = mkIRExprVec_1(origin);
1696 nargs = 1;
1697 } else {
1698 fn = &MC_(helperc_value_check8_fail_no_o);
1699 nm = "MC_(helperc_value_check8_fail_no_o)";
1700 args = mkIRExprVec_0();
1701 nargs = 0;
1703 break;
1704 case 2:
1705 case 16:
1706 if (origin) {
1707 fn = &MC_(helperc_value_checkN_fail_w_o);
1708 nm = "MC_(helperc_value_checkN_fail_w_o)";
1709 args = mkIRExprVec_2( mkIRExpr_HWord( sz ), origin);
1710 nargs = 2;
1711 } else {
1712 fn = &MC_(helperc_value_checkN_fail_no_o);
1713 nm = "MC_(helperc_value_checkN_fail_no_o)";
1714 args = mkIRExprVec_1( mkIRExpr_HWord( sz ) );
1715 nargs = 1;
1717 break;
1718 default:
1719 VG_(tool_panic)("unexpected szB");
1722 tl_assert(fn);
1723 tl_assert(nm);
1724 tl_assert(args);
1725 tl_assert(nargs >= 0 && nargs <= 2);
1726 tl_assert( (MC_(clo_mc_level) == 3 && origin != NULL)
1727 || (MC_(clo_mc_level) == 2 && origin == NULL) );
1729 di = unsafeIRDirty_0_N( nargs/*regparms*/, nm,
1730 VG_(fnptr_to_fnentry)( fn ), args );
1731 di->guard = cond; // and cond is PCast-to-1(atom#)
1733 /* If the complaint is to be issued under a guard condition, AND
1734 that into the guard condition for the helper call. */
1735 if (guard) {
1736 IRAtom *g1 = assignNew('V', mce, Ity_I32, unop(Iop_1Uto32, di->guard));
1737 IRAtom *g2 = assignNew('V', mce, Ity_I32, unop(Iop_1Uto32, guard));
1738 IRAtom *e = assignNew('V', mce, Ity_I32, binop(Iop_And32, g1, g2));
1739 di->guard = assignNew('V', mce, Ity_I1, unop(Iop_32to1, e));
1742 setHelperAnns( mce, di );
1743 stmt( 'V', mce, IRStmt_Dirty(di));
1745 /* If |atom| is shadowed by an IRTemp, set the shadow tmp to be
1746 defined -- but only in the case where the guard evaluates to
1747 True at run-time. Do the update by setting the orig->shadow
1748 mapping for tmp to reflect the fact that this shadow is getting
1749 a new value. */
1750 tl_assert(isIRAtom(vatom));
1751 /* sameKindedAtoms ... */
1752 if (vatom->tag == Iex_RdTmp) {
1753 tl_assert(atom->tag == Iex_RdTmp);
1754 if (guard == NULL) {
1755 // guard is 'always True', hence update unconditionally
1756 newShadowTmpV(mce, atom->Iex.RdTmp.tmp);
1757 assign('V', mce, findShadowTmpV(mce, atom->Iex.RdTmp.tmp),
1758 definedOfType(ty));
1759 } else {
1760 // update the temp only conditionally. Do this by copying
1761 // its old value when the guard is False.
1762 // The old value ..
1763 IRTemp old_tmpV = findShadowTmpV(mce, atom->Iex.RdTmp.tmp);
1764 newShadowTmpV(mce, atom->Iex.RdTmp.tmp);
1765 IRAtom* new_tmpV
1766 = assignNew('V', mce, shadowTypeV(ty),
1767 IRExpr_ITE(guard, definedOfType(ty),
1768 mkexpr(old_tmpV)));
1769 assign('V', mce, findShadowTmpV(mce, atom->Iex.RdTmp.tmp), new_tmpV);
1775 /*------------------------------------------------------------*/
1776 /*--- Shadowing PUTs/GETs, and indexed variants thereof ---*/
1777 /*------------------------------------------------------------*/
1779 /* Examine the always-defined sections declared in layout to see if
1780 the (offset,size) section is within one. Note, is is an error to
1781 partially fall into such a region: (offset,size) should either be
1782 completely in such a region or completely not-in such a region.
1784 static Bool isAlwaysDefd ( MCEnv* mce, Int offset, Int size )
1786 Int minoffD, maxoffD, i;
1787 Int minoff = offset;
1788 Int maxoff = minoff + size - 1;
1789 tl_assert((minoff & ~0xFFFF) == 0);
1790 tl_assert((maxoff & ~0xFFFF) == 0);
1792 for (i = 0; i < mce->layout->n_alwaysDefd; i++) {
1793 minoffD = mce->layout->alwaysDefd[i].offset;
1794 maxoffD = minoffD + mce->layout->alwaysDefd[i].size - 1;
1795 tl_assert((minoffD & ~0xFFFF) == 0);
1796 tl_assert((maxoffD & ~0xFFFF) == 0);
1798 if (maxoff < minoffD || maxoffD < minoff)
1799 continue; /* no overlap */
1800 if (minoff >= minoffD && maxoff <= maxoffD)
1801 return True; /* completely contained in an always-defd section */
1803 VG_(tool_panic)("memcheck:isAlwaysDefd:partial overlap");
1805 return False; /* could not find any containing section */
1809 /* Generate into bb suitable actions to shadow this Put. If the state
1810 slice is marked 'always defined', do nothing. Otherwise, write the
1811 supplied V bits to the shadow state. We can pass in either an
1812 original atom or a V-atom, but not both. In the former case the
1813 relevant V-bits are then generated from the original.
1814 We assume here, that the definedness of GUARD has already been checked.
1816 static
1817 void do_shadow_PUT ( MCEnv* mce, Int offset,
1818 IRAtom* atom, IRAtom* vatom, IRExpr *guard )
1820 IRType ty;
1822 // Don't do shadow PUTs if we're not doing undefined value checking.
1823 // Their absence lets Vex's optimiser remove all the shadow computation
1824 // that they depend on, which includes GETs of the shadow registers.
1825 if (MC_(clo_mc_level) == 1)
1826 return;
1828 if (atom) {
1829 tl_assert(!vatom);
1830 tl_assert(isOriginalAtom(mce, atom));
1831 vatom = expr2vbits( mce, atom, HuOth );
1832 } else {
1833 tl_assert(vatom);
1834 tl_assert(isShadowAtom(mce, vatom));
1837 ty = typeOfIRExpr(mce->sb->tyenv, vatom);
1838 tl_assert(ty != Ity_I1);
1839 if (isAlwaysDefd(mce, offset, sizeofIRType(ty))) {
1840 /* later: no ... */
1841 /* emit code to emit a complaint if any of the vbits are 1. */
1842 /* complainIfUndefined(mce, atom); */
1843 } else {
1844 /* Do a plain shadow Put. */
1845 if (guard) {
1846 /* If the guard expression evaluates to false we simply Put the value
1847 that is already stored in the guest state slot */
1848 IRAtom *cond, *iffalse;
1850 cond = assignNew('V', mce, Ity_I1, guard);
1851 iffalse = assignNew('V', mce, ty,
1852 IRExpr_Get(offset + mce->layout->total_sizeB, ty));
1853 vatom = assignNew('V', mce, ty, IRExpr_ITE(cond, vatom, iffalse));
1855 stmt( 'V', mce, IRStmt_Put( offset + mce->layout->total_sizeB, vatom ));
1860 /* Return an expression which contains the V bits corresponding to the
1861 given GETI (passed in in pieces).
1863 static
1864 void do_shadow_PUTI ( MCEnv* mce, IRPutI *puti)
1866 IRAtom* vatom;
1867 IRType ty, tyS;
1868 Int arrSize;;
1869 IRRegArray* descr = puti->descr;
1870 IRAtom* ix = puti->ix;
1871 Int bias = puti->bias;
1872 IRAtom* atom = puti->data;
1874 // Don't do shadow PUTIs if we're not doing undefined value checking.
1875 // Their absence lets Vex's optimiser remove all the shadow computation
1876 // that they depend on, which includes GETIs of the shadow registers.
1877 if (MC_(clo_mc_level) == 1)
1878 return;
1880 tl_assert(isOriginalAtom(mce,atom));
1881 vatom = expr2vbits( mce, atom, HuOth );
1882 tl_assert(sameKindedAtoms(atom, vatom));
1883 ty = descr->elemTy;
1884 tyS = shadowTypeV(ty);
1885 arrSize = descr->nElems * sizeofIRType(ty);
1886 tl_assert(ty != Ity_I1);
1887 tl_assert(isOriginalAtom(mce,ix));
1888 complainIfUndefined(mce, ix, NULL);
1889 if (isAlwaysDefd(mce, descr->base, arrSize)) {
1890 /* later: no ... */
1891 /* emit code to emit a complaint if any of the vbits are 1. */
1892 /* complainIfUndefined(mce, atom); */
1893 } else {
1894 /* Do a cloned version of the Put that refers to the shadow
1895 area. */
1896 IRRegArray* new_descr
1897 = mkIRRegArray( descr->base + mce->layout->total_sizeB,
1898 tyS, descr->nElems);
1899 stmt( 'V', mce, IRStmt_PutI( mkIRPutI(new_descr, ix, bias, vatom) ));
1904 /* Return an expression which contains the V bits corresponding to the
1905 given GET (passed in in pieces).
1907 static
1908 IRExpr* shadow_GET ( MCEnv* mce, Int offset, IRType ty )
1910 IRType tyS = shadowTypeV(ty);
1911 tl_assert(ty != Ity_I1);
1912 tl_assert(ty != Ity_I128);
1913 if (isAlwaysDefd(mce, offset, sizeofIRType(ty))) {
1914 /* Always defined, return all zeroes of the relevant type */
1915 return definedOfType(tyS);
1916 } else {
1917 /* return a cloned version of the Get that refers to the shadow
1918 area. */
1919 /* FIXME: this isn't an atom! */
1920 return IRExpr_Get( offset + mce->layout->total_sizeB, tyS );
1925 /* Return an expression which contains the V bits corresponding to the
1926 given GETI (passed in in pieces).
1928 static
1929 IRExpr* shadow_GETI ( MCEnv* mce,
1930 IRRegArray* descr, IRAtom* ix, Int bias )
1932 IRType ty = descr->elemTy;
1933 IRType tyS = shadowTypeV(ty);
1934 Int arrSize = descr->nElems * sizeofIRType(ty);
1935 tl_assert(ty != Ity_I1);
1936 tl_assert(isOriginalAtom(mce,ix));
1937 complainIfUndefined(mce, ix, NULL);
1938 if (isAlwaysDefd(mce, descr->base, arrSize)) {
1939 /* Always defined, return all zeroes of the relevant type */
1940 return definedOfType(tyS);
1941 } else {
1942 /* return a cloned version of the Get that refers to the shadow
1943 area. */
1944 IRRegArray* new_descr
1945 = mkIRRegArray( descr->base + mce->layout->total_sizeB,
1946 tyS, descr->nElems);
1947 return IRExpr_GetI( new_descr, ix, bias );
1952 /*------------------------------------------------------------*/
1953 /*--- Generating approximations for unknown operations, ---*/
1954 /*--- using lazy-propagate semantics ---*/
1955 /*------------------------------------------------------------*/
1957 /* Lazy propagation of undefinedness from two values, resulting in the
1958 specified shadow type.
1960 static
1961 IRAtom* mkLazy2 ( MCEnv* mce, IRType finalVty, IRAtom* va1, IRAtom* va2 )
1963 IRAtom* at;
1964 IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
1965 IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
1966 tl_assert(isShadowAtom(mce,va1));
1967 tl_assert(isShadowAtom(mce,va2));
1969 /* The general case is inefficient because PCast is an expensive
1970 operation. Here are some special cases which use PCast only
1971 once rather than twice. */
1973 /* I64 x I64 -> I64 */
1974 if (t1 == Ity_I64 && t2 == Ity_I64 && finalVty == Ity_I64) {
1975 if (0) VG_(printf)("mkLazy2: I64 x I64 -> I64\n");
1976 at = mkUifU(mce, Ity_I64, va1, va2);
1977 at = mkPCastTo(mce, Ity_I64, at);
1978 return at;
1981 /* I64 x I64 -> I32 */
1982 if (t1 == Ity_I64 && t2 == Ity_I64 && finalVty == Ity_I32) {
1983 if (0) VG_(printf)("mkLazy2: I64 x I64 -> I32\n");
1984 at = mkUifU(mce, Ity_I64, va1, va2);
1985 at = mkPCastTo(mce, Ity_I32, at);
1986 return at;
1989 /* I32 x I32 -> I32 */
1990 if (t1 == Ity_I32 && t2 == Ity_I32 && finalVty == Ity_I32) {
1991 if (0) VG_(printf)("mkLazy2: I32 x I32 -> I32\n");
1992 at = mkUifU(mce, Ity_I32, va1, va2);
1993 at = mkPCastTo(mce, Ity_I32, at);
1994 return at;
1997 if (0) {
1998 VG_(printf)("mkLazy2 ");
1999 ppIRType(t1);
2000 VG_(printf)("_");
2001 ppIRType(t2);
2002 VG_(printf)("_");
2003 ppIRType(finalVty);
2004 VG_(printf)("\n");
2007 /* General case: force everything via 32-bit intermediaries. */
2008 at = mkPCastTo(mce, Ity_I32, va1);
2009 at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va2));
2010 at = mkPCastTo(mce, finalVty, at);
2011 return at;
2015 /* 3-arg version of the above. */
2016 static
2017 IRAtom* mkLazy3 ( MCEnv* mce, IRType finalVty,
2018 IRAtom* va1, IRAtom* va2, IRAtom* va3 )
2020 IRAtom* at;
2021 IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
2022 IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
2023 IRType t3 = typeOfIRExpr(mce->sb->tyenv, va3);
2024 tl_assert(isShadowAtom(mce,va1));
2025 tl_assert(isShadowAtom(mce,va2));
2026 tl_assert(isShadowAtom(mce,va3));
2028 /* The general case is inefficient because PCast is an expensive
2029 operation. Here are some special cases which use PCast only
2030 twice rather than three times. */
2032 /* I32 x I64 x I64 -> I64 */
2033 /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
2034 if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64
2035 && finalVty == Ity_I64) {
2036 if (0) VG_(printf)("mkLazy3: I32 x I64 x I64 -> I64\n");
2037 /* Widen 1st arg to I64. Since 1st arg is typically a rounding
2038 mode indication which is fully defined, this should get
2039 folded out later. */
2040 at = mkPCastTo(mce, Ity_I64, va1);
2041 /* Now fold in 2nd and 3rd args. */
2042 at = mkUifU(mce, Ity_I64, at, va2);
2043 at = mkUifU(mce, Ity_I64, at, va3);
2044 /* and PCast once again. */
2045 at = mkPCastTo(mce, Ity_I64, at);
2046 return at;
2049 /* I32 x I8 x I64 -> I64 */
2050 if (t1 == Ity_I32 && t2 == Ity_I8 && t3 == Ity_I64
2051 && finalVty == Ity_I64) {
2052 if (0) VG_(printf)("mkLazy3: I32 x I8 x I64 -> I64\n");
2053 /* Widen 1st and 2nd args to I64. Since 1st arg is typically a
2054 * rounding mode indication which is fully defined, this should
2055 * get folded out later.
2057 IRAtom* at1 = mkPCastTo(mce, Ity_I64, va1);
2058 IRAtom* at2 = mkPCastTo(mce, Ity_I64, va2);
2059 at = mkUifU(mce, Ity_I64, at1, at2); // UifU(PCast(va1), PCast(va2))
2060 at = mkUifU(mce, Ity_I64, at, va3);
2061 /* and PCast once again. */
2062 at = mkPCastTo(mce, Ity_I64, at);
2063 return at;
2066 /* I32 x I64 x I64 -> I32 */
2067 if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64
2068 && finalVty == Ity_I32) {
2069 if (0) VG_(printf)("mkLazy3: I32 x I64 x I64 -> I32\n");
2070 at = mkPCastTo(mce, Ity_I64, va1);
2071 at = mkUifU(mce, Ity_I64, at, va2);
2072 at = mkUifU(mce, Ity_I64, at, va3);
2073 at = mkPCastTo(mce, Ity_I32, at);
2074 return at;
2077 /* I32 x I32 x I32 -> I32 */
2078 /* 32-bit FP idiom, as (eg) happens on ARM */
2079 if (t1 == Ity_I32 && t2 == Ity_I32 && t3 == Ity_I32
2080 && finalVty == Ity_I32) {
2081 if (0) VG_(printf)("mkLazy3: I32 x I32 x I32 -> I32\n");
2082 at = va1;
2083 at = mkUifU(mce, Ity_I32, at, va2);
2084 at = mkUifU(mce, Ity_I32, at, va3);
2085 at = mkPCastTo(mce, Ity_I32, at);
2086 return at;
2089 /* I32 x I16 x I16 -> I16 */
2090 /* 16-bit half-precision FP idiom, as (eg) happens on arm64 v8.2 onwards */
2091 if (t1 == Ity_I32 && t2 == Ity_I16 && t3 == Ity_I16
2092 && finalVty == Ity_I16) {
2093 if (0) VG_(printf)("mkLazy3: I32 x I16 x I16 -> I16\n");
2094 at = mkPCastTo(mce, Ity_I16, va1);
2095 at = mkUifU(mce, Ity_I16, at, va2);
2096 at = mkUifU(mce, Ity_I16, at, va3);
2097 at = mkPCastTo(mce, Ity_I16, at);
2098 return at;
2101 /* I32 x I128 x I128 -> I128 */
2102 /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
2103 if (t1 == Ity_I32 && t2 == Ity_I128 && t3 == Ity_I128
2104 && finalVty == Ity_I128) {
2105 if (0) VG_(printf)("mkLazy3: I32 x I128 x I128 -> I128\n");
2106 /* Widen 1st arg to I128. Since 1st arg is typically a rounding
2107 mode indication which is fully defined, this should get
2108 folded out later. */
2109 at = mkPCastTo(mce, Ity_I128, va1);
2110 /* Now fold in 2nd and 3rd args. */
2111 at = mkUifU(mce, Ity_I128, at, va2);
2112 at = mkUifU(mce, Ity_I128, at, va3);
2113 /* and PCast once again. */
2114 at = mkPCastTo(mce, Ity_I128, at);
2115 return at;
2118 /* I32 x I8 x I128 -> I128 */
2119 /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
2120 if (t1 == Ity_I32 && t2 == Ity_I8 && t3 == Ity_I128
2121 && finalVty == Ity_I128) {
2122 if (0) VG_(printf)("mkLazy3: I32 x I8 x I128 -> I128\n");
2123 /* Use I64 as an intermediate type, which means PCasting all 3
2124 args to I64 to start with. 1st arg is typically a rounding
2125 mode indication which is fully defined, so we hope that it
2126 will get folded out later. */
2127 IRAtom* at1 = mkPCastTo(mce, Ity_I64, va1);
2128 IRAtom* at2 = mkPCastTo(mce, Ity_I64, va2);
2129 IRAtom* at3 = mkPCastTo(mce, Ity_I64, va3);
2130 /* Now UifU all three together. */
2131 at = mkUifU(mce, Ity_I64, at1, at2); // UifU(PCast(va1), PCast(va2))
2132 at = mkUifU(mce, Ity_I64, at, at3); // ... `UifU` PCast(va3)
2133 /* and PCast once again. */
2134 at = mkPCastTo(mce, Ity_I128, at);
2135 return at;
2137 if (1) {
2138 VG_(printf)("mkLazy3: ");
2139 ppIRType(t1);
2140 VG_(printf)(" x ");
2141 ppIRType(t2);
2142 VG_(printf)(" x ");
2143 ppIRType(t3);
2144 VG_(printf)(" -> ");
2145 ppIRType(finalVty);
2146 VG_(printf)("\n");
2149 tl_assert(0);
2150 /* General case: force everything via 32-bit intermediaries. */
2152 at = mkPCastTo(mce, Ity_I32, va1);
2153 at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va2));
2154 at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va3));
2155 at = mkPCastTo(mce, finalVty, at);
2156 return at;
2161 /* 4-arg version of the above. */
2162 static
2163 IRAtom* mkLazy4 ( MCEnv* mce, IRType finalVty,
2164 IRAtom* va1, IRAtom* va2, IRAtom* va3, IRAtom* va4 )
2166 IRAtom* at;
2167 IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
2168 IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
2169 IRType t3 = typeOfIRExpr(mce->sb->tyenv, va3);
2170 IRType t4 = typeOfIRExpr(mce->sb->tyenv, va4);
2171 tl_assert(isShadowAtom(mce,va1));
2172 tl_assert(isShadowAtom(mce,va2));
2173 tl_assert(isShadowAtom(mce,va3));
2174 tl_assert(isShadowAtom(mce,va4));
2176 /* The general case is inefficient because PCast is an expensive
2177 operation. Here are some special cases which use PCast only
2178 twice rather than three times. */
2180 /* Standard FP idiom: rm x FParg1 x FParg2 x FParg3 -> FPresult */
2182 if (t1 == Ity_I32 && t2 == Ity_I128 && t3 == Ity_I128 && t4 == Ity_I128
2183 && finalVty == Ity_I128) {
2184 if (0) VG_(printf)("mkLazy4: I32 x I128 x I128 x I128 -> I128\n");
2185 /* Widen 1st arg to I128. Since 1st arg is typically a rounding
2186 mode indication which is fully defined, this should get
2187 folded out later. */
2188 at = mkPCastTo(mce, Ity_I128, va1);
2189 /* Now fold in 2nd, 3rd, 4th args. */
2190 at = mkUifU(mce, Ity_I128, at, va2);
2191 at = mkUifU(mce, Ity_I128, at, va3);
2192 at = mkUifU(mce, Ity_I128, at, va4);
2193 /* and PCast once again. */
2194 at = mkPCastTo(mce, Ity_I128, at);
2195 return at;
2198 /* I32 x I64 x I64 x I64 -> I64 */
2199 if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64 && t4 == Ity_I64
2200 && finalVty == Ity_I64) {
2201 if (0) VG_(printf)("mkLazy4: I32 x I64 x I64 x I64 -> I64\n");
2202 /* Widen 1st arg to I64. Since 1st arg is typically a rounding
2203 mode indication which is fully defined, this should get
2204 folded out later. */
2205 at = mkPCastTo(mce, Ity_I64, va1);
2206 /* Now fold in 2nd, 3rd, 4th args. */
2207 at = mkUifU(mce, Ity_I64, at, va2);
2208 at = mkUifU(mce, Ity_I64, at, va3);
2209 at = mkUifU(mce, Ity_I64, at, va4);
2210 /* and PCast once again. */
2211 at = mkPCastTo(mce, Ity_I64, at);
2212 return at;
2214 /* I32 x I32 x I32 x I32 -> I32 */
2215 /* Standard FP idiom: rm x FParg1 x FParg2 x FParg3 -> FPresult */
2216 if (t1 == Ity_I32 && t2 == Ity_I32 && t3 == Ity_I32 && t4 == Ity_I32
2217 && finalVty == Ity_I32) {
2218 if (0) VG_(printf)("mkLazy4: I32 x I32 x I32 x I32 -> I32\n");
2219 at = va1;
2220 /* Now fold in 2nd, 3rd, 4th args. */
2221 at = mkUifU(mce, Ity_I32, at, va2);
2222 at = mkUifU(mce, Ity_I32, at, va3);
2223 at = mkUifU(mce, Ity_I32, at, va4);
2224 at = mkPCastTo(mce, Ity_I32, at);
2225 return at;
2228 if (t1 == Ity_I32 && t2 == Ity_I8 && t3 == Ity_I8 && t4 == Ity_I8
2229 && finalVty == Ity_I32) {
2230 if (0) VG_(printf)("mkLazy4: I32 x I8 x I8 x I8 -> I32\n");
2231 at = mkPCastTo(mce, Ity_I8, va1);
2232 /* Now fold in 2nd, 3rd, 4th args. */
2233 at = mkUifU(mce, Ity_I8, at, va2);
2234 at = mkUifU(mce, Ity_I8, at, va3);
2235 at = mkUifU(mce, Ity_I8, at, va4);
2236 at = mkPCastTo(mce, Ity_I32, at);
2237 return at;
2240 if (t1 == Ity_I64 && t2 == Ity_I8 && t3 == Ity_I8 && t4 == Ity_I8
2241 && finalVty == Ity_I64) {
2242 if (0) VG_(printf)("mkLazy4: I64 x I8 x I8 x I8 -> I64\n");
2243 at = mkPCastTo(mce, Ity_I8, va1);
2244 /* Now fold in 2nd, 3rd, 4th args. */
2245 at = mkUifU(mce, Ity_I8, at, va2);
2246 at = mkUifU(mce, Ity_I8, at, va3);
2247 at = mkUifU(mce, Ity_I8, at, va4);
2248 at = mkPCastTo(mce, Ity_I64, at);
2249 return at;
2252 if (1) {
2253 VG_(printf)("mkLazy4: ");
2254 ppIRType(t1);
2255 VG_(printf)(" x ");
2256 ppIRType(t2);
2257 VG_(printf)(" x ");
2258 ppIRType(t3);
2259 VG_(printf)(" x ");
2260 ppIRType(t4);
2261 VG_(printf)(" -> ");
2262 ppIRType(finalVty);
2263 VG_(printf)("\n");
2266 tl_assert(0);
2270 /* Do the lazy propagation game from a null-terminated vector of
2271 atoms. This is presumably the arguments to a helper call, so the
2272 IRCallee info is also supplied in order that we can know which
2273 arguments should be ignored (via the .mcx_mask field).
2275 static
2276 IRAtom* mkLazyN ( MCEnv* mce,
2277 IRAtom** exprvec, IRType finalVtype, IRCallee* cee )
2279 Int i;
2280 IRAtom* here;
2281 IRAtom* curr;
2282 IRType mergeTy;
2283 Bool mergeTy64 = True;
2285 /* Decide on the type of the merge intermediary. If all relevant
2286 args are I64, then it's I64. In all other circumstances, use
2287 I32. */
2288 for (i = 0; exprvec[i]; i++) {
2289 tl_assert(i < 32);
2290 tl_assert(isOriginalAtom(mce, exprvec[i]));
2291 if (cee->mcx_mask & (1<<i))
2292 continue;
2293 if (typeOfIRExpr(mce->sb->tyenv, exprvec[i]) != Ity_I64)
2294 mergeTy64 = False;
2297 mergeTy = mergeTy64 ? Ity_I64 : Ity_I32;
2298 curr = definedOfType(mergeTy);
2300 for (i = 0; exprvec[i]; i++) {
2301 tl_assert(i < 32);
2302 tl_assert(isOriginalAtom(mce, exprvec[i]));
2303 /* Only take notice of this arg if the callee's mc-exclusion
2304 mask does not say it is to be excluded. */
2305 if (cee->mcx_mask & (1<<i)) {
2306 /* the arg is to be excluded from definedness checking. Do
2307 nothing. */
2308 if (0) VG_(printf)("excluding %s(%d)\n", cee->name, i);
2309 } else {
2310 /* calculate the arg's definedness, and pessimistically merge
2311 it in. */
2312 here = mkPCastTo( mce, mergeTy, expr2vbits(mce, exprvec[i], HuOth) );
2313 curr = mergeTy64
2314 ? mkUifU64(mce, here, curr)
2315 : mkUifU32(mce, here, curr);
2318 return mkPCastTo(mce, finalVtype, curr );
2322 /*------------------------------------------------------------*/
2323 /*--- Generating expensive sequences for exact carry-chain ---*/
2324 /*--- propagation in add/sub and related operations. ---*/
2325 /*------------------------------------------------------------*/
2327 static
2328 IRAtom* expensiveAddSub ( MCEnv* mce,
2329 Bool add,
2330 IRType ty,
2331 IRAtom* qaa, IRAtom* qbb,
2332 IRAtom* aa, IRAtom* bb )
2334 IRAtom *a_min, *b_min, *a_max, *b_max;
2335 IROp opAND, opOR, opXOR, opNOT, opADD, opSUB;
2337 tl_assert(isShadowAtom(mce,qaa));
2338 tl_assert(isShadowAtom(mce,qbb));
2339 tl_assert(isOriginalAtom(mce,aa));
2340 tl_assert(isOriginalAtom(mce,bb));
2341 tl_assert(sameKindedAtoms(qaa,aa));
2342 tl_assert(sameKindedAtoms(qbb,bb));
2344 switch (ty) {
2345 case Ity_I32:
2346 opAND = Iop_And32;
2347 opOR = Iop_Or32;
2348 opXOR = Iop_Xor32;
2349 opNOT = Iop_Not32;
2350 opADD = Iop_Add32;
2351 opSUB = Iop_Sub32;
2352 break;
2353 case Ity_I64:
2354 opAND = Iop_And64;
2355 opOR = Iop_Or64;
2356 opXOR = Iop_Xor64;
2357 opNOT = Iop_Not64;
2358 opADD = Iop_Add64;
2359 opSUB = Iop_Sub64;
2360 break;
2361 default:
2362 VG_(tool_panic)("expensiveAddSub");
2365 // a_min = aa & ~qaa
2366 a_min = assignNew('V', mce,ty,
2367 binop(opAND, aa,
2368 assignNew('V', mce,ty, unop(opNOT, qaa))));
2370 // b_min = bb & ~qbb
2371 b_min = assignNew('V', mce,ty,
2372 binop(opAND, bb,
2373 assignNew('V', mce,ty, unop(opNOT, qbb))));
2375 // a_max = aa | qaa
2376 a_max = assignNew('V', mce,ty, binop(opOR, aa, qaa));
2378 // b_max = bb | qbb
2379 b_max = assignNew('V', mce,ty, binop(opOR, bb, qbb));
2381 if (add) {
2382 // result = (qaa | qbb) | ((a_min + b_min) ^ (a_max + b_max))
2383 return
2384 assignNew('V', mce,ty,
2385 binop( opOR,
2386 assignNew('V', mce,ty, binop(opOR, qaa, qbb)),
2387 assignNew('V', mce,ty,
2388 binop( opXOR,
2389 assignNew('V', mce,ty, binop(opADD, a_min, b_min)),
2390 assignNew('V', mce,ty, binop(opADD, a_max, b_max))
2395 } else {
2396 // result = (qaa | qbb) | ((a_min - b_max) ^ (a_max - b_min))
2397 return
2398 assignNew('V', mce,ty,
2399 binop( opOR,
2400 assignNew('V', mce,ty, binop(opOR, qaa, qbb)),
2401 assignNew('V', mce,ty,
2402 binop( opXOR,
2403 assignNew('V', mce,ty, binop(opSUB, a_min, b_max)),
2404 assignNew('V', mce,ty, binop(opSUB, a_max, b_min))
2414 static
2415 IRAtom* expensiveCountTrailingZeroes ( MCEnv* mce, IROp czop,
2416 IRAtom* atom, IRAtom* vatom )
2418 IRType ty;
2419 IROp xorOp, subOp, andOp;
2420 IRExpr *one;
2421 IRAtom *improver, *improved;
2422 tl_assert(isShadowAtom(mce,vatom));
2423 tl_assert(isOriginalAtom(mce,atom));
2424 tl_assert(sameKindedAtoms(atom,vatom));
2426 switch (czop) {
2427 case Iop_Ctz32: case Iop_CtzNat32:
2428 ty = Ity_I32;
2429 xorOp = Iop_Xor32;
2430 subOp = Iop_Sub32;
2431 andOp = Iop_And32;
2432 one = mkU32(1);
2433 break;
2434 case Iop_Ctz64: case Iop_CtzNat64:
2435 ty = Ity_I64;
2436 xorOp = Iop_Xor64;
2437 subOp = Iop_Sub64;
2438 andOp = Iop_And64;
2439 one = mkU64(1);
2440 break;
2441 default:
2442 ppIROp(czop);
2443 VG_(tool_panic)("memcheck:expensiveCountTrailingZeroes");
2446 // improver = atom ^ (atom - 1)
2448 // That is, improver has its low ctz(atom)+1 bits equal to one;
2449 // higher bits (if any) equal to zero. So it's exactly the right
2450 // mask to use to remove the irrelevant undefined input bits.
2451 /* Here are some examples:
2452 atom = U...U 1 0...0
2453 atom-1 = U...U 0 1...1
2454 ^ed = 0...0 1 11111, which correctly describes which bits of |atom|
2455 actually influence the result
2456 A boundary case
2457 atom = 0...0
2458 atom-1 = 1...1
2459 ^ed = 11111, also a correct mask for the input: all input bits
2460 are relevant
2461 Another boundary case
2462 atom = 1..1 1
2463 atom-1 = 1..1 0
2464 ^ed = 0..0 1, also a correct mask: only the rightmost input bit
2465 is relevant
2466 Now with misc U bits interspersed:
2467 atom = U...U 1 0 U...U 0 1 0...0
2468 atom-1 = U...U 1 0 U...U 0 0 1...1
2469 ^ed = 0...0 0 0 0...0 0 1 1...1, also correct
2470 (Per re-check/analysis of 14 Nov 2018)
2472 improver = assignNew('V', mce,ty,
2473 binop(xorOp,
2474 atom,
2475 assignNew('V', mce, ty,
2476 binop(subOp, atom, one))));
2478 // improved = vatom & improver
2480 // That is, treat any V bits to the left of the rightmost ctz(atom)+1
2481 // bits as "defined".
2482 improved = assignNew('V', mce, ty,
2483 binop(andOp, vatom, improver));
2485 // Return pessimizing cast of improved.
2486 return mkPCastTo(mce, ty, improved);
2489 static
2490 IRAtom* expensiveCountLeadingZeroes ( MCEnv* mce, IROp czop,
2491 IRAtom* atom, IRAtom* vatom )
2493 IRType ty;
2494 IROp shrOp, notOp, andOp;
2495 IRAtom* (*mkRight)(MCEnv*, IRAtom*);
2496 IRAtom *improver, *improved;
2497 tl_assert(isShadowAtom(mce,vatom));
2498 tl_assert(isOriginalAtom(mce,atom));
2499 tl_assert(sameKindedAtoms(atom,vatom));
2501 switch (czop) {
2502 case Iop_Clz32: case Iop_ClzNat32:
2503 ty = Ity_I32;
2504 shrOp = Iop_Shr32;
2505 notOp = Iop_Not32;
2506 andOp = Iop_And32;
2507 mkRight = mkRight32;
2508 break;
2509 case Iop_Clz64: case Iop_ClzNat64:
2510 ty = Ity_I64;
2511 shrOp = Iop_Shr64;
2512 notOp = Iop_Not64;
2513 andOp = Iop_And64;
2514 mkRight = mkRight64;
2515 break;
2516 default:
2517 ppIROp(czop);
2518 VG_(tool_panic)("memcheck:expensiveCountLeadingZeroes");
2521 // This is in principle very similar to how expensiveCountTrailingZeroes
2522 // works. That function computed an "improver", which it used to mask
2523 // off all but the rightmost 1-bit and the zeroes to the right of it,
2524 // hence removing irrelevant bits from the input. Here, we play the
2525 // exact same game but with the left-vs-right roles interchanged.
2526 // Unfortunately calculation of the improver in this case is
2527 // significantly more expensive.
2529 // improver = ~(RIGHT(atom) >>u 1)
2531 // That is, improver has its upper clz(atom)+1 bits equal to one;
2532 // lower bits (if any) equal to zero. So it's exactly the right
2533 // mask to use to remove the irrelevant undefined input bits.
2534 /* Here are some examples:
2535 atom = 0...0 1 U...U
2536 R(atom) = 0...0 1 1...1
2537 R(atom) >>u 1 = 0...0 0 1...1
2538 ~(R(atom) >>u 1) = 1...1 1 0...0
2539 which correctly describes which bits of |atom|
2540 actually influence the result
2541 A boundary case
2542 atom = 0...0
2543 R(atom) = 0...0
2544 R(atom) >>u 1 = 0...0
2545 ~(R(atom) >>u 1) = 1...1
2546 also a correct mask for the input: all input bits
2547 are relevant
2548 Another boundary case
2549 atom = 1 1..1
2550 R(atom) = 1 1..1
2551 R(atom) >>u 1 = 0 1..1
2552 ~(R(atom) >>u 1) = 1 0..0
2553 also a correct mask: only the leftmost input bit
2554 is relevant
2555 Now with misc U bits interspersed:
2556 atom = 0...0 1 U...U 0 1 U...U
2557 R(atom) = 0...0 1 1...1 1 1 1...1
2558 R(atom) >>u 1 = 0...0 0 1...1 1 1 1...1
2559 ~(R(atom) >>u 1) = 1...1 1 0...0 0 0 0...0, also correct
2560 (Per initial implementation of 15 Nov 2018)
2562 improver = mkRight(mce, atom);
2563 improver = assignNew('V', mce, ty, binop(shrOp, improver, mkU8(1)));
2564 improver = assignNew('V', mce, ty, unop(notOp, improver));
2566 // improved = vatom & improver
2568 // That is, treat any V bits to the right of the leftmost clz(atom)+1
2569 // bits as "defined".
2570 improved = assignNew('V', mce, ty,
2571 binop(andOp, vatom, improver));
2573 // Return pessimizing cast of improved.
2574 return mkPCastTo(mce, ty, improved);
2578 /*------------------------------------------------------------*/
2579 /*--- Scalar shifts. ---*/
2580 /*------------------------------------------------------------*/
2582 /* Produce an interpretation for (aa << bb) (or >>s, >>u). The basic
2583 idea is to shift the definedness bits by the original shift amount.
2584 This introduces 0s ("defined") in new positions for left shifts and
2585 unsigned right shifts, and copies the top definedness bit for
2586 signed right shifts. So, conveniently, applying the original shift
2587 operator to the definedness bits for the left arg is exactly the
2588 right thing to do:
2590 (qaa << bb)
2592 However if the shift amount is undefined then the whole result
2593 is undefined. Hence need:
2595 (qaa << bb) `UifU` PCast(qbb)
2597 If the shift amount bb is a literal than qbb will say 'all defined'
2598 and the UifU and PCast will get folded out by post-instrumentation
2599 optimisation.
2601 static IRAtom* scalarShift ( MCEnv* mce,
2602 IRType ty,
2603 IROp original_op,
2604 IRAtom* qaa, IRAtom* qbb,
2605 IRAtom* aa, IRAtom* bb )
2607 tl_assert(isShadowAtom(mce,qaa));
2608 tl_assert(isShadowAtom(mce,qbb));
2609 tl_assert(isOriginalAtom(mce,aa));
2610 tl_assert(isOriginalAtom(mce,bb));
2611 tl_assert(sameKindedAtoms(qaa,aa));
2612 tl_assert(sameKindedAtoms(qbb,bb));
2613 return
2614 assignNew(
2615 'V', mce, ty,
2616 mkUifU( mce, ty,
2617 assignNew('V', mce, ty, binop(original_op, qaa, bb)),
2618 mkPCastTo(mce, ty, qbb)
2624 /*------------------------------------------------------------*/
2625 /*--- Helpers for dealing with vector primops. ---*/
2626 /*------------------------------------------------------------*/
2628 /* Vector pessimisation -- pessimise within each lane individually. */
2630 static IRAtom* mkPCast8x16 ( MCEnv* mce, IRAtom* at )
2632 return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ8x16, at));
2635 static IRAtom* mkPCast16x8 ( MCEnv* mce, IRAtom* at )
2637 return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ16x8, at));
2640 static IRAtom* mkPCast32x4 ( MCEnv* mce, IRAtom* at )
2642 return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ32x4, at));
2645 static IRAtom* mkPCast64x2 ( MCEnv* mce, IRAtom* at )
2647 return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ64x2, at));
2650 static IRAtom* mkPCast128x1 ( MCEnv* mce, IRAtom* at )
2652 return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ128x1, at));
2655 static IRAtom* mkPCast64x4 ( MCEnv* mce, IRAtom* at )
2657 return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ64x4, at));
2660 static IRAtom* mkPCast32x8 ( MCEnv* mce, IRAtom* at )
2662 return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ32x8, at));
2665 static IRAtom* mkPCast32x2 ( MCEnv* mce, IRAtom* at )
2667 return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ32x2, at));
2670 static IRAtom* mkPCast16x16 ( MCEnv* mce, IRAtom* at )
2672 return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ16x16, at));
2675 static IRAtom* mkPCast16x4 ( MCEnv* mce, IRAtom* at )
2677 return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ16x4, at));
2680 static IRAtom* mkPCast8x32 ( MCEnv* mce, IRAtom* at )
2682 return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ8x32, at));
2685 static IRAtom* mkPCast8x8 ( MCEnv* mce, IRAtom* at )
2687 return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ8x8, at));
2690 static IRAtom* mkPCast16x2 ( MCEnv* mce, IRAtom* at )
2692 return assignNew('V', mce, Ity_I32, unop(Iop_CmpNEZ16x2, at));
2695 static IRAtom* mkPCast8x4 ( MCEnv* mce, IRAtom* at )
2697 return assignNew('V', mce, Ity_I32, unop(Iop_CmpNEZ8x4, at));
2701 /* Here's a simple scheme capable of handling ops derived from SSE1
2702 code and while only generating ops that can be efficiently
2703 implemented in SSE1. */
2705 /* All-lanes versions are straightforward:
2707 binary32Fx4(x,y) ==> PCast32x4(UifUV128(x#,y#))
2709 unary32Fx4(x) ==> PCast32x4(x#)
2711 Lowest-lane-only versions are more complex:
2713 binary32F0x4(x,y) ==> SetV128lo32(
2714 x#,
2715 PCast32(V128to32(UifUV128(x#,y#)))
2718 This is perhaps not so obvious. In particular, it's faster to
2719 do a V128-bit UifU and then take the bottom 32 bits than the more
2720 obvious scheme of taking the bottom 32 bits of each operand
2721 and doing a 32-bit UifU. Basically since UifU is fast and
2722 chopping lanes off vector values is slow.
2724 Finally:
2726 unary32F0x4(x) ==> SetV128lo32(
2727 x#,
2728 PCast32(V128to32(x#))
2731 Where:
2733 PCast32(v#) = 1Sto32(CmpNE32(v#,0))
2734 PCast32x4(v#) = CmpNEZ32x4(v#)
2737 static
2738 IRAtom* binary32Fx4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2740 IRAtom* at;
2741 tl_assert(isShadowAtom(mce, vatomX));
2742 tl_assert(isShadowAtom(mce, vatomY));
2743 at = mkUifUV128(mce, vatomX, vatomY);
2744 at = assignNew('V', mce, Ity_V128, mkPCast32x4(mce, at));
2745 return at;
2748 static
2749 IRAtom* unary32Fx4 ( MCEnv* mce, IRAtom* vatomX )
2751 IRAtom* at;
2752 tl_assert(isShadowAtom(mce, vatomX));
2753 at = assignNew('V', mce, Ity_V128, mkPCast32x4(mce, vatomX));
2754 return at;
2757 static
2758 IRAtom* binary32F0x4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2760 IRAtom* at;
2761 tl_assert(isShadowAtom(mce, vatomX));
2762 tl_assert(isShadowAtom(mce, vatomY));
2763 at = mkUifUV128(mce, vatomX, vatomY);
2764 at = assignNew('V', mce, Ity_I32, unop(Iop_V128to32, at));
2765 at = mkPCastTo(mce, Ity_I32, at);
2766 at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo32, vatomX, at));
2767 return at;
2770 static
2771 IRAtom* unary32F0x4 ( MCEnv* mce, IRAtom* vatomX )
2773 IRAtom* at;
2774 tl_assert(isShadowAtom(mce, vatomX));
2775 at = assignNew('V', mce, Ity_I32, unop(Iop_V128to32, vatomX));
2776 at = mkPCastTo(mce, Ity_I32, at);
2777 at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo32, vatomX, at));
2778 return at;
2781 /* --- ... and ... 64Fx2 versions of the same ... --- */
2783 static
2784 IRAtom* binary64Fx2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2786 IRAtom* at;
2787 tl_assert(isShadowAtom(mce, vatomX));
2788 tl_assert(isShadowAtom(mce, vatomY));
2789 at = mkUifUV128(mce, vatomX, vatomY);
2790 at = assignNew('V', mce, Ity_V128, mkPCast64x2(mce, at));
2791 return at;
2794 static
2795 IRAtom* unary64Fx2 ( MCEnv* mce, IRAtom* vatomX )
2797 IRAtom* at;
2798 tl_assert(isShadowAtom(mce, vatomX));
2799 at = assignNew('V', mce, Ity_V128, mkPCast64x2(mce, vatomX));
2800 return at;
2803 static
2804 IRAtom* binary64F0x2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2806 IRAtom* at;
2807 tl_assert(isShadowAtom(mce, vatomX));
2808 tl_assert(isShadowAtom(mce, vatomY));
2809 at = mkUifUV128(mce, vatomX, vatomY);
2810 at = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, at));
2811 at = mkPCastTo(mce, Ity_I64, at);
2812 at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo64, vatomX, at));
2813 return at;
2816 static
2817 IRAtom* unary64F0x2 ( MCEnv* mce, IRAtom* vatomX )
2819 IRAtom* at;
2820 tl_assert(isShadowAtom(mce, vatomX));
2821 at = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vatomX));
2822 at = mkPCastTo(mce, Ity_I64, at);
2823 at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo64, vatomX, at));
2824 return at;
2827 /* --- --- ... and ... 16Fx8 versions of the same --- --- */
2829 static
2830 IRAtom* binary16Fx8 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2832 IRAtom* at;
2833 tl_assert(isShadowAtom(mce, vatomX));
2834 tl_assert(isShadowAtom(mce, vatomY));
2835 at = mkUifUV128(mce, vatomX, vatomY);
2836 at = assignNew('V', mce, Ity_V128, mkPCast16x8(mce, at));
2837 return at;
2840 static
2841 IRAtom* unary16Fx8 ( MCEnv* mce, IRAtom* vatomX )
2843 IRAtom* at;
2844 tl_assert(isShadowAtom(mce, vatomX));
2845 at = assignNew('V', mce, Ity_V128, mkPCast16x8(mce, vatomX));
2846 return at;
2849 /* TODO: remaining versions of 16x4 FP ops when more of the half-precision IR is
2850 implemented.
2853 /* --- --- ... and ... 32Fx2 versions of the same --- --- */
2855 static
2856 IRAtom* binary32Fx2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2858 IRAtom* at;
2859 tl_assert(isShadowAtom(mce, vatomX));
2860 tl_assert(isShadowAtom(mce, vatomY));
2861 at = mkUifU64(mce, vatomX, vatomY);
2862 at = assignNew('V', mce, Ity_I64, mkPCast32x2(mce, at));
2863 return at;
2866 static
2867 IRAtom* unary32Fx2 ( MCEnv* mce, IRAtom* vatomX )
2869 IRAtom* at;
2870 tl_assert(isShadowAtom(mce, vatomX));
2871 at = assignNew('V', mce, Ity_I64, mkPCast32x2(mce, vatomX));
2872 return at;
2875 /* --- ... and ... 64Fx4 versions of the same ... --- */
2877 static
2878 IRAtom* binary64Fx4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2880 IRAtom* at;
2881 tl_assert(isShadowAtom(mce, vatomX));
2882 tl_assert(isShadowAtom(mce, vatomY));
2883 at = mkUifUV256(mce, vatomX, vatomY);
2884 at = assignNew('V', mce, Ity_V256, mkPCast64x4(mce, at));
2885 return at;
2888 static
2889 IRAtom* unary64Fx4 ( MCEnv* mce, IRAtom* vatomX )
2891 IRAtom* at;
2892 tl_assert(isShadowAtom(mce, vatomX));
2893 at = assignNew('V', mce, Ity_V256, mkPCast64x4(mce, vatomX));
2894 return at;
2897 /* --- ... and ... 32Fx8 versions of the same ... --- */
2899 static
2900 IRAtom* binary32Fx8 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2902 IRAtom* at;
2903 tl_assert(isShadowAtom(mce, vatomX));
2904 tl_assert(isShadowAtom(mce, vatomY));
2905 at = mkUifUV256(mce, vatomX, vatomY);
2906 at = assignNew('V', mce, Ity_V256, mkPCast32x8(mce, at));
2907 return at;
2910 static
2911 IRAtom* unary32Fx8 ( MCEnv* mce, IRAtom* vatomX )
2913 IRAtom* at;
2914 tl_assert(isShadowAtom(mce, vatomX));
2915 at = assignNew('V', mce, Ity_V256, mkPCast32x8(mce, vatomX));
2916 return at;
2919 /* --- 64Fx2 binary FP ops, with rounding mode --- */
2921 static
2922 IRAtom* binary64Fx2_w_rm ( MCEnv* mce, IRAtom* vRM,
2923 IRAtom* vatomX, IRAtom* vatomY )
2925 /* This is the same as binary64Fx2, except that we subsequently
2926 pessimise vRM (definedness of the rounding mode), widen to 128
2927 bits and UifU it into the result. As with the scalar cases, if
2928 the RM is a constant then it is defined and so this extra bit
2929 will get constant-folded out later. */
2930 // "do" the vector args
2931 IRAtom* t1 = binary64Fx2(mce, vatomX, vatomY);
2932 // PCast the RM, and widen it to 128 bits
2933 IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2934 // Roll it into the result
2935 t1 = mkUifUV128(mce, t1, t2);
2936 return t1;
2939 /* --- ... and ... 32Fx4 versions of the same --- */
2941 static
2942 IRAtom* binary32Fx4_w_rm ( MCEnv* mce, IRAtom* vRM,
2943 IRAtom* vatomX, IRAtom* vatomY )
2945 IRAtom* t1 = binary32Fx4(mce, vatomX, vatomY);
2946 // PCast the RM, and widen it to 128 bits
2947 IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2948 // Roll it into the result
2949 t1 = mkUifUV128(mce, t1, t2);
2950 return t1;
2953 /* --- ... and ... 64Fx4 versions of the same --- */
2955 static
2956 IRAtom* binary64Fx4_w_rm ( MCEnv* mce, IRAtom* vRM,
2957 IRAtom* vatomX, IRAtom* vatomY )
2959 IRAtom* t1 = binary64Fx4(mce, vatomX, vatomY);
2960 // PCast the RM, and widen it to 256 bits
2961 IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM);
2962 // Roll it into the result
2963 t1 = mkUifUV256(mce, t1, t2);
2964 return t1;
2967 /* --- ... and ... 16Fx8 versions of the same --- */
2969 static
2970 IRAtom* binary16Fx8_w_rm ( MCEnv* mce, IRAtom* vRM,
2971 IRAtom* vatomX, IRAtom* vatomY )
2973 IRAtom* t1 = binary16Fx8(mce, vatomX, vatomY);
2974 // PCast the RM, and widen it to 128 bits
2975 IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2976 // Roll it into the result
2977 t1 = mkUifUV128(mce, t1, t2);
2978 return t1;
2981 /* TODO: remaining versions of 16x4 FP ops when more of the half-precision IR is
2982 implemented.
2985 /* --- ... and ... 32Fx8 versions of the same --- */
2987 static
2988 IRAtom* binary32Fx8_w_rm ( MCEnv* mce, IRAtom* vRM,
2989 IRAtom* vatomX, IRAtom* vatomY )
2991 IRAtom* t1 = binary32Fx8(mce, vatomX, vatomY);
2992 // PCast the RM, and widen it to 256 bits
2993 IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM);
2994 // Roll it into the result
2995 t1 = mkUifUV256(mce, t1, t2);
2996 return t1;
2999 /* --- 64Fx2 unary FP ops, with rounding mode --- */
3001 static
3002 IRAtom* unary64Fx2_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
3004 /* Same scheme as binary64Fx2_w_rm. */
3005 // "do" the vector arg
3006 IRAtom* t1 = unary64Fx2(mce, vatomX);
3007 // PCast the RM, and widen it to 128 bits
3008 IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
3009 // Roll it into the result
3010 t1 = mkUifUV128(mce, t1, t2);
3011 return t1;
3014 /* --- ... and ... 32Fx4 versions of the same --- */
3016 static
3017 IRAtom* unary32Fx4_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
3019 /* Same scheme as binaryFx4_w_rm. */
3020 IRAtom* t1 = unary32Fx4(mce, vatomX);
3021 // PCast the RM, and widen it to 128 bits
3022 IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
3023 // Roll it into the result
3024 t1 = mkUifUV128(mce, t1, t2);
3025 return t1;
3028 /* --- ... and ... 16Fx8 versions of the same --- */
3030 static
3031 IRAtom* unary16Fx8_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
3033 /* Same scheme as binaryFx4_w_rm. */
3034 IRAtom* t1 = unary16Fx8(mce, vatomX);
3035 // PCast the RM, and widen it to 128 bits
3036 IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
3037 // Roll it into the result
3038 t1 = mkUifUV128(mce, t1, t2);
3039 return t1;
3042 /* --- ... and ... 32Fx8 versions of the same --- */
3044 static
3045 IRAtom* unary32Fx8_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
3047 /* Same scheme as unary32Fx8_w_rm. */
3048 IRAtom* t1 = unary32Fx8(mce, vatomX);
3049 // PCast the RM, and widen it to 256 bits
3050 IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM);
3051 // Roll it into the result
3052 t1 = mkUifUV256(mce, t1, t2);
3053 return t1;
3057 /* --- --- Vector saturated narrowing --- --- */
3059 /* We used to do something very clever here, but on closer inspection
3060 (2011-Jun-15), and in particular bug #279698, it turns out to be
3061 wrong. Part of the problem came from the fact that for a long
3062 time, the IR primops to do with saturated narrowing were
3063 underspecified and managed to confuse multiple cases which needed
3064 to be separate: the op names had a signedness qualifier, but in
3065 fact the source and destination signednesses needed to be specified
3066 independently, so the op names really need two independent
3067 signedness specifiers.
3069 As of 2011-Jun-15 (ish) the underspecification was sorted out
3070 properly. The incorrect instrumentation remained, though. That
3071 has now (2011-Oct-22) been fixed.
3073 What we now do is simple:
3075 Let the original narrowing op be QNarrowBinXtoYxZ, where Z is a
3076 number of lanes, X is the source lane width and signedness, and Y
3077 is the destination lane width and signedness. In all cases the
3078 destination lane width is half the source lane width, so the names
3079 have a bit of redundancy, but are at least easy to read.
3081 For example, Iop_QNarrowBin32Sto16Ux8 narrows 8 lanes of signed 32s
3082 to unsigned 16s.
3084 Let Vanilla(OP) be a function that takes OP, one of these
3085 saturating narrowing ops, and produces the same "shaped" narrowing
3086 op which is not saturating, but merely dumps the most significant
3087 bits. "same shape" means that the lane numbers and widths are the
3088 same as with OP.
3090 For example, Vanilla(Iop_QNarrowBin32Sto16Ux8)
3091 = Iop_NarrowBin32to16x8,
3092 that is, narrow 8 lanes of 32 bits to 8 lanes of 16 bits, by
3093 dumping the top half of each lane.
3095 So, with that in place, the scheme is simple, and it is simple to
3096 pessimise each lane individually and then apply Vanilla(OP) so as
3097 to get the result in the right "shape". If the original OP is
3098 QNarrowBinXtoYxZ then we produce
3100 Vanilla(OP)( PCast-X-to-X-x-Z(vatom1), PCast-X-to-X-x-Z(vatom2) )
3102 or for the case when OP is unary (Iop_QNarrowUn*)
3104 Vanilla(OP)( PCast-X-to-X-x-Z(vatom) )
3106 static
3107 IROp vanillaNarrowingOpOfShape ( IROp qnarrowOp )
3109 switch (qnarrowOp) {
3110 /* Binary: (128, 128) -> 128 */
3111 case Iop_QNarrowBin16Sto8Ux16:
3112 case Iop_QNarrowBin16Sto8Sx16:
3113 case Iop_QNarrowBin16Uto8Ux16:
3114 case Iop_QNarrowBin64Sto32Sx4:
3115 case Iop_QNarrowBin64Uto32Ux4:
3116 return Iop_NarrowBin16to8x16;
3117 case Iop_QNarrowBin32Sto16Ux8:
3118 case Iop_QNarrowBin32Sto16Sx8:
3119 case Iop_QNarrowBin32Uto16Ux8:
3120 return Iop_NarrowBin32to16x8;
3121 /* Binary: (64, 64) -> 64 */
3122 case Iop_QNarrowBin32Sto16Sx4:
3123 return Iop_NarrowBin32to16x4;
3124 case Iop_QNarrowBin16Sto8Ux8:
3125 case Iop_QNarrowBin16Sto8Sx8:
3126 return Iop_NarrowBin16to8x8;
3127 /* Unary: 128 -> 64 */
3128 case Iop_QNarrowUn64Uto32Ux2:
3129 case Iop_QNarrowUn64Sto32Sx2:
3130 case Iop_QNarrowUn64Sto32Ux2:
3131 return Iop_NarrowUn64to32x2;
3132 case Iop_QNarrowUn32Uto16Ux4:
3133 case Iop_QNarrowUn32Sto16Sx4:
3134 case Iop_QNarrowUn32Sto16Ux4:
3135 case Iop_F32toF16x4_DEP:
3136 return Iop_NarrowUn32to16x4;
3137 case Iop_QNarrowUn16Uto8Ux8:
3138 case Iop_QNarrowUn16Sto8Sx8:
3139 case Iop_QNarrowUn16Sto8Ux8:
3140 return Iop_NarrowUn16to8x8;
3141 default:
3142 ppIROp(qnarrowOp);
3143 VG_(tool_panic)("vanillaNarrowOpOfShape");
3147 static
3148 IRAtom* vectorNarrowBinV128 ( MCEnv* mce, IROp narrow_op,
3149 IRAtom* vatom1, IRAtom* vatom2)
3151 IRAtom *at1, *at2, *at3;
3152 IRAtom* (*pcast)( MCEnv*, IRAtom* );
3153 switch (narrow_op) {
3154 case Iop_QNarrowBin64Sto32Sx4: pcast = mkPCast32x4; break;
3155 case Iop_QNarrowBin64Uto32Ux4: pcast = mkPCast32x4; break;
3156 case Iop_QNarrowBin32Sto16Sx8: pcast = mkPCast32x4; break;
3157 case Iop_QNarrowBin32Uto16Ux8: pcast = mkPCast32x4; break;
3158 case Iop_QNarrowBin32Sto16Ux8: pcast = mkPCast32x4; break;
3159 case Iop_QNarrowBin16Sto8Sx16: pcast = mkPCast16x8; break;
3160 case Iop_QNarrowBin16Uto8Ux16: pcast = mkPCast16x8; break;
3161 case Iop_QNarrowBin16Sto8Ux16: pcast = mkPCast16x8; break;
3162 default: VG_(tool_panic)("vectorNarrowBinV128");
3164 IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
3165 tl_assert(isShadowAtom(mce,vatom1));
3166 tl_assert(isShadowAtom(mce,vatom2));
3167 at1 = assignNew('V', mce, Ity_V128, pcast(mce, vatom1));
3168 at2 = assignNew('V', mce, Ity_V128, pcast(mce, vatom2));
3169 at3 = assignNew('V', mce, Ity_V128, binop(vanilla_narrow, at1, at2));
3170 return at3;
3173 static
3174 IRAtom* vectorNarrowBin64 ( MCEnv* mce, IROp narrow_op,
3175 IRAtom* vatom1, IRAtom* vatom2)
3177 IRAtom *at1, *at2, *at3;
3178 IRAtom* (*pcast)( MCEnv*, IRAtom* );
3179 switch (narrow_op) {
3180 case Iop_QNarrowBin32Sto16Sx4: pcast = mkPCast32x2; break;
3181 case Iop_QNarrowBin16Sto8Sx8: pcast = mkPCast16x4; break;
3182 case Iop_QNarrowBin16Sto8Ux8: pcast = mkPCast16x4; break;
3183 default: VG_(tool_panic)("vectorNarrowBin64");
3185 IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
3186 tl_assert(isShadowAtom(mce,vatom1));
3187 tl_assert(isShadowAtom(mce,vatom2));
3188 at1 = assignNew('V', mce, Ity_I64, pcast(mce, vatom1));
3189 at2 = assignNew('V', mce, Ity_I64, pcast(mce, vatom2));
3190 at3 = assignNew('V', mce, Ity_I64, binop(vanilla_narrow, at1, at2));
3191 return at3;
3194 static
3195 IRAtom* vectorNarrowUnV128 ( MCEnv* mce, IROp narrow_op,
3196 IRAtom* vatom1)
3198 IRAtom *at1, *at2;
3199 IRAtom* (*pcast)( MCEnv*, IRAtom* );
3200 tl_assert(isShadowAtom(mce,vatom1));
3201 /* For vanilla narrowing (non-saturating), we can just apply
3202 the op directly to the V bits. */
3203 switch (narrow_op) {
3204 case Iop_NarrowUn16to8x8:
3205 case Iop_NarrowUn32to16x4:
3206 case Iop_NarrowUn64to32x2:
3207 case Iop_F32toF16x4_DEP:
3208 at1 = assignNew('V', mce, Ity_I64, unop(narrow_op, vatom1));
3209 return at1;
3210 default:
3211 break; /* Do Plan B */
3213 /* Plan B: for ops that involve a saturation operation on the args,
3214 we must PCast before the vanilla narrow. */
3215 switch (narrow_op) {
3216 case Iop_QNarrowUn16Sto8Sx8: pcast = mkPCast16x8; break;
3217 case Iop_QNarrowUn16Sto8Ux8: pcast = mkPCast16x8; break;
3218 case Iop_QNarrowUn16Uto8Ux8: pcast = mkPCast16x8; break;
3219 case Iop_QNarrowUn32Sto16Sx4: pcast = mkPCast32x4; break;
3220 case Iop_QNarrowUn32Sto16Ux4: pcast = mkPCast32x4; break;
3221 case Iop_QNarrowUn32Uto16Ux4: pcast = mkPCast32x4; break;
3222 case Iop_QNarrowUn64Sto32Sx2: pcast = mkPCast64x2; break;
3223 case Iop_QNarrowUn64Sto32Ux2: pcast = mkPCast64x2; break;
3224 case Iop_QNarrowUn64Uto32Ux2: pcast = mkPCast64x2; break;
3225 default: VG_(tool_panic)("vectorNarrowUnV128");
3227 IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
3228 at1 = assignNew('V', mce, Ity_V128, pcast(mce, vatom1));
3229 at2 = assignNew('V', mce, Ity_I64, unop(vanilla_narrow, at1));
3230 return at2;
3233 static
3234 IRAtom* vectorWidenI64 ( MCEnv* mce, IROp longen_op,
3235 IRAtom* vatom1)
3237 IRAtom *at1, *at2;
3238 IRAtom* (*pcast)( MCEnv*, IRAtom* );
3239 switch (longen_op) {
3240 case Iop_Widen8Uto16x8: pcast = mkPCast16x8; break;
3241 case Iop_Widen8Sto16x8: pcast = mkPCast16x8; break;
3242 case Iop_Widen16Uto32x4: pcast = mkPCast32x4; break;
3243 case Iop_Widen16Sto32x4: pcast = mkPCast32x4; break;
3244 case Iop_Widen32Uto64x2: pcast = mkPCast64x2; break;
3245 case Iop_Widen32Sto64x2: pcast = mkPCast64x2; break;
3246 case Iop_F16toF32x4: pcast = mkPCast32x4; break;
3247 default: VG_(tool_panic)("vectorWidenI64");
3249 tl_assert(isShadowAtom(mce,vatom1));
3250 at1 = assignNew('V', mce, Ity_V128, unop(longen_op, vatom1));
3251 at2 = assignNew('V', mce, Ity_V128, pcast(mce, at1));
3252 return at2;
3256 /* --- --- Vector integer arithmetic --- --- */
3258 /* Simple ... UifU the args and per-lane pessimise the results. */
3260 /* --- V256-bit versions --- */
3262 static
3263 IRAtom* binary8Ix32 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3265 IRAtom* at;
3266 at = mkUifUV256(mce, vatom1, vatom2);
3267 at = mkPCast8x32(mce, at);
3268 return at;
3271 static
3272 IRAtom* binary16Ix16 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3274 IRAtom* at;
3275 at = mkUifUV256(mce, vatom1, vatom2);
3276 at = mkPCast16x16(mce, at);
3277 return at;
3280 static
3281 IRAtom* binary32Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3283 IRAtom* at;
3284 at = mkUifUV256(mce, vatom1, vatom2);
3285 at = mkPCast32x8(mce, at);
3286 return at;
3289 static
3290 IRAtom* binary64Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3292 IRAtom* at;
3293 at = mkUifUV256(mce, vatom1, vatom2);
3294 at = mkPCast64x4(mce, at);
3295 return at;
3298 /* --- V128-bit versions --- */
3300 static
3301 IRAtom* binary8Ix16 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3303 IRAtom* at;
3304 at = mkUifUV128(mce, vatom1, vatom2);
3305 at = mkPCast8x16(mce, at);
3306 return at;
3309 static
3310 IRAtom* binary16Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3312 IRAtom* at;
3313 at = mkUifUV128(mce, vatom1, vatom2);
3314 at = mkPCast16x8(mce, at);
3315 return at;
3318 static
3319 IRAtom* binary32Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3321 IRAtom* at;
3322 at = mkUifUV128(mce, vatom1, vatom2);
3323 at = mkPCast32x4(mce, at);
3324 return at;
3327 static
3328 IRAtom* binary64Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3330 IRAtom* at;
3331 at = mkUifUV128(mce, vatom1, vatom2);
3332 at = mkPCast64x2(mce, at);
3333 return at;
3336 static
3337 IRAtom* binary128Ix1 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3339 IRAtom* at;
3340 at = mkUifUV128(mce, vatom1, vatom2);
3341 at = mkPCast128x1(mce, at);
3342 return at;
3345 /* --- 64-bit versions --- */
3347 static
3348 IRAtom* binary8Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3350 IRAtom* at;
3351 at = mkUifU64(mce, vatom1, vatom2);
3352 at = mkPCast8x8(mce, at);
3353 return at;
3356 static
3357 IRAtom* binary16Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3359 IRAtom* at;
3360 at = mkUifU64(mce, vatom1, vatom2);
3361 at = mkPCast16x4(mce, at);
3362 return at;
3365 static
3366 IRAtom* binary32Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3368 IRAtom* at;
3369 at = mkUifU64(mce, vatom1, vatom2);
3370 at = mkPCast32x2(mce, at);
3371 return at;
3374 static
3375 IRAtom* binary64Ix1 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3377 IRAtom* at;
3378 at = mkUifU64(mce, vatom1, vatom2);
3379 at = mkPCastTo(mce, Ity_I64, at);
3380 return at;
3383 /* --- 32-bit versions --- */
3385 static
3386 IRAtom* binary8Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3388 IRAtom* at;
3389 at = mkUifU32(mce, vatom1, vatom2);
3390 at = mkPCast8x4(mce, at);
3391 return at;
3394 static
3395 IRAtom* binary16Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3397 IRAtom* at;
3398 at = mkUifU32(mce, vatom1, vatom2);
3399 at = mkPCast16x2(mce, at);
3400 return at;
3404 /*------------------------------------------------------------*/
3405 /*--- Generate shadow values from all kinds of IRExprs. ---*/
3406 /*------------------------------------------------------------*/
3408 static
3409 IRAtom* expr2vbits_Qop ( MCEnv* mce,
3410 IROp op,
3411 IRAtom* atom1, IRAtom* atom2,
3412 IRAtom* atom3, IRAtom* atom4 )
3414 IRAtom* vatom1 = expr2vbits( mce, atom1, HuOth );
3415 IRAtom* vatom2 = expr2vbits( mce, atom2, HuOth );
3416 IRAtom* vatom3 = expr2vbits( mce, atom3, HuOth );
3417 IRAtom* vatom4 = expr2vbits( mce, atom4, HuOth );
3419 tl_assert(isOriginalAtom(mce,atom1));
3420 tl_assert(isOriginalAtom(mce,atom2));
3421 tl_assert(isOriginalAtom(mce,atom3));
3422 tl_assert(isOriginalAtom(mce,atom4));
3423 tl_assert(isShadowAtom(mce,vatom1));
3424 tl_assert(isShadowAtom(mce,vatom2));
3425 tl_assert(isShadowAtom(mce,vatom3));
3426 tl_assert(isShadowAtom(mce,vatom4));
3427 tl_assert(sameKindedAtoms(atom1,vatom1));
3428 tl_assert(sameKindedAtoms(atom2,vatom2));
3429 tl_assert(sameKindedAtoms(atom3,vatom3));
3430 tl_assert(sameKindedAtoms(atom4,vatom4));
3431 switch (op) {
3432 case Iop_MAddF64:
3433 case Iop_MAddF64r32:
3434 case Iop_MSubF64:
3435 case Iop_MSubF64r32:
3436 /* I32(rm) x F64 x F64 x F64 -> F64 */
3437 return mkLazy4(mce, Ity_I64, vatom1, vatom2, vatom3, vatom4);
3439 case Iop_MAddF32:
3440 case Iop_MSubF32:
3441 /* I32(rm) x F32 x F32 x F32 -> F32 */
3442 return mkLazy4(mce, Ity_I32, vatom1, vatom2, vatom3, vatom4);
3444 case Iop_MAddF128:
3445 case Iop_MSubF128:
3446 case Iop_NegMAddF128:
3447 case Iop_NegMSubF128:
3448 /* I32(rm) x F128 x F128 x F128 -> F128 */
3449 return mkLazy4(mce, Ity_I128, vatom1, vatom2, vatom3, vatom4);
3451 /* V256-bit data-steering */
3452 case Iop_64x4toV256:
3453 return assignNew('V', mce, Ity_V256,
3454 IRExpr_Qop(op, vatom1, vatom2, vatom3, vatom4));
3456 /* I32/I64 x I8 x I8 x I8 -> I32/I64 */
3457 case Iop_Rotx32:
3458 return mkLazy4(mce, Ity_I32, vatom1, vatom2, vatom3, vatom4);
3459 case Iop_Rotx64:
3460 return mkLazy4(mce, Ity_I64, vatom1, vatom2, vatom3, vatom4);
3461 default:
3462 ppIROp(op);
3463 VG_(tool_panic)("memcheck:expr2vbits_Qop");
3468 static
3469 IRAtom* expr2vbits_Triop ( MCEnv* mce,
3470 IROp op,
3471 IRAtom* atom1, IRAtom* atom2, IRAtom* atom3 )
3473 IRAtom* vatom1 = expr2vbits( mce, atom1, HuOth );
3474 IRAtom* vatom2 = expr2vbits( mce, atom2, HuOth );
3475 IRAtom* vatom3 = expr2vbits( mce, atom3, HuOth );
3477 tl_assert(isOriginalAtom(mce,atom1));
3478 tl_assert(isOriginalAtom(mce,atom2));
3479 tl_assert(isOriginalAtom(mce,atom3));
3480 tl_assert(isShadowAtom(mce,vatom1));
3481 tl_assert(isShadowAtom(mce,vatom2));
3482 tl_assert(isShadowAtom(mce,vatom3));
3483 tl_assert(sameKindedAtoms(atom1,vatom1));
3484 tl_assert(sameKindedAtoms(atom2,vatom2));
3485 tl_assert(sameKindedAtoms(atom3,vatom3));
3486 switch (op) {
3487 case Iop_AddF128:
3488 case Iop_SubF128:
3489 case Iop_MulF128:
3490 case Iop_DivF128:
3491 case Iop_AddD128:
3492 case Iop_SubD128:
3493 case Iop_MulD128:
3494 case Iop_DivD128:
3495 case Iop_QuantizeD128:
3496 /* I32(rm) x F128/D128 x F128/D128 -> F128/D128 */
3497 return mkLazy3(mce, Ity_I128, vatom1, vatom2, vatom3);
3498 case Iop_AddF64:
3499 case Iop_AddD64:
3500 case Iop_AddF64r32:
3501 case Iop_SubF64:
3502 case Iop_SubD64:
3503 case Iop_SubF64r32:
3504 case Iop_MulF64:
3505 case Iop_MulD64:
3506 case Iop_MulF64r32:
3507 case Iop_DivF64:
3508 case Iop_DivD64:
3509 case Iop_DivF64r32:
3510 case Iop_ScaleF64:
3511 case Iop_Yl2xF64:
3512 case Iop_Yl2xp1F64:
3513 case Iop_AtanF64:
3514 case Iop_PRemF64:
3515 case Iop_PRem1F64:
3516 case Iop_QuantizeD64:
3517 /* I32(rm) x F64/D64 x F64/D64 -> F64/D64 */
3518 return mkLazy3(mce, Ity_I64, vatom1, vatom2, vatom3);
3519 case Iop_PRemC3210F64:
3520 case Iop_PRem1C3210F64:
3521 /* I32(rm) x F64 x F64 -> I32 */
3522 return mkLazy3(mce, Ity_I32, vatom1, vatom2, vatom3);
3523 case Iop_AddF32:
3524 case Iop_SubF32:
3525 case Iop_MulF32:
3526 case Iop_DivF32:
3527 /* I32(rm) x F32 x F32 -> I32 */
3528 return mkLazy3(mce, Ity_I32, vatom1, vatom2, vatom3);
3529 case Iop_AddF16:
3530 case Iop_SubF16:
3531 /* I32(rm) x F16 x F16 -> I16 */
3532 return mkLazy3(mce, Ity_I16, vatom1, vatom2, vatom3);
3533 case Iop_SignificanceRoundD64:
3534 /* IRRoundingMode(I32) x I8 x D64 -> D64 */
3535 return mkLazy3(mce, Ity_I64, vatom1, vatom2, vatom3);
3536 case Iop_SignificanceRoundD128:
3537 /* IRRoundingMode(I32) x I8 x D128 -> D128 */
3538 return mkLazy3(mce, Ity_I128, vatom1, vatom2, vatom3);
3539 case Iop_SliceV128:
3540 /* (V128, V128, I8) -> V128 */
3541 complainIfUndefined(mce, atom3, NULL);
3542 return assignNew('V', mce, Ity_V128, triop(op, vatom1, vatom2, atom3));
3543 case Iop_Slice64:
3544 /* (I64, I64, I8) -> I64 */
3545 complainIfUndefined(mce, atom3, NULL);
3546 return assignNew('V', mce, Ity_I64, triop(op, vatom1, vatom2, atom3));
3547 case Iop_SetElem8x8:
3548 case Iop_SetElem16x4:
3549 case Iop_SetElem32x2:
3550 complainIfUndefined(mce, atom2, NULL);
3551 return assignNew('V', mce, Ity_I64, triop(op, vatom1, atom2, vatom3));
3553 case Iop_SetElem8x16:
3554 case Iop_SetElem16x8:
3555 case Iop_SetElem32x4:
3556 case Iop_SetElem64x2:
3557 complainIfUndefined(mce, atom2, NULL);
3558 return assignNew('V', mce, Ity_V128, triop(op, vatom1, atom2, vatom3));
3560 /* Int 128-bit Integer three arg */
3561 case Iop_2xMultU64Add128CarryOut:
3562 case Iop_Perm8x16x2:
3563 /* (V128, V128, V128) -> V128 */
3564 complainIfUndefined(mce, atom3, NULL);
3565 return mkUifUV128(
3566 mce,
3567 assignNew('V', mce, Ity_V128, triop(op, vatom1, vatom2, atom3)),
3568 mkPCast8x16(mce, vatom3)
3571 /* Vector FP with rounding mode as the first arg */
3572 case Iop_Add64Fx2:
3573 case Iop_Sub64Fx2:
3574 case Iop_Mul64Fx2:
3575 case Iop_Div64Fx2:
3576 case Iop_Scale2_64Fx2:
3577 return binary64Fx2_w_rm(mce, vatom1, vatom2, vatom3);
3579 case Iop_Add32Fx4:
3580 case Iop_Sub32Fx4:
3581 case Iop_Mul32Fx4:
3582 case Iop_Div32Fx4:
3583 case Iop_Scale2_32Fx4:
3584 return binary32Fx4_w_rm(mce, vatom1, vatom2, vatom3);
3586 case Iop_Add64Fx4:
3587 case Iop_Sub64Fx4:
3588 case Iop_Mul64Fx4:
3589 case Iop_Div64Fx4:
3590 return binary64Fx4_w_rm(mce, vatom1, vatom2, vatom3);
3592 /* TODO: remaining versions of 16x4 FP ops when more of the half-precision
3593 IR is implemented.
3595 case Iop_Add16Fx8:
3596 case Iop_Sub16Fx8:
3597 return binary16Fx8_w_rm(mce, vatom1, vatom2, vatom3);
3599 case Iop_Add32Fx8:
3600 case Iop_Sub32Fx8:
3601 case Iop_Mul32Fx8:
3602 case Iop_Div32Fx8:
3603 return binary32Fx8_w_rm(mce, vatom1, vatom2, vatom3);
3605 case Iop_F32x4_2toQ16x8:
3606 return assignNew('V', mce, Ity_V128,
3607 binop(Iop_PackEvenLanes16x8,
3608 unary32Fx4_w_rm(mce, vatom1, vatom2),
3609 unary32Fx4_w_rm(mce, vatom1, vatom3)));
3610 case Iop_F64x2_2toQ32x4:
3611 return assignNew('V', mce, Ity_V128,
3612 binop(Iop_PackEvenLanes32x4,
3613 unary64Fx2_w_rm(mce, vatom1, vatom2),
3614 unary64Fx2_w_rm(mce, vatom1, vatom3)));
3616 default:
3617 ppIROp(op);
3618 VG_(tool_panic)("memcheck:expr2vbits_Triop");
3623 static
3624 IRAtom* expr2vbits_Binop ( MCEnv* mce,
3625 IROp op,
3626 IRAtom* atom1, IRAtom* atom2,
3627 HowUsed hu/*use HuOth if unknown*/ )
3629 IRType and_or_ty = Ity_INVALID;
3630 IRAtom* (*uifu) (MCEnv*, IRAtom*, IRAtom*) = NULL;
3631 IRAtom* (*difd) (MCEnv*, IRAtom*, IRAtom*) = NULL;
3632 IRAtom* (*improve) (MCEnv*, IRAtom*, IRAtom*) = NULL;
3634 IRAtom* vatom1 = expr2vbits( mce, atom1, HuOth );
3635 IRAtom* vatom2 = expr2vbits( mce, atom2, HuOth );
3637 tl_assert(isOriginalAtom(mce,atom1));
3638 tl_assert(isOriginalAtom(mce,atom2));
3639 tl_assert(isShadowAtom(mce,vatom1));
3640 tl_assert(isShadowAtom(mce,vatom2));
3641 tl_assert(sameKindedAtoms(atom1,vatom1));
3642 tl_assert(sameKindedAtoms(atom2,vatom2));
3643 switch (op) {
3645 /* 32-bit SIMD */
3647 case Iop_Add16x2:
3648 case Iop_HAdd16Ux2:
3649 case Iop_HAdd16Sx2:
3650 case Iop_Sub16x2:
3651 case Iop_HSub16Ux2:
3652 case Iop_HSub16Sx2:
3653 case Iop_QAdd16Sx2:
3654 case Iop_QSub16Sx2:
3655 case Iop_QSub16Ux2:
3656 case Iop_QAdd16Ux2:
3657 return binary16Ix2(mce, vatom1, vatom2);
3659 case Iop_Add8x4:
3660 case Iop_HAdd8Ux4:
3661 case Iop_HAdd8Sx4:
3662 case Iop_Sub8x4:
3663 case Iop_HSub8Ux4:
3664 case Iop_HSub8Sx4:
3665 case Iop_QSub8Ux4:
3666 case Iop_QAdd8Ux4:
3667 case Iop_QSub8Sx4:
3668 case Iop_QAdd8Sx4:
3669 return binary8Ix4(mce, vatom1, vatom2);
3671 /* 64-bit SIMD */
3673 case Iop_ShrN8x8:
3674 case Iop_ShrN16x4:
3675 case Iop_ShrN32x2:
3676 case Iop_SarN8x8:
3677 case Iop_SarN16x4:
3678 case Iop_SarN32x2:
3679 case Iop_ShlN16x4:
3680 case Iop_ShlN32x2:
3681 case Iop_ShlN8x8:
3682 /* Same scheme as with all other shifts. */
3683 complainIfUndefined(mce, atom2, NULL);
3684 return assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2));
3686 case Iop_QNarrowBin32Sto16Sx4:
3687 case Iop_QNarrowBin16Sto8Sx8:
3688 case Iop_QNarrowBin16Sto8Ux8:
3689 return vectorNarrowBin64(mce, op, vatom1, vatom2);
3691 case Iop_Min8Ux8:
3692 case Iop_Min8Sx8:
3693 case Iop_Max8Ux8:
3694 case Iop_Max8Sx8:
3695 case Iop_Avg8Ux8:
3696 case Iop_QSub8Sx8:
3697 case Iop_QSub8Ux8:
3698 case Iop_Sub8x8:
3699 case Iop_CmpGT8Sx8:
3700 case Iop_CmpGT8Ux8:
3701 case Iop_CmpEQ8x8:
3702 case Iop_QAdd8Sx8:
3703 case Iop_QAdd8Ux8:
3704 case Iop_QSal8x8:
3705 case Iop_QShl8x8:
3706 case Iop_Add8x8:
3707 case Iop_Mul8x8:
3708 case Iop_PolynomialMul8x8:
3709 return binary8Ix8(mce, vatom1, vatom2);
3711 case Iop_Min16Sx4:
3712 case Iop_Min16Ux4:
3713 case Iop_Max16Sx4:
3714 case Iop_Max16Ux4:
3715 case Iop_Avg16Ux4:
3716 case Iop_QSub16Ux4:
3717 case Iop_QSub16Sx4:
3718 case Iop_Sub16x4:
3719 case Iop_Mul16x4:
3720 case Iop_MulHi16Sx4:
3721 case Iop_MulHi16Ux4:
3722 case Iop_CmpGT16Sx4:
3723 case Iop_CmpGT16Ux4:
3724 case Iop_CmpEQ16x4:
3725 case Iop_QAdd16Sx4:
3726 case Iop_QAdd16Ux4:
3727 case Iop_QSal16x4:
3728 case Iop_QShl16x4:
3729 case Iop_Add16x4:
3730 case Iop_QDMulHi16Sx4:
3731 case Iop_QRDMulHi16Sx4:
3732 return binary16Ix4(mce, vatom1, vatom2);
3734 case Iop_Sub32x2:
3735 case Iop_Mul32x2:
3736 case Iop_Max32Sx2:
3737 case Iop_Max32Ux2:
3738 case Iop_Min32Sx2:
3739 case Iop_Min32Ux2:
3740 case Iop_CmpGT32Sx2:
3741 case Iop_CmpGT32Ux2:
3742 case Iop_CmpEQ32x2:
3743 case Iop_Add32x2:
3744 case Iop_QAdd32Ux2:
3745 case Iop_QAdd32Sx2:
3746 case Iop_QSub32Ux2:
3747 case Iop_QSub32Sx2:
3748 case Iop_QSal32x2:
3749 case Iop_QShl32x2:
3750 case Iop_QDMulHi32Sx2:
3751 case Iop_QRDMulHi32Sx2:
3752 return binary32Ix2(mce, vatom1, vatom2);
3754 case Iop_QSub64Ux1:
3755 case Iop_QSub64Sx1:
3756 case Iop_QAdd64Ux1:
3757 case Iop_QAdd64Sx1:
3758 case Iop_QSal64x1:
3759 case Iop_QShl64x1:
3760 case Iop_Sal64x1:
3761 return binary64Ix1(mce, vatom1, vatom2);
3763 case Iop_QShlNsatSU8x8:
3764 case Iop_QShlNsatUU8x8:
3765 case Iop_QShlNsatSS8x8:
3766 complainIfUndefined(mce, atom2, NULL);
3767 return mkPCast8x8(mce, vatom1);
3769 case Iop_QShlNsatSU16x4:
3770 case Iop_QShlNsatUU16x4:
3771 case Iop_QShlNsatSS16x4:
3772 complainIfUndefined(mce, atom2, NULL);
3773 return mkPCast16x4(mce, vatom1);
3775 case Iop_QShlNsatSU32x2:
3776 case Iop_QShlNsatUU32x2:
3777 case Iop_QShlNsatSS32x2:
3778 complainIfUndefined(mce, atom2, NULL);
3779 return mkPCast32x2(mce, vatom1);
3781 case Iop_QShlNsatSU64x1:
3782 case Iop_QShlNsatUU64x1:
3783 case Iop_QShlNsatSS64x1:
3784 complainIfUndefined(mce, atom2, NULL);
3785 return mkPCast32x2(mce, vatom1);
3787 case Iop_PwMax32Sx2:
3788 case Iop_PwMax32Ux2:
3789 case Iop_PwMin32Sx2:
3790 case Iop_PwMin32Ux2:
3791 case Iop_PwMax32Fx2:
3792 case Iop_PwMin32Fx2:
3793 return assignNew('V', mce, Ity_I64,
3794 binop(Iop_PwMax32Ux2,
3795 mkPCast32x2(mce, vatom1),
3796 mkPCast32x2(mce, vatom2)));
3798 case Iop_PwMax16Sx4:
3799 case Iop_PwMax16Ux4:
3800 case Iop_PwMin16Sx4:
3801 case Iop_PwMin16Ux4:
3802 return assignNew('V', mce, Ity_I64,
3803 binop(Iop_PwMax16Ux4,
3804 mkPCast16x4(mce, vatom1),
3805 mkPCast16x4(mce, vatom2)));
3807 case Iop_PwMax8Sx8:
3808 case Iop_PwMax8Ux8:
3809 case Iop_PwMin8Sx8:
3810 case Iop_PwMin8Ux8:
3811 return assignNew('V', mce, Ity_I64,
3812 binop(Iop_PwMax8Ux8,
3813 mkPCast8x8(mce, vatom1),
3814 mkPCast8x8(mce, vatom2)));
3816 case Iop_PwAdd32x2:
3817 case Iop_PwAdd32Fx2:
3818 return mkPCast32x2(mce,
3819 assignNew('V', mce, Ity_I64,
3820 binop(Iop_PwAdd32x2,
3821 mkPCast32x2(mce, vatom1),
3822 mkPCast32x2(mce, vatom2))));
3824 case Iop_PwAdd16x4:
3825 return mkPCast16x4(mce,
3826 assignNew('V', mce, Ity_I64,
3827 binop(op, mkPCast16x4(mce, vatom1),
3828 mkPCast16x4(mce, vatom2))));
3830 case Iop_PwAdd8x8:
3831 return mkPCast8x8(mce,
3832 assignNew('V', mce, Ity_I64,
3833 binop(op, mkPCast8x8(mce, vatom1),
3834 mkPCast8x8(mce, vatom2))));
3836 case Iop_Shl8x8:
3837 case Iop_Shr8x8:
3838 case Iop_Sar8x8:
3839 case Iop_Sal8x8:
3840 return mkUifU64(mce,
3841 assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3842 mkPCast8x8(mce,vatom2)
3845 case Iop_Shl16x4:
3846 case Iop_Shr16x4:
3847 case Iop_Sar16x4:
3848 case Iop_Sal16x4:
3849 return mkUifU64(mce,
3850 assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3851 mkPCast16x4(mce,vatom2)
3854 case Iop_Shl32x2:
3855 case Iop_Shr32x2:
3856 case Iop_Sar32x2:
3857 case Iop_Sal32x2:
3858 return mkUifU64(mce,
3859 assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3860 mkPCast32x2(mce,vatom2)
3863 /* 64-bit data-steering */
3864 case Iop_InterleaveLO32x2:
3865 case Iop_InterleaveLO16x4:
3866 case Iop_InterleaveLO8x8:
3867 case Iop_InterleaveHI32x2:
3868 case Iop_InterleaveHI16x4:
3869 case Iop_InterleaveHI8x8:
3870 case Iop_CatOddLanes8x8:
3871 case Iop_CatEvenLanes8x8:
3872 case Iop_CatOddLanes16x4:
3873 case Iop_CatEvenLanes16x4:
3874 case Iop_InterleaveOddLanes8x8:
3875 case Iop_InterleaveEvenLanes8x8:
3876 case Iop_InterleaveOddLanes16x4:
3877 case Iop_InterleaveEvenLanes16x4:
3878 return assignNew('V', mce, Ity_I64, binop(op, vatom1, vatom2));
3880 case Iop_GetElem8x8:
3881 complainIfUndefined(mce, atom2, NULL);
3882 return assignNew('V', mce, Ity_I8, binop(op, vatom1, atom2));
3883 case Iop_GetElem16x4:
3884 complainIfUndefined(mce, atom2, NULL);
3885 return assignNew('V', mce, Ity_I16, binop(op, vatom1, atom2));
3886 case Iop_GetElem32x2:
3887 complainIfUndefined(mce, atom2, NULL);
3888 return assignNew('V', mce, Ity_I32, binop(op, vatom1, atom2));
3890 /* Perm8x8: rearrange values in left arg using steering values from
3891 right arg. So rearrange the vbits in the same way but pessimise wrt
3892 steering values. We assume that unused bits in the steering value
3893 are defined zeros, so we can safely PCast within each lane of the
3894 steering value without having to take precautions to avoid a
3895 dependency on those unused bits.
3897 This is also correct for PermOrZero8x8, but it is a bit subtle. For
3898 each lane, if bit 7 of the steering value is zero, then we'll steer
3899 the shadow value exactly as per Perm8x8. If that bit is one, then
3900 the operation will set the resulting (concrete) value to zero. That
3901 means it is defined, and should have a shadow value of zero. Hence
3902 in both cases (bit 7 is 0 or 1) we can self-shadow (in the same way
3903 as Perm8x8) and then pessimise against the steering values. */
3904 case Iop_Perm8x8:
3905 case Iop_PermOrZero8x8:
3906 return mkUifU64(
3907 mce,
3908 assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3909 mkPCast8x8(mce, vatom2)
3912 /* V128-bit SIMD */
3914 case Iop_I32StoF32x4:
3915 case Iop_F32toI32Sx4:
3916 case Iop_Sqrt16Fx8:
3917 return unary16Fx8_w_rm(mce, vatom1, vatom2);
3918 case Iop_Sqrt32Fx4:
3919 return unary32Fx4_w_rm(mce, vatom1, vatom2);
3920 case Iop_Sqrt64Fx2:
3921 return unary64Fx2_w_rm(mce, vatom1, vatom2);
3923 case Iop_ShrN8x16:
3924 case Iop_ShrN16x8:
3925 case Iop_ShrN32x4:
3926 case Iop_ShrN64x2:
3927 case Iop_SarN8x16:
3928 case Iop_SarN16x8:
3929 case Iop_SarN32x4:
3930 case Iop_SarN64x2:
3931 case Iop_ShlN8x16:
3932 case Iop_ShlN16x8:
3933 case Iop_ShlN32x4:
3934 case Iop_ShlN64x2:
3935 /* Same scheme as with all other shifts. Note: 22 Oct 05:
3936 this is wrong now, scalar shifts are done properly lazily.
3937 Vector shifts should be fixed too. */
3938 complainIfUndefined(mce, atom2, NULL);
3939 return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
3941 /* V x V shifts/rotates are done using the standard lazy scheme. */
3942 /* For the non-rounding variants of bi-di vector x vector
3943 shifts (the Iop_Sh.. ops, that is) we use the lazy scheme.
3944 But note that this is overly pessimistic, because in fact only
3945 the bottom 8 bits of each lane of the second argument are taken
3946 into account when shifting. So really we ought to ignore
3947 undefinedness in bits 8 and above of each lane in the
3948 second argument. */
3949 case Iop_Shl8x16:
3950 case Iop_Shr8x16:
3951 case Iop_Sar8x16:
3952 case Iop_Sal8x16:
3953 case Iop_Rol8x16:
3954 case Iop_Sh8Sx16:
3955 case Iop_Sh8Ux16:
3956 return mkUifUV128(mce,
3957 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3958 mkPCast8x16(mce,vatom2)
3961 case Iop_Shl16x8:
3962 case Iop_Shr16x8:
3963 case Iop_Sar16x8:
3964 case Iop_Sal16x8:
3965 case Iop_Rol16x8:
3966 case Iop_Sh16Sx8:
3967 case Iop_Sh16Ux8:
3968 return mkUifUV128(mce,
3969 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3970 mkPCast16x8(mce,vatom2)
3973 case Iop_Shl32x4:
3974 case Iop_Shr32x4:
3975 case Iop_Sar32x4:
3976 case Iop_Sal32x4:
3977 case Iop_Rol32x4:
3978 case Iop_Sh32Sx4:
3979 case Iop_Sh32Ux4:
3980 return mkUifUV128(mce,
3981 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3982 mkPCast32x4(mce,vatom2)
3985 case Iop_Shl64x2:
3986 case Iop_Shr64x2:
3987 case Iop_Sar64x2:
3988 case Iop_Sal64x2:
3989 case Iop_Rol64x2:
3990 case Iop_Sh64Sx2:
3991 case Iop_Sh64Ux2:
3992 return mkUifUV128(mce,
3993 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3994 mkPCast64x2(mce,vatom2)
3997 /* For the rounding variants of bi-di vector x vector shifts, the
3998 rounding adjustment can cause undefinedness to propagate through
3999 the entire lane, in the worst case. Too complex to handle
4000 properly .. just UifU the arguments and then PCast them.
4001 Suboptimal but safe. */
4002 case Iop_Rsh8Sx16:
4003 case Iop_Rsh8Ux16:
4004 return binary8Ix16(mce, vatom1, vatom2);
4005 case Iop_Rsh16Sx8:
4006 case Iop_Rsh16Ux8:
4007 return binary16Ix8(mce, vatom1, vatom2);
4008 case Iop_Rsh32Sx4:
4009 case Iop_Rsh32Ux4:
4010 return binary32Ix4(mce, vatom1, vatom2);
4011 case Iop_Rsh64Sx2:
4012 case Iop_Rsh64Ux2:
4013 return binary64Ix2(mce, vatom1, vatom2);
4015 case Iop_F32ToFixed32Ux4_RZ:
4016 case Iop_F32ToFixed32Sx4_RZ:
4017 case Iop_Fixed32UToF32x4_RN:
4018 case Iop_Fixed32SToF32x4_RN:
4019 complainIfUndefined(mce, atom2, NULL);
4020 return mkPCast32x4(mce, vatom1);
4022 case Iop_F32ToFixed32Ux2_RZ:
4023 case Iop_F32ToFixed32Sx2_RZ:
4024 case Iop_Fixed32UToF32x2_RN:
4025 case Iop_Fixed32SToF32x2_RN:
4026 complainIfUndefined(mce, atom2, NULL);
4027 return mkPCast32x2(mce, vatom1);
4029 case Iop_QSub8Ux16:
4030 case Iop_QSub8Sx16:
4031 case Iop_Sub8x16:
4032 case Iop_Min8Ux16:
4033 case Iop_Min8Sx16:
4034 case Iop_Max8Ux16:
4035 case Iop_Max8Sx16:
4036 case Iop_CmpEQ8x16:
4037 case Iop_Avg8Ux16:
4038 case Iop_Avg8Sx16:
4039 case Iop_QAdd8Ux16:
4040 case Iop_QAdd8Sx16:
4041 case Iop_QAddExtUSsatSS8x16:
4042 case Iop_QAddExtSUsatUU8x16:
4043 case Iop_QSal8x16:
4044 case Iop_QShl8x16:
4045 case Iop_Add8x16:
4046 case Iop_Mul8x16:
4047 case Iop_MulHi8Sx16:
4048 case Iop_MulHi8Ux16:
4049 case Iop_PolynomialMul8x16:
4050 case Iop_PolynomialMulAdd8x16:
4051 return binary8Ix16(mce, vatom1, vatom2);
4053 case Iop_QSub16Ux8:
4054 case Iop_QSub16Sx8:
4055 case Iop_Sub16x8:
4056 case Iop_Mul16x8:
4057 case Iop_MulHi16Sx8:
4058 case Iop_MulHi16Ux8:
4059 case Iop_Min16Sx8:
4060 case Iop_Min16Ux8:
4061 case Iop_Max16Sx8:
4062 case Iop_Max16Ux8:
4063 case Iop_CmpEQ16x8:
4064 case Iop_Avg16Ux8:
4065 case Iop_Avg16Sx8:
4066 case Iop_QAdd16Ux8:
4067 case Iop_QAdd16Sx8:
4068 case Iop_QAddExtUSsatSS16x8:
4069 case Iop_QAddExtSUsatUU16x8:
4070 case Iop_QSal16x8:
4071 case Iop_QShl16x8:
4072 case Iop_Add16x8:
4073 case Iop_QDMulHi16Sx8:
4074 case Iop_QRDMulHi16Sx8:
4075 case Iop_PolynomialMulAdd16x8:
4076 /* PwExtUSMulQAdd8x16 is a bit subtle. The effect of it is that each
4077 16-bit chunk of the output is formed from corresponding 16-bit chunks
4078 of the input args, so we can treat it like any other binary 16x8
4079 operation. That's despite it having '8x16' in its name. */
4080 case Iop_PwExtUSMulQAdd8x16:
4081 return binary16Ix8(mce, vatom1, vatom2);
4083 case Iop_CmpGT64Sx2:
4084 case Iop_CmpGT64Ux2:
4085 case Iop_CmpGT32Sx4:
4086 case Iop_CmpGT32Ux4:
4087 case Iop_CmpGT16Sx8:
4088 case Iop_CmpGT16Ux8:
4089 case Iop_CmpGT8Sx16:
4090 case Iop_CmpGT8Ux16:
4091 return expensiveCmpGT(mce, op,
4092 vatom1, vatom2, atom1, atom2);
4093 case Iop_Sub32x4:
4094 case Iop_CmpEQ32x4:
4095 case Iop_QAdd32Sx4:
4096 case Iop_QAdd32Ux4:
4097 case Iop_QSub32Sx4:
4098 case Iop_QSub32Ux4:
4099 case Iop_QAddExtUSsatSS32x4:
4100 case Iop_QAddExtSUsatUU32x4:
4101 case Iop_QSal32x4:
4102 case Iop_QShl32x4:
4103 case Iop_Avg32Ux4:
4104 case Iop_Avg32Sx4:
4105 case Iop_Add32x4:
4106 case Iop_Max32Ux4:
4107 case Iop_Max32Sx4:
4108 case Iop_Min32Ux4:
4109 case Iop_Min32Sx4:
4110 case Iop_Mul32x4:
4111 case Iop_MulHi32Sx4:
4112 case Iop_MulHi32Ux4:
4113 case Iop_QDMulHi32Sx4:
4114 case Iop_QRDMulHi32Sx4:
4115 case Iop_PolynomialMulAdd32x4:
4116 return binary32Ix4(mce, vatom1, vatom2);
4118 case Iop_Sub64x2:
4119 case Iop_Add64x2:
4120 case Iop_Avg64Ux2:
4121 case Iop_Avg64Sx2:
4122 case Iop_Max64Sx2:
4123 case Iop_Max64Ux2:
4124 case Iop_Min64Sx2:
4125 case Iop_Min64Ux2:
4126 case Iop_CmpEQ64x2:
4127 case Iop_QSal64x2:
4128 case Iop_QShl64x2:
4129 case Iop_QAdd64Ux2:
4130 case Iop_QAdd64Sx2:
4131 case Iop_QSub64Ux2:
4132 case Iop_QSub64Sx2:
4133 case Iop_QAddExtUSsatSS64x2:
4134 case Iop_QAddExtSUsatUU64x2:
4135 case Iop_PolynomialMulAdd64x2:
4136 case Iop_CipherV128:
4137 case Iop_CipherLV128:
4138 case Iop_NCipherV128:
4139 case Iop_NCipherLV128:
4140 case Iop_MulI128by10E:
4141 case Iop_MulI128by10ECarry:
4142 return binary64Ix2(mce, vatom1, vatom2);
4144 case Iop_Add128x1:
4145 case Iop_Sub128x1:
4146 case Iop_CmpNEZ128x1:
4147 return binary128Ix1(mce, vatom1, vatom2);
4149 case Iop_DivU128:
4150 case Iop_DivS128:
4151 case Iop_DivU128E:
4152 case Iop_DivS128E:
4153 case Iop_ModU128:
4154 case Iop_ModS128:
4155 /* I128 x I128 -> I128 */
4156 return mkLazy2(mce, Ity_V128, vatom1, vatom2);
4158 case Iop_QNarrowBin64Sto32Sx4:
4159 case Iop_QNarrowBin64Uto32Ux4:
4160 case Iop_QNarrowBin32Sto16Sx8:
4161 case Iop_QNarrowBin32Uto16Ux8:
4162 case Iop_QNarrowBin32Sto16Ux8:
4163 case Iop_QNarrowBin16Sto8Sx16:
4164 case Iop_QNarrowBin16Uto8Ux16:
4165 case Iop_QNarrowBin16Sto8Ux16:
4166 return vectorNarrowBinV128(mce, op, vatom1, vatom2);
4168 case Iop_Min64Fx2:
4169 case Iop_Max64Fx2:
4170 case Iop_CmpLT64Fx2:
4171 case Iop_CmpLE64Fx2:
4172 case Iop_CmpEQ64Fx2:
4173 case Iop_CmpUN64Fx2:
4174 case Iop_RecipStep64Fx2:
4175 case Iop_RSqrtStep64Fx2:
4176 return binary64Fx2(mce, vatom1, vatom2);
4178 case Iop_CmpLT16Fx8:
4179 case Iop_CmpLE16Fx8:
4180 case Iop_CmpEQ16Fx8:
4181 return binary16Fx8(mce, vatom1, vatom2);
4183 case Iop_Sub64F0x2:
4184 case Iop_Mul64F0x2:
4185 case Iop_Min64F0x2:
4186 case Iop_Max64F0x2:
4187 case Iop_Div64F0x2:
4188 case Iop_CmpLT64F0x2:
4189 case Iop_CmpLE64F0x2:
4190 case Iop_CmpEQ64F0x2:
4191 case Iop_CmpUN64F0x2:
4192 case Iop_Add64F0x2:
4193 return binary64F0x2(mce, vatom1, vatom2);
4195 case Iop_Min32Fx4:
4196 case Iop_Max32Fx4:
4197 case Iop_CmpLT32Fx4:
4198 case Iop_CmpLE32Fx4:
4199 case Iop_CmpEQ32Fx4:
4200 case Iop_CmpUN32Fx4:
4201 case Iop_CmpGT32Fx4:
4202 case Iop_CmpGE32Fx4:
4203 case Iop_RecipStep32Fx4:
4204 case Iop_RSqrtStep32Fx4:
4205 return binary32Fx4(mce, vatom1, vatom2);
4207 case Iop_Sub32Fx2:
4208 case Iop_Mul32Fx2:
4209 case Iop_Min32Fx2:
4210 case Iop_Max32Fx2:
4211 case Iop_CmpEQ32Fx2:
4212 case Iop_CmpGT32Fx2:
4213 case Iop_CmpGE32Fx2:
4214 case Iop_Add32Fx2:
4215 case Iop_RecipStep32Fx2:
4216 case Iop_RSqrtStep32Fx2:
4217 return binary32Fx2(mce, vatom1, vatom2);
4219 case Iop_Sub32F0x4:
4220 case Iop_Mul32F0x4:
4221 case Iop_Min32F0x4:
4222 case Iop_Max32F0x4:
4223 case Iop_Div32F0x4:
4224 case Iop_CmpLT32F0x4:
4225 case Iop_CmpLE32F0x4:
4226 case Iop_CmpEQ32F0x4:
4227 case Iop_CmpUN32F0x4:
4228 case Iop_Add32F0x4:
4229 return binary32F0x4(mce, vatom1, vatom2);
4231 case Iop_QShlNsatSU8x16:
4232 case Iop_QShlNsatUU8x16:
4233 case Iop_QShlNsatSS8x16:
4234 complainIfUndefined(mce, atom2, NULL);
4235 return mkPCast8x16(mce, vatom1);
4237 case Iop_QShlNsatSU16x8:
4238 case Iop_QShlNsatUU16x8:
4239 case Iop_QShlNsatSS16x8:
4240 complainIfUndefined(mce, atom2, NULL);
4241 return mkPCast16x8(mce, vatom1);
4243 case Iop_QShlNsatSU32x4:
4244 case Iop_QShlNsatUU32x4:
4245 case Iop_QShlNsatSS32x4:
4246 complainIfUndefined(mce, atom2, NULL);
4247 return mkPCast32x4(mce, vatom1);
4249 case Iop_QShlNsatSU64x2:
4250 case Iop_QShlNsatUU64x2:
4251 case Iop_QShlNsatSS64x2:
4252 complainIfUndefined(mce, atom2, NULL);
4253 return mkPCast32x4(mce, vatom1);
4255 /* Q-and-Qshift-by-imm-and-narrow of the form (V128, I8) -> V128.
4256 To make this simpler, do the following:
4257 * complain if the shift amount (the I8) is undefined
4258 * pcast each lane at the wide width
4259 * truncate each lane to half width
4260 * pcast the resulting 64-bit value to a single bit and use
4261 that as the least significant bit of the upper half of the
4262 result. */
4263 case Iop_QandQShrNnarrow64Uto32Ux2:
4264 case Iop_QandQSarNnarrow64Sto32Sx2:
4265 case Iop_QandQSarNnarrow64Sto32Ux2:
4266 case Iop_QandQRShrNnarrow64Uto32Ux2:
4267 case Iop_QandQRSarNnarrow64Sto32Sx2:
4268 case Iop_QandQRSarNnarrow64Sto32Ux2:
4269 case Iop_QandQShrNnarrow32Uto16Ux4:
4270 case Iop_QandQSarNnarrow32Sto16Sx4:
4271 case Iop_QandQSarNnarrow32Sto16Ux4:
4272 case Iop_QandQRShrNnarrow32Uto16Ux4:
4273 case Iop_QandQRSarNnarrow32Sto16Sx4:
4274 case Iop_QandQRSarNnarrow32Sto16Ux4:
4275 case Iop_QandQShrNnarrow16Uto8Ux8:
4276 case Iop_QandQSarNnarrow16Sto8Sx8:
4277 case Iop_QandQSarNnarrow16Sto8Ux8:
4278 case Iop_QandQRShrNnarrow16Uto8Ux8:
4279 case Iop_QandQRSarNnarrow16Sto8Sx8:
4280 case Iop_QandQRSarNnarrow16Sto8Ux8:
4282 IRAtom* (*fnPessim) (MCEnv*, IRAtom*) = NULL;
4283 IROp opNarrow = Iop_INVALID;
4284 switch (op) {
4285 case Iop_QandQShrNnarrow64Uto32Ux2:
4286 case Iop_QandQSarNnarrow64Sto32Sx2:
4287 case Iop_QandQSarNnarrow64Sto32Ux2:
4288 case Iop_QandQRShrNnarrow64Uto32Ux2:
4289 case Iop_QandQRSarNnarrow64Sto32Sx2:
4290 case Iop_QandQRSarNnarrow64Sto32Ux2:
4291 fnPessim = mkPCast64x2;
4292 opNarrow = Iop_NarrowUn64to32x2;
4293 break;
4294 case Iop_QandQShrNnarrow32Uto16Ux4:
4295 case Iop_QandQSarNnarrow32Sto16Sx4:
4296 case Iop_QandQSarNnarrow32Sto16Ux4:
4297 case Iop_QandQRShrNnarrow32Uto16Ux4:
4298 case Iop_QandQRSarNnarrow32Sto16Sx4:
4299 case Iop_QandQRSarNnarrow32Sto16Ux4:
4300 fnPessim = mkPCast32x4;
4301 opNarrow = Iop_NarrowUn32to16x4;
4302 break;
4303 case Iop_QandQShrNnarrow16Uto8Ux8:
4304 case Iop_QandQSarNnarrow16Sto8Sx8:
4305 case Iop_QandQSarNnarrow16Sto8Ux8:
4306 case Iop_QandQRShrNnarrow16Uto8Ux8:
4307 case Iop_QandQRSarNnarrow16Sto8Sx8:
4308 case Iop_QandQRSarNnarrow16Sto8Ux8:
4309 fnPessim = mkPCast16x8;
4310 opNarrow = Iop_NarrowUn16to8x8;
4311 break;
4312 default:
4313 tl_assert(0);
4315 complainIfUndefined(mce, atom2, NULL);
4316 // Pessimised shift result
4317 IRAtom* shV
4318 = fnPessim(mce, vatom1);
4319 // Narrowed, pessimised shift result
4320 IRAtom* shVnarrowed
4321 = assignNew('V', mce, Ity_I64, unop(opNarrow, shV));
4322 // Generates: Def--(63)--Def PCast-to-I1(narrowed)
4323 IRAtom* qV = mkPCastXXtoXXlsb(mce, shVnarrowed, Ity_I64);
4324 // and assemble the result
4325 return assignNew('V', mce, Ity_V128,
4326 binop(Iop_64HLtoV128, qV, shVnarrowed));
4329 case Iop_Mull32Sx2:
4330 case Iop_Mull32Ux2:
4331 case Iop_QDMull32Sx2:
4332 return vectorWidenI64(mce, Iop_Widen32Sto64x2,
4333 mkUifU64(mce, vatom1, vatom2));
4335 case Iop_Mull16Sx4:
4336 case Iop_Mull16Ux4:
4337 case Iop_QDMull16Sx4:
4338 return vectorWidenI64(mce, Iop_Widen16Sto32x4,
4339 mkUifU64(mce, vatom1, vatom2));
4341 case Iop_Mull8Sx8:
4342 case Iop_Mull8Ux8:
4343 case Iop_PolynomialMull8x8:
4344 return vectorWidenI64(mce, Iop_Widen8Sto16x8,
4345 mkUifU64(mce, vatom1, vatom2));
4347 case Iop_PwAdd32x4:
4348 return mkPCast32x4(mce,
4349 assignNew('V', mce, Ity_V128, binop(op, mkPCast32x4(mce, vatom1),
4350 mkPCast32x4(mce, vatom2))));
4352 case Iop_PwAdd16x8:
4353 return mkPCast16x8(mce,
4354 assignNew('V', mce, Ity_V128, binop(op, mkPCast16x8(mce, vatom1),
4355 mkPCast16x8(mce, vatom2))));
4357 case Iop_PwAdd8x16:
4358 return mkPCast8x16(mce,
4359 assignNew('V', mce, Ity_V128, binop(op, mkPCast8x16(mce, vatom1),
4360 mkPCast8x16(mce, vatom2))));
4362 /* V128-bit data-steering */
4363 case Iop_SetV128lo32:
4364 case Iop_SetV128lo64:
4365 case Iop_64HLtoV128:
4366 case Iop_InterleaveLO64x2:
4367 case Iop_InterleaveLO32x4:
4368 case Iop_InterleaveLO16x8:
4369 case Iop_InterleaveLO8x16:
4370 case Iop_InterleaveHI64x2:
4371 case Iop_InterleaveHI32x4:
4372 case Iop_InterleaveHI16x8:
4373 case Iop_InterleaveHI8x16:
4374 case Iop_CatOddLanes8x16:
4375 case Iop_CatOddLanes16x8:
4376 case Iop_CatOddLanes32x4:
4377 case Iop_CatEvenLanes8x16:
4378 case Iop_CatEvenLanes16x8:
4379 case Iop_CatEvenLanes32x4:
4380 case Iop_InterleaveOddLanes8x16:
4381 case Iop_InterleaveOddLanes16x8:
4382 case Iop_InterleaveOddLanes32x4:
4383 case Iop_InterleaveEvenLanes8x16:
4384 case Iop_InterleaveEvenLanes16x8:
4385 case Iop_InterleaveEvenLanes32x4:
4386 case Iop_PackOddLanes8x16:
4387 case Iop_PackOddLanes16x8:
4388 case Iop_PackOddLanes32x4:
4389 case Iop_PackEvenLanes8x16:
4390 case Iop_PackEvenLanes16x8:
4391 case Iop_PackEvenLanes32x4:
4392 return assignNew('V', mce, Ity_V128, binop(op, vatom1, vatom2));
4394 case Iop_GetElem8x16:
4395 complainIfUndefined(mce, atom2, NULL);
4396 return assignNew('V', mce, Ity_I8, binop(op, vatom1, atom2));
4397 case Iop_GetElem16x8:
4398 complainIfUndefined(mce, atom2, NULL);
4399 return assignNew('V', mce, Ity_I16, binop(op, vatom1, atom2));
4400 case Iop_GetElem32x4:
4401 complainIfUndefined(mce, atom2, NULL);
4402 return assignNew('V', mce, Ity_I32, binop(op, vatom1, atom2));
4403 case Iop_GetElem64x2:
4404 complainIfUndefined(mce, atom2, NULL);
4405 return assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2));
4407 /* Perm8x16: rearrange values in left arg using steering values
4408 from right arg. So rearrange the vbits in the same way but
4409 pessimise wrt steering values. Perm32x4 ditto. */
4410 /* PermOrZero8x16: see comments above for PermOrZero8x8. */
4411 case Iop_Perm8x16:
4412 case Iop_PermOrZero8x16:
4413 return mkUifUV128(
4414 mce,
4415 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
4416 mkPCast8x16(mce, vatom2)
4418 case Iop_Perm32x4:
4419 return mkUifUV128(
4420 mce,
4421 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
4422 mkPCast32x4(mce, vatom2)
4425 /* These two take the lower half of each 16-bit lane, sign/zero
4426 extend it to 32, and multiply together, producing a 32x4
4427 result (and implicitly ignoring half the operand bits). So
4428 treat it as a bunch of independent 16x8 operations, but then
4429 do 32-bit shifts left-right to copy the lower half results
4430 (which are all 0s or all 1s due to PCasting in binary16Ix8)
4431 into the upper half of each result lane. */
4432 case Iop_MullEven16Ux8:
4433 case Iop_MullEven16Sx8: {
4434 IRAtom* at;
4435 at = binary16Ix8(mce,vatom1,vatom2);
4436 at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN32x4, at, mkU8(16)));
4437 at = assignNew('V', mce, Ity_V128, binop(Iop_SarN32x4, at, mkU8(16)));
4438 return at;
4441 /* Same deal as Iop_MullEven16{S,U}x8 */
4442 case Iop_MullEven8Ux16:
4443 case Iop_MullEven8Sx16: {
4444 IRAtom* at;
4445 at = binary8Ix16(mce,vatom1,vatom2);
4446 at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN16x8, at, mkU8(8)));
4447 at = assignNew('V', mce, Ity_V128, binop(Iop_SarN16x8, at, mkU8(8)));
4448 return at;
4451 /* Same deal as Iop_MullEven16{S,U}x8 */
4452 case Iop_MullEven32Ux4:
4453 case Iop_MullEven32Sx4: {
4454 IRAtom* at;
4455 at = binary32Ix4(mce,vatom1,vatom2);
4456 at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN64x2, at, mkU8(32)));
4457 at = assignNew('V', mce, Ity_V128, binop(Iop_SarN64x2, at, mkU8(32)));
4458 return at;
4461 /* narrow 2xV128 into 1xV128, hi half from left arg, in a 2 x
4462 32x4 -> 16x8 laneage, discarding the upper half of each lane.
4463 Simply apply same op to the V bits, since this is really no more
4464 than a data steering operation. */
4465 case Iop_NarrowBin32to16x8:
4466 case Iop_NarrowBin16to8x16:
4467 case Iop_NarrowBin64to32x4:
4468 return assignNew('V', mce, Ity_V128,
4469 binop(op, vatom1, vatom2));
4471 case Iop_ShrV128:
4472 case Iop_SarV128:
4473 case Iop_ShlV128:
4474 case Iop_I128StoBCD128:
4475 /* Same scheme as with all other shifts. Note: 10 Nov 05:
4476 this is wrong now, scalar shifts are done properly lazily.
4477 Vector shifts should be fixed too. */
4478 complainIfUndefined(mce, atom2, NULL);
4479 return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
4481 case Iop_I128UtoF128: /* I128 -> F128 */
4482 case Iop_I128StoF128: /* I128 -> F128 */
4483 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4485 case Iop_BCDAdd:
4486 case Iop_BCDSub:
4487 return mkLazy2(mce, Ity_V128, vatom1, vatom2);
4489 /* SHA Iops */
4490 case Iop_SHA256:
4491 case Iop_SHA512:
4492 complainIfUndefined(mce, atom2, NULL);
4493 return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
4495 /* I128-bit data-steering */
4496 case Iop_64HLto128:
4497 return assignNew('V', mce, Ity_I128, binop(op, vatom1, vatom2));
4499 /* V256-bit SIMD */
4501 case Iop_Max64Fx4:
4502 case Iop_Min64Fx4:
4503 return binary64Fx4(mce, vatom1, vatom2);
4505 case Iop_Max32Fx8:
4506 case Iop_Min32Fx8:
4507 return binary32Fx8(mce, vatom1, vatom2);
4509 /* V256-bit data-steering */
4510 case Iop_V128HLtoV256:
4511 return assignNew('V', mce, Ity_V256, binop(op, vatom1, vatom2));
4513 /* Scalar floating point */
4515 case Iop_F32toI64S:
4516 case Iop_F32toI64U:
4517 /* I32(rm) x F32 -> I64 */
4518 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4520 case Iop_I64StoF32:
4521 /* I32(rm) x I64 -> F32 */
4522 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4524 case Iop_RoundF64toInt:
4525 case Iop_RoundF64toF32:
4526 case Iop_F64toI64S:
4527 case Iop_F64toI64U:
4528 case Iop_I64StoF64:
4529 case Iop_I64UtoF64:
4530 case Iop_SinF64:
4531 case Iop_CosF64:
4532 case Iop_TanF64:
4533 case Iop_2xm1F64:
4534 case Iop_SqrtF64:
4535 case Iop_RecpExpF64:
4536 /* I32(rm) x I64/F64 -> I64/F64 */
4537 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4539 case Iop_ShlD64:
4540 case Iop_ShrD64:
4541 case Iop_RoundD64toInt:
4542 /* I32(rm) x D64 -> D64 */
4543 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4545 case Iop_ShlD128:
4546 case Iop_ShrD128:
4547 case Iop_RoundD128toInt:
4548 /* I32(rm) x D128 -> D128 */
4549 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4551 case Iop_RoundF128toInt:
4552 /* I32(rm) x F128 -> F128 */
4553 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4555 case Iop_D64toI64S:
4556 case Iop_D64toI64U:
4557 case Iop_I64StoD64:
4558 case Iop_I64UtoD64:
4559 /* I32(rm) x I64/D64 -> D64/I64 */
4560 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4562 case Iop_F32toD32:
4563 case Iop_F64toD32:
4564 case Iop_F128toD32:
4565 case Iop_D32toF32:
4566 case Iop_D64toF32:
4567 case Iop_D128toF32:
4568 /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D32/F32 */
4569 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4571 case Iop_F32toD64:
4572 case Iop_F64toD64:
4573 case Iop_F128toD64:
4574 case Iop_D32toF64:
4575 case Iop_D64toF64:
4576 case Iop_D128toF64:
4577 /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D64/F64 */
4578 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4580 case Iop_F32toD128:
4581 case Iop_F64toD128:
4582 case Iop_F128toD128:
4583 case Iop_D32toF128:
4584 case Iop_D64toF128:
4585 case Iop_D128toF128:
4586 case Iop_I128StoD128:
4587 /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D128/F128 */
4588 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4590 case Iop_SqrtF16:
4591 /* I32(rm) x F16 -> F16 */
4592 return mkLazy2(mce, Ity_I16, vatom1, vatom2);
4594 case Iop_RoundF32toInt:
4595 case Iop_SqrtF32:
4596 case Iop_RecpExpF32:
4597 /* I32(rm) x I32/F32 -> I32/F32 */
4598 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4600 case Iop_SqrtF128:
4601 /* I32(rm) x F128 -> F128 */
4602 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4604 case Iop_I32StoF32:
4605 case Iop_I32UtoF32:
4606 case Iop_F32toI32S:
4607 case Iop_F32toI32U:
4608 /* First arg is I32 (rounding mode), second is F32/I32 (data). */
4609 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4611 case Iop_F64toF16:
4612 case Iop_F32toF16:
4613 /* First arg is I32 (rounding mode), second is F64/F32 (data). */
4614 return mkLazy2(mce, Ity_I16, vatom1, vatom2);
4616 case Iop_F128toI32S: /* IRRoundingMode(I32) x F128 -> signed I32 */
4617 case Iop_F128toI32U: /* IRRoundingMode(I32) x F128 -> unsigned I32 */
4618 case Iop_F128toF32: /* IRRoundingMode(I32) x F128 -> F32 */
4619 case Iop_D128toI32S: /* IRRoundingMode(I32) x D128 -> signed I32 */
4620 case Iop_D128toI32U: /* IRRoundingMode(I32) x D128 -> unsigned I32 */
4621 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4623 case Iop_F128toI128S: /* IRRoundingMode(I32) x F128 -> signed I128 */
4624 case Iop_RndF128: /* IRRoundingMode(I32) x F128 -> F128 */
4625 case Iop_D128toI128S: /* IRRoundingMode(I32) x D128 -> signed I128 */
4626 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4628 case Iop_F128toI64S: /* IRRoundingMode(I32) x F128 -> signed I64 */
4629 case Iop_F128toI64U: /* IRRoundingMode(I32) x F128 -> unsigned I64 */
4630 case Iop_F128toF64: /* IRRoundingMode(I32) x F128 -> F64 */
4631 case Iop_D128toD64: /* IRRoundingMode(I64) x D128 -> D64 */
4632 case Iop_D128toI64S: /* IRRoundingMode(I64) x D128 -> signed I64 */
4633 case Iop_D128toI64U: /* IRRoundingMode(I32) x D128 -> unsigned I64 */
4634 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4636 case Iop_F64HLtoF128:
4637 case Iop_D64HLtoD128:
4638 return assignNew('V', mce, Ity_I128,
4639 binop(Iop_64HLto128, vatom1, vatom2));
4641 case Iop_F64toI32U:
4642 case Iop_F64toI32S:
4643 case Iop_F64toF32:
4644 case Iop_I64UtoF32:
4645 case Iop_D64toI32U:
4646 case Iop_D64toI32S:
4647 /* First arg is I32 (rounding mode), second is F64/D64 (data). */
4648 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4650 case Iop_D64toD32:
4651 /* First arg is I32 (rounding mode), second is D64 (data). */
4652 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4654 case Iop_F64toI16S:
4655 /* First arg is I32 (rounding mode), second is F64 (data). */
4656 return mkLazy2(mce, Ity_I16, vatom1, vatom2);
4658 case Iop_InsertExpD64:
4659 /* I64 x I64 -> D64 */
4660 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4662 case Iop_InsertExpD128:
4663 /* I64 x I128 -> D128 */
4664 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4666 case Iop_CmpF16:
4667 case Iop_CmpF32:
4668 case Iop_CmpF64:
4669 case Iop_CmpF128:
4670 case Iop_CmpD64:
4671 case Iop_CmpD128:
4672 case Iop_CmpExpD64:
4673 case Iop_CmpExpD128:
4674 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4676 case Iop_MaxNumF32:
4677 case Iop_MinNumF32:
4678 /* F32 x F32 -> F32 */
4679 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4681 case Iop_MaxNumF64:
4682 case Iop_MinNumF64:
4683 /* F64 x F64 -> F64 */
4684 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4686 /* non-FP after here */
4688 case Iop_DivModU64to32:
4689 case Iop_DivModS64to32:
4690 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4692 case Iop_DivModU128to64:
4693 case Iop_DivModS128to64:
4694 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4696 case Iop_8HLto16:
4697 return assignNew('V', mce, Ity_I16, binop(op, vatom1, vatom2));
4698 case Iop_16HLto32:
4699 return assignNew('V', mce, Ity_I32, binop(op, vatom1, vatom2));
4700 case Iop_32HLto64:
4701 return assignNew('V', mce, Ity_I64, binop(op, vatom1, vatom2));
4703 case Iop_DivModU64to64:
4704 case Iop_DivModS64to64: {
4705 IRAtom* vTmp64 = mkLazy2(mce, Ity_I64, vatom1, vatom2);
4706 return assignNew('V', mce, Ity_I128,
4707 binop(Iop_64HLto128, vTmp64, vTmp64));
4710 case Iop_MullS64:
4711 case Iop_MullU64: {
4712 IRAtom* vLo64 = mkLeft64(mce, mkUifU64(mce, vatom1,vatom2));
4713 IRAtom* vHi64 = mkPCastTo(mce, Ity_I64, vLo64);
4714 return assignNew('V', mce, Ity_I128,
4715 binop(Iop_64HLto128, vHi64, vLo64));
4718 case Iop_DivModU32to32:
4719 case Iop_DivModS32to32: {
4720 IRAtom* vTmp32 = mkLazy2(mce, Ity_I32, vatom1, vatom2);
4721 return assignNew('V', mce, Ity_I64,
4722 binop(Iop_32HLto64, vTmp32, vTmp32));
4725 case Iop_MullS32:
4726 case Iop_MullU32: {
4727 IRAtom* vLo32 = mkLeft32(mce, mkUifU32(mce, vatom1,vatom2));
4728 IRAtom* vHi32 = mkPCastTo(mce, Ity_I32, vLo32);
4729 return assignNew('V', mce, Ity_I64,
4730 binop(Iop_32HLto64, vHi32, vLo32));
4733 case Iop_MullS16:
4734 case Iop_MullU16: {
4735 IRAtom* vLo16 = mkLeft16(mce, mkUifU16(mce, vatom1,vatom2));
4736 IRAtom* vHi16 = mkPCastTo(mce, Ity_I16, vLo16);
4737 return assignNew('V', mce, Ity_I32,
4738 binop(Iop_16HLto32, vHi16, vLo16));
4741 case Iop_MullS8:
4742 case Iop_MullU8: {
4743 IRAtom* vLo8 = mkLeft8(mce, mkUifU8(mce, vatom1,vatom2));
4744 IRAtom* vHi8 = mkPCastTo(mce, Ity_I8, vLo8);
4745 return assignNew('V', mce, Ity_I16, binop(Iop_8HLto16, vHi8, vLo8));
4748 case Iop_Sad8Ux4: /* maybe we could do better? ftm, do mkLazy2. */
4749 case Iop_DivS32:
4750 case Iop_DivU32:
4751 case Iop_DivU32E:
4752 case Iop_DivS32E:
4753 case Iop_QAdd32S: /* could probably do better */
4754 case Iop_QSub32S: /* could probably do better */
4755 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4757 case Iop_DivS64:
4758 case Iop_DivU64:
4759 case Iop_DivS64E:
4760 case Iop_DivU64E:
4761 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4763 case Iop_Add32:
4764 if (mce->dlbo.dl_Add32 == DLexpensive
4765 || (mce->dlbo.dl_Add32 == DLauto && hu == HuOth)) {
4766 return expensiveAddSub(mce,True,Ity_I32,
4767 vatom1,vatom2, atom1,atom2);
4768 } else {
4769 goto cheap_AddSub32;
4771 case Iop_Sub32:
4772 if (mce->dlbo.dl_Sub32 == DLexpensive
4773 || (mce->dlbo.dl_Sub32 == DLauto && hu == HuOth)) {
4774 return expensiveAddSub(mce,False,Ity_I32,
4775 vatom1,vatom2, atom1,atom2);
4776 } else {
4777 goto cheap_AddSub32;
4780 cheap_AddSub32:
4781 case Iop_Mul32:
4782 return mkLeft32(mce, mkUifU32(mce, vatom1,vatom2));
4784 case Iop_CmpORD32S:
4785 case Iop_CmpORD32U:
4786 case Iop_CmpORD64S:
4787 case Iop_CmpORD64U:
4788 return doCmpORD(mce, op, vatom1,vatom2, atom1,atom2);
4790 case Iop_Add64:
4791 if (mce->dlbo.dl_Add64 == DLexpensive
4792 || (mce->dlbo.dl_Add64 == DLauto && hu == HuOth)) {
4793 return expensiveAddSub(mce,True,Ity_I64,
4794 vatom1,vatom2, atom1,atom2);
4795 } else {
4796 goto cheap_AddSub64;
4798 case Iop_Sub64:
4799 if (mce->dlbo.dl_Sub64 == DLexpensive
4800 || (mce->dlbo.dl_Sub64 == DLauto && hu == HuOth)) {
4801 return expensiveAddSub(mce,False,Ity_I64,
4802 vatom1,vatom2, atom1,atom2);
4803 } else {
4804 goto cheap_AddSub64;
4807 cheap_AddSub64:
4808 case Iop_Mul64:
4809 return mkLeft64(mce, mkUifU64(mce, vatom1,vatom2));
4811 case Iop_Mul16:
4812 case Iop_Add16:
4813 case Iop_Sub16:
4814 return mkLeft16(mce, mkUifU16(mce, vatom1,vatom2));
4816 case Iop_Mul8:
4817 case Iop_Sub8:
4818 case Iop_Add8:
4819 return mkLeft8(mce, mkUifU8(mce, vatom1,vatom2));
4821 ////---- CmpXX64
4822 case Iop_CmpEQ64: case Iop_CmpNE64:
4823 if (mce->dlbo.dl_CmpEQ64_CmpNE64 == DLexpensive)
4824 goto expensive_cmp64;
4825 else
4826 goto cheap_cmp64;
4828 expensive_cmp64:
4829 case Iop_ExpCmpNE64:
4830 return expensiveCmpEQorNE(mce,Ity_I64, vatom1,vatom2, atom1,atom2 );
4832 cheap_cmp64:
4833 case Iop_CmpLE64S: case Iop_CmpLE64U:
4834 case Iop_CmpLT64U: case Iop_CmpLT64S:
4835 return mkPCastTo(mce, Ity_I1, mkUifU64(mce, vatom1,vatom2));
4837 ////---- CmpXX32
4838 case Iop_CmpEQ32: case Iop_CmpNE32:
4839 if (mce->dlbo.dl_CmpEQ32_CmpNE32 == DLexpensive)
4840 goto expensive_cmp32;
4841 else
4842 goto cheap_cmp32;
4844 expensive_cmp32:
4845 case Iop_ExpCmpNE32:
4846 return expensiveCmpEQorNE(mce,Ity_I32, vatom1,vatom2, atom1,atom2 );
4848 cheap_cmp32:
4849 case Iop_CmpLE32S: case Iop_CmpLE32U:
4850 case Iop_CmpLT32U: case Iop_CmpLT32S:
4851 return mkPCastTo(mce, Ity_I1, mkUifU32(mce, vatom1,vatom2));
4853 ////---- CmpXX16
4854 case Iop_CmpEQ16: case Iop_CmpNE16:
4855 if (mce->dlbo.dl_CmpEQ16_CmpNE16 == DLexpensive)
4856 goto expensive_cmp16;
4857 else
4858 goto cheap_cmp16;
4860 expensive_cmp16:
4861 case Iop_ExpCmpNE16:
4862 return expensiveCmpEQorNE(mce,Ity_I16, vatom1,vatom2, atom1,atom2 );
4864 cheap_cmp16:
4865 return mkPCastTo(mce, Ity_I1, mkUifU16(mce, vatom1,vatom2));
4867 ////---- CmpXX8
4868 case Iop_CmpEQ8: case Iop_CmpNE8:
4869 if (mce->dlbo.dl_CmpEQ8_CmpNE8 == DLexpensive)
4870 goto expensive_cmp8;
4871 else
4872 goto cheap_cmp8;
4874 expensive_cmp8:
4875 return expensiveCmpEQorNE(mce,Ity_I8, vatom1,vatom2, atom1,atom2 );
4877 cheap_cmp8:
4878 return mkPCastTo(mce, Ity_I1, mkUifU8(mce, vatom1,vatom2));
4880 ////---- end CmpXX{64,32,16,8}
4882 case Iop_CasCmpEQ8: case Iop_CasCmpNE8:
4883 case Iop_CasCmpEQ16: case Iop_CasCmpNE16:
4884 case Iop_CasCmpEQ32: case Iop_CasCmpNE32:
4885 case Iop_CasCmpEQ64: case Iop_CasCmpNE64:
4886 /* Just say these all produce a defined result, regardless
4887 of their arguments. See COMMENT_ON_CasCmpEQ in this file. */
4888 return assignNew('V', mce, Ity_I1, definedOfType(Ity_I1));
4890 case Iop_Shl64: case Iop_Shr64: case Iop_Sar64:
4891 return scalarShift( mce, Ity_I64, op, vatom1,vatom2, atom1,atom2 );
4893 case Iop_Shl32: case Iop_Shr32: case Iop_Sar32:
4894 return scalarShift( mce, Ity_I32, op, vatom1,vatom2, atom1,atom2 );
4896 case Iop_Shl16: case Iop_Shr16: case Iop_Sar16:
4897 return scalarShift( mce, Ity_I16, op, vatom1,vatom2, atom1,atom2 );
4899 case Iop_Shl8: case Iop_Shr8: case Iop_Sar8:
4900 return scalarShift( mce, Ity_I8, op, vatom1,vatom2, atom1,atom2 );
4902 case Iop_AndV256:
4903 uifu = mkUifUV256; difd = mkDifDV256;
4904 and_or_ty = Ity_V256; improve = mkImproveANDV256; goto do_And_Or;
4905 case Iop_AndV128:
4906 uifu = mkUifUV128; difd = mkDifDV128;
4907 and_or_ty = Ity_V128; improve = mkImproveANDV128; goto do_And_Or;
4908 case Iop_And64:
4909 uifu = mkUifU64; difd = mkDifD64;
4910 and_or_ty = Ity_I64; improve = mkImproveAND64; goto do_And_Or;
4911 case Iop_And32:
4912 uifu = mkUifU32; difd = mkDifD32;
4913 and_or_ty = Ity_I32; improve = mkImproveAND32; goto do_And_Or;
4914 case Iop_And16:
4915 uifu = mkUifU16; difd = mkDifD16;
4916 and_or_ty = Ity_I16; improve = mkImproveAND16; goto do_And_Or;
4917 case Iop_And8:
4918 uifu = mkUifU8; difd = mkDifD8;
4919 and_or_ty = Ity_I8; improve = mkImproveAND8; goto do_And_Or;
4920 case Iop_And1:
4921 uifu = mkUifU1; difd = mkDifD1;
4922 and_or_ty = Ity_I1; improve = mkImproveAND1; goto do_And_Or;
4924 case Iop_OrV256:
4925 uifu = mkUifUV256; difd = mkDifDV256;
4926 and_or_ty = Ity_V256; improve = mkImproveORV256; goto do_And_Or;
4927 case Iop_OrV128:
4928 uifu = mkUifUV128; difd = mkDifDV128;
4929 and_or_ty = Ity_V128; improve = mkImproveORV128; goto do_And_Or;
4930 case Iop_Or64:
4931 uifu = mkUifU64; difd = mkDifD64;
4932 and_or_ty = Ity_I64; improve = mkImproveOR64; goto do_And_Or;
4933 case Iop_Or32:
4934 uifu = mkUifU32; difd = mkDifD32;
4935 and_or_ty = Ity_I32; improve = mkImproveOR32; goto do_And_Or;
4936 case Iop_Or16:
4937 uifu = mkUifU16; difd = mkDifD16;
4938 and_or_ty = Ity_I16; improve = mkImproveOR16; goto do_And_Or;
4939 case Iop_Or8:
4940 uifu = mkUifU8; difd = mkDifD8;
4941 and_or_ty = Ity_I8; improve = mkImproveOR8; goto do_And_Or;
4942 case Iop_Or1:
4943 uifu = mkUifU1; difd = mkDifD1;
4944 and_or_ty = Ity_I1; improve = mkImproveOR1; goto do_And_Or;
4946 do_And_Or:
4947 return assignNew('V', mce, and_or_ty,
4948 difd(mce, uifu(mce, vatom1, vatom2),
4949 difd(mce, improve(mce, atom1, vatom1),
4950 improve(mce, atom2, vatom2) ) ) );
4952 case Iop_Xor8:
4953 return mkUifU8(mce, vatom1, vatom2);
4954 case Iop_Xor16:
4955 return mkUifU16(mce, vatom1, vatom2);
4956 case Iop_Xor32:
4957 return mkUifU32(mce, vatom1, vatom2);
4958 case Iop_Xor64:
4959 return mkUifU64(mce, vatom1, vatom2);
4960 case Iop_XorV128:
4961 return mkUifUV128(mce, vatom1, vatom2);
4962 case Iop_XorV256:
4963 return mkUifUV256(mce, vatom1, vatom2);
4965 /* V256-bit SIMD */
4967 case Iop_ShrN16x16:
4968 case Iop_ShrN32x8:
4969 case Iop_ShrN64x4:
4970 case Iop_SarN16x16:
4971 case Iop_SarN32x8:
4972 case Iop_ShlN16x16:
4973 case Iop_ShlN32x8:
4974 case Iop_ShlN64x4:
4975 /* Same scheme as with all other shifts. Note: 22 Oct 05:
4976 this is wrong now, scalar shifts are done properly lazily.
4977 Vector shifts should be fixed too. */
4978 complainIfUndefined(mce, atom2, NULL);
4979 return assignNew('V', mce, Ity_V256, binop(op, vatom1, atom2));
4981 case Iop_QSub8Ux32:
4982 case Iop_QSub8Sx32:
4983 case Iop_Sub8x32:
4984 case Iop_Min8Ux32:
4985 case Iop_Min8Sx32:
4986 case Iop_Max8Ux32:
4987 case Iop_Max8Sx32:
4988 case Iop_CmpGT8Sx32:
4989 case Iop_CmpEQ8x32:
4990 case Iop_Avg8Ux32:
4991 case Iop_QAdd8Ux32:
4992 case Iop_QAdd8Sx32:
4993 case Iop_Add8x32:
4994 return binary8Ix32(mce, vatom1, vatom2);
4996 case Iop_QSub16Ux16:
4997 case Iop_QSub16Sx16:
4998 case Iop_Sub16x16:
4999 case Iop_Mul16x16:
5000 case Iop_MulHi16Sx16:
5001 case Iop_MulHi16Ux16:
5002 case Iop_Min16Sx16:
5003 case Iop_Min16Ux16:
5004 case Iop_Max16Sx16:
5005 case Iop_Max16Ux16:
5006 case Iop_CmpGT16Sx16:
5007 case Iop_CmpEQ16x16:
5008 case Iop_Avg16Ux16:
5009 case Iop_QAdd16Ux16:
5010 case Iop_QAdd16Sx16:
5011 case Iop_Add16x16:
5012 return binary16Ix16(mce, vatom1, vatom2);
5014 case Iop_Sub32x8:
5015 case Iop_CmpGT32Sx8:
5016 case Iop_CmpEQ32x8:
5017 case Iop_Add32x8:
5018 case Iop_Max32Ux8:
5019 case Iop_Max32Sx8:
5020 case Iop_Min32Ux8:
5021 case Iop_Min32Sx8:
5022 case Iop_Mul32x8:
5023 return binary32Ix8(mce, vatom1, vatom2);
5025 case Iop_Sub64x4:
5026 case Iop_Add64x4:
5027 case Iop_CmpEQ64x4:
5028 case Iop_CmpGT64Sx4:
5029 return binary64Ix4(mce, vatom1, vatom2);
5031 case Iop_I32StoF32x8:
5032 case Iop_F32toI32Sx8:
5033 return unary32Fx8_w_rm(mce, vatom1, vatom2);
5035 /* Perm32x8: rearrange values in left arg using steering values
5036 from right arg. So rearrange the vbits in the same way but
5037 pessimise wrt steering values. */
5038 case Iop_Perm32x8:
5039 return mkUifUV256(
5040 mce,
5041 assignNew('V', mce, Ity_V256, binop(op, vatom1, atom2)),
5042 mkPCast32x8(mce, vatom2)
5045 /* Q-and-Qshift-by-vector of the form (V128, V128) -> V256.
5046 Handle the shifted results in the same way that other
5047 binary Q ops are handled, eg QSub: UifU the two args,
5048 then pessimise -- which is binaryNIxM. But for the upper
5049 V128, we require to generate just 1 bit which is the
5050 pessimised shift result, with 127 defined zeroes above it.
5052 Note that this is overly pessimistic in that in fact only the
5053 bottom 8 bits of each lane of the second arg determine the shift
5054 amount. Really we ought to ignore any undefinedness in the
5055 rest of the lanes of the second arg. */
5056 case Iop_QandSQsh64x2: case Iop_QandUQsh64x2:
5057 case Iop_QandSQRsh64x2: case Iop_QandUQRsh64x2:
5058 case Iop_QandSQsh32x4: case Iop_QandUQsh32x4:
5059 case Iop_QandSQRsh32x4: case Iop_QandUQRsh32x4:
5060 case Iop_QandSQsh16x8: case Iop_QandUQsh16x8:
5061 case Iop_QandSQRsh16x8: case Iop_QandUQRsh16x8:
5062 case Iop_QandSQsh8x16: case Iop_QandUQsh8x16:
5063 case Iop_QandSQRsh8x16: case Iop_QandUQRsh8x16:
5065 // The function to generate the pessimised shift result
5066 IRAtom* (*binaryNIxM)(MCEnv*,IRAtom*,IRAtom*) = NULL;
5067 switch (op) {
5068 case Iop_QandSQsh64x2:
5069 case Iop_QandUQsh64x2:
5070 case Iop_QandSQRsh64x2:
5071 case Iop_QandUQRsh64x2:
5072 binaryNIxM = binary64Ix2;
5073 break;
5074 case Iop_QandSQsh32x4:
5075 case Iop_QandUQsh32x4:
5076 case Iop_QandSQRsh32x4:
5077 case Iop_QandUQRsh32x4:
5078 binaryNIxM = binary32Ix4;
5079 break;
5080 case Iop_QandSQsh16x8:
5081 case Iop_QandUQsh16x8:
5082 case Iop_QandSQRsh16x8:
5083 case Iop_QandUQRsh16x8:
5084 binaryNIxM = binary16Ix8;
5085 break;
5086 case Iop_QandSQsh8x16:
5087 case Iop_QandUQsh8x16:
5088 case Iop_QandSQRsh8x16:
5089 case Iop_QandUQRsh8x16:
5090 binaryNIxM = binary8Ix16;
5091 break;
5092 default:
5093 tl_assert(0);
5095 tl_assert(binaryNIxM);
5096 // Pessimised shift result, shV[127:0]
5097 IRAtom* shV = binaryNIxM(mce, vatom1, vatom2);
5098 // Generates: Def--(127)--Def PCast-to-I1(shV)
5099 IRAtom* qV = mkPCastXXtoXXlsb(mce, shV, Ity_V128);
5100 // and assemble the result
5101 return assignNew('V', mce, Ity_V256,
5102 binop(Iop_V128HLtoV256, qV, shV));
5105 case Iop_F32toF16x4: {
5106 // First, PCast the input vector, retaining the 32x4 format.
5107 IRAtom* pcasted = mkPCast32x4(mce, vatom2); // :: 32x4
5108 // Now truncate each 32 bit lane to 16 bits. Since we already PCasted
5109 // the input, we're not going to lose any information.
5110 IRAtom* pcHI64
5111 = assignNew('V', mce, Ity_I64, unop(Iop_V128HIto64, pcasted));//32x2
5112 IRAtom* pcLO64
5113 = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, pcasted)); // 32x2
5114 IRAtom* narrowed
5115 = assignNew('V', mce, Ity_I64, binop(Iop_NarrowBin32to16x4,
5116 pcHI64, pcLO64)); // 16x4
5117 // Finally, roll in any badness from the rounding mode.
5118 IRAtom* rmPCasted = mkPCastTo(mce, Ity_I64, vatom1);
5119 return mkUifU64(mce, narrowed, rmPCasted);
5122 case Iop_F32toF16x8: {
5123 // Same scheme as for Iop_F32toF16x4.
5124 IRAtom* pcasted = mkPCast32x8(mce, vatom2); // :: 32x8
5125 IRAtom* pcHI128
5126 = assignNew('V', mce, Ity_V128, unop(Iop_V256toV128_1,
5127 pcasted)); // 32x4
5128 IRAtom* pcLO128
5129 = assignNew('V', mce, Ity_V128, unop(Iop_V256toV128_0,
5130 pcasted)); // 32x4
5131 IRAtom* narrowed
5132 = assignNew('V', mce, Ity_V128, binop(Iop_NarrowBin32to16x8,
5133 pcHI128, pcLO128)); // 16x8
5134 // Finally, roll in any badness from the rounding mode.
5135 IRAtom* rmPCasted = mkPCastTo(mce, Ity_V128, vatom1);
5136 return mkUifUV128(mce, narrowed, rmPCasted);
5139 default:
5140 ppIROp(op);
5141 VG_(tool_panic)("memcheck:expr2vbits_Binop");
5146 static
5147 IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
5149 /* For the widening operations {8,16,32}{U,S}to{16,32,64}, the
5150 selection of shadow operation implicitly duplicates the logic in
5151 do_shadow_LoadG and should be kept in sync (in the very unlikely
5152 event that the interpretation of such widening ops changes in
5153 future). See comment in do_shadow_LoadG. */
5154 IRAtom* vatom = expr2vbits( mce, atom, HuOth );
5155 tl_assert(isOriginalAtom(mce,atom));
5156 switch (op) {
5158 case Iop_Abs64Fx2:
5159 case Iop_Neg64Fx2:
5160 case Iop_RSqrtEst64Fx2:
5161 case Iop_RecipEst64Fx2:
5162 case Iop_Log2_64Fx2:
5163 return unary64Fx2(mce, vatom);
5165 case Iop_Sqrt64F0x2:
5166 return unary64F0x2(mce, vatom);
5168 case Iop_Sqrt32Fx8:
5169 case Iop_RSqrtEst32Fx8:
5170 case Iop_RecipEst32Fx8:
5171 return unary32Fx8(mce, vatom);
5173 case Iop_Sqrt64Fx4:
5174 return unary64Fx4(mce, vatom);
5176 case Iop_RecipEst32Fx4:
5177 case Iop_I32UtoF32x4_DEP:
5178 case Iop_I32StoF32x4_DEP:
5179 case Iop_QF32toI32Ux4_RZ:
5180 case Iop_QF32toI32Sx4_RZ:
5181 case Iop_RoundF32x4_RM:
5182 case Iop_RoundF32x4_RP:
5183 case Iop_RoundF32x4_RN:
5184 case Iop_RoundF32x4_RZ:
5185 case Iop_RecipEst32Ux4:
5186 case Iop_Abs32Fx4:
5187 case Iop_Neg32Fx4:
5188 case Iop_RSqrtEst32Fx4:
5189 case Iop_Log2_32Fx4:
5190 case Iop_Exp2_32Fx4:
5191 return unary32Fx4(mce, vatom);
5193 case Iop_I32UtoF32x2_DEP:
5194 case Iop_I32StoF32x2_DEP:
5195 case Iop_RecipEst32Fx2:
5196 case Iop_RecipEst32Ux2:
5197 case Iop_Abs32Fx2:
5198 case Iop_Neg32Fx2:
5199 case Iop_RSqrtEst32Fx2:
5200 return unary32Fx2(mce, vatom);
5202 case Iop_Sqrt32F0x4:
5203 case Iop_RSqrtEst32F0x4:
5204 case Iop_RecipEst32F0x4:
5205 return unary32F0x4(mce, vatom);
5207 case Iop_Abs16Fx8:
5208 case Iop_Neg16Fx8:
5209 return unary16Fx8(mce, vatom);
5211 // These are self-shadowing.
5212 case Iop_32UtoV128:
5213 case Iop_64UtoV128:
5214 case Iop_Dup8x16:
5215 case Iop_Dup16x8:
5216 case Iop_Dup32x4:
5217 case Iop_Reverse1sIn8_x16:
5218 case Iop_Reverse8sIn16_x8:
5219 case Iop_Reverse8sIn32_x4:
5220 case Iop_Reverse16sIn32_x4:
5221 case Iop_Reverse8sIn64_x2:
5222 case Iop_Reverse16sIn64_x2:
5223 case Iop_Reverse32sIn64_x2:
5224 case Iop_V256toV128_1: case Iop_V256toV128_0:
5225 case Iop_ZeroHI64ofV128:
5226 case Iop_ZeroHI96ofV128:
5227 case Iop_ZeroHI112ofV128:
5228 case Iop_ZeroHI120ofV128:
5229 case Iop_ReinterpI128asV128: /* I128 -> V128 */
5230 return assignNew('V', mce, Ity_V128, unop(op, vatom));
5232 case Iop_F128HItoF64: /* F128 -> high half of F128 */
5233 case Iop_D128HItoD64: /* D128 -> high half of D128 */
5234 return assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, vatom));
5236 case Iop_F128LOtoF64: /* F128 -> low half of F128 */
5237 case Iop_D128LOtoD64: /* D128 -> low half of D128 */
5238 return assignNew('V', mce, Ity_I64, unop(Iop_128to64, vatom));
5240 case Iop_NegF128:
5241 case Iop_AbsF128:
5242 case Iop_RndF128:
5243 case Iop_TruncF128toI128S: /* F128 -> I128S */
5244 case Iop_TruncF128toI128U: /* F128 -> I128U */
5245 case Iop_ReinterpV128asI128: /* V128 -> I128 */
5246 case Iop_ReinterpI128asF128:
5247 case Iop_ReinterpF128asI128:
5248 return mkPCastTo(mce, Ity_I128, vatom);
5250 case Iop_BCD128toI128S:
5251 case Iop_MulI128by10:
5252 case Iop_MulI128by10Carry:
5253 case Iop_F16toF64x2:
5254 case Iop_F64toF16x2_DEP:
5255 // FIXME JRS 2018-Nov-15. This is surely not correct!
5256 return vatom;
5258 case Iop_ReinterpI32asF32:
5259 case Iop_ReinterpF32asI32:
5260 return assignNew('V', mce, Ity_I32, vatom);
5262 case Iop_ReinterpF64asI64:
5263 case Iop_ReinterpI64asF64:
5264 case Iop_ReinterpI64asD64:
5265 case Iop_ReinterpD64asI64:
5266 return assignNew('V', mce, Ity_I64, vatom);
5268 case Iop_I32StoF128: /* signed I32 -> F128 */
5269 case Iop_I64StoF128: /* signed I64 -> F128 */
5270 case Iop_I32UtoF128: /* unsigned I32 -> F128 */
5271 case Iop_I64UtoF128: /* unsigned I64 -> F128 */
5272 case Iop_F32toF128: /* F32 -> F128 */
5273 case Iop_F64toF128: /* F64 -> F128 */
5274 case Iop_I32StoD128: /* signed I64 -> D128 */
5275 case Iop_I64StoD128: /* signed I64 -> D128 */
5276 case Iop_I32UtoD128: /* unsigned I32 -> D128 */
5277 case Iop_I64UtoD128: /* unsigned I64 -> D128 */
5278 return mkPCastTo(mce, Ity_I128, vatom);
5280 case Iop_F16toF64:
5281 case Iop_F32toF64:
5282 case Iop_I32StoF64:
5283 case Iop_I32UtoF64:
5284 case Iop_NegF64:
5285 case Iop_AbsF64:
5286 case Iop_RSqrtEst5GoodF64:
5287 case Iop_RoundF64toF64_NEAREST:
5288 case Iop_RoundF64toF64_NegINF:
5289 case Iop_RoundF64toF64_PosINF:
5290 case Iop_RoundF64toF64_ZERO:
5291 case Iop_RoundF64toIntA0:
5292 case Iop_RoundF64toIntE:
5293 case Iop_D32toD64:
5294 case Iop_I32StoD64:
5295 case Iop_I32UtoD64:
5296 case Iop_ExtractExpD64: /* D64 -> I64 */
5297 case Iop_ExtractExpD128: /* D128 -> I64 */
5298 case Iop_ExtractSigD64: /* D64 -> I64 */
5299 case Iop_ExtractSigD128: /* D128 -> I64 */
5300 case Iop_DPBtoBCD:
5301 case Iop_BCDtoDPB:
5302 return mkPCastTo(mce, Ity_I64, vatom);
5304 case Iop_D64toD128:
5305 return mkPCastTo(mce, Ity_I128, vatom);
5307 case Iop_TruncF64asF32:
5308 case Iop_NegF32:
5309 case Iop_AbsF32:
5310 case Iop_F16toF32:
5311 case Iop_RoundF32toIntA0:
5312 case Iop_RoundF32toIntE:
5313 return mkPCastTo(mce, Ity_I32, vatom);
5315 case Iop_AbsF16:
5316 case Iop_NegF16:
5317 return mkPCastTo(mce, Ity_I16, vatom);
5319 case Iop_Ctz32: case Iop_CtzNat32:
5320 case Iop_Ctz64: case Iop_CtzNat64:
5321 return expensiveCountTrailingZeroes(mce, op, atom, vatom);
5323 case Iop_Clz32: case Iop_ClzNat32:
5324 case Iop_Clz64: case Iop_ClzNat64:
5325 return expensiveCountLeadingZeroes(mce, op, atom, vatom);
5327 // PopCount32: this is slightly pessimistic. It is true that the
5328 // result depends on all input bits, so that aspect of the PCast is
5329 // correct. However, regardless of the input, only the lowest 5 bits
5330 // out of the output can ever be undefined. So we could actually
5331 // "improve" the results here by marking the top 27 bits of output as
5332 // defined. A similar comment applies for PopCount64.
5333 case Iop_PopCount32:
5334 return mkPCastTo(mce, Ity_I32, vatom);
5335 case Iop_PopCount64:
5336 return mkPCastTo(mce, Ity_I64, vatom);
5338 // These are self-shadowing.
5339 case Iop_1Uto64:
5340 case Iop_1Sto64:
5341 case Iop_8Uto64:
5342 case Iop_8Sto64:
5343 case Iop_16Uto64:
5344 case Iop_16Sto64:
5345 case Iop_32Sto64:
5346 case Iop_32Uto64:
5347 case Iop_V128to64:
5348 case Iop_V128HIto64:
5349 case Iop_128HIto64:
5350 case Iop_128to64:
5351 case Iop_Dup8x8:
5352 case Iop_Dup16x4:
5353 case Iop_Dup32x2:
5354 case Iop_Reverse8sIn16_x4:
5355 case Iop_Reverse8sIn32_x2:
5356 case Iop_Reverse16sIn32_x2:
5357 case Iop_Reverse8sIn64_x1:
5358 case Iop_Reverse16sIn64_x1:
5359 case Iop_Reverse32sIn64_x1:
5360 case Iop_V256to64_0: case Iop_V256to64_1:
5361 case Iop_V256to64_2: case Iop_V256to64_3:
5362 return assignNew('V', mce, Ity_I64, unop(op, vatom));
5364 // These are self-shadowing.
5365 case Iop_64to32:
5366 case Iop_64HIto32:
5367 case Iop_1Uto32:
5368 case Iop_1Sto32:
5369 case Iop_8Uto32:
5370 case Iop_16Uto32:
5371 case Iop_16Sto32:
5372 case Iop_8Sto32:
5373 case Iop_V128to32:
5374 case Iop_Reverse8sIn32_x1:
5375 return assignNew('V', mce, Ity_I32, unop(op, vatom));
5377 // These are self-shadowing.
5378 case Iop_1Sto16:
5379 case Iop_8Sto16:
5380 case Iop_8Uto16:
5381 case Iop_32to16:
5382 case Iop_32HIto16:
5383 case Iop_64to16:
5384 case Iop_GetMSBs8x16:
5385 return assignNew('V', mce, Ity_I16, unop(op, vatom));
5387 // These are self-shadowing.
5388 case Iop_1Uto8:
5389 case Iop_1Sto8:
5390 case Iop_16to8:
5391 case Iop_16HIto8:
5392 case Iop_32to8:
5393 case Iop_64to8:
5394 case Iop_GetMSBs8x8:
5395 return assignNew('V', mce, Ity_I8, unop(op, vatom));
5397 case Iop_32to1:
5398 return assignNew('V', mce, Ity_I1, unop(Iop_32to1, vatom));
5400 case Iop_64to1:
5401 return assignNew('V', mce, Ity_I1, unop(Iop_64to1, vatom));
5403 case Iop_NotV256:
5404 case Iop_NotV128:
5405 case Iop_Not64:
5406 case Iop_Not32:
5407 case Iop_Not16:
5408 case Iop_Not8:
5409 case Iop_Not1:
5410 // FIXME JRS 2018-Nov-15. This is surely not correct!
5411 return vatom;
5413 case Iop_CmpNEZ8x8:
5414 case Iop_Cnt8x8:
5415 case Iop_Clz8x8:
5416 case Iop_Cls8x8:
5417 case Iop_Abs8x8:
5418 return mkPCast8x8(mce, vatom);
5420 case Iop_CmpNEZ8x16:
5421 case Iop_Cnt8x16:
5422 case Iop_Clz8x16:
5423 case Iop_Cls8x16:
5424 case Iop_Abs8x16:
5425 case Iop_Ctz8x16:
5426 return mkPCast8x16(mce, vatom);
5428 case Iop_CmpNEZ16x4:
5429 case Iop_Clz16x4:
5430 case Iop_Cls16x4:
5431 case Iop_Abs16x4:
5432 return mkPCast16x4(mce, vatom);
5434 case Iop_CmpNEZ16x8:
5435 case Iop_Clz16x8:
5436 case Iop_Cls16x8:
5437 case Iop_Abs16x8:
5438 case Iop_Ctz16x8:
5439 return mkPCast16x8(mce, vatom);
5441 case Iop_CmpNEZ32x2:
5442 case Iop_Clz32x2:
5443 case Iop_Cls32x2:
5444 case Iop_F32toI32Ux2_RZ:
5445 case Iop_F32toI32Sx2_RZ:
5446 case Iop_Abs32x2:
5447 return mkPCast32x2(mce, vatom);
5449 case Iop_CmpNEZ32x4:
5450 case Iop_Clz32x4:
5451 case Iop_Cls32x4:
5452 case Iop_F32toI32Ux4_RZ:
5453 case Iop_F32toI32Sx4_RZ:
5454 case Iop_Abs32x4:
5455 case Iop_RSqrtEst32Ux4:
5456 case Iop_Ctz32x4:
5457 return mkPCast32x4(mce, vatom);
5459 case Iop_TruncF128toI32S: /* F128 -> I32S (result stored in 64-bits) */
5460 case Iop_TruncF128toI32U: /* F128 -> I32U (result stored in 64-bits) */
5461 case Iop_CmpwNEZ32:
5462 return mkPCastTo(mce, Ity_I32, vatom);
5464 case Iop_TruncF128toI64S: /* F128 -> I64S */
5465 case Iop_TruncF128toI64U: /* F128 -> I64U */
5466 case Iop_CmpwNEZ64:
5467 return mkPCastTo(mce, Ity_I64, vatom);
5469 case Iop_CmpNEZ64x2:
5470 case Iop_CipherSV128:
5471 case Iop_Clz64x2:
5472 case Iop_Abs64x2:
5473 case Iop_Ctz64x2:
5474 return mkPCast64x2(mce, vatom);
5476 // This is self-shadowing.
5477 case Iop_PwBitMtxXpose64x2:
5478 return assignNew('V', mce, Ity_V128, unop(op, vatom));
5480 case Iop_NarrowUn16to8x8:
5481 case Iop_NarrowUn32to16x4:
5482 case Iop_NarrowUn64to32x2:
5483 case Iop_QNarrowUn16Sto8Sx8:
5484 case Iop_QNarrowUn16Sto8Ux8:
5485 case Iop_QNarrowUn16Uto8Ux8:
5486 case Iop_QNarrowUn32Sto16Sx4:
5487 case Iop_QNarrowUn32Sto16Ux4:
5488 case Iop_QNarrowUn32Uto16Ux4:
5489 case Iop_QNarrowUn64Sto32Sx2:
5490 case Iop_QNarrowUn64Sto32Ux2:
5491 case Iop_QNarrowUn64Uto32Ux2:
5492 return vectorNarrowUnV128(mce, op, vatom);
5494 // JRS FIXME 2019 Mar 17: per comments on F16toF32x4, this is probably not
5495 // right.
5496 case Iop_F32toF16x4_DEP:
5497 return vectorNarrowUnV128(mce, op, vatom);
5499 case Iop_Widen8Sto16x8:
5500 case Iop_Widen8Uto16x8:
5501 case Iop_Widen16Sto32x4:
5502 case Iop_Widen16Uto32x4:
5503 case Iop_Widen32Sto64x2:
5504 case Iop_Widen32Uto64x2:
5505 return vectorWidenI64(mce, op, vatom);
5507 case Iop_F16toF32x4:
5508 // JRS 2019 Mar 17: this definitely isn't right, but it probably works
5509 // OK by accident if -- as seems likely -- the F16 to F32 conversion
5510 // preserves will generate an output 32 bits with at least one 1 bit
5511 // set if there's one or more 1 bits set in the input 16 bits. More
5512 // correct code for this is just below, but commented out, so as to
5513 // avoid short-term backend failures on targets that can't do
5514 // Iop_Interleave{LO,HI}16x4.
5515 return vectorWidenI64(mce, op, vatom);
5517 case Iop_F16toF32x8: {
5518 // PCast the input at 16x8. This makes each lane hold either all
5519 // zeroes or all ones.
5520 IRAtom* pcasted = mkPCast16x8(mce, vatom); // :: I16x8
5521 // Now double the width of each lane to 32 bits. Because the lanes are
5522 // all zeroes or all ones, we can just copy the each lane twice into
5523 // the result. Here's the low half:
5524 IRAtom* widenedLO // :: I32x4
5525 = assignNew('V', mce, Ity_V128, binop(Iop_InterleaveLO16x8,
5526 pcasted, pcasted));
5527 // And the high half:
5528 IRAtom* widenedHI // :: I32x4
5529 = assignNew('V', mce, Ity_V128, binop(Iop_InterleaveHI16x8,
5530 pcasted, pcasted));
5531 // Glue them back together:
5532 return assignNew('V', mce, Ity_V256, binop(Iop_V128HLtoV256,
5533 widenedHI, widenedLO));
5536 // See comment just above, for Iop_F16toF32x4
5537 //case Iop_F16toF32x4: {
5538 // // Same scheme as F16toF32x4
5539 // IRAtom* pcasted = mkPCast16x4(mce, vatom); // :: I16x4
5540 // IRAtom* widenedLO // :: I32x2
5541 // = assignNew('V', mce, Ity_I64, binop(Iop_InterleaveLO16x4,
5542 // pcasted, pcasted));
5543 // IRAtom* widenedHI // :: I32x4
5544 // = assignNew('V', mce, Ity_I64, binop(Iop_InterleaveHI16x4,
5545 // pcasted, pcasted));
5546 // // Glue them back together:
5547 // return assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128,
5548 // widenedHI, widenedLO));
5551 case Iop_PwAddL32Ux2:
5552 case Iop_PwAddL32Sx2:
5553 return mkPCastTo(mce, Ity_I64,
5554 assignNew('V', mce, Ity_I64, unop(op, mkPCast32x2(mce, vatom))));
5556 case Iop_PwAddL16Ux4:
5557 case Iop_PwAddL16Sx4:
5558 return mkPCast32x2(mce,
5559 assignNew('V', mce, Ity_I64, unop(op, mkPCast16x4(mce, vatom))));
5561 case Iop_PwAddL8Ux8:
5562 case Iop_PwAddL8Sx8:
5563 return mkPCast16x4(mce,
5564 assignNew('V', mce, Ity_I64, unop(op, mkPCast8x8(mce, vatom))));
5566 case Iop_PwAddL32Ux4:
5567 case Iop_PwAddL32Sx4:
5568 return mkPCast64x2(mce,
5569 assignNew('V', mce, Ity_V128, unop(op, mkPCast32x4(mce, vatom))));
5571 case Iop_PwAddL64Ux2:
5572 return mkPCast128x1(mce,
5573 assignNew('V', mce, Ity_V128, unop(op, mkPCast64x2(mce, vatom))));
5575 case Iop_PwAddL16Ux8:
5576 case Iop_PwAddL16Sx8:
5577 return mkPCast32x4(mce,
5578 assignNew('V', mce, Ity_V128, unop(op, mkPCast16x8(mce, vatom))));
5580 case Iop_PwAddL8Ux16:
5581 case Iop_PwAddL8Sx16:
5582 return mkPCast16x8(mce,
5583 assignNew('V', mce, Ity_V128, unop(op, mkPCast8x16(mce, vatom))));
5585 case Iop_I64UtoF32:
5586 default:
5587 ppIROp(op);
5588 VG_(tool_panic)("memcheck:expr2vbits_Unop");
5593 /* Worker function -- do not call directly. See comments on
5594 expr2vbits_Load for the meaning of |guard|.
5596 Generates IR to (1) perform a definedness test of |addr|, (2)
5597 perform a validity test of |addr|, and (3) return the Vbits for the
5598 location indicated by |addr|. All of this only happens when
5599 |guard| is NULL or |guard| evaluates to True at run time.
5601 If |guard| evaluates to False at run time, the returned value is
5602 the IR-mandated 0x55..55 value, and no checks nor shadow loads are
5603 performed.
5605 The definedness of |guard| itself is not checked. That is assumed
5606 to have been done before this point, by the caller. */
5607 static
5608 IRAtom* expr2vbits_Load_WRK ( MCEnv* mce,
5609 IREndness end, IRType ty,
5610 IRAtom* addr, UInt bias, IRAtom* guard )
5612 tl_assert(isOriginalAtom(mce,addr));
5613 tl_assert(end == Iend_LE || end == Iend_BE);
5615 /* First, emit a definedness test for the address. This also sets
5616 the address (shadow) to 'defined' following the test. */
5617 complainIfUndefined( mce, addr, guard );
5619 /* Now cook up a call to the relevant helper function, to read the data V
5620 bits from shadow memory. Note that I128 loads are done by pretending
5621 we're doing a V128 load, and then converting the resulting V128 vbits
5622 word to an I128, right at the end of this function -- see `castedToI128`
5623 below. (It's only a minor hack :-) This pertains to bug 444399. */
5624 ty = shadowTypeV(ty);
5626 void* helper = NULL;
5627 const HChar* hname = NULL;
5628 Bool ret_via_outparam = False;
5630 if (end == Iend_LE) {
5631 switch (ty) {
5632 case Ity_V256: helper = &MC_(helperc_LOADV256le);
5633 hname = "MC_(helperc_LOADV256le)";
5634 ret_via_outparam = True;
5635 break;
5636 case Ity_I128: // fallthrough. See comment above.
5637 case Ity_V128: helper = &MC_(helperc_LOADV128le);
5638 hname = "MC_(helperc_LOADV128le)";
5639 ret_via_outparam = True;
5640 break;
5641 case Ity_I64: helper = &MC_(helperc_LOADV64le);
5642 hname = "MC_(helperc_LOADV64le)";
5643 break;
5644 case Ity_I32: helper = &MC_(helperc_LOADV32le);
5645 hname = "MC_(helperc_LOADV32le)";
5646 break;
5647 case Ity_I16: helper = &MC_(helperc_LOADV16le);
5648 hname = "MC_(helperc_LOADV16le)";
5649 break;
5650 case Ity_I8: helper = &MC_(helperc_LOADV8);
5651 hname = "MC_(helperc_LOADV8)";
5652 break;
5653 default: ppIRType(ty);
5654 VG_(tool_panic)("memcheck:expr2vbits_Load_WRK(LE)");
5656 } else {
5657 switch (ty) {
5658 case Ity_V256: helper = &MC_(helperc_LOADV256be);
5659 hname = "MC_(helperc_LOADV256be)";
5660 ret_via_outparam = True;
5661 break;
5662 case Ity_V128: helper = &MC_(helperc_LOADV128be);
5663 hname = "MC_(helperc_LOADV128be)";
5664 ret_via_outparam = True;
5665 break;
5666 case Ity_I64: helper = &MC_(helperc_LOADV64be);
5667 hname = "MC_(helperc_LOADV64be)";
5668 break;
5669 case Ity_I32: helper = &MC_(helperc_LOADV32be);
5670 hname = "MC_(helperc_LOADV32be)";
5671 break;
5672 case Ity_I16: helper = &MC_(helperc_LOADV16be);
5673 hname = "MC_(helperc_LOADV16be)";
5674 break;
5675 case Ity_I8: helper = &MC_(helperc_LOADV8);
5676 hname = "MC_(helperc_LOADV8)";
5677 break;
5678 default: ppIRType(ty);
5679 VG_(tool_panic)("memcheck:expr2vbits_Load_WRK(BE)");
5683 tl_assert(helper);
5684 tl_assert(hname);
5686 /* Generate the actual address into addrAct. */
5687 IRAtom* addrAct;
5688 if (bias == 0) {
5689 addrAct = addr;
5690 } else {
5691 IROp mkAdd;
5692 IRAtom* eBias;
5693 IRType tyAddr = mce->hWordTy;
5694 tl_assert( tyAddr == Ity_I32 || tyAddr == Ity_I64 );
5695 mkAdd = tyAddr==Ity_I32 ? Iop_Add32 : Iop_Add64;
5696 eBias = tyAddr==Ity_I32 ? mkU32(bias) : mkU64(bias);
5697 addrAct = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBias) );
5700 /* We need to have a place to park the V bits we're just about to
5701 read. */
5702 IRTemp datavbits = newTemp(mce, ty == Ity_I128 ? Ity_V128 : ty, VSh);
5704 /* Here's the call. */
5705 IRDirty* di;
5706 if (ret_via_outparam) {
5707 di = unsafeIRDirty_1_N( datavbits,
5708 2/*regparms*/,
5709 hname, VG_(fnptr_to_fnentry)( helper ),
5710 mkIRExprVec_2( IRExpr_VECRET(), addrAct ) );
5711 } else {
5712 di = unsafeIRDirty_1_N( datavbits,
5713 1/*regparms*/,
5714 hname, VG_(fnptr_to_fnentry)( helper ),
5715 mkIRExprVec_1( addrAct ) );
5718 setHelperAnns( mce, di );
5719 if (guard) {
5720 di->guard = guard;
5721 /* Ideally the didn't-happen return value here would be all-ones
5722 (all-undefined), so it'd be obvious if it got used
5723 inadvertently. We can get by with the IR-mandated default
5724 value (0b01 repeating, 0x55 etc) as that'll still look pretty
5725 undefined if it ever leaks out. */
5727 stmt( 'V', mce, IRStmt_Dirty(di) );
5729 if (ty == Ity_I128) {
5730 IRAtom* castedToI128
5731 = assignNew('V', mce, Ity_I128,
5732 unop(Iop_ReinterpV128asI128, mkexpr(datavbits)));
5733 return castedToI128;
5734 } else {
5735 return mkexpr(datavbits);
5740 /* Generate IR to do a shadow load. The helper is expected to check
5741 the validity of the address and return the V bits for that address.
5742 This can optionally be controlled by a guard, which is assumed to
5743 be True if NULL. In the case where the guard is False at runtime,
5744 the helper will return the didn't-do-the-call value of 0x55..55.
5745 Since that means "completely undefined result", the caller of
5746 this function will need to fix up the result somehow in that
5747 case.
5749 Caller of this function is also expected to have checked the
5750 definedness of |guard| before this point.
5752 static
5753 IRAtom* expr2vbits_Load ( MCEnv* mce,
5754 IREndness end, IRType ty,
5755 IRAtom* addr, UInt bias,
5756 IRAtom* guard )
5758 tl_assert(end == Iend_LE || end == Iend_BE);
5759 switch (shadowTypeV(ty)) {
5760 case Ity_I8:
5761 case Ity_I16:
5762 case Ity_I32:
5763 case Ity_I64:
5764 case Ity_I128:
5765 case Ity_V128:
5766 case Ity_V256:
5767 return expr2vbits_Load_WRK(mce, end, ty, addr, bias, guard);
5768 default:
5769 VG_(tool_panic)("expr2vbits_Load");
5774 /* The most general handler for guarded loads. Assumes the
5775 definedness of GUARD has already been checked by the caller. A
5776 GUARD of NULL is assumed to mean "always True". Generates code to
5777 check the definedness and validity of ADDR.
5779 Generate IR to do a shadow load from ADDR and return the V bits.
5780 The loaded type is TY. The loaded data is then (shadow) widened by
5781 using VWIDEN, which can be Iop_INVALID to denote a no-op. If GUARD
5782 evaluates to False at run time then the returned Vbits are simply
5783 VALT instead. Note therefore that the argument type of VWIDEN must
5784 be TY and the result type of VWIDEN must equal the type of VALT.
5786 static
5787 IRAtom* expr2vbits_Load_guarded_General ( MCEnv* mce,
5788 IREndness end, IRType ty,
5789 IRAtom* addr, UInt bias,
5790 IRAtom* guard,
5791 IROp vwiden, IRAtom* valt )
5793 /* Sanity check the conversion operation, and also set TYWIDE. */
5794 IRType tyWide = Ity_INVALID;
5795 switch (vwiden) {
5796 case Iop_INVALID:
5797 tyWide = ty;
5798 break;
5799 case Iop_16Uto32: case Iop_16Sto32: case Iop_8Uto32: case Iop_8Sto32:
5800 tyWide = Ity_I32;
5801 break;
5802 default:
5803 VG_(tool_panic)("memcheck:expr2vbits_Load_guarded_General");
5806 /* If the guard evaluates to True, this will hold the loaded V bits
5807 at TY. If the guard evaluates to False, this will be all
5808 ones, meaning "all undefined", in which case we will have to
5809 replace it using an ITE below. */
5810 IRAtom* iftrue1
5811 = assignNew('V', mce, ty,
5812 expr2vbits_Load(mce, end, ty, addr, bias, guard));
5813 /* Now (shadow-) widen the loaded V bits to the desired width. In
5814 the guard-is-False case, the allowable widening operators will
5815 in the worst case (unsigned widening) at least leave the
5816 pre-widened part as being marked all-undefined, and in the best
5817 case (signed widening) mark the whole widened result as
5818 undefined. Anyway, it doesn't matter really, since in this case
5819 we will replace said value with the default value |valt| using an
5820 ITE. */
5821 IRAtom* iftrue2
5822 = vwiden == Iop_INVALID
5823 ? iftrue1
5824 : assignNew('V', mce, tyWide, unop(vwiden, iftrue1));
5825 /* These are the V bits we will return if the load doesn't take
5826 place. */
5827 IRAtom* iffalse
5828 = valt;
5829 /* Prepare the cond for the ITE. Convert a NULL cond into
5830 something that iropt knows how to fold out later. */
5831 IRAtom* cond
5832 = guard == NULL ? mkU1(1) : guard;
5833 /* And assemble the final result. */
5834 return assignNew('V', mce, tyWide, IRExpr_ITE(cond, iftrue2, iffalse));
5838 /* A simpler handler for guarded loads, in which there is no
5839 conversion operation, and the default V bit return (when the guard
5840 evaluates to False at runtime) is "all defined". If there is no
5841 guard expression or the guard is always TRUE this function behaves
5842 like expr2vbits_Load. It is assumed that definedness of GUARD has
5843 already been checked at the call site. */
5844 static
5845 IRAtom* expr2vbits_Load_guarded_Simple ( MCEnv* mce,
5846 IREndness end, IRType ty,
5847 IRAtom* addr, UInt bias,
5848 IRAtom *guard )
5850 return expr2vbits_Load_guarded_General(
5851 mce, end, ty, addr, bias, guard, Iop_INVALID, definedOfType(ty)
5856 static
5857 IRAtom* expr2vbits_ITE ( MCEnv* mce,
5858 IRAtom* cond, IRAtom* iftrue, IRAtom* iffalse )
5860 IRAtom *vbitsC, *vbits0, *vbits1;
5861 IRType ty;
5862 /* Given ITE(cond, iftrue, iffalse), generate
5863 ITE(cond, iftrue#, iffalse#) `UifU` PCast(cond#)
5864 That is, steer the V bits like the originals, but trash the
5865 result if the steering value is undefined. This gives
5866 lazy propagation. */
5867 tl_assert(isOriginalAtom(mce, cond));
5868 tl_assert(isOriginalAtom(mce, iftrue));
5869 tl_assert(isOriginalAtom(mce, iffalse));
5871 vbitsC = expr2vbits(mce, cond, HuOth); // could we use HuPCa here?
5872 vbits1 = expr2vbits(mce, iftrue, HuOth);
5873 vbits0 = expr2vbits(mce, iffalse, HuOth);
5874 ty = typeOfIRExpr(mce->sb->tyenv, vbits0);
5876 return
5877 mkUifU(mce, ty, assignNew('V', mce, ty,
5878 IRExpr_ITE(cond, vbits1, vbits0)),
5879 mkPCastTo(mce, ty, vbitsC) );
5882 /* --------- This is the main expression-handling function. --------- */
5884 static
5885 IRExpr* expr2vbits ( MCEnv* mce, IRExpr* e,
5886 HowUsed hu/*use HuOth if unknown*/ )
5888 switch (e->tag) {
5890 case Iex_Get:
5891 return shadow_GET( mce, e->Iex.Get.offset, e->Iex.Get.ty );
5893 case Iex_GetI:
5894 return shadow_GETI( mce, e->Iex.GetI.descr,
5895 e->Iex.GetI.ix, e->Iex.GetI.bias );
5897 case Iex_RdTmp:
5898 return IRExpr_RdTmp( findShadowTmpV(mce, e->Iex.RdTmp.tmp) );
5900 case Iex_Const:
5901 return definedOfType(shadowTypeV(typeOfIRExpr(mce->sb->tyenv, e)));
5903 case Iex_Qop:
5904 return expr2vbits_Qop(
5905 mce,
5906 e->Iex.Qop.details->op,
5907 e->Iex.Qop.details->arg1, e->Iex.Qop.details->arg2,
5908 e->Iex.Qop.details->arg3, e->Iex.Qop.details->arg4
5911 case Iex_Triop:
5912 return expr2vbits_Triop(
5913 mce,
5914 e->Iex.Triop.details->op,
5915 e->Iex.Triop.details->arg1, e->Iex.Triop.details->arg2,
5916 e->Iex.Triop.details->arg3
5919 case Iex_Binop:
5920 return expr2vbits_Binop(
5921 mce,
5922 e->Iex.Binop.op,
5923 e->Iex.Binop.arg1, e->Iex.Binop.arg2,
5927 case Iex_Unop:
5928 return expr2vbits_Unop( mce, e->Iex.Unop.op, e->Iex.Unop.arg );
5930 case Iex_Load:
5931 return expr2vbits_Load( mce, e->Iex.Load.end,
5932 e->Iex.Load.ty,
5933 e->Iex.Load.addr, 0/*addr bias*/,
5934 NULL/* guard == "always True"*/ );
5936 case Iex_CCall:
5937 return mkLazyN( mce, e->Iex.CCall.args,
5938 e->Iex.CCall.retty,
5939 e->Iex.CCall.cee );
5941 case Iex_ITE:
5942 return expr2vbits_ITE( mce, e->Iex.ITE.cond, e->Iex.ITE.iftrue,
5943 e->Iex.ITE.iffalse);
5945 default:
5946 VG_(printf)("\n");
5947 ppIRExpr(e);
5948 VG_(printf)("\n");
5949 VG_(tool_panic)("memcheck: expr2vbits");
5954 /*------------------------------------------------------------*/
5955 /*--- Generate shadow stmts from all kinds of IRStmts. ---*/
5956 /*------------------------------------------------------------*/
5958 /* Widen a value to the host word size. */
5960 static
5961 IRExpr* zwidenToHostWord ( MCEnv* mce, IRAtom* vatom )
5963 IRType ty, tyH;
5965 /* vatom is vbits-value and as such can only have a shadow type. */
5966 tl_assert(isShadowAtom(mce,vatom));
5968 ty = typeOfIRExpr(mce->sb->tyenv, vatom);
5969 tyH = mce->hWordTy;
5971 if (tyH == Ity_I32) {
5972 switch (ty) {
5973 case Ity_I32:
5974 return vatom;
5975 case Ity_I16:
5976 return assignNew('V', mce, tyH, unop(Iop_16Uto32, vatom));
5977 case Ity_I8:
5978 return assignNew('V', mce, tyH, unop(Iop_8Uto32, vatom));
5979 default:
5980 goto unhandled;
5982 } else
5983 if (tyH == Ity_I64) {
5984 switch (ty) {
5985 case Ity_I32:
5986 return assignNew('V', mce, tyH, unop(Iop_32Uto64, vatom));
5987 case Ity_I16:
5988 return assignNew('V', mce, tyH, unop(Iop_32Uto64,
5989 assignNew('V', mce, Ity_I32, unop(Iop_16Uto32, vatom))));
5990 case Ity_I8:
5991 return assignNew('V', mce, tyH, unop(Iop_32Uto64,
5992 assignNew('V', mce, Ity_I32, unop(Iop_8Uto32, vatom))));
5993 default:
5994 goto unhandled;
5996 } else {
5997 goto unhandled;
5999 unhandled:
6000 VG_(printf)("\nty = "); ppIRType(ty); VG_(printf)("\n");
6001 VG_(tool_panic)("zwidenToHostWord");
6005 /* Generate a shadow store. |addr| is always the original address
6006 atom. You can pass in either originals or V-bits for the data
6007 atom, but obviously not both. This function generates a check for
6008 the definedness and (indirectly) the validity of |addr|, but only
6009 when |guard| evaluates to True at run time (or is NULL).
6011 |guard| :: Ity_I1 controls whether the store really happens; NULL
6012 means it unconditionally does. Note that |guard| itself is not
6013 checked for definedness; the caller of this function must do that
6014 if necessary.
6016 static
6017 void do_shadow_Store ( MCEnv* mce,
6018 IREndness end,
6019 IRAtom* addr, UInt bias,
6020 IRAtom* data, IRAtom* vdata,
6021 IRAtom* guard )
6023 IROp mkAdd;
6024 IRType ty, tyAddr;
6025 void* helper = NULL;
6026 const HChar* hname = NULL;
6027 IRConst* c;
6029 tyAddr = mce->hWordTy;
6030 mkAdd = tyAddr==Ity_I32 ? Iop_Add32 : Iop_Add64;
6031 tl_assert( tyAddr == Ity_I32 || tyAddr == Ity_I64 );
6032 tl_assert( end == Iend_LE || end == Iend_BE );
6034 if (data) {
6035 tl_assert(!vdata);
6036 tl_assert(isOriginalAtom(mce, data));
6037 tl_assert(bias == 0);
6038 vdata = expr2vbits( mce, data, HuOth );
6039 } else {
6040 tl_assert(vdata);
6043 tl_assert(isOriginalAtom(mce,addr));
6044 tl_assert(isShadowAtom(mce,vdata));
6046 if (guard) {
6047 tl_assert(isOriginalAtom(mce, guard));
6048 tl_assert(typeOfIRExpr(mce->sb->tyenv, guard) == Ity_I1);
6051 ty = typeOfIRExpr(mce->sb->tyenv, vdata);
6053 // If we're not doing undefined value checking, pretend that this value
6054 // is "all valid". That lets Vex's optimiser remove some of the V bit
6055 // shadow computation ops that precede it.
6056 if (MC_(clo_mc_level) == 1) {
6057 switch (ty) {
6058 case Ity_V256: // V256 weirdness -- used four times
6059 c = IRConst_V256(V_BITS32_DEFINED); break;
6060 case Ity_V128: // V128 weirdness -- used twice
6061 c = IRConst_V128(V_BITS16_DEFINED); break;
6062 case Ity_I128: c = IRConst_U128(V_BITS16_DEFINED); break;
6063 case Ity_I64: c = IRConst_U64 (V_BITS64_DEFINED); break;
6064 case Ity_I32: c = IRConst_U32 (V_BITS32_DEFINED); break;
6065 case Ity_I16: c = IRConst_U16 (V_BITS16_DEFINED); break;
6066 case Ity_I8: c = IRConst_U8 (V_BITS8_DEFINED); break;
6067 default: VG_(tool_panic)("memcheck:do_shadow_Store(LE)");
6069 vdata = IRExpr_Const( c );
6072 /* First, emit a definedness test for the address. This also sets
6073 the address (shadow) to 'defined' following the test. Both of
6074 those actions are gated on |guard|. */
6075 complainIfUndefined( mce, addr, guard );
6077 /* Now decide which helper function to call to write the data V
6078 bits into shadow memory. */
6079 if (end == Iend_LE) {
6080 switch (ty) {
6081 case Ity_V256: /* we'll use the helper four times */
6082 case Ity_V128: /* we'll use the helper twice */
6083 case Ity_I128: /* we'll use the helper twice */
6084 case Ity_I64: helper = &MC_(helperc_STOREV64le);
6085 hname = "MC_(helperc_STOREV64le)";
6086 break;
6087 case Ity_I32: helper = &MC_(helperc_STOREV32le);
6088 hname = "MC_(helperc_STOREV32le)";
6089 break;
6090 case Ity_I16: helper = &MC_(helperc_STOREV16le);
6091 hname = "MC_(helperc_STOREV16le)";
6092 break;
6093 case Ity_I8: helper = &MC_(helperc_STOREV8);
6094 hname = "MC_(helperc_STOREV8)";
6095 break;
6096 default: VG_(tool_panic)("memcheck:do_shadow_Store(LE)");
6098 } else {
6099 switch (ty) {
6100 case Ity_V128: /* we'll use the helper twice */
6101 case Ity_I64: helper = &MC_(helperc_STOREV64be);
6102 hname = "MC_(helperc_STOREV64be)";
6103 break;
6104 case Ity_I32: helper = &MC_(helperc_STOREV32be);
6105 hname = "MC_(helperc_STOREV32be)";
6106 break;
6107 case Ity_I16: helper = &MC_(helperc_STOREV16be);
6108 hname = "MC_(helperc_STOREV16be)";
6109 break;
6110 case Ity_I8: helper = &MC_(helperc_STOREV8);
6111 hname = "MC_(helperc_STOREV8)";
6112 break;
6113 /* Note, no V256 case here, because no big-endian target that
6114 we support, has 256 vectors. */
6115 default: VG_(tool_panic)("memcheck:do_shadow_Store(BE)");
6119 if (UNLIKELY(ty == Ity_V256)) {
6121 /* V256-bit case -- phrased in terms of 64 bit units (Qs), with
6122 Q3 being the most significant lane. */
6123 /* These are the offsets of the Qs in memory. */
6124 Int offQ0, offQ1, offQ2, offQ3;
6126 /* Various bits for constructing the 4 lane helper calls */
6127 IRDirty *diQ0, *diQ1, *diQ2, *diQ3;
6128 IRAtom *addrQ0, *addrQ1, *addrQ2, *addrQ3;
6129 IRAtom *vdataQ0, *vdataQ1, *vdataQ2, *vdataQ3;
6130 IRAtom *eBiasQ0, *eBiasQ1, *eBiasQ2, *eBiasQ3;
6132 if (end == Iend_LE) {
6133 offQ0 = 0; offQ1 = 8; offQ2 = 16; offQ3 = 24;
6134 } else {
6135 offQ3 = 0; offQ2 = 8; offQ1 = 16; offQ0 = 24;
6138 eBiasQ0 = tyAddr==Ity_I32 ? mkU32(bias+offQ0) : mkU64(bias+offQ0);
6139 addrQ0 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ0) );
6140 vdataQ0 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_0, vdata));
6141 diQ0 = unsafeIRDirty_0_N(
6142 1/*regparms*/,
6143 hname, VG_(fnptr_to_fnentry)( helper ),
6144 mkIRExprVec_2( addrQ0, vdataQ0 )
6147 eBiasQ1 = tyAddr==Ity_I32 ? mkU32(bias+offQ1) : mkU64(bias+offQ1);
6148 addrQ1 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ1) );
6149 vdataQ1 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_1, vdata));
6150 diQ1 = unsafeIRDirty_0_N(
6151 1/*regparms*/,
6152 hname, VG_(fnptr_to_fnentry)( helper ),
6153 mkIRExprVec_2( addrQ1, vdataQ1 )
6156 eBiasQ2 = tyAddr==Ity_I32 ? mkU32(bias+offQ2) : mkU64(bias+offQ2);
6157 addrQ2 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ2) );
6158 vdataQ2 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_2, vdata));
6159 diQ2 = unsafeIRDirty_0_N(
6160 1/*regparms*/,
6161 hname, VG_(fnptr_to_fnentry)( helper ),
6162 mkIRExprVec_2( addrQ2, vdataQ2 )
6165 eBiasQ3 = tyAddr==Ity_I32 ? mkU32(bias+offQ3) : mkU64(bias+offQ3);
6166 addrQ3 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ3) );
6167 vdataQ3 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_3, vdata));
6168 diQ3 = unsafeIRDirty_0_N(
6169 1/*regparms*/,
6170 hname, VG_(fnptr_to_fnentry)( helper ),
6171 mkIRExprVec_2( addrQ3, vdataQ3 )
6174 if (guard)
6175 diQ0->guard = diQ1->guard = diQ2->guard = diQ3->guard = guard;
6177 setHelperAnns( mce, diQ0 );
6178 setHelperAnns( mce, diQ1 );
6179 setHelperAnns( mce, diQ2 );
6180 setHelperAnns( mce, diQ3 );
6181 stmt( 'V', mce, IRStmt_Dirty(diQ0) );
6182 stmt( 'V', mce, IRStmt_Dirty(diQ1) );
6183 stmt( 'V', mce, IRStmt_Dirty(diQ2) );
6184 stmt( 'V', mce, IRStmt_Dirty(diQ3) );
6187 else if (UNLIKELY(ty == Ity_V128 || ty == Ity_I128)) {
6189 /* V128/I128-bit case */
6190 /* See comment in next clause re 64-bit regparms */
6191 /* also, need to be careful about endianness */
6193 Int offLo64, offHi64;
6194 IRDirty *diLo64, *diHi64;
6195 IRAtom *addrLo64, *addrHi64;
6196 IRAtom *vdataLo64, *vdataHi64;
6197 IRAtom *eBiasLo64, *eBiasHi64;
6198 IROp opGetLO64, opGetHI64;
6200 if (end == Iend_LE) {
6201 offLo64 = 0;
6202 offHi64 = 8;
6203 } else {
6204 offLo64 = 8;
6205 offHi64 = 0;
6208 if (ty == Ity_V128) {
6209 opGetLO64 = Iop_V128to64;
6210 opGetHI64 = Iop_V128HIto64;
6211 } else {
6212 opGetLO64 = Iop_128to64;
6213 opGetHI64 = Iop_128HIto64;
6216 eBiasLo64 = tyAddr==Ity_I32 ? mkU32(bias+offLo64) : mkU64(bias+offLo64);
6217 addrLo64 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasLo64) );
6218 vdataLo64 = assignNew('V', mce, Ity_I64, unop(opGetLO64, vdata));
6219 diLo64 = unsafeIRDirty_0_N(
6220 1/*regparms*/,
6221 hname, VG_(fnptr_to_fnentry)( helper ),
6222 mkIRExprVec_2( addrLo64, vdataLo64 )
6224 eBiasHi64 = tyAddr==Ity_I32 ? mkU32(bias+offHi64) : mkU64(bias+offHi64);
6225 addrHi64 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasHi64) );
6226 vdataHi64 = assignNew('V', mce, Ity_I64, unop(opGetHI64, vdata));
6227 diHi64 = unsafeIRDirty_0_N(
6228 1/*regparms*/,
6229 hname, VG_(fnptr_to_fnentry)( helper ),
6230 mkIRExprVec_2( addrHi64, vdataHi64 )
6232 if (guard) diLo64->guard = guard;
6233 if (guard) diHi64->guard = guard;
6234 setHelperAnns( mce, diLo64 );
6235 setHelperAnns( mce, diHi64 );
6236 stmt( 'V', mce, IRStmt_Dirty(diLo64) );
6237 stmt( 'V', mce, IRStmt_Dirty(diHi64) );
6239 } else {
6241 IRDirty *di;
6242 IRAtom *addrAct;
6244 /* 8/16/32/64-bit cases */
6245 /* Generate the actual address into addrAct. */
6246 if (bias == 0) {
6247 addrAct = addr;
6248 } else {
6249 IRAtom* eBias = tyAddr==Ity_I32 ? mkU32(bias) : mkU64(bias);
6250 addrAct = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBias));
6253 if (ty == Ity_I64) {
6254 /* We can't do this with regparm 2 on 32-bit platforms, since
6255 the back ends aren't clever enough to handle 64-bit
6256 regparm args. Therefore be different. */
6257 di = unsafeIRDirty_0_N(
6258 1/*regparms*/,
6259 hname, VG_(fnptr_to_fnentry)( helper ),
6260 mkIRExprVec_2( addrAct, vdata )
6262 } else {
6263 di = unsafeIRDirty_0_N(
6264 2/*regparms*/,
6265 hname, VG_(fnptr_to_fnentry)( helper ),
6266 mkIRExprVec_2( addrAct,
6267 zwidenToHostWord( mce, vdata ))
6270 if (guard) di->guard = guard;
6271 setHelperAnns( mce, di );
6272 stmt( 'V', mce, IRStmt_Dirty(di) );
6278 /* Do lazy pessimistic propagation through a dirty helper call, by
6279 looking at the annotations on it. This is the most complex part of
6280 Memcheck. */
6282 static IRType szToITy ( Int n )
6284 switch (n) {
6285 case 1: return Ity_I8;
6286 case 2: return Ity_I16;
6287 case 4: return Ity_I32;
6288 case 8: return Ity_I64;
6289 default: VG_(tool_panic)("szToITy(memcheck)");
6293 static
6294 void do_shadow_Dirty ( MCEnv* mce, IRDirty* d )
6296 Int i, k, n, toDo, gSz, gOff;
6297 IRAtom *src, *here, *curr;
6298 IRType tySrc, tyDst;
6299 IRTemp dst;
6300 IREndness end;
6302 /* What's the native endianness? We need to know this. */
6303 # if defined(VG_BIGENDIAN)
6304 end = Iend_BE;
6305 # elif defined(VG_LITTLEENDIAN)
6306 end = Iend_LE;
6307 # else
6308 # error "Unknown endianness"
6309 # endif
6311 /* First check the guard. */
6312 complainIfUndefined(mce, d->guard, NULL);
6314 /* Now round up all inputs and PCast over them. */
6315 curr = definedOfType(Ity_I32);
6317 /* Inputs: unmasked args
6318 Note: arguments are evaluated REGARDLESS of the guard expression */
6319 for (i = 0; d->args[i]; i++) {
6320 IRAtom* arg = d->args[i];
6321 if ( (d->cee->mcx_mask & (1<<i))
6322 || UNLIKELY(is_IRExpr_VECRET_or_GSPTR(arg)) ) {
6323 /* ignore this arg */
6324 } else {
6325 here = mkPCastTo( mce, Ity_I32, expr2vbits(mce, arg, HuOth) );
6326 curr = mkUifU32(mce, here, curr);
6330 /* Inputs: guest state that we read. */
6331 for (i = 0; i < d->nFxState; i++) {
6332 tl_assert(d->fxState[i].fx != Ifx_None);
6333 if (d->fxState[i].fx == Ifx_Write)
6334 continue;
6336 /* Enumerate the described state segments */
6337 for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
6338 gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
6339 gSz = d->fxState[i].size;
6341 /* Ignore any sections marked as 'always defined'. */
6342 if (isAlwaysDefd(mce, gOff, gSz)) {
6343 if (0)
6344 VG_(printf)("memcheck: Dirty gst: ignored off %d, sz %d\n",
6345 gOff, gSz);
6346 continue;
6349 /* This state element is read or modified. So we need to
6350 consider it. If larger than 8 bytes, deal with it in
6351 8-byte chunks. */
6352 while (True) {
6353 tl_assert(gSz >= 0);
6354 if (gSz == 0) break;
6355 n = gSz <= 8 ? gSz : 8;
6356 /* update 'curr' with UifU of the state slice
6357 gOff .. gOff+n-1 */
6358 tySrc = szToITy( n );
6360 /* Observe the guard expression. If it is false use an
6361 all-bits-defined bit pattern */
6362 IRAtom *cond, *iffalse, *iftrue;
6364 cond = assignNew('V', mce, Ity_I1, d->guard);
6365 iftrue = assignNew('V', mce, tySrc, shadow_GET(mce, gOff, tySrc));
6366 iffalse = assignNew('V', mce, tySrc, definedOfType(tySrc));
6367 src = assignNew('V', mce, tySrc,
6368 IRExpr_ITE(cond, iftrue, iffalse));
6370 here = mkPCastTo( mce, Ity_I32, src );
6371 curr = mkUifU32(mce, here, curr);
6372 gSz -= n;
6373 gOff += n;
6378 /* Inputs: memory. First set up some info needed regardless of
6379 whether we're doing reads or writes. */
6381 if (d->mFx != Ifx_None) {
6382 /* Because we may do multiple shadow loads/stores from the same
6383 base address, it's best to do a single test of its
6384 definedness right now. Post-instrumentation optimisation
6385 should remove all but this test. */
6386 IRType tyAddr;
6387 tl_assert(d->mAddr);
6388 complainIfUndefined(mce, d->mAddr, d->guard);
6390 tyAddr = typeOfIRExpr(mce->sb->tyenv, d->mAddr);
6391 tl_assert(tyAddr == Ity_I32 || tyAddr == Ity_I64);
6392 tl_assert(tyAddr == mce->hWordTy); /* not really right */
6395 /* Deal with memory inputs (reads or modifies) */
6396 if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify) {
6397 toDo = d->mSize;
6398 /* chew off 32-bit chunks. We don't care about the endianness
6399 since it's all going to be condensed down to a single bit,
6400 but nevertheless choose an endianness which is hopefully
6401 native to the platform. */
6402 while (toDo >= 4) {
6403 here = mkPCastTo(
6404 mce, Ity_I32,
6405 expr2vbits_Load_guarded_Simple(
6406 mce, end, Ity_I32, d->mAddr, d->mSize - toDo, d->guard )
6408 curr = mkUifU32(mce, here, curr);
6409 toDo -= 4;
6411 /* chew off 16-bit chunks */
6412 while (toDo >= 2) {
6413 here = mkPCastTo(
6414 mce, Ity_I32,
6415 expr2vbits_Load_guarded_Simple(
6416 mce, end, Ity_I16, d->mAddr, d->mSize - toDo, d->guard )
6418 curr = mkUifU32(mce, here, curr);
6419 toDo -= 2;
6421 /* chew off the remaining 8-bit chunk, if any */
6422 if (toDo == 1) {
6423 here = mkPCastTo(
6424 mce, Ity_I32,
6425 expr2vbits_Load_guarded_Simple(
6426 mce, end, Ity_I8, d->mAddr, d->mSize - toDo, d->guard )
6428 curr = mkUifU32(mce, here, curr);
6429 toDo -= 1;
6431 tl_assert(toDo == 0);
6434 /* Whew! So curr is a 32-bit V-value summarising pessimistically
6435 all the inputs to the helper. Now we need to re-distribute the
6436 results to all destinations. */
6438 /* Outputs: the destination temporary, if there is one. */
6439 if (d->tmp != IRTemp_INVALID) {
6440 dst = findShadowTmpV(mce, d->tmp);
6441 tyDst = typeOfIRTemp(mce->sb->tyenv, d->tmp);
6442 assign( 'V', mce, dst, mkPCastTo( mce, tyDst, curr) );
6445 /* Outputs: guest state that we write or modify. */
6446 for (i = 0; i < d->nFxState; i++) {
6447 tl_assert(d->fxState[i].fx != Ifx_None);
6448 if (d->fxState[i].fx == Ifx_Read)
6449 continue;
6451 /* Enumerate the described state segments */
6452 for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
6453 gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
6454 gSz = d->fxState[i].size;
6456 /* Ignore any sections marked as 'always defined'. */
6457 if (isAlwaysDefd(mce, gOff, gSz))
6458 continue;
6460 /* This state element is written or modified. So we need to
6461 consider it. If larger than 8 bytes, deal with it in
6462 8-byte chunks. */
6463 while (True) {
6464 tl_assert(gSz >= 0);
6465 if (gSz == 0) break;
6466 n = gSz <= 8 ? gSz : 8;
6467 /* Write suitably-casted 'curr' to the state slice
6468 gOff .. gOff+n-1 */
6469 tyDst = szToITy( n );
6470 do_shadow_PUT( mce, gOff,
6471 NULL, /* original atom */
6472 mkPCastTo( mce, tyDst, curr ), d->guard );
6473 gSz -= n;
6474 gOff += n;
6479 /* Outputs: memory that we write or modify. Same comments about
6480 endianness as above apply. */
6481 if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify) {
6482 toDo = d->mSize;
6483 /* chew off 32-bit chunks */
6484 while (toDo >= 4) {
6485 do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
6486 NULL, /* original data */
6487 mkPCastTo( mce, Ity_I32, curr ),
6488 d->guard );
6489 toDo -= 4;
6491 /* chew off 16-bit chunks */
6492 while (toDo >= 2) {
6493 do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
6494 NULL, /* original data */
6495 mkPCastTo( mce, Ity_I16, curr ),
6496 d->guard );
6497 toDo -= 2;
6499 /* chew off the remaining 8-bit chunk, if any */
6500 if (toDo == 1) {
6501 do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
6502 NULL, /* original data */
6503 mkPCastTo( mce, Ity_I8, curr ),
6504 d->guard );
6505 toDo -= 1;
6507 tl_assert(toDo == 0);
6513 /* We have an ABI hint telling us that [base .. base+len-1] is to
6514 become undefined ("writable"). Generate code to call a helper to
6515 notify the A/V bit machinery of this fact.
6517 We call
6518 void MC_(helperc_MAKE_STACK_UNINIT) ( Addr base, UWord len,
6519 Addr nia );
6521 static
6522 void do_AbiHint ( MCEnv* mce, IRExpr* base, Int len, IRExpr* nia )
6524 IRDirty* di;
6526 if (MC_(clo_mc_level) == 3) {
6527 di = unsafeIRDirty_0_N(
6528 3/*regparms*/,
6529 "MC_(helperc_MAKE_STACK_UNINIT_w_o)",
6530 VG_(fnptr_to_fnentry)( &MC_(helperc_MAKE_STACK_UNINIT_w_o) ),
6531 mkIRExprVec_3( base, mkIRExpr_HWord( (UInt)len), nia )
6533 } else {
6534 /* We ignore the supplied nia, since it is irrelevant. */
6535 tl_assert(MC_(clo_mc_level) == 2 || MC_(clo_mc_level) == 1);
6536 /* Special-case the len==128 case, since that is for amd64-ELF,
6537 which is a very common target. */
6538 if (len == 128) {
6539 di = unsafeIRDirty_0_N(
6540 1/*regparms*/,
6541 "MC_(helperc_MAKE_STACK_UNINIT_128_no_o)",
6542 VG_(fnptr_to_fnentry)( &MC_(helperc_MAKE_STACK_UNINIT_128_no_o)),
6543 mkIRExprVec_1( base )
6545 } else {
6546 di = unsafeIRDirty_0_N(
6547 2/*regparms*/,
6548 "MC_(helperc_MAKE_STACK_UNINIT_no_o)",
6549 VG_(fnptr_to_fnentry)( &MC_(helperc_MAKE_STACK_UNINIT_no_o) ),
6550 mkIRExprVec_2( base, mkIRExpr_HWord( (UInt)len) )
6555 stmt( 'V', mce, IRStmt_Dirty(di) );
6559 /* ------ Dealing with IRCAS (big and complex) ------ */
6561 /* FWDS */
6562 static IRAtom* gen_load_b ( MCEnv* mce, Int szB,
6563 IRAtom* baseaddr, Int offset );
6564 static IRAtom* gen_maxU32 ( MCEnv* mce, IRAtom* b1, IRAtom* b2 );
6565 static void gen_store_b ( MCEnv* mce, Int szB,
6566 IRAtom* baseaddr, Int offset, IRAtom* dataB,
6567 IRAtom* guard );
6569 static void do_shadow_CAS_single ( MCEnv* mce, IRCAS* cas );
6570 static void do_shadow_CAS_double ( MCEnv* mce, IRCAS* cas );
6573 /* Either ORIG and SHADOW are both IRExpr.RdTmps, or they are both
6574 IRExpr.Consts, else this asserts. If they are both Consts, it
6575 doesn't do anything. So that just leaves the RdTmp case.
6577 In which case: this assigns the shadow value SHADOW to the IR
6578 shadow temporary associated with ORIG. That is, ORIG, being an
6579 original temporary, will have a shadow temporary associated with
6580 it. However, in the case envisaged here, there will so far have
6581 been no IR emitted to actually write a shadow value into that
6582 temporary. What this routine does is to (emit IR to) copy the
6583 value in SHADOW into said temporary, so that after this call,
6584 IRExpr.RdTmps of ORIG's shadow temp will correctly pick up the
6585 value in SHADOW.
6587 Point is to allow callers to compute "by hand" a shadow value for
6588 ORIG, and force it to be associated with ORIG.
6590 How do we know that that shadow associated with ORIG has not so far
6591 been assigned to? Well, we don't per se know that, but supposing
6592 it had. Then this routine would create a second assignment to it,
6593 and later the IR sanity checker would barf. But that never
6594 happens. QED.
6596 static void bind_shadow_tmp_to_orig ( UChar how,
6597 MCEnv* mce,
6598 IRAtom* orig, IRAtom* shadow )
6600 tl_assert(isOriginalAtom(mce, orig));
6601 tl_assert(isShadowAtom(mce, shadow));
6602 switch (orig->tag) {
6603 case Iex_Const:
6604 tl_assert(shadow->tag == Iex_Const);
6605 break;
6606 case Iex_RdTmp:
6607 tl_assert(shadow->tag == Iex_RdTmp);
6608 if (how == 'V') {
6609 assign('V', mce, findShadowTmpV(mce,orig->Iex.RdTmp.tmp),
6610 shadow);
6611 } else {
6612 tl_assert(how == 'B');
6613 assign('B', mce, findShadowTmpB(mce,orig->Iex.RdTmp.tmp),
6614 shadow);
6616 break;
6617 default:
6618 tl_assert(0);
6623 static
6624 void do_shadow_CAS ( MCEnv* mce, IRCAS* cas )
6626 /* Scheme is (both single- and double- cases):
6628 1. fetch data#,dataB (the proposed new value)
6630 2. fetch expd#,expdB (what we expect to see at the address)
6632 3. check definedness of address
6634 4. load old#,oldB from shadow memory; this also checks
6635 addressibility of the address
6637 5. the CAS itself
6639 6. compute "expected == old". See COMMENT_ON_CasCmpEQ below.
6641 7. if "expected == old" (as computed by (6))
6642 store data#,dataB to shadow memory
6644 Note that 5 reads 'old' but 4 reads 'old#'. Similarly, 5 stores
6645 'data' but 7 stores 'data#'. Hence it is possible for the
6646 shadow data to be incorrectly checked and/or updated:
6648 * 7 is at least gated correctly, since the 'expected == old'
6649 condition is derived from outputs of 5. However, the shadow
6650 write could happen too late: imagine after 5 we are
6651 descheduled, a different thread runs, writes a different
6652 (shadow) value at the address, and then we resume, hence
6653 overwriting the shadow value written by the other thread.
6655 Because the original memory access is atomic, there's no way to
6656 make both the original and shadow accesses into a single atomic
6657 thing, hence this is unavoidable.
6659 At least as Valgrind stands, I don't think it's a problem, since
6660 we're single threaded *and* we guarantee that there are no
6661 context switches during the execution of any specific superblock
6662 -- context switches can only happen at superblock boundaries.
6664 If Valgrind ever becomes MT in the future, then it might be more
6665 of a problem. A possible kludge would be to artificially
6666 associate with the location, a lock, which we must acquire and
6667 release around the transaction as a whole. Hmm, that probably
6668 would't work properly since it only guards us against other
6669 threads doing CASs on the same location, not against other
6670 threads doing normal reads and writes.
6672 ------------------------------------------------------------
6674 COMMENT_ON_CasCmpEQ:
6676 Note two things. Firstly, in the sequence above, we compute
6677 "expected == old", but we don't check definedness of it. Why
6678 not? Also, the x86 and amd64 front ends use
6679 Iop_CasCmp{EQ,NE}{8,16,32,64} comparisons to make the equivalent
6680 determination (expected == old ?) for themselves, and we also
6681 don't check definedness for those primops; we just say that the
6682 result is defined. Why? Details follow.
6684 x86/amd64 contains various forms of locked insns:
6685 * lock prefix before all basic arithmetic insn;
6686 eg lock xorl %reg1,(%reg2)
6687 * atomic exchange reg-mem
6688 * compare-and-swaps
6690 Rather than attempt to represent them all, which would be a
6691 royal PITA, I used a result from Maurice Herlihy
6692 (http://en.wikipedia.org/wiki/Maurice_Herlihy), in which he
6693 demonstrates that compare-and-swap is a primitive more general
6694 than the other two, and so can be used to represent all of them.
6695 So the translation scheme for (eg) lock incl (%reg) is as
6696 follows:
6698 again:
6699 old = * %reg
6700 new = old + 1
6701 atomically { if (* %reg == old) { * %reg = new } else { goto again } }
6703 The "atomically" is the CAS bit. The scheme is always the same:
6704 get old value from memory, compute new value, atomically stuff
6705 new value back in memory iff the old value has not changed (iow,
6706 no other thread modified it in the meantime). If it has changed
6707 then we've been out-raced and we have to start over.
6709 Now that's all very neat, but it has the bad side effect of
6710 introducing an explicit equality test into the translation.
6711 Consider the behaviour of said code on a memory location which
6712 is uninitialised. We will wind up doing a comparison on
6713 uninitialised data, and mc duly complains.
6715 What's difficult about this is, the common case is that the
6716 location is uncontended, and so we're usually comparing the same
6717 value (* %reg) with itself. So we shouldn't complain even if it
6718 is undefined. But mc doesn't know that.
6720 My solution is to mark the == in the IR specially, so as to tell
6721 mc that it almost certainly compares a value with itself, and we
6722 should just regard the result as always defined. Rather than
6723 add a bit to all IROps, I just cloned Iop_CmpEQ{8,16,32,64} into
6724 Iop_CasCmpEQ{8,16,32,64} so as not to disturb anything else.
6726 So there's always the question of, can this give a false
6727 negative? eg, imagine that initially, * %reg is defined; and we
6728 read that; but then in the gap between the read and the CAS, a
6729 different thread writes an undefined (and different) value at
6730 the location. Then the CAS in this thread will fail and we will
6731 go back to "again:", but without knowing that the trip back
6732 there was based on an undefined comparison. No matter; at least
6733 the other thread won the race and the location is correctly
6734 marked as undefined. What if it wrote an uninitialised version
6735 of the same value that was there originally, though?
6737 etc etc. Seems like there's a small corner case in which we
6738 might lose the fact that something's defined -- we're out-raced
6739 in between the "old = * reg" and the "atomically {", _and_ the
6740 other thread is writing in an undefined version of what's
6741 already there. Well, that seems pretty unlikely.
6745 If we ever need to reinstate it .. code which generates a
6746 definedness test for "expected == old" was removed at r10432 of
6747 this file.
6749 if (cas->oldHi == IRTemp_INVALID) {
6750 do_shadow_CAS_single( mce, cas );
6751 } else {
6752 do_shadow_CAS_double( mce, cas );
6757 static void do_shadow_CAS_single ( MCEnv* mce, IRCAS* cas )
6759 IRAtom *vdataLo = NULL, *bdataLo = NULL;
6760 IRAtom *vexpdLo = NULL, *bexpdLo = NULL;
6761 IRAtom *voldLo = NULL, *boldLo = NULL;
6762 IRAtom *expd_eq_old = NULL;
6763 IROp opCasCmpEQ;
6764 Int elemSzB;
6765 IRType elemTy;
6766 Bool otrak = MC_(clo_mc_level) >= 3; /* a shorthand */
6768 /* single CAS */
6769 tl_assert(cas->oldHi == IRTemp_INVALID);
6770 tl_assert(cas->expdHi == NULL);
6771 tl_assert(cas->dataHi == NULL);
6773 elemTy = typeOfIRExpr(mce->sb->tyenv, cas->expdLo);
6774 switch (elemTy) {
6775 case Ity_I8: elemSzB = 1; opCasCmpEQ = Iop_CasCmpEQ8; break;
6776 case Ity_I16: elemSzB = 2; opCasCmpEQ = Iop_CasCmpEQ16; break;
6777 case Ity_I32: elemSzB = 4; opCasCmpEQ = Iop_CasCmpEQ32; break;
6778 case Ity_I64: elemSzB = 8; opCasCmpEQ = Iop_CasCmpEQ64; break;
6779 default: tl_assert(0); /* IR defn disallows any other types */
6782 /* 1. fetch data# (the proposed new value) */
6783 tl_assert(isOriginalAtom(mce, cas->dataLo));
6784 vdataLo
6785 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataLo, HuOth));
6786 tl_assert(isShadowAtom(mce, vdataLo));
6787 if (otrak) {
6788 bdataLo
6789 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataLo));
6790 tl_assert(isShadowAtom(mce, bdataLo));
6793 /* 2. fetch expected# (what we expect to see at the address) */
6794 tl_assert(isOriginalAtom(mce, cas->expdLo));
6795 vexpdLo
6796 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdLo, HuOth));
6797 tl_assert(isShadowAtom(mce, vexpdLo));
6798 if (otrak) {
6799 bexpdLo
6800 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdLo));
6801 tl_assert(isShadowAtom(mce, bexpdLo));
6804 /* 3. check definedness of address */
6805 /* 4. fetch old# from shadow memory; this also checks
6806 addressibility of the address */
6807 voldLo
6808 = assignNew(
6809 'V', mce, elemTy,
6810 expr2vbits_Load(
6811 mce,
6812 cas->end, elemTy, cas->addr, 0/*Addr bias*/,
6813 NULL/*always happens*/
6815 bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldLo), voldLo);
6816 if (otrak) {
6817 boldLo
6818 = assignNew('B', mce, Ity_I32,
6819 gen_load_b(mce, elemSzB, cas->addr, 0/*addr bias*/));
6820 bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldLo), boldLo);
6823 /* 5. the CAS itself */
6824 stmt( 'C', mce, IRStmt_CAS(cas) );
6826 /* 6. compute "expected == old" */
6827 /* See COMMENT_ON_CasCmpEQ in this file background/rationale. */
6828 /* Note that 'C' is kinda faking it; it is indeed a non-shadow
6829 tree, but it's not copied from the input block. */
6830 expd_eq_old
6831 = assignNew('C', mce, Ity_I1,
6832 binop(opCasCmpEQ, cas->expdLo, mkexpr(cas->oldLo)));
6834 /* 7. if "expected == old"
6835 store data# to shadow memory */
6836 do_shadow_Store( mce, cas->end, cas->addr, 0/*bias*/,
6837 NULL/*data*/, vdataLo/*vdata*/,
6838 expd_eq_old/*guard for store*/ );
6839 if (otrak) {
6840 gen_store_b( mce, elemSzB, cas->addr, 0/*offset*/,
6841 bdataLo/*bdata*/,
6842 expd_eq_old/*guard for store*/ );
6847 static void do_shadow_CAS_double ( MCEnv* mce, IRCAS* cas )
6849 IRAtom *vdataHi = NULL, *bdataHi = NULL;
6850 IRAtom *vdataLo = NULL, *bdataLo = NULL;
6851 IRAtom *vexpdHi = NULL, *bexpdHi = NULL;
6852 IRAtom *vexpdLo = NULL, *bexpdLo = NULL;
6853 IRAtom *voldHi = NULL, *boldHi = NULL;
6854 IRAtom *voldLo = NULL, *boldLo = NULL;
6855 IRAtom *xHi = NULL, *xLo = NULL, *xHL = NULL;
6856 IRAtom *expd_eq_old = NULL, *zero = NULL;
6857 IROp opCasCmpEQ, opOr, opXor;
6858 Int elemSzB, memOffsLo, memOffsHi;
6859 IRType elemTy;
6860 Bool otrak = MC_(clo_mc_level) >= 3; /* a shorthand */
6862 /* double CAS */
6863 tl_assert(cas->oldHi != IRTemp_INVALID);
6864 tl_assert(cas->expdHi != NULL);
6865 tl_assert(cas->dataHi != NULL);
6867 elemTy = typeOfIRExpr(mce->sb->tyenv, cas->expdLo);
6868 switch (elemTy) {
6869 case Ity_I8:
6870 opCasCmpEQ = Iop_CasCmpEQ8; opOr = Iop_Or8; opXor = Iop_Xor8;
6871 elemSzB = 1; zero = mkU8(0);
6872 break;
6873 case Ity_I16:
6874 opCasCmpEQ = Iop_CasCmpEQ16; opOr = Iop_Or16; opXor = Iop_Xor16;
6875 elemSzB = 2; zero = mkU16(0);
6876 break;
6877 case Ity_I32:
6878 opCasCmpEQ = Iop_CasCmpEQ32; opOr = Iop_Or32; opXor = Iop_Xor32;
6879 elemSzB = 4; zero = mkU32(0);
6880 break;
6881 case Ity_I64:
6882 opCasCmpEQ = Iop_CasCmpEQ64; opOr = Iop_Or64; opXor = Iop_Xor64;
6883 elemSzB = 8; zero = mkU64(0);
6884 break;
6885 default:
6886 tl_assert(0); /* IR defn disallows any other types */
6889 /* 1. fetch data# (the proposed new value) */
6890 tl_assert(isOriginalAtom(mce, cas->dataHi));
6891 tl_assert(isOriginalAtom(mce, cas->dataLo));
6892 vdataHi
6893 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataHi, HuOth));
6894 vdataLo
6895 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataLo, HuOth));
6896 tl_assert(isShadowAtom(mce, vdataHi));
6897 tl_assert(isShadowAtom(mce, vdataLo));
6898 if (otrak) {
6899 bdataHi
6900 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataHi));
6901 bdataLo
6902 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataLo));
6903 tl_assert(isShadowAtom(mce, bdataHi));
6904 tl_assert(isShadowAtom(mce, bdataLo));
6907 /* 2. fetch expected# (what we expect to see at the address) */
6908 tl_assert(isOriginalAtom(mce, cas->expdHi));
6909 tl_assert(isOriginalAtom(mce, cas->expdLo));
6910 vexpdHi
6911 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdHi, HuOth));
6912 vexpdLo
6913 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdLo, HuOth));
6914 tl_assert(isShadowAtom(mce, vexpdHi));
6915 tl_assert(isShadowAtom(mce, vexpdLo));
6916 if (otrak) {
6917 bexpdHi
6918 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdHi));
6919 bexpdLo
6920 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdLo));
6921 tl_assert(isShadowAtom(mce, bexpdHi));
6922 tl_assert(isShadowAtom(mce, bexpdLo));
6925 /* 3. check definedness of address */
6926 /* 4. fetch old# from shadow memory; this also checks
6927 addressibility of the address */
6928 if (cas->end == Iend_LE) {
6929 memOffsLo = 0;
6930 memOffsHi = elemSzB;
6931 } else {
6932 tl_assert(cas->end == Iend_BE);
6933 memOffsLo = elemSzB;
6934 memOffsHi = 0;
6936 voldHi
6937 = assignNew(
6938 'V', mce, elemTy,
6939 expr2vbits_Load(
6940 mce,
6941 cas->end, elemTy, cas->addr, memOffsHi/*Addr bias*/,
6942 NULL/*always happens*/
6944 voldLo
6945 = assignNew(
6946 'V', mce, elemTy,
6947 expr2vbits_Load(
6948 mce,
6949 cas->end, elemTy, cas->addr, memOffsLo/*Addr bias*/,
6950 NULL/*always happens*/
6952 bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldHi), voldHi);
6953 bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldLo), voldLo);
6954 if (otrak) {
6955 boldHi
6956 = assignNew('B', mce, Ity_I32,
6957 gen_load_b(mce, elemSzB, cas->addr,
6958 memOffsHi/*addr bias*/));
6959 boldLo
6960 = assignNew('B', mce, Ity_I32,
6961 gen_load_b(mce, elemSzB, cas->addr,
6962 memOffsLo/*addr bias*/));
6963 bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldHi), boldHi);
6964 bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldLo), boldLo);
6967 /* 5. the CAS itself */
6968 stmt( 'C', mce, IRStmt_CAS(cas) );
6970 /* 6. compute "expected == old" */
6971 /* See COMMENT_ON_CasCmpEQ in this file background/rationale. */
6972 /* Note that 'C' is kinda faking it; it is indeed a non-shadow
6973 tree, but it's not copied from the input block. */
6975 xHi = oldHi ^ expdHi;
6976 xLo = oldLo ^ expdLo;
6977 xHL = xHi | xLo;
6978 expd_eq_old = xHL == 0;
6980 xHi = assignNew('C', mce, elemTy,
6981 binop(opXor, cas->expdHi, mkexpr(cas->oldHi)));
6982 xLo = assignNew('C', mce, elemTy,
6983 binop(opXor, cas->expdLo, mkexpr(cas->oldLo)));
6984 xHL = assignNew('C', mce, elemTy,
6985 binop(opOr, xHi, xLo));
6986 expd_eq_old
6987 = assignNew('C', mce, Ity_I1,
6988 binop(opCasCmpEQ, xHL, zero));
6990 /* 7. if "expected == old"
6991 store data# to shadow memory */
6992 do_shadow_Store( mce, cas->end, cas->addr, memOffsHi/*bias*/,
6993 NULL/*data*/, vdataHi/*vdata*/,
6994 expd_eq_old/*guard for store*/ );
6995 do_shadow_Store( mce, cas->end, cas->addr, memOffsLo/*bias*/,
6996 NULL/*data*/, vdataLo/*vdata*/,
6997 expd_eq_old/*guard for store*/ );
6998 if (otrak) {
6999 gen_store_b( mce, elemSzB, cas->addr, memOffsHi/*offset*/,
7000 bdataHi/*bdata*/,
7001 expd_eq_old/*guard for store*/ );
7002 gen_store_b( mce, elemSzB, cas->addr, memOffsLo/*offset*/,
7003 bdataLo/*bdata*/,
7004 expd_eq_old/*guard for store*/ );
7009 /* ------ Dealing with LL/SC (not difficult) ------ */
7011 static void do_shadow_LLSC ( MCEnv* mce,
7012 IREndness stEnd,
7013 IRTemp stResult,
7014 IRExpr* stAddr,
7015 IRExpr* stStoredata )
7017 /* In short: treat a load-linked like a normal load followed by an
7018 assignment of the loaded (shadow) data to the result temporary.
7019 Treat a store-conditional like a normal store, and mark the
7020 result temporary as defined. */
7021 IRType resTy = typeOfIRTemp(mce->sb->tyenv, stResult);
7022 IRTemp resTmp = findShadowTmpV(mce, stResult);
7024 tl_assert(isIRAtom(stAddr));
7025 if (stStoredata)
7026 tl_assert(isIRAtom(stStoredata));
7028 if (stStoredata == NULL) {
7029 /* Load Linked */
7030 /* Just treat this as a normal load, followed by an assignment of
7031 the value to .result. */
7032 /* Stay sane */
7033 tl_assert(resTy == Ity_I128 || resTy == Ity_I64 || resTy == Ity_I32
7034 || resTy == Ity_I16 || resTy == Ity_I8);
7035 assign( 'V', mce, resTmp,
7036 expr2vbits_Load(
7037 mce, stEnd, resTy, stAddr, 0/*addr bias*/,
7038 NULL/*always happens*/) );
7039 } else {
7040 /* Store Conditional */
7041 /* Stay sane */
7042 IRType dataTy = typeOfIRExpr(mce->sb->tyenv,
7043 stStoredata);
7044 tl_assert(dataTy == Ity_I128 || dataTy == Ity_I64 || dataTy == Ity_I32
7045 || dataTy == Ity_I16 || dataTy == Ity_I8);
7046 do_shadow_Store( mce, stEnd,
7047 stAddr, 0/* addr bias */,
7048 stStoredata,
7049 NULL /* shadow data */,
7050 NULL/*guard*/ );
7051 /* This is a store conditional, so it writes to .result a value
7052 indicating whether or not the store succeeded. Just claim
7053 this value is always defined. In the PowerPC interpretation
7054 of store-conditional, definedness of the success indication
7055 depends on whether the address of the store matches the
7056 reservation address. But we can't tell that here (and
7057 anyway, we're not being PowerPC-specific). At least we are
7058 guaranteed that the definedness of the store address, and its
7059 addressibility, will be checked as per normal. So it seems
7060 pretty safe to just say that the success indication is always
7061 defined.
7063 In schemeS, for origin tracking, we must correspondingly set
7064 a no-origin value for the origin shadow of .result.
7066 tl_assert(resTy == Ity_I1);
7067 assign( 'V', mce, resTmp, definedOfType(resTy) );
7072 /* ---- Dealing with LoadG/StoreG (not entirely simple) ---- */
7074 static void do_shadow_StoreG ( MCEnv* mce, IRStoreG* sg )
7076 complainIfUndefined(mce, sg->guard, NULL);
7077 /* do_shadow_Store will generate code to check the definedness and
7078 validity of sg->addr, in the case where sg->guard evaluates to
7079 True at run-time. */
7080 do_shadow_Store( mce, sg->end,
7081 sg->addr, 0/* addr bias */,
7082 sg->data,
7083 NULL /* shadow data */,
7084 sg->guard );
7087 static void do_shadow_LoadG ( MCEnv* mce, IRLoadG* lg )
7089 complainIfUndefined(mce, lg->guard, NULL);
7090 /* expr2vbits_Load_guarded_General will generate code to check the
7091 definedness and validity of lg->addr, in the case where
7092 lg->guard evaluates to True at run-time. */
7094 /* Look at the LoadG's built-in conversion operation, to determine
7095 the source (actual loaded data) type, and the equivalent IROp.
7096 NOTE that implicitly we are taking a widening operation to be
7097 applied to original atoms and producing one that applies to V
7098 bits. Since signed and unsigned widening are self-shadowing,
7099 this is a straight copy of the op (modulo swapping from the
7100 IRLoadGOp form to the IROp form). Note also therefore that this
7101 implicitly duplicates the logic to do with said widening ops in
7102 expr2vbits_Unop. See comment at the start of expr2vbits_Unop. */
7103 IROp vwiden = Iop_INVALID;
7104 IRType loadedTy = Ity_INVALID;
7105 switch (lg->cvt) {
7106 case ILGop_IdentV128: loadedTy = Ity_V128; vwiden = Iop_INVALID; break;
7107 case ILGop_Ident64: loadedTy = Ity_I64; vwiden = Iop_INVALID; break;
7108 case ILGop_Ident32: loadedTy = Ity_I32; vwiden = Iop_INVALID; break;
7109 case ILGop_16Uto32: loadedTy = Ity_I16; vwiden = Iop_16Uto32; break;
7110 case ILGop_16Sto32: loadedTy = Ity_I16; vwiden = Iop_16Sto32; break;
7111 case ILGop_8Uto32: loadedTy = Ity_I8; vwiden = Iop_8Uto32; break;
7112 case ILGop_8Sto32: loadedTy = Ity_I8; vwiden = Iop_8Sto32; break;
7113 default: VG_(tool_panic)("do_shadow_LoadG");
7116 IRAtom* vbits_alt
7117 = expr2vbits( mce, lg->alt, HuOth );
7118 IRAtom* vbits_final
7119 = expr2vbits_Load_guarded_General(mce, lg->end, loadedTy,
7120 lg->addr, 0/*addr bias*/,
7121 lg->guard, vwiden, vbits_alt );
7122 /* And finally, bind the V bits to the destination temporary. */
7123 assign( 'V', mce, findShadowTmpV(mce, lg->dst), vbits_final );
7127 /*------------------------------------------------------------*/
7128 /*--- Origin tracking stuff ---*/
7129 /*------------------------------------------------------------*/
7131 /* Almost identical to findShadowTmpV. */
7132 static IRTemp findShadowTmpB ( MCEnv* mce, IRTemp orig )
7134 TempMapEnt* ent;
7135 /* VG_(indexXA) range-checks 'orig', hence no need to check
7136 here. */
7137 ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
7138 tl_assert(ent->kind == Orig);
7139 if (ent->shadowB == IRTemp_INVALID) {
7140 IRTemp tmpB
7141 = newTemp( mce, Ity_I32, BSh );
7142 /* newTemp may cause mce->tmpMap to resize, hence previous results
7143 from VG_(indexXA) are invalid. */
7144 ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
7145 tl_assert(ent->kind == Orig);
7146 tl_assert(ent->shadowB == IRTemp_INVALID);
7147 ent->shadowB = tmpB;
7149 return ent->shadowB;
7152 static IRAtom* gen_maxU32 ( MCEnv* mce, IRAtom* b1, IRAtom* b2 )
7154 return assignNew( 'B', mce, Ity_I32, binop(Iop_Max32U, b1, b2) );
7158 /* Make a guarded origin load, with no special handling in the
7159 didn't-happen case. A GUARD of NULL is assumed to mean "always
7160 True".
7162 Generate IR to do a shadow origins load from BASEADDR+OFFSET and
7163 return the otag. The loaded size is SZB. If GUARD evaluates to
7164 False at run time then the returned otag is zero.
7166 static IRAtom* gen_guarded_load_b ( MCEnv* mce, Int szB,
7167 IRAtom* baseaddr,
7168 Int offset, IRExpr* guard )
7170 void* hFun;
7171 const HChar* hName;
7172 IRTemp bTmp;
7173 IRDirty* di;
7174 IRType aTy = typeOfIRExpr( mce->sb->tyenv, baseaddr );
7175 IROp opAdd = aTy == Ity_I32 ? Iop_Add32 : Iop_Add64;
7176 IRAtom* ea = baseaddr;
7177 if (offset != 0) {
7178 IRAtom* off = aTy == Ity_I32 ? mkU32( offset )
7179 : mkU64( (Long)(Int)offset );
7180 ea = assignNew( 'B', mce, aTy, binop(opAdd, ea, off));
7182 bTmp = newTemp(mce, mce->hWordTy, BSh);
7184 switch (szB) {
7185 case 1: hFun = (void*)&MC_(helperc_b_load1);
7186 hName = "MC_(helperc_b_load1)";
7187 break;
7188 case 2: hFun = (void*)&MC_(helperc_b_load2);
7189 hName = "MC_(helperc_b_load2)";
7190 break;
7191 case 4: hFun = (void*)&MC_(helperc_b_load4);
7192 hName = "MC_(helperc_b_load4)";
7193 break;
7194 case 8: hFun = (void*)&MC_(helperc_b_load8);
7195 hName = "MC_(helperc_b_load8)";
7196 break;
7197 case 16: hFun = (void*)&MC_(helperc_b_load16);
7198 hName = "MC_(helperc_b_load16)";
7199 break;
7200 case 32: hFun = (void*)&MC_(helperc_b_load32);
7201 hName = "MC_(helperc_b_load32)";
7202 break;
7203 default:
7204 VG_(printf)("mc_translate.c: gen_load_b: unhandled szB == %d\n", szB);
7205 tl_assert(0);
7207 di = unsafeIRDirty_1_N(
7208 bTmp, 1/*regparms*/, hName, VG_(fnptr_to_fnentry)( hFun ),
7209 mkIRExprVec_1( ea )
7211 if (guard) {
7212 di->guard = guard;
7213 /* Ideally the didn't-happen return value here would be
7214 all-zeroes (unknown-origin), so it'd be harmless if it got
7215 used inadvertently. We slum it out with the IR-mandated
7216 default value (0b01 repeating, 0x55 etc) as that'll probably
7217 trump all legitimate otags via Max32, and it's pretty
7218 obviously bogus. */
7220 /* no need to mess with any annotations. This call accesses
7221 neither guest state nor guest memory. */
7222 stmt( 'B', mce, IRStmt_Dirty(di) );
7223 if (mce->hWordTy == Ity_I64) {
7224 /* 64-bit host */
7225 IRTemp bTmp32 = newTemp(mce, Ity_I32, BSh);
7226 assign( 'B', mce, bTmp32, unop(Iop_64to32, mkexpr(bTmp)) );
7227 return mkexpr(bTmp32);
7228 } else {
7229 /* 32-bit host */
7230 return mkexpr(bTmp);
7235 /* Generate IR to do a shadow origins load from BASEADDR+OFFSET. The
7236 loaded size is SZB. The load is regarded as unconditional (always
7237 happens).
7239 static IRAtom* gen_load_b ( MCEnv* mce, Int szB, IRAtom* baseaddr,
7240 Int offset )
7242 return gen_guarded_load_b(mce, szB, baseaddr, offset, NULL/*guard*/);
7246 /* The most general handler for guarded origin loads. A GUARD of NULL
7247 is assumed to mean "always True".
7249 Generate IR to do a shadow origin load from ADDR+BIAS and return
7250 the B bits. The loaded type is TY. If GUARD evaluates to False at
7251 run time then the returned B bits are simply BALT instead.
7253 static
7254 IRAtom* expr2ori_Load_guarded_General ( MCEnv* mce,
7255 IRType ty,
7256 IRAtom* addr, UInt bias,
7257 IRAtom* guard, IRAtom* balt )
7259 /* If the guard evaluates to True, this will hold the loaded
7260 origin. If the guard evaluates to False, this will be zero,
7261 meaning "unknown origin", in which case we will have to replace
7262 it using an ITE below. */
7263 IRAtom* iftrue
7264 = assignNew('B', mce, Ity_I32,
7265 gen_guarded_load_b(mce, sizeofIRType(ty),
7266 addr, bias, guard));
7267 /* These are the bits we will return if the load doesn't take
7268 place. */
7269 IRAtom* iffalse
7270 = balt;
7271 /* Prepare the cond for the ITE. Convert a NULL cond into
7272 something that iropt knows how to fold out later. */
7273 IRAtom* cond
7274 = guard == NULL ? mkU1(1) : guard;
7275 /* And assemble the final result. */
7276 return assignNew('B', mce, Ity_I32, IRExpr_ITE(cond, iftrue, iffalse));
7280 /* Generate a shadow origins store. guard :: Ity_I1 controls whether
7281 the store really happens; NULL means it unconditionally does. */
7282 static void gen_store_b ( MCEnv* mce, Int szB,
7283 IRAtom* baseaddr, Int offset, IRAtom* dataB,
7284 IRAtom* guard )
7286 void* hFun;
7287 const HChar* hName;
7288 IRDirty* di;
7289 IRType aTy = typeOfIRExpr( mce->sb->tyenv, baseaddr );
7290 IROp opAdd = aTy == Ity_I32 ? Iop_Add32 : Iop_Add64;
7291 IRAtom* ea = baseaddr;
7292 if (guard) {
7293 tl_assert(isOriginalAtom(mce, guard));
7294 tl_assert(typeOfIRExpr(mce->sb->tyenv, guard) == Ity_I1);
7296 if (offset != 0) {
7297 IRAtom* off = aTy == Ity_I32 ? mkU32( offset )
7298 : mkU64( (Long)(Int)offset );
7299 ea = assignNew( 'B', mce, aTy, binop(opAdd, ea, off));
7301 if (mce->hWordTy == Ity_I64)
7302 dataB = assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, dataB));
7304 switch (szB) {
7305 case 1: hFun = (void*)&MC_(helperc_b_store1);
7306 hName = "MC_(helperc_b_store1)";
7307 break;
7308 case 2: hFun = (void*)&MC_(helperc_b_store2);
7309 hName = "MC_(helperc_b_store2)";
7310 break;
7311 case 4: hFun = (void*)&MC_(helperc_b_store4);
7312 hName = "MC_(helperc_b_store4)";
7313 break;
7314 case 8: hFun = (void*)&MC_(helperc_b_store8);
7315 hName = "MC_(helperc_b_store8)";
7316 break;
7317 case 16: hFun = (void*)&MC_(helperc_b_store16);
7318 hName = "MC_(helperc_b_store16)";
7319 break;
7320 case 32: hFun = (void*)&MC_(helperc_b_store32);
7321 hName = "MC_(helperc_b_store32)";
7322 break;
7323 default:
7324 tl_assert(0);
7326 di = unsafeIRDirty_0_N( 2/*regparms*/,
7327 hName, VG_(fnptr_to_fnentry)( hFun ),
7328 mkIRExprVec_2( ea, dataB )
7330 /* no need to mess with any annotations. This call accesses
7331 neither guest state nor guest memory. */
7332 if (guard) di->guard = guard;
7333 stmt( 'B', mce, IRStmt_Dirty(di) );
7336 static IRAtom* narrowTo32 ( MCEnv* mce, IRAtom* e ) {
7337 IRType eTy = typeOfIRExpr(mce->sb->tyenv, e);
7338 if (eTy == Ity_I64)
7339 return assignNew( 'B', mce, Ity_I32, unop(Iop_64to32, e) );
7340 if (eTy == Ity_I32)
7341 return e;
7342 tl_assert(0);
7345 static IRAtom* zWidenFrom32 ( MCEnv* mce, IRType dstTy, IRAtom* e ) {
7346 IRType eTy = typeOfIRExpr(mce->sb->tyenv, e);
7347 tl_assert(eTy == Ity_I32);
7348 if (dstTy == Ity_I64)
7349 return assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, e) );
7350 tl_assert(0);
7354 static IRAtom* schemeE ( MCEnv* mce, IRExpr* e )
7356 tl_assert(MC_(clo_mc_level) == 3);
7358 switch (e->tag) {
7360 case Iex_GetI: {
7361 IRRegArray* descr_b;
7362 IRAtom *t1, *t2, *t3, *t4;
7363 IRRegArray* descr = e->Iex.GetI.descr;
7364 IRType equivIntTy
7365 = MC_(get_otrack_reg_array_equiv_int_type)(descr);
7366 /* If this array is unshadowable for whatever reason, use the
7367 usual approximation. */
7368 if (equivIntTy == Ity_INVALID)
7369 return mkU32(0);
7370 tl_assert(sizeofIRType(equivIntTy) >= 4);
7371 tl_assert(sizeofIRType(equivIntTy) == sizeofIRType(descr->elemTy));
7372 descr_b = mkIRRegArray( descr->base + 2*mce->layout->total_sizeB,
7373 equivIntTy, descr->nElems );
7374 /* Do a shadow indexed get of the same size, giving t1. Take
7375 the bottom 32 bits of it, giving t2. Compute into t3 the
7376 origin for the index (almost certainly zero, but there's
7377 no harm in being completely general here, since iropt will
7378 remove any useless code), and fold it in, giving a final
7379 value t4. */
7380 t1 = assignNew( 'B', mce, equivIntTy,
7381 IRExpr_GetI( descr_b, e->Iex.GetI.ix,
7382 e->Iex.GetI.bias ));
7383 t2 = narrowTo32( mce, t1 );
7384 t3 = schemeE( mce, e->Iex.GetI.ix );
7385 t4 = gen_maxU32( mce, t2, t3 );
7386 return t4;
7388 case Iex_CCall: {
7389 Int i;
7390 IRAtom* here;
7391 IRExpr** args = e->Iex.CCall.args;
7392 IRAtom* curr = mkU32(0);
7393 for (i = 0; args[i]; i++) {
7394 tl_assert(i < 32);
7395 tl_assert(isOriginalAtom(mce, args[i]));
7396 /* Only take notice of this arg if the callee's
7397 mc-exclusion mask does not say it is to be excluded. */
7398 if (e->Iex.CCall.cee->mcx_mask & (1<<i)) {
7399 /* the arg is to be excluded from definedness checking.
7400 Do nothing. */
7401 if (0) VG_(printf)("excluding %s(%d)\n",
7402 e->Iex.CCall.cee->name, i);
7403 } else {
7404 /* calculate the arg's definedness, and pessimistically
7405 merge it in. */
7406 here = schemeE( mce, args[i] );
7407 curr = gen_maxU32( mce, curr, here );
7410 return curr;
7412 case Iex_Load: {
7413 Int dszB;
7414 dszB = sizeofIRType(e->Iex.Load.ty);
7415 /* assert that the B value for the address is already
7416 available (somewhere) */
7417 tl_assert(isIRAtom(e->Iex.Load.addr));
7418 tl_assert(mce->hWordTy == Ity_I32 || mce->hWordTy == Ity_I64);
7419 return gen_load_b( mce, dszB, e->Iex.Load.addr, 0 );
7421 case Iex_ITE: {
7422 IRAtom* b1 = schemeE( mce, e->Iex.ITE.cond );
7423 IRAtom* b3 = schemeE( mce, e->Iex.ITE.iftrue );
7424 IRAtom* b2 = schemeE( mce, e->Iex.ITE.iffalse );
7425 return gen_maxU32( mce, b1, gen_maxU32( mce, b2, b3 ));
7427 case Iex_Qop: {
7428 IRAtom* b1 = schemeE( mce, e->Iex.Qop.details->arg1 );
7429 IRAtom* b2 = schemeE( mce, e->Iex.Qop.details->arg2 );
7430 IRAtom* b3 = schemeE( mce, e->Iex.Qop.details->arg3 );
7431 IRAtom* b4 = schemeE( mce, e->Iex.Qop.details->arg4 );
7432 return gen_maxU32( mce, gen_maxU32( mce, b1, b2 ),
7433 gen_maxU32( mce, b3, b4 ) );
7435 case Iex_Triop: {
7436 IRAtom* b1 = schemeE( mce, e->Iex.Triop.details->arg1 );
7437 IRAtom* b2 = schemeE( mce, e->Iex.Triop.details->arg2 );
7438 IRAtom* b3 = schemeE( mce, e->Iex.Triop.details->arg3 );
7439 return gen_maxU32( mce, b1, gen_maxU32( mce, b2, b3 ) );
7441 case Iex_Binop: {
7442 switch (e->Iex.Binop.op) {
7443 case Iop_CasCmpEQ8: case Iop_CasCmpNE8:
7444 case Iop_CasCmpEQ16: case Iop_CasCmpNE16:
7445 case Iop_CasCmpEQ32: case Iop_CasCmpNE32:
7446 case Iop_CasCmpEQ64: case Iop_CasCmpNE64:
7447 /* Just say these all produce a defined result,
7448 regardless of their arguments. See
7449 COMMENT_ON_CasCmpEQ in this file. */
7450 return mkU32(0);
7451 default: {
7452 IRAtom* b1 = schemeE( mce, e->Iex.Binop.arg1 );
7453 IRAtom* b2 = schemeE( mce, e->Iex.Binop.arg2 );
7454 return gen_maxU32( mce, b1, b2 );
7457 tl_assert(0);
7458 /*NOTREACHED*/
7460 case Iex_Unop: {
7461 IRAtom* b1 = schemeE( mce, e->Iex.Unop.arg );
7462 return b1;
7464 case Iex_Const:
7465 return mkU32(0);
7466 case Iex_RdTmp:
7467 return mkexpr( findShadowTmpB( mce, e->Iex.RdTmp.tmp ));
7468 case Iex_Get: {
7469 Int b_offset = MC_(get_otrack_shadow_offset)(
7470 e->Iex.Get.offset,
7471 sizeofIRType(e->Iex.Get.ty)
7473 tl_assert(b_offset >= -1
7474 && b_offset <= mce->layout->total_sizeB -4);
7475 if (b_offset >= 0) {
7476 /* FIXME: this isn't an atom! */
7477 return IRExpr_Get( b_offset + 2*mce->layout->total_sizeB,
7478 Ity_I32 );
7480 return mkU32(0);
7482 default:
7483 VG_(printf)("mc_translate.c: schemeE: unhandled: ");
7484 ppIRExpr(e);
7485 VG_(tool_panic)("memcheck:schemeE");
7490 static void do_origins_Dirty ( MCEnv* mce, IRDirty* d )
7492 // This is a hacked version of do_shadow_Dirty
7493 Int i, k, n, toDo, gSz, gOff;
7494 IRAtom *here, *curr;
7495 IRTemp dst;
7497 /* First check the guard. */
7498 curr = schemeE( mce, d->guard );
7500 /* Now round up all inputs and maxU32 over them. */
7502 /* Inputs: unmasked args
7503 Note: arguments are evaluated REGARDLESS of the guard expression */
7504 for (i = 0; d->args[i]; i++) {
7505 IRAtom* arg = d->args[i];
7506 if ( (d->cee->mcx_mask & (1<<i))
7507 || UNLIKELY(is_IRExpr_VECRET_or_GSPTR(arg)) ) {
7508 /* ignore this arg */
7509 } else {
7510 here = schemeE( mce, arg );
7511 curr = gen_maxU32( mce, curr, here );
7515 /* Inputs: guest state that we read. */
7516 for (i = 0; i < d->nFxState; i++) {
7517 tl_assert(d->fxState[i].fx != Ifx_None);
7518 if (d->fxState[i].fx == Ifx_Write)
7519 continue;
7521 /* Enumerate the described state segments */
7522 for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
7523 gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
7524 gSz = d->fxState[i].size;
7526 /* Ignore any sections marked as 'always defined'. */
7527 if (isAlwaysDefd(mce, gOff, gSz)) {
7528 if (0)
7529 VG_(printf)("memcheck: Dirty gst: ignored off %d, sz %d\n",
7530 gOff, gSz);
7531 continue;
7534 /* This state element is read or modified. So we need to
7535 consider it. If larger than 4 bytes, deal with it in
7536 4-byte chunks. */
7537 while (True) {
7538 Int b_offset;
7539 tl_assert(gSz >= 0);
7540 if (gSz == 0) break;
7541 n = gSz <= 4 ? gSz : 4;
7542 /* update 'curr' with maxU32 of the state slice
7543 gOff .. gOff+n-1 */
7544 b_offset = MC_(get_otrack_shadow_offset)(gOff, 4);
7545 if (b_offset != -1) {
7546 /* Observe the guard expression. If it is false use 0, i.e.
7547 nothing is known about the origin */
7548 IRAtom *cond, *iffalse, *iftrue;
7550 cond = assignNew( 'B', mce, Ity_I1, d->guard);
7551 iffalse = mkU32(0);
7552 iftrue = assignNew( 'B', mce, Ity_I32,
7553 IRExpr_Get(b_offset
7554 + 2*mce->layout->total_sizeB,
7555 Ity_I32));
7556 here = assignNew( 'B', mce, Ity_I32,
7557 IRExpr_ITE(cond, iftrue, iffalse));
7558 curr = gen_maxU32( mce, curr, here );
7560 gSz -= n;
7561 gOff += n;
7566 /* Inputs: memory */
7568 if (d->mFx != Ifx_None) {
7569 /* Because we may do multiple shadow loads/stores from the same
7570 base address, it's best to do a single test of its
7571 definedness right now. Post-instrumentation optimisation
7572 should remove all but this test. */
7573 tl_assert(d->mAddr);
7574 here = schemeE( mce, d->mAddr );
7575 curr = gen_maxU32( mce, curr, here );
7578 /* Deal with memory inputs (reads or modifies) */
7579 if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify) {
7580 toDo = d->mSize;
7581 /* chew off 32-bit chunks. We don't care about the endianness
7582 since it's all going to be condensed down to a single bit,
7583 but nevertheless choose an endianness which is hopefully
7584 native to the platform. */
7585 while (toDo >= 4) {
7586 here = gen_guarded_load_b( mce, 4, d->mAddr, d->mSize - toDo,
7587 d->guard );
7588 curr = gen_maxU32( mce, curr, here );
7589 toDo -= 4;
7591 /* handle possible 16-bit excess */
7592 while (toDo >= 2) {
7593 here = gen_guarded_load_b( mce, 2, d->mAddr, d->mSize - toDo,
7594 d->guard );
7595 curr = gen_maxU32( mce, curr, here );
7596 toDo -= 2;
7598 /* chew off the remaining 8-bit chunk, if any */
7599 if (toDo == 1) {
7600 here = gen_guarded_load_b( mce, 1, d->mAddr, d->mSize - toDo,
7601 d->guard );
7602 curr = gen_maxU32( mce, curr, here );
7603 toDo -= 1;
7605 tl_assert(toDo == 0);
7608 /* Whew! So curr is a 32-bit B-value which should give an origin
7609 of some use if any of the inputs to the helper are undefined.
7610 Now we need to re-distribute the results to all destinations. */
7612 /* Outputs: the destination temporary, if there is one. */
7613 if (d->tmp != IRTemp_INVALID) {
7614 dst = findShadowTmpB(mce, d->tmp);
7615 assign( 'V', mce, dst, curr );
7618 /* Outputs: guest state that we write or modify. */
7619 for (i = 0; i < d->nFxState; i++) {
7620 tl_assert(d->fxState[i].fx != Ifx_None);
7621 if (d->fxState[i].fx == Ifx_Read)
7622 continue;
7624 /* Enumerate the described state segments */
7625 for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
7626 gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
7627 gSz = d->fxState[i].size;
7629 /* Ignore any sections marked as 'always defined'. */
7630 if (isAlwaysDefd(mce, gOff, gSz))
7631 continue;
7633 /* This state element is written or modified. So we need to
7634 consider it. If larger than 4 bytes, deal with it in
7635 4-byte chunks. */
7636 while (True) {
7637 Int b_offset;
7638 tl_assert(gSz >= 0);
7639 if (gSz == 0) break;
7640 n = gSz <= 4 ? gSz : 4;
7641 /* Write 'curr' to the state slice gOff .. gOff+n-1 */
7642 b_offset = MC_(get_otrack_shadow_offset)(gOff, 4);
7643 if (b_offset != -1) {
7645 /* If the guard expression evaluates to false we simply Put
7646 the value that is already stored in the guest state slot */
7647 IRAtom *cond, *iffalse;
7649 cond = assignNew('B', mce, Ity_I1,
7650 d->guard);
7651 iffalse = assignNew('B', mce, Ity_I32,
7652 IRExpr_Get(b_offset +
7653 2*mce->layout->total_sizeB,
7654 Ity_I32));
7655 curr = assignNew('V', mce, Ity_I32,
7656 IRExpr_ITE(cond, curr, iffalse));
7658 stmt( 'B', mce, IRStmt_Put(b_offset
7659 + 2*mce->layout->total_sizeB,
7660 curr ));
7662 gSz -= n;
7663 gOff += n;
7668 /* Outputs: memory that we write or modify. Same comments about
7669 endianness as above apply. */
7670 if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify) {
7671 toDo = d->mSize;
7672 /* chew off 32-bit chunks */
7673 while (toDo >= 4) {
7674 gen_store_b( mce, 4, d->mAddr, d->mSize - toDo, curr,
7675 d->guard );
7676 toDo -= 4;
7678 /* handle possible 16-bit excess */
7679 while (toDo >= 2) {
7680 gen_store_b( mce, 2, d->mAddr, d->mSize - toDo, curr,
7681 d->guard );
7682 toDo -= 2;
7684 /* chew off the remaining 8-bit chunk, if any */
7685 if (toDo == 1) {
7686 gen_store_b( mce, 1, d->mAddr, d->mSize - toDo, curr,
7687 d->guard );
7688 toDo -= 1;
7690 tl_assert(toDo == 0);
7695 /* Generate IR for origin shadowing for a general guarded store. */
7696 static void do_origins_Store_guarded ( MCEnv* mce,
7697 IREndness stEnd,
7698 IRExpr* stAddr,
7699 IRExpr* stData,
7700 IRExpr* guard )
7702 Int dszB;
7703 IRAtom* dataB;
7704 /* assert that the B value for the address is already available
7705 (somewhere), since the call to schemeE will want to see it.
7706 XXXX how does this actually ensure that?? */
7707 tl_assert(isIRAtom(stAddr));
7708 tl_assert(isIRAtom(stData));
7709 dszB = sizeofIRType( typeOfIRExpr(mce->sb->tyenv, stData ) );
7710 dataB = schemeE( mce, stData );
7711 gen_store_b( mce, dszB, stAddr, 0/*offset*/, dataB, guard );
7715 /* Generate IR for origin shadowing for a plain store. */
7716 static void do_origins_Store_plain ( MCEnv* mce,
7717 IREndness stEnd,
7718 IRExpr* stAddr,
7719 IRExpr* stData )
7721 do_origins_Store_guarded ( mce, stEnd, stAddr, stData,
7722 NULL/*guard*/ );
7726 /* ---- Dealing with LoadG/StoreG (not entirely simple) ---- */
7728 static void do_origins_StoreG ( MCEnv* mce, IRStoreG* sg )
7730 do_origins_Store_guarded( mce, sg->end, sg->addr,
7731 sg->data, sg->guard );
7734 static void do_origins_LoadG ( MCEnv* mce, IRLoadG* lg )
7736 IRType loadedTy = Ity_INVALID;
7737 switch (lg->cvt) {
7738 case ILGop_IdentV128: loadedTy = Ity_V128; break;
7739 case ILGop_Ident64: loadedTy = Ity_I64; break;
7740 case ILGop_Ident32: loadedTy = Ity_I32; break;
7741 case ILGop_16Uto32: loadedTy = Ity_I16; break;
7742 case ILGop_16Sto32: loadedTy = Ity_I16; break;
7743 case ILGop_8Uto32: loadedTy = Ity_I8; break;
7744 case ILGop_8Sto32: loadedTy = Ity_I8; break;
7745 default: VG_(tool_panic)("schemeS.IRLoadG");
7747 IRAtom* ori_alt
7748 = schemeE( mce,lg->alt );
7749 IRAtom* ori_final
7750 = expr2ori_Load_guarded_General(mce, loadedTy,
7751 lg->addr, 0/*addr bias*/,
7752 lg->guard, ori_alt );
7753 /* And finally, bind the origin to the destination temporary. */
7754 assign( 'B', mce, findShadowTmpB(mce, lg->dst), ori_final );
7758 static void schemeS ( MCEnv* mce, IRStmt* st )
7760 tl_assert(MC_(clo_mc_level) == 3);
7762 switch (st->tag) {
7764 case Ist_AbiHint:
7765 /* The value-check instrumenter handles this - by arranging
7766 to pass the address of the next instruction to
7767 MC_(helperc_MAKE_STACK_UNINIT). This is all that needs to
7768 happen for origin tracking w.r.t. AbiHints. So there is
7769 nothing to do here. */
7770 break;
7772 case Ist_PutI: {
7773 IRPutI *puti = st->Ist.PutI.details;
7774 IRRegArray* descr_b;
7775 IRAtom *t1, *t2, *t3, *t4;
7776 IRRegArray* descr = puti->descr;
7777 IRType equivIntTy
7778 = MC_(get_otrack_reg_array_equiv_int_type)(descr);
7779 /* If this array is unshadowable for whatever reason,
7780 generate no code. */
7781 if (equivIntTy == Ity_INVALID)
7782 break;
7783 tl_assert(sizeofIRType(equivIntTy) >= 4);
7784 tl_assert(sizeofIRType(equivIntTy) == sizeofIRType(descr->elemTy));
7785 descr_b
7786 = mkIRRegArray( descr->base + 2*mce->layout->total_sizeB,
7787 equivIntTy, descr->nElems );
7788 /* Compute a value to Put - the conjoinment of the origin for
7789 the data to be Put-ted (obviously) and of the index value
7790 (not so obviously). */
7791 t1 = schemeE( mce, puti->data );
7792 t2 = schemeE( mce, puti->ix );
7793 t3 = gen_maxU32( mce, t1, t2 );
7794 t4 = zWidenFrom32( mce, equivIntTy, t3 );
7795 stmt( 'B', mce, IRStmt_PutI( mkIRPutI(descr_b, puti->ix,
7796 puti->bias, t4) ));
7797 break;
7800 case Ist_Dirty:
7801 do_origins_Dirty( mce, st->Ist.Dirty.details );
7802 break;
7804 case Ist_Store:
7805 do_origins_Store_plain( mce, st->Ist.Store.end,
7806 st->Ist.Store.addr,
7807 st->Ist.Store.data );
7808 break;
7810 case Ist_StoreG:
7811 do_origins_StoreG( mce, st->Ist.StoreG.details );
7812 break;
7814 case Ist_LoadG:
7815 do_origins_LoadG( mce, st->Ist.LoadG.details );
7816 break;
7818 case Ist_LLSC: {
7819 /* In short: treat a load-linked like a normal load followed
7820 by an assignment of the loaded (shadow) data the result
7821 temporary. Treat a store-conditional like a normal store,
7822 and mark the result temporary as defined. */
7823 if (st->Ist.LLSC.storedata == NULL) {
7824 /* Load Linked */
7825 IRType resTy
7826 = typeOfIRTemp(mce->sb->tyenv, st->Ist.LLSC.result);
7827 IRExpr* vanillaLoad
7828 = IRExpr_Load(st->Ist.LLSC.end, resTy, st->Ist.LLSC.addr);
7829 tl_assert(resTy == Ity_I128 || resTy == Ity_I64 || resTy == Ity_I32
7830 || resTy == Ity_I16 || resTy == Ity_I8);
7831 assign( 'B', mce, findShadowTmpB(mce, st->Ist.LLSC.result),
7832 schemeE(mce, vanillaLoad));
7833 } else {
7834 /* Store conditional */
7835 do_origins_Store_plain( mce, st->Ist.LLSC.end,
7836 st->Ist.LLSC.addr,
7837 st->Ist.LLSC.storedata );
7838 /* For the rationale behind this, see comments at the
7839 place where the V-shadow for .result is constructed, in
7840 do_shadow_LLSC. In short, we regard .result as
7841 always-defined. */
7842 assign( 'B', mce, findShadowTmpB(mce, st->Ist.LLSC.result),
7843 mkU32(0) );
7845 break;
7848 case Ist_Put: {
7849 Int b_offset
7850 = MC_(get_otrack_shadow_offset)(
7851 st->Ist.Put.offset,
7852 sizeofIRType(typeOfIRExpr(mce->sb->tyenv, st->Ist.Put.data))
7854 if (b_offset >= 0) {
7855 /* FIXME: this isn't an atom! */
7856 stmt( 'B', mce, IRStmt_Put(b_offset + 2*mce->layout->total_sizeB,
7857 schemeE( mce, st->Ist.Put.data )) );
7859 break;
7862 case Ist_WrTmp:
7863 assign( 'B', mce, findShadowTmpB(mce, st->Ist.WrTmp.tmp),
7864 schemeE(mce, st->Ist.WrTmp.data) );
7865 break;
7867 case Ist_MBE:
7868 case Ist_NoOp:
7869 case Ist_Exit:
7870 case Ist_IMark:
7871 break;
7873 default:
7874 VG_(printf)("mc_translate.c: schemeS: unhandled: ");
7875 ppIRStmt(st);
7876 VG_(tool_panic)("memcheck:schemeS");
7881 /*------------------------------------------------------------*/
7882 /*--- Post-tree-build final tidying ---*/
7883 /*------------------------------------------------------------*/
7885 /* This exploits the observation that Memcheck often produces
7886 repeated conditional calls of the form
7888 Dirty G MC_(helperc_value_check0/1/4/8_fail)(UInt otag)
7890 with the same guard expression G guarding the same helper call.
7891 The second and subsequent calls are redundant. This usually
7892 results from instrumentation of guest code containing multiple
7893 memory references at different constant offsets from the same base
7894 register. After optimisation of the instrumentation, you get a
7895 test for the definedness of the base register for each memory
7896 reference, which is kinda pointless. MC_(final_tidy) therefore
7897 looks for such repeated calls and removes all but the first. */
7900 /* With some testing on perf/bz2.c, on amd64 and x86, compiled with
7901 gcc-5.3.1 -O2, it appears that 16 entries in the array are enough to
7902 get almost all the benefits of this transformation whilst causing
7903 the slide-back case to just often enough to be verifiably
7904 correct. For posterity, the numbers are:
7906 bz2-32
7908 1 4,336 (112,212 -> 1,709,473; ratio 15.2)
7909 2 4,336 (112,194 -> 1,669,895; ratio 14.9)
7910 3 4,336 (112,194 -> 1,660,713; ratio 14.8)
7911 4 4,336 (112,194 -> 1,658,555; ratio 14.8)
7912 5 4,336 (112,194 -> 1,655,447; ratio 14.8)
7913 6 4,336 (112,194 -> 1,655,101; ratio 14.8)
7914 7 4,336 (112,194 -> 1,654,858; ratio 14.7)
7915 8 4,336 (112,194 -> 1,654,810; ratio 14.7)
7916 10 4,336 (112,194 -> 1,654,621; ratio 14.7)
7917 12 4,336 (112,194 -> 1,654,678; ratio 14.7)
7918 16 4,336 (112,194 -> 1,654,494; ratio 14.7)
7919 32 4,336 (112,194 -> 1,654,602; ratio 14.7)
7920 inf 4,336 (112,194 -> 1,654,602; ratio 14.7)
7922 bz2-64
7924 1 4,113 (107,329 -> 1,822,171; ratio 17.0)
7925 2 4,113 (107,329 -> 1,806,443; ratio 16.8)
7926 3 4,113 (107,329 -> 1,803,967; ratio 16.8)
7927 4 4,113 (107,329 -> 1,802,785; ratio 16.8)
7928 5 4,113 (107,329 -> 1,802,412; ratio 16.8)
7929 6 4,113 (107,329 -> 1,802,062; ratio 16.8)
7930 7 4,113 (107,329 -> 1,801,976; ratio 16.8)
7931 8 4,113 (107,329 -> 1,801,886; ratio 16.8)
7932 10 4,113 (107,329 -> 1,801,653; ratio 16.8)
7933 12 4,113 (107,329 -> 1,801,526; ratio 16.8)
7934 16 4,113 (107,329 -> 1,801,298; ratio 16.8)
7935 32 4,113 (107,329 -> 1,800,827; ratio 16.8)
7936 inf 4,113 (107,329 -> 1,800,827; ratio 16.8) */
7939 /* Structs for recording which (helper, guard) pairs we have already
7940 seen. */
7942 #define N_TIDYING_PAIRS 16
7944 typedef
7945 struct { void* entry; IRExpr* guard; }
7946 Pair;
7948 typedef
7949 struct {
7950 Pair pairs[N_TIDYING_PAIRS +1/*for bounds checking*/];
7951 UInt pairsUsed;
7953 Pairs;
7956 /* Return True if e1 and e2 definitely denote the same value (used to
7957 compare guards). Return False if unknown; False is the safe
7958 answer. Since guest registers and guest memory do not have the
7959 SSA property we must return False if any Gets or Loads appear in
7960 the expression. This implicitly assumes that e1 and e2 have the
7961 same IR type, which is always true here -- the type is Ity_I1. */
7963 static Bool sameIRValue ( IRExpr* e1, IRExpr* e2 )
7965 if (e1->tag != e2->tag)
7966 return False;
7967 switch (e1->tag) {
7968 case Iex_Const:
7969 return eqIRConst( e1->Iex.Const.con, e2->Iex.Const.con );
7970 case Iex_Binop:
7971 return e1->Iex.Binop.op == e2->Iex.Binop.op
7972 && sameIRValue(e1->Iex.Binop.arg1, e2->Iex.Binop.arg1)
7973 && sameIRValue(e1->Iex.Binop.arg2, e2->Iex.Binop.arg2);
7974 case Iex_Unop:
7975 return e1->Iex.Unop.op == e2->Iex.Unop.op
7976 && sameIRValue(e1->Iex.Unop.arg, e2->Iex.Unop.arg);
7977 case Iex_RdTmp:
7978 return e1->Iex.RdTmp.tmp == e2->Iex.RdTmp.tmp;
7979 case Iex_ITE:
7980 return sameIRValue( e1->Iex.ITE.cond, e2->Iex.ITE.cond )
7981 && sameIRValue( e1->Iex.ITE.iftrue, e2->Iex.ITE.iftrue )
7982 && sameIRValue( e1->Iex.ITE.iffalse, e2->Iex.ITE.iffalse );
7983 case Iex_Qop:
7984 case Iex_Triop:
7985 case Iex_CCall:
7986 /* be lazy. Could define equality for these, but they never
7987 appear to be used. */
7988 return False;
7989 case Iex_Get:
7990 case Iex_GetI:
7991 case Iex_Load:
7992 /* be conservative - these may not give the same value each
7993 time */
7994 return False;
7995 case Iex_Binder:
7996 /* should never see this */
7997 /* fallthrough */
7998 default:
7999 VG_(printf)("mc_translate.c: sameIRValue: unhandled: ");
8000 ppIRExpr(e1);
8001 VG_(tool_panic)("memcheck:sameIRValue");
8002 return False;
8006 /* See if 'pairs' already has an entry for (entry, guard). Return
8007 True if so. If not, add an entry. */
8009 static
8010 Bool check_or_add ( Pairs* tidyingEnv, IRExpr* guard, void* entry )
8012 UInt i, n = tidyingEnv->pairsUsed;
8013 tl_assert(n <= N_TIDYING_PAIRS);
8014 for (i = 0; i < n; i++) {
8015 if (tidyingEnv->pairs[i].entry == entry
8016 && sameIRValue(tidyingEnv->pairs[i].guard, guard))
8017 return True;
8019 /* (guard, entry) wasn't found in the array. Add it at the end.
8020 If the array is already full, slide the entries one slot
8021 backwards. This means we will lose to ability to detect
8022 duplicates from the pair in slot zero, but that happens so
8023 rarely that it's unlikely to have much effect on overall code
8024 quality. Also, this strategy loses the check for the oldest
8025 tracked exit (memory reference, basically) and so that is (I'd
8026 guess) least likely to be re-used after this point. */
8027 tl_assert(i == n);
8028 if (n == N_TIDYING_PAIRS) {
8029 for (i = 1; i < N_TIDYING_PAIRS; i++) {
8030 tidyingEnv->pairs[i-1] = tidyingEnv->pairs[i];
8032 tidyingEnv->pairs[N_TIDYING_PAIRS-1].entry = entry;
8033 tidyingEnv->pairs[N_TIDYING_PAIRS-1].guard = guard;
8034 } else {
8035 tl_assert(n < N_TIDYING_PAIRS);
8036 tidyingEnv->pairs[n].entry = entry;
8037 tidyingEnv->pairs[n].guard = guard;
8038 n++;
8039 tidyingEnv->pairsUsed = n;
8041 return False;
8044 static Bool is_helperc_value_checkN_fail ( const HChar* name )
8046 /* This is expensive because it happens a lot. We are checking to
8047 see whether |name| is one of the following 8 strings:
8049 MC_(helperc_value_check8_fail_no_o)
8050 MC_(helperc_value_check4_fail_no_o)
8051 MC_(helperc_value_check0_fail_no_o)
8052 MC_(helperc_value_check1_fail_no_o)
8053 MC_(helperc_value_check8_fail_w_o)
8054 MC_(helperc_value_check0_fail_w_o)
8055 MC_(helperc_value_check1_fail_w_o)
8056 MC_(helperc_value_check4_fail_w_o)
8058 To speed it up, check the common prefix just once, rather than
8059 all 8 times.
8061 const HChar* prefix = "MC_(helperc_value_check";
8063 HChar n, p;
8064 while (True) {
8065 n = *name;
8066 p = *prefix;
8067 if (p == 0) break; /* ran off the end of the prefix */
8068 /* We still have some prefix to use */
8069 if (n == 0) return False; /* have prefix, but name ran out */
8070 if (n != p) return False; /* have both pfx and name, but no match */
8071 name++;
8072 prefix++;
8075 /* Check the part after the prefix. */
8076 tl_assert(*prefix == 0 && *name != 0);
8077 return 0==VG_(strcmp)(name, "8_fail_no_o)")
8078 || 0==VG_(strcmp)(name, "4_fail_no_o)")
8079 || 0==VG_(strcmp)(name, "0_fail_no_o)")
8080 || 0==VG_(strcmp)(name, "1_fail_no_o)")
8081 || 0==VG_(strcmp)(name, "8_fail_w_o)")
8082 || 0==VG_(strcmp)(name, "4_fail_w_o)")
8083 || 0==VG_(strcmp)(name, "0_fail_w_o)")
8084 || 0==VG_(strcmp)(name, "1_fail_w_o)");
8087 IRSB* MC_(final_tidy) ( IRSB* sb_in )
8089 Int i;
8090 IRStmt* st;
8091 IRDirty* di;
8092 IRExpr* guard;
8093 IRCallee* cee;
8094 Bool alreadyPresent;
8095 Pairs pairs;
8097 pairs.pairsUsed = 0;
8099 pairs.pairs[N_TIDYING_PAIRS].entry = (void*)0x123;
8100 pairs.pairs[N_TIDYING_PAIRS].guard = (IRExpr*)0x456;
8102 /* Scan forwards through the statements. Each time a call to one
8103 of the relevant helpers is seen, check if we have made a
8104 previous call to the same helper using the same guard
8105 expression, and if so, delete the call. */
8106 for (i = 0; i < sb_in->stmts_used; i++) {
8107 st = sb_in->stmts[i];
8108 tl_assert(st);
8109 if (st->tag != Ist_Dirty)
8110 continue;
8111 di = st->Ist.Dirty.details;
8112 guard = di->guard;
8113 tl_assert(guard);
8114 if (0) { ppIRExpr(guard); VG_(printf)("\n"); }
8115 cee = di->cee;
8116 if (!is_helperc_value_checkN_fail( cee->name ))
8117 continue;
8118 /* Ok, we have a call to helperc_value_check0/1/4/8_fail with
8119 guard 'guard'. Check if we have already seen a call to this
8120 function with the same guard. If so, delete it. If not,
8121 add it to the set of calls we do know about. */
8122 alreadyPresent = check_or_add( &pairs, guard, cee->addr );
8123 if (alreadyPresent) {
8124 sb_in->stmts[i] = IRStmt_NoOp();
8125 if (0) VG_(printf)("XX\n");
8129 tl_assert(pairs.pairs[N_TIDYING_PAIRS].entry == (void*)0x123);
8130 tl_assert(pairs.pairs[N_TIDYING_PAIRS].guard == (IRExpr*)0x456);
8132 return sb_in;
8135 #undef N_TIDYING_PAIRS
8138 /*------------------------------------------------------------*/
8139 /*--- Startup assertion checking ---*/
8140 /*------------------------------------------------------------*/
8142 void MC_(do_instrumentation_startup_checks)( void )
8144 /* Make a best-effort check to see that is_helperc_value_checkN_fail
8145 is working as we expect. */
8147 # define CHECK(_expected, _string) \
8148 tl_assert((_expected) == is_helperc_value_checkN_fail(_string))
8150 /* It should identify these 8, and no others, as targets. */
8151 CHECK(True, "MC_(helperc_value_check8_fail_no_o)");
8152 CHECK(True, "MC_(helperc_value_check4_fail_no_o)");
8153 CHECK(True, "MC_(helperc_value_check0_fail_no_o)");
8154 CHECK(True, "MC_(helperc_value_check1_fail_no_o)");
8155 CHECK(True, "MC_(helperc_value_check8_fail_w_o)");
8156 CHECK(True, "MC_(helperc_value_check0_fail_w_o)");
8157 CHECK(True, "MC_(helperc_value_check1_fail_w_o)");
8158 CHECK(True, "MC_(helperc_value_check4_fail_w_o)");
8160 /* Ad-hoc selection of other strings gathered via a quick test. */
8161 CHECK(False, "amd64g_dirtyhelper_CPUID_avx2");
8162 CHECK(False, "amd64g_dirtyhelper_RDTSC");
8163 CHECK(False, "MC_(helperc_b_load1)");
8164 CHECK(False, "MC_(helperc_b_load2)");
8165 CHECK(False, "MC_(helperc_b_load4)");
8166 CHECK(False, "MC_(helperc_b_load8)");
8167 CHECK(False, "MC_(helperc_b_load16)");
8168 CHECK(False, "MC_(helperc_b_load32)");
8169 CHECK(False, "MC_(helperc_b_store1)");
8170 CHECK(False, "MC_(helperc_b_store2)");
8171 CHECK(False, "MC_(helperc_b_store4)");
8172 CHECK(False, "MC_(helperc_b_store8)");
8173 CHECK(False, "MC_(helperc_b_store16)");
8174 CHECK(False, "MC_(helperc_b_store32)");
8175 CHECK(False, "MC_(helperc_LOADV8)");
8176 CHECK(False, "MC_(helperc_LOADV16le)");
8177 CHECK(False, "MC_(helperc_LOADV32le)");
8178 CHECK(False, "MC_(helperc_LOADV64le)");
8179 CHECK(False, "MC_(helperc_LOADV128le)");
8180 CHECK(False, "MC_(helperc_LOADV256le)");
8181 CHECK(False, "MC_(helperc_STOREV16le)");
8182 CHECK(False, "MC_(helperc_STOREV32le)");
8183 CHECK(False, "MC_(helperc_STOREV64le)");
8184 CHECK(False, "MC_(helperc_STOREV8)");
8185 CHECK(False, "track_die_mem_stack_8");
8186 CHECK(False, "track_new_mem_stack_8_w_ECU");
8187 CHECK(False, "MC_(helperc_MAKE_STACK_UNINIT_w_o)");
8188 CHECK(False, "VG_(unknown_SP_update_w_ECU)");
8190 # undef CHECK
8194 /*------------------------------------------------------------*/
8195 /*--- Memcheck main ---*/
8196 /*------------------------------------------------------------*/
8198 static Bool isBogusAtom ( IRAtom* at )
8200 if (at->tag == Iex_RdTmp)
8201 return False;
8202 tl_assert(at->tag == Iex_Const);
8204 ULong n = 0;
8205 IRConst* con = at->Iex.Const.con;
8206 switch (con->tag) {
8207 case Ico_U1: return False;
8208 case Ico_U8: n = (ULong)con->Ico.U8; break;
8209 case Ico_U16: n = (ULong)con->Ico.U16; break;
8210 case Ico_U32: n = (ULong)con->Ico.U32; break;
8211 case Ico_U64: n = (ULong)con->Ico.U64; break;
8212 case Ico_F32: return False;
8213 case Ico_F64: return False;
8214 case Ico_F32i: return False;
8215 case Ico_F64i: return False;
8216 case Ico_V128: return False;
8217 case Ico_V256: return False;
8218 default: ppIRExpr(at); tl_assert(0);
8220 /* VG_(printf)("%llx\n", n); */
8221 /* Shortcuts */
8222 if (LIKELY(n <= 0x0000000000001000ULL)) return False;
8223 if (LIKELY(n >= 0xFFFFFFFFFFFFF000ULL)) return False;
8224 /* The list of bogus atoms is: */
8225 return (/*32*/ n == 0xFEFEFEFFULL
8226 /*32*/ || n == 0x80808080ULL
8227 /*32*/ || n == 0x7F7F7F7FULL
8228 /*32*/ || n == 0x7EFEFEFFULL
8229 /*32*/ || n == 0x81010100ULL
8230 /*64*/ || n == 0xFFFFFFFFFEFEFEFFULL
8231 /*64*/ || n == 0xFEFEFEFEFEFEFEFFULL
8232 /*64*/ || n == 0x0000000000008080ULL
8233 /*64*/ || n == 0x8080808080808080ULL
8234 /*64*/ || n == 0x0101010101010101ULL
8239 /* Does 'st' mention any of the literals identified/listed in
8240 isBogusAtom()? */
8241 static inline Bool containsBogusLiterals ( /*FLAT*/ IRStmt* st )
8243 Int i;
8244 IRExpr* e;
8245 IRDirty* d;
8246 IRCAS* cas;
8247 switch (st->tag) {
8248 case Ist_WrTmp:
8249 e = st->Ist.WrTmp.data;
8250 switch (e->tag) {
8251 case Iex_Get:
8252 case Iex_RdTmp:
8253 return False;
8254 case Iex_Const:
8255 return isBogusAtom(e);
8256 case Iex_Unop:
8257 return isBogusAtom(e->Iex.Unop.arg)
8258 || e->Iex.Unop.op == Iop_GetMSBs8x16;
8259 case Iex_GetI:
8260 return isBogusAtom(e->Iex.GetI.ix);
8261 case Iex_Binop:
8262 return isBogusAtom(e->Iex.Binop.arg1)
8263 || isBogusAtom(e->Iex.Binop.arg2);
8264 case Iex_Triop:
8265 return isBogusAtom(e->Iex.Triop.details->arg1)
8266 || isBogusAtom(e->Iex.Triop.details->arg2)
8267 || isBogusAtom(e->Iex.Triop.details->arg3);
8268 case Iex_Qop:
8269 return isBogusAtom(e->Iex.Qop.details->arg1)
8270 || isBogusAtom(e->Iex.Qop.details->arg2)
8271 || isBogusAtom(e->Iex.Qop.details->arg3)
8272 || isBogusAtom(e->Iex.Qop.details->arg4);
8273 case Iex_ITE:
8274 return isBogusAtom(e->Iex.ITE.cond)
8275 || isBogusAtom(e->Iex.ITE.iftrue)
8276 || isBogusAtom(e->Iex.ITE.iffalse);
8277 case Iex_Load:
8278 return isBogusAtom(e->Iex.Load.addr);
8279 case Iex_CCall:
8280 for (i = 0; e->Iex.CCall.args[i]; i++)
8281 if (isBogusAtom(e->Iex.CCall.args[i]))
8282 return True;
8283 return False;
8284 default:
8285 goto unhandled;
8287 case Ist_Dirty:
8288 d = st->Ist.Dirty.details;
8289 for (i = 0; d->args[i]; i++) {
8290 IRAtom* atom = d->args[i];
8291 if (LIKELY(!is_IRExpr_VECRET_or_GSPTR(atom))) {
8292 if (isBogusAtom(atom))
8293 return True;
8296 if (isBogusAtom(d->guard))
8297 return True;
8298 if (d->mAddr && isBogusAtom(d->mAddr))
8299 return True;
8300 return False;
8301 case Ist_Put:
8302 return isBogusAtom(st->Ist.Put.data);
8303 case Ist_PutI:
8304 return isBogusAtom(st->Ist.PutI.details->ix)
8305 || isBogusAtom(st->Ist.PutI.details->data);
8306 case Ist_Store:
8307 return isBogusAtom(st->Ist.Store.addr)
8308 || isBogusAtom(st->Ist.Store.data);
8309 case Ist_StoreG: {
8310 IRStoreG* sg = st->Ist.StoreG.details;
8311 return isBogusAtom(sg->addr) || isBogusAtom(sg->data)
8312 || isBogusAtom(sg->guard);
8314 case Ist_LoadG: {
8315 IRLoadG* lg = st->Ist.LoadG.details;
8316 return isBogusAtom(lg->addr) || isBogusAtom(lg->alt)
8317 || isBogusAtom(lg->guard);
8319 case Ist_Exit:
8320 return isBogusAtom(st->Ist.Exit.guard);
8321 case Ist_AbiHint:
8322 return isBogusAtom(st->Ist.AbiHint.base)
8323 || isBogusAtom(st->Ist.AbiHint.nia);
8324 case Ist_NoOp:
8325 case Ist_IMark:
8326 case Ist_MBE:
8327 return False;
8328 case Ist_CAS:
8329 cas = st->Ist.CAS.details;
8330 return isBogusAtom(cas->addr)
8331 || (cas->expdHi ? isBogusAtom(cas->expdHi) : False)
8332 || isBogusAtom(cas->expdLo)
8333 || (cas->dataHi ? isBogusAtom(cas->dataHi) : False)
8334 || isBogusAtom(cas->dataLo);
8335 case Ist_LLSC:
8336 return isBogusAtom(st->Ist.LLSC.addr)
8337 || (st->Ist.LLSC.storedata
8338 ? isBogusAtom(st->Ist.LLSC.storedata)
8339 : False);
8340 default:
8341 unhandled:
8342 ppIRStmt(st);
8343 VG_(tool_panic)("hasBogusLiterals");
8348 /* This is the pre-instrumentation analysis. It does a backwards pass over
8349 the stmts in |sb_in| to determine a HowUsed value for each tmp defined in
8350 the block.
8352 Unrelatedly, it also checks all literals in the block with |isBogusAtom|,
8353 as a positive result from that is a strong indication that we need to
8354 expensively instrument add/sub in the block. We do both analyses in one
8355 pass, even though they are independent, so as to avoid the overhead of
8356 having to traverse the whole block twice.
8358 The usage pass proceeds as follows. Let max= be the max operation in the
8359 HowUsed lattice, hence
8361 X max= Y means X = max(X, Y)
8363 then
8365 for t in original tmps . useEnv[t] = HuUnU
8367 for t used in the block's . next field
8368 useEnv[t] max= HuPCa // because jmp targets are PCast-tested
8370 for st iterating *backwards* in the block
8372 match st
8374 case "t1 = load(t2)" // case 1
8375 useEnv[t2] max= HuPCa
8377 case "t1 = add(t2, t3)" // case 2
8378 useEnv[t2] max= useEnv[t1]
8379 useEnv[t3] max= useEnv[t1]
8381 other
8382 for t in st.usedTmps // case 3
8383 useEnv[t] max= HuOth
8384 // same as useEnv[t] = HuOth
8386 The general idea is that we accumulate, in useEnv[], information about
8387 how each tmp is used. That can be updated as we work further back
8388 through the block and find more uses of it, but its HowUsed value can
8389 only ascend the lattice, not descend.
8391 Initially we mark all tmps as unused. In case (1), if a tmp is seen to
8392 be used as a memory address, then its use is at least HuPCa. The point
8393 is that for a memory address we will add instrumentation to check if any
8394 bit of the address is undefined, which means that we won't need expensive
8395 V-bit propagation through an add expression that computed the address --
8396 cheap add instrumentation will be equivalent.
8398 Note in case (1) that if we have previously seen a non-memory-address use
8399 of the tmp, then its use will already be HuOth and will be unchanged by
8400 the max= operation. And if it turns out that the source of the tmp was
8401 an add, then we'll have to expensively instrument the add, because we
8402 can't prove that, for the previous non-memory-address use of the tmp,
8403 cheap and expensive instrumentation will be equivalent.
8405 In case 2, we propagate the usage-mode of the result of an add back
8406 through to its operands. Again, we use max= so as to take account of the
8407 fact that t2 or t3 might later in the block (viz, earlier in the
8408 iteration) have been used in a way that requires expensive add
8409 instrumentation.
8411 In case 3, we deal with all other tmp uses. We assume that we'll need a
8412 result that is as accurate as possible, so we max= HuOth into its use
8413 mode. Since HuOth is the top of the lattice, that's equivalent to just
8414 setting its use to HuOth.
8416 The net result of all this is that:
8418 tmps that are used either
8419 - only as a memory address, or
8420 - only as part of a tree of adds that computes a memory address,
8421 and has no other use
8422 are marked as HuPCa, and so we can instrument their generating Add
8423 nodes cheaply, which is the whole point of this analysis
8425 tmps that are used any other way at all are marked as HuOth
8427 tmps that are unused are marked as HuUnU. We don't expect to see any
8428 since we expect that the incoming IR has had all dead assignments
8429 removed by previous optimisation passes. Nevertheless the analysis is
8430 correct even in the presence of dead tmps.
8432 A final comment on dead tmps. In case 1 and case 2, we could actually
8433 conditionalise the updates thusly:
8435 if (useEnv[t1] > HuUnU) { useEnv[t2] max= HuPCa } // case 1
8437 if (useEnv[t1] > HuUnU) { useEnv[t2] max= useEnv[t1] } // case 2
8438 if (useEnv[t1] > HuUnU) { useEnv[t3] max= useEnv[t1] } // case 2
8440 In other words, if the assigned-to tmp |t1| is never used, then there's
8441 no point in propagating any use through to its operands. That won't
8442 change the final HuPCa-vs-HuOth results, which is what we care about.
8443 Given that we expect to get dead-code-free inputs, there's no point in
8444 adding this extra refinement. */
8447 /* Helper for |preInstrumentationAnalysis|. */
8448 static inline void noteTmpUsesIn ( /*MOD*/HowUsed* useEnv,
8449 UInt tyenvUsed,
8450 HowUsed newUse, IRAtom* at )
8452 /* For the atom |at|, declare that for any tmp |t| in |at|, we will have
8453 seen a use of |newUse|. So, merge that info into |t|'s accumulated
8454 use info. */
8455 switch (at->tag) {
8456 case Iex_GSPTR:
8457 case Iex_VECRET:
8458 case Iex_Const:
8459 return;
8460 case Iex_RdTmp: {
8461 IRTemp t = at->Iex.RdTmp.tmp;
8462 tl_assert(t < tyenvUsed); // "is an original tmp"
8463 // The "max" operation in the lattice
8464 if (newUse > useEnv[t]) useEnv[t] = newUse;
8465 return;
8467 default:
8468 // We should never get here -- it implies non-flat IR
8469 ppIRExpr(at);
8470 VG_(tool_panic)("noteTmpUsesIn");
8472 /*NOTREACHED*/
8473 tl_assert(0);
8477 static void preInstrumentationAnalysis ( /*OUT*/HowUsed** useEnvP,
8478 /*OUT*/Bool* hasBogusLiteralsP,
8479 const IRSB* sb_in )
8481 const UInt nOrigTmps = (UInt)sb_in->tyenv->types_used;
8483 // We've seen no bogus literals so far.
8484 Bool bogus = False;
8486 // This is calloc'd, so implicitly all entries are initialised to HuUnU.
8487 HowUsed* useEnv = VG_(calloc)("mc.preInstrumentationAnalysis.1",
8488 nOrigTmps, sizeof(HowUsed));
8490 // Firstly, roll in contributions from the final dst address.
8491 bogus = isBogusAtom(sb_in->next);
8492 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, sb_in->next);
8494 // Now work backwards through the stmts.
8495 for (Int i = sb_in->stmts_used-1; i >= 0; i--) {
8496 IRStmt* st = sb_in->stmts[i];
8498 // Deal with literals.
8499 if (LIKELY(!bogus)) {
8500 bogus = containsBogusLiterals(st);
8503 // Deal with tmp uses.
8504 switch (st->tag) {
8505 case Ist_WrTmp: {
8506 IRTemp dst = st->Ist.WrTmp.tmp;
8507 IRExpr* rhs = st->Ist.WrTmp.data;
8508 // This is the one place where we have to consider all possible
8509 // tags for |rhs|, and can't just assume it is a tmp or a const.
8510 switch (rhs->tag) {
8511 case Iex_RdTmp:
8512 // just propagate demand for |dst| into this tmp use.
8513 noteTmpUsesIn(useEnv, nOrigTmps, useEnv[dst], rhs);
8514 break;
8515 case Iex_Unop:
8516 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, rhs->Iex.Unop.arg);
8517 break;
8518 case Iex_Binop:
8519 if (rhs->Iex.Binop.op == Iop_Add64
8520 || rhs->Iex.Binop.op == Iop_Add32) {
8521 // propagate demand for |dst| through to the operands.
8522 noteTmpUsesIn(useEnv, nOrigTmps,
8523 useEnv[dst], rhs->Iex.Binop.arg1);
8524 noteTmpUsesIn(useEnv, nOrigTmps,
8525 useEnv[dst], rhs->Iex.Binop.arg2);
8526 } else {
8527 // just say that the operands are used in some unknown way.
8528 noteTmpUsesIn(useEnv, nOrigTmps,
8529 HuOth, rhs->Iex.Binop.arg1);
8530 noteTmpUsesIn(useEnv, nOrigTmps,
8531 HuOth, rhs->Iex.Binop.arg2);
8533 break;
8534 case Iex_Triop: {
8535 // All operands are used in some unknown way.
8536 IRTriop* tri = rhs->Iex.Triop.details;
8537 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, tri->arg1);
8538 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, tri->arg2);
8539 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, tri->arg3);
8540 break;
8542 case Iex_Qop: {
8543 // All operands are used in some unknown way.
8544 IRQop* qop = rhs->Iex.Qop.details;
8545 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, qop->arg1);
8546 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, qop->arg2);
8547 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, qop->arg3);
8548 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, qop->arg4);
8549 break;
8551 case Iex_Load:
8552 // The address will be checked (== PCasted).
8553 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, rhs->Iex.Load.addr);
8554 break;
8555 case Iex_ITE:
8556 // The condition is PCasted, the then- and else-values
8557 // aren't.
8558 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, rhs->Iex.ITE.cond);
8559 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, rhs->Iex.ITE.iftrue);
8560 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, rhs->Iex.ITE.iffalse);
8561 break;
8562 case Iex_CCall:
8563 // The args are used in unknown ways.
8564 for (IRExpr** args = rhs->Iex.CCall.args; *args; args++) {
8565 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, *args);
8567 break;
8568 case Iex_GetI: {
8569 // The index will be checked/PCasted (see do_shadow_GETI)
8570 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, rhs->Iex.GetI.ix);
8571 break;
8573 case Iex_Const:
8574 case Iex_Get:
8575 break;
8576 default:
8577 ppIRExpr(rhs);
8578 VG_(tool_panic)("preInstrumentationAnalysis:"
8579 " unhandled IRExpr");
8581 break;
8583 case Ist_Store:
8584 // The address will be checked (== PCasted). The data will be
8585 // used in some unknown way.
8586 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, st->Ist.Store.addr);
8587 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.Store.data);
8588 break;
8589 case Ist_Exit:
8590 // The guard will be checked (== PCasted)
8591 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, st->Ist.Exit.guard);
8592 break;
8593 case Ist_Put:
8594 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.Put.data);
8595 break;
8596 case Ist_PutI: {
8597 IRPutI* putI = st->Ist.PutI.details;
8598 // The index will be checked/PCasted (see do_shadow_PUTI). The
8599 // data will be used in an unknown way.
8600 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, putI->ix);
8601 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, putI->data);
8602 break;
8604 case Ist_Dirty: {
8605 IRDirty* d = st->Ist.Dirty.details;
8606 // The guard will be checked (== PCasted)
8607 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, d->guard);
8608 // The args will be used in unknown ways.
8609 for (IRExpr** args = d->args; *args; args++) {
8610 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, *args);
8612 break;
8614 case Ist_CAS: {
8615 IRCAS* cas = st->Ist.CAS.details;
8616 // Address will be pcasted, everything else used as unknown
8617 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, cas->addr);
8618 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, cas->expdLo);
8619 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, cas->dataLo);
8620 if (cas->expdHi)
8621 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, cas->expdHi);
8622 if (cas->dataHi)
8623 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, cas->dataHi);
8624 break;
8626 case Ist_AbiHint:
8627 // Both exprs are used in unknown ways. TODO: can we safely
8628 // just ignore AbiHints?
8629 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.AbiHint.base);
8630 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.AbiHint.nia);
8631 break;
8632 case Ist_StoreG: {
8633 // We might be able to do better, and use HuPCa for the addr.
8634 // It's not immediately obvious that we can, because the address
8635 // is regarded as "used" only when the guard is true.
8636 IRStoreG* sg = st->Ist.StoreG.details;
8637 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, sg->addr);
8638 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, sg->data);
8639 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, sg->guard);
8640 break;
8642 case Ist_LoadG: {
8643 // Per similar comments to Ist_StoreG .. not sure whether this
8644 // is really optimal.
8645 IRLoadG* lg = st->Ist.LoadG.details;
8646 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, lg->addr);
8647 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, lg->alt);
8648 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, lg->guard);
8649 break;
8651 case Ist_LLSC: {
8652 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, st->Ist.LLSC.addr);
8653 if (st->Ist.LLSC.storedata)
8654 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.LLSC.storedata);
8655 break;
8657 case Ist_MBE:
8658 case Ist_IMark:
8659 case Ist_NoOp:
8660 break;
8661 default: {
8662 ppIRStmt(st);
8663 VG_(tool_panic)("preInstrumentationAnalysis: unhandled IRStmt");
8666 } // Now work backwards through the stmts.
8668 // Return the computed use env and the bogus-atom flag.
8669 tl_assert(*useEnvP == NULL);
8670 *useEnvP = useEnv;
8672 tl_assert(*hasBogusLiteralsP == False);
8673 *hasBogusLiteralsP = bogus;
8677 IRSB* MC_(instrument) ( VgCallbackClosure* closure,
8678 IRSB* sb_in,
8679 const VexGuestLayout* layout,
8680 const VexGuestExtents* vge,
8681 const VexArchInfo* archinfo_host,
8682 IRType gWordTy, IRType hWordTy )
8684 Bool verboze = 0||False;
8685 Int i, j, first_stmt;
8686 IRStmt* st;
8687 MCEnv mce;
8688 IRSB* sb_out;
8690 if (gWordTy != hWordTy) {
8691 /* We don't currently support this case. */
8692 VG_(tool_panic)("host/guest word size mismatch");
8695 /* Check we're not completely nuts */
8696 tl_assert(sizeof(UWord) == sizeof(void*));
8697 tl_assert(sizeof(Word) == sizeof(void*));
8698 tl_assert(sizeof(Addr) == sizeof(void*));
8699 tl_assert(sizeof(ULong) == 8);
8700 tl_assert(sizeof(Long) == 8);
8701 tl_assert(sizeof(UInt) == 4);
8702 tl_assert(sizeof(Int) == 4);
8704 tl_assert(MC_(clo_mc_level) >= 1 && MC_(clo_mc_level) <= 3);
8706 /* Set up SB */
8707 sb_out = deepCopyIRSBExceptStmts(sb_in);
8709 /* Set up the running environment. Both .sb and .tmpMap are
8710 modified as we go along. Note that tmps are added to both
8711 .sb->tyenv and .tmpMap together, so the valid index-set for
8712 those two arrays should always be identical. */
8713 VG_(memset)(&mce, 0, sizeof(mce));
8714 mce.sb = sb_out;
8715 mce.trace = verboze;
8716 mce.layout = layout;
8717 mce.hWordTy = hWordTy;
8718 mce.tmpHowUsed = NULL;
8720 /* BEGIN decide on expense levels for instrumentation. */
8722 /* Initially, select the cheap version of everything for which we have an
8723 option. */
8724 DetailLevelByOp__set_all( &mce.dlbo, DLcheap );
8726 /* Take account of the --expensive-definedness-checks= flag. */
8727 if (MC_(clo_expensive_definedness_checks) == EdcNO) {
8728 /* We just selected 'cheap for everything', so we don't need to do
8729 anything here. mce.tmpHowUsed remains NULL. */
8731 else if (MC_(clo_expensive_definedness_checks) == EdcYES) {
8732 /* Select 'expensive for everything'. mce.tmpHowUsed remains NULL. */
8733 DetailLevelByOp__set_all( &mce.dlbo, DLexpensive );
8735 else {
8736 tl_assert(MC_(clo_expensive_definedness_checks) == EdcAUTO);
8737 /* We'll make our own selection, based on known per-target constraints
8738 and also on analysis of the block to be instrumented. First, set
8739 up default values for detail levels.
8741 On x86 and amd64, we'll routinely encounter code optimised by LLVM
8742 5 and above. Enable accurate interpretation of the following.
8743 LLVM uses adds for some bitfield inserts, and we get a lot of false
8744 errors if the cheap interpretation is used, alas. Could solve this
8745 much better if we knew which of such adds came from x86/amd64 LEA
8746 instructions, since these are the only ones really needing the
8747 expensive interpretation, but that would require some way to tag
8748 them in the _toIR.c front ends, which is a lot of faffing around.
8749 So for now we use preInstrumentationAnalysis() to detect adds which
8750 are used only to construct memory addresses, which is an
8751 approximation to the above, and is self-contained.*/
8752 # if defined(VGA_x86)
8753 mce.dlbo.dl_Add32 = DLauto;
8754 mce.dlbo.dl_CmpEQ16_CmpNE16 = DLexpensive;
8755 mce.dlbo.dl_CmpEQ32_CmpNE32 = DLexpensive;
8756 # elif defined(VGA_amd64)
8757 mce.dlbo.dl_Add32 = DLexpensive;
8758 mce.dlbo.dl_Add64 = DLauto;
8759 mce.dlbo.dl_CmpEQ16_CmpNE16 = DLexpensive;
8760 mce.dlbo.dl_CmpEQ32_CmpNE32 = DLexpensive;
8761 mce.dlbo.dl_CmpEQ64_CmpNE64 = DLexpensive;
8762 # elif defined(VGA_ppc64le)
8763 // Needed by (at least) set_AV_CR6() in the front end.
8764 mce.dlbo.dl_CmpEQ64_CmpNE64 = DLexpensive;
8765 # elif defined(VGA_arm64)
8766 mce.dlbo.dl_CmpEQ32_CmpNE32 = DLexpensive;
8767 mce.dlbo.dl_CmpEQ64_CmpNE64 = DLexpensive;
8768 # elif defined(VGA_arm)
8769 mce.dlbo.dl_CmpEQ32_CmpNE32 = DLexpensive;
8770 # endif
8772 /* preInstrumentationAnalysis() will allocate &mce.tmpHowUsed and then
8773 fill it in. */
8774 Bool hasBogusLiterals = False;
8775 preInstrumentationAnalysis( &mce.tmpHowUsed, &hasBogusLiterals, sb_in );
8777 if (hasBogusLiterals) {
8778 /* This happens very rarely. In this case just select expensive
8779 for everything, and throw away the tmp-use analysis results. */
8780 DetailLevelByOp__set_all( &mce.dlbo, DLexpensive );
8781 VG_(free)( mce.tmpHowUsed );
8782 mce.tmpHowUsed = NULL;
8783 } else {
8784 /* Nothing. mce.tmpHowUsed contains tmp-use analysis results,
8785 which will be used for some subset of Iop_{Add,Sub}{32,64},
8786 based on which ones are set to DLauto for this target. */
8790 DetailLevelByOp__check_sanity( &mce.dlbo );
8792 if (0) {
8793 // Debug printing: which tmps have been identified as PCast-only use
8794 if (mce.tmpHowUsed) {
8795 VG_(printf)("Cheapies: ");
8796 for (UInt q = 0; q < sb_in->tyenv->types_used; q++) {
8797 if (mce.tmpHowUsed[q] == HuPCa) {
8798 VG_(printf)("t%u ", q);
8801 VG_(printf)("\n");
8804 // Debug printing: number of ops by detail level
8805 UChar nCheap = DetailLevelByOp__count( &mce.dlbo, DLcheap );
8806 UChar nAuto = DetailLevelByOp__count( &mce.dlbo, DLauto );
8807 UChar nExpensive = DetailLevelByOp__count( &mce.dlbo, DLexpensive );
8808 tl_assert(nCheap + nAuto + nExpensive == 8);
8810 VG_(printf)("%u,%u,%u ", nCheap, nAuto, nExpensive);
8812 /* END decide on expense levels for instrumentation. */
8814 /* Initialise the running tmp environment. */
8816 mce.tmpMap = VG_(newXA)( VG_(malloc), "mc.MC_(instrument).1", VG_(free),
8817 sizeof(TempMapEnt));
8818 VG_(hintSizeXA) (mce.tmpMap, sb_in->tyenv->types_used);
8819 for (i = 0; i < sb_in->tyenv->types_used; i++) {
8820 TempMapEnt ent;
8821 ent.kind = Orig;
8822 ent.shadowV = IRTemp_INVALID;
8823 ent.shadowB = IRTemp_INVALID;
8824 VG_(addToXA)( mce.tmpMap, &ent );
8826 tl_assert( VG_(sizeXA)( mce.tmpMap ) == sb_in->tyenv->types_used );
8828 /* Finally, begin instrumentation. */
8829 /* Copy verbatim any IR preamble preceding the first IMark */
8831 tl_assert(mce.sb == sb_out);
8832 tl_assert(mce.sb != sb_in);
8834 i = 0;
8835 while (i < sb_in->stmts_used && sb_in->stmts[i]->tag != Ist_IMark) {
8837 st = sb_in->stmts[i];
8838 tl_assert(st);
8839 tl_assert(isFlatIRStmt(st));
8841 stmt( 'C', &mce, sb_in->stmts[i] );
8842 i++;
8845 /* Nasty problem. IR optimisation of the pre-instrumented IR may
8846 cause the IR following the preamble to contain references to IR
8847 temporaries defined in the preamble. Because the preamble isn't
8848 instrumented, these temporaries don't have any shadows.
8849 Nevertheless uses of them following the preamble will cause
8850 memcheck to generate references to their shadows. End effect is
8851 to cause IR sanity check failures, due to references to
8852 non-existent shadows. This is only evident for the complex
8853 preambles used for function wrapping on TOC-afflicted platforms
8854 (ppc64-linux).
8856 The following loop therefore scans the preamble looking for
8857 assignments to temporaries. For each one found it creates an
8858 assignment to the corresponding (V) shadow temp, marking it as
8859 'defined'. This is the same resulting IR as if the main
8860 instrumentation loop below had been applied to the statement
8861 'tmp = CONSTANT'.
8863 Similarly, if origin tracking is enabled, we must generate an
8864 assignment for the corresponding origin (B) shadow, claiming
8865 no-origin, as appropriate for a defined value.
8867 for (j = 0; j < i; j++) {
8868 if (sb_in->stmts[j]->tag == Ist_WrTmp) {
8869 /* findShadowTmpV checks its arg is an original tmp;
8870 no need to assert that here. */
8871 IRTemp tmp_o = sb_in->stmts[j]->Ist.WrTmp.tmp;
8872 IRTemp tmp_v = findShadowTmpV(&mce, tmp_o);
8873 IRType ty_v = typeOfIRTemp(sb_out->tyenv, tmp_v);
8874 assign( 'V', &mce, tmp_v, definedOfType( ty_v ) );
8875 if (MC_(clo_mc_level) == 3) {
8876 IRTemp tmp_b = findShadowTmpB(&mce, tmp_o);
8877 tl_assert(typeOfIRTemp(sb_out->tyenv, tmp_b) == Ity_I32);
8878 assign( 'B', &mce, tmp_b, mkU32(0)/* UNKNOWN ORIGIN */);
8880 if (0) {
8881 VG_(printf)("create shadow tmp(s) for preamble tmp [%d] ty ", j);
8882 ppIRType( ty_v );
8883 VG_(printf)("\n");
8888 /* Iterate over the remaining stmts to generate instrumentation. */
8890 tl_assert(sb_in->stmts_used > 0);
8891 tl_assert(i >= 0);
8892 tl_assert(i < sb_in->stmts_used);
8893 tl_assert(sb_in->stmts[i]->tag == Ist_IMark);
8895 for (/* use current i*/; i < sb_in->stmts_used; i++) {
8897 st = sb_in->stmts[i];
8898 first_stmt = sb_out->stmts_used;
8900 if (verboze) {
8901 VG_(printf)("\n");
8902 ppIRStmt(st);
8903 VG_(printf)("\n");
8906 if (MC_(clo_mc_level) == 3) {
8907 /* See comments on case Ist_CAS below. */
8908 if (st->tag != Ist_CAS)
8909 schemeS( &mce, st );
8912 /* Generate instrumentation code for each stmt ... */
8914 switch (st->tag) {
8916 case Ist_WrTmp: {
8917 IRTemp dst = st->Ist.WrTmp.tmp;
8918 tl_assert(dst < (UInt)sb_in->tyenv->types_used);
8919 HowUsed hu = mce.tmpHowUsed ? mce.tmpHowUsed[dst]
8920 : HuOth/*we don't know, so play safe*/;
8921 assign( 'V', &mce, findShadowTmpV(&mce, st->Ist.WrTmp.tmp),
8922 expr2vbits( &mce, st->Ist.WrTmp.data, hu ));
8923 break;
8926 case Ist_Put:
8927 do_shadow_PUT( &mce,
8928 st->Ist.Put.offset,
8929 st->Ist.Put.data,
8930 NULL /* shadow atom */, NULL /* guard */ );
8931 break;
8933 case Ist_PutI:
8934 do_shadow_PUTI( &mce, st->Ist.PutI.details);
8935 break;
8937 case Ist_Store:
8938 do_shadow_Store( &mce, st->Ist.Store.end,
8939 st->Ist.Store.addr, 0/* addr bias */,
8940 st->Ist.Store.data,
8941 NULL /* shadow data */,
8942 NULL/*guard*/ );
8943 break;
8945 case Ist_StoreG:
8946 do_shadow_StoreG( &mce, st->Ist.StoreG.details );
8947 break;
8949 case Ist_LoadG:
8950 do_shadow_LoadG( &mce, st->Ist.LoadG.details );
8951 break;
8953 case Ist_Exit:
8954 complainIfUndefined( &mce, st->Ist.Exit.guard, NULL );
8955 break;
8957 case Ist_IMark:
8958 break;
8960 case Ist_NoOp:
8961 case Ist_MBE:
8962 break;
8964 case Ist_Dirty:
8965 do_shadow_Dirty( &mce, st->Ist.Dirty.details );
8966 break;
8968 case Ist_AbiHint:
8969 do_AbiHint( &mce, st->Ist.AbiHint.base,
8970 st->Ist.AbiHint.len,
8971 st->Ist.AbiHint.nia );
8972 break;
8974 case Ist_CAS:
8975 do_shadow_CAS( &mce, st->Ist.CAS.details );
8976 /* Note, do_shadow_CAS copies the CAS itself to the output
8977 block, because it needs to add instrumentation both
8978 before and after it. Hence skip the copy below. Also
8979 skip the origin-tracking stuff (call to schemeS) above,
8980 since that's all tangled up with it too; do_shadow_CAS
8981 does it all. */
8982 break;
8984 case Ist_LLSC:
8985 do_shadow_LLSC( &mce,
8986 st->Ist.LLSC.end,
8987 st->Ist.LLSC.result,
8988 st->Ist.LLSC.addr,
8989 st->Ist.LLSC.storedata );
8990 break;
8992 default:
8993 VG_(printf)("\n");
8994 ppIRStmt(st);
8995 VG_(printf)("\n");
8996 VG_(tool_panic)("memcheck: unhandled IRStmt");
8998 } /* switch (st->tag) */
9000 if (0 && verboze) {
9001 for (j = first_stmt; j < sb_out->stmts_used; j++) {
9002 VG_(printf)(" ");
9003 ppIRStmt(sb_out->stmts[j]);
9004 VG_(printf)("\n");
9006 VG_(printf)("\n");
9009 /* ... and finally copy the stmt itself to the output. Except,
9010 skip the copy of IRCASs; see comments on case Ist_CAS
9011 above. */
9012 if (st->tag != Ist_CAS)
9013 stmt('C', &mce, st);
9016 /* Now we need to complain if the jump target is undefined. */
9017 first_stmt = sb_out->stmts_used;
9019 if (verboze) {
9020 VG_(printf)("sb_in->next = ");
9021 ppIRExpr(sb_in->next);
9022 VG_(printf)("\n\n");
9025 complainIfUndefined( &mce, sb_in->next, NULL );
9027 if (0 && verboze) {
9028 for (j = first_stmt; j < sb_out->stmts_used; j++) {
9029 VG_(printf)(" ");
9030 ppIRStmt(sb_out->stmts[j]);
9031 VG_(printf)("\n");
9033 VG_(printf)("\n");
9036 /* If this fails, there's been some serious snafu with tmp management,
9037 that should be investigated. */
9038 tl_assert( VG_(sizeXA)( mce.tmpMap ) == mce.sb->tyenv->types_used );
9039 VG_(deleteXA)( mce.tmpMap );
9041 if (mce.tmpHowUsed) {
9042 VG_(free)( mce.tmpHowUsed );
9045 tl_assert(mce.sb == sb_out);
9046 return sb_out;
9050 /*--------------------------------------------------------------------*/
9051 /*--- end mc_translate.c ---*/
9052 /*--------------------------------------------------------------------*/