2 /*--------------------------------------------------------------------*/
3 /*--- Instrument IR to perform memory checking operations. ---*/
4 /*--- mc_translate.c ---*/
5 /*--------------------------------------------------------------------*/
8 This file is part of MemCheck, a heavyweight Valgrind tool for
9 detecting memory errors.
11 Copyright (C) 2000-2017 Julian Seward
14 This program is free software; you can redistribute it and/or
15 modify it under the terms of the GNU General Public License as
16 published by the Free Software Foundation; either version 2 of the
17 License, or (at your option) any later version.
19 This program is distributed in the hope that it will be useful, but
20 WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 General Public License for more details.
24 You should have received a copy of the GNU General Public License
25 along with this program; if not, see <http://www.gnu.org/licenses/>.
27 The GNU General Public License is contained in the file COPYING.
30 #include "pub_tool_basics.h"
31 #include "pub_tool_poolalloc.h" // For mc_include.h
32 #include "pub_tool_hashtable.h" // For mc_include.h
33 #include "pub_tool_libcassert.h"
34 #include "pub_tool_libcprint.h"
35 #include "pub_tool_tooliface.h"
36 #include "pub_tool_machine.h" // VG_(fnptr_to_fnentry)
37 #include "pub_tool_xarray.h"
38 #include "pub_tool_mallocfree.h"
39 #include "pub_tool_libcbase.h"
41 #include "mc_include.h"
44 /* FIXMEs JRS 2011-June-16.
46 Check the interpretation for vector narrowing and widening ops,
47 particularly the saturating ones. I suspect they are either overly
48 pessimistic and/or wrong.
50 Iop_QandSQsh64x2 and friends (vector-by-vector bidirectional
51 saturating shifts): the interpretation is overly pessimistic.
52 See comments on the relevant cases below for details.
54 Iop_Sh64Sx2 and friends (vector-by-vector bidirectional shifts,
55 both rounding and non-rounding variants): ditto
58 /* This file implements the Memcheck instrumentation, and in
59 particular contains the core of its undefined value detection
60 machinery. For a comprehensive background of the terminology,
61 algorithms and rationale used herein, read:
63 Using Valgrind to detect undefined value errors with
66 Julian Seward and Nicholas Nethercote
68 2005 USENIX Annual Technical Conference (General Track),
69 Anaheim, CA, USA, April 10-15, 2005.
73 Here is as good a place as any to record exactly when V bits are and
74 should be checked, why, and what function is responsible.
77 Memcheck complains when an undefined value is used:
79 1. In the condition of a conditional branch. Because it could cause
80 incorrect control flow, and thus cause incorrect externally-visible
81 behaviour. [mc_translate.c:complainIfUndefined]
83 2. As an argument to a system call, or as the value that specifies
84 the system call number. Because it could cause an incorrect
85 externally-visible side effect. [mc_translate.c:mc_pre_reg_read]
87 3. As the address in a load or store. Because it could cause an
88 incorrect value to be used later, which could cause externally-visible
89 behaviour (eg. via incorrect control flow or an incorrect system call
90 argument) [complainIfUndefined]
92 4. As the target address of a branch. Because it could cause incorrect
93 control flow. [complainIfUndefined]
95 5. As an argument to setenv, unsetenv, or putenv. Because it could put
96 an incorrect value into the external environment.
97 [mc_replace_strmem.c:VG_WRAP_FUNCTION_ZU(*, *env)]
99 6. As the index in a GETI or PUTI operation. I'm not sure why... (njn).
100 [complainIfUndefined]
102 7. As an argument to the VALGRIND_CHECK_MEM_IS_DEFINED and
103 VALGRIND_CHECK_VALUE_IS_DEFINED client requests. Because the user
104 requested it. [in memcheck.h]
107 Memcheck also complains, but should not, when an undefined value is used:
109 8. As the shift value in certain SIMD shift operations (but not in the
110 standard integer shift operations). This inconsistency is due to
111 historical reasons.) [complainIfUndefined]
114 Memcheck does not complain, but should, when an undefined value is used:
116 9. As an input to a client request. Because the client request may
117 affect the visible behaviour -- see bug #144362 for an example
118 involving the malloc replacements in vg_replace_malloc.c and
119 VALGRIND_NON_SIMD_CALL* requests, where an uninitialised argument
120 isn't identified. That bug report also has some info on how to solve
121 the problem. [valgrind.h:VALGRIND_DO_CLIENT_REQUEST]
124 In practice, 1 and 2 account for the vast majority of cases.
127 /* Generation of addr-definedness, addr-validity and
128 guard-definedness checks pertaining to loads and stores (Iex_Load,
129 Ist_Store, IRLoadG, IRStoreG, LLSC, CAS and Dirty memory
130 loads/stores) was re-checked 11 May 2013. */
133 /*------------------------------------------------------------*/
134 /*--- Forward decls ---*/
135 /*------------------------------------------------------------*/
139 // See below for comments explaining what this is for.
141 enum __attribute__((packed
)) { HuUnU
=0, HuPCa
=1, HuOth
=2 }
144 static IRType
shadowTypeV ( IRType ty
);
145 static IRExpr
* expr2vbits ( struct _MCEnv
* mce
, IRExpr
* e
,
146 HowUsed hu
/*use HuOth if unknown*/ );
147 static IRTemp
findShadowTmpB ( struct _MCEnv
* mce
, IRTemp orig
);
149 static IRExpr
*i128_const_zero(void);
152 /*------------------------------------------------------------*/
153 /*--- Memcheck running state, and tmp management. ---*/
154 /*------------------------------------------------------------*/
156 /* For a few (maybe 1%) IROps, we have both a cheaper, less exact vbit
157 propagation scheme, and a more expensive, more precise vbit propagation
158 scheme. This enum describes, for such an IROp, which scheme to use. */
161 // Use the cheaper, less-exact variant.
163 // Choose between cheap and expensive based on analysis of the block
164 // to be instrumented. Note that the choice may be done on a
165 // per-instance basis of the IROp that this DetailLevel describes.
167 // Use the more expensive, more-exact variant.
173 /* A readonly part of the running state. For IROps that have both a
174 less-exact and more-exact interpretation, records which interpretation is
178 // For Add32/64 and Sub32/64, all 3 settings are allowed. For the
179 // DLauto case, a per-instance decision is to be made by inspecting
180 // the associated tmp's entry in MCEnv.tmpHowUsed.
181 DetailLevel dl_Add32
;
182 DetailLevel dl_Add64
;
183 DetailLevel dl_Sub32
;
184 DetailLevel dl_Sub64
;
185 // For Cmp{EQ,NE}{64,32,16,8}, only DLcheap and DLexpensive are
187 DetailLevel dl_CmpEQ64_CmpNE64
;
188 DetailLevel dl_CmpEQ32_CmpNE32
;
189 DetailLevel dl_CmpEQ16_CmpNE16
;
190 DetailLevel dl_CmpEQ8_CmpNE8
;
194 static void DetailLevelByOp__set_all ( /*OUT*/DetailLevelByOp
* dlbo
,
201 dlbo
->dl_CmpEQ64_CmpNE64
= dl
;
202 dlbo
->dl_CmpEQ32_CmpNE32
= dl
;
203 dlbo
->dl_CmpEQ16_CmpNE16
= dl
;
204 dlbo
->dl_CmpEQ8_CmpNE8
= dl
;
207 static void DetailLevelByOp__check_sanity ( const DetailLevelByOp
* dlbo
)
209 tl_assert(dlbo
->dl_Add32
>= DLcheap
&& dlbo
->dl_Add32
<= DLexpensive
);
210 tl_assert(dlbo
->dl_Add64
>= DLcheap
&& dlbo
->dl_Add64
<= DLexpensive
);
211 tl_assert(dlbo
->dl_Sub32
>= DLcheap
&& dlbo
->dl_Sub32
<= DLexpensive
);
212 tl_assert(dlbo
->dl_Sub64
>= DLcheap
&& dlbo
->dl_Sub64
<= DLexpensive
);
213 tl_assert(dlbo
->dl_CmpEQ64_CmpNE64
== DLcheap
214 || dlbo
->dl_CmpEQ64_CmpNE64
== DLexpensive
);
215 tl_assert(dlbo
->dl_CmpEQ32_CmpNE32
== DLcheap
216 || dlbo
->dl_CmpEQ32_CmpNE32
== DLexpensive
);
217 tl_assert(dlbo
->dl_CmpEQ16_CmpNE16
== DLcheap
218 || dlbo
->dl_CmpEQ16_CmpNE16
== DLexpensive
);
219 tl_assert(dlbo
->dl_CmpEQ8_CmpNE8
== DLcheap
220 || dlbo
->dl_CmpEQ8_CmpNE8
== DLexpensive
);
223 static UInt
DetailLevelByOp__count ( const DetailLevelByOp
* dlbo
,
227 n
+= (dlbo
->dl_Add32
== dl
? 1 : 0);
228 n
+= (dlbo
->dl_Add64
== dl
? 1 : 0);
229 n
+= (dlbo
->dl_Sub32
== dl
? 1 : 0);
230 n
+= (dlbo
->dl_Sub64
== dl
? 1 : 0);
231 n
+= (dlbo
->dl_CmpEQ64_CmpNE64
== dl
? 1 : 0);
232 n
+= (dlbo
->dl_CmpEQ32_CmpNE32
== dl
? 1 : 0);
233 n
+= (dlbo
->dl_CmpEQ16_CmpNE16
== dl
? 1 : 0);
234 n
+= (dlbo
->dl_CmpEQ8_CmpNE8
== dl
? 1 : 0);
239 /* Carries info about a particular tmp. The tmp's number is not
240 recorded, as this is implied by (equal to) its index in the tmpMap
241 in MCEnv. The tmp's type is also not recorded, as this is present
244 When .kind is Orig, .shadowV and .shadowB may give the identities
245 of the temps currently holding the associated definedness (shadowV)
246 and origin (shadowB) values, or these may be IRTemp_INVALID if code
247 to compute such values has not yet been emitted.
249 When .kind is VSh or BSh then the tmp is holds a V- or B- value,
250 and so .shadowV and .shadowB must be IRTemp_INVALID, since it is
251 illogical for a shadow tmp itself to be shadowed.
254 enum { Orig
=1, VSh
=2, BSh
=3 }
266 /* A |HowUsed| value carries analysis results about how values are used,
267 pertaining to whether we need to instrument integer adds expensively or
268 not. The running state carries a (readonly) mapping from original tmp to
269 a HowUsed value for it. A usage value can be one of three values,
270 forming a 3-point chain lattice.
272 HuOth ("Other") used in some arbitrary way
274 HuPCa ("PCast") used *only* in effectively a PCast, in which all
275 | we care about is the all-defined vs not-all-defined distinction
277 HuUnU ("Unused") not used at all.
279 The "safe" (don't-know) end of the lattice is "HuOth". See comments
280 below in |preInstrumentationAnalysis| for further details.
284 enum __attribute__((packed)) { HuUnU=0, HuPCa=1, HuOth=2 }
288 // Not actually necessary, but we don't want to waste D1 space.
289 STATIC_ASSERT(sizeof(HowUsed
) == 1);
292 /* Carries around state during memcheck instrumentation. */
295 /* MODIFIED: the superblock being constructed. IRStmts are
300 /* MODIFIED: a table [0 .. #temps_in_sb-1] which gives the
301 current kind and possibly shadow temps for each temp in the
302 IRSB being constructed. Note that it does not contain the
303 type of each tmp. If you want to know the type, look at the
304 relevant entry in sb->tyenv. It follows that at all times
305 during the instrumentation process, the valid indices for
306 tmpMap and sb->tyenv are identical, being 0 .. N-1 where N is
307 total number of Orig, V- and B- temps allocated so far.
309 The reason for this strange split (types in one place, all
310 other info in another) is that we need the types to be
311 attached to sb so as to make it possible to do
312 "typeOfIRExpr(mce->bb->tyenv, ...)" at various places in the
313 instrumentation process. */
314 XArray
* /* of TempMapEnt */ tmpMap
;
316 /* READONLY: contains details of which ops should be expensively
318 DetailLevelByOp dlbo
;
320 /* READONLY: for each original tmp, how the tmp is used. This is
321 computed by |preInstrumentationAnalysis|. Valid indices are
322 0 .. #temps_in_sb-1 (same as for tmpMap). */
325 /* READONLY: the guest layout. This indicates which parts of
326 the guest state should be regarded as 'always defined'. */
327 const VexGuestLayout
* layout
;
329 /* READONLY: the host word type. Needed for constructing
330 arguments of type 'HWord' to be passed to helper functions.
331 Ity_I32 or Ity_I64 only. */
337 /* SHADOW TMP MANAGEMENT. Shadow tmps are allocated lazily (on
338 demand), as they are encountered. This is for two reasons.
340 (1) (less important reason): Many original tmps are unused due to
341 initial IR optimisation, and we do not want to spaces in tables
344 Shadow IRTemps are therefore allocated on demand. mce.tmpMap is a
345 table indexed [0 .. n_types-1], which gives the current shadow for
346 each original tmp, or INVALID_IRTEMP if none is so far assigned.
347 It is necessary to support making multiple assignments to a shadow
348 -- specifically, after testing a shadow for definedness, it needs
349 to be made defined. But IR's SSA property disallows this.
351 (2) (more important reason): Therefore, when a shadow needs to get
352 a new value, a new temporary is created, the value is assigned to
353 that, and the tmpMap is updated to reflect the new binding.
355 A corollary is that if the tmpMap maps a given tmp to
356 IRTemp_INVALID and we are hoping to read that shadow tmp, it means
357 there's a read-before-write error in the original tmps. The IR
358 sanity checker should catch all such anomalies, however.
361 /* Create a new IRTemp of type 'ty' and kind 'kind', and add it to
362 both the table in mce->sb and to our auxiliary mapping. Note that
363 newTemp may cause mce->tmpMap to resize, hence previous results
364 from VG_(indexXA)(mce->tmpMap) are invalidated. */
365 static IRTemp
newTemp ( MCEnv
* mce
, IRType ty
, TempKind kind
)
369 IRTemp tmp
= newIRTemp(mce
->sb
->tyenv
, ty
);
371 ent
.shadowV
= IRTemp_INVALID
;
372 ent
.shadowB
= IRTemp_INVALID
;
373 newIx
= VG_(addToXA
)( mce
->tmpMap
, &ent
);
374 tl_assert(newIx
== (Word
)tmp
);
379 /* Find the tmp currently shadowing the given original tmp. If none
380 so far exists, allocate one. */
381 static IRTemp
findShadowTmpV ( MCEnv
* mce
, IRTemp orig
)
384 /* VG_(indexXA) range-checks 'orig', hence no need to check
386 ent
= (TempMapEnt
*)VG_(indexXA
)( mce
->tmpMap
, (Word
)orig
);
387 tl_assert(ent
->kind
== Orig
);
388 if (ent
->shadowV
== IRTemp_INVALID
) {
390 = newTemp( mce
, shadowTypeV(mce
->sb
->tyenv
->types
[orig
]), VSh
);
391 /* newTemp may cause mce->tmpMap to resize, hence previous results
392 from VG_(indexXA) are invalid. */
393 ent
= (TempMapEnt
*)VG_(indexXA
)( mce
->tmpMap
, (Word
)orig
);
394 tl_assert(ent
->kind
== Orig
);
395 tl_assert(ent
->shadowV
== IRTemp_INVALID
);
401 /* Allocate a new shadow for the given original tmp. This means any
402 previous shadow is abandoned. This is needed because it is
403 necessary to give a new value to a shadow once it has been tested
404 for undefinedness, but unfortunately IR's SSA property disallows
405 this. Instead we must abandon the old shadow, allocate a new one
406 and use that instead.
408 This is the same as findShadowTmpV, except we don't bother to see
409 if a shadow temp already existed -- we simply allocate a new one
411 static void newShadowTmpV ( MCEnv
* mce
, IRTemp orig
)
414 /* VG_(indexXA) range-checks 'orig', hence no need to check
416 ent
= (TempMapEnt
*)VG_(indexXA
)( mce
->tmpMap
, (Word
)orig
);
417 tl_assert(ent
->kind
== Orig
);
420 = newTemp( mce
, shadowTypeV(mce
->sb
->tyenv
->types
[orig
]), VSh
);
421 /* newTemp may cause mce->tmpMap to resize, hence previous results
422 from VG_(indexXA) are invalid. */
423 ent
= (TempMapEnt
*)VG_(indexXA
)( mce
->tmpMap
, (Word
)orig
);
424 tl_assert(ent
->kind
== Orig
);
430 /*------------------------------------------------------------*/
431 /*--- IRAtoms -- a subset of IRExprs ---*/
432 /*------------------------------------------------------------*/
434 /* An atom is either an IRExpr_Const or an IRExpr_Tmp, as defined by
435 isIRAtom() in libvex_ir.h. Because this instrumenter expects flat
436 input, most of this code deals in atoms. Usefully, a value atom
437 always has a V-value which is also an atom: constants are shadowed
438 by constants, and temps are shadowed by the corresponding shadow
441 typedef IRExpr IRAtom
;
443 /* (used for sanity checks only): is this an atom which looks
444 like it's from original code? */
445 static Bool
isOriginalAtom ( MCEnv
* mce
, IRAtom
* a1
)
447 if (a1
->tag
== Iex_Const
)
449 if (a1
->tag
== Iex_RdTmp
) {
450 TempMapEnt
* ent
= VG_(indexXA
)( mce
->tmpMap
, a1
->Iex
.RdTmp
.tmp
);
451 return ent
->kind
== Orig
;
456 /* (used for sanity checks only): is this an atom which looks
457 like it's from shadow code? */
458 static Bool
isShadowAtom ( MCEnv
* mce
, IRAtom
* a1
)
460 if (a1
->tag
== Iex_Const
)
462 if (a1
->tag
== Iex_RdTmp
) {
463 TempMapEnt
* ent
= VG_(indexXA
)( mce
->tmpMap
, a1
->Iex
.RdTmp
.tmp
);
464 return ent
->kind
== VSh
|| ent
->kind
== BSh
;
469 /* (used for sanity checks only): check that both args are atoms and
470 are identically-kinded. */
471 static Bool
sameKindedAtoms ( IRAtom
* a1
, IRAtom
* a2
)
473 if (a1
->tag
== Iex_RdTmp
&& a2
->tag
== Iex_RdTmp
)
475 if (a1
->tag
== Iex_Const
&& a2
->tag
== Iex_Const
)
481 /*------------------------------------------------------------*/
482 /*--- Type management ---*/
483 /*------------------------------------------------------------*/
485 /* Shadow state is always accessed using integer types. This returns
486 an integer type with the same size (as per sizeofIRType) as the
487 given type. The only valid shadow types are Bit, I8, I16, I32,
488 I64, I128, V128, V256. */
490 static IRType
shadowTypeV ( IRType ty
)
498 case Ity_I128
: return ty
;
499 case Ity_F16
: return Ity_I16
;
500 case Ity_F32
: return Ity_I32
;
501 case Ity_D32
: return Ity_I32
;
502 case Ity_F64
: return Ity_I64
;
503 case Ity_D64
: return Ity_I64
;
504 case Ity_F128
: return Ity_I128
;
505 case Ity_D128
: return Ity_I128
;
506 case Ity_V128
: return Ity_V128
;
507 case Ity_V256
: return Ity_V256
;
508 default: ppIRType(ty
);
509 VG_(tool_panic
)("memcheck:shadowTypeV");
513 /* Produce a 'defined' value of the given shadow type. Should only be
514 supplied shadow types (Bit/I8/I16/I32/UI64). */
515 static IRExpr
* definedOfType ( IRType ty
) {
517 case Ity_I1
: return IRExpr_Const(IRConst_U1(False
));
518 case Ity_I8
: return IRExpr_Const(IRConst_U8(0));
519 case Ity_I16
: return IRExpr_Const(IRConst_U16(0));
520 case Ity_I32
: return IRExpr_Const(IRConst_U32(0));
521 case Ity_I64
: return IRExpr_Const(IRConst_U64(0));
522 case Ity_I128
: return i128_const_zero();
523 case Ity_V128
: return IRExpr_Const(IRConst_V128(0x0000));
524 case Ity_V256
: return IRExpr_Const(IRConst_V256(0x00000000));
525 default: VG_(tool_panic
)("memcheck:definedOfType");
530 /*------------------------------------------------------------*/
531 /*--- Constructing IR fragments ---*/
532 /*------------------------------------------------------------*/
534 /* add stmt to a bb */
535 static inline void stmt ( HChar cat
, MCEnv
* mce
, IRStmt
* st
) {
537 VG_(printf
)(" %c: ", cat
);
541 addStmtToIRSB(mce
->sb
, st
);
544 /* assign value to tmp */
546 void assign ( HChar cat
, MCEnv
* mce
, IRTemp tmp
, IRExpr
* expr
) {
547 stmt(cat
, mce
, IRStmt_WrTmp(tmp
,expr
));
550 /* build various kinds of expressions */
551 #define triop(_op, _arg1, _arg2, _arg3) \
552 IRExpr_Triop((_op),(_arg1),(_arg2),(_arg3))
553 #define binop(_op, _arg1, _arg2) IRExpr_Binop((_op),(_arg1),(_arg2))
554 #define unop(_op, _arg) IRExpr_Unop((_op),(_arg))
555 #define mkU1(_n) IRExpr_Const(IRConst_U1(_n))
556 #define mkU8(_n) IRExpr_Const(IRConst_U8(_n))
557 #define mkU16(_n) IRExpr_Const(IRConst_U16(_n))
558 #define mkU32(_n) IRExpr_Const(IRConst_U32(_n))
559 #define mkU64(_n) IRExpr_Const(IRConst_U64(_n))
560 #define mkV128(_n) IRExpr_Const(IRConst_V128(_n))
561 #define mkexpr(_tmp) IRExpr_RdTmp((_tmp))
563 /* Bind the given expression to a new temporary, and return the
564 temporary. This effectively converts an arbitrary expression into
567 'ty' is the type of 'e' and hence the type that the new temporary
568 needs to be. But passing it in is redundant, since we can deduce
569 the type merely by inspecting 'e'. So at least use that fact to
570 assert that the two types agree. */
571 static IRAtom
* assignNew ( HChar cat
, MCEnv
* mce
, IRType ty
, IRExpr
* e
)
575 IRType tyE
= typeOfIRExpr(mce
->sb
->tyenv
, e
);
577 tl_assert(tyE
== ty
); /* so 'ty' is redundant (!) */
579 case 'V': k
= VSh
; break;
580 case 'B': k
= BSh
; break;
581 case 'C': k
= Orig
; break;
582 /* happens when we are making up new "orig"
583 expressions, for IRCAS handling */
584 default: tl_assert(0);
586 t
= newTemp(mce
, ty
, k
);
587 assign(cat
, mce
, t
, e
);
592 /*------------------------------------------------------------*/
593 /*--- Helper functions for 128-bit ops ---*/
594 /*------------------------------------------------------------*/
596 static IRExpr
*i128_const_zero(void)
598 IRAtom
* z64
= IRExpr_Const(IRConst_U64(0));
599 return binop(Iop_64HLto128
, z64
, z64
);
602 /* There are no I128-bit loads and/or stores [as generated by any
603 current front ends]. So we do not need to worry about that in
607 /*------------------------------------------------------------*/
608 /*--- Constructing definedness primitive ops ---*/
609 /*------------------------------------------------------------*/
611 /* --------- Defined-if-either-defined --------- */
613 static IRAtom
* mkDifD1 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
614 tl_assert(isShadowAtom(mce
,a1
));
615 tl_assert(isShadowAtom(mce
,a2
));
616 return assignNew('V', mce
, Ity_I1
, binop(Iop_And1
, a1
, a2
));
619 static IRAtom
* mkDifD8 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
620 tl_assert(isShadowAtom(mce
,a1
));
621 tl_assert(isShadowAtom(mce
,a2
));
622 return assignNew('V', mce
, Ity_I8
, binop(Iop_And8
, a1
, a2
));
625 static IRAtom
* mkDifD16 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
626 tl_assert(isShadowAtom(mce
,a1
));
627 tl_assert(isShadowAtom(mce
,a2
));
628 return assignNew('V', mce
, Ity_I16
, binop(Iop_And16
, a1
, a2
));
631 static IRAtom
* mkDifD32 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
632 tl_assert(isShadowAtom(mce
,a1
));
633 tl_assert(isShadowAtom(mce
,a2
));
634 return assignNew('V', mce
, Ity_I32
, binop(Iop_And32
, a1
, a2
));
637 static IRAtom
* mkDifD64 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
638 tl_assert(isShadowAtom(mce
,a1
));
639 tl_assert(isShadowAtom(mce
,a2
));
640 return assignNew('V', mce
, Ity_I64
, binop(Iop_And64
, a1
, a2
));
643 static IRAtom
* mkDifDV128 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
644 tl_assert(isShadowAtom(mce
,a1
));
645 tl_assert(isShadowAtom(mce
,a2
));
646 return assignNew('V', mce
, Ity_V128
, binop(Iop_AndV128
, a1
, a2
));
649 static IRAtom
* mkDifDV256 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
650 tl_assert(isShadowAtom(mce
,a1
));
651 tl_assert(isShadowAtom(mce
,a2
));
652 return assignNew('V', mce
, Ity_V256
, binop(Iop_AndV256
, a1
, a2
));
655 /* --------- Undefined-if-either-undefined --------- */
657 static IRAtom
* mkUifU1 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
658 tl_assert(isShadowAtom(mce
,a1
));
659 tl_assert(isShadowAtom(mce
,a2
));
660 return assignNew('V', mce
, Ity_I1
, binop(Iop_Or1
, a1
, a2
));
663 static IRAtom
* mkUifU8 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
664 tl_assert(isShadowAtom(mce
,a1
));
665 tl_assert(isShadowAtom(mce
,a2
));
666 return assignNew('V', mce
, Ity_I8
, binop(Iop_Or8
, a1
, a2
));
669 static IRAtom
* mkUifU16 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
670 tl_assert(isShadowAtom(mce
,a1
));
671 tl_assert(isShadowAtom(mce
,a2
));
672 return assignNew('V', mce
, Ity_I16
, binop(Iop_Or16
, a1
, a2
));
675 static IRAtom
* mkUifU32 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
676 tl_assert(isShadowAtom(mce
,a1
));
677 tl_assert(isShadowAtom(mce
,a2
));
678 return assignNew('V', mce
, Ity_I32
, binop(Iop_Or32
, a1
, a2
));
681 static IRAtom
* mkUifU64 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
682 tl_assert(isShadowAtom(mce
,a1
));
683 tl_assert(isShadowAtom(mce
,a2
));
684 return assignNew('V', mce
, Ity_I64
, binop(Iop_Or64
, a1
, a2
));
687 static IRAtom
* mkUifU128 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
688 IRAtom
*tmp1
, *tmp2
, *tmp3
, *tmp4
, *tmp5
, *tmp6
;
689 tl_assert(isShadowAtom(mce
,a1
));
690 tl_assert(isShadowAtom(mce
,a2
));
691 tmp1
= assignNew('V', mce
, Ity_I64
, unop(Iop_128to64
, a1
));
692 tmp2
= assignNew('V', mce
, Ity_I64
, unop(Iop_128HIto64
, a1
));
693 tmp3
= assignNew('V', mce
, Ity_I64
, unop(Iop_128to64
, a2
));
694 tmp4
= assignNew('V', mce
, Ity_I64
, unop(Iop_128HIto64
, a2
));
695 tmp5
= assignNew('V', mce
, Ity_I64
, binop(Iop_Or64
, tmp1
, tmp3
));
696 tmp6
= assignNew('V', mce
, Ity_I64
, binop(Iop_Or64
, tmp2
, tmp4
));
698 return assignNew('V', mce
, Ity_I128
, binop(Iop_64HLto128
, tmp6
, tmp5
));
701 static IRAtom
* mkUifUV128 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
702 tl_assert(isShadowAtom(mce
,a1
));
703 tl_assert(isShadowAtom(mce
,a2
));
704 return assignNew('V', mce
, Ity_V128
, binop(Iop_OrV128
, a1
, a2
));
707 static IRAtom
* mkUifUV256 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
708 tl_assert(isShadowAtom(mce
,a1
));
709 tl_assert(isShadowAtom(mce
,a2
));
710 return assignNew('V', mce
, Ity_V256
, binop(Iop_OrV256
, a1
, a2
));
713 static IRAtom
* mkUifU ( MCEnv
* mce
, IRType vty
, IRAtom
* a1
, IRAtom
* a2
) {
715 case Ity_I8
: return mkUifU8(mce
, a1
, a2
);
716 case Ity_I16
: return mkUifU16(mce
, a1
, a2
);
717 case Ity_I32
: return mkUifU32(mce
, a1
, a2
);
718 case Ity_I64
: return mkUifU64(mce
, a1
, a2
);
719 case Ity_I128
: return mkUifU128(mce
, a1
, a2
);
720 case Ity_V128
: return mkUifUV128(mce
, a1
, a2
);
721 case Ity_V256
: return mkUifUV256(mce
, a1
, a2
);
723 VG_(printf
)("\n"); ppIRType(vty
); VG_(printf
)("\n");
724 VG_(tool_panic
)("memcheck:mkUifU");
728 /* --------- The Left-family of operations. --------- */
730 static IRAtom
* mkLeft8 ( MCEnv
* mce
, IRAtom
* a1
) {
731 tl_assert(isShadowAtom(mce
,a1
));
732 return assignNew('V', mce
, Ity_I8
, unop(Iop_Left8
, a1
));
735 static IRAtom
* mkLeft16 ( MCEnv
* mce
, IRAtom
* a1
) {
736 tl_assert(isShadowAtom(mce
,a1
));
737 return assignNew('V', mce
, Ity_I16
, unop(Iop_Left16
, a1
));
740 static IRAtom
* mkLeft32 ( MCEnv
* mce
, IRAtom
* a1
) {
741 tl_assert(isShadowAtom(mce
,a1
));
742 return assignNew('V', mce
, Ity_I32
, unop(Iop_Left32
, a1
));
745 static IRAtom
* mkLeft64 ( MCEnv
* mce
, IRAtom
* a1
) {
746 tl_assert(isShadowAtom(mce
,a1
));
747 return assignNew('V', mce
, Ity_I64
, unop(Iop_Left64
, a1
));
750 /* --------- The Right-family of operations. --------- */
752 /* Unfortunately these are a lot more expensive then their Left
753 counterparts. Fortunately they are only very rarely used -- only for
754 count-leading-zeroes instrumentation. */
756 static IRAtom
* mkRight32 ( MCEnv
* mce
, IRAtom
* a1
)
758 for (Int i
= 1; i
<= 16; i
*= 2) {
761 = assignNew('V', mce
, Ity_I32
, binop(Iop_Shr32
, a1
, mkU8(i
)));
762 a1
= assignNew('V', mce
, Ity_I32
, binop(Iop_Or32
, a1
, tmp
));
767 static IRAtom
* mkRight64 ( MCEnv
* mce
, IRAtom
* a1
)
769 for (Int i
= 1; i
<= 32; i
*= 2) {
772 = assignNew('V', mce
, Ity_I64
, binop(Iop_Shr64
, a1
, mkU8(i
)));
773 a1
= assignNew('V', mce
, Ity_I64
, binop(Iop_Or64
, a1
, tmp
));
778 /* --------- 'Improvement' functions for AND/OR. --------- */
780 /* ImproveAND(data, vbits) = data OR vbits. Defined (0) data 0s give
781 defined (0); all other -> undefined (1).
783 static IRAtom
* mkImproveAND1 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
785 tl_assert(isOriginalAtom(mce
, data
));
786 tl_assert(isShadowAtom(mce
, vbits
));
787 tl_assert(sameKindedAtoms(data
, vbits
));
788 return assignNew('V', mce
, Ity_I1
, binop(Iop_Or1
, data
, vbits
));
791 static IRAtom
* mkImproveAND8 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
793 tl_assert(isOriginalAtom(mce
, data
));
794 tl_assert(isShadowAtom(mce
, vbits
));
795 tl_assert(sameKindedAtoms(data
, vbits
));
796 return assignNew('V', mce
, Ity_I8
, binop(Iop_Or8
, data
, vbits
));
799 static IRAtom
* mkImproveAND16 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
801 tl_assert(isOriginalAtom(mce
, data
));
802 tl_assert(isShadowAtom(mce
, vbits
));
803 tl_assert(sameKindedAtoms(data
, vbits
));
804 return assignNew('V', mce
, Ity_I16
, binop(Iop_Or16
, data
, vbits
));
807 static IRAtom
* mkImproveAND32 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
809 tl_assert(isOriginalAtom(mce
, data
));
810 tl_assert(isShadowAtom(mce
, vbits
));
811 tl_assert(sameKindedAtoms(data
, vbits
));
812 return assignNew('V', mce
, Ity_I32
, binop(Iop_Or32
, data
, vbits
));
815 static IRAtom
* mkImproveAND64 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
817 tl_assert(isOriginalAtom(mce
, data
));
818 tl_assert(isShadowAtom(mce
, vbits
));
819 tl_assert(sameKindedAtoms(data
, vbits
));
820 return assignNew('V', mce
, Ity_I64
, binop(Iop_Or64
, data
, vbits
));
823 static IRAtom
* mkImproveANDV128 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
825 tl_assert(isOriginalAtom(mce
, data
));
826 tl_assert(isShadowAtom(mce
, vbits
));
827 tl_assert(sameKindedAtoms(data
, vbits
));
828 return assignNew('V', mce
, Ity_V128
, binop(Iop_OrV128
, data
, vbits
));
831 static IRAtom
* mkImproveANDV256 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
833 tl_assert(isOriginalAtom(mce
, data
));
834 tl_assert(isShadowAtom(mce
, vbits
));
835 tl_assert(sameKindedAtoms(data
, vbits
));
836 return assignNew('V', mce
, Ity_V256
, binop(Iop_OrV256
, data
, vbits
));
839 /* ImproveOR(data, vbits) = ~data OR vbits. Defined (0) data 1s give
840 defined (0); all other -> undefined (1).
842 static IRAtom
* mkImproveOR1 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
844 tl_assert(isOriginalAtom(mce
, data
));
845 tl_assert(isShadowAtom(mce
, vbits
));
846 tl_assert(sameKindedAtoms(data
, vbits
));
850 assignNew('V', mce
, Ity_I1
, unop(Iop_Not1
, data
)),
854 static IRAtom
* mkImproveOR8 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
856 tl_assert(isOriginalAtom(mce
, data
));
857 tl_assert(isShadowAtom(mce
, vbits
));
858 tl_assert(sameKindedAtoms(data
, vbits
));
862 assignNew('V', mce
, Ity_I8
, unop(Iop_Not8
, data
)),
866 static IRAtom
* mkImproveOR16 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
868 tl_assert(isOriginalAtom(mce
, data
));
869 tl_assert(isShadowAtom(mce
, vbits
));
870 tl_assert(sameKindedAtoms(data
, vbits
));
874 assignNew('V', mce
, Ity_I16
, unop(Iop_Not16
, data
)),
878 static IRAtom
* mkImproveOR32 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
880 tl_assert(isOriginalAtom(mce
, data
));
881 tl_assert(isShadowAtom(mce
, vbits
));
882 tl_assert(sameKindedAtoms(data
, vbits
));
886 assignNew('V', mce
, Ity_I32
, unop(Iop_Not32
, data
)),
890 static IRAtom
* mkImproveOR64 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
892 tl_assert(isOriginalAtom(mce
, data
));
893 tl_assert(isShadowAtom(mce
, vbits
));
894 tl_assert(sameKindedAtoms(data
, vbits
));
898 assignNew('V', mce
, Ity_I64
, unop(Iop_Not64
, data
)),
902 static IRAtom
* mkImproveORV128 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
904 tl_assert(isOriginalAtom(mce
, data
));
905 tl_assert(isShadowAtom(mce
, vbits
));
906 tl_assert(sameKindedAtoms(data
, vbits
));
910 assignNew('V', mce
, Ity_V128
, unop(Iop_NotV128
, data
)),
914 static IRAtom
* mkImproveORV256 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
916 tl_assert(isOriginalAtom(mce
, data
));
917 tl_assert(isShadowAtom(mce
, vbits
));
918 tl_assert(sameKindedAtoms(data
, vbits
));
922 assignNew('V', mce
, Ity_V256
, unop(Iop_NotV256
, data
)),
926 /* --------- Pessimising casts. --------- */
928 /* The function returns an expression of type DST_TY. If any of the VBITS
929 is undefined (value == 1) the resulting expression has all bits set to
930 1. Otherwise, all bits are 0. */
932 static IRAtom
* mkPCastTo( MCEnv
* mce
, IRType dst_ty
, IRAtom
* vbits
)
937 /* Note, dst_ty is a shadow type, not an original type. */
938 tl_assert(isShadowAtom(mce
,vbits
));
939 src_ty
= typeOfIRExpr(mce
->sb
->tyenv
, vbits
);
941 /* Fast-track some common cases */
942 if (src_ty
== Ity_I32
&& dst_ty
== Ity_I32
)
943 return assignNew('V', mce
, Ity_I32
, unop(Iop_CmpwNEZ32
, vbits
));
945 if (src_ty
== Ity_I64
&& dst_ty
== Ity_I64
)
946 return assignNew('V', mce
, Ity_I64
, unop(Iop_CmpwNEZ64
, vbits
));
948 if (src_ty
== Ity_I32
&& dst_ty
== Ity_I64
) {
949 /* PCast the arg, then clone it. */
950 IRAtom
* tmp
= assignNew('V', mce
, Ity_I32
, unop(Iop_CmpwNEZ32
, vbits
));
951 return assignNew('V', mce
, Ity_I64
, binop(Iop_32HLto64
, tmp
, tmp
));
954 if (src_ty
== Ity_I32
&& dst_ty
== Ity_V128
) {
955 /* PCast the arg, then clone it 4 times. */
956 IRAtom
* tmp
= assignNew('V', mce
, Ity_I32
, unop(Iop_CmpwNEZ32
, vbits
));
957 tmp
= assignNew('V', mce
, Ity_I64
, binop(Iop_32HLto64
, tmp
, tmp
));
958 return assignNew('V', mce
, Ity_V128
, binop(Iop_64HLtoV128
, tmp
, tmp
));
961 if (src_ty
== Ity_I32
&& dst_ty
== Ity_V256
) {
962 /* PCast the arg, then clone it 8 times. */
963 IRAtom
* tmp
= assignNew('V', mce
, Ity_I32
, unop(Iop_CmpwNEZ32
, vbits
));
964 tmp
= assignNew('V', mce
, Ity_I64
, binop(Iop_32HLto64
, tmp
, tmp
));
965 tmp
= assignNew('V', mce
, Ity_V128
, binop(Iop_64HLtoV128
, tmp
, tmp
));
966 return assignNew('V', mce
, Ity_V256
, binop(Iop_V128HLtoV256
, tmp
, tmp
));
969 if (src_ty
== Ity_I64
&& dst_ty
== Ity_I32
) {
970 /* PCast the arg. This gives all 0s or all 1s. Then throw away
972 IRAtom
* tmp
= assignNew('V', mce
, Ity_I64
, unop(Iop_CmpwNEZ64
, vbits
));
973 return assignNew('V', mce
, Ity_I32
, unop(Iop_64to32
, tmp
));
976 if (src_ty
== Ity_V128
&& dst_ty
== Ity_I64
) {
977 /* Use InterleaveHI64x2 to copy the top half of the vector into
978 the bottom half. Then we can UifU it with the original, throw
979 away the upper half of the result, and PCast-I64-to-I64
981 // Generates vbits[127:64] : vbits[127:64]
983 = assignNew('V', mce
, Ity_V128
,
984 binop(Iop_InterleaveHI64x2
, vbits
, vbits
));
986 // UifU(vbits[127:64],vbits[127:64]) : UifU(vbits[127:64],vbits[63:0])
987 // == vbits[127:64] : UifU(vbits[127:64],vbits[63:0])
989 = mkUifUV128(mce
, hi64hi64
, vbits
);
990 // Generates UifU(vbits[127:64],vbits[63:0])
992 = assignNew('V', mce
, Ity_I64
, unop(Iop_V128to64
, lohi64
));
994 // PCast-to-I64( UifU(vbits[127:64], vbits[63:0] )
995 // == PCast-to-I64( vbits[127:0] )
997 = assignNew('V', mce
, Ity_I64
, unop(Iop_CmpwNEZ64
, lo64
));
1001 /* Else do it the slow way .. */
1002 /* First of all, collapse vbits down to a single bit. */
1009 tmp1
= assignNew('V', mce
, Ity_I1
, unop(Iop_CmpNEZ8
, vbits
));
1012 tmp1
= assignNew('V', mce
, Ity_I1
, unop(Iop_CmpNEZ16
, vbits
));
1015 tmp1
= assignNew('V', mce
, Ity_I1
, unop(Iop_CmpNEZ32
, vbits
));
1018 tmp1
= assignNew('V', mce
, Ity_I1
, unop(Iop_CmpNEZ64
, vbits
));
1021 /* Gah. Chop it in half, OR the halves together, and compare
1023 IRAtom
* tmp2
= assignNew('V', mce
, Ity_I64
, unop(Iop_128HIto64
, vbits
));
1024 IRAtom
* tmp3
= assignNew('V', mce
, Ity_I64
, unop(Iop_128to64
, vbits
));
1025 IRAtom
* tmp4
= assignNew('V', mce
, Ity_I64
, binop(Iop_Or64
, tmp2
, tmp3
));
1026 tmp1
= assignNew('V', mce
, Ity_I1
,
1027 unop(Iop_CmpNEZ64
, tmp4
));
1031 /* Chop it in half, OR the halves together, and compare that
1034 IRAtom
* tmp2
= assignNew('V', mce
, Ity_I64
, unop(Iop_V128HIto64
, vbits
));
1035 IRAtom
* tmp3
= assignNew('V', mce
, Ity_I64
, unop(Iop_V128to64
, vbits
));
1036 IRAtom
* tmp4
= assignNew('V', mce
, Ity_I64
, binop(Iop_Or64
, tmp2
, tmp3
));
1037 tmp1
= assignNew('V', mce
, Ity_I1
,
1038 unop(Iop_CmpNEZ64
, tmp4
));
1043 VG_(tool_panic
)("mkPCastTo(1)");
1046 /* Now widen up to the dst type. */
1051 return assignNew('V', mce
, Ity_I8
, unop(Iop_1Sto8
, tmp1
));
1053 return assignNew('V', mce
, Ity_I16
, unop(Iop_1Sto16
, tmp1
));
1055 return assignNew('V', mce
, Ity_I32
, unop(Iop_1Sto32
, tmp1
));
1057 return assignNew('V', mce
, Ity_I64
, unop(Iop_1Sto64
, tmp1
));
1059 tmp1
= assignNew('V', mce
, Ity_I64
, unop(Iop_1Sto64
, tmp1
));
1060 tmp1
= assignNew('V', mce
, Ity_V128
, binop(Iop_64HLtoV128
, tmp1
, tmp1
));
1063 tmp1
= assignNew('V', mce
, Ity_I64
, unop(Iop_1Sto64
, tmp1
));
1064 tmp1
= assignNew('V', mce
, Ity_I128
, binop(Iop_64HLto128
, tmp1
, tmp1
));
1067 tmp1
= assignNew('V', mce
, Ity_I64
, unop(Iop_1Sto64
, tmp1
));
1068 tmp1
= assignNew('V', mce
, Ity_V128
, binop(Iop_64HLtoV128
,
1070 tmp1
= assignNew('V', mce
, Ity_V256
, binop(Iop_V128HLtoV256
,
1075 VG_(tool_panic
)("mkPCastTo(2)");
1079 /* This is a minor variant. It takes an arg of some type and returns
1080 a value of the same type. The result consists entirely of Defined
1081 (zero) bits except its least significant bit, which is a PCast of
1082 the entire argument down to a single bit. */
1083 static IRAtom
* mkPCastXXtoXXlsb ( MCEnv
* mce
, IRAtom
* varg
, IRType ty
)
1085 if (ty
== Ity_V128
) {
1086 /* --- Case for V128 --- */
1087 IRAtom
* varg128
= varg
;
1088 // generates: PCast-to-I64(varg128)
1089 IRAtom
* pcdTo64
= mkPCastTo(mce
, Ity_I64
, varg128
);
1090 // Now introduce zeros (defined bits) in the top 63 places
1091 // generates: Def--(63)--Def PCast-to-I1(varg128)
1093 = assignNew('V', mce
, Ity_I64
, binop(Iop_And64
, pcdTo64
, mkU64(1)));
1094 // generates: Def--(64)--Def
1096 = definedOfType(Ity_I64
);
1097 // generates: Def--(127)--Def PCast-to-I1(varg128)
1099 = assignNew('V', mce
, Ity_V128
, binop(Iop_64HLtoV128
, d64
, d63pc
));
1102 if (ty
== Ity_I64
) {
1103 /* --- Case for I64 --- */
1105 IRAtom
* pcd
= mkPCastTo(mce
, Ity_I64
, varg
);
1106 // Zero (Def) out the top 63 bits
1108 = assignNew('V', mce
, Ity_I64
, binop(Iop_And64
, pcd
, mkU64(1)));
1115 /* --------- Optimistic casts. --------- */
1117 /* The function takes and returns an expression of type TY. If any of the
1118 VBITS indicate defined (value == 0) the resulting expression has all bits
1119 set to 0. Otherwise, all bits are 1. In words, if any bits are defined
1120 then all bits are made to be defined.
1122 In short we compute (vbits - (vbits >>u 1)) >>s (bitsize(vbits)-1).
1124 static IRAtom
* mkOCastAt( MCEnv
* mce
, IRType ty
, IRAtom
* vbits
)
1126 IROp opSUB
, opSHR
, opSAR
;
1131 opSUB
= Iop_Sub64
; opSHR
= Iop_Shr64
; opSAR
= Iop_Sar64
; sh
= 63;
1134 opSUB
= Iop_Sub32
; opSHR
= Iop_Shr32
; opSAR
= Iop_Sar32
; sh
= 31;
1137 opSUB
= Iop_Sub16
; opSHR
= Iop_Shr16
; opSAR
= Iop_Sar16
; sh
= 15;
1140 opSUB
= Iop_Sub8
; opSHR
= Iop_Shr8
; opSAR
= Iop_Sar8
; sh
= 7;
1144 VG_(tool_panic
)("mkOCastTo");
1148 shr1
= assignNew('V', mce
,ty
, binop(opSHR
, vbits
, mkU8(1)));
1149 at
= assignNew('V', mce
,ty
, binop(opSUB
, vbits
, shr1
));
1150 at
= assignNew('V', mce
,ty
, binop(opSAR
, at
, mkU8(sh
)));
1155 /* --------- Accurate interpretation of CmpEQ/CmpNE. --------- */
1157 Normally, we can do CmpEQ/CmpNE by doing UifU on the arguments, and
1158 PCasting to Ity_U1. However, sometimes it is necessary to be more
1159 accurate. The insight is that the result is defined if two
1160 corresponding bits can be found, one from each argument, so that
1161 both bits are defined but are different -- that makes EQ say "No"
1162 and NE say "Yes". Hence, we compute an improvement term and DifD
1163 it onto the "normal" (UifU) result.
1178 vec contains 0 (defined) bits where the corresponding arg bits
1179 are defined but different, and 1 bits otherwise.
1181 vec = Or<sz>( vxx, // 0 iff bit defined
1182 vyy, // 0 iff bit defined
1183 Not<sz>(Xor<sz>( xx, yy )) // 0 iff bits different
1186 If any bit of vec is 0, the result is defined and so the
1187 improvement term should produce 0...0, else it should produce
1190 Hence require for the improvement term:
1192 OCast(vec) = if vec == 1...1 then 1...1 else 0...0
1194 which you can think of as an "optimistic cast" (OCast, the opposite of
1195 the normal "pessimistic cast" (PCast) family. An OCast says all bits
1196 are defined if any bit is defined.
1198 It is possible to show that
1200 if vec == 1...1 then 1...1 else 0...0
1202 can be implemented in straight-line code as
1204 (vec - (vec >>u 1)) >>s (word-size-in-bits - 1)
1206 We note that vec contains the sub-term Or<sz>(vxx, vyy). Since UifU is
1207 implemented with Or (since 1 signifies undefinedness), this is a
1208 duplicate of the UifU<sz>(vxx, vyy) term and so we can CSE it out, giving
1211 let naive = UifU<sz>(vxx, vyy)
1212 vec = Or<sz>(naive, Not<sz>(Xor<sz)(xx, yy))
1214 PCastTo<1>( DifD<sz>(naive, OCast<sz>(vec)) )
1216 This was extensively re-analysed and checked on 6 July 05 and again
1219 static IRAtom
* expensiveCmpEQorNE ( MCEnv
* mce
,
1221 IRAtom
* vxx
, IRAtom
* vyy
,
1222 IRAtom
* xx
, IRAtom
* yy
)
1224 IRAtom
*naive
, *vec
, *improved
, *final_cast
;
1225 IROp opDIFD
, opUIFU
, opOR
, opXOR
, opNOT
;
1227 tl_assert(isShadowAtom(mce
,vxx
));
1228 tl_assert(isShadowAtom(mce
,vyy
));
1229 tl_assert(isOriginalAtom(mce
,xx
));
1230 tl_assert(isOriginalAtom(mce
,yy
));
1231 tl_assert(sameKindedAtoms(vxx
,xx
));
1232 tl_assert(sameKindedAtoms(vyy
,yy
));
1264 VG_(tool_panic
)("expensiveCmpEQorNE");
1268 = assignNew('V', mce
, ty
, binop(opUIFU
, vxx
, vyy
));
1278 assignNew('V', mce
,ty
, binop(opXOR
, xx
, yy
))))));
1281 = assignNew( 'V', mce
,ty
,
1282 binop(opDIFD
, naive
, mkOCastAt(mce
, ty
, vec
)));
1285 = mkPCastTo( mce
, Ity_I1
, improved
);
1290 /* Check if we can know, despite the uncertain bits, that xx is greater than yy.
1291 Notice that it's xx > yy and not the other way around. This is Intel syntax
1292 with destination first. It will appear reversed in gdb disassembly (AT&T
1295 static IRAtom
* expensiveCmpGT ( MCEnv
* mce
,
1297 IRAtom
* vxx
, IRAtom
* vyy
,
1298 IRAtom
* xx
, IRAtom
* yy
)
1300 IROp opAND
, opOR
, opXOR
, opNOT
, opSHL
;
1302 unsigned int word_size
;
1305 tl_assert(isShadowAtom(mce
,vxx
));
1306 tl_assert(isShadowAtom(mce
,vyy
));
1307 tl_assert(isOriginalAtom(mce
,xx
));
1308 tl_assert(isOriginalAtom(mce
,yy
));
1309 tl_assert(sameKindedAtoms(vxx
,xx
));
1310 tl_assert(sameKindedAtoms(vyy
,yy
));
1313 case Iop_CmpGT64Sx2
:
1314 case Iop_CmpGT64Ux2
:
1315 opSHL
= Iop_ShlN64x2
;
1318 case Iop_CmpGT32Sx4
:
1319 case Iop_CmpGT32Ux4
:
1320 opSHL
= Iop_ShlN32x4
;
1323 case Iop_CmpGT16Sx8
:
1324 case Iop_CmpGT16Ux8
:
1325 opSHL
= Iop_ShlN16x8
;
1328 case Iop_CmpGT8Sx16
:
1329 case Iop_CmpGT8Ux16
:
1330 opSHL
= Iop_ShlN8x16
;
1334 VG_(tool_panic
)("expensiveCmpGT");
1338 case Iop_CmpGT64Sx2
:
1339 case Iop_CmpGT32Sx4
:
1340 case Iop_CmpGT16Sx8
:
1341 case Iop_CmpGT8Sx16
:
1344 case Iop_CmpGT64Ux2
:
1345 case Iop_CmpGT32Ux4
:
1346 case Iop_CmpGT16Ux8
:
1347 case Iop_CmpGT8Ux16
:
1351 VG_(tool_panic
)("expensiveCmpGT");
1355 opAND
= Iop_AndV128
;
1357 opXOR
= Iop_XorV128
;
1358 opNOT
= Iop_NotV128
;
1362 // For unsigned it's easy to make the min and max: Just set the unknown
1363 // bits all to 0s or 1s. For signed it's harder because having a 1 in the
1364 // MSB makes a number smaller, not larger! We can work around this by
1365 // flipping the MSB before and after computing the min and max values.
1366 IRAtom
*all_ones
= mkV128(0xffff);
1367 MSBs
= assignNew('V', mce
, ty
, binop(opSHL
, all_ones
, mkU8(word_size
-1)));
1368 xx
= assignNew('V', mce
, ty
, binop(opXOR
, xx
, MSBs
));
1369 yy
= assignNew('V', mce
, ty
, binop(opXOR
, yy
, MSBs
));
1370 // From here on out, we're dealing with MSB-flipped integers.
1372 // We can combine xx and vxx to create two values: the largest that xx could
1373 // possibly be and the smallest that xx could possibly be. Likewise, we can
1374 // do the same for yy. We'll call those max_xx and min_xx and max_yy and
1376 IRAtom
*not_vxx
= assignNew('V', mce
, ty
, unop(opNOT
, vxx
));
1377 IRAtom
*not_vyy
= assignNew('V', mce
, ty
, unop(opNOT
, vyy
));
1378 IRAtom
*max_xx
= assignNew('V', mce
, ty
, binop(opOR
, xx
, vxx
));
1379 IRAtom
*min_xx
= assignNew('V', mce
, ty
, binop(opAND
, xx
, not_vxx
));
1380 IRAtom
*max_yy
= assignNew('V', mce
, ty
, binop(opOR
, yy
, vyy
));
1381 IRAtom
*min_yy
= assignNew('V', mce
, ty
, binop(opAND
, yy
, not_vyy
));
1384 max_xx
= assignNew('V', mce
, ty
, binop(opXOR
, max_xx
, MSBs
));
1385 min_xx
= assignNew('V', mce
, ty
, binop(opXOR
, min_xx
, MSBs
));
1386 max_yy
= assignNew('V', mce
, ty
, binop(opXOR
, max_yy
, MSBs
));
1387 min_yy
= assignNew('V', mce
, ty
, binop(opXOR
, min_yy
, MSBs
));
1389 IRAtom
*min_xx_gt_max_yy
= assignNew('V', mce
, ty
, binop(opGT
, min_xx
, max_yy
));
1390 IRAtom
*max_xx_gt_min_yy
= assignNew('V', mce
, ty
, binop(opGT
, max_xx
, min_yy
));
1391 // If min_xx is greater than max_yy then xx is surely greater than yy so we know
1392 // our answer for sure. If max_xx is not greater than min_yy then xx can't
1393 // possible be greater than yy so again we know the answer for sure. For all
1394 // other cases, we can't know.
1396 // So the result is defined if:
1398 // min_xx_gt_max_yy | ~max_xx_gt_min_yy
1400 // Because defined in vbits is 0s and not 1s, we need to invert that:
1402 // ~(min_xx_gt_max_yy | ~max_xx_gt_min_yy)
1404 // We can use DeMorgan's Law to simplify the above:
1406 // ~min_xx_gt_max_yy & max_xx_gt_min_yy
1407 IRAtom
*not_min_xx_gt_max_yy
= assignNew('V', mce
, ty
, unop(opNOT
, min_xx_gt_max_yy
));
1408 return assignNew('V', mce
, ty
, binop(opAND
, not_min_xx_gt_max_yy
, max_xx_gt_min_yy
));
1411 /* --------- Semi-accurate interpretation of CmpORD. --------- */
1413 /* CmpORD32{S,U} does PowerPC-style 3-way comparisons:
1415 CmpORD32S(x,y) = 1<<3 if x <s y
1419 and similarly the unsigned variant. The default interpretation is:
1421 CmpORD32{S,U}#(x,y,x#,y#) = PCast(x# `UifU` y#)
1424 The "& (7<<1)" reflects the fact that all result bits except 3,2,1
1425 are zero and therefore defined (viz, zero).
1427 Also deal with a special case better:
1431 Here, bit 3 (LT) of the result is a copy of the top bit of x and
1432 will be defined even if the rest of x isn't. In which case we do:
1434 CmpORD32S#(x,x#,0,{impliedly 0}#)
1435 = PCast(x#) & (3<<1) -- standard interp for GT#,EQ#
1436 | (x# >>u 31) << 3 -- LT# = x#[31]
1438 Analogous handling for CmpORD64{S,U}.
1440 static Bool
isZeroU32 ( IRAtom
* e
)
1443 toBool( e
->tag
== Iex_Const
1444 && e
->Iex
.Const
.con
->tag
== Ico_U32
1445 && e
->Iex
.Const
.con
->Ico
.U32
== 0 );
1448 static Bool
isZeroU64 ( IRAtom
* e
)
1451 toBool( e
->tag
== Iex_Const
1452 && e
->Iex
.Const
.con
->tag
== Ico_U64
1453 && e
->Iex
.Const
.con
->Ico
.U64
== 0 );
1456 static IRAtom
* doCmpORD ( MCEnv
* mce
,
1458 IRAtom
* xxhash
, IRAtom
* yyhash
,
1459 IRAtom
* xx
, IRAtom
* yy
)
1461 Bool m64
= cmp_op
== Iop_CmpORD64S
|| cmp_op
== Iop_CmpORD64U
;
1462 Bool syned
= cmp_op
== Iop_CmpORD64S
|| cmp_op
== Iop_CmpORD32S
;
1463 IROp opOR
= m64
? Iop_Or64
: Iop_Or32
;
1464 IROp opAND
= m64
? Iop_And64
: Iop_And32
;
1465 IROp opSHL
= m64
? Iop_Shl64
: Iop_Shl32
;
1466 IROp opSHR
= m64
? Iop_Shr64
: Iop_Shr32
;
1467 IROp op1UtoWS
= m64
? Iop_1Uto64
: Iop_1Uto32
;
1468 IRType ty
= m64
? Ity_I64
: Ity_I32
;
1469 Int width
= m64
? 64 : 32;
1471 Bool (*isZero
)(IRAtom
*) = m64
? isZeroU64
: isZeroU32
;
1473 tl_assert(isShadowAtom(mce
,xxhash
));
1474 tl_assert(isShadowAtom(mce
,yyhash
));
1475 tl_assert(isOriginalAtom(mce
,xx
));
1476 tl_assert(isOriginalAtom(mce
,yy
));
1477 tl_assert(sameKindedAtoms(xxhash
,xx
));
1478 tl_assert(sameKindedAtoms(yyhash
,yy
));
1479 tl_assert(cmp_op
== Iop_CmpORD32S
|| cmp_op
== Iop_CmpORD32U
1480 || cmp_op
== Iop_CmpORD64S
|| cmp_op
== Iop_CmpORD64U
);
1483 ppIROp(cmp_op
); VG_(printf
)(" ");
1484 ppIRExpr(xx
); VG_(printf
)(" "); ppIRExpr( yy
); VG_(printf
)("\n");
1487 if (syned
&& isZero(yy
)) {
1488 /* fancy interpretation */
1489 /* if yy is zero, then it must be fully defined (zero#). */
1490 tl_assert(isZero(yyhash
));
1491 // This is still inaccurate, but I don't think it matters, since
1492 // nobody writes code of the form
1493 // "is <partially-undefined-value> signedly greater than zero?".
1494 // We therefore simply declare "x >s 0" to be undefined if any bit in
1495 // x is undefined. That's clearly suboptimal in some cases. Eg, if
1496 // the highest order bit is a defined 1 then x is negative so it
1497 // doesn't matter whether the remaining bits are defined or not.
1503 mkPCastTo(mce
,ty
, xxhash
),
1504 m64
? mkU64(1<<2) : mkU32(1<<2)
1506 // For "x <s 0", we can just copy the definedness of the top bit of x
1507 // and we have a precise result.
1515 binop(opSHR
, xxhash
, mkU8(width
-1))),
1518 // For "x == 0" we can hand the problem off to expensiveCmpEQorNE.
1524 assignNew('V', mce
,ty
,
1527 expensiveCmpEQorNE(mce
, ty
, xxhash
, yyhash
, xx
, yy
))
1534 assignNew('V', mce
,ty
, binop(opOR
, t_lt_0_0_0
, t_0_gt_0_0
)),
1538 /* standard interpretation */
1539 IRAtom
* sevenLeft1
= m64
? mkU64(7<<1) : mkU32(7<<1);
1544 mkUifU(mce
,ty
, xxhash
,yyhash
)),
1551 /*------------------------------------------------------------*/
1552 /*--- Emit a test and complaint if something is undefined. ---*/
1553 /*------------------------------------------------------------*/
1555 static IRAtom
* schemeE ( MCEnv
* mce
, IRExpr
* e
); /* fwds */
1558 /* Set the annotations on a dirty helper to indicate that the stack
1559 pointer and instruction pointers might be read. This is the
1560 behaviour of all 'emit-a-complaint' style functions we might
1563 static void setHelperAnns ( MCEnv
* mce
, IRDirty
* di
) {
1565 di
->fxState
[0].fx
= Ifx_Read
;
1566 di
->fxState
[0].offset
= mce
->layout
->offset_SP
;
1567 di
->fxState
[0].size
= mce
->layout
->sizeof_SP
;
1568 di
->fxState
[0].nRepeats
= 0;
1569 di
->fxState
[0].repeatLen
= 0;
1570 di
->fxState
[1].fx
= Ifx_Read
;
1571 di
->fxState
[1].offset
= mce
->layout
->offset_IP
;
1572 di
->fxState
[1].size
= mce
->layout
->sizeof_IP
;
1573 di
->fxState
[1].nRepeats
= 0;
1574 di
->fxState
[1].repeatLen
= 0;
1578 /* Check the supplied *original* |atom| for undefinedness, and emit a
1579 complaint if so. Once that happens, mark it as defined. This is
1580 possible because the atom is either a tmp or literal. If it's a
1581 tmp, it will be shadowed by a tmp, and so we can set the shadow to
1582 be defined. In fact as mentioned above, we will have to allocate a
1583 new tmp to carry the new 'defined' shadow value, and update the
1584 original->tmp mapping accordingly; we cannot simply assign a new
1585 value to an existing shadow tmp as this breaks SSAness.
1587 The checks are performed, any resulting complaint emitted, and
1588 |atom|'s shadow temp set to 'defined', ONLY in the case that
1589 |guard| evaluates to True at run-time. If it evaluates to False
1590 then no action is performed. If |guard| is NULL (the usual case)
1591 then it is assumed to be always-true, and hence these actions are
1592 performed unconditionally.
1594 This routine does not generate code to check the definedness of
1595 |guard|. The caller is assumed to have taken care of that already.
1597 static void complainIfUndefined ( MCEnv
* mce
, IRAtom
* atom
, IRExpr
*guard
)
1610 // Don't do V bit tests if we're not reporting undefined value errors.
1611 if (MC_(clo_mc_level
) == 1)
1615 tl_assert(isOriginalAtom(mce
, guard
));
1617 /* Since the original expression is atomic, there's no duplicated
1618 work generated by making multiple V-expressions for it. So we
1619 don't really care about the possibility that someone else may
1620 also create a V-interpretion for it. */
1621 tl_assert(isOriginalAtom(mce
, atom
));
1622 vatom
= expr2vbits( mce
, atom
, HuOth
);
1623 tl_assert(isShadowAtom(mce
, vatom
));
1624 tl_assert(sameKindedAtoms(atom
, vatom
));
1626 ty
= typeOfIRExpr(mce
->sb
->tyenv
, vatom
);
1628 /* sz is only used for constructing the error message */
1629 sz
= ty
==Ity_I1
? 0 : sizeofIRType(ty
);
1631 cond
= mkPCastTo( mce
, Ity_I1
, vatom
);
1632 /* cond will be 0 if all defined, and 1 if any not defined. */
1634 /* Get the origin info for the value we are about to check. At
1635 least, if we are doing origin tracking. If not, use a dummy
1637 if (MC_(clo_mc_level
) == 3) {
1638 origin
= schemeE( mce
, atom
);
1639 if (mce
->hWordTy
== Ity_I64
) {
1640 origin
= assignNew( 'B', mce
, Ity_I64
, unop(Iop_32Uto64
, origin
) );
1654 fn
= &MC_(helperc_value_check0_fail_w_o
);
1655 nm
= "MC_(helperc_value_check0_fail_w_o)";
1656 args
= mkIRExprVec_1(origin
);
1659 fn
= &MC_(helperc_value_check0_fail_no_o
);
1660 nm
= "MC_(helperc_value_check0_fail_no_o)";
1661 args
= mkIRExprVec_0();
1667 fn
= &MC_(helperc_value_check1_fail_w_o
);
1668 nm
= "MC_(helperc_value_check1_fail_w_o)";
1669 args
= mkIRExprVec_1(origin
);
1672 fn
= &MC_(helperc_value_check1_fail_no_o
);
1673 nm
= "MC_(helperc_value_check1_fail_no_o)";
1674 args
= mkIRExprVec_0();
1680 fn
= &MC_(helperc_value_check4_fail_w_o
);
1681 nm
= "MC_(helperc_value_check4_fail_w_o)";
1682 args
= mkIRExprVec_1(origin
);
1685 fn
= &MC_(helperc_value_check4_fail_no_o
);
1686 nm
= "MC_(helperc_value_check4_fail_no_o)";
1687 args
= mkIRExprVec_0();
1693 fn
= &MC_(helperc_value_check8_fail_w_o
);
1694 nm
= "MC_(helperc_value_check8_fail_w_o)";
1695 args
= mkIRExprVec_1(origin
);
1698 fn
= &MC_(helperc_value_check8_fail_no_o
);
1699 nm
= "MC_(helperc_value_check8_fail_no_o)";
1700 args
= mkIRExprVec_0();
1707 fn
= &MC_(helperc_value_checkN_fail_w_o
);
1708 nm
= "MC_(helperc_value_checkN_fail_w_o)";
1709 args
= mkIRExprVec_2( mkIRExpr_HWord( sz
), origin
);
1712 fn
= &MC_(helperc_value_checkN_fail_no_o
);
1713 nm
= "MC_(helperc_value_checkN_fail_no_o)";
1714 args
= mkIRExprVec_1( mkIRExpr_HWord( sz
) );
1719 VG_(tool_panic
)("unexpected szB");
1725 tl_assert(nargs
>= 0 && nargs
<= 2);
1726 tl_assert( (MC_(clo_mc_level
) == 3 && origin
!= NULL
)
1727 || (MC_(clo_mc_level
) == 2 && origin
== NULL
) );
1729 di
= unsafeIRDirty_0_N( nargs
/*regparms*/, nm
,
1730 VG_(fnptr_to_fnentry
)( fn
), args
);
1731 di
->guard
= cond
; // and cond is PCast-to-1(atom#)
1733 /* If the complaint is to be issued under a guard condition, AND
1734 that into the guard condition for the helper call. */
1736 IRAtom
*g1
= assignNew('V', mce
, Ity_I32
, unop(Iop_1Uto32
, di
->guard
));
1737 IRAtom
*g2
= assignNew('V', mce
, Ity_I32
, unop(Iop_1Uto32
, guard
));
1738 IRAtom
*e
= assignNew('V', mce
, Ity_I32
, binop(Iop_And32
, g1
, g2
));
1739 di
->guard
= assignNew('V', mce
, Ity_I1
, unop(Iop_32to1
, e
));
1742 setHelperAnns( mce
, di
);
1743 stmt( 'V', mce
, IRStmt_Dirty(di
));
1745 /* If |atom| is shadowed by an IRTemp, set the shadow tmp to be
1746 defined -- but only in the case where the guard evaluates to
1747 True at run-time. Do the update by setting the orig->shadow
1748 mapping for tmp to reflect the fact that this shadow is getting
1750 tl_assert(isIRAtom(vatom
));
1751 /* sameKindedAtoms ... */
1752 if (vatom
->tag
== Iex_RdTmp
) {
1753 tl_assert(atom
->tag
== Iex_RdTmp
);
1754 if (guard
== NULL
) {
1755 // guard is 'always True', hence update unconditionally
1756 newShadowTmpV(mce
, atom
->Iex
.RdTmp
.tmp
);
1757 assign('V', mce
, findShadowTmpV(mce
, atom
->Iex
.RdTmp
.tmp
),
1760 // update the temp only conditionally. Do this by copying
1761 // its old value when the guard is False.
1763 IRTemp old_tmpV
= findShadowTmpV(mce
, atom
->Iex
.RdTmp
.tmp
);
1764 newShadowTmpV(mce
, atom
->Iex
.RdTmp
.tmp
);
1766 = assignNew('V', mce
, shadowTypeV(ty
),
1767 IRExpr_ITE(guard
, definedOfType(ty
),
1769 assign('V', mce
, findShadowTmpV(mce
, atom
->Iex
.RdTmp
.tmp
), new_tmpV
);
1775 /*------------------------------------------------------------*/
1776 /*--- Shadowing PUTs/GETs, and indexed variants thereof ---*/
1777 /*------------------------------------------------------------*/
1779 /* Examine the always-defined sections declared in layout to see if
1780 the (offset,size) section is within one. Note, is is an error to
1781 partially fall into such a region: (offset,size) should either be
1782 completely in such a region or completely not-in such a region.
1784 static Bool
isAlwaysDefd ( MCEnv
* mce
, Int offset
, Int size
)
1786 Int minoffD
, maxoffD
, i
;
1787 Int minoff
= offset
;
1788 Int maxoff
= minoff
+ size
- 1;
1789 tl_assert((minoff
& ~0xFFFF) == 0);
1790 tl_assert((maxoff
& ~0xFFFF) == 0);
1792 for (i
= 0; i
< mce
->layout
->n_alwaysDefd
; i
++) {
1793 minoffD
= mce
->layout
->alwaysDefd
[i
].offset
;
1794 maxoffD
= minoffD
+ mce
->layout
->alwaysDefd
[i
].size
- 1;
1795 tl_assert((minoffD
& ~0xFFFF) == 0);
1796 tl_assert((maxoffD
& ~0xFFFF) == 0);
1798 if (maxoff
< minoffD
|| maxoffD
< minoff
)
1799 continue; /* no overlap */
1800 if (minoff
>= minoffD
&& maxoff
<= maxoffD
)
1801 return True
; /* completely contained in an always-defd section */
1803 VG_(tool_panic
)("memcheck:isAlwaysDefd:partial overlap");
1805 return False
; /* could not find any containing section */
/* Generate into bb suitable actions to shadow this Put.  If the state
   slice is marked 'always defined', do nothing.  Otherwise, write the
   supplied V bits to the shadow state.  We can pass in either an
   original atom or a V-atom, but not both.  In the former case the
   relevant V-bits are then generated from the original.
   We assume here that the definedness of GUARD has already been checked.
*/
static
void do_shadow_PUT ( MCEnv* mce,  Int offset,
                     IRAtom* atom, IRAtom* vatom, IRExpr *guard )
{
   IRType ty;

   // Don't do shadow PUTs if we're not doing undefined value checking.
   // Their absence lets Vex's optimiser remove all the shadow computation
   // that they depend on, which includes GETs of the shadow registers.
   if (MC_(clo_mc_level) == 1)
      return;

   if (atom) {
      tl_assert(!vatom);
      tl_assert(isOriginalAtom(mce, atom));
      vatom = expr2vbits( mce, atom, HuOth );
   } else {
      tl_assert(vatom);
      tl_assert(isShadowAtom(mce, vatom));
   }

   ty = typeOfIRExpr(mce->sb->tyenv, vatom);
   tl_assert(ty != Ity_I1);
   if (isAlwaysDefd(mce, offset, sizeofIRType(ty))) {
      /* emit code to emit a complaint if any of the vbits are 1. */
      /* complainIfUndefined(mce, atom); */
   } else {
      /* Do a plain shadow Put. */
      if (guard) {
         /* If the guard expression evaluates to false we simply Put the value
            that is already stored in the guest state slot */
         IRAtom *cond, *iffalse;

         cond    = assignNew('V', mce, Ity_I1, guard);
         iffalse = assignNew('V', mce, ty,
                             IRExpr_Get(offset + mce->layout->total_sizeB, ty));
         vatom   = assignNew('V', mce, ty, IRExpr_ITE(cond, vatom, iffalse));
      }
      stmt( 'V', mce, IRStmt_Put( offset + mce->layout->total_sizeB, vatom ));
   }
}
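
/* Illustrative sketch (hypothetical guest offset and temp names, not taken
   from any particular guest): shadowing "PUT(40) = t5" under a guard g
   produces, in effect,

      t5#'  = ITE(g, t5#, GET:I32(40 + layout->total_sizeB))
      PUT(40 + layout->total_sizeB) = t5#'

   so when g is false the shadow slot is simply rewritten with its old
   value and the guest's shadow state is unchanged. */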
/* Generate into bb suitable actions to shadow this PutI (passed in
   in pieces).
*/
static
void do_shadow_PUTI ( MCEnv* mce, IRPutI *puti)
{
   IRAtom*     vatom;
   IRType      ty, tyS;
   Int         arrSize;
   IRRegArray* descr = puti->descr;
   IRAtom*     ix    = puti->ix;
   Int         bias  = puti->bias;
   IRAtom*     atom  = puti->data;

   // Don't do shadow PUTIs if we're not doing undefined value checking.
   // Their absence lets Vex's optimiser remove all the shadow computation
   // that they depend on, which includes GETIs of the shadow registers.
   if (MC_(clo_mc_level) == 1)
      return;

   tl_assert(isOriginalAtom(mce,atom));
   vatom = expr2vbits( mce, atom, HuOth );
   tl_assert(sameKindedAtoms(atom, vatom));
   ty  = descr->elemTy;
   tyS = shadowTypeV(ty);
   arrSize = descr->nElems * sizeofIRType(ty);
   tl_assert(ty != Ity_I1);
   tl_assert(isOriginalAtom(mce,ix));
   complainIfUndefined(mce, ix, NULL);
   if (isAlwaysDefd(mce, descr->base, arrSize)) {
      /* emit code to emit a complaint if any of the vbits are 1. */
      /* complainIfUndefined(mce, atom); */
   } else {
      /* Do a cloned version of the Put that refers to the shadow
         area. */
      IRRegArray* new_descr
         = mkIRRegArray( descr->base + mce->layout->total_sizeB,
                         tyS, descr->nElems);
      stmt( 'V', mce, IRStmt_PutI( mkIRPutI(new_descr, ix, bias, vatom) ));
   }
}
/* Return an expression which contains the V bits corresponding to the
   given GET (passed in in pieces).
*/
static
IRExpr* shadow_GET ( MCEnv* mce, Int offset, IRType ty )
{
   IRType tyS = shadowTypeV(ty);
   tl_assert(ty != Ity_I1);
   tl_assert(ty != Ity_I128);
   if (isAlwaysDefd(mce, offset, sizeofIRType(ty))) {
      /* Always defined, return all zeroes of the relevant type */
      return definedOfType(tyS);
   } else {
      /* return a cloned version of the Get that refers to the shadow
         area. */
      /* FIXME: this isn't an atom! */
      return IRExpr_Get( offset + mce->layout->total_sizeB, tyS );
   }
}
/* Return an expression which contains the V bits corresponding to the
   given GETI (passed in in pieces).
*/
static
IRExpr* shadow_GETI ( MCEnv* mce,
                      IRRegArray* descr, IRAtom* ix, Int bias )
{
   IRType ty   = descr->elemTy;
   IRType tyS  = shadowTypeV(ty);
   Int arrSize = descr->nElems * sizeofIRType(ty);
   tl_assert(ty != Ity_I1);
   tl_assert(isOriginalAtom(mce,ix));
   complainIfUndefined(mce, ix, NULL);
   if (isAlwaysDefd(mce, descr->base, arrSize)) {
      /* Always defined, return all zeroes of the relevant type */
      return definedOfType(tyS);
   } else {
      /* return a cloned version of the Get that refers to the shadow
         area. */
      IRRegArray* new_descr
         = mkIRRegArray( descr->base + mce->layout->total_sizeB,
                         tyS, descr->nElems);
      return IRExpr_GetI( new_descr, ix, bias );
   }
}
/*------------------------------------------------------------*/
/*--- Generating approximations for unknown operations,    ---*/
/*--- using lazy-propagate semantics                       ---*/
/*------------------------------------------------------------*/

/* Lazy propagation of undefinedness from two values, resulting in the
   specified shadow type.
*/
static
IRAtom* mkLazy2 ( MCEnv* mce, IRType finalVty, IRAtom* va1, IRAtom* va2 )
{
   IRAtom* at;
   IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
   IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
   tl_assert(isShadowAtom(mce,va1));
   tl_assert(isShadowAtom(mce,va2));

   /* The general case is inefficient because PCast is an expensive
      operation.  Here are some special cases which use PCast only
      once rather than twice. */

   /* I64 x I64 -> I64 */
   if (t1 == Ity_I64 && t2 == Ity_I64 && finalVty == Ity_I64) {
      if (0) VG_(printf)("mkLazy2: I64 x I64 -> I64\n");
      at = mkUifU(mce, Ity_I64, va1, va2);
      at = mkPCastTo(mce, Ity_I64, at);
      return at;
   }

   /* I64 x I64 -> I32 */
   if (t1 == Ity_I64 && t2 == Ity_I64 && finalVty == Ity_I32) {
      if (0) VG_(printf)("mkLazy2: I64 x I64 -> I32\n");
      at = mkUifU(mce, Ity_I64, va1, va2);
      at = mkPCastTo(mce, Ity_I32, at);
      return at;
   }

   /* I32 x I32 -> I32 */
   if (t1 == Ity_I32 && t2 == Ity_I32 && finalVty == Ity_I32) {
      if (0) VG_(printf)("mkLazy2: I32 x I32 -> I32\n");
      at = mkUifU(mce, Ity_I32, va1, va2);
      at = mkPCastTo(mce, Ity_I32, at);
      return at;
   }

   if (0) VG_(printf)("mkLazy2 ");

   /* General case: force everything via 32-bit intermediaries. */
   at = mkPCastTo(mce, Ity_I32, va1);
   at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va2));
   at = mkPCastTo(mce, finalVty, at);
   return at;
}
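
/* Example of the general case, for illustration only (the op shape is
   hypothetical): to approximate some otherwise-unhandled I64 x I64 -> I8
   operation one would write

      IRAtom* v = mkLazy2(mce, Ity_I8, vatom1, vatom2);

   which builds PCast8( UifU32( PCast32(vatom1), PCast32(vatom2) ) ):
   if any bit of either argument is undefined, every bit of the result
   is marked undefined. */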
/* 3-arg version of the above. */
static
IRAtom* mkLazy3 ( MCEnv* mce, IRType finalVty,
                  IRAtom* va1, IRAtom* va2, IRAtom* va3 )
{
   IRAtom* at;
   IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
   IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
   IRType t3 = typeOfIRExpr(mce->sb->tyenv, va3);
   tl_assert(isShadowAtom(mce,va1));
   tl_assert(isShadowAtom(mce,va2));
   tl_assert(isShadowAtom(mce,va3));

   /* The general case is inefficient because PCast is an expensive
      operation.  Here are some special cases which use PCast only
      twice rather than three times. */

   /* I32 x I64 x I64 -> I64 */
   /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
   if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64
       && finalVty == Ity_I64) {
      if (0) VG_(printf)("mkLazy3: I32 x I64 x I64 -> I64\n");
      /* Widen 1st arg to I64.  Since 1st arg is typically a rounding
         mode indication which is fully defined, this should get
         folded out later. */
      at = mkPCastTo(mce, Ity_I64, va1);
      /* Now fold in 2nd and 3rd args. */
      at = mkUifU(mce, Ity_I64, at, va2);
      at = mkUifU(mce, Ity_I64, at, va3);
      /* and PCast once again. */
      at = mkPCastTo(mce, Ity_I64, at);
      return at;
   }

   /* I32 x I8 x I64 -> I64 */
   if (t1 == Ity_I32 && t2 == Ity_I8 && t3 == Ity_I64
       && finalVty == Ity_I64) {
      if (0) VG_(printf)("mkLazy3: I32 x I8 x I64 -> I64\n");
      /* Widen 1st and 2nd args to I64.  Since 1st arg is typically a
       * rounding mode indication which is fully defined, this should
       * get folded out later.
       */
      IRAtom* at1 = mkPCastTo(mce, Ity_I64, va1);
      IRAtom* at2 = mkPCastTo(mce, Ity_I64, va2);
      at = mkUifU(mce, Ity_I64, at1, at2);  // UifU(PCast(va1), PCast(va2))
      at = mkUifU(mce, Ity_I64, at, va3);
      /* and PCast once again. */
      at = mkPCastTo(mce, Ity_I64, at);
      return at;
   }

   /* I32 x I64 x I64 -> I32 */
   if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64
       && finalVty == Ity_I32) {
      if (0) VG_(printf)("mkLazy3: I32 x I64 x I64 -> I32\n");
      at = mkPCastTo(mce, Ity_I64, va1);
      at = mkUifU(mce, Ity_I64, at, va2);
      at = mkUifU(mce, Ity_I64, at, va3);
      at = mkPCastTo(mce, Ity_I32, at);
      return at;
   }

   /* I32 x I32 x I32 -> I32 */
   /* 32-bit FP idiom, as (eg) happens on ARM */
   if (t1 == Ity_I32 && t2 == Ity_I32 && t3 == Ity_I32
       && finalVty == Ity_I32) {
      if (0) VG_(printf)("mkLazy3: I32 x I32 x I32 -> I32\n");
      at = va1;
      at = mkUifU(mce, Ity_I32, at, va2);
      at = mkUifU(mce, Ity_I32, at, va3);
      at = mkPCastTo(mce, Ity_I32, at);
      return at;
   }

   /* I32 x I16 x I16 -> I16 */
   /* 16-bit half-precision FP idiom, as (eg) happens on arm64 v8.2 onwards */
   if (t1 == Ity_I32 && t2 == Ity_I16 && t3 == Ity_I16
       && finalVty == Ity_I16) {
      if (0) VG_(printf)("mkLazy3: I32 x I16 x I16 -> I16\n");
      at = mkPCastTo(mce, Ity_I16, va1);
      at = mkUifU(mce, Ity_I16, at, va2);
      at = mkUifU(mce, Ity_I16, at, va3);
      at = mkPCastTo(mce, Ity_I16, at);
      return at;
   }

   /* I32 x I128 x I128 -> I128 */
   /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
   if (t1 == Ity_I32 && t2 == Ity_I128 && t3 == Ity_I128
       && finalVty == Ity_I128) {
      if (0) VG_(printf)("mkLazy3: I32 x I128 x I128 -> I128\n");
      /* Widen 1st arg to I128.  Since 1st arg is typically a rounding
         mode indication which is fully defined, this should get
         folded out later. */
      at = mkPCastTo(mce, Ity_I128, va1);
      /* Now fold in 2nd and 3rd args. */
      at = mkUifU(mce, Ity_I128, at, va2);
      at = mkUifU(mce, Ity_I128, at, va3);
      /* and PCast once again. */
      at = mkPCastTo(mce, Ity_I128, at);
      return at;
   }

   /* I32 x I8 x I128 -> I128 */
   /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
   if (t1 == Ity_I32 && t2 == Ity_I8 && t3 == Ity_I128
       && finalVty == Ity_I128) {
      if (0) VG_(printf)("mkLazy3: I32 x I8 x I128 -> I128\n");
      /* Use I64 as an intermediate type, which means PCasting all 3
         args to I64 to start with.  1st arg is typically a rounding
         mode indication which is fully defined, so we hope that it
         will get folded out later. */
      IRAtom* at1 = mkPCastTo(mce, Ity_I64, va1);
      IRAtom* at2 = mkPCastTo(mce, Ity_I64, va2);
      IRAtom* at3 = mkPCastTo(mce, Ity_I64, va3);
      /* Now UifU all three together. */
      at = mkUifU(mce, Ity_I64, at1, at2);  // UifU(PCast(va1), PCast(va2))
      at = mkUifU(mce, Ity_I64, at, at3);   // ... `UifU` PCast(va3)
      /* and PCast once again. */
      at = mkPCastTo(mce, Ity_I128, at);
      return at;
   }

   if (0) {
      VG_(printf)("mkLazy3: ");
      VG_(printf)(" -> ");
   }

   /* General case: force everything via 32-bit intermediaries. */
   at = mkPCastTo(mce, Ity_I32, va1);
   at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va2));
   at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va3));
   at = mkPCastTo(mce, finalVty, at);
   return at;
}
/* 4-arg version of the above. */
static
IRAtom* mkLazy4 ( MCEnv* mce, IRType finalVty,
                  IRAtom* va1, IRAtom* va2, IRAtom* va3, IRAtom* va4 )
{
   IRAtom* at;
   IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
   IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
   IRType t3 = typeOfIRExpr(mce->sb->tyenv, va3);
   IRType t4 = typeOfIRExpr(mce->sb->tyenv, va4);
   tl_assert(isShadowAtom(mce,va1));
   tl_assert(isShadowAtom(mce,va2));
   tl_assert(isShadowAtom(mce,va3));
   tl_assert(isShadowAtom(mce,va4));

   /* The general case is inefficient because PCast is an expensive
      operation.  Here are some special cases which use PCast only
      twice rather than three times. */

   /* I32 x I128 x I128 x I128 -> I128 */
   /* Standard FP idiom: rm x FParg1 x FParg2 x FParg3 -> FPresult */
   if (t1 == Ity_I32 && t2 == Ity_I128 && t3 == Ity_I128 && t4 == Ity_I128
       && finalVty == Ity_I128) {
      if (0) VG_(printf)("mkLazy4: I32 x I128 x I128 x I128 -> I128\n");
      /* Widen 1st arg to I128.  Since 1st arg is typically a rounding
         mode indication which is fully defined, this should get
         folded out later. */
      at = mkPCastTo(mce, Ity_I128, va1);
      /* Now fold in 2nd, 3rd, 4th args. */
      at = mkUifU(mce, Ity_I128, at, va2);
      at = mkUifU(mce, Ity_I128, at, va3);
      at = mkUifU(mce, Ity_I128, at, va4);
      /* and PCast once again. */
      at = mkPCastTo(mce, Ity_I128, at);
      return at;
   }

   /* I32 x I64 x I64 x I64 -> I64 */
   if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64 && t4 == Ity_I64
       && finalVty == Ity_I64) {
      if (0) VG_(printf)("mkLazy4: I32 x I64 x I64 x I64 -> I64\n");
      /* Widen 1st arg to I64.  Since 1st arg is typically a rounding
         mode indication which is fully defined, this should get
         folded out later. */
      at = mkPCastTo(mce, Ity_I64, va1);
      /* Now fold in 2nd, 3rd, 4th args. */
      at = mkUifU(mce, Ity_I64, at, va2);
      at = mkUifU(mce, Ity_I64, at, va3);
      at = mkUifU(mce, Ity_I64, at, va4);
      /* and PCast once again. */
      at = mkPCastTo(mce, Ity_I64, at);
      return at;
   }

   /* I32 x I32 x I32 x I32 -> I32 */
   /* Standard FP idiom: rm x FParg1 x FParg2 x FParg3 -> FPresult */
   if (t1 == Ity_I32 && t2 == Ity_I32 && t3 == Ity_I32 && t4 == Ity_I32
       && finalVty == Ity_I32) {
      if (0) VG_(printf)("mkLazy4: I32 x I32 x I32 x I32 -> I32\n");
      at = va1;
      /* Now fold in 2nd, 3rd, 4th args. */
      at = mkUifU(mce, Ity_I32, at, va2);
      at = mkUifU(mce, Ity_I32, at, va3);
      at = mkUifU(mce, Ity_I32, at, va4);
      at = mkPCastTo(mce, Ity_I32, at);
      return at;
   }

   /* I32 x I8 x I8 x I8 -> I32 */
   if (t1 == Ity_I32 && t2 == Ity_I8 && t3 == Ity_I8 && t4 == Ity_I8
       && finalVty == Ity_I32) {
      if (0) VG_(printf)("mkLazy4: I32 x I8 x I8 x I8 -> I32\n");
      at = mkPCastTo(mce, Ity_I8, va1);
      /* Now fold in 2nd, 3rd, 4th args. */
      at = mkUifU(mce, Ity_I8, at, va2);
      at = mkUifU(mce, Ity_I8, at, va3);
      at = mkUifU(mce, Ity_I8, at, va4);
      at = mkPCastTo(mce, Ity_I32, at);
      return at;
   }

   /* I64 x I8 x I8 x I8 -> I64 */
   if (t1 == Ity_I64 && t2 == Ity_I8 && t3 == Ity_I8 && t4 == Ity_I8
       && finalVty == Ity_I64) {
      if (0) VG_(printf)("mkLazy4: I64 x I8 x I8 x I8 -> I64\n");
      at = mkPCastTo(mce, Ity_I8, va1);
      /* Now fold in 2nd, 3rd, 4th args. */
      at = mkUifU(mce, Ity_I8, at, va2);
      at = mkUifU(mce, Ity_I8, at, va3);
      at = mkUifU(mce, Ity_I8, at, va4);
      at = mkPCastTo(mce, Ity_I64, at);
      return at;
   }

   if (0) {
      VG_(printf)("mkLazy4: ");
      VG_(printf)(" -> ");
   }

   tl_assert(0);
}
/* Do the lazy propagation game from a null-terminated vector of
   atoms.  This is presumably the arguments to a helper call, so the
   IRCallee info is also supplied in order that we can know which
   arguments should be ignored (via the .mcx_mask field).
*/
static
IRAtom* mkLazyN ( MCEnv* mce,
                  IRAtom** exprvec, IRType finalVtype, IRCallee* cee )
{
   Int     i;
   IRAtom* here;
   IRAtom* curr;
   IRType  mergeTy;
   Bool    mergeTy64 = True;

   /* Decide on the type of the merge intermediary.  If all relevant
      args are I64, then it's I64.  In all other circumstances, use
      I32. */
   for (i = 0; exprvec[i]; i++) {
      tl_assert(isOriginalAtom(mce, exprvec[i]));
      if (cee->mcx_mask & (1<<i))
         continue;
      if (typeOfIRExpr(mce->sb->tyenv, exprvec[i]) != Ity_I64)
         mergeTy64 = False;
   }

   mergeTy = mergeTy64  ? Ity_I64  : Ity_I32;
   curr    = definedOfType(mergeTy);

   for (i = 0; exprvec[i]; i++) {
      tl_assert(isOriginalAtom(mce, exprvec[i]));
      /* Only take notice of this arg if the callee's mc-exclusion
         mask does not say it is to be excluded. */
      if (cee->mcx_mask & (1<<i)) {
         /* the arg is to be excluded from definedness checking.  Do
            nothing. */
         if (0) VG_(printf)("excluding %s(%d)\n", cee->name, i);
      } else {
         /* calculate the arg's definedness, and pessimistically merge
            it in. */
         here = mkPCastTo( mce, mergeTy, expr2vbits(mce, exprvec[i], HuOth) );
         curr = mergeTy64
                   ? mkUifU64(mce, here, curr)
                   : mkUifU32(mce, here, curr);
      }
   }
   return mkPCastTo(mce, finalVtype, curr );
}
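
/* Illustrative use (the helper name and mask are hypothetical): for a
   call to a clean helper foo whose IRCallee has mcx_mask = 1, bit 0 of
   the mask excludes argument 0 from the merge.  The returned shadow is
   then

      PCast(finalVtype)( UifU( PCast(arg1#), PCast(arg2#) ) )

   computed at I64 if arg1 and arg2 are both I64, and at I32 otherwise. */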
/*------------------------------------------------------------*/
/*--- Generating expensive sequences for exact carry-chain ---*/
/*--- propagation in add/sub and related operations.       ---*/
/*------------------------------------------------------------*/

static
IRAtom* expensiveAddSub ( MCEnv*  mce,
                          Bool    add,
                          IRType  ty,
                          IRAtom* qaa, IRAtom* qbb,
                          IRAtom* aa,  IRAtom* bb )
{
   IRAtom *a_min, *b_min, *a_max, *b_max;
   IROp   opAND, opOR, opXOR, opNOT, opADD, opSUB;

   tl_assert(isShadowAtom(mce,qaa));
   tl_assert(isShadowAtom(mce,qbb));
   tl_assert(isOriginalAtom(mce,aa));
   tl_assert(isOriginalAtom(mce,bb));
   tl_assert(sameKindedAtoms(qaa,aa));
   tl_assert(sameKindedAtoms(qbb,bb));

   switch (ty) {
      case Ity_I32:
         opAND = Iop_And32; opOR  = Iop_Or32;  opXOR = Iop_Xor32;
         opNOT = Iop_Not32; opADD = Iop_Add32; opSUB = Iop_Sub32;
         break;
      case Ity_I64:
         opAND = Iop_And64; opOR  = Iop_Or64;  opXOR = Iop_Xor64;
         opNOT = Iop_Not64; opADD = Iop_Add64; opSUB = Iop_Sub64;
         break;
      default:
         VG_(tool_panic)("expensiveAddSub");
   }

   // a_min = aa & ~qaa
   a_min = assignNew('V', mce,ty,
                     binop(opAND, aa,
                           assignNew('V', mce,ty, unop(opNOT, qaa))));

   // b_min = bb & ~qbb
   b_min = assignNew('V', mce,ty,
                     binop(opAND, bb,
                           assignNew('V', mce,ty, unop(opNOT, qbb))));

   // a_max = aa | qaa
   a_max = assignNew('V', mce,ty, binop(opOR, aa, qaa));

   // b_max = bb | qbb
   b_max = assignNew('V', mce,ty, binop(opOR, bb, qbb));

   if (add) {
      // result = (qaa | qbb) | ((a_min + b_min) ^ (a_max + b_max))
      return
      assignNew('V', mce,ty,
         binop( opOR,
                assignNew('V', mce,ty, binop(opOR, qaa, qbb)),
                assignNew('V', mce,ty,
                   binop( opXOR,
                          assignNew('V', mce,ty, binop(opADD, a_min, b_min)),
                          assignNew('V', mce,ty, binop(opADD, a_max, b_max))
                   )
                )
         )
      );
   } else {
      // result = (qaa | qbb) | ((a_min - b_max) ^ (a_max - b_min))
      return
      assignNew('V', mce,ty,
         binop( opOR,
                assignNew('V', mce,ty, binop(opOR, qaa, qbb)),
                assignNew('V', mce,ty,
                   binop( opXOR,
                          assignNew('V', mce,ty, binop(opSUB, a_min, b_max)),
                          assignNew('V', mce,ty, binop(opSUB, a_max, b_min))
                   )
                )
         )
      );
   }
}
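
/* A concrete 4-bit worked example of the bounds trick above (the values
   are hypothetical; the real code works at 32 or 64 bits).  For an add,
   suppose aa = 0110 with qaa = 0001 (bit 0 undefined) and bb = 0010 with
   qbb = 0000 (fully defined).  Then

      a_min = 0110   a_max = 0111
      b_min = 0010   b_max = 0010
      a_min + b_min = 1000
      a_max + b_max = 1001
      (qaa | qbb) | (1000 ^ 1001) = 0001

   i.e. only bit 0 of the result is flagged as undefined, which is exact:
   the true sum is either 8 or 9, and those differ only in bit 0. */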
static
IRAtom* expensiveCountTrailingZeroes ( MCEnv* mce, IROp czop,
                                       IRAtom* atom, IRAtom* vatom )
{
   IRType ty;
   IROp xorOp, subOp, andOp;
   IRExpr *one;
   IRAtom *improver, *improved;
   tl_assert(isShadowAtom(mce,vatom));
   tl_assert(isOriginalAtom(mce,atom));
   tl_assert(sameKindedAtoms(atom,vatom));

   switch (czop) {
      case Iop_Ctz32: case Iop_CtzNat32:
         ty = Ity_I32;
         xorOp = Iop_Xor32;
         subOp = Iop_Sub32;
         andOp = Iop_And32;
         one = mkU32(1);
         break;
      case Iop_Ctz64: case Iop_CtzNat64:
         ty = Ity_I64;
         xorOp = Iop_Xor64;
         subOp = Iop_Sub64;
         andOp = Iop_And64;
         one = mkU64(1);
         break;
      default:
         VG_(tool_panic)("memcheck:expensiveCountTrailingZeroes");
   }

   // improver = atom ^ (atom - 1)
   //
   // That is, improver has its low ctz(atom)+1 bits equal to one;
   // higher bits (if any) equal to zero.  So it's exactly the right
   // mask to use to remove the irrelevant undefined input bits.
   /* Here are some examples:
         atom   = U...U 1 0...0
         atom-1 = U...U 0 1...1
         ^ed    = 0...0 1 11111, which correctly describes which bits of |atom|
                  actually influence the result
      A boundary case
         atom   = 0...0
         atom-1 = 1...1
         ^ed    = 11111, also a correct mask for the input: all input bits
                  are relevant
      Another boundary case
         atom   = 1..1 1
         atom-1 = 1..1 0
         ^ed    = 0..0 1, also a correct mask: only the rightmost input bit
                  is relevant
      Now with misc U bits interspersed:
         atom   = U...U 1 0 U...U 0 1 0...0
         atom-1 = U...U 1 0 U...U 0 0 1...1
         ^ed    = 0...0 0 0 0...0 0 1 1...1, also correct
      (Per re-check/analysis of 14 Nov 2018)
   */
   improver = assignNew('V', mce,ty,
                        binop(xorOp,
                              atom,
                              assignNew('V', mce, ty,
                                        binop(subOp, atom, one))));

   // improved = vatom & improver
   //
   // That is, treat any V bits to the left of the rightmost ctz(atom)+1
   // bits as "defined".
   improved = assignNew('V', mce, ty,
                        binop(andOp, vatom, improver));

   // Return pessimizing cast of improved.
   return mkPCastTo(mce, ty, improved);
}
static
IRAtom* expensiveCountLeadingZeroes ( MCEnv* mce, IROp czop,
                                      IRAtom* atom, IRAtom* vatom )
{
   IRType ty;
   IROp shrOp, notOp, andOp;
   IRAtom* (*mkRight)(MCEnv*, IRAtom*);
   IRAtom *improver, *improved;
   tl_assert(isShadowAtom(mce,vatom));
   tl_assert(isOriginalAtom(mce,atom));
   tl_assert(sameKindedAtoms(atom,vatom));

   switch (czop) {
      case Iop_Clz32: case Iop_ClzNat32:
         ty = Ity_I32;
         shrOp = Iop_Shr32;
         notOp = Iop_Not32;
         andOp = Iop_And32;
         mkRight = mkRight32;
         break;
      case Iop_Clz64: case Iop_ClzNat64:
         ty = Ity_I64;
         shrOp = Iop_Shr64;
         notOp = Iop_Not64;
         andOp = Iop_And64;
         mkRight = mkRight64;
         break;
      default:
         VG_(tool_panic)("memcheck:expensiveCountLeadingZeroes");
   }

   // This is in principle very similar to how expensiveCountTrailingZeroes
   // works.  That function computed an "improver", which it used to mask
   // off all but the rightmost 1-bit and the zeroes to the right of it,
   // hence removing irrelevant bits from the input.  Here, we play the
   // exact same game but with the left-vs-right roles interchanged.
   // Unfortunately calculation of the improver in this case is
   // significantly more expensive.
   //
   // improver = ~(RIGHT(atom) >>u 1)
   //
   // That is, improver has its upper clz(atom)+1 bits equal to one;
   // lower bits (if any) equal to zero.  So it's exactly the right
   // mask to use to remove the irrelevant undefined input bits.
   /* Here are some examples:
         atom             = 0...0 1 U...U
         R(atom)          = 0...0 1 1...1
         R(atom) >>u 1    = 0...0 0 1...1
         ~(R(atom) >>u 1) = 1...1 1 0...0
                            which correctly describes which bits of |atom|
                            actually influence the result
      A boundary case
         atom             = 0...0
         R(atom)          = 0...0
         R(atom) >>u 1    = 0...0
         ~(R(atom) >>u 1) = 1...1
                            also a correct mask for the input: all input bits
                            are relevant
      Another boundary case
         atom             = 1 1..1
         R(atom)          = 1 1..1
         R(atom) >>u 1    = 0 1..1
         ~(R(atom) >>u 1) = 1 0..0
                            also a correct mask: only the leftmost input bit
                            is relevant
      Now with misc U bits interspersed:
         atom             = 0...0 1 U...U 0 1 U...U
         R(atom)          = 0...0 1 1...1 1 1 1...1
         R(atom) >>u 1    = 0...0 0 1...1 1 1 1...1
         ~(R(atom) >>u 1) = 1...1 1 0...0 0 0 0...0, also correct
      (Per initial implementation of 15 Nov 2018)
   */
   improver = mkRight(mce, atom);
   improver = assignNew('V', mce, ty, binop(shrOp, improver, mkU8(1)));
   improver = assignNew('V', mce, ty, unop(notOp, improver));

   // improved = vatom & improver
   //
   // That is, treat any V bits to the right of the leftmost clz(atom)+1
   // bits as "defined".
   improved = assignNew('V', mce, ty,
                        binop(andOp, vatom, improver));

   // Return pessimizing cast of improved.
   return mkPCastTo(mce, ty, improved);
}
/*------------------------------------------------------------*/
/*--- Scalar shifts.                                       ---*/
/*------------------------------------------------------------*/

/* Produce an interpretation for (aa << bb) (or >>s, >>u).  The basic
   idea is to shift the definedness bits by the original shift amount.
   This introduces 0s ("defined") in new positions for left shifts and
   unsigned right shifts, and copies the top definedness bit for
   signed right shifts.  So, conveniently, applying the original shift
   operator to the definedness bits for the left arg is exactly the
   right thing to do:

      (qaa << bb)

   However if the shift amount is undefined then the whole result
   is undefined.  Hence need:

      (qaa << bb) `UifU` PCast(qbb)

   If the shift amount bb is a literal then qbb will say 'all defined'
   and the UifU and PCast will get folded out by post-instrumentation
   optimisation.
*/
static IRAtom* scalarShift ( MCEnv*  mce,
                             IRType  ty,
                             IROp    original_op,
                             IRAtom* qaa, IRAtom* qbb,
                             IRAtom* aa,  IRAtom* bb )
{
   tl_assert(isShadowAtom(mce,qaa));
   tl_assert(isShadowAtom(mce,qbb));
   tl_assert(isOriginalAtom(mce,aa));
   tl_assert(isOriginalAtom(mce,bb));
   tl_assert(sameKindedAtoms(qaa,aa));
   tl_assert(sameKindedAtoms(qbb,bb));
   return
      assignNew(
         'V', mce, ty,
         mkUifU( mce, ty,
                 assignNew('V', mce, ty, binop(original_op, qaa, bb)),
                 mkPCastTo(mce, ty, qbb)
         )
      );
}
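
/* Illustrative instance (the operand values are hypothetical): for
   Iop_Shl32 with qaa = 0x000000FF (low byte of aa undefined) and a
   literal shift amount bb = 8 (so qbb is all-defined), the result
   shadow is

      (0x000000FF << 8) `UifU` PCast32(0x00000000)  =  0x0000FF00

   i.e. exactly bits 8..15 of the result are marked undefined. */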
/*------------------------------------------------------------*/
/*--- Helpers for dealing with vector primops.             ---*/
/*------------------------------------------------------------*/

/* Vector pessimisation -- pessimise within each lane individually. */

static IRAtom* mkPCast8x16 ( MCEnv* mce, IRAtom* at )
{
   return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ8x16, at));
}

static IRAtom* mkPCast16x8 ( MCEnv* mce, IRAtom* at )
{
   return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ16x8, at));
}

static IRAtom* mkPCast32x4 ( MCEnv* mce, IRAtom* at )
{
   return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ32x4, at));
}

static IRAtom* mkPCast64x2 ( MCEnv* mce, IRAtom* at )
{
   return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ64x2, at));
}

static IRAtom* mkPCast128x1 ( MCEnv* mce, IRAtom* at )
{
   return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ128x1, at));
}

static IRAtom* mkPCast64x4 ( MCEnv* mce, IRAtom* at )
{
   return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ64x4, at));
}

static IRAtom* mkPCast32x8 ( MCEnv* mce, IRAtom* at )
{
   return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ32x8, at));
}

static IRAtom* mkPCast32x2 ( MCEnv* mce, IRAtom* at )
{
   return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ32x2, at));
}

static IRAtom* mkPCast16x16 ( MCEnv* mce, IRAtom* at )
{
   return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ16x16, at));
}

static IRAtom* mkPCast16x4 ( MCEnv* mce, IRAtom* at )
{
   return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ16x4, at));
}

static IRAtom* mkPCast8x32 ( MCEnv* mce, IRAtom* at )
{
   return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ8x32, at));
}

static IRAtom* mkPCast8x8 ( MCEnv* mce, IRAtom* at )
{
   return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ8x8, at));
}

static IRAtom* mkPCast16x2 ( MCEnv* mce, IRAtom* at )
{
   return assignNew('V', mce, Ity_I32, unop(Iop_CmpNEZ16x2, at));
}

static IRAtom* mkPCast8x4 ( MCEnv* mce, IRAtom* at )
{
   return assignNew('V', mce, Ity_I32, unop(Iop_CmpNEZ8x4, at));
}
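
/* For illustration (the lane values are hypothetical): mkPCast32x4 turns
   each 32-bit lane of a shadow value into all-zeroes (defined) or
   all-ones (undefined), e.g.

      CmpNEZ32x4( 00000000 00010000 00000000 80000000 )
                = 00000000 FFFFFFFF 00000000 FFFFFFFF

   so a single undefined bit anywhere in a lane poisons that whole lane,
   but never its neighbours. */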
/* Here's a simple scheme capable of handling ops derived from SSE1
   code while only generating ops that can be efficiently
   implemented in SSE1. */

/* All-lanes versions are straightforward:

   binary32Fx4(x,y)   ==> PCast32x4(UifUV128(x#,y#))

   unary32Fx4(x,y)    ==> PCast32x4(x#)

   Lowest-lane-only versions are more complex:

   binary32F0x4(x,y)  ==> SetV128lo32(
                             x#,
                             PCast32(V128to32(UifUV128(x#,y#)))
                          )

   This is perhaps not so obvious.  In particular, it's faster to
   do a V128-bit UifU and then take the bottom 32 bits than the more
   obvious scheme of taking the bottom 32 bits of each operand
   and doing a 32-bit UifU.  Basically since UifU is fast and
   chopping lanes off vector values is slow.

   Finally:

   unary32F0x4(x)     ==> SetV128lo32(
                             x#,
                             PCast32(V128to32(x#))
                          )

   Where:

   PCast32(v#)   = 1Sto32(CmpNE32(v#,0))
   PCast32x4(v#) = CmpNEZ32x4(v#)
*/

static
IRAtom* binary32Fx4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
{
   IRAtom* at;
   tl_assert(isShadowAtom(mce, vatomX));
   tl_assert(isShadowAtom(mce, vatomY));
   at = mkUifUV128(mce, vatomX, vatomY);
   at = assignNew('V', mce, Ity_V128, mkPCast32x4(mce, at));
   return at;
}

static
IRAtom* unary32Fx4 ( MCEnv* mce, IRAtom* vatomX )
{
   IRAtom* at;
   tl_assert(isShadowAtom(mce, vatomX));
   at = assignNew('V', mce, Ity_V128, mkPCast32x4(mce, vatomX));
   return at;
}

static
IRAtom* binary32F0x4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
{
   IRAtom* at;
   tl_assert(isShadowAtom(mce, vatomX));
   tl_assert(isShadowAtom(mce, vatomY));
   at = mkUifUV128(mce, vatomX, vatomY);
   at = assignNew('V', mce, Ity_I32, unop(Iop_V128to32, at));
   at = mkPCastTo(mce, Ity_I32, at);
   at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo32, vatomX, at));
   return at;
}

static
IRAtom* unary32F0x4 ( MCEnv* mce, IRAtom* vatomX )
{
   IRAtom* at;
   tl_assert(isShadowAtom(mce, vatomX));
   at = assignNew('V', mce, Ity_I32, unop(Iop_V128to32, vatomX));
   at = mkPCastTo(mce, Ity_I32, at);
   at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo32, vatomX, at));
   return at;
}

/* --- ... and ... 64Fx2 versions of the same ... --- */

static
IRAtom* binary64Fx2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
{
   IRAtom* at;
   tl_assert(isShadowAtom(mce, vatomX));
   tl_assert(isShadowAtom(mce, vatomY));
   at = mkUifUV128(mce, vatomX, vatomY);
   at = assignNew('V', mce, Ity_V128, mkPCast64x2(mce, at));
   return at;
}

static
IRAtom* unary64Fx2 ( MCEnv* mce, IRAtom* vatomX )
{
   IRAtom* at;
   tl_assert(isShadowAtom(mce, vatomX));
   at = assignNew('V', mce, Ity_V128, mkPCast64x2(mce, vatomX));
   return at;
}

static
IRAtom* binary64F0x2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
{
   IRAtom* at;
   tl_assert(isShadowAtom(mce, vatomX));
   tl_assert(isShadowAtom(mce, vatomY));
   at = mkUifUV128(mce, vatomX, vatomY);
   at = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, at));
   at = mkPCastTo(mce, Ity_I64, at);
   at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo64, vatomX, at));
   return at;
}

static
IRAtom* unary64F0x2 ( MCEnv* mce, IRAtom* vatomX )
{
   IRAtom* at;
   tl_assert(isShadowAtom(mce, vatomX));
   at = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vatomX));
   at = mkPCastTo(mce, Ity_I64, at);
   at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo64, vatomX, at));
   return at;
}
/* --- --- ... and ... 16Fx8 versions of the same --- --- */

static
IRAtom* binary16Fx8 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
{
   IRAtom* at;
   tl_assert(isShadowAtom(mce, vatomX));
   tl_assert(isShadowAtom(mce, vatomY));
   at = mkUifUV128(mce, vatomX, vatomY);
   at = assignNew('V', mce, Ity_V128, mkPCast16x8(mce, at));
   return at;
}

static
IRAtom* unary16Fx8 ( MCEnv* mce, IRAtom* vatomX )
{
   IRAtom* at;
   tl_assert(isShadowAtom(mce, vatomX));
   at = assignNew('V', mce, Ity_V128, mkPCast16x8(mce, vatomX));
   return at;
}

/* TODO: remaining versions of 16x4 FP ops when more of the half-precision
   IR is implemented. */

/* --- --- ... and ... 32Fx2 versions of the same --- --- */

static
IRAtom* binary32Fx2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
{
   IRAtom* at;
   tl_assert(isShadowAtom(mce, vatomX));
   tl_assert(isShadowAtom(mce, vatomY));
   at = mkUifU64(mce, vatomX, vatomY);
   at = assignNew('V', mce, Ity_I64, mkPCast32x2(mce, at));
   return at;
}

static
IRAtom* unary32Fx2 ( MCEnv* mce, IRAtom* vatomX )
{
   IRAtom* at;
   tl_assert(isShadowAtom(mce, vatomX));
   at = assignNew('V', mce, Ity_I64, mkPCast32x2(mce, vatomX));
   return at;
}

/* --- ... and ... 64Fx4 versions of the same ... --- */

static
IRAtom* binary64Fx4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
{
   IRAtom* at;
   tl_assert(isShadowAtom(mce, vatomX));
   tl_assert(isShadowAtom(mce, vatomY));
   at = mkUifUV256(mce, vatomX, vatomY);
   at = assignNew('V', mce, Ity_V256, mkPCast64x4(mce, at));
   return at;
}

static
IRAtom* unary64Fx4 ( MCEnv* mce, IRAtom* vatomX )
{
   IRAtom* at;
   tl_assert(isShadowAtom(mce, vatomX));
   at = assignNew('V', mce, Ity_V256, mkPCast64x4(mce, vatomX));
   return at;
}

/* --- ... and ... 32Fx8 versions of the same ... --- */

static
IRAtom* binary32Fx8 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
{
   IRAtom* at;
   tl_assert(isShadowAtom(mce, vatomX));
   tl_assert(isShadowAtom(mce, vatomY));
   at = mkUifUV256(mce, vatomX, vatomY);
   at = assignNew('V', mce, Ity_V256, mkPCast32x8(mce, at));
   return at;
}

static
IRAtom* unary32Fx8 ( MCEnv* mce, IRAtom* vatomX )
{
   IRAtom* at;
   tl_assert(isShadowAtom(mce, vatomX));
   at = assignNew('V', mce, Ity_V256, mkPCast32x8(mce, vatomX));
   return at;
}
/* --- 64Fx2 binary FP ops, with rounding mode --- */

static
IRAtom* binary64Fx2_w_rm ( MCEnv* mce, IRAtom* vRM,
                           IRAtom* vatomX, IRAtom* vatomY )
{
   /* This is the same as binary64Fx2, except that we subsequently
      pessimise vRM (definedness of the rounding mode), widen to 128
      bits and UifU it into the result.  As with the scalar cases, if
      the RM is a constant then it is defined and so this extra bit
      will get constant-folded out later. */
   // "do" the vector args
   IRAtom* t1 = binary64Fx2(mce, vatomX, vatomY);
   // PCast the RM, and widen it to 128 bits
   IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
   // Roll it into the result
   t1 = mkUifUV128(mce, t1, t2);
   return t1;
}

/* --- ... and ... 32Fx4 versions of the same --- */

static
IRAtom* binary32Fx4_w_rm ( MCEnv* mce, IRAtom* vRM,
                           IRAtom* vatomX, IRAtom* vatomY )
{
   IRAtom* t1 = binary32Fx4(mce, vatomX, vatomY);
   // PCast the RM, and widen it to 128 bits
   IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
   // Roll it into the result
   t1 = mkUifUV128(mce, t1, t2);
   return t1;
}

/* --- ... and ... 64Fx4 versions of the same --- */

static
IRAtom* binary64Fx4_w_rm ( MCEnv* mce, IRAtom* vRM,
                           IRAtom* vatomX, IRAtom* vatomY )
{
   IRAtom* t1 = binary64Fx4(mce, vatomX, vatomY);
   // PCast the RM, and widen it to 256 bits
   IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM);
   // Roll it into the result
   t1 = mkUifUV256(mce, t1, t2);
   return t1;
}

/* --- ... and ... 16Fx8 versions of the same --- */

static
IRAtom* binary16Fx8_w_rm ( MCEnv* mce, IRAtom* vRM,
                           IRAtom* vatomX, IRAtom* vatomY )
{
   IRAtom* t1 = binary16Fx8(mce, vatomX, vatomY);
   // PCast the RM, and widen it to 128 bits
   IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
   // Roll it into the result
   t1 = mkUifUV128(mce, t1, t2);
   return t1;
}

/* TODO: remaining versions of 16x4 FP ops when more of the half-precision
   IR is implemented. */

/* --- ... and ... 32Fx8 versions of the same --- */

static
IRAtom* binary32Fx8_w_rm ( MCEnv* mce, IRAtom* vRM,
                           IRAtom* vatomX, IRAtom* vatomY )
{
   IRAtom* t1 = binary32Fx8(mce, vatomX, vatomY);
   // PCast the RM, and widen it to 256 bits
   IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM);
   // Roll it into the result
   t1 = mkUifUV256(mce, t1, t2);
   return t1;
}

/* --- 64Fx2 unary FP ops, with rounding mode --- */

static
IRAtom* unary64Fx2_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
{
   /* Same scheme as binary64Fx2_w_rm. */
   // "do" the vector arg
   IRAtom* t1 = unary64Fx2(mce, vatomX);
   // PCast the RM, and widen it to 128 bits
   IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
   // Roll it into the result
   t1 = mkUifUV128(mce, t1, t2);
   return t1;
}

/* --- ... and ... 32Fx4 versions of the same --- */

static
IRAtom* unary32Fx4_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
{
   /* Same scheme as binary32Fx4_w_rm. */
   IRAtom* t1 = unary32Fx4(mce, vatomX);
   // PCast the RM, and widen it to 128 bits
   IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
   // Roll it into the result
   t1 = mkUifUV128(mce, t1, t2);
   return t1;
}

/* --- ... and ... 16Fx8 versions of the same --- */

static
IRAtom* unary16Fx8_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
{
   /* Same scheme as binary16Fx8_w_rm. */
   IRAtom* t1 = unary16Fx8(mce, vatomX);
   // PCast the RM, and widen it to 128 bits
   IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
   // Roll it into the result
   t1 = mkUifUV128(mce, t1, t2);
   return t1;
}

/* --- ... and ... 32Fx8 versions of the same --- */

static
IRAtom* unary32Fx8_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
{
   /* Same scheme as unary32Fx4_w_rm. */
   IRAtom* t1 = unary32Fx8(mce, vatomX);
   // PCast the RM, and widen it to 256 bits
   IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM);
   // Roll it into the result
   t1 = mkUifUV256(mce, t1, t2);
   return t1;
}
/* --- --- Vector saturated narrowing --- --- */

/* We used to do something very clever here, but on closer inspection
   (2011-Jun-15), and in particular bug #279698, it turns out to be
   wrong.  Part of the problem came from the fact that for a long
   time, the IR primops to do with saturated narrowing were
   underspecified and managed to confuse multiple cases which needed
   to be separate: the op names had a signedness qualifier, but in
   fact the source and destination signednesses needed to be specified
   independently, so the op names really need two independent
   signedness specifiers.

   As of 2011-Jun-15 (ish) the underspecification was sorted out
   properly.  The incorrect instrumentation remained, though.  That
   has now (2011-Oct-22) been fixed.

   What we now do is simple:

   Let the original narrowing op be QNarrowBinXtoYxZ, where Z is a
   number of lanes, X is the source lane width and signedness, and Y
   is the destination lane width and signedness.  In all cases the
   destination lane width is half the source lane width, so the names
   have a bit of redundancy, but are at least easy to read.

   For example, Iop_QNarrowBin32Sto16Ux8 narrows 8 lanes of signed 32s
   to unsigned 16s.

   Let Vanilla(OP) be a function that takes OP, one of these
   saturating narrowing ops, and produces the same "shaped" narrowing
   op which is not saturating, but merely dumps the most significant
   bits.  "same shape" means that the lane numbers and widths are the
   same as in the original.

   For example, Vanilla(Iop_QNarrowBin32Sto16Ux8)
                  = Iop_NarrowBin32to16x8,
   that is, narrow 8 lanes of 32 bits to 8 lanes of 16 bits, by
   dumping the top half of each lane.

   So, with that in place, the scheme is simple, and it is simple to
   pessimise each lane individually and then apply Vanilla(OP) so as
   to get the result in the right "shape".  If the original OP is
   QNarrowBinXtoYxZ then we produce

   Vanilla(OP)( PCast-X-to-X-x-Z(vatom1), PCast-X-to-X-x-Z(vatom2) )

   or for the case when OP is unary (Iop_QNarrowUn*)

   Vanilla(OP)( PCast-X-to-X-x-Z(vatom) )
*/
static
IROp vanillaNarrowingOpOfShape ( IROp qnarrowOp )
{
   switch (qnarrowOp) {
      /* Binary: (128, 128) -> 128 */
      case Iop_QNarrowBin16Sto8Ux16:
      case Iop_QNarrowBin16Sto8Sx16:
      case Iop_QNarrowBin16Uto8Ux16:
      case Iop_QNarrowBin64Sto32Sx4:
      case Iop_QNarrowBin64Uto32Ux4:
         return Iop_NarrowBin16to8x16;
      case Iop_QNarrowBin32Sto16Ux8:
      case Iop_QNarrowBin32Sto16Sx8:
      case Iop_QNarrowBin32Uto16Ux8:
         return Iop_NarrowBin32to16x8;
      /* Binary: (64, 64) -> 64 */
      case Iop_QNarrowBin32Sto16Sx4:
         return Iop_NarrowBin32to16x4;
      case Iop_QNarrowBin16Sto8Ux8:
      case Iop_QNarrowBin16Sto8Sx8:
         return Iop_NarrowBin16to8x8;
      /* Unary: 128 -> 64 */
      case Iop_QNarrowUn64Uto32Ux2:
      case Iop_QNarrowUn64Sto32Sx2:
      case Iop_QNarrowUn64Sto32Ux2:
         return Iop_NarrowUn64to32x2;
      case Iop_QNarrowUn32Uto16Ux4:
      case Iop_QNarrowUn32Sto16Sx4:
      case Iop_QNarrowUn32Sto16Ux4:
      case Iop_F32toF16x4_DEP:
         return Iop_NarrowUn32to16x4;
      case Iop_QNarrowUn16Uto8Ux8:
      case Iop_QNarrowUn16Sto8Sx8:
      case Iop_QNarrowUn16Sto8Ux8:
         return Iop_NarrowUn16to8x8;
      default:
         VG_(tool_panic)("vanillaNarrowOpOfShape");
   }
}
static
IRAtom* vectorNarrowBinV128 ( MCEnv* mce, IROp narrow_op,
                              IRAtom* vatom1, IRAtom* vatom2)
{
   IRAtom *at1, *at2, *at3;
   IRAtom* (*pcast)( MCEnv*, IRAtom* );
   switch (narrow_op) {
      case Iop_QNarrowBin64Sto32Sx4: pcast = mkPCast32x4; break;
      case Iop_QNarrowBin64Uto32Ux4: pcast = mkPCast32x4; break;
      case Iop_QNarrowBin32Sto16Sx8: pcast = mkPCast32x4; break;
      case Iop_QNarrowBin32Uto16Ux8: pcast = mkPCast32x4; break;
      case Iop_QNarrowBin32Sto16Ux8: pcast = mkPCast32x4; break;
      case Iop_QNarrowBin16Sto8Sx16: pcast = mkPCast16x8; break;
      case Iop_QNarrowBin16Uto8Ux16: pcast = mkPCast16x8; break;
      case Iop_QNarrowBin16Sto8Ux16: pcast = mkPCast16x8; break;
      default: VG_(tool_panic)("vectorNarrowBinV128");
   }
   IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
   tl_assert(isShadowAtom(mce,vatom1));
   tl_assert(isShadowAtom(mce,vatom2));
   at1 = assignNew('V', mce, Ity_V128, pcast(mce, vatom1));
   at2 = assignNew('V', mce, Ity_V128, pcast(mce, vatom2));
   at3 = assignNew('V', mce, Ity_V128, binop(vanilla_narrow, at1, at2));
   return at3;
}
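
/* Worked instance of the scheme (the operands are hypothetical shadow
   atoms): for Iop_QNarrowBin32Sto16Sx8 the generated shadow value is

      NarrowBin32to16x8( CmpNEZ32x4(vatom1), CmpNEZ32x4(vatom2) )

   i.e. each source lane is first pessimised to all-0s or all-1s, so the
   vanilla narrow's "dump the top half of each lane" behaviour cannot
   lose any undefinedness. */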
static
IRAtom* vectorNarrowBin64 ( MCEnv* mce, IROp narrow_op,
                            IRAtom* vatom1, IRAtom* vatom2)
{
   IRAtom *at1, *at2, *at3;
   IRAtom* (*pcast)( MCEnv*, IRAtom* );
   switch (narrow_op) {
      case Iop_QNarrowBin32Sto16Sx4: pcast = mkPCast32x2; break;
      case Iop_QNarrowBin16Sto8Sx8:  pcast = mkPCast16x4; break;
      case Iop_QNarrowBin16Sto8Ux8:  pcast = mkPCast16x4; break;
      default: VG_(tool_panic)("vectorNarrowBin64");
   }
   IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
   tl_assert(isShadowAtom(mce,vatom1));
   tl_assert(isShadowAtom(mce,vatom2));
   at1 = assignNew('V', mce, Ity_I64, pcast(mce, vatom1));
   at2 = assignNew('V', mce, Ity_I64, pcast(mce, vatom2));
   at3 = assignNew('V', mce, Ity_I64, binop(vanilla_narrow, at1, at2));
   return at3;
}
static
IRAtom* vectorNarrowUnV128 ( MCEnv* mce, IROp narrow_op,
                             IRAtom* vatom1)
{
   IRAtom *at1, *at2;
   IRAtom* (*pcast)( MCEnv*, IRAtom* );
   tl_assert(isShadowAtom(mce,vatom1));
   /* For vanilla narrowing (non-saturating), we can just apply
      the op directly to the V bits. */
   switch (narrow_op) {
      case Iop_NarrowUn16to8x8:
      case Iop_NarrowUn32to16x4:
      case Iop_NarrowUn64to32x2:
      case Iop_F32toF16x4_DEP:
         at1 = assignNew('V', mce, Ity_I64, unop(narrow_op, vatom1));
         return at1;
      default:
         break; /* Do Plan B */
   }
   /* Plan B: for ops that involve a saturation operation on the args,
      we must PCast before the vanilla narrow. */
   switch (narrow_op) {
      case Iop_QNarrowUn16Sto8Sx8:  pcast = mkPCast16x8; break;
      case Iop_QNarrowUn16Sto8Ux8:  pcast = mkPCast16x8; break;
      case Iop_QNarrowUn16Uto8Ux8:  pcast = mkPCast16x8; break;
      case Iop_QNarrowUn32Sto16Sx4: pcast = mkPCast32x4; break;
      case Iop_QNarrowUn32Sto16Ux4: pcast = mkPCast32x4; break;
      case Iop_QNarrowUn32Uto16Ux4: pcast = mkPCast32x4; break;
      case Iop_QNarrowUn64Sto32Sx2: pcast = mkPCast64x2; break;
      case Iop_QNarrowUn64Sto32Ux2: pcast = mkPCast64x2; break;
      case Iop_QNarrowUn64Uto32Ux2: pcast = mkPCast64x2; break;
      default: VG_(tool_panic)("vectorNarrowUnV128");
   }
   IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
   at1 = assignNew('V', mce, Ity_V128, pcast(mce, vatom1));
   at2 = assignNew('V', mce, Ity_I64, unop(vanilla_narrow, at1));
   return at2;
}
static
IRAtom* vectorWidenI64 ( MCEnv* mce, IROp longen_op,
                         IRAtom* vatom1)
{
   IRAtom *at1, *at2;
   IRAtom* (*pcast)( MCEnv*, IRAtom* );
   switch (longen_op) {
      case Iop_Widen8Uto16x8:  pcast = mkPCast16x8; break;
      case Iop_Widen8Sto16x8:  pcast = mkPCast16x8; break;
      case Iop_Widen16Uto32x4: pcast = mkPCast32x4; break;
      case Iop_Widen16Sto32x4: pcast = mkPCast32x4; break;
      case Iop_Widen32Uto64x2: pcast = mkPCast64x2; break;
      case Iop_Widen32Sto64x2: pcast = mkPCast64x2; break;
      case Iop_F16toF32x4:     pcast = mkPCast32x4; break;
      default: VG_(tool_panic)("vectorWidenI64");
   }
   tl_assert(isShadowAtom(mce,vatom1));
   at1 = assignNew('V', mce, Ity_V128, unop(longen_op, vatom1));
   at2 = assignNew('V', mce, Ity_V128, pcast(mce, at1));
   return at2;
}
/* --- --- Vector integer arithmetic --- --- */

/* Simple ... UifU the args and per-lane pessimise the results. */

/* --- V256-bit versions --- */

static
IRAtom* binary8Ix32 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
{
   IRAtom* at;
   at = mkUifUV256(mce, vatom1, vatom2);
   at = mkPCast8x32(mce, at);
   return at;
}

static
IRAtom* binary16Ix16 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
{
   IRAtom* at;
   at = mkUifUV256(mce, vatom1, vatom2);
   at = mkPCast16x16(mce, at);
   return at;
}

static
IRAtom* binary32Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
{
   IRAtom* at;
   at = mkUifUV256(mce, vatom1, vatom2);
   at = mkPCast32x8(mce, at);
   return at;
}

static
IRAtom* binary64Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
{
   IRAtom* at;
   at = mkUifUV256(mce, vatom1, vatom2);
   at = mkPCast64x4(mce, at);
   return at;
}

/* --- V128-bit versions --- */

static
IRAtom* binary8Ix16 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
{
   IRAtom* at;
   at = mkUifUV128(mce, vatom1, vatom2);
   at = mkPCast8x16(mce, at);
   return at;
}

static
IRAtom* binary16Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
{
   IRAtom* at;
   at = mkUifUV128(mce, vatom1, vatom2);
   at = mkPCast16x8(mce, at);
   return at;
}

static
IRAtom* binary32Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
{
   IRAtom* at;
   at = mkUifUV128(mce, vatom1, vatom2);
   at = mkPCast32x4(mce, at);
   return at;
}

static
IRAtom* binary64Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
{
   IRAtom* at;
   at = mkUifUV128(mce, vatom1, vatom2);
   at = mkPCast64x2(mce, at);
   return at;
}

static
IRAtom* binary128Ix1 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
{
   IRAtom* at;
   at = mkUifUV128(mce, vatom1, vatom2);
   at = mkPCast128x1(mce, at);
   return at;
}

/* --- 64-bit versions --- */

static
IRAtom* binary8Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
{
   IRAtom* at;
   at = mkUifU64(mce, vatom1, vatom2);
   at = mkPCast8x8(mce, at);
   return at;
}

static
IRAtom* binary16Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
{
   IRAtom* at;
   at = mkUifU64(mce, vatom1, vatom2);
   at = mkPCast16x4(mce, at);
   return at;
}

static
IRAtom* binary32Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
{
   IRAtom* at;
   at = mkUifU64(mce, vatom1, vatom2);
   at = mkPCast32x2(mce, at);
   return at;
}

static
IRAtom* binary64Ix1 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
{
   IRAtom* at;
   at = mkUifU64(mce, vatom1, vatom2);
   at = mkPCastTo(mce, Ity_I64, at);
   return at;
}

/* --- 32-bit versions --- */

static
IRAtom* binary8Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
{
   IRAtom* at;
   at = mkUifU32(mce, vatom1, vatom2);
   at = mkPCast8x4(mce, at);
   return at;
}

static
IRAtom* binary16Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
{
   IRAtom* at;
   at = mkUifU32(mce, vatom1, vatom2);
   at = mkPCast16x2(mce, at);
   return at;
}
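
/* All of these follow the same pattern; e.g. (illustratively) for a
   128-bit 32x4 integer op such as Iop_Add32x4 the shadow of
   binop(Iop_Add32x4, x, y) ends up as

      CmpNEZ32x4( OrV128(x#, y#) )

   since the V128 UifU is an OrV128 and the per-lane PCast is CmpNEZ32x4. */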
/*------------------------------------------------------------*/
/*--- Generate shadow values from all kinds of IRExprs.    ---*/
/*------------------------------------------------------------*/

static
IRAtom* expr2vbits_Qop ( MCEnv* mce,
                         IROp op,
                         IRAtom* atom1, IRAtom* atom2,
                         IRAtom* atom3, IRAtom* atom4 )
{
   IRAtom* vatom1 = expr2vbits( mce, atom1, HuOth );
   IRAtom* vatom2 = expr2vbits( mce, atom2, HuOth );
   IRAtom* vatom3 = expr2vbits( mce, atom3, HuOth );
   IRAtom* vatom4 = expr2vbits( mce, atom4, HuOth );

   tl_assert(isOriginalAtom(mce,atom1));
   tl_assert(isOriginalAtom(mce,atom2));
   tl_assert(isOriginalAtom(mce,atom3));
   tl_assert(isOriginalAtom(mce,atom4));
   tl_assert(isShadowAtom(mce,vatom1));
   tl_assert(isShadowAtom(mce,vatom2));
   tl_assert(isShadowAtom(mce,vatom3));
   tl_assert(isShadowAtom(mce,vatom4));
   tl_assert(sameKindedAtoms(atom1,vatom1));
   tl_assert(sameKindedAtoms(atom2,vatom2));
   tl_assert(sameKindedAtoms(atom3,vatom3));
   tl_assert(sameKindedAtoms(atom4,vatom4));

   switch (op) {
      case Iop_MAddF64:
      case Iop_MAddF64r32:
      case Iop_MSubF64:
      case Iop_MSubF64r32:
         /* I32(rm) x F64 x F64 x F64 -> F64 */
         return mkLazy4(mce, Ity_I64, vatom1, vatom2, vatom3, vatom4);

      case Iop_MAddF32:
      case Iop_MSubF32:
         /* I32(rm) x F32 x F32 x F32 -> F32 */
         return mkLazy4(mce, Ity_I32, vatom1, vatom2, vatom3, vatom4);

      case Iop_MAddF128:
      case Iop_MSubF128:
      case Iop_NegMAddF128:
      case Iop_NegMSubF128:
         /* I32(rm) x F128 x F128 x F128 -> F128 */
         return mkLazy4(mce, Ity_I128, vatom1, vatom2, vatom3, vatom4);

      /* V256-bit data-steering */
      case Iop_64x4toV256:
         return assignNew('V', mce, Ity_V256,
                          IRExpr_Qop(op, vatom1, vatom2, vatom3, vatom4));

      /* I32/I64 x I8 x I8 x I8 -> I32/I64 */
      case Iop_Rotx32:
         return mkLazy4(mce, Ity_I32, vatom1, vatom2, vatom3, vatom4);
      case Iop_Rotx64:
         return mkLazy4(mce, Ity_I64, vatom1, vatom2, vatom3, vatom4);
      default:
         VG_(tool_panic)("memcheck:expr2vbits_Qop");
   }
}
static
IRAtom* expr2vbits_Triop ( MCEnv* mce,
                           IROp op,
                           IRAtom* atom1, IRAtom* atom2, IRAtom* atom3 )
{
   IRAtom* vatom1 = expr2vbits( mce, atom1, HuOth );
   IRAtom* vatom2 = expr2vbits( mce, atom2, HuOth );
   IRAtom* vatom3 = expr2vbits( mce, atom3, HuOth );

   tl_assert(isOriginalAtom(mce,atom1));
   tl_assert(isOriginalAtom(mce,atom2));
   tl_assert(isOriginalAtom(mce,atom3));
   tl_assert(isShadowAtom(mce,vatom1));
   tl_assert(isShadowAtom(mce,vatom2));
   tl_assert(isShadowAtom(mce,vatom3));
   tl_assert(sameKindedAtoms(atom1,vatom1));
   tl_assert(sameKindedAtoms(atom2,vatom2));
   tl_assert(sameKindedAtoms(atom3,vatom3));

   switch (op) {

      case Iop_QuantizeD128:
         /* I32(rm) x F128/D128 x F128/D128 -> F128/D128 */
         return mkLazy3(mce, Ity_I128, vatom1, vatom2, vatom3);

      case Iop_QuantizeD64:
         /* I32(rm) x F64/D64 x F64/D64 -> F64/D64 */
         return mkLazy3(mce, Ity_I64, vatom1, vatom2, vatom3);
      case Iop_PRemC3210F64:
      case Iop_PRem1C3210F64:
         /* I32(rm) x F64 x F64 -> I32 */
         return mkLazy3(mce, Ity_I32, vatom1, vatom2, vatom3);
      case Iop_AddF32:
      case Iop_SubF32:
      case Iop_MulF32:
      case Iop_DivF32:
         /* I32(rm) x F32 x F32 -> I32 */
         return mkLazy3(mce, Ity_I32, vatom1, vatom2, vatom3);
      case Iop_AddF16:
      case Iop_SubF16:
         /* I32(rm) x F16 x F16 -> I16 */
         return mkLazy3(mce, Ity_I16, vatom1, vatom2, vatom3);
      case Iop_SignificanceRoundD64:
         /* IRRoundingMode(I32) x I8 x D64 -> D64 */
         return mkLazy3(mce, Ity_I64, vatom1, vatom2, vatom3);
      case Iop_SignificanceRoundD128:
         /* IRRoundingMode(I32) x I8 x D128 -> D128 */
         return mkLazy3(mce, Ity_I128, vatom1, vatom2, vatom3);
      case Iop_ExtractV128:
         /* (V128, V128, I8) -> V128 */
         complainIfUndefined(mce, atom3, NULL);
         return assignNew('V', mce, Ity_V128, triop(op, vatom1, vatom2, atom3));
      case Iop_Extract64:
         /* (I64, I64, I8) -> I64 */
         complainIfUndefined(mce, atom3, NULL);
         return assignNew('V', mce, Ity_I64, triop(op, vatom1, vatom2, atom3));
      case Iop_SetElem8x8:
      case Iop_SetElem16x4:
      case Iop_SetElem32x2:
         complainIfUndefined(mce, atom2, NULL);
         return assignNew('V', mce, Ity_I64, triop(op, vatom1, atom2, vatom3));

      case Iop_SetElem8x16:
      case Iop_SetElem16x8:
      case Iop_SetElem32x4:
      case Iop_SetElem64x2:
         complainIfUndefined(mce, atom2, NULL);
         return assignNew('V', mce, Ity_V128, triop(op, vatom1, atom2, vatom3));

      /* Int 128-bit Integer three arg */
      case Iop_2xMultU64Add128CarryOut:
      case Iop_Perm8x16x2:
         /* (V128, V128, V128) -> V128 */
         complainIfUndefined(mce, atom3, NULL);
         return mkUifUV128(
                   mce,
                   assignNew('V', mce, Ity_V128, triop(op, vatom1, vatom2, atom3)),
                   mkPCast8x16(mce, vatom3)
                );

      /* Vector FP with rounding mode as the first arg */
      case Iop_Add64Fx2:
      case Iop_Sub64Fx2:
      case Iop_Mul64Fx2:
      case Iop_Div64Fx2:
      case Iop_Scale2_64Fx2:
         return binary64Fx2_w_rm(mce, vatom1, vatom2, vatom3);

      case Iop_Add32Fx4:
      case Iop_Sub32Fx4:
      case Iop_Mul32Fx4:
      case Iop_Div32Fx4:
      case Iop_Scale2_32Fx4:
         return binary32Fx4_w_rm(mce, vatom1, vatom2, vatom3);

      case Iop_Add64Fx4:
      case Iop_Sub64Fx4:
      case Iop_Mul64Fx4:
      case Iop_Div64Fx4:
         return binary64Fx4_w_rm(mce, vatom1, vatom2, vatom3);

      /* TODO: remaining versions of 16x4 FP ops when more of the half-precision
         IR is implemented. */
      case Iop_Add16Fx8:
      case Iop_Sub16Fx8:
         return binary16Fx8_w_rm(mce, vatom1, vatom2, vatom3);

      case Iop_Add32Fx8:
      case Iop_Sub32Fx8:
      case Iop_Mul32Fx8:
      case Iop_Div32Fx8:
         return binary32Fx8_w_rm(mce, vatom1, vatom2, vatom3);

      case Iop_F32x4_2toQ16x8:
         return assignNew('V', mce, Ity_V128,
                          binop(Iop_PackEvenLanes16x8,
                                unary32Fx4_w_rm(mce, vatom1, vatom2),
                                unary32Fx4_w_rm(mce, vatom1, vatom3)));
      case Iop_F64x2_2toQ32x4:
         return assignNew('V', mce, Ity_V128,
                          binop(Iop_PackEvenLanes32x4,
                                unary64Fx2_w_rm(mce, vatom1, vatom2),
                                unary64Fx2_w_rm(mce, vatom1, vatom3)));

      default:
         VG_(tool_panic)("memcheck:expr2vbits_Triop");
   }
}
static
IRAtom* expr2vbits_Binop ( MCEnv* mce,
                           IROp op,
                           IRAtom* atom1, IRAtom* atom2,
                           HowUsed hu/*use HuOth if unknown*/ )
{
   IRType  and_or_ty = Ity_INVALID;
   IRAtom* (*uifu)    (MCEnv*, IRAtom*, IRAtom*) = NULL;
   IRAtom* (*difd)    (MCEnv*, IRAtom*, IRAtom*) = NULL;
   IRAtom* (*improve) (MCEnv*, IRAtom*, IRAtom*) = NULL;

   IRAtom* vatom1 = expr2vbits( mce, atom1, HuOth );
   IRAtom* vatom2 = expr2vbits( mce, atom2, HuOth );

   tl_assert(isOriginalAtom(mce,atom1));
   tl_assert(isOriginalAtom(mce,atom2));
   tl_assert(isShadowAtom(mce,vatom1));
   tl_assert(isShadowAtom(mce,vatom2));
   tl_assert(sameKindedAtoms(atom1,vatom1));
   tl_assert(sameKindedAtoms(atom2,vatom2));
   switch (op) {

         return binary16Ix2(mce, vatom1, vatom2);

         return binary8Ix4(mce, vatom1, vatom2);

         /* Same scheme as with all other shifts. */
         complainIfUndefined(mce, atom2, NULL);
         return assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2));

      case Iop_QNarrowBin32Sto16Sx4:
      case Iop_QNarrowBin16Sto8Sx8:
      case Iop_QNarrowBin16Sto8Ux8:
         return vectorNarrowBin64(mce, op, vatom1, vatom2);

      case Iop_PolynomialMul8x8:
         return binary8Ix8(mce, vatom1, vatom2);

      case Iop_MulHi16Sx4:
      case Iop_MulHi16Ux4:
      case Iop_CmpGT16Sx4:
      case Iop_CmpGT16Ux4:
      case Iop_QDMulHi16Sx4:
      case Iop_QRDMulHi16Sx4:
         return binary16Ix4(mce, vatom1, vatom2);

      case Iop_CmpGT32Sx2:
      case Iop_CmpGT32Ux2:
      case Iop_QDMulHi32Sx2:
      case Iop_QRDMulHi32Sx2:
         return binary32Ix2(mce, vatom1, vatom2);

         return binary64Ix1(mce, vatom1, vatom2);

      case Iop_QShlNsatSU8x8:
      case Iop_QShlNsatUU8x8:
      case Iop_QShlNsatSS8x8:
         complainIfUndefined(mce, atom2, NULL);
         return mkPCast8x8(mce, vatom1);

      case Iop_QShlNsatSU16x4:
      case Iop_QShlNsatUU16x4:
      case Iop_QShlNsatSS16x4:
         complainIfUndefined(mce, atom2, NULL);
         return mkPCast16x4(mce, vatom1);

      case Iop_QShlNsatSU32x2:
      case Iop_QShlNsatUU32x2:
      case Iop_QShlNsatSS32x2:
         complainIfUndefined(mce, atom2, NULL);
         return mkPCast32x2(mce, vatom1);

      case Iop_QShlNsatSU64x1:
      case Iop_QShlNsatUU64x1:
      case Iop_QShlNsatSS64x1:
         complainIfUndefined(mce, atom2, NULL);
         return mkPCast32x2(mce, vatom1);

      case Iop_PwMax32Sx2:
      case Iop_PwMax32Ux2:
      case Iop_PwMin32Sx2:
      case Iop_PwMin32Ux2:
      case Iop_PwMax32Fx2:
      case Iop_PwMin32Fx2:
         return assignNew('V', mce, Ity_I64,
                          binop(Iop_PwMax32Ux2,
                                mkPCast32x2(mce, vatom1),
                                mkPCast32x2(mce, vatom2)));

      case Iop_PwMax16Sx4:
      case Iop_PwMax16Ux4:
      case Iop_PwMin16Sx4:
      case Iop_PwMin16Ux4:
         return assignNew('V', mce, Ity_I64,
                          binop(Iop_PwMax16Ux4,
                                mkPCast16x4(mce, vatom1),
                                mkPCast16x4(mce, vatom2)));

      case Iop_PwMax8Sx8:
      case Iop_PwMax8Ux8:
      case Iop_PwMin8Sx8:
      case Iop_PwMin8Ux8:
         return assignNew('V', mce, Ity_I64,
                          binop(Iop_PwMax8Ux8,
                                mkPCast8x8(mce, vatom1),
                                mkPCast8x8(mce, vatom2)));

      case Iop_PwAdd32Fx2:
         return mkPCast32x2(mce,
                   assignNew('V', mce, Ity_I64,
                             binop(Iop_PwAdd32x2,
                                   mkPCast32x2(mce, vatom1),
                                   mkPCast32x2(mce, vatom2))));

      case Iop_PwAdd16x4:
         return mkPCast16x4(mce,
                   assignNew('V', mce, Ity_I64,
                             binop(op, mkPCast16x4(mce, vatom1),
                                       mkPCast16x4(mce, vatom2))));

      case Iop_PwAdd8x8:
         return mkPCast8x8(mce,
                   assignNew('V', mce, Ity_I64,
                             binop(op, mkPCast8x8(mce, vatom1),
                                       mkPCast8x8(mce, vatom2))));

      case Iop_Shl8x8:
      case Iop_Shr8x8:
      case Iop_Sar8x8:
      case Iop_Sal8x8:
         return mkUifU64(mce,
                   assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
                   mkPCast8x8(mce,vatom2)
                );

      case Iop_Shl16x4:
      case Iop_Shr16x4:
      case Iop_Sar16x4:
      case Iop_Sal16x4:
         return mkUifU64(mce,
                   assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
                   mkPCast16x4(mce,vatom2)
                );

      case Iop_Shl32x2:
      case Iop_Shr32x2:
      case Iop_Sar32x2:
      case Iop_Sal32x2:
         return mkUifU64(mce,
                   assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
                   mkPCast32x2(mce,vatom2)
                );

      /* 64-bit data-steering */
      case Iop_InterleaveLO32x2:
      case Iop_InterleaveLO16x4:
      case Iop_InterleaveLO8x8:
      case Iop_InterleaveHI32x2:
      case Iop_InterleaveHI16x4:
      case Iop_InterleaveHI8x8:
      case Iop_CatOddLanes8x8:
      case Iop_CatEvenLanes8x8:
      case Iop_CatOddLanes16x4:
      case Iop_CatEvenLanes16x4:
      case Iop_InterleaveOddLanes8x8:
      case Iop_InterleaveEvenLanes8x8:
      case Iop_InterleaveOddLanes16x4:
      case Iop_InterleaveEvenLanes16x4:
         return assignNew('V', mce, Ity_I64, binop(op, vatom1, vatom2));

      case Iop_GetElem8x8:
         complainIfUndefined(mce, atom2, NULL);
         return assignNew('V', mce, Ity_I8, binop(op, vatom1, atom2));
      case Iop_GetElem16x4:
         complainIfUndefined(mce, atom2, NULL);
         return assignNew('V', mce, Ity_I16, binop(op, vatom1, atom2));
      case Iop_GetElem32x2:
         complainIfUndefined(mce, atom2, NULL);
         return assignNew('V', mce, Ity_I32, binop(op, vatom1, atom2));

      /* Perm8x8: rearrange values in left arg using steering values from
         right arg.  So rearrange the vbits in the same way but pessimise wrt
         steering values.  We assume that unused bits in the steering value
         are defined zeros, so we can safely PCast within each lane of the
         steering value without having to take precautions to avoid a
         dependency on those unused bits.

         This is also correct for PermOrZero8x8, but it is a bit subtle.  For
         each lane, if bit 7 of the steering value is zero, then we'll steer
         the shadow value exactly as per Perm8x8.  If that bit is one, then
         the operation will set the resulting (concrete) value to zero.  That
         means it is defined, and should have a shadow value of zero.  Hence
         in both cases (bit 7 is 0 or 1) we can self-shadow (in the same way
         as Perm8x8) and then pessimise against the steering values. */
      case Iop_Perm8x8:
      case Iop_PermOrZero8x8:
         return mkUifU64(mce,
                   assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
                   mkPCast8x8(mce, vatom2)
                );

      case Iop_I32StoF32x4:
      case Iop_F32toI32Sx4:
      case Iop_Sqrt16Fx8:
         return unary16Fx8_w_rm(mce, vatom1, vatom2);
      case Iop_Sqrt32Fx4:
         return unary32Fx4_w_rm(mce, vatom1, vatom2);
      case Iop_Sqrt64Fx2:
         return unary64Fx2_w_rm(mce, vatom1, vatom2);

         /* Same scheme as with all other shifts.  Note: 22 Oct 05:
            this is wrong now, scalar shifts are done properly lazily.
            Vector shifts should be fixed too. */
         complainIfUndefined(mce, atom2, NULL);
         return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));

      /* V x V shifts/rotates are done using the standard lazy scheme. */
      /* For the non-rounding variants of bi-di vector x vector
         shifts (the Iop_Sh.. ops, that is) we use the lazy scheme.
         But note that this is overly pessimistic, because in fact only
         the bottom 8 bits of each lane of the second argument are taken
         into account when shifting.  So really we ought to ignore
         undefinedness in bits 8 and above of each lane in the
         second argument. */
      case Iop_Sh8Sx16: case Iop_Sh8Ux16:
         return mkUifUV128(mce,
                   assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
                   mkPCast8x16(mce,vatom2)
                );
      case Iop_Sh16Sx8: case Iop_Sh16Ux8:
         return mkUifUV128(mce,
                   assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
                   mkPCast16x8(mce,vatom2)
                );
      case Iop_Sh32Sx4: case Iop_Sh32Ux4:
         return mkUifUV128(mce,
                   assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
                   mkPCast32x4(mce,vatom2)
                );
      case Iop_Sh64Sx2: case Iop_Sh64Ux2:
         return mkUifUV128(mce,
                   assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
                   mkPCast64x2(mce,vatom2)
                );

      /* For the rounding variants of bi-di vector x vector shifts, the
         rounding adjustment can cause undefinedness to propagate through
         the entire lane, in the worst case.  Too complex to handle
         properly .. just UifU the arguments and then PCast them.
         Suboptimal but safe. */
      case Iop_Rsh8Sx16: case Iop_Rsh8Ux16:
         return binary8Ix16(mce, vatom1, vatom2);
      case Iop_Rsh16Sx8: case Iop_Rsh16Ux8:
         return binary16Ix8(mce, vatom1, vatom2);
      case Iop_Rsh32Sx4: case Iop_Rsh32Ux4:
         return binary32Ix4(mce, vatom1, vatom2);
      case Iop_Rsh64Sx2: case Iop_Rsh64Ux2:
         return binary64Ix2(mce, vatom1, vatom2);

      case Iop_F32ToFixed32Ux4_RZ:
      case Iop_F32ToFixed32Sx4_RZ:
      case Iop_Fixed32UToF32x4_RN:
      case Iop_Fixed32SToF32x4_RN:
         complainIfUndefined(mce, atom2, NULL);
         return mkPCast32x4(mce, vatom1);

      case Iop_F32ToFixed32Ux2_RZ:
      case Iop_F32ToFixed32Sx2_RZ:
      case Iop_Fixed32UToF32x2_RN:
      case Iop_Fixed32SToF32x2_RN:
         complainIfUndefined(mce, atom2, NULL);
         return mkPCast32x2(mce, vatom1);

      case Iop_QAddExtUSsatSS8x16:
      case Iop_QAddExtSUsatUU8x16:
      case Iop_MulHi8Sx16:
      case Iop_MulHi8Ux16:
      case Iop_PolynomialMul8x16:
      case Iop_PolynomialMulAdd8x16:
         return binary8Ix16(mce, vatom1, vatom2);

      case Iop_MulHi16Sx8:
      case Iop_MulHi16Ux8:
      case Iop_QAddExtUSsatSS16x8:
      case Iop_QAddExtSUsatUU16x8:
      case Iop_QDMulHi16Sx8:
      case Iop_QRDMulHi16Sx8:
      case Iop_PolynomialMulAdd16x8:
      /* PwExtUSMulQAdd8x16 is a bit subtle.  The effect of it is that each
         16-bit chunk of the output is formed from corresponding 16-bit chunks
         of the input args, so we can treat it like any other binary 16x8
         operation.  That's despite it having '8x16' in its name. */
      case Iop_PwExtUSMulQAdd8x16:
         return binary16Ix8(mce, vatom1, vatom2);

      case Iop_CmpGT64Sx2:
      case Iop_CmpGT64Ux2:
      case Iop_CmpGT32Sx4:
      case Iop_CmpGT32Ux4:
      case Iop_CmpGT16Sx8:
      case Iop_CmpGT16Ux8:
      case Iop_CmpGT8Sx16:
      case Iop_CmpGT8Ux16:
         return expensiveCmpGT(mce, op,
                               vatom1, vatom2, atom1, atom2);

      case Iop_QAddExtUSsatSS32x4:
      case Iop_QAddExtSUsatUU32x4:
      case Iop_MulHi32Sx4:
      case Iop_MulHi32Ux4:
      case Iop_QDMulHi32Sx4:
      case Iop_QRDMulHi32Sx4:
      case Iop_PolynomialMulAdd32x4:
         return binary32Ix4(mce, vatom1, vatom2);

      case Iop_QAddExtUSsatSS64x2:
      case Iop_QAddExtSUsatUU64x2:
      case Iop_PolynomialMulAdd64x2:
      case Iop_CipherV128:
      case Iop_CipherLV128:
      case Iop_NCipherV128:
      case Iop_NCipherLV128:
      case Iop_MulI128by10E:
      case Iop_MulI128by10ECarry:
         return binary64Ix2(mce, vatom1, vatom2);

      case Iop_CmpNEZ128x1:
         return binary128Ix1(mce
, vatom1
, vatom2
);
4155 /* I128 x I128 -> I128 */
4156 return mkLazy2(mce
, Ity_V128
, vatom1
, vatom2
);
4158 case Iop_QNarrowBin64Sto32Sx4
:
4159 case Iop_QNarrowBin64Uto32Ux4
:
4160 case Iop_QNarrowBin32Sto16Sx8
:
4161 case Iop_QNarrowBin32Uto16Ux8
:
4162 case Iop_QNarrowBin32Sto16Ux8
:
4163 case Iop_QNarrowBin16Sto8Sx16
:
4164 case Iop_QNarrowBin16Uto8Ux16
:
4165 case Iop_QNarrowBin16Sto8Ux16
:
4166 return vectorNarrowBinV128(mce
, op
, vatom1
, vatom2
);
4170 case Iop_CmpLT64Fx2
:
4171 case Iop_CmpLE64Fx2
:
4172 case Iop_CmpEQ64Fx2
:
4173 case Iop_CmpUN64Fx2
:
4174 case Iop_RecipStep64Fx2
:
4175 case Iop_RSqrtStep64Fx2
:
4176 return binary64Fx2(mce
, vatom1
, vatom2
);
4178 case Iop_CmpLT16Fx8
:
4179 case Iop_CmpLE16Fx8
:
4180 case Iop_CmpEQ16Fx8
:
4181 return binary16Fx8(mce
, vatom1
, vatom2
);
4188 case Iop_CmpLT64F0x2
:
4189 case Iop_CmpLE64F0x2
:
4190 case Iop_CmpEQ64F0x2
:
4191 case Iop_CmpUN64F0x2
:
4193 return binary64F0x2(mce
, vatom1
, vatom2
);
4197 case Iop_CmpLT32Fx4
:
4198 case Iop_CmpLE32Fx4
:
4199 case Iop_CmpEQ32Fx4
:
4200 case Iop_CmpUN32Fx4
:
4201 case Iop_CmpGT32Fx4
:
4202 case Iop_CmpGE32Fx4
:
4203 case Iop_RecipStep32Fx4
:
4204 case Iop_RSqrtStep32Fx4
:
4205 return binary32Fx4(mce
, vatom1
, vatom2
);
4211 case Iop_CmpEQ32Fx2
:
4212 case Iop_CmpGT32Fx2
:
4213 case Iop_CmpGE32Fx2
:
4215 case Iop_RecipStep32Fx2
:
4216 case Iop_RSqrtStep32Fx2
:
4217 return binary32Fx2(mce
, vatom1
, vatom2
);
4224 case Iop_CmpLT32F0x4
:
4225 case Iop_CmpLE32F0x4
:
4226 case Iop_CmpEQ32F0x4
:
4227 case Iop_CmpUN32F0x4
:
4229 return binary32F0x4(mce
, vatom1
, vatom2
);
4231 case Iop_QShlNsatSU8x16
:
4232 case Iop_QShlNsatUU8x16
:
4233 case Iop_QShlNsatSS8x16
:
4234 complainIfUndefined(mce
, atom2
, NULL
);
4235 return mkPCast8x16(mce
, vatom1
);
4237 case Iop_QShlNsatSU16x8
:
4238 case Iop_QShlNsatUU16x8
:
4239 case Iop_QShlNsatSS16x8
:
4240 complainIfUndefined(mce
, atom2
, NULL
);
4241 return mkPCast16x8(mce
, vatom1
);
4243 case Iop_QShlNsatSU32x4
:
4244 case Iop_QShlNsatUU32x4
:
4245 case Iop_QShlNsatSS32x4
:
4246 complainIfUndefined(mce
, atom2
, NULL
);
4247 return mkPCast32x4(mce
, vatom1
);
4249 case Iop_QShlNsatSU64x2
:
4250 case Iop_QShlNsatUU64x2
:
4251 case Iop_QShlNsatSS64x2
:
4252 complainIfUndefined(mce
, atom2
, NULL
);
4253 return mkPCast32x4(mce
, vatom1
);
4255 /* Q-and-Qshift-by-imm-and-narrow of the form (V128, I8) -> V128.
4256 To make this simpler, do the following:
4257 * complain if the shift amount (the I8) is undefined
4258 * pcast each lane at the wide width
4259 * truncate each lane to half width
4260 * pcast the resulting 64-bit value to a single bit and use
4261 that as the least significant bit of the upper half of the
4263 case Iop_QandQShrNnarrow64Uto32Ux2
:
4264 case Iop_QandQSarNnarrow64Sto32Sx2
:
4265 case Iop_QandQSarNnarrow64Sto32Ux2
:
4266 case Iop_QandQRShrNnarrow64Uto32Ux2
:
4267 case Iop_QandQRSarNnarrow64Sto32Sx2
:
4268 case Iop_QandQRSarNnarrow64Sto32Ux2
:
4269 case Iop_QandQShrNnarrow32Uto16Ux4
:
4270 case Iop_QandQSarNnarrow32Sto16Sx4
:
4271 case Iop_QandQSarNnarrow32Sto16Ux4
:
4272 case Iop_QandQRShrNnarrow32Uto16Ux4
:
4273 case Iop_QandQRSarNnarrow32Sto16Sx4
:
4274 case Iop_QandQRSarNnarrow32Sto16Ux4
:
4275 case Iop_QandQShrNnarrow16Uto8Ux8
:
4276 case Iop_QandQSarNnarrow16Sto8Sx8
:
4277 case Iop_QandQSarNnarrow16Sto8Ux8
:
4278 case Iop_QandQRShrNnarrow16Uto8Ux8
:
4279 case Iop_QandQRSarNnarrow16Sto8Sx8
:
4280 case Iop_QandQRSarNnarrow16Sto8Ux8
:
4282 IRAtom
* (*fnPessim
) (MCEnv
*, IRAtom
*) = NULL
;
4283 IROp opNarrow
= Iop_INVALID
;
4285 case Iop_QandQShrNnarrow64Uto32Ux2
:
4286 case Iop_QandQSarNnarrow64Sto32Sx2
:
4287 case Iop_QandQSarNnarrow64Sto32Ux2
:
4288 case Iop_QandQRShrNnarrow64Uto32Ux2
:
4289 case Iop_QandQRSarNnarrow64Sto32Sx2
:
4290 case Iop_QandQRSarNnarrow64Sto32Ux2
:
4291 fnPessim
= mkPCast64x2
;
4292 opNarrow
= Iop_NarrowUn64to32x2
;
4294 case Iop_QandQShrNnarrow32Uto16Ux4
:
4295 case Iop_QandQSarNnarrow32Sto16Sx4
:
4296 case Iop_QandQSarNnarrow32Sto16Ux4
:
4297 case Iop_QandQRShrNnarrow32Uto16Ux4
:
4298 case Iop_QandQRSarNnarrow32Sto16Sx4
:
4299 case Iop_QandQRSarNnarrow32Sto16Ux4
:
4300 fnPessim
= mkPCast32x4
;
4301 opNarrow
= Iop_NarrowUn32to16x4
;
4303 case Iop_QandQShrNnarrow16Uto8Ux8
:
4304 case Iop_QandQSarNnarrow16Sto8Sx8
:
4305 case Iop_QandQSarNnarrow16Sto8Ux8
:
4306 case Iop_QandQRShrNnarrow16Uto8Ux8
:
4307 case Iop_QandQRSarNnarrow16Sto8Sx8
:
4308 case Iop_QandQRSarNnarrow16Sto8Ux8
:
4309 fnPessim
= mkPCast16x8
;
4310 opNarrow
= Iop_NarrowUn16to8x8
;
4315 complainIfUndefined(mce
, atom2
, NULL
);
4316 // Pessimised shift result
4318 = fnPessim(mce
, vatom1
);
4319 // Narrowed, pessimised shift result
4321 = assignNew('V', mce
, Ity_I64
, unop(opNarrow
, shV
));
4322 // Generates: Def--(63)--Def PCast-to-I1(narrowed)
4323 IRAtom
* qV
= mkPCastXXtoXXlsb(mce
, shVnarrowed
, Ity_I64
);
4324 // and assemble the result
4325 return assignNew('V', mce
, Ity_V128
,
4326 binop(Iop_64HLtoV128
, qV
, shVnarrowed
));
4331 case Iop_QDMull32Sx2
:
4332 return vectorWidenI64(mce
, Iop_Widen32Sto64x2
,
4333 mkUifU64(mce
, vatom1
, vatom2
));
4337 case Iop_QDMull16Sx4
:
4338 return vectorWidenI64(mce
, Iop_Widen16Sto32x4
,
4339 mkUifU64(mce
, vatom1
, vatom2
));
4343 case Iop_PolynomialMull8x8
:
4344 return vectorWidenI64(mce
, Iop_Widen8Sto16x8
,
4345 mkUifU64(mce
, vatom1
, vatom2
));
4348 return mkPCast32x4(mce
,
4349 assignNew('V', mce
, Ity_V128
, binop(op
, mkPCast32x4(mce
, vatom1
),
4350 mkPCast32x4(mce
, vatom2
))));
4353 return mkPCast16x8(mce
,
4354 assignNew('V', mce
, Ity_V128
, binop(op
, mkPCast16x8(mce
, vatom1
),
4355 mkPCast16x8(mce
, vatom2
))));
4358 return mkPCast8x16(mce
,
4359 assignNew('V', mce
, Ity_V128
, binop(op
, mkPCast8x16(mce
, vatom1
),
4360 mkPCast8x16(mce
, vatom2
))));
4362 /* V128-bit data-steering */
4363 case Iop_SetV128lo32
:
4364 case Iop_SetV128lo64
:
4365 case Iop_64HLtoV128
:
4366 case Iop_InterleaveLO64x2
:
4367 case Iop_InterleaveLO32x4
:
4368 case Iop_InterleaveLO16x8
:
4369 case Iop_InterleaveLO8x16
:
4370 case Iop_InterleaveHI64x2
:
4371 case Iop_InterleaveHI32x4
:
4372 case Iop_InterleaveHI16x8
:
4373 case Iop_InterleaveHI8x16
:
4374 case Iop_CatOddLanes8x16
:
4375 case Iop_CatOddLanes16x8
:
4376 case Iop_CatOddLanes32x4
:
4377 case Iop_CatEvenLanes8x16
:
4378 case Iop_CatEvenLanes16x8
:
4379 case Iop_CatEvenLanes32x4
:
4380 case Iop_InterleaveOddLanes8x16
:
4381 case Iop_InterleaveOddLanes16x8
:
4382 case Iop_InterleaveOddLanes32x4
:
4383 case Iop_InterleaveEvenLanes8x16
:
4384 case Iop_InterleaveEvenLanes16x8
:
4385 case Iop_InterleaveEvenLanes32x4
:
4386 case Iop_PackOddLanes8x16
:
4387 case Iop_PackOddLanes16x8
:
4388 case Iop_PackOddLanes32x4
:
4389 case Iop_PackEvenLanes8x16
:
4390 case Iop_PackEvenLanes16x8
:
4391 case Iop_PackEvenLanes32x4
:
4392 return assignNew('V', mce
, Ity_V128
, binop(op
, vatom1
, vatom2
));
4394 case Iop_GetElem8x16
:
4395 complainIfUndefined(mce
, atom2
, NULL
);
4396 return assignNew('V', mce
, Ity_I8
, binop(op
, vatom1
, atom2
));
4397 case Iop_GetElem16x8
:
4398 complainIfUndefined(mce
, atom2
, NULL
);
4399 return assignNew('V', mce
, Ity_I16
, binop(op
, vatom1
, atom2
));
4400 case Iop_GetElem32x4
:
4401 complainIfUndefined(mce
, atom2
, NULL
);
4402 return assignNew('V', mce
, Ity_I32
, binop(op
, vatom1
, atom2
));
4403 case Iop_GetElem64x2
:
4404 complainIfUndefined(mce
, atom2
, NULL
);
4405 return assignNew('V', mce
, Ity_I64
, binop(op
, vatom1
, atom2
));
4407 /* Perm8x16: rearrange values in left arg using steering values
4408 from right arg. So rearrange the vbits in the same way but
4409 pessimise wrt steering values. Perm32x4 ditto. */
4410 /* PermOrZero8x16: see comments above for PermOrZero8x8. */
4412 case Iop_PermOrZero8x16
:
4415 assignNew('V', mce
, Ity_V128
, binop(op
, vatom1
, atom2
)),
4416 mkPCast8x16(mce
, vatom2
)
4421 assignNew('V', mce
, Ity_V128
, binop(op
, vatom1
, atom2
)),
4422 mkPCast32x4(mce
, vatom2
)
4425 /* These two take the lower half of each 16-bit lane, sign/zero
4426 extend it to 32, and multiply together, producing a 32x4
4427 result (and implicitly ignoring half the operand bits). So
4428 treat it as a bunch of independent 16x8 operations, but then
4429 do 32-bit shifts left-right to copy the lower half results
4430 (which are all 0s or all 1s due to PCasting in binary16Ix8)
4431 into the upper half of each result lane. */
4432 case Iop_MullEven16Ux8
:
4433 case Iop_MullEven16Sx8
: {
4435 at
= binary16Ix8(mce
,vatom1
,vatom2
);
4436 at
= assignNew('V', mce
, Ity_V128
, binop(Iop_ShlN32x4
, at
, mkU8(16)));
4437 at
= assignNew('V', mce
, Ity_V128
, binop(Iop_SarN32x4
, at
, mkU8(16)));
4441 /* Same deal as Iop_MullEven16{S,U}x8 */
4442 case Iop_MullEven8Ux16
:
4443 case Iop_MullEven8Sx16
: {
4445 at
= binary8Ix16(mce
,vatom1
,vatom2
);
4446 at
= assignNew('V', mce
, Ity_V128
, binop(Iop_ShlN16x8
, at
, mkU8(8)));
4447 at
= assignNew('V', mce
, Ity_V128
, binop(Iop_SarN16x8
, at
, mkU8(8)));
4451 /* Same deal as Iop_MullEven16{S,U}x8 */
4452 case Iop_MullEven32Ux4
:
4453 case Iop_MullEven32Sx4
: {
4455 at
= binary32Ix4(mce
,vatom1
,vatom2
);
4456 at
= assignNew('V', mce
, Ity_V128
, binop(Iop_ShlN64x2
, at
, mkU8(32)));
4457 at
= assignNew('V', mce
, Ity_V128
, binop(Iop_SarN64x2
, at
, mkU8(32)));
4461 /* narrow 2xV128 into 1xV128, hi half from left arg, in a 2 x
4462 32x4 -> 16x8 laneage, discarding the upper half of each lane.
4463 Simply apply same op to the V bits, since this really no more
4464 than a data steering operation. */
4465 case Iop_NarrowBin32to16x8
:
4466 case Iop_NarrowBin16to8x16
:
4467 case Iop_NarrowBin64to32x4
:
4468 return assignNew('V', mce
, Ity_V128
,
4469 binop(op
, vatom1
, vatom2
));
4474 case Iop_I128StoBCD128
:
4475 /* Same scheme as with all other shifts. Note: 10 Nov 05:
4476 this is wrong now, scalar shifts are done properly lazily.
4477 Vector shifts should be fixed too. */
4478 complainIfUndefined(mce
, atom2
, NULL
);
4479 return assignNew('V', mce
, Ity_V128
, binop(op
, vatom1
, atom2
));
4481 case Iop_I128UtoF128
: /* I128 -> F128 */
4482 case Iop_I128StoF128
: /* I128 -> F128 */
4483 return mkLazy2(mce
, Ity_I128
, vatom1
, vatom2
);
4487 return mkLazy2(mce
, Ity_V128
, vatom1
, vatom2
);
4492 complainIfUndefined(mce
, atom2
, NULL
);
4493 return assignNew('V', mce
, Ity_V128
, binop(op
, vatom1
, atom2
));
4495 /* I128-bit data-steering */
4497 return assignNew('V', mce
, Ity_I128
, binop(op
, vatom1
, vatom2
));
4503 return binary64Fx4(mce
, vatom1
, vatom2
);
4507 return binary32Fx8(mce
, vatom1
, vatom2
);
4509 /* V256-bit data-steering */
4510 case Iop_V128HLtoV256
:
4511 return assignNew('V', mce
, Ity_V256
, binop(op
, vatom1
, vatom2
));
4513 /* Scalar floating point */
4517 /* I32(rm) x F32 -> I64 */
4518 return mkLazy2(mce
, Ity_I64
, vatom1
, vatom2
);
4521 /* I32(rm) x I64 -> F32 */
4522 return mkLazy2(mce
, Ity_I32
, vatom1
, vatom2
);
4524 case Iop_RoundF64toInt
:
4525 case Iop_RoundF64toF32
:
4535 case Iop_RecpExpF64
:
4536 /* I32(rm) x I64/F64 -> I64/F64 */
4537 return mkLazy2(mce
, Ity_I64
, vatom1
, vatom2
);
4541 case Iop_RoundD64toInt
:
4542 /* I32(rm) x D64 -> D64 */
4543 return mkLazy2(mce
, Ity_I64
, vatom1
, vatom2
);
4547 case Iop_RoundD128toInt
:
4548 /* I32(rm) x D128 -> D128 */
4549 return mkLazy2(mce
, Ity_I128
, vatom1
, vatom2
);
4551 case Iop_RoundF128toInt
:
4552 /* I32(rm) x F128 -> F128 */
4553 return mkLazy2(mce
, Ity_I128
, vatom1
, vatom2
);
4559 /* I32(rm) x I64/D64 -> D64/I64 */
4560 return mkLazy2(mce
, Ity_I64
, vatom1
, vatom2
);
4568 /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D32/F32 */
4569 return mkLazy2(mce
, Ity_I32
, vatom1
, vatom2
);
4577 /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D64/F64 */
4578 return mkLazy2(mce
, Ity_I64
, vatom1
, vatom2
);
4582 case Iop_F128toD128
:
4585 case Iop_D128toF128
:
4586 case Iop_I128StoD128
:
4587 /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D128/F128 */
4588 return mkLazy2(mce
, Ity_I128
, vatom1
, vatom2
);
4591 /* I32(rm) x F16 -> F16 */
4592 return mkLazy2(mce
, Ity_I16
, vatom1
, vatom2
);
4594 case Iop_RoundF32toInt
:
4596 case Iop_RecpExpF32
:
4597 /* I32(rm) x I32/F32 -> I32/F32 */
4598 return mkLazy2(mce
, Ity_I32
, vatom1
, vatom2
);
4601 /* I32(rm) x F128 -> F128 */
4602 return mkLazy2(mce
, Ity_I128
, vatom1
, vatom2
);
4608 /* First arg is I32 (rounding mode), second is F32/I32 (data). */
4609 return mkLazy2(mce
, Ity_I32
, vatom1
, vatom2
);
4613 /* First arg is I32 (rounding mode), second is F64/F32 (data). */
4614 return mkLazy2(mce
, Ity_I16
, vatom1
, vatom2
);
4616 case Iop_F128toI32S
: /* IRRoundingMode(I32) x F128 -> signed I32 */
4617 case Iop_F128toI32U
: /* IRRoundingMode(I32) x F128 -> unsigned I32 */
4618 case Iop_F128toF32
: /* IRRoundingMode(I32) x F128 -> F32 */
4619 case Iop_D128toI32S
: /* IRRoundingMode(I32) x D128 -> signed I32 */
4620 case Iop_D128toI32U
: /* IRRoundingMode(I32) x D128 -> unsigned I32 */
4621 return mkLazy2(mce
, Ity_I32
, vatom1
, vatom2
);
4623 case Iop_F128toI128S
: /* IRRoundingMode(I32) x F128 -> signed I128 */
4624 case Iop_RndF128
: /* IRRoundingMode(I32) x F128 -> F128 */
4625 case Iop_D128toI128S
: /* IRRoundingMode(I32) x D128 -> signed I128 */
4626 return mkLazy2(mce
, Ity_I128
, vatom1
, vatom2
);
4628 case Iop_F128toI64S
: /* IRRoundingMode(I32) x F128 -> signed I64 */
4629 case Iop_F128toI64U
: /* IRRoundingMode(I32) x F128 -> unsigned I64 */
4630 case Iop_F128toF64
: /* IRRoundingMode(I32) x F128 -> F64 */
4631 case Iop_D128toD64
: /* IRRoundingMode(I64) x D128 -> D64 */
4632 case Iop_D128toI64S
: /* IRRoundingMode(I64) x D128 -> signed I64 */
4633 case Iop_D128toI64U
: /* IRRoundingMode(I32) x D128 -> unsigned I64 */
4634 return mkLazy2(mce
, Ity_I64
, vatom1
, vatom2
);
4636 case Iop_F64HLtoF128
:
4637 case Iop_D64HLtoD128
:
4638 return assignNew('V', mce
, Ity_I128
,
4639 binop(Iop_64HLto128
, vatom1
, vatom2
));
4647 /* First arg is I32 (rounding mode), second is F64/D64 (data). */
4648 return mkLazy2(mce
, Ity_I32
, vatom1
, vatom2
);
4651 /* First arg is I32 (rounding mode), second is D64 (data). */
4652 return mkLazy2(mce
, Ity_I32
, vatom1
, vatom2
);
4655 /* First arg is I32 (rounding mode), second is F64 (data). */
4656 return mkLazy2(mce
, Ity_I16
, vatom1
, vatom2
);
4658 case Iop_InsertExpD64
:
4659 /* I64 x I64 -> D64 */
4660 return mkLazy2(mce
, Ity_I64
, vatom1
, vatom2
);
4662 case Iop_InsertExpD128
:
4663 /* I64 x I128 -> D128 */
4664 return mkLazy2(mce
, Ity_I128
, vatom1
, vatom2
);
4673 case Iop_CmpExpD128
:
4674 return mkLazy2(mce
, Ity_I32
, vatom1
, vatom2
);
4678 /* F32 x F32 -> F32 */
4679 return mkLazy2(mce
, Ity_I32
, vatom1
, vatom2
);
4683 /* F64 x F64 -> F64 */
4684 return mkLazy2(mce
, Ity_I64
, vatom1
, vatom2
);
4686 /* non-FP after here */
4688 case Iop_DivModU64to32
:
4689 case Iop_DivModS64to32
:
4690 return mkLazy2(mce
, Ity_I64
, vatom1
, vatom2
);
4692 case Iop_DivModU128to64
:
4693 case Iop_DivModS128to64
:
4694 return mkLazy2(mce
, Ity_I128
, vatom1
, vatom2
);
4697 return assignNew('V', mce
, Ity_I16
, binop(op
, vatom1
, vatom2
));
4699 return assignNew('V', mce
, Ity_I32
, binop(op
, vatom1
, vatom2
));
4701 return assignNew('V', mce
, Ity_I64
, binop(op
, vatom1
, vatom2
));
4703 case Iop_DivModU64to64
:
4704 case Iop_DivModS64to64
: {
4705 IRAtom
* vTmp64
= mkLazy2(mce
, Ity_I64
, vatom1
, vatom2
);
4706 return assignNew('V', mce
, Ity_I128
,
4707 binop(Iop_64HLto128
, vTmp64
, vTmp64
));
4712 IRAtom
* vLo64
= mkLeft64(mce
, mkUifU64(mce
, vatom1
,vatom2
));
4713 IRAtom
* vHi64
= mkPCastTo(mce
, Ity_I64
, vLo64
);
4714 return assignNew('V', mce
, Ity_I128
,
4715 binop(Iop_64HLto128
, vHi64
, vLo64
));
4718 case Iop_DivModU32to32
:
4719 case Iop_DivModS32to32
: {
4720 IRAtom
* vTmp32
= mkLazy2(mce
, Ity_I32
, vatom1
, vatom2
);
4721 return assignNew('V', mce
, Ity_I64
,
4722 binop(Iop_32HLto64
, vTmp32
, vTmp32
));
4727 IRAtom
* vLo32
= mkLeft32(mce
, mkUifU32(mce
, vatom1
,vatom2
));
4728 IRAtom
* vHi32
= mkPCastTo(mce
, Ity_I32
, vLo32
);
4729 return assignNew('V', mce
, Ity_I64
,
4730 binop(Iop_32HLto64
, vHi32
, vLo32
));
4735 IRAtom
* vLo16
= mkLeft16(mce
, mkUifU16(mce
, vatom1
,vatom2
));
4736 IRAtom
* vHi16
= mkPCastTo(mce
, Ity_I16
, vLo16
);
4737 return assignNew('V', mce
, Ity_I32
,
4738 binop(Iop_16HLto32
, vHi16
, vLo16
));
4743 IRAtom
* vLo8
= mkLeft8(mce
, mkUifU8(mce
, vatom1
,vatom2
));
4744 IRAtom
* vHi8
= mkPCastTo(mce
, Ity_I8
, vLo8
);
4745 return assignNew('V', mce
, Ity_I16
, binop(Iop_8HLto16
, vHi8
, vLo8
));
4748 case Iop_Sad8Ux4
: /* maybe we could do better? ftm, do mkLazy2. */
4753 case Iop_QAdd32S
: /* could probably do better */
4754 case Iop_QSub32S
: /* could probably do better */
4755 return mkLazy2(mce
, Ity_I32
, vatom1
, vatom2
);
4761 return mkLazy2(mce
, Ity_I64
, vatom1
, vatom2
);
4764 if (mce
->dlbo
.dl_Add32
== DLexpensive
4765 || (mce
->dlbo
.dl_Add32
== DLauto
&& hu
== HuOth
)) {
4766 return expensiveAddSub(mce
,True
,Ity_I32
,
4767 vatom1
,vatom2
, atom1
,atom2
);
4769 goto cheap_AddSub32
;
4772 if (mce
->dlbo
.dl_Sub32
== DLexpensive
4773 || (mce
->dlbo
.dl_Sub32
== DLauto
&& hu
== HuOth
)) {
4774 return expensiveAddSub(mce
,False
,Ity_I32
,
4775 vatom1
,vatom2
, atom1
,atom2
);
4777 goto cheap_AddSub32
;
4782 return mkLeft32(mce
, mkUifU32(mce
, vatom1
,vatom2
));
4788 return doCmpORD(mce
, op
, vatom1
,vatom2
, atom1
,atom2
);
4791 if (mce
->dlbo
.dl_Add64
== DLexpensive
4792 || (mce
->dlbo
.dl_Add64
== DLauto
&& hu
== HuOth
)) {
4793 return expensiveAddSub(mce
,True
,Ity_I64
,
4794 vatom1
,vatom2
, atom1
,atom2
);
4796 goto cheap_AddSub64
;
4799 if (mce
->dlbo
.dl_Sub64
== DLexpensive
4800 || (mce
->dlbo
.dl_Sub64
== DLauto
&& hu
== HuOth
)) {
4801 return expensiveAddSub(mce
,False
,Ity_I64
,
4802 vatom1
,vatom2
, atom1
,atom2
);
4804 goto cheap_AddSub64
;
4809 return mkLeft64(mce
, mkUifU64(mce
, vatom1
,vatom2
));
4814 return mkLeft16(mce
, mkUifU16(mce
, vatom1
,vatom2
));
4819 return mkLeft8(mce
, mkUifU8(mce
, vatom1
,vatom2
));
4822 case Iop_CmpEQ64
: case Iop_CmpNE64
:
4823 if (mce
->dlbo
.dl_CmpEQ64_CmpNE64
== DLexpensive
)
4824 goto expensive_cmp64
;
4829 case Iop_ExpCmpNE64
:
4830 return expensiveCmpEQorNE(mce
,Ity_I64
, vatom1
,vatom2
, atom1
,atom2
);
4833 case Iop_CmpLE64S
: case Iop_CmpLE64U
:
4834 case Iop_CmpLT64U
: case Iop_CmpLT64S
:
4835 return mkPCastTo(mce
, Ity_I1
, mkUifU64(mce
, vatom1
,vatom2
));
4838 case Iop_CmpEQ32
: case Iop_CmpNE32
:
4839 if (mce
->dlbo
.dl_CmpEQ32_CmpNE32
== DLexpensive
)
4840 goto expensive_cmp32
;
4845 case Iop_ExpCmpNE32
:
4846 return expensiveCmpEQorNE(mce
,Ity_I32
, vatom1
,vatom2
, atom1
,atom2
);
4849 case Iop_CmpLE32S
: case Iop_CmpLE32U
:
4850 case Iop_CmpLT32U
: case Iop_CmpLT32S
:
4851 return mkPCastTo(mce
, Ity_I1
, mkUifU32(mce
, vatom1
,vatom2
));
4854 case Iop_CmpEQ16
: case Iop_CmpNE16
:
4855 if (mce
->dlbo
.dl_CmpEQ16_CmpNE16
== DLexpensive
)
4856 goto expensive_cmp16
;
4861 case Iop_ExpCmpNE16
:
4862 return expensiveCmpEQorNE(mce
,Ity_I16
, vatom1
,vatom2
, atom1
,atom2
);
4865 return mkPCastTo(mce
, Ity_I1
, mkUifU16(mce
, vatom1
,vatom2
));
4868 case Iop_CmpEQ8
: case Iop_CmpNE8
:
4869 if (mce
->dlbo
.dl_CmpEQ8_CmpNE8
== DLexpensive
)
4870 goto expensive_cmp8
;
4875 return expensiveCmpEQorNE(mce
,Ity_I8
, vatom1
,vatom2
, atom1
,atom2
);
4878 return mkPCastTo(mce
, Ity_I1
, mkUifU8(mce
, vatom1
,vatom2
));
4880 ////---- end CmpXX{64,32,16,8}
4882 case Iop_CasCmpEQ8
: case Iop_CasCmpNE8
:
4883 case Iop_CasCmpEQ16
: case Iop_CasCmpNE16
:
4884 case Iop_CasCmpEQ32
: case Iop_CasCmpNE32
:
4885 case Iop_CasCmpEQ64
: case Iop_CasCmpNE64
:
4886 /* Just say these all produce a defined result, regardless
4887 of their arguments. See COMMENT_ON_CasCmpEQ in this file. */
4888 return assignNew('V', mce
, Ity_I1
, definedOfType(Ity_I1
));
4890 case Iop_Shl64
: case Iop_Shr64
: case Iop_Sar64
:
4891 return scalarShift( mce
, Ity_I64
, op
, vatom1
,vatom2
, atom1
,atom2
);
4893 case Iop_Shl32
: case Iop_Shr32
: case Iop_Sar32
:
4894 return scalarShift( mce
, Ity_I32
, op
, vatom1
,vatom2
, atom1
,atom2
);
4896 case Iop_Shl16
: case Iop_Shr16
: case Iop_Sar16
:
4897 return scalarShift( mce
, Ity_I16
, op
, vatom1
,vatom2
, atom1
,atom2
);
4899 case Iop_Shl8
: case Iop_Shr8
: case Iop_Sar8
:
4900 return scalarShift( mce
, Ity_I8
, op
, vatom1
,vatom2
, atom1
,atom2
);
4903 uifu
= mkUifUV256
; difd
= mkDifDV256
;
4904 and_or_ty
= Ity_V256
; improve
= mkImproveANDV256
; goto do_And_Or
;
4906 uifu
= mkUifUV128
; difd
= mkDifDV128
;
4907 and_or_ty
= Ity_V128
; improve
= mkImproveANDV128
; goto do_And_Or
;
4909 uifu
= mkUifU64
; difd
= mkDifD64
;
4910 and_or_ty
= Ity_I64
; improve
= mkImproveAND64
; goto do_And_Or
;
4912 uifu
= mkUifU32
; difd
= mkDifD32
;
4913 and_or_ty
= Ity_I32
; improve
= mkImproveAND32
; goto do_And_Or
;
4915 uifu
= mkUifU16
; difd
= mkDifD16
;
4916 and_or_ty
= Ity_I16
; improve
= mkImproveAND16
; goto do_And_Or
;
4918 uifu
= mkUifU8
; difd
= mkDifD8
;
4919 and_or_ty
= Ity_I8
; improve
= mkImproveAND8
; goto do_And_Or
;
4921 uifu
= mkUifU1
; difd
= mkDifD1
;
4922 and_or_ty
= Ity_I1
; improve
= mkImproveAND1
; goto do_And_Or
;
4925 uifu
= mkUifUV256
; difd
= mkDifDV256
;
4926 and_or_ty
= Ity_V256
; improve
= mkImproveORV256
; goto do_And_Or
;
4928 uifu
= mkUifUV128
; difd
= mkDifDV128
;
4929 and_or_ty
= Ity_V128
; improve
= mkImproveORV128
; goto do_And_Or
;
4931 uifu
= mkUifU64
; difd
= mkDifD64
;
4932 and_or_ty
= Ity_I64
; improve
= mkImproveOR64
; goto do_And_Or
;
4934 uifu
= mkUifU32
; difd
= mkDifD32
;
4935 and_or_ty
= Ity_I32
; improve
= mkImproveOR32
; goto do_And_Or
;
4937 uifu
= mkUifU16
; difd
= mkDifD16
;
4938 and_or_ty
= Ity_I16
; improve
= mkImproveOR16
; goto do_And_Or
;
4940 uifu
= mkUifU8
; difd
= mkDifD8
;
4941 and_or_ty
= Ity_I8
; improve
= mkImproveOR8
; goto do_And_Or
;
4943 uifu
= mkUifU1
; difd
= mkDifD1
;
4944 and_or_ty
= Ity_I1
; improve
= mkImproveOR1
; goto do_And_Or
;
4947 return assignNew('V', mce
, and_or_ty
,
4948 difd(mce
, uifu(mce
, vatom1
, vatom2
),
4949 difd(mce
, improve(mce
, atom1
, vatom1
),
4950 improve(mce
, atom2
, vatom2
) ) ) );
4953 return mkUifU8(mce
, vatom1
, vatom2
);
4955 return mkUifU16(mce
, vatom1
, vatom2
);
4957 return mkUifU32(mce
, vatom1
, vatom2
);
4959 return mkUifU64(mce
, vatom1
, vatom2
);
4961 return mkUifUV128(mce
, vatom1
, vatom2
);
4963 return mkUifUV256(mce
, vatom1
, vatom2
);
4975 /* Same scheme as with all other shifts. Note: 22 Oct 05:
4976 this is wrong now, scalar shifts are done properly lazily.
4977 Vector shifts should be fixed too. */
4978 complainIfUndefined(mce
, atom2
, NULL
);
4979 return assignNew('V', mce
, Ity_V256
, binop(op
, vatom1
, atom2
));
4988 case Iop_CmpGT8Sx32
:
4994 return binary8Ix32(mce
, vatom1
, vatom2
);
4996 case Iop_QSub16Ux16
:
4997 case Iop_QSub16Sx16
:
5000 case Iop_MulHi16Sx16
:
5001 case Iop_MulHi16Ux16
:
5006 case Iop_CmpGT16Sx16
:
5007 case Iop_CmpEQ16x16
:
5009 case Iop_QAdd16Ux16
:
5010 case Iop_QAdd16Sx16
:
5012 return binary16Ix16(mce
, vatom1
, vatom2
);
5015 case Iop_CmpGT32Sx8
:
5023 return binary32Ix8(mce
, vatom1
, vatom2
);
5028 case Iop_CmpGT64Sx4
:
5029 return binary64Ix4(mce
, vatom1
, vatom2
);
5031 case Iop_I32StoF32x8
:
5032 case Iop_F32toI32Sx8
:
5033 return unary32Fx8_w_rm(mce
, vatom1
, vatom2
);
5035 /* Perm32x8: rearrange values in left arg using steering values
5036 from right arg. So rearrange the vbits in the same way but
5037 pessimise wrt steering values. */
5041 assignNew('V', mce
, Ity_V256
, binop(op
, vatom1
, atom2
)),
5042 mkPCast32x8(mce
, vatom2
)
5045 /* Q-and-Qshift-by-vector of the form (V128, V128) -> V256.
5046 Handle the shifted results in the same way that other
5047 binary Q ops are handled, eg QSub: UifU the two args,
5048 then pessimise -- which is binaryNIxM. But for the upper
5049 V128, we require to generate just 1 bit which is the
5050 pessimised shift result, with 127 defined zeroes above it.
5052 Note that this overly pessimistic in that in fact only the
5053 bottom 8 bits of each lane of the second arg determine the shift
5054 amount. Really we ought to ignore any undefinedness in the
5055 rest of the lanes of the second arg. */
5056 case Iop_QandSQsh64x2
: case Iop_QandUQsh64x2
:
5057 case Iop_QandSQRsh64x2
: case Iop_QandUQRsh64x2
:
5058 case Iop_QandSQsh32x4
: case Iop_QandUQsh32x4
:
5059 case Iop_QandSQRsh32x4
: case Iop_QandUQRsh32x4
:
5060 case Iop_QandSQsh16x8
: case Iop_QandUQsh16x8
:
5061 case Iop_QandSQRsh16x8
: case Iop_QandUQRsh16x8
:
5062 case Iop_QandSQsh8x16
: case Iop_QandUQsh8x16
:
5063 case Iop_QandSQRsh8x16
: case Iop_QandUQRsh8x16
:
5065 // The function to generate the pessimised shift result
5066 IRAtom
* (*binaryNIxM
)(MCEnv
*,IRAtom
*,IRAtom
*) = NULL
;
5068 case Iop_QandSQsh64x2
:
5069 case Iop_QandUQsh64x2
:
5070 case Iop_QandSQRsh64x2
:
5071 case Iop_QandUQRsh64x2
:
5072 binaryNIxM
= binary64Ix2
;
5074 case Iop_QandSQsh32x4
:
5075 case Iop_QandUQsh32x4
:
5076 case Iop_QandSQRsh32x4
:
5077 case Iop_QandUQRsh32x4
:
5078 binaryNIxM
= binary32Ix4
;
5080 case Iop_QandSQsh16x8
:
5081 case Iop_QandUQsh16x8
:
5082 case Iop_QandSQRsh16x8
:
5083 case Iop_QandUQRsh16x8
:
5084 binaryNIxM
= binary16Ix8
;
5086 case Iop_QandSQsh8x16
:
5087 case Iop_QandUQsh8x16
:
5088 case Iop_QandSQRsh8x16
:
5089 case Iop_QandUQRsh8x16
:
5090 binaryNIxM
= binary8Ix16
;
5095 tl_assert(binaryNIxM
);
5096 // Pessimised shift result, shV[127:0]
5097 IRAtom
* shV
= binaryNIxM(mce
, vatom1
, vatom2
);
5098 // Generates: Def--(127)--Def PCast-to-I1(shV)
5099 IRAtom
* qV
= mkPCastXXtoXXlsb(mce
, shV
, Ity_V128
);
5100 // and assemble the result
5101 return assignNew('V', mce
, Ity_V256
,
5102 binop(Iop_V128HLtoV256
, qV
, shV
));
5105 case Iop_F32toF16x4
: {
5106 // First, PCast the input vector, retaining the 32x4 format.
5107 IRAtom
* pcasted
= mkPCast32x4(mce
, vatom2
); // :: 32x4
5108 // Now truncate each 32 bit lane to 16 bits. Since we already PCasted
5109 // the input, we're not going to lose any information.
5111 = assignNew('V', mce
, Ity_I64
, unop(Iop_V128HIto64
, pcasted
));//32x2
5113 = assignNew('V', mce
, Ity_I64
, unop(Iop_V128to64
, pcasted
)); // 32x2
5115 = assignNew('V', mce
, Ity_I64
, binop(Iop_NarrowBin32to16x4
,
5116 pcHI64
, pcLO64
)); // 16x4
5117 // Finally, roll in any badness from the rounding mode.
5118 IRAtom
* rmPCasted
= mkPCastTo(mce
, Ity_I64
, vatom1
);
5119 return mkUifU64(mce
, narrowed
, rmPCasted
);
5122 case Iop_F32toF16x8
: {
5123 // Same scheme as for Iop_F32toF16x4.
5124 IRAtom
* pcasted
= mkPCast32x8(mce
, vatom2
); // :: 32x8
5126 = assignNew('V', mce
, Ity_V128
, unop(Iop_V256toV128_1
,
5129 = assignNew('V', mce
, Ity_V128
, unop(Iop_V256toV128_0
,
5132 = assignNew('V', mce
, Ity_V128
, binop(Iop_NarrowBin32to16x8
,
5133 pcHI128
, pcLO128
)); // 16x8
5134 // Finally, roll in any badness from the rounding mode.
5135 IRAtom
* rmPCasted
= mkPCastTo(mce
, Ity_V128
, vatom1
);
5136 return mkUifUV128(mce
, narrowed
, rmPCasted
);
5141 VG_(tool_panic
)("memcheck:expr2vbits_Binop");
5147 IRExpr
* expr2vbits_Unop ( MCEnv
* mce
, IROp op
, IRAtom
* atom
)
5149 /* For the widening operations {8,16,32}{U,S}to{16,32,64}, the
5150 selection of shadow operation implicitly duplicates the logic in
5151 do_shadow_LoadG and should be kept in sync (in the very unlikely
5152 event that the interpretation of such widening ops changes in
5153 future). See comment in do_shadow_LoadG. */
5154 IRAtom
* vatom
= expr2vbits( mce
, atom
, HuOth
);
5155 tl_assert(isOriginalAtom(mce
,atom
));
5160 case Iop_RSqrtEst64Fx2
:
5161 case Iop_RecipEst64Fx2
:
5162 case Iop_Log2_64Fx2
:
5163 return unary64Fx2(mce
, vatom
);
5165 case Iop_Sqrt64F0x2
:
5166 return unary64F0x2(mce
, vatom
);
5169 case Iop_RSqrtEst32Fx8
:
5170 case Iop_RecipEst32Fx8
:
5171 return unary32Fx8(mce
, vatom
);
5174 return unary64Fx4(mce
, vatom
);
5176 case Iop_RecipEst32Fx4
:
5177 case Iop_I32UtoF32x4_DEP
:
5178 case Iop_I32StoF32x4_DEP
:
5179 case Iop_QF32toI32Ux4_RZ
:
5180 case Iop_QF32toI32Sx4_RZ
:
5181 case Iop_RoundF32x4_RM
:
5182 case Iop_RoundF32x4_RP
:
5183 case Iop_RoundF32x4_RN
:
5184 case Iop_RoundF32x4_RZ
:
5185 case Iop_RecipEst32Ux4
:
5188 case Iop_RSqrtEst32Fx4
:
5189 case Iop_Log2_32Fx4
:
5190 case Iop_Exp2_32Fx4
:
5191 return unary32Fx4(mce
, vatom
);
5193 case Iop_I32UtoF32x2_DEP
:
5194 case Iop_I32StoF32x2_DEP
:
5195 case Iop_RecipEst32Fx2
:
5196 case Iop_RecipEst32Ux2
:
5199 case Iop_RSqrtEst32Fx2
:
5200 return unary32Fx2(mce
, vatom
);
5202 case Iop_Sqrt32F0x4
:
5203 case Iop_RSqrtEst32F0x4
:
5204 case Iop_RecipEst32F0x4
:
5205 return unary32F0x4(mce
, vatom
);
5209 return unary16Fx8(mce
, vatom
);
5211 // These are self-shadowing.
5217 case Iop_Reverse1sIn8_x16
:
5218 case Iop_Reverse8sIn16_x8
:
5219 case Iop_Reverse8sIn32_x4
:
5220 case Iop_Reverse16sIn32_x4
:
5221 case Iop_Reverse8sIn64_x2
:
5222 case Iop_Reverse16sIn64_x2
:
5223 case Iop_Reverse32sIn64_x2
:
5224 case Iop_V256toV128_1
: case Iop_V256toV128_0
:
5225 case Iop_ZeroHI64ofV128
:
5226 case Iop_ZeroHI96ofV128
:
5227 case Iop_ZeroHI112ofV128
:
5228 case Iop_ZeroHI120ofV128
:
5229 case Iop_ReinterpI128asV128
: /* I128 -> V128 */
5230 return assignNew('V', mce
, Ity_V128
, unop(op
, vatom
));
5232 case Iop_F128HItoF64
: /* F128 -> high half of F128 */
5233 case Iop_D128HItoD64
: /* D128 -> high half of D128 */
5234 return assignNew('V', mce
, Ity_I64
, unop(Iop_128HIto64
, vatom
));
5236 case Iop_F128LOtoF64
: /* F128 -> low half of F128 */
5237 case Iop_D128LOtoD64
: /* D128 -> low half of D128 */
5238 return assignNew('V', mce
, Ity_I64
, unop(Iop_128to64
, vatom
));
5243 case Iop_TruncF128toI128S
: /* F128 -> I128S */
5244 case Iop_TruncF128toI128U
: /* F128 -> I128U */
5245 case Iop_ReinterpV128asI128
: /* V128 -> I128 */
5246 case Iop_ReinterpI128asF128
:
5247 case Iop_ReinterpF128asI128
:
5248 return mkPCastTo(mce
, Ity_I128
, vatom
);
5250 case Iop_BCD128toI128S
:
5251 case Iop_MulI128by10
:
5252 case Iop_MulI128by10Carry
:
5253 case Iop_F16toF64x2
:
5254 case Iop_F64toF16x2_DEP
:
5255 // FIXME JRS 2018-Nov-15. This is surely not correct!
5258 case Iop_ReinterpI32asF32
:
5259 case Iop_ReinterpF32asI32
:
5260 return assignNew('V', mce
, Ity_I32
, vatom
);
5262 case Iop_ReinterpF64asI64
:
5263 case Iop_ReinterpI64asF64
:
5264 case Iop_ReinterpI64asD64
:
5265 case Iop_ReinterpD64asI64
:
5266 return assignNew('V', mce
, Ity_I64
, vatom
);
5268 case Iop_I32StoF128
: /* signed I32 -> F128 */
5269 case Iop_I64StoF128
: /* signed I64 -> F128 */
5270 case Iop_I32UtoF128
: /* unsigned I32 -> F128 */
5271 case Iop_I64UtoF128
: /* unsigned I64 -> F128 */
5272 case Iop_F32toF128
: /* F32 -> F128 */
5273 case Iop_F64toF128
: /* F64 -> F128 */
5274 case Iop_I32StoD128
: /* signed I64 -> D128 */
5275 case Iop_I64StoD128
: /* signed I64 -> D128 */
5276 case Iop_I32UtoD128
: /* unsigned I32 -> D128 */
5277 case Iop_I64UtoD128
: /* unsigned I64 -> D128 */
5278 return mkPCastTo(mce
, Ity_I128
, vatom
);
5286 case Iop_RSqrtEst5GoodF64
:
5287 case Iop_RoundF64toF64_NEAREST
:
5288 case Iop_RoundF64toF64_NegINF
:
5289 case Iop_RoundF64toF64_PosINF
:
5290 case Iop_RoundF64toF64_ZERO
:
5291 case Iop_RoundF64toIntA0
:
5292 case Iop_RoundF64toIntE
:
5296 case Iop_ExtractExpD64
: /* D64 -> I64 */
5297 case Iop_ExtractExpD128
: /* D128 -> I64 */
5298 case Iop_ExtractSigD64
: /* D64 -> I64 */
5299 case Iop_ExtractSigD128
: /* D128 -> I64 */
5302 return mkPCastTo(mce
, Ity_I64
, vatom
);
5305 return mkPCastTo(mce
, Ity_I128
, vatom
);
5307 case Iop_TruncF64asF32
:
5311 case Iop_RoundF32toIntA0
:
5312 case Iop_RoundF32toIntE
:
5313 return mkPCastTo(mce
, Ity_I32
, vatom
);
5317 return mkPCastTo(mce
, Ity_I16
, vatom
);
5319 case Iop_Ctz32
: case Iop_CtzNat32
:
5320 case Iop_Ctz64
: case Iop_CtzNat64
:
5321 return expensiveCountTrailingZeroes(mce
, op
, atom
, vatom
);
5323 case Iop_Clz32
: case Iop_ClzNat32
:
5324 case Iop_Clz64
: case Iop_ClzNat64
:
5325 return expensiveCountLeadingZeroes(mce
, op
, atom
, vatom
);
5327 // PopCount32: this is slightly pessimistic. It is true that the
5328 // result depends on all input bits, so that aspect of the PCast is
5329 // correct. However, regardless of the input, only the lowest 5 bits
5330 // out of the output can ever be undefined. So we could actually
5331 // "improve" the results here by marking the top 27 bits of output as
5332 // defined. A similar comment applies for PopCount64.
5333 case Iop_PopCount32
:
5334 return mkPCastTo(mce
, Ity_I32
, vatom
);
5335 case Iop_PopCount64
:
5336 return mkPCastTo(mce
, Ity_I64
, vatom
);
5338 // These are self-shadowing.
5348 case Iop_V128HIto64
:
5354 case Iop_Reverse8sIn16_x4
:
5355 case Iop_Reverse8sIn32_x2
:
5356 case Iop_Reverse16sIn32_x2
:
5357 case Iop_Reverse8sIn64_x1
:
5358 case Iop_Reverse16sIn64_x1
:
5359 case Iop_Reverse32sIn64_x1
:
5360 case Iop_V256to64_0
: case Iop_V256to64_1
:
5361 case Iop_V256to64_2
: case Iop_V256to64_3
:
5362 return assignNew('V', mce
, Ity_I64
, unop(op
, vatom
));
5364 // These are self-shadowing.
5374 case Iop_Reverse8sIn32_x1
:
5375 return assignNew('V', mce
, Ity_I32
, unop(op
, vatom
));
5377 // These are self-shadowing.
5384 case Iop_GetMSBs8x16
:
5385 return assignNew('V', mce
, Ity_I16
, unop(op
, vatom
));
5387 // These are self-shadowing.
5394 case Iop_GetMSBs8x8
:
5395 return assignNew('V', mce
, Ity_I8
, unop(op
, vatom
));
5398 return assignNew('V', mce
, Ity_I1
, unop(Iop_32to1
, vatom
));
5401 return assignNew('V', mce
, Ity_I1
, unop(Iop_64to1
, vatom
));
5410 // FIXME JRS 2018-Nov-15. This is surely not correct!
5418 return mkPCast8x8(mce
, vatom
);
5420 case Iop_CmpNEZ8x16
:
5426 return mkPCast8x16(mce
, vatom
);
5428 case Iop_CmpNEZ16x4
:
5432 return mkPCast16x4(mce
, vatom
);
5434 case Iop_CmpNEZ16x8
:
5439 return mkPCast16x8(mce
, vatom
);
5441 case Iop_CmpNEZ32x2
:
5444 case Iop_F32toI32Ux2_RZ
:
5445 case Iop_F32toI32Sx2_RZ
:
5447 return mkPCast32x2(mce
, vatom
);
5449 case Iop_CmpNEZ32x4
:
5452 case Iop_F32toI32Ux4_RZ
:
5453 case Iop_F32toI32Sx4_RZ
:
5455 case Iop_RSqrtEst32Ux4
:
5457 return mkPCast32x4(mce
, vatom
);
5459 case Iop_TruncF128toI32S
: /* F128 -> I32S (result stored in 64-bits) */
5460 case Iop_TruncF128toI32U
: /* F128 -> I32U (result stored in 64-bits) */
5462 return mkPCastTo(mce
, Ity_I32
, vatom
);
5464 case Iop_TruncF128toI64S
: /* F128 -> I64S */
5465 case Iop_TruncF128toI64U
: /* F128 -> I64U */
5467 return mkPCastTo(mce
, Ity_I64
, vatom
);
5469 case Iop_CmpNEZ64x2
:
5470 case Iop_CipherSV128
:
5474 return mkPCast64x2(mce
, vatom
);
5476 // This is self-shadowing.
5477 case Iop_PwBitMtxXpose64x2
:
5478 return assignNew('V', mce
, Ity_V128
, unop(op
, vatom
));
5480 case Iop_NarrowUn16to8x8
:
5481 case Iop_NarrowUn32to16x4
:
5482 case Iop_NarrowUn64to32x2
:
5483 case Iop_QNarrowUn16Sto8Sx8
:
5484 case Iop_QNarrowUn16Sto8Ux8
:
5485 case Iop_QNarrowUn16Uto8Ux8
:
5486 case Iop_QNarrowUn32Sto16Sx4
:
5487 case Iop_QNarrowUn32Sto16Ux4
:
5488 case Iop_QNarrowUn32Uto16Ux4
:
5489 case Iop_QNarrowUn64Sto32Sx2
:
5490 case Iop_QNarrowUn64Sto32Ux2
:
5491 case Iop_QNarrowUn64Uto32Ux2
:
5492 return vectorNarrowUnV128(mce
, op
, vatom
);
5494 // JRS FIXME 2019 Mar 17: per comments on F16toF32x4, this is probably not
5496 case Iop_F32toF16x4_DEP
:
5497 return vectorNarrowUnV128(mce
, op
, vatom
);
5499 case Iop_Widen8Sto16x8
:
5500 case Iop_Widen8Uto16x8
:
5501 case Iop_Widen16Sto32x4
:
5502 case Iop_Widen16Uto32x4
:
5503 case Iop_Widen32Sto64x2
:
5504 case Iop_Widen32Uto64x2
:
5505 return vectorWidenI64(mce
, op
, vatom
);
5507 case Iop_F16toF32x4
:
5508 // JRS 2019 Mar 17: this definitely isn't right, but it probably works
5509 // OK by accident if -- as seems likely -- the F16 to F32 conversion
5510 // preserves will generate an output 32 bits with at least one 1 bit
5511 // set if there's one or more 1 bits set in the input 16 bits. More
5512 // correct code for this is just below, but commented out, so as to
5513 // avoid short-term backend failures on targets that can't do
5514 // Iop_Interleave{LO,HI}16x4.
5515 return vectorWidenI64(mce
, op
, vatom
);
5517 case Iop_F16toF32x8
: {
5518 // PCast the input at 16x8. This makes each lane hold either all
5519 // zeroes or all ones.
5520 IRAtom
* pcasted
= mkPCast16x8(mce
, vatom
); // :: I16x8
5521 // Now double the width of each lane to 32 bits. Because the lanes are
5522 // all zeroes or all ones, we can just copy the each lane twice into
5523 // the result. Here's the low half:
5524 IRAtom
* widenedLO
// :: I32x4
5525 = assignNew('V', mce
, Ity_V128
, binop(Iop_InterleaveLO16x8
,
5527 // And the high half:
5528 IRAtom
* widenedHI
// :: I32x4
5529 = assignNew('V', mce
, Ity_V128
, binop(Iop_InterleaveHI16x8
,
5531 // Glue them back together:
5532 return assignNew('V', mce
, Ity_V256
, binop(Iop_V128HLtoV256
,
5533 widenedHI
, widenedLO
));
5536 // See comment just above, for Iop_F16toF32x4
5537 //case Iop_F16toF32x4: {
5538 // // Same scheme as F16toF32x4
5539 // IRAtom* pcasted = mkPCast16x4(mce, vatom); // :: I16x4
5540 // IRAtom* widenedLO // :: I32x2
5541 // = assignNew('V', mce, Ity_I64, binop(Iop_InterleaveLO16x4,
5542 // pcasted, pcasted));
5543 // IRAtom* widenedHI // :: I32x4
5544 // = assignNew('V', mce, Ity_I64, binop(Iop_InterleaveHI16x4,
5545 // pcasted, pcasted));
5546 // // Glue them back together:
5547 // return assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128,
5548 // widenedHI, widenedLO));
5551 case Iop_PwAddL32Ux2
:
5552 case Iop_PwAddL32Sx2
:
5553 return mkPCastTo(mce
, Ity_I64
,
5554 assignNew('V', mce
, Ity_I64
, unop(op
, mkPCast32x2(mce
, vatom
))));
5556 case Iop_PwAddL16Ux4
:
5557 case Iop_PwAddL16Sx4
:
5558 return mkPCast32x2(mce
,
5559 assignNew('V', mce
, Ity_I64
, unop(op
, mkPCast16x4(mce
, vatom
))));
5561 case Iop_PwAddL8Ux8
:
5562 case Iop_PwAddL8Sx8
:
5563 return mkPCast16x4(mce
,
5564 assignNew('V', mce
, Ity_I64
, unop(op
, mkPCast8x8(mce
, vatom
))));
5566 case Iop_PwAddL32Ux4
:
5567 case Iop_PwAddL32Sx4
:
5568 return mkPCast64x2(mce
,
5569 assignNew('V', mce
, Ity_V128
, unop(op
, mkPCast32x4(mce
, vatom
))));
5571 case Iop_PwAddL64Ux2
:
5572 return mkPCast128x1(mce
,
5573 assignNew('V', mce
, Ity_V128
, unop(op
, mkPCast64x2(mce
, vatom
))));
5575 case Iop_PwAddL16Ux8
:
5576 case Iop_PwAddL16Sx8
:
5577 return mkPCast32x4(mce
,
5578 assignNew('V', mce
, Ity_V128
, unop(op
, mkPCast16x8(mce
, vatom
))));
5580 case Iop_PwAddL8Ux16
:
5581 case Iop_PwAddL8Sx16
:
5582 return mkPCast16x8(mce
,
5583 assignNew('V', mce
, Ity_V128
, unop(op
, mkPCast8x16(mce
, vatom
))));
5588 VG_(tool_panic
)("memcheck:expr2vbits_Unop");
5593 /* Worker function -- do not call directly. See comments on
5594 expr2vbits_Load for the meaning of |guard|.
5596 Generates IR to (1) perform a definedness test of |addr|, (2)
5597 perform a validity test of |addr|, and (3) return the Vbits for the
5598 location indicated by |addr|. All of this only happens when
5599 |guard| is NULL or |guard| evaluates to True at run time.
5601 If |guard| evaluates to False at run time, the returned value is
5602 the IR-mandated 0x55..55 value, and no checks nor shadow loads are
5605 The definedness of |guard| itself is not checked. That is assumed
5606 to have been done before this point, by the caller. */
5608 IRAtom
* expr2vbits_Load_WRK ( MCEnv
* mce
,
5609 IREndness end
, IRType ty
,
5610 IRAtom
* addr
, UInt bias
, IRAtom
* guard
)
5612 tl_assert(isOriginalAtom(mce
,addr
));
5613 tl_assert(end
== Iend_LE
|| end
== Iend_BE
);
5615 /* First, emit a definedness test for the address. This also sets
5616 the address (shadow) to 'defined' following the test. */
5617 complainIfUndefined( mce
, addr
, guard
);
5619 /* Now cook up a call to the relevant helper function, to read the data V
5620 bits from shadow memory. Note that I128 loads are done by pretending
5621 we're doing a V128 load, and then converting the resulting V128 vbits
5622 word to an I128, right at the end of this function -- see `castedToI128`
5623 below. (It's only a minor hack :-) This pertains to bug 444399. */
5624 ty
= shadowTypeV(ty
);
5626 void* helper
= NULL
;
5627 const HChar
* hname
= NULL
;
5628 Bool ret_via_outparam
= False
;
5630 if (end
== Iend_LE
) {
5632 case Ity_V256
: helper
= &MC_(helperc_LOADV256le
);
5633 hname
= "MC_(helperc_LOADV256le)";
5634 ret_via_outparam
= True
;
5636 case Ity_I128
: // fallthrough. See comment above.
5637 case Ity_V128
: helper
= &MC_(helperc_LOADV128le
);
5638 hname
= "MC_(helperc_LOADV128le)";
5639 ret_via_outparam
= True
;
5641 case Ity_I64
: helper
= &MC_(helperc_LOADV64le
);
5642 hname
= "MC_(helperc_LOADV64le)";
5644 case Ity_I32
: helper
= &MC_(helperc_LOADV32le
);
5645 hname
= "MC_(helperc_LOADV32le)";
5647 case Ity_I16
: helper
= &MC_(helperc_LOADV16le
);
5648 hname
= "MC_(helperc_LOADV16le)";
5650 case Ity_I8
: helper
= &MC_(helperc_LOADV8
);
5651 hname
= "MC_(helperc_LOADV8)";
5653 default: ppIRType(ty
);
5654 VG_(tool_panic
)("memcheck:expr2vbits_Load_WRK(LE)");
5658 case Ity_V256
: helper
= &MC_(helperc_LOADV256be
);
5659 hname
= "MC_(helperc_LOADV256be)";
5660 ret_via_outparam
= True
;
5662 case Ity_V128
: helper
= &MC_(helperc_LOADV128be
);
5663 hname
= "MC_(helperc_LOADV128be)";
5664 ret_via_outparam
= True
;
5666 case Ity_I64
: helper
= &MC_(helperc_LOADV64be
);
5667 hname
= "MC_(helperc_LOADV64be)";
5669 case Ity_I32
: helper
= &MC_(helperc_LOADV32be
);
5670 hname
= "MC_(helperc_LOADV32be)";
5672 case Ity_I16
: helper
= &MC_(helperc_LOADV16be
);
5673 hname
= "MC_(helperc_LOADV16be)";
5675 case Ity_I8
: helper
= &MC_(helperc_LOADV8
);
5676 hname
= "MC_(helperc_LOADV8)";
5678 default: ppIRType(ty
);
5679 VG_(tool_panic
)("memcheck:expr2vbits_Load_WRK(BE)");
5686 /* Generate the actual address into addrAct. */
5693 IRType tyAddr
= mce
->hWordTy
;
5694 tl_assert( tyAddr
== Ity_I32
|| tyAddr
== Ity_I64
);
5695 mkAdd
= tyAddr
==Ity_I32
? Iop_Add32
: Iop_Add64
;
5696 eBias
= tyAddr
==Ity_I32
? mkU32(bias
) : mkU64(bias
);
5697 addrAct
= assignNew('V', mce
, tyAddr
, binop(mkAdd
, addr
, eBias
) );
5700 /* We need to have a place to park the V bits we're just about to
5702 IRTemp datavbits
= newTemp(mce
, ty
== Ity_I128
? Ity_V128
: ty
, VSh
);
5704 /* Here's the call. */
5706 if (ret_via_outparam
) {
5707 di
= unsafeIRDirty_1_N( datavbits
,
5709 hname
, VG_(fnptr_to_fnentry
)( helper
),
5710 mkIRExprVec_2( IRExpr_VECRET(), addrAct
) );
5712 di
= unsafeIRDirty_1_N( datavbits
,
5714 hname
, VG_(fnptr_to_fnentry
)( helper
),
5715 mkIRExprVec_1( addrAct
) );
5718 setHelperAnns( mce
, di
);
5721 /* Ideally the didn't-happen return value here would be all-ones
5722 (all-undefined), so it'd be obvious if it got used
5723 inadvertently. We can get by with the IR-mandated default
5724 value (0b01 repeating, 0x55 etc) as that'll still look pretty
5725 undefined if it ever leaks out. */
5727 stmt( 'V', mce
, IRStmt_Dirty(di
) );
5729 if (ty
== Ity_I128
) {
5730 IRAtom
* castedToI128
5731 = assignNew('V', mce
, Ity_I128
,
5732 unop(Iop_ReinterpV128asI128
, mkexpr(datavbits
)));
5733 return castedToI128
;
5735 return mkexpr(datavbits
);
5740 /* Generate IR to do a shadow load. The helper is expected to check
5741 the validity of the address and return the V bits for that address.
5742 This can optionally be controlled by a guard, which is assumed to
5743 be True if NULL. In the case where the guard is False at runtime,
5744 the helper will return the didn't-do-the-call value of 0x55..55.
5745 Since that means "completely undefined result", the caller of
5746 this function will need to fix up the result somehow in that
5749 Caller of this function is also expected to have checked the
5750 definedness of |guard| before this point.
5753 IRAtom
* expr2vbits_Load ( MCEnv
* mce
,
5754 IREndness end
, IRType ty
,
5755 IRAtom
* addr
, UInt bias
,
5758 tl_assert(end
== Iend_LE
|| end
== Iend_BE
);
5759 switch (shadowTypeV(ty
)) {
5767 return expr2vbits_Load_WRK(mce
, end
, ty
, addr
, bias
, guard
);
5769 VG_(tool_panic
)("expr2vbits_Load");
5774 /* The most general handler for guarded loads. Assumes the
5775 definedness of GUARD has already been checked by the caller. A
5776 GUARD of NULL is assumed to mean "always True". Generates code to
5777 check the definedness and validity of ADDR.
5779 Generate IR to do a shadow load from ADDR and return the V bits.
5780 The loaded type is TY. The loaded data is then (shadow) widened by
5781 using VWIDEN, which can be Iop_INVALID to denote a no-op. If GUARD
5782 evaluates to False at run time then the returned Vbits are simply
5783 VALT instead. Note therefore that the argument type of VWIDEN must
5784 be TY and the result type of VWIDEN must equal the type of VALT.
5787 IRAtom
* expr2vbits_Load_guarded_General ( MCEnv
* mce
,
5788 IREndness end
, IRType ty
,
5789 IRAtom
* addr
, UInt bias
,
5791 IROp vwiden
, IRAtom
* valt
)
5793 /* Sanity check the conversion operation, and also set TYWIDE. */
5794 IRType tyWide
= Ity_INVALID
;
5799 case Iop_16Uto32
: case Iop_16Sto32
: case Iop_8Uto32
: case Iop_8Sto32
:
5803 VG_(tool_panic
)("memcheck:expr2vbits_Load_guarded_General");
5806 /* If the guard evaluates to True, this will hold the loaded V bits
5807 at TY. If the guard evaluates to False, this will be all
5808 ones, meaning "all undefined", in which case we will have to
5809 replace it using an ITE below. */
5811 = assignNew('V', mce
, ty
,
5812 expr2vbits_Load(mce
, end
, ty
, addr
, bias
, guard
));
5813 /* Now (shadow-) widen the loaded V bits to the desired width. In
5814 the guard-is-False case, the allowable widening operators will
5815 in the worst case (unsigned widening) at least leave the
5816 pre-widened part as being marked all-undefined, and in the best
5817 case (signed widening) mark the whole widened result as
5818 undefined. Anyway, it doesn't matter really, since in this case
5819 we will replace said value with the default value |valt| using an
5822 = vwiden
== Iop_INVALID
5824 : assignNew('V', mce
, tyWide
, unop(vwiden
, iftrue1
));
5825 /* These are the V bits we will return if the load doesn't take
5829 /* Prepare the cond for the ITE. Convert a NULL cond into
5830 something that iropt knows how to fold out later. */
5832 = guard
== NULL
? mkU1(1) : guard
;
5833 /* And assemble the final result. */
5834 return assignNew('V', mce
, tyWide
, IRExpr_ITE(cond
, iftrue2
, iffalse
));
5838 /* A simpler handler for guarded loads, in which there is no
5839 conversion operation, and the default V bit return (when the guard
5840 evaluates to False at runtime) is "all defined". If there is no
5841 guard expression or the guard is always TRUE this function behaves
5842 like expr2vbits_Load. It is assumed that definedness of GUARD has
5843 already been checked at the call site. */
5845 IRAtom
* expr2vbits_Load_guarded_Simple ( MCEnv
* mce
,
5846 IREndness end
, IRType ty
,
5847 IRAtom
* addr
, UInt bias
,
5850 return expr2vbits_Load_guarded_General(
5851 mce
, end
, ty
, addr
, bias
, guard
, Iop_INVALID
, definedOfType(ty
)
5857 IRAtom
* expr2vbits_ITE ( MCEnv
* mce
,
5858 IRAtom
* cond
, IRAtom
* iftrue
, IRAtom
* iffalse
)
5860 IRAtom
*vbitsC
, *vbits0
, *vbits1
;
5862 /* Given ITE(cond, iftrue, iffalse), generate
5863 ITE(cond, iftrue#, iffalse#) `UifU` PCast(cond#)
5864 That is, steer the V bits like the originals, but trash the
5865 result if the steering value is undefined. This gives
5866 lazy propagation. */
5867 tl_assert(isOriginalAtom(mce
, cond
));
5868 tl_assert(isOriginalAtom(mce
, iftrue
));
5869 tl_assert(isOriginalAtom(mce
, iffalse
));
5871 vbitsC
= expr2vbits(mce
, cond
, HuOth
); // could we use HuPCa here?
5872 vbits1
= expr2vbits(mce
, iftrue
, HuOth
);
5873 vbits0
= expr2vbits(mce
, iffalse
, HuOth
);
5874 ty
= typeOfIRExpr(mce
->sb
->tyenv
, vbits0
);
5877 mkUifU(mce
, ty
, assignNew('V', mce
, ty
,
5878 IRExpr_ITE(cond
, vbits1
, vbits0
)),
5879 mkPCastTo(mce
, ty
, vbitsC
) );
5882 /* --------- This is the main expression-handling function. --------- */
5885 IRExpr
* expr2vbits ( MCEnv
* mce
, IRExpr
* e
,
5886 HowUsed hu
/*use HuOth if unknown*/ )
5891 return shadow_GET( mce
, e
->Iex
.Get
.offset
, e
->Iex
.Get
.ty
);
5894 return shadow_GETI( mce
, e
->Iex
.GetI
.descr
,
5895 e
->Iex
.GetI
.ix
, e
->Iex
.GetI
.bias
);
5898 return IRExpr_RdTmp( findShadowTmpV(mce
, e
->Iex
.RdTmp
.tmp
) );
5901 return definedOfType(shadowTypeV(typeOfIRExpr(mce
->sb
->tyenv
, e
)));
5904 return expr2vbits_Qop(
5906 e
->Iex
.Qop
.details
->op
,
5907 e
->Iex
.Qop
.details
->arg1
, e
->Iex
.Qop
.details
->arg2
,
5908 e
->Iex
.Qop
.details
->arg3
, e
->Iex
.Qop
.details
->arg4
5912 return expr2vbits_Triop(
5914 e
->Iex
.Triop
.details
->op
,
5915 e
->Iex
.Triop
.details
->arg1
, e
->Iex
.Triop
.details
->arg2
,
5916 e
->Iex
.Triop
.details
->arg3
5920 return expr2vbits_Binop(
5923 e
->Iex
.Binop
.arg1
, e
->Iex
.Binop
.arg2
,
5928 return expr2vbits_Unop( mce
, e
->Iex
.Unop
.op
, e
->Iex
.Unop
.arg
);
5931 return expr2vbits_Load( mce
, e
->Iex
.Load
.end
,
5933 e
->Iex
.Load
.addr
, 0/*addr bias*/,
5934 NULL
/* guard == "always True"*/ );
5937 return mkLazyN( mce
, e
->Iex
.CCall
.args
,
5942 return expr2vbits_ITE( mce
, e
->Iex
.ITE
.cond
, e
->Iex
.ITE
.iftrue
,
5943 e
->Iex
.ITE
.iffalse
);
5949 VG_(tool_panic
)("memcheck: expr2vbits");
5954 /*------------------------------------------------------------*/
5955 /*--- Generate shadow stmts from all kinds of IRStmts. ---*/
5956 /*------------------------------------------------------------*/
5958 /* Widen a value to the host word size. */
5961 IRExpr
* zwidenToHostWord ( MCEnv
* mce
, IRAtom
* vatom
)
5965 /* vatom is vbits-value and as such can only have a shadow type. */
5966 tl_assert(isShadowAtom(mce
,vatom
));
5968 ty
= typeOfIRExpr(mce
->sb
->tyenv
, vatom
);
5971 if (tyH
== Ity_I32
) {
5976 return assignNew('V', mce
, tyH
, unop(Iop_16Uto32
, vatom
));
5978 return assignNew('V', mce
, tyH
, unop(Iop_8Uto32
, vatom
));
5983 if (tyH
== Ity_I64
) {
5986 return assignNew('V', mce
, tyH
, unop(Iop_32Uto64
, vatom
));
5988 return assignNew('V', mce
, tyH
, unop(Iop_32Uto64
,
5989 assignNew('V', mce
, Ity_I32
, unop(Iop_16Uto32
, vatom
))));
5991 return assignNew('V', mce
, tyH
, unop(Iop_32Uto64
,
5992 assignNew('V', mce
, Ity_I32
, unop(Iop_8Uto32
, vatom
))));
6000 VG_(printf
)("\nty = "); ppIRType(ty
); VG_(printf
)("\n");
6001 VG_(tool_panic
)("zwidenToHostWord");
6005 /* Generate a shadow store. |addr| is always the original address
6006 atom. You can pass in either originals or V-bits for the data
6007 atom, but obviously not both. This function generates a check for
6008 the definedness and (indirectly) the validity of |addr|, but only
6009 when |guard| evaluates to True at run time (or is NULL).
6011 |guard| :: Ity_I1 controls whether the store really happens; NULL
6012 means it unconditionally does. Note that |guard| itself is not
6013 checked for definedness; the caller of this function must do that
6017 void do_shadow_Store ( MCEnv
* mce
,
6019 IRAtom
* addr
, UInt bias
,
6020 IRAtom
* data
, IRAtom
* vdata
,
6025 void* helper
= NULL
;
6026 const HChar
* hname
= NULL
;
6029 tyAddr
= mce
->hWordTy
;
6030 mkAdd
= tyAddr
==Ity_I32
? Iop_Add32
: Iop_Add64
;
6031 tl_assert( tyAddr
== Ity_I32
|| tyAddr
== Ity_I64
);
6032 tl_assert( end
== Iend_LE
|| end
== Iend_BE
);
6036 tl_assert(isOriginalAtom(mce
, data
));
6037 tl_assert(bias
== 0);
6038 vdata
= expr2vbits( mce
, data
, HuOth
);
6043 tl_assert(isOriginalAtom(mce
,addr
));
6044 tl_assert(isShadowAtom(mce
,vdata
));
6047 tl_assert(isOriginalAtom(mce
, guard
));
6048 tl_assert(typeOfIRExpr(mce
->sb
->tyenv
, guard
) == Ity_I1
);
6051 ty
= typeOfIRExpr(mce
->sb
->tyenv
, vdata
);
6053 // If we're not doing undefined value checking, pretend that this value
6054 // is "all valid". That lets Vex's optimiser remove some of the V bit
6055 // shadow computation ops that precede it.
6056 if (MC_(clo_mc_level
) == 1) {
6058 case Ity_V256
: // V256 weirdness -- used four times
6059 c
= IRConst_V256(V_BITS32_DEFINED
); break;
6060 case Ity_V128
: // V128 weirdness -- used twice
6061 c
= IRConst_V128(V_BITS16_DEFINED
); break;
6062 case Ity_I128
: c
= IRConst_U128(V_BITS16_DEFINED
); break;
6063 case Ity_I64
: c
= IRConst_U64 (V_BITS64_DEFINED
); break;
6064 case Ity_I32
: c
= IRConst_U32 (V_BITS32_DEFINED
); break;
6065 case Ity_I16
: c
= IRConst_U16 (V_BITS16_DEFINED
); break;
6066 case Ity_I8
: c
= IRConst_U8 (V_BITS8_DEFINED
); break;
6067 default: VG_(tool_panic
)("memcheck:do_shadow_Store(LE)");
6069 vdata
= IRExpr_Const( c
);
   /* First, emit a definedness test for the address.  This also sets
      the address (shadow) to 'defined' following the test.  Both of
      those actions are gated on |guard|. */
   complainIfUndefined( mce, addr, guard );

   /* Now decide which helper function to call to write the data V
      bits into shadow memory. */
   if (end == Iend_LE) {
      switch (ty) {
         case Ity_V256: /* we'll use the helper four times */
         case Ity_V128: /* we'll use the helper twice */
         case Ity_I128: /* we'll use the helper twice */
         case Ity_I64: helper = &MC_(helperc_STOREV64le);
                       hname = "MC_(helperc_STOREV64le)";
                       break;
         case Ity_I32: helper = &MC_(helperc_STOREV32le);
                       hname = "MC_(helperc_STOREV32le)";
                       break;
         case Ity_I16: helper = &MC_(helperc_STOREV16le);
                       hname = "MC_(helperc_STOREV16le)";
                       break;
         case Ity_I8:  helper = &MC_(helperc_STOREV8);
                       hname = "MC_(helperc_STOREV8)";
                       break;
         default:      VG_(tool_panic)("memcheck:do_shadow_Store(LE)");
      }
   } else {
      switch (ty) {
         case Ity_V128: /* we'll use the helper twice */
         case Ity_I64: helper = &MC_(helperc_STOREV64be);
                       hname = "MC_(helperc_STOREV64be)";
                       break;
         case Ity_I32: helper = &MC_(helperc_STOREV32be);
                       hname = "MC_(helperc_STOREV32be)";
                       break;
         case Ity_I16: helper = &MC_(helperc_STOREV16be);
                       hname = "MC_(helperc_STOREV16be)";
                       break;
         case Ity_I8:  helper = &MC_(helperc_STOREV8);
                       hname = "MC_(helperc_STOREV8)";
                       break;
         /* Note, no V256 case here, because no big-endian target
            that we support has 256-bit vectors. */
         default:      VG_(tool_panic)("memcheck:do_shadow_Store(BE)");
      }
   }
   if (UNLIKELY(ty == Ity_V256)) {

      /* V256-bit case -- phrased in terms of 64 bit units (Qs), with
         Q3 being the most significant lane. */
      /* These are the offsets of the Qs in memory. */
      Int    offQ0, offQ1, offQ2, offQ3;

      /* Various bits for constructing the 4 lane helper calls */
      IRDirty *diQ0, *diQ1, *diQ2, *diQ3;
      IRAtom  *addrQ0, *addrQ1, *addrQ2, *addrQ3;
      IRAtom  *vdataQ0, *vdataQ1, *vdataQ2, *vdataQ3;
      IRAtom  *eBiasQ0, *eBiasQ1, *eBiasQ2, *eBiasQ3;

      if (end == Iend_LE) {
         offQ0 = 0; offQ1 = 8; offQ2 = 16; offQ3 = 24;
      } else {
         offQ3 = 0; offQ2 = 8; offQ1 = 16; offQ0 = 24;
      }

      eBiasQ0 = tyAddr==Ity_I32 ? mkU32(bias+offQ0) : mkU64(bias+offQ0);
      addrQ0  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ0) );
      vdataQ0 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_0, vdata));
      diQ0    = unsafeIRDirty_0_N(
                   1/*regparms*/,
                   hname, VG_(fnptr_to_fnentry)( helper ),
                   mkIRExprVec_2( addrQ0, vdataQ0 )
                );

      eBiasQ1 = tyAddr==Ity_I32 ? mkU32(bias+offQ1) : mkU64(bias+offQ1);
      addrQ1  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ1) );
      vdataQ1 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_1, vdata));
      diQ1    = unsafeIRDirty_0_N(
                   1/*regparms*/,
                   hname, VG_(fnptr_to_fnentry)( helper ),
                   mkIRExprVec_2( addrQ1, vdataQ1 )
                );

      eBiasQ2 = tyAddr==Ity_I32 ? mkU32(bias+offQ2) : mkU64(bias+offQ2);
      addrQ2  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ2) );
      vdataQ2 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_2, vdata));
      diQ2    = unsafeIRDirty_0_N(
                   1/*regparms*/,
                   hname, VG_(fnptr_to_fnentry)( helper ),
                   mkIRExprVec_2( addrQ2, vdataQ2 )
                );

      eBiasQ3 = tyAddr==Ity_I32 ? mkU32(bias+offQ3) : mkU64(bias+offQ3);
      addrQ3  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ3) );
      vdataQ3 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_3, vdata));
      diQ3    = unsafeIRDirty_0_N(
                   1/*regparms*/,
                   hname, VG_(fnptr_to_fnentry)( helper ),
                   mkIRExprVec_2( addrQ3, vdataQ3 )
                );

      if (guard)
         diQ0->guard = diQ1->guard = diQ2->guard = diQ3->guard = guard;

      setHelperAnns( mce, diQ0 );
      setHelperAnns( mce, diQ1 );
      setHelperAnns( mce, diQ2 );
      setHelperAnns( mce, diQ3 );
      stmt( 'V', mce, IRStmt_Dirty(diQ0) );
      stmt( 'V', mce, IRStmt_Dirty(diQ1) );
      stmt( 'V', mce, IRStmt_Dirty(diQ2) );
      stmt( 'V', mce, IRStmt_Dirty(diQ3) );

   }
   else if (UNLIKELY(ty == Ity_V128 || ty == Ity_I128)) {

      /* V128/I128-bit case */
      /* See comment in next clause re 64-bit regparms */
      /* also, need to be careful about endianness */

      Int     offLo64, offHi64;
      IRDirty *diLo64, *diHi64;
      IRAtom  *addrLo64, *addrHi64;
      IRAtom  *vdataLo64, *vdataHi64;
      IRAtom  *eBiasLo64, *eBiasHi64;
      IROp    opGetLO64,  opGetHI64;

      if (end == Iend_LE) {
         offLo64 = 0;
         offHi64 = 8;
      } else {
         offLo64 = 8;
         offHi64 = 0;
      }

      if (ty == Ity_V128) {
         opGetLO64 = Iop_V128to64;
         opGetHI64 = Iop_V128HIto64;
      } else {
         opGetLO64 = Iop_128to64;
         opGetHI64 = Iop_128HIto64;
      }

      eBiasLo64 = tyAddr==Ity_I32 ? mkU32(bias+offLo64) : mkU64(bias+offLo64);
      addrLo64  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasLo64) );
      vdataLo64 = assignNew('V', mce, Ity_I64, unop(opGetLO64, vdata));
      diLo64    = unsafeIRDirty_0_N(
                     1/*regparms*/,
                     hname, VG_(fnptr_to_fnentry)( helper ),
                     mkIRExprVec_2( addrLo64, vdataLo64 )
                  );
      eBiasHi64 = tyAddr==Ity_I32 ? mkU32(bias+offHi64) : mkU64(bias+offHi64);
      addrHi64  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasHi64) );
      vdataHi64 = assignNew('V', mce, Ity_I64, unop(opGetHI64, vdata));
      diHi64    = unsafeIRDirty_0_N(
                     1/*regparms*/,
                     hname, VG_(fnptr_to_fnentry)( helper ),
                     mkIRExprVec_2( addrHi64, vdataHi64 )
                  );
      if (guard) diLo64->guard = guard;
      if (guard) diHi64->guard = guard;
      setHelperAnns( mce, diLo64 );
      setHelperAnns( mce, diHi64 );
      stmt( 'V', mce, IRStmt_Dirty(diLo64) );
      stmt( 'V', mce, IRStmt_Dirty(diHi64) );

   }
   else {

      /* 8/16/32/64-bit cases */
      /* Generate the actual address into addrAct. */
      IRAtom*  addrAct;
      IRDirty* di;
      IRAtom*  eBias = tyAddr==Ity_I32 ? mkU32(bias) : mkU64(bias);
      addrAct = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBias));

      if (ty == Ity_I64) {
         /* We can't do this with regparm 2 on 32-bit platforms, since
            the back ends aren't clever enough to handle 64-bit
            regparm args.  Therefore be different. */
         di = unsafeIRDirty_0_N(
                 1/*regparms*/,
                 hname, VG_(fnptr_to_fnentry)( helper ),
                 mkIRExprVec_2( addrAct, vdata )
              );
      } else {
         di = unsafeIRDirty_0_N(
                 2/*regparms*/,
                 hname, VG_(fnptr_to_fnentry)( helper ),
                 mkIRExprVec_2( addrAct,
                                zwidenToHostWord( mce, vdata ))
              );
      }
      if (guard) di->guard = guard;
      setHelperAnns( mce, di );
      stmt( 'V', mce, IRStmt_Dirty(di) );
   }
}
/* Do lazy pessimistic propagation through a dirty helper call, by
   looking at the annotations on it.  This is the most complex part of
   Memcheck. */

static IRType szToITy ( Int n )
{
   switch (n) {
      case 1: return Ity_I8;
      case 2: return Ity_I16;
      case 4: return Ity_I32;
      case 8: return Ity_I64;
      default: VG_(tool_panic)("szToITy(memcheck)");
   }
}

static
void do_shadow_Dirty ( MCEnv* mce, IRDirty* d )
{
   Int       i, k, n, toDo, gSz, gOff;
   IRAtom    *src, *here, *curr;
   IRType    tySrc, tyDst;
   IRTemp    dst;
   IREndness end;

   /* What's the native endianness?  We need to know this. */
#  if defined(VG_BIGENDIAN)
   end = Iend_BE;
#  elif defined(VG_LITTLEENDIAN)
   end = Iend_LE;
#  else
#    error "Unknown endianness"
#  endif

   /* First check the guard. */
   complainIfUndefined(mce, d->guard, NULL);

   /* Now round up all inputs and PCast over them. */
   curr = definedOfType(Ity_I32);
6317 /* Inputs: unmasked args
6318 Note: arguments are evaluated REGARDLESS of the guard expression */
6319 for (i
= 0; d
->args
[i
]; i
++) {
6320 IRAtom
* arg
= d
->args
[i
];
6321 if ( (d
->cee
->mcx_mask
& (1<<i
))
6322 || UNLIKELY(is_IRExpr_VECRET_or_GSPTR(arg
)) ) {
6323 /* ignore this arg */
6325 here
= mkPCastTo( mce
, Ity_I32
, expr2vbits(mce
, arg
, HuOth
) );
6326 curr
= mkUifU32(mce
, here
, curr
);
6330 /* Inputs: guest state that we read. */
6331 for (i
= 0; i
< d
->nFxState
; i
++) {
6332 tl_assert(d
->fxState
[i
].fx
!= Ifx_None
);
6333 if (d
->fxState
[i
].fx
== Ifx_Write
)
6336 /* Enumerate the described state segments */
6337 for (k
= 0; k
< 1 + d
->fxState
[i
].nRepeats
; k
++) {
6338 gOff
= d
->fxState
[i
].offset
+ k
* d
->fxState
[i
].repeatLen
;
6339 gSz
= d
->fxState
[i
].size
;
6341 /* Ignore any sections marked as 'always defined'. */
6342 if (isAlwaysDefd(mce
, gOff
, gSz
)) {
6344 VG_(printf
)("memcheck: Dirty gst: ignored off %d, sz %d\n",
6349 /* This state element is read or modified. So we need to
6350 consider it. If larger than 8 bytes, deal with it in
6353 tl_assert(gSz
>= 0);
6354 if (gSz
== 0) break;
6355 n
= gSz
<= 8 ? gSz
: 8;
6356 /* update 'curr' with UifU of the state slice
6358 tySrc
= szToITy( n
);
6360 /* Observe the guard expression. If it is false use an
6361 all-bits-defined bit pattern */
6362 IRAtom
*cond
, *iffalse
, *iftrue
;
6364 cond
= assignNew('V', mce
, Ity_I1
, d
->guard
);
6365 iftrue
= assignNew('V', mce
, tySrc
, shadow_GET(mce
, gOff
, tySrc
));
6366 iffalse
= assignNew('V', mce
, tySrc
, definedOfType(tySrc
));
6367 src
= assignNew('V', mce
, tySrc
,
6368 IRExpr_ITE(cond
, iftrue
, iffalse
));
6370 here
= mkPCastTo( mce
, Ity_I32
, src
);
6371 curr
= mkUifU32(mce
, here
, curr
);
6378 /* Inputs: memory. First set up some info needed regardless of
6379 whether we're doing reads or writes. */
6381 if (d
->mFx
!= Ifx_None
) {
6382 /* Because we may do multiple shadow loads/stores from the same
6383 base address, it's best to do a single test of its
6384 definedness right now. Post-instrumentation optimisation
6385 should remove all but this test. */
6387 tl_assert(d
->mAddr
);
6388 complainIfUndefined(mce
, d
->mAddr
, d
->guard
);
6390 tyAddr
= typeOfIRExpr(mce
->sb
->tyenv
, d
->mAddr
);
6391 tl_assert(tyAddr
== Ity_I32
|| tyAddr
== Ity_I64
);
6392 tl_assert(tyAddr
== mce
->hWordTy
); /* not really right */
6395 /* Deal with memory inputs (reads or modifies) */
6396 if (d
->mFx
== Ifx_Read
|| d
->mFx
== Ifx_Modify
) {
6398 /* chew off 32-bit chunks. We don't care about the endianness
6399 since it's all going to be condensed down to a single bit,
6400 but nevertheless choose an endianness which is hopefully
6401 native to the platform. */
6405 expr2vbits_Load_guarded_Simple(
6406 mce
, end
, Ity_I32
, d
->mAddr
, d
->mSize
- toDo
, d
->guard
)
6408 curr
= mkUifU32(mce
, here
, curr
);
6411 /* chew off 16-bit chunks */
6415 expr2vbits_Load_guarded_Simple(
6416 mce
, end
, Ity_I16
, d
->mAddr
, d
->mSize
- toDo
, d
->guard
)
6418 curr
= mkUifU32(mce
, here
, curr
);
6421 /* chew off the remaining 8-bit chunk, if any */
6425 expr2vbits_Load_guarded_Simple(
6426 mce
, end
, Ity_I8
, d
->mAddr
, d
->mSize
- toDo
, d
->guard
)
6428 curr
= mkUifU32(mce
, here
, curr
);
6431 tl_assert(toDo
== 0);
6434 /* Whew! So curr is a 32-bit V-value summarising pessimistically
6435 all the inputs to the helper. Now we need to re-distribute the
6436 results to all destinations. */
6438 /* Outputs: the destination temporary, if there is one. */
6439 if (d
->tmp
!= IRTemp_INVALID
) {
6440 dst
= findShadowTmpV(mce
, d
->tmp
);
6441 tyDst
= typeOfIRTemp(mce
->sb
->tyenv
, d
->tmp
);
6442 assign( 'V', mce
, dst
, mkPCastTo( mce
, tyDst
, curr
) );
6445 /* Outputs: guest state that we write or modify. */
6446 for (i
= 0; i
< d
->nFxState
; i
++) {
6447 tl_assert(d
->fxState
[i
].fx
!= Ifx_None
);
6448 if (d
->fxState
[i
].fx
== Ifx_Read
)
6451 /* Enumerate the described state segments */
6452 for (k
= 0; k
< 1 + d
->fxState
[i
].nRepeats
; k
++) {
6453 gOff
= d
->fxState
[i
].offset
+ k
* d
->fxState
[i
].repeatLen
;
6454 gSz
= d
->fxState
[i
].size
;
6456 /* Ignore any sections marked as 'always defined'. */
6457 if (isAlwaysDefd(mce
, gOff
, gSz
))
6460 /* This state element is written or modified. So we need to
6461 consider it. If larger than 8 bytes, deal with it in
6464 tl_assert(gSz
>= 0);
6465 if (gSz
== 0) break;
6466 n
= gSz
<= 8 ? gSz
: 8;
6467 /* Write suitably-casted 'curr' to the state slice
6469 tyDst
= szToITy( n
);
6470 do_shadow_PUT( mce
, gOff
,
6471 NULL
, /* original atom */
6472 mkPCastTo( mce
, tyDst
, curr
), d
->guard
);
6479 /* Outputs: memory that we write or modify. Same comments about
6480 endianness as above apply. */
6481 if (d
->mFx
== Ifx_Write
|| d
->mFx
== Ifx_Modify
) {
6483 /* chew off 32-bit chunks */
6485 do_shadow_Store( mce
, end
, d
->mAddr
, d
->mSize
- toDo
,
6486 NULL
, /* original data */
6487 mkPCastTo( mce
, Ity_I32
, curr
),
6491 /* chew off 16-bit chunks */
6493 do_shadow_Store( mce
, end
, d
->mAddr
, d
->mSize
- toDo
,
6494 NULL
, /* original data */
6495 mkPCastTo( mce
, Ity_I16
, curr
),
6499 /* chew off the remaining 8-bit chunk, if any */
6501 do_shadow_Store( mce
, end
, d
->mAddr
, d
->mSize
- toDo
,
6502 NULL
, /* original data */
6503 mkPCastTo( mce
, Ity_I8
, curr
),
6507 tl_assert(toDo
== 0);
/* We have an ABI hint telling us that [base .. base+len-1] is to
   become undefined ("writable").  Generate code to call a helper to
   notify the A/V bit machinery of this fact.

   We call
   void MC_(helperc_MAKE_STACK_UNINIT) ( Addr base, UWord len,
                                         Addr nia );
*/
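/* Concretely (sketch): with origin tracking disabled, the common
   amd64-ELF case len == 128 is emitted as a single dirty call
   MC_(helperc_MAKE_STACK_UNINIT_128_no_o)(base); other lengths use
   MC_(helperc_MAKE_STACK_UNINIT_no_o)(base, len); and with
   MC_(clo_mc_level) == 3 the call is
   MC_(helperc_MAKE_STACK_UNINIT_w_o)(base, len, nia), exactly as
   generated by do_AbiHint below. */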
static
void do_AbiHint ( MCEnv* mce, IRExpr* base, Int len, IRExpr* nia )
{
   IRDirty* di;

   if (MC_(clo_mc_level) == 3) {
      di = unsafeIRDirty_0_N(
              3/*regparms*/,
              "MC_(helperc_MAKE_STACK_UNINIT_w_o)",
              VG_(fnptr_to_fnentry)( &MC_(helperc_MAKE_STACK_UNINIT_w_o) ),
              mkIRExprVec_3( base, mkIRExpr_HWord( (UInt)len), nia )
           );
   } else {
      /* We ignore the supplied nia, since it is irrelevant. */
      tl_assert(MC_(clo_mc_level) == 2 || MC_(clo_mc_level) == 1);
      /* Special-case the len==128 case, since that is for amd64-ELF,
         which is a very common target. */
      if (len == 128) {
         di = unsafeIRDirty_0_N(
                 1/*regparms*/,
                 "MC_(helperc_MAKE_STACK_UNINIT_128_no_o)",
                 VG_(fnptr_to_fnentry)( &MC_(helperc_MAKE_STACK_UNINIT_128_no_o)),
                 mkIRExprVec_1( base )
              );
      } else {
         di = unsafeIRDirty_0_N(
                 2/*regparms*/,
                 "MC_(helperc_MAKE_STACK_UNINIT_no_o)",
                 VG_(fnptr_to_fnentry)( &MC_(helperc_MAKE_STACK_UNINIT_no_o) ),
                 mkIRExprVec_2( base, mkIRExpr_HWord( (UInt)len) )
              );
      }
   }

   stmt( 'V', mce, IRStmt_Dirty(di) );
}
/* ------ Dealing with IRCAS (big and complex) ------ */

static IRAtom* gen_load_b  ( MCEnv* mce, Int szB,
                             IRAtom* baseaddr, Int offset );
static IRAtom* gen_maxU32  ( MCEnv* mce, IRAtom* b1, IRAtom* b2 );
static void    gen_store_b ( MCEnv* mce, Int szB,
                             IRAtom* baseaddr, Int offset, IRAtom* dataB,
                             IRAtom* guard );

static void do_shadow_CAS_single ( MCEnv* mce, IRCAS* cas );
static void do_shadow_CAS_double ( MCEnv* mce, IRCAS* cas );
/* Either ORIG and SHADOW are both IRExpr.RdTmps, or they are both
   IRExpr.Consts, else this asserts.  If they are both Consts, it
   doesn't do anything.  So that just leaves the RdTmp case.

   In which case: this assigns the shadow value SHADOW to the IR
   shadow temporary associated with ORIG.  That is, ORIG, being an
   original temporary, will have a shadow temporary associated with
   it.  However, in the case envisaged here, there will so far have
   been no IR emitted to actually write a shadow value into that
   temporary.  What this routine does is to (emit IR to) copy the
   value in SHADOW into said temporary, so that after this call,
   IRExpr.RdTmps of ORIG's shadow temp will correctly pick up the
   value in SHADOW.

   Point is to allow callers to compute "by hand" a shadow value for
   ORIG, and force it to be associated with ORIG.

   How do we know that that shadow associated with ORIG has not so far
   been assigned to?  Well, we don't per se know that, but supposing
   it had.  Then this routine would create a second assignment to it,
   and later the IR sanity checker would barf.  But that never
   happens.  QED. */
static void bind_shadow_tmp_to_orig ( UChar how,
                                      MCEnv* mce,
                                      IRAtom* orig, IRAtom* shadow )
{
   tl_assert(isOriginalAtom(mce, orig));
   tl_assert(isShadowAtom(mce, shadow));
   switch (orig->tag) {
      case Iex_Const:
         tl_assert(shadow->tag == Iex_Const);
         break;
      case Iex_RdTmp:
         tl_assert(shadow->tag == Iex_RdTmp);
         if (how == 'V') {
            assign('V', mce, findShadowTmpV(mce,orig->Iex.RdTmp.tmp),
                   shadow);
         } else {
            tl_assert(how == 'B');
            assign('B', mce, findShadowTmpB(mce,orig->Iex.RdTmp.tmp),
                   shadow);
         }
         break;
      default:
         tl_assert(0);
   }
}
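/* For example, do_shadow_CAS_single below computes the shadow of the
   loaded old value by hand (voldLo) and then attaches it to the CAS's
   .oldLo temporary with

      bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldLo), voldLo);

   and, when origin tracking is enabled, does the same with 'B' for
   the origin shadow (boldLo). */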
static
void do_shadow_CAS ( MCEnv* mce, IRCAS* cas )
{
   /* Scheme is (both single- and double- cases):

      1. fetch data#,dataB (the proposed new value)

      2. fetch expd#,expdB (what we expect to see at the address)

      3. check definedness of address

      4. load old#,oldB from shadow memory; this also checks
         addressability of the address

      5. the CAS itself

      6. compute "expected == old".  See COMMENT_ON_CasCmpEQ below.

      7. if "expected == old" (as computed by (6))
            store data#,dataB to shadow memory

      Note that 5 reads 'old' but 4 reads 'old#'.  Similarly, 5 stores
      'data' but 7 stores 'data#'.  Hence it is possible for the
      shadow data to be incorrectly checked and/or updated:

      * 7 is at least gated correctly, since the 'expected == old'
        condition is derived from outputs of 5.  However, the shadow
        write could happen too late: imagine after 5 we are
        descheduled, a different thread runs, writes a different
        (shadow) value at the address, and then we resume, hence
        overwriting the shadow value written by the other thread.

      Because the original memory access is atomic, there's no way to
      make both the original and shadow accesses into a single atomic
      thing, hence this is unavoidable.

      At least as Valgrind stands, I don't think it's a problem, since
      we're single threaded *and* we guarantee that there are no
      context switches during the execution of any specific superblock
      -- context switches can only happen at superblock boundaries.

      If Valgrind ever becomes MT in the future, then it might be more
      of a problem.  A possible kludge would be to artificially
      associate with the location, a lock, which we must acquire and
      release around the transaction as a whole.  Hmm, that probably
      wouldn't work properly since it only guards us against other
      threads doing CASs on the same location, not against other
      threads doing normal reads and writes.

      ------------------------------------------------------------

      COMMENT_ON_CasCmpEQ:

      Note two things.  Firstly, in the sequence above, we compute
      "expected == old", but we don't check definedness of it.  Why
      not?  Also, the x86 and amd64 front ends use
      Iop_CasCmp{EQ,NE}{8,16,32,64} comparisons to make the equivalent
      determination (expected == old ?) for themselves, and we also
      don't check definedness for those primops; we just say that the
      result is defined.  Why?  Details follow.

      x86/amd64 contains various forms of locked insns:
      * lock prefix before all basic arithmetic insns;
        eg lock xorl %reg1,(%reg2)
      * atomic exchange reg-mem
      * compare-and-swaps

      Rather than attempt to represent them all, which would be a
      royal PITA, I used a result from Maurice Herlihy
      (http://en.wikipedia.org/wiki/Maurice_Herlihy), in which he
      demonstrates that compare-and-swap is a primitive more general
      than the other two, and so can be used to represent all of them.
      So the translation scheme for (eg) lock incl (%reg) is as
      follows:

        again:
         old = * %reg
         new = old + 1
         atomically { if (* %reg == old) { * %reg = new } else { goto again } }

      The "atomically" is the CAS bit.  The scheme is always the same:
      get old value from memory, compute new value, atomically stuff
      new value back in memory iff the old value has not changed (iow,
      no other thread modified it in the meantime).  If it has changed
      then we've been out-raced and we have to start over.

      Now that's all very neat, but it has the bad side effect of
      introducing an explicit equality test into the translation.
      Consider the behaviour of said code on a memory location which
      is uninitialised.  We will wind up doing a comparison on
      uninitialised data, and mc duly complains.

      What's difficult about this is, the common case is that the
      location is uncontended, and so we're usually comparing the same
      value (* %reg) with itself.  So we shouldn't complain even if it
      is undefined.  But mc doesn't know that.

      My solution is to mark the == in the IR specially, so as to tell
      mc that it almost certainly compares a value with itself, and we
      should just regard the result as always defined.  Rather than
      add a bit to all IROps, I just cloned Iop_CmpEQ{8,16,32,64} into
      Iop_CasCmpEQ{8,16,32,64} so as not to disturb anything else.

      So there's always the question of, can this give a false
      negative?  eg, imagine that initially, * %reg is defined; and we
      read that; but then in the gap between the read and the CAS, a
      different thread writes an undefined (and different) value at
      the location.  Then the CAS in this thread will fail and we will
      go back to "again:", but without knowing that the trip back
      there was based on an undefined comparison.  No matter; at least
      the other thread won the race and the location is correctly
      marked as undefined.  What if it wrote an uninitialised version
      of the same value that was there originally, though?

      etc etc.  Seems like there's a small corner case in which we
      might lose the fact that something's defined -- we're out-raced
      in between the "old = * reg" and the "atomically {", _and_ the
      other thread is writing in an undefined version of what's
      already there.  Well, that seems pretty unlikely.

      If we ever need to reinstate it .. code which generates a
      definedness test for "expected == old" was removed at r10432 of
      this file.
   */
   if (cas->oldHi == IRTemp_INVALID) {
      do_shadow_CAS_single( mce, cas );
   } else {
      do_shadow_CAS_double( mce, cas );
   }
}
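/* Sketch of the single-element case that follows, for a 32-bit CAS
   (the double-element case does the same per half and folds the two
   comparisons together via Xor/Or):

      vdataLo = expr2vbits(mce, cas->dataLo, HuOth);              // (1)
      vexpdLo = expr2vbits(mce, cas->expdLo, HuOth);              // (2)
      voldLo  = <shadow load from cas->addr>;                     // (3),(4)
      bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldLo), voldLo);
      stmt('C', mce, IRStmt_CAS(cas));                            // (5)
      expd_eq_old = CasCmpEQ32(cas->expdLo, mkexpr(cas->oldLo));  // (6)
      do_shadow_Store(mce, cas->end, cas->addr, 0/*bias*/,
                      NULL/*data*/, vdataLo/*vdata*/,
                      expd_eq_old/*guard*/);                      // (7)
*/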
6757 static void do_shadow_CAS_single ( MCEnv
* mce
, IRCAS
* cas
)
6759 IRAtom
*vdataLo
= NULL
, *bdataLo
= NULL
;
6760 IRAtom
*vexpdLo
= NULL
, *bexpdLo
= NULL
;
6761 IRAtom
*voldLo
= NULL
, *boldLo
= NULL
;
6762 IRAtom
*expd_eq_old
= NULL
;
6766 Bool otrak
= MC_(clo_mc_level
) >= 3; /* a shorthand */
6769 tl_assert(cas
->oldHi
== IRTemp_INVALID
);
6770 tl_assert(cas
->expdHi
== NULL
);
6771 tl_assert(cas
->dataHi
== NULL
);
6773 elemTy
= typeOfIRExpr(mce
->sb
->tyenv
, cas
->expdLo
);
6775 case Ity_I8
: elemSzB
= 1; opCasCmpEQ
= Iop_CasCmpEQ8
; break;
6776 case Ity_I16
: elemSzB
= 2; opCasCmpEQ
= Iop_CasCmpEQ16
; break;
6777 case Ity_I32
: elemSzB
= 4; opCasCmpEQ
= Iop_CasCmpEQ32
; break;
6778 case Ity_I64
: elemSzB
= 8; opCasCmpEQ
= Iop_CasCmpEQ64
; break;
6779 default: tl_assert(0); /* IR defn disallows any other types */
6782 /* 1. fetch data# (the proposed new value) */
6783 tl_assert(isOriginalAtom(mce
, cas
->dataLo
));
6785 = assignNew('V', mce
, elemTy
, expr2vbits(mce
, cas
->dataLo
, HuOth
));
6786 tl_assert(isShadowAtom(mce
, vdataLo
));
6789 = assignNew('B', mce
, Ity_I32
, schemeE(mce
, cas
->dataLo
));
6790 tl_assert(isShadowAtom(mce
, bdataLo
));
6793 /* 2. fetch expected# (what we expect to see at the address) */
6794 tl_assert(isOriginalAtom(mce
, cas
->expdLo
));
6796 = assignNew('V', mce
, elemTy
, expr2vbits(mce
, cas
->expdLo
, HuOth
));
6797 tl_assert(isShadowAtom(mce
, vexpdLo
));
6800 = assignNew('B', mce
, Ity_I32
, schemeE(mce
, cas
->expdLo
));
6801 tl_assert(isShadowAtom(mce
, bexpdLo
));
6804 /* 3. check definedness of address */
6805 /* 4. fetch old# from shadow memory; this also checks
6806 addressibility of the address */
6812 cas
->end
, elemTy
, cas
->addr
, 0/*Addr bias*/,
6813 NULL
/*always happens*/
6815 bind_shadow_tmp_to_orig('V', mce
, mkexpr(cas
->oldLo
), voldLo
);
6818 = assignNew('B', mce
, Ity_I32
,
6819 gen_load_b(mce
, elemSzB
, cas
->addr
, 0/*addr bias*/));
6820 bind_shadow_tmp_to_orig('B', mce
, mkexpr(cas
->oldLo
), boldLo
);
6823 /* 5. the CAS itself */
6824 stmt( 'C', mce
, IRStmt_CAS(cas
) );
6826 /* 6. compute "expected == old" */
6827 /* See COMMENT_ON_CasCmpEQ in this file background/rationale. */
6828 /* Note that 'C' is kinda faking it; it is indeed a non-shadow
6829 tree, but it's not copied from the input block. */
6831 = assignNew('C', mce
, Ity_I1
,
6832 binop(opCasCmpEQ
, cas
->expdLo
, mkexpr(cas
->oldLo
)));
6834 /* 7. if "expected == old"
6835 store data# to shadow memory */
6836 do_shadow_Store( mce
, cas
->end
, cas
->addr
, 0/*bias*/,
6837 NULL
/*data*/, vdataLo
/*vdata*/,
6838 expd_eq_old
/*guard for store*/ );
6840 gen_store_b( mce
, elemSzB
, cas
->addr
, 0/*offset*/,
6842 expd_eq_old
/*guard for store*/ );
6847 static void do_shadow_CAS_double ( MCEnv
* mce
, IRCAS
* cas
)
6849 IRAtom
*vdataHi
= NULL
, *bdataHi
= NULL
;
6850 IRAtom
*vdataLo
= NULL
, *bdataLo
= NULL
;
6851 IRAtom
*vexpdHi
= NULL
, *bexpdHi
= NULL
;
6852 IRAtom
*vexpdLo
= NULL
, *bexpdLo
= NULL
;
6853 IRAtom
*voldHi
= NULL
, *boldHi
= NULL
;
6854 IRAtom
*voldLo
= NULL
, *boldLo
= NULL
;
6855 IRAtom
*xHi
= NULL
, *xLo
= NULL
, *xHL
= NULL
;
6856 IRAtom
*expd_eq_old
= NULL
, *zero
= NULL
;
6857 IROp opCasCmpEQ
, opOr
, opXor
;
6858 Int elemSzB
, memOffsLo
, memOffsHi
;
6860 Bool otrak
= MC_(clo_mc_level
) >= 3; /* a shorthand */
6863 tl_assert(cas
->oldHi
!= IRTemp_INVALID
);
6864 tl_assert(cas
->expdHi
!= NULL
);
6865 tl_assert(cas
->dataHi
!= NULL
);
6867 elemTy
= typeOfIRExpr(mce
->sb
->tyenv
, cas
->expdLo
);
6870 opCasCmpEQ
= Iop_CasCmpEQ8
; opOr
= Iop_Or8
; opXor
= Iop_Xor8
;
6871 elemSzB
= 1; zero
= mkU8(0);
6874 opCasCmpEQ
= Iop_CasCmpEQ16
; opOr
= Iop_Or16
; opXor
= Iop_Xor16
;
6875 elemSzB
= 2; zero
= mkU16(0);
6878 opCasCmpEQ
= Iop_CasCmpEQ32
; opOr
= Iop_Or32
; opXor
= Iop_Xor32
;
6879 elemSzB
= 4; zero
= mkU32(0);
6882 opCasCmpEQ
= Iop_CasCmpEQ64
; opOr
= Iop_Or64
; opXor
= Iop_Xor64
;
6883 elemSzB
= 8; zero
= mkU64(0);
6886 tl_assert(0); /* IR defn disallows any other types */
6889 /* 1. fetch data# (the proposed new value) */
6890 tl_assert(isOriginalAtom(mce
, cas
->dataHi
));
6891 tl_assert(isOriginalAtom(mce
, cas
->dataLo
));
6893 = assignNew('V', mce
, elemTy
, expr2vbits(mce
, cas
->dataHi
, HuOth
));
6895 = assignNew('V', mce
, elemTy
, expr2vbits(mce
, cas
->dataLo
, HuOth
));
6896 tl_assert(isShadowAtom(mce
, vdataHi
));
6897 tl_assert(isShadowAtom(mce
, vdataLo
));
6900 = assignNew('B', mce
, Ity_I32
, schemeE(mce
, cas
->dataHi
));
6902 = assignNew('B', mce
, Ity_I32
, schemeE(mce
, cas
->dataLo
));
6903 tl_assert(isShadowAtom(mce
, bdataHi
));
6904 tl_assert(isShadowAtom(mce
, bdataLo
));
6907 /* 2. fetch expected# (what we expect to see at the address) */
6908 tl_assert(isOriginalAtom(mce
, cas
->expdHi
));
6909 tl_assert(isOriginalAtom(mce
, cas
->expdLo
));
6911 = assignNew('V', mce
, elemTy
, expr2vbits(mce
, cas
->expdHi
, HuOth
));
6913 = assignNew('V', mce
, elemTy
, expr2vbits(mce
, cas
->expdLo
, HuOth
));
6914 tl_assert(isShadowAtom(mce
, vexpdHi
));
6915 tl_assert(isShadowAtom(mce
, vexpdLo
));
6918 = assignNew('B', mce
, Ity_I32
, schemeE(mce
, cas
->expdHi
));
6920 = assignNew('B', mce
, Ity_I32
, schemeE(mce
, cas
->expdLo
));
6921 tl_assert(isShadowAtom(mce
, bexpdHi
));
6922 tl_assert(isShadowAtom(mce
, bexpdLo
));
6925 /* 3. check definedness of address */
6926 /* 4. fetch old# from shadow memory; this also checks
6927 addressibility of the address */
6928 if (cas
->end
== Iend_LE
) {
6930 memOffsHi
= elemSzB
;
6932 tl_assert(cas
->end
== Iend_BE
);
6933 memOffsLo
= elemSzB
;
6941 cas
->end
, elemTy
, cas
->addr
, memOffsHi
/*Addr bias*/,
6942 NULL
/*always happens*/
6949 cas
->end
, elemTy
, cas
->addr
, memOffsLo
/*Addr bias*/,
6950 NULL
/*always happens*/
6952 bind_shadow_tmp_to_orig('V', mce
, mkexpr(cas
->oldHi
), voldHi
);
6953 bind_shadow_tmp_to_orig('V', mce
, mkexpr(cas
->oldLo
), voldLo
);
6956 = assignNew('B', mce
, Ity_I32
,
6957 gen_load_b(mce
, elemSzB
, cas
->addr
,
6958 memOffsHi
/*addr bias*/));
6960 = assignNew('B', mce
, Ity_I32
,
6961 gen_load_b(mce
, elemSzB
, cas
->addr
,
6962 memOffsLo
/*addr bias*/));
6963 bind_shadow_tmp_to_orig('B', mce
, mkexpr(cas
->oldHi
), boldHi
);
6964 bind_shadow_tmp_to_orig('B', mce
, mkexpr(cas
->oldLo
), boldLo
);
6967 /* 5. the CAS itself */
6968 stmt( 'C', mce
, IRStmt_CAS(cas
) );
6970 /* 6. compute "expected == old" */
6971 /* See COMMENT_ON_CasCmpEQ in this file background/rationale. */
6972 /* Note that 'C' is kinda faking it; it is indeed a non-shadow
6973 tree, but it's not copied from the input block. */
6975 xHi = oldHi ^ expdHi;
6976 xLo = oldLo ^ expdLo;
6978 expd_eq_old = xHL == 0;
6980 xHi
= assignNew('C', mce
, elemTy
,
6981 binop(opXor
, cas
->expdHi
, mkexpr(cas
->oldHi
)));
6982 xLo
= assignNew('C', mce
, elemTy
,
6983 binop(opXor
, cas
->expdLo
, mkexpr(cas
->oldLo
)));
6984 xHL
= assignNew('C', mce
, elemTy
,
6985 binop(opOr
, xHi
, xLo
));
6987 = assignNew('C', mce
, Ity_I1
,
6988 binop(opCasCmpEQ
, xHL
, zero
));
6990 /* 7. if "expected == old"
6991 store data# to shadow memory */
6992 do_shadow_Store( mce
, cas
->end
, cas
->addr
, memOffsHi
/*bias*/,
6993 NULL
/*data*/, vdataHi
/*vdata*/,
6994 expd_eq_old
/*guard for store*/ );
6995 do_shadow_Store( mce
, cas
->end
, cas
->addr
, memOffsLo
/*bias*/,
6996 NULL
/*data*/, vdataLo
/*vdata*/,
6997 expd_eq_old
/*guard for store*/ );
6999 gen_store_b( mce
, elemSzB
, cas
->addr
, memOffsHi
/*offset*/,
7001 expd_eq_old
/*guard for store*/ );
7002 gen_store_b( mce
, elemSzB
, cas
->addr
, memOffsLo
/*offset*/,
7004 expd_eq_old
/*guard for store*/ );
/* ------ Dealing with LL/SC (not difficult) ------ */

static void do_shadow_LLSC ( MCEnv*    mce,
                             IREndness stEnd,
                             IRTemp    stResult,
                             IRExpr*   stAddr,
                             IRExpr*   stStoredata )
{
   /* In short: treat a load-linked like a normal load followed by an
      assignment of the loaded (shadow) data to the result temporary.
      Treat a store-conditional like a normal store, and mark the
      result temporary as defined. */
   IRType resTy  = typeOfIRTemp(mce->sb->tyenv, stResult);
   IRTemp resTmp = findShadowTmpV(mce, stResult);

   tl_assert(isIRAtom(stAddr));
   if (stStoredata)
      tl_assert(isIRAtom(stStoredata));

   if (stStoredata == NULL) {
      /* Load Linked */
      /* Just treat this as a normal load, followed by an assignment of
         the value to .result. */
      tl_assert(resTy == Ity_I128 || resTy == Ity_I64 || resTy == Ity_I32
                || resTy == Ity_I16 || resTy == Ity_I8);
      assign( 'V', mce, resTmp,
                   expr2vbits_Load(
                      mce, stEnd, resTy, stAddr, 0/*addr bias*/,
                      NULL/*always happens*/) );
   } else {
      /* Store Conditional */
      IRType dataTy = typeOfIRExpr(mce->sb->tyenv,
                                   stStoredata);
      tl_assert(dataTy == Ity_I128 || dataTy == Ity_I64 || dataTy == Ity_I32
                || dataTy == Ity_I16 || dataTy == Ity_I8);
      do_shadow_Store( mce, stEnd,
                            stAddr, 0/* addr bias */,
                            stStoredata,
                            NULL /* shadow data */,
                            NULL/*guard*/ );
      /* This is a store conditional, so it writes to .result a value
         indicating whether or not the store succeeded.  Just claim
         this value is always defined.  In the PowerPC interpretation
         of store-conditional, definedness of the success indication
         depends on whether the address of the store matches the
         reservation address.  But we can't tell that here (and
         anyway, we're not being PowerPC-specific).  At least we are
         guaranteed that the definedness of the store address, and its
         addressability, will be checked as per normal.  So it seems
         pretty safe to just say that the success indication is always
         defined.

         In schemeS, for origin tracking, we must correspondingly set
         a no-origin value for the origin shadow of .result.
      */
      tl_assert(resTy == Ity_I1);
      assign( 'V', mce, resTmp, definedOfType(resTy) );
   }
}
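/* So, concretely: a load-linked "t = LL(addr)" is shadowed as
   "t# = <shadow load of addr>", while a store-conditional
   "res = SC(addr, data)" gets an ordinary shadow store of data#
   followed by "res# = all-defined", per the comment above. */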
/* ---- Dealing with LoadG/StoreG (not entirely simple) ---- */

static void do_shadow_StoreG ( MCEnv* mce, IRStoreG* sg )
{
   complainIfUndefined(mce, sg->guard, NULL);
   /* do_shadow_Store will generate code to check the definedness and
      validity of sg->addr, in the case where sg->guard evaluates to
      True at run-time. */
   do_shadow_Store( mce, sg->end,
                    sg->addr, 0/* addr bias */,
                    sg->data,
                    NULL /* shadow data */,
                    sg->guard );
}

static void do_shadow_LoadG ( MCEnv* mce, IRLoadG* lg )
{
   complainIfUndefined(mce, lg->guard, NULL);
   /* expr2vbits_Load_guarded_General will generate code to check the
      definedness and validity of lg->addr, in the case where
      lg->guard evaluates to True at run-time. */

   /* Look at the LoadG's built-in conversion operation, to determine
      the source (actual loaded data) type, and the equivalent IROp.
      NOTE that implicitly we are taking a widening operation to be
      applied to original atoms and producing one that applies to V
      bits.  Since signed and unsigned widening are self-shadowing,
      this is a straight copy of the op (modulo swapping from the
      IRLoadGOp form to the IROp form).  Note also therefore that this
      implicitly duplicates the logic to do with said widening ops in
      expr2vbits_Unop.  See comment at the start of expr2vbits_Unop. */
   IROp   vwiden   = Iop_INVALID;
   IRType loadedTy = Ity_INVALID;
   switch (lg->cvt) {
      case ILGop_IdentV128: loadedTy = Ity_V128; vwiden = Iop_INVALID; break;
      case ILGop_Ident64:   loadedTy = Ity_I64;  vwiden = Iop_INVALID; break;
      case ILGop_Ident32:   loadedTy = Ity_I32;  vwiden = Iop_INVALID; break;
      case ILGop_16Uto32:   loadedTy = Ity_I16;  vwiden = Iop_16Uto32; break;
      case ILGop_16Sto32:   loadedTy = Ity_I16;  vwiden = Iop_16Sto32; break;
      case ILGop_8Uto32:    loadedTy = Ity_I8;   vwiden = Iop_8Uto32;  break;
      case ILGop_8Sto32:    loadedTy = Ity_I8;   vwiden = Iop_8Sto32;  break;
      default: VG_(tool_panic)("do_shadow_LoadG");
   }

   IRAtom* vbits_alt
      = expr2vbits( mce, lg->alt, HuOth );
   IRAtom* vbits_final
      = expr2vbits_Load_guarded_General(mce, lg->end, loadedTy,
                                        lg->addr, 0/*addr bias*/,
                                        lg->guard, vwiden, vbits_alt );
   /* And finally, bind the V bits to the destination temporary. */
   assign( 'V', mce, findShadowTmpV(mce, lg->dst), vbits_final );
}
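/* In effect, the V bits bound to lg->dst are (sketch):

      lg->guard ? vwiden(<shadow load of lg->addr>) : vbits-of(lg->alt)

   mirroring the way the LoadG itself selects between the loaded value
   and lg->alt. */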
/*------------------------------------------------------------*/
/*--- Origin tracking stuff                                ---*/
/*------------------------------------------------------------*/

/* Almost identical to findShadowTmpV. */
static IRTemp findShadowTmpB ( MCEnv* mce, IRTemp orig )
{
   TempMapEnt* ent;
   /* VG_(indexXA) range-checks 'orig', hence no need to check it
      here. */
   ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
   tl_assert(ent->kind == Orig);
   if (ent->shadowB == IRTemp_INVALID) {
      IRTemp tmpB
         = newTemp( mce, Ity_I32, BSh );
      /* newTemp may cause mce->tmpMap to resize, hence previous results
         from VG_(indexXA) are invalid. */
      ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
      tl_assert(ent->kind == Orig);
      tl_assert(ent->shadowB == IRTemp_INVALID);
      ent->shadowB = tmpB;
   }
   return ent->shadowB;
}

static IRAtom* gen_maxU32 ( MCEnv* mce, IRAtom* b1, IRAtom* b2 )
{
   return assignNew( 'B', mce, Ity_I32, binop(Iop_Max32U, b1, b2) );
}
/* Make a guarded origin load, with no special handling in the
   didn't-happen case.  A GUARD of NULL is assumed to mean "always
   True".

   Generate IR to do a shadow origins load from BASEADDR+OFFSET and
   return the otag.  The loaded size is SZB.  If GUARD evaluates to
   False at run time then the returned otag is zero.
*/
7166 static IRAtom
* gen_guarded_load_b ( MCEnv
* mce
, Int szB
,
7168 Int offset
, IRExpr
* guard
)
7174 IRType aTy
= typeOfIRExpr( mce
->sb
->tyenv
, baseaddr
);
7175 IROp opAdd
= aTy
== Ity_I32
? Iop_Add32
: Iop_Add64
;
7176 IRAtom
* ea
= baseaddr
;
7178 IRAtom
* off
= aTy
== Ity_I32
? mkU32( offset
)
7179 : mkU64( (Long
)(Int
)offset
);
7180 ea
= assignNew( 'B', mce
, aTy
, binop(opAdd
, ea
, off
));
7182 bTmp
= newTemp(mce
, mce
->hWordTy
, BSh
);
7185 case 1: hFun
= (void*)&MC_(helperc_b_load1
);
7186 hName
= "MC_(helperc_b_load1)";
7188 case 2: hFun
= (void*)&MC_(helperc_b_load2
);
7189 hName
= "MC_(helperc_b_load2)";
7191 case 4: hFun
= (void*)&MC_(helperc_b_load4
);
7192 hName
= "MC_(helperc_b_load4)";
7194 case 8: hFun
= (void*)&MC_(helperc_b_load8
);
7195 hName
= "MC_(helperc_b_load8)";
7197 case 16: hFun
= (void*)&MC_(helperc_b_load16
);
7198 hName
= "MC_(helperc_b_load16)";
7200 case 32: hFun
= (void*)&MC_(helperc_b_load32
);
7201 hName
= "MC_(helperc_b_load32)";
7204 VG_(printf
)("mc_translate.c: gen_load_b: unhandled szB == %d\n", szB
);
7207 di
= unsafeIRDirty_1_N(
7208 bTmp
, 1/*regparms*/, hName
, VG_(fnptr_to_fnentry
)( hFun
),
7213 /* Ideally the didn't-happen return value here would be
7214 all-zeroes (unknown-origin), so it'd be harmless if it got
7215 used inadvertently. We slum it out with the IR-mandated
7216 default value (0b01 repeating, 0x55 etc) as that'll probably
7217 trump all legitimate otags via Max32, and it's pretty
7220 /* no need to mess with any annotations. This call accesses
7221 neither guest state nor guest memory. */
7222 stmt( 'B', mce
, IRStmt_Dirty(di
) );
7223 if (mce
->hWordTy
== Ity_I64
) {
7225 IRTemp bTmp32
= newTemp(mce
, Ity_I32
, BSh
);
7226 assign( 'B', mce
, bTmp32
, unop(Iop_64to32
, mkexpr(bTmp
)) );
7227 return mkexpr(bTmp32
);
7230 return mkexpr(bTmp
);
/* Generate IR to do a shadow origins load from BASEADDR+OFFSET.  The
   loaded size is SZB.  The load is regarded as unconditional (always
   happens). */
static IRAtom* gen_load_b ( MCEnv* mce, Int szB, IRAtom* baseaddr,
                            Int offset )
{
   return gen_guarded_load_b(mce, szB, baseaddr, offset, NULL/*guard*/);
}
/* The most general handler for guarded origin loads.  A GUARD of NULL
   is assumed to mean "always True".

   Generate IR to do a shadow origin load from ADDR+BIAS and return
   the B bits.  The loaded type is TY.  If GUARD evaluates to False at
   run time then the returned B bits are simply BALT instead.
*/
7254 IRAtom
* expr2ori_Load_guarded_General ( MCEnv
* mce
,
7256 IRAtom
* addr
, UInt bias
,
7257 IRAtom
* guard
, IRAtom
* balt
)
7259 /* If the guard evaluates to True, this will hold the loaded
7260 origin. If the guard evaluates to False, this will be zero,
7261 meaning "unknown origin", in which case we will have to replace
7262 it using an ITE below. */
7264 = assignNew('B', mce
, Ity_I32
,
7265 gen_guarded_load_b(mce
, sizeofIRType(ty
),
7266 addr
, bias
, guard
));
7267 /* These are the bits we will return if the load doesn't take
7271 /* Prepare the cond for the ITE. Convert a NULL cond into
7272 something that iropt knows how to fold out later. */
7274 = guard
== NULL
? mkU1(1) : guard
;
7275 /* And assemble the final result. */
7276 return assignNew('B', mce
, Ity_I32
, IRExpr_ITE(cond
, iftrue
, iffalse
));
/* Generate a shadow origins store.  guard :: Ity_I1 controls whether
   the store really happens; NULL means it unconditionally does. */
7282 static void gen_store_b ( MCEnv
* mce
, Int szB
,
7283 IRAtom
* baseaddr
, Int offset
, IRAtom
* dataB
,
7289 IRType aTy
= typeOfIRExpr( mce
->sb
->tyenv
, baseaddr
);
7290 IROp opAdd
= aTy
== Ity_I32
? Iop_Add32
: Iop_Add64
;
7291 IRAtom
* ea
= baseaddr
;
7293 tl_assert(isOriginalAtom(mce
, guard
));
7294 tl_assert(typeOfIRExpr(mce
->sb
->tyenv
, guard
) == Ity_I1
);
7297 IRAtom
* off
= aTy
== Ity_I32
? mkU32( offset
)
7298 : mkU64( (Long
)(Int
)offset
);
7299 ea
= assignNew( 'B', mce
, aTy
, binop(opAdd
, ea
, off
));
7301 if (mce
->hWordTy
== Ity_I64
)
7302 dataB
= assignNew( 'B', mce
, Ity_I64
, unop(Iop_32Uto64
, dataB
));
7305 case 1: hFun
= (void*)&MC_(helperc_b_store1
);
7306 hName
= "MC_(helperc_b_store1)";
7308 case 2: hFun
= (void*)&MC_(helperc_b_store2
);
7309 hName
= "MC_(helperc_b_store2)";
7311 case 4: hFun
= (void*)&MC_(helperc_b_store4
);
7312 hName
= "MC_(helperc_b_store4)";
7314 case 8: hFun
= (void*)&MC_(helperc_b_store8
);
7315 hName
= "MC_(helperc_b_store8)";
7317 case 16: hFun
= (void*)&MC_(helperc_b_store16
);
7318 hName
= "MC_(helperc_b_store16)";
7320 case 32: hFun
= (void*)&MC_(helperc_b_store32
);
7321 hName
= "MC_(helperc_b_store32)";
7326 di
= unsafeIRDirty_0_N( 2/*regparms*/,
7327 hName
, VG_(fnptr_to_fnentry
)( hFun
),
7328 mkIRExprVec_2( ea
, dataB
)
7330 /* no need to mess with any annotations. This call accesses
7331 neither guest state nor guest memory. */
7332 if (guard
) di
->guard
= guard
;
7333 stmt( 'B', mce
, IRStmt_Dirty(di
) );
static IRAtom* narrowTo32 ( MCEnv* mce, IRAtom* e ) {
   IRType eTy = typeOfIRExpr(mce->sb->tyenv, e);
   if (eTy == Ity_I64)
      return assignNew( 'B', mce, Ity_I32, unop(Iop_64to32, e) );
   if (eTy == Ity_I32)
      return e;
   tl_assert(0);
}

static IRAtom* zWidenFrom32 ( MCEnv* mce, IRType dstTy, IRAtom* e ) {
   IRType eTy = typeOfIRExpr(mce->sb->tyenv, e);
   tl_assert(eTy == Ity_I32);
   if (dstTy == Ity_I64)
      return assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, e) );
   tl_assert(0);
}
7354 static IRAtom
* schemeE ( MCEnv
* mce
, IRExpr
* e
)
7356 tl_assert(MC_(clo_mc_level
) == 3);
7361 IRRegArray
* descr_b
;
7362 IRAtom
*t1
, *t2
, *t3
, *t4
;
7363 IRRegArray
* descr
= e
->Iex
.GetI
.descr
;
7365 = MC_(get_otrack_reg_array_equiv_int_type
)(descr
);
7366 /* If this array is unshadowable for whatever reason, use the
7367 usual approximation. */
7368 if (equivIntTy
== Ity_INVALID
)
7370 tl_assert(sizeofIRType(equivIntTy
) >= 4);
7371 tl_assert(sizeofIRType(equivIntTy
) == sizeofIRType(descr
->elemTy
));
7372 descr_b
= mkIRRegArray( descr
->base
+ 2*mce
->layout
->total_sizeB
,
7373 equivIntTy
, descr
->nElems
);
7374 /* Do a shadow indexed get of the same size, giving t1. Take
7375 the bottom 32 bits of it, giving t2. Compute into t3 the
7376 origin for the index (almost certainly zero, but there's
7377 no harm in being completely general here, since iropt will
7378 remove any useless code), and fold it in, giving a final
7380 t1
= assignNew( 'B', mce
, equivIntTy
,
7381 IRExpr_GetI( descr_b
, e
->Iex
.GetI
.ix
,
7382 e
->Iex
.GetI
.bias
));
7383 t2
= narrowTo32( mce
, t1
);
7384 t3
= schemeE( mce
, e
->Iex
.GetI
.ix
);
7385 t4
= gen_maxU32( mce
, t2
, t3
);
7391 IRExpr
** args
= e
->Iex
.CCall
.args
;
7392 IRAtom
* curr
= mkU32(0);
7393 for (i
= 0; args
[i
]; i
++) {
7395 tl_assert(isOriginalAtom(mce
, args
[i
]));
7396 /* Only take notice of this arg if the callee's
7397 mc-exclusion mask does not say it is to be excluded. */
7398 if (e
->Iex
.CCall
.cee
->mcx_mask
& (1<<i
)) {
7399 /* the arg is to be excluded from definedness checking.
7401 if (0) VG_(printf
)("excluding %s(%d)\n",
7402 e
->Iex
.CCall
.cee
->name
, i
);
7404 /* calculate the arg's definedness, and pessimistically
7406 here
= schemeE( mce
, args
[i
] );
7407 curr
= gen_maxU32( mce
, curr
, here
);
7414 dszB
= sizeofIRType(e
->Iex
.Load
.ty
);
7415 /* assert that the B value for the address is already
7416 available (somewhere) */
7417 tl_assert(isIRAtom(e
->Iex
.Load
.addr
));
7418 tl_assert(mce
->hWordTy
== Ity_I32
|| mce
->hWordTy
== Ity_I64
);
7419 return gen_load_b( mce
, dszB
, e
->Iex
.Load
.addr
, 0 );
7422 IRAtom
* b1
= schemeE( mce
, e
->Iex
.ITE
.cond
);
7423 IRAtom
* b3
= schemeE( mce
, e
->Iex
.ITE
.iftrue
);
7424 IRAtom
* b2
= schemeE( mce
, e
->Iex
.ITE
.iffalse
);
7425 return gen_maxU32( mce
, b1
, gen_maxU32( mce
, b2
, b3
));
7428 IRAtom
* b1
= schemeE( mce
, e
->Iex
.Qop
.details
->arg1
);
7429 IRAtom
* b2
= schemeE( mce
, e
->Iex
.Qop
.details
->arg2
);
7430 IRAtom
* b3
= schemeE( mce
, e
->Iex
.Qop
.details
->arg3
);
7431 IRAtom
* b4
= schemeE( mce
, e
->Iex
.Qop
.details
->arg4
);
7432 return gen_maxU32( mce
, gen_maxU32( mce
, b1
, b2
),
7433 gen_maxU32( mce
, b3
, b4
) );
7436 IRAtom
* b1
= schemeE( mce
, e
->Iex
.Triop
.details
->arg1
);
7437 IRAtom
* b2
= schemeE( mce
, e
->Iex
.Triop
.details
->arg2
);
7438 IRAtom
* b3
= schemeE( mce
, e
->Iex
.Triop
.details
->arg3
);
7439 return gen_maxU32( mce
, b1
, gen_maxU32( mce
, b2
, b3
) );
7442 switch (e
->Iex
.Binop
.op
) {
7443 case Iop_CasCmpEQ8
: case Iop_CasCmpNE8
:
7444 case Iop_CasCmpEQ16
: case Iop_CasCmpNE16
:
7445 case Iop_CasCmpEQ32
: case Iop_CasCmpNE32
:
7446 case Iop_CasCmpEQ64
: case Iop_CasCmpNE64
:
7447 /* Just say these all produce a defined result,
7448 regardless of their arguments. See
7449 COMMENT_ON_CasCmpEQ in this file. */
7452 IRAtom
* b1
= schemeE( mce
, e
->Iex
.Binop
.arg1
);
7453 IRAtom
* b2
= schemeE( mce
, e
->Iex
.Binop
.arg2
);
7454 return gen_maxU32( mce
, b1
, b2
);
7461 IRAtom
* b1
= schemeE( mce
, e
->Iex
.Unop
.arg
);
7467 return mkexpr( findShadowTmpB( mce
, e
->Iex
.RdTmp
.tmp
));
7469 Int b_offset
= MC_(get_otrack_shadow_offset
)(
7471 sizeofIRType(e
->Iex
.Get
.ty
)
7473 tl_assert(b_offset
>= -1
7474 && b_offset
<= mce
->layout
->total_sizeB
-4);
7475 if (b_offset
>= 0) {
7476 /* FIXME: this isn't an atom! */
7477 return IRExpr_Get( b_offset
+ 2*mce
->layout
->total_sizeB
,
7483 VG_(printf
)("mc_translate.c: schemeE: unhandled: ");
7485 VG_(tool_panic
)("memcheck:schemeE");
7490 static void do_origins_Dirty ( MCEnv
* mce
, IRDirty
* d
)
7492 // This is a hacked version of do_shadow_Dirty
7493 Int i
, k
, n
, toDo
, gSz
, gOff
;
7494 IRAtom
*here
, *curr
;
7497 /* First check the guard. */
7498 curr
= schemeE( mce
, d
->guard
);
7500 /* Now round up all inputs and maxU32 over them. */
7502 /* Inputs: unmasked args
7503 Note: arguments are evaluated REGARDLESS of the guard expression */
7504 for (i
= 0; d
->args
[i
]; i
++) {
7505 IRAtom
* arg
= d
->args
[i
];
7506 if ( (d
->cee
->mcx_mask
& (1<<i
))
7507 || UNLIKELY(is_IRExpr_VECRET_or_GSPTR(arg
)) ) {
7508 /* ignore this arg */
7510 here
= schemeE( mce
, arg
);
7511 curr
= gen_maxU32( mce
, curr
, here
);
7515 /* Inputs: guest state that we read. */
7516 for (i
= 0; i
< d
->nFxState
; i
++) {
7517 tl_assert(d
->fxState
[i
].fx
!= Ifx_None
);
7518 if (d
->fxState
[i
].fx
== Ifx_Write
)
7521 /* Enumerate the described state segments */
7522 for (k
= 0; k
< 1 + d
->fxState
[i
].nRepeats
; k
++) {
7523 gOff
= d
->fxState
[i
].offset
+ k
* d
->fxState
[i
].repeatLen
;
7524 gSz
= d
->fxState
[i
].size
;
7526 /* Ignore any sections marked as 'always defined'. */
7527 if (isAlwaysDefd(mce
, gOff
, gSz
)) {
7529 VG_(printf
)("memcheck: Dirty gst: ignored off %d, sz %d\n",
7534 /* This state element is read or modified. So we need to
7535 consider it. If larger than 4 bytes, deal with it in
7539 tl_assert(gSz
>= 0);
7540 if (gSz
== 0) break;
7541 n
= gSz
<= 4 ? gSz
: 4;
7542 /* update 'curr' with maxU32 of the state slice
7544 b_offset
= MC_(get_otrack_shadow_offset
)(gOff
, 4);
7545 if (b_offset
!= -1) {
7546 /* Observe the guard expression. If it is false use 0, i.e.
7547 nothing is known about the origin */
7548 IRAtom
*cond
, *iffalse
, *iftrue
;
7550 cond
= assignNew( 'B', mce
, Ity_I1
, d
->guard
);
7552 iftrue
= assignNew( 'B', mce
, Ity_I32
,
7554 + 2*mce
->layout
->total_sizeB
,
7556 here
= assignNew( 'B', mce
, Ity_I32
,
7557 IRExpr_ITE(cond
, iftrue
, iffalse
));
7558 curr
= gen_maxU32( mce
, curr
, here
);
7566 /* Inputs: memory */
7568 if (d
->mFx
!= Ifx_None
) {
7569 /* Because we may do multiple shadow loads/stores from the same
7570 base address, it's best to do a single test of its
7571 definedness right now. Post-instrumentation optimisation
7572 should remove all but this test. */
7573 tl_assert(d
->mAddr
);
7574 here
= schemeE( mce
, d
->mAddr
);
7575 curr
= gen_maxU32( mce
, curr
, here
);
7578 /* Deal with memory inputs (reads or modifies) */
7579 if (d
->mFx
== Ifx_Read
|| d
->mFx
== Ifx_Modify
) {
7581 /* chew off 32-bit chunks. We don't care about the endianness
7582 since it's all going to be condensed down to a single bit,
7583 but nevertheless choose an endianness which is hopefully
7584 native to the platform. */
7586 here
= gen_guarded_load_b( mce
, 4, d
->mAddr
, d
->mSize
- toDo
,
7588 curr
= gen_maxU32( mce
, curr
, here
);
7591 /* handle possible 16-bit excess */
7593 here
= gen_guarded_load_b( mce
, 2, d
->mAddr
, d
->mSize
- toDo
,
7595 curr
= gen_maxU32( mce
, curr
, here
);
7598 /* chew off the remaining 8-bit chunk, if any */
7600 here
= gen_guarded_load_b( mce
, 1, d
->mAddr
, d
->mSize
- toDo
,
7602 curr
= gen_maxU32( mce
, curr
, here
);
7605 tl_assert(toDo
== 0);
7608 /* Whew! So curr is a 32-bit B-value which should give an origin
7609 of some use if any of the inputs to the helper are undefined.
7610 Now we need to re-distribute the results to all destinations. */
7612 /* Outputs: the destination temporary, if there is one. */
7613 if (d
->tmp
!= IRTemp_INVALID
) {
7614 dst
= findShadowTmpB(mce
, d
->tmp
);
7615 assign( 'V', mce
, dst
, curr
);
7618 /* Outputs: guest state that we write or modify. */
7619 for (i
= 0; i
< d
->nFxState
; i
++) {
7620 tl_assert(d
->fxState
[i
].fx
!= Ifx_None
);
7621 if (d
->fxState
[i
].fx
== Ifx_Read
)
7624 /* Enumerate the described state segments */
7625 for (k
= 0; k
< 1 + d
->fxState
[i
].nRepeats
; k
++) {
7626 gOff
= d
->fxState
[i
].offset
+ k
* d
->fxState
[i
].repeatLen
;
7627 gSz
= d
->fxState
[i
].size
;
7629 /* Ignore any sections marked as 'always defined'. */
7630 if (isAlwaysDefd(mce
, gOff
, gSz
))
7633 /* This state element is written or modified. So we need to
7634 consider it. If larger than 4 bytes, deal with it in
7638 tl_assert(gSz
>= 0);
7639 if (gSz
== 0) break;
7640 n
= gSz
<= 4 ? gSz
: 4;
7641 /* Write 'curr' to the state slice gOff .. gOff+n-1 */
7642 b_offset
= MC_(get_otrack_shadow_offset
)(gOff
, 4);
7643 if (b_offset
!= -1) {
7645 /* If the guard expression evaluates to false we simply Put
7646 the value that is already stored in the guest state slot */
7647 IRAtom
*cond
, *iffalse
;
7649 cond
= assignNew('B', mce
, Ity_I1
,
7651 iffalse
= assignNew('B', mce
, Ity_I32
,
7652 IRExpr_Get(b_offset
+
7653 2*mce
->layout
->total_sizeB
,
7655 curr
= assignNew('V', mce
, Ity_I32
,
7656 IRExpr_ITE(cond
, curr
, iffalse
));
7658 stmt( 'B', mce
, IRStmt_Put(b_offset
7659 + 2*mce
->layout
->total_sizeB
,
7668 /* Outputs: memory that we write or modify. Same comments about
7669 endianness as above apply. */
7670 if (d
->mFx
== Ifx_Write
|| d
->mFx
== Ifx_Modify
) {
7672 /* chew off 32-bit chunks */
7674 gen_store_b( mce
, 4, d
->mAddr
, d
->mSize
- toDo
, curr
,
7678 /* handle possible 16-bit excess */
7680 gen_store_b( mce
, 2, d
->mAddr
, d
->mSize
- toDo
, curr
,
7684 /* chew off the remaining 8-bit chunk, if any */
7686 gen_store_b( mce
, 1, d
->mAddr
, d
->mSize
- toDo
, curr
,
7690 tl_assert(toDo
== 0);
7695 /* Generate IR for origin shadowing for a general guarded store. */
7696 static void do_origins_Store_guarded ( MCEnv
* mce
,
7704 /* assert that the B value for the address is already available
7705 (somewhere), since the call to schemeE will want to see it.
7706 XXXX how does this actually ensure that?? */
7707 tl_assert(isIRAtom(stAddr
));
7708 tl_assert(isIRAtom(stData
));
7709 dszB
= sizeofIRType( typeOfIRExpr(mce
->sb
->tyenv
, stData
) );
7710 dataB
= schemeE( mce
, stData
);
7711 gen_store_b( mce
, dszB
, stAddr
, 0/*offset*/, dataB
, guard
);
7715 /* Generate IR for origin shadowing for a plain store. */
7716 static void do_origins_Store_plain ( MCEnv
* mce
,
7721 do_origins_Store_guarded ( mce
, stEnd
, stAddr
, stData
,
7726 /* ---- Dealing with LoadG/StoreG (not entirely simple) ---- */
7728 static void do_origins_StoreG ( MCEnv
* mce
, IRStoreG
* sg
)
7730 do_origins_Store_guarded( mce
, sg
->end
, sg
->addr
,
7731 sg
->data
, sg
->guard
);
7734 static void do_origins_LoadG ( MCEnv
* mce
, IRLoadG
* lg
)
7736 IRType loadedTy
= Ity_INVALID
;
7738 case ILGop_IdentV128
: loadedTy
= Ity_V128
; break;
7739 case ILGop_Ident64
: loadedTy
= Ity_I64
; break;
7740 case ILGop_Ident32
: loadedTy
= Ity_I32
; break;
7741 case ILGop_16Uto32
: loadedTy
= Ity_I16
; break;
7742 case ILGop_16Sto32
: loadedTy
= Ity_I16
; break;
7743 case ILGop_8Uto32
: loadedTy
= Ity_I8
; break;
7744 case ILGop_8Sto32
: loadedTy
= Ity_I8
; break;
7745 default: VG_(tool_panic
)("schemeS.IRLoadG");
7748 = schemeE( mce
,lg
->alt
);
7750 = expr2ori_Load_guarded_General(mce
, loadedTy
,
7751 lg
->addr
, 0/*addr bias*/,
7752 lg
->guard
, ori_alt
);
7753 /* And finally, bind the origin to the destination temporary. */
7754 assign( 'B', mce
, findShadowTmpB(mce
, lg
->dst
), ori_final
);
7758 static void schemeS ( MCEnv
* mce
, IRStmt
* st
)
7760 tl_assert(MC_(clo_mc_level
) == 3);
7765 /* The value-check instrumenter handles this - by arranging
7766 to pass the address of the next instruction to
7767 MC_(helperc_MAKE_STACK_UNINIT). This is all that needs to
7768 happen for origin tracking w.r.t. AbiHints. So there is
7769 nothing to do here. */
7773 IRPutI
*puti
= st
->Ist
.PutI
.details
;
7774 IRRegArray
* descr_b
;
7775 IRAtom
*t1
, *t2
, *t3
, *t4
;
7776 IRRegArray
* descr
= puti
->descr
;
7778 = MC_(get_otrack_reg_array_equiv_int_type
)(descr
);
7779 /* If this array is unshadowable for whatever reason,
7780 generate no code. */
7781 if (equivIntTy
== Ity_INVALID
)
7783 tl_assert(sizeofIRType(equivIntTy
) >= 4);
7784 tl_assert(sizeofIRType(equivIntTy
) == sizeofIRType(descr
->elemTy
));
7786 = mkIRRegArray( descr
->base
+ 2*mce
->layout
->total_sizeB
,
7787 equivIntTy
, descr
->nElems
);
7788 /* Compute a value to Put - the conjoinment of the origin for
7789 the data to be Put-ted (obviously) and of the index value
7790 (not so obviously). */
7791 t1
= schemeE( mce
, puti
->data
);
7792 t2
= schemeE( mce
, puti
->ix
);
7793 t3
= gen_maxU32( mce
, t1
, t2
);
7794 t4
= zWidenFrom32( mce
, equivIntTy
, t3
);
7795 stmt( 'B', mce
, IRStmt_PutI( mkIRPutI(descr_b
, puti
->ix
,
7801 do_origins_Dirty( mce
, st
->Ist
.Dirty
.details
);
7805 do_origins_Store_plain( mce
, st
->Ist
.Store
.end
,
7807 st
->Ist
.Store
.data
);
7811 do_origins_StoreG( mce
, st
->Ist
.StoreG
.details
);
7815 do_origins_LoadG( mce
, st
->Ist
.LoadG
.details
);
7819 /* In short: treat a load-linked like a normal load followed
7820 by an assignment of the loaded (shadow) data the result
7821 temporary. Treat a store-conditional like a normal store,
7822 and mark the result temporary as defined. */
7823 if (st
->Ist
.LLSC
.storedata
== NULL
) {
7826 = typeOfIRTemp(mce
->sb
->tyenv
, st
->Ist
.LLSC
.result
);
7828 = IRExpr_Load(st
->Ist
.LLSC
.end
, resTy
, st
->Ist
.LLSC
.addr
);
7829 tl_assert(resTy
== Ity_I128
|| resTy
== Ity_I64
|| resTy
== Ity_I32
7830 || resTy
== Ity_I16
|| resTy
== Ity_I8
);
7831 assign( 'B', mce
, findShadowTmpB(mce
, st
->Ist
.LLSC
.result
),
7832 schemeE(mce
, vanillaLoad
));
7834 /* Store conditional */
7835 do_origins_Store_plain( mce
, st
->Ist
.LLSC
.end
,
7837 st
->Ist
.LLSC
.storedata
);
7838 /* For the rationale behind this, see comments at the
7839 place where the V-shadow for .result is constructed, in
7840 do_shadow_LLSC. In short, we regard .result as
7842 assign( 'B', mce
, findShadowTmpB(mce
, st
->Ist
.LLSC
.result
),
7850 = MC_(get_otrack_shadow_offset
)(
7852 sizeofIRType(typeOfIRExpr(mce
->sb
->tyenv
, st
->Ist
.Put
.data
))
7854 if (b_offset
>= 0) {
7855 /* FIXME: this isn't an atom! */
7856 stmt( 'B', mce
, IRStmt_Put(b_offset
+ 2*mce
->layout
->total_sizeB
,
7857 schemeE( mce
, st
->Ist
.Put
.data
)) );
7863 assign( 'B', mce
, findShadowTmpB(mce
, st
->Ist
.WrTmp
.tmp
),
7864 schemeE(mce
, st
->Ist
.WrTmp
.data
) );
7874 VG_(printf
)("mc_translate.c: schemeS: unhandled: ");
7876 VG_(tool_panic
)("memcheck:schemeS");
/*------------------------------------------------------------*/
/*--- Post-tree-build final tidying                        ---*/
/*------------------------------------------------------------*/

/* This exploits the observation that Memcheck often produces
   repeated conditional calls of the form

   Dirty G MC_(helperc_value_check0/1/4/8_fail)(UInt otag)

   with the same guard expression G guarding the same helper call.
   The second and subsequent calls are redundant.  This usually
   results from instrumentation of guest code containing multiple
   memory references at different constant offsets from the same base
   register.  After optimisation of the instrumentation, you get a
   test for the definedness of the base register for each memory
   reference, which is kinda pointless.  MC_(final_tidy) therefore
   looks for such repeated calls and removes all but the first. */
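/* For instance (sketch): with G a definedness test of some base
   register, a superblock may end up containing

      Dirty G MC_(helperc_value_check8_fail_no_o)()
      ...
      Dirty G MC_(helperc_value_check8_fail_no_o)()   <- redundant

   and MC_(final_tidy) drops the second and any later occurrences,
   because that (helper, guard) pair has already been seen. */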
/* With some testing on perf/bz2.c, on amd64 and x86, compiled with
   gcc-5.3.1 -O2, it appears that 16 entries in the array are enough to
   get almost all the benefits of this transformation whilst causing
   the slide-back case to trigger just often enough to be verifiably
   correct.  For posterity, the numbers are:

   1     4,336 (112,212 -> 1,709,473; ratio 15.2)
   2     4,336 (112,194 -> 1,669,895; ratio 14.9)
   3     4,336 (112,194 -> 1,660,713; ratio 14.8)
   4     4,336 (112,194 -> 1,658,555; ratio 14.8)
   5     4,336 (112,194 -> 1,655,447; ratio 14.8)
   6     4,336 (112,194 -> 1,655,101; ratio 14.8)
   7     4,336 (112,194 -> 1,654,858; ratio 14.7)
   8     4,336 (112,194 -> 1,654,810; ratio 14.7)
   10    4,336 (112,194 -> 1,654,621; ratio 14.7)
   12    4,336 (112,194 -> 1,654,678; ratio 14.7)
   16    4,336 (112,194 -> 1,654,494; ratio 14.7)
   32    4,336 (112,194 -> 1,654,602; ratio 14.7)
   inf   4,336 (112,194 -> 1,654,602; ratio 14.7)

   1     4,113 (107,329 -> 1,822,171; ratio 17.0)
   2     4,113 (107,329 -> 1,806,443; ratio 16.8)
   3     4,113 (107,329 -> 1,803,967; ratio 16.8)
   4     4,113 (107,329 -> 1,802,785; ratio 16.8)
   5     4,113 (107,329 -> 1,802,412; ratio 16.8)
   6     4,113 (107,329 -> 1,802,062; ratio 16.8)
   7     4,113 (107,329 -> 1,801,976; ratio 16.8)
   8     4,113 (107,329 -> 1,801,886; ratio 16.8)
   10    4,113 (107,329 -> 1,801,653; ratio 16.8)
   12    4,113 (107,329 -> 1,801,526; ratio 16.8)
   16    4,113 (107,329 -> 1,801,298; ratio 16.8)
   32    4,113 (107,329 -> 1,800,827; ratio 16.8)
   inf   4,113 (107,329 -> 1,800,827; ratio 16.8)
*/
/* Structs for recording which (helper, guard) pairs we have already
   seen. */

#define N_TIDYING_PAIRS 16

typedef
   struct { void* entry; IRExpr* guard; }
   Pair;

typedef
   struct {
      Pair pairs[N_TIDYING_PAIRS +1/*for bounds checking*/];
      UInt pairsUsed;
   }
   Pairs;


/* Return True if e1 and e2 definitely denote the same value (used to
   compare guards).  Return False if unknown; False is the safe
   answer.  Since guest registers and guest memory do not have the
   SSA property we must return False if any Gets or Loads appear in
   the expression.  This implicitly assumes that e1 and e2 have the
   same IR type, which is always true here -- the type is Ity_I1. */
static Bool sameIRValue ( IRExpr* e1, IRExpr* e2 )
{
   if (e1->tag != e2->tag)
      return False;
   switch (e1->tag) {
      case Iex_Const:
         return eqIRConst( e1->Iex.Const.con, e2->Iex.Const.con );
      case Iex_Binop:
         return e1->Iex.Binop.op == e2->Iex.Binop.op
                && sameIRValue(e1->Iex.Binop.arg1, e2->Iex.Binop.arg1)
                && sameIRValue(e1->Iex.Binop.arg2, e2->Iex.Binop.arg2);
      case Iex_Unop:
         return e1->Iex.Unop.op == e2->Iex.Unop.op
                && sameIRValue(e1->Iex.Unop.arg, e2->Iex.Unop.arg);
      case Iex_RdTmp:
         return e1->Iex.RdTmp.tmp == e2->Iex.RdTmp.tmp;
      case Iex_ITE:
         return sameIRValue( e1->Iex.ITE.cond, e2->Iex.ITE.cond )
                && sameIRValue( e1->Iex.ITE.iftrue, e2->Iex.ITE.iftrue )
                && sameIRValue( e1->Iex.ITE.iffalse, e2->Iex.ITE.iffalse );
      case Iex_Qop:
      case Iex_Triop:
      case Iex_CCall:
         /* be lazy.  Could define equality for these, but they never
            appear to be used. */
         return False;
      case Iex_Get:
      case Iex_GetI:
      case Iex_Load:
         /* be conservative - these may not give the same value each
            time */
         return False;
      default:
         /* should never see this */
         VG_(printf)("mc_translate.c: sameIRValue: unhandled: ");
         ppIRExpr(e1);
         VG_(tool_panic)("memcheck:sameIRValue");
   }
}
/* See if 'pairs' already has an entry for (entry, guard).  Return
   True if so.  If not, add an entry. */
static Bool check_or_add ( Pairs* tidyingEnv, IRExpr* guard, void* entry )
{
   UInt i, n = tidyingEnv->pairsUsed;
   tl_assert(n <= N_TIDYING_PAIRS);
   for (i = 0; i < n; i++) {
      if (tidyingEnv->pairs[i].entry == entry
          && sameIRValue(tidyingEnv->pairs[i].guard, guard))
         return True;
   }
   /* (guard, entry) wasn't found in the array.  Add it at the end.
      If the array is already full, slide the entries one slot
      backwards.  This means we will lose the ability to detect
      duplicates from the pair in slot zero, but that happens so
      rarely that it's unlikely to have much effect on overall code
      quality.  Also, this strategy loses the check for the oldest
      tracked exit (memory reference, basically) and so that is (I'd
      guess) least likely to be re-used after this point. */
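   /* Illustrative trace (values invented): with N_TIDYING_PAIRS == 4
      and pairs = [A,B,C,D] all in use, adding a new pair E slides the
      array back to [B,C,D,E].  The oldest pair A is forgotten, so a
      later duplicate of A would no longer be spotted -- which, per the
      measurements above, happens too rarely to matter. */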
   if (n == N_TIDYING_PAIRS) {
      for (i = 1; i < N_TIDYING_PAIRS; i++) {
         tidyingEnv->pairs[i-1] = tidyingEnv->pairs[i];
      }
      tidyingEnv->pairs[N_TIDYING_PAIRS-1].entry = entry;
      tidyingEnv->pairs[N_TIDYING_PAIRS-1].guard = guard;
   } else {
      tl_assert(n < N_TIDYING_PAIRS);
      tidyingEnv->pairs[n].entry = entry;
      tidyingEnv->pairs[n].guard = guard;
      tidyingEnv->pairsUsed = n+1;
   }
   return False;
}
static Bool is_helperc_value_checkN_fail ( const HChar* name )
{
   /* This is expensive because it happens a lot.  We are checking to
      see whether |name| is one of the following 8 strings:

         MC_(helperc_value_check8_fail_no_o)
         MC_(helperc_value_check4_fail_no_o)
         MC_(helperc_value_check0_fail_no_o)
         MC_(helperc_value_check1_fail_no_o)
         MC_(helperc_value_check8_fail_w_o)
         MC_(helperc_value_check0_fail_w_o)
         MC_(helperc_value_check1_fail_w_o)
         MC_(helperc_value_check4_fail_w_o)

      To speed it up, check the common prefix just once, rather than
      once for each of the 8 candidate strings. */
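   /* Worked example (illustrative): for
         name = "MC_(helperc_value_check4_fail_w_o)"
      the loop below walks past the shared prefix
      "MC_(helperc_value_check", leaving |name| pointing at
      "4_fail_w_o)", which matches one of the eight suffix strcmps, so
      we return True.  For, say, "MC_(helperc_LOADV32le)" the prefix
      comparison fails at the 'L' and we return False without doing
      any strcmps at all. */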
   const HChar* prefix = "MC_(helperc_value_check";
   HChar n, p;
   while (True) {
      n = *name;
      p = *prefix;
      if (p == 0) break; /* ran off the end of the prefix */
      /* We still have some prefix to use */
      if (n == 0) return False; /* have prefix, but name ran out */
      if (n != p) return False; /* have both pfx and name, but no match */
      name++;
      prefix++;
   }

   /* Check the part after the prefix. */
   tl_assert(*prefix == 0 && *name != 0);
   return 0==VG_(strcmp)(name, "8_fail_no_o)")
          || 0==VG_(strcmp)(name, "4_fail_no_o)")
          || 0==VG_(strcmp)(name, "0_fail_no_o)")
          || 0==VG_(strcmp)(name, "1_fail_no_o)")
          || 0==VG_(strcmp)(name, "8_fail_w_o)")
          || 0==VG_(strcmp)(name, "4_fail_w_o)")
          || 0==VG_(strcmp)(name, "0_fail_w_o)")
          || 0==VG_(strcmp)(name, "1_fail_w_o)");
}
IRSB* MC_(final_tidy) ( IRSB* sb_in )
{
   Int       i;
   IRStmt*   st;
   IRDirty*  di;
   IRExpr*   guard;
   IRCallee* cee;
   Bool      alreadyPresent;
   Pairs     pairs;

   pairs.pairsUsed = 0;
   pairs.pairs[N_TIDYING_PAIRS].entry = (void*)0x123;
   pairs.pairs[N_TIDYING_PAIRS].guard = (IRExpr*)0x456;

   /* Scan forwards through the statements.  Each time a call to one
      of the relevant helpers is seen, check if we have made a
      previous call to the same helper using the same guard
      expression, and if so, delete the call. */
   for (i = 0; i < sb_in->stmts_used; i++) {
      st = sb_in->stmts[i];
      if (st->tag != Ist_Dirty)
         continue;
      di = st->Ist.Dirty.details;
      guard = di->guard;
      if (0) { ppIRExpr(guard); VG_(printf)("\n"); }
      cee = di->cee;
      if (!is_helperc_value_checkN_fail( cee->name ))
         continue;
      /* Ok, we have a call to helperc_value_check0/1/4/8_fail with
         guard 'guard'.  Check if we have already seen a call to this
         function with the same guard.  If so, delete it.  If not,
         add it to the set of calls we do know about. */
      alreadyPresent = check_or_add( &pairs, guard, cee->addr );
      if (alreadyPresent) {
         sb_in->stmts[i] = IRStmt_NoOp();
         if (0) VG_(printf)("XX\n");
      }
   }

   tl_assert(pairs.pairs[N_TIDYING_PAIRS].entry == (void*)0x123);
   tl_assert(pairs.pairs[N_TIDYING_PAIRS].guard == (IRExpr*)0x456);

   return sb_in;
}

#undef N_TIDYING_PAIRS
/*------------------------------------------------------------*/
/*--- Startup assertion checking                            ---*/
/*------------------------------------------------------------*/

void MC_(do_instrumentation_startup_checks)( void )
{
   /* Make a best-effort check to see that is_helperc_value_checkN_fail
      is working as we expect. */

#  define CHECK(_expected, _string) \
      tl_assert((_expected) == is_helperc_value_checkN_fail(_string))
   /* It should identify these 8, and no others, as targets. */
   CHECK(True, "MC_(helperc_value_check8_fail_no_o)");
   CHECK(True, "MC_(helperc_value_check4_fail_no_o)");
   CHECK(True, "MC_(helperc_value_check0_fail_no_o)");
   CHECK(True, "MC_(helperc_value_check1_fail_no_o)");
   CHECK(True, "MC_(helperc_value_check8_fail_w_o)");
   CHECK(True, "MC_(helperc_value_check0_fail_w_o)");
   CHECK(True, "MC_(helperc_value_check1_fail_w_o)");
   CHECK(True, "MC_(helperc_value_check4_fail_w_o)");

   /* Ad-hoc selection of other strings gathered via a quick test. */
   CHECK(False, "amd64g_dirtyhelper_CPUID_avx2");
   CHECK(False, "amd64g_dirtyhelper_RDTSC");
   CHECK(False, "MC_(helperc_b_load1)");
   CHECK(False, "MC_(helperc_b_load2)");
   CHECK(False, "MC_(helperc_b_load4)");
   CHECK(False, "MC_(helperc_b_load8)");
   CHECK(False, "MC_(helperc_b_load16)");
   CHECK(False, "MC_(helperc_b_load32)");
   CHECK(False, "MC_(helperc_b_store1)");
   CHECK(False, "MC_(helperc_b_store2)");
   CHECK(False, "MC_(helperc_b_store4)");
   CHECK(False, "MC_(helperc_b_store8)");
   CHECK(False, "MC_(helperc_b_store16)");
   CHECK(False, "MC_(helperc_b_store32)");
   CHECK(False, "MC_(helperc_LOADV8)");
   CHECK(False, "MC_(helperc_LOADV16le)");
   CHECK(False, "MC_(helperc_LOADV32le)");
   CHECK(False, "MC_(helperc_LOADV64le)");
   CHECK(False, "MC_(helperc_LOADV128le)");
   CHECK(False, "MC_(helperc_LOADV256le)");
   CHECK(False, "MC_(helperc_STOREV16le)");
   CHECK(False, "MC_(helperc_STOREV32le)");
   CHECK(False, "MC_(helperc_STOREV64le)");
   CHECK(False, "MC_(helperc_STOREV8)");
   CHECK(False, "track_die_mem_stack_8");
   CHECK(False, "track_new_mem_stack_8_w_ECU");
   CHECK(False, "MC_(helperc_MAKE_STACK_UNINIT_w_o)");
   CHECK(False, "VG_(unknown_SP_update_w_ECU)");

#  undef CHECK
}
/*------------------------------------------------------------*/
/*--- Memcheck main                                         ---*/
/*------------------------------------------------------------*/
static Bool isBogusAtom ( IRAtom* at )
{
   if (at->tag == Iex_RdTmp)
      return False;
   tl_assert(at->tag == Iex_Const);

   ULong n = 0;
   IRConst* con = at->Iex.Const.con;
   switch (con->tag) {
      case Ico_U1:   return False;
      case Ico_U8:   n = (ULong)con->Ico.U8;  break;
      case Ico_U16:  n = (ULong)con->Ico.U16; break;
      case Ico_U32:  n = (ULong)con->Ico.U32; break;
      case Ico_U64:  n = (ULong)con->Ico.U64; break;
      case Ico_F32:  return False;
      case Ico_F64:  return False;
      case Ico_F32i: return False;
      case Ico_F64i: return False;
      case Ico_V128: return False;
      case Ico_V256: return False;
      default: ppIRExpr(at); tl_assert(0);
   }
   /* VG_(printf)("%llx\n", n); */
   if (LIKELY(n <= 0x0000000000001000ULL)) return False;
   if (LIKELY(n >= 0xFFFFFFFFFFFFF000ULL)) return False;
   /* The list of bogus atoms is: */
   return (/*32*/    n == 0xFEFEFEFFULL
           /*32*/ || n == 0x80808080ULL
           /*32*/ || n == 0x7F7F7F7FULL
           /*32*/ || n == 0x7EFEFEFFULL
           /*32*/ || n == 0x81010100ULL
           /*64*/ || n == 0xFFFFFFFFFEFEFEFFULL
           /*64*/ || n == 0xFEFEFEFEFEFEFEFFULL
           /*64*/ || n == 0x0000000000008080ULL
           /*64*/ || n == 0x8080808080808080ULL
           /*64*/ || n == 0x0101010101010101ULL
          );
}
/* Does 'st' mention any of the literals identified/listed in
   isBogusAtom()? */
static inline Bool containsBogusLiterals ( /*FLAT*/ IRStmt* st )
{
   Int      i;
   IRExpr*  e;
   IRDirty* d;
   IRCAS*   cas;
   switch (st->tag) {
      case Ist_WrTmp:
         e = st->Ist.WrTmp.data;
         switch (e->tag) {
            case Iex_Get:
            case Iex_RdTmp:
               return False;
            case Iex_Const:
               return isBogusAtom(e);
            case Iex_Unop:
               return isBogusAtom(e->Iex.Unop.arg)
                      || e->Iex.Unop.op == Iop_GetMSBs8x16;
            case Iex_GetI:
               return isBogusAtom(e->Iex.GetI.ix);
            case Iex_Binop:
               return isBogusAtom(e->Iex.Binop.arg1)
                      || isBogusAtom(e->Iex.Binop.arg2);
            case Iex_Triop:
               return isBogusAtom(e->Iex.Triop.details->arg1)
                      || isBogusAtom(e->Iex.Triop.details->arg2)
                      || isBogusAtom(e->Iex.Triop.details->arg3);
            case Iex_Qop:
               return isBogusAtom(e->Iex.Qop.details->arg1)
                      || isBogusAtom(e->Iex.Qop.details->arg2)
                      || isBogusAtom(e->Iex.Qop.details->arg3)
                      || isBogusAtom(e->Iex.Qop.details->arg4);
            case Iex_ITE:
               return isBogusAtom(e->Iex.ITE.cond)
                      || isBogusAtom(e->Iex.ITE.iftrue)
                      || isBogusAtom(e->Iex.ITE.iffalse);
            case Iex_Load:
               return isBogusAtom(e->Iex.Load.addr);
            case Iex_CCall:
               for (i = 0; e->Iex.CCall.args[i]; i++)
                  if (isBogusAtom(e->Iex.CCall.args[i]))
                     return True;
               return False;
            default:
               goto unhandled;
         }
      case Ist_Dirty:
         d = st->Ist.Dirty.details;
         for (i = 0; d->args[i]; i++) {
            IRAtom* atom = d->args[i];
            if (LIKELY(!is_IRExpr_VECRET_or_GSPTR(atom))) {
               if (isBogusAtom(atom))
                  return True;
            }
         }
         if (isBogusAtom(d->guard))
            return True;
         if (d->mAddr && isBogusAtom(d->mAddr))
            return True;
         return False;
      case Ist_Put:
         return isBogusAtom(st->Ist.Put.data);
      case Ist_PutI:
         return isBogusAtom(st->Ist.PutI.details->ix)
                || isBogusAtom(st->Ist.PutI.details->data);
      case Ist_Store:
         return isBogusAtom(st->Ist.Store.addr)
                || isBogusAtom(st->Ist.Store.data);
      case Ist_StoreG: {
         IRStoreG* sg = st->Ist.StoreG.details;
         return isBogusAtom(sg->addr) || isBogusAtom(sg->data)
                || isBogusAtom(sg->guard);
      }
      case Ist_LoadG: {
         IRLoadG* lg = st->Ist.LoadG.details;
         return isBogusAtom(lg->addr) || isBogusAtom(lg->alt)
                || isBogusAtom(lg->guard);
      }
      case Ist_Exit:
         return isBogusAtom(st->Ist.Exit.guard);
      case Ist_AbiHint:
         return isBogusAtom(st->Ist.AbiHint.base)
                || isBogusAtom(st->Ist.AbiHint.nia);
      case Ist_NoOp:
      case Ist_IMark:
      case Ist_MBE:
         return False;
      case Ist_CAS:
         cas = st->Ist.CAS.details;
         return isBogusAtom(cas->addr)
                || (cas->expdHi ? isBogusAtom(cas->expdHi) : False)
                || isBogusAtom(cas->expdLo)
                || (cas->dataHi ? isBogusAtom(cas->dataHi) : False)
                || isBogusAtom(cas->dataLo);
      case Ist_LLSC:
         return isBogusAtom(st->Ist.LLSC.addr)
                || (st->Ist.LLSC.storedata
                       ? isBogusAtom(st->Ist.LLSC.storedata)
                       : False);
      default:
      unhandled:
         ppIRStmt(st);
         VG_(tool_panic)("hasBogusLiterals");
   }
}
/* This is the pre-instrumentation analysis.  It does a backwards pass over
   the stmts in |sb_in| to determine a HowUsed value for each tmp defined in
   the block.

   Unrelatedly, it also checks all literals in the block with |isBogusAtom|,
   as a positive result from that is a strong indication that we need to
   expensively instrument add/sub in the block.  We do both analyses in one
   pass, even though they are independent, so as to avoid the overhead of
   having to traverse the whole block twice.

   The usage pass proceeds as follows.  Let max= be the max operation in the
   HowUsed lattice, hence

      X max= Y  means  X = max(X, Y)

      for t in original tmps . useEnv[t] = HuUnU

      for t used in the block's . next field
         useEnv[t] max= HuPCa  // because jmp targets are PCast-tested

      for st iterating *backwards* in the block

         case "t1 = load(t2)"             // case 1
            useEnv[t2] max= HuPCa

         case "t1 = add(t2, t3)"          // case 2
            useEnv[t2] max= useEnv[t1]
            useEnv[t3] max= useEnv[t1]

         other
            for t in st.usedTmps          // case 3
               useEnv[t] max= HuOth
               // same as useEnv[t] = HuOth

   The general idea is that we accumulate, in useEnv[], information about
   how each tmp is used.  That can be updated as we work further back
   through the block and find more uses of it, but its HowUsed value can
   only ascend the lattice, not descend.

   Initially we mark all tmps as unused.  In case (1), if a tmp is seen to
   be used as a memory address, then its use is at least HuPCa.  The point
   is that for a memory address we will add instrumentation to check if any
   bit of the address is undefined, which means that we won't need expensive
   V-bit propagation through an add expression that computed the address --
   cheap add instrumentation will be equivalent.

   Note in case (1) that if we have previously seen a non-memory-address use
   of the tmp, then its use will already be HuOth and will be unchanged by
   the max= operation.  And if it turns out that the source of the tmp was
   an add, then we'll have to expensively instrument the add, because we
   can't prove that, for the previous non-memory-address use of the tmp,
   cheap and expensive instrumentation will be equivalent.

   In case 2, we propagate the usage-mode of the result of an add back
   through to its operands.  Again, we use max= so as to take account of the
   fact that t2 or t3 might later in the block (viz, earlier in the
   iteration) have been used in a way that requires expensive add
   instrumentation.

   In case 3, we deal with all other tmp uses.  We assume that we'll need a
   result that is as accurate as possible, so we max= HuOth into its use
   mode.  Since HuOth is the top of the lattice, that's equivalent to just
   setting its use to HuOth.

   The net result of all this is that:

   tmps that are used either
       - only as a memory address, or
       - only as part of a tree of adds that computes a memory address,
         and has no other use
   are marked as HuPCa, and so we can instrument their generating Add
   nodes cheaply, which is the whole point of this analysis

   tmps that are used any other way at all are marked as HuOth

   tmps that are unused are marked as HuUnU.  We don't expect to see any
   since we expect that the incoming IR has had all dead assignments
   removed by previous optimisation passes.  Nevertheless the analysis is
   correct even in the presence of dead tmps.

   A final comment on dead tmps.  In case 1 and case 2, we could actually
   conditionalise the updates thusly:

      if (useEnv[t1] > HuUnU) { useEnv[t2] max= HuPCa }      // case 1

      if (useEnv[t1] > HuUnU) { useEnv[t2] max= useEnv[t1] } // case 2
      if (useEnv[t1] > HuUnU) { useEnv[t3] max= useEnv[t1] } // case 2

   In other words, if the assigned-to tmp |t1| is never used, then there's
   no point in propagating any use through to its operands.  That won't
   change the final HuPCa-vs-HuOth results, which is what we care about.
   Given that we expect to get dead-code-free inputs, there's no point in
   adding this extra refinement. */
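/* Purely illustrative sketch, and not part of the tool: the "max="
   lattice rule above, applied by hand to a made-up three-statement
   block.  ToyHowUsed, toy_use_max and toy_analysis are names invented
   for this example only; the real implementation follows below and
   works on flat IR using the HowUsed type defined earlier in this
   file. */
#if 0
typedef enum { ToyUnU = 0, ToyPCa = 1, ToyOth = 2 } ToyHowUsed;

/* "max=" on the three-point lattice ToyUnU < ToyPCa < ToyOth. */
static void toy_use_max ( ToyHowUsed* env, int t, ToyHowUsed use )
{
   if (use > env[t]) env[t] = use;
}

/* Toy block, in forward order:
      t2 = Add(t0, t1)
      t3 = Load(t2)
      Store(t3) = t4
   Walk it backwards, with env[] (at least 5 entries) all ToyUnU: */
static void toy_analysis ( ToyHowUsed* env )
{
   toy_use_max(env, 3, ToyPCa);   /* store address  -- case 1 */
   toy_use_max(env, 4, ToyOth);   /* store data     -- case 3 */
   toy_use_max(env, 2, ToyPCa);   /* load address   -- case 1 */
   toy_use_max(env, 0, env[2]);   /* add operands inherit t2's use -- case 2 */
   toy_use_max(env, 1, env[2]);
   /* Result: t0..t3 end up ToyPCa and only t4 is ToyOth, so the Add
      that merely helps compute an address can be instrumented
      cheaply, which is the point of the analysis. */
}
#endif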
/* Helper for |preInstrumentationAnalysis|. */
static inline void noteTmpUsesIn ( /*MOD*/HowUsed* useEnv,
                                   UInt tyenvUsed,
                                   HowUsed newUse, IRAtom* at )
{
   /* For the atom |at|, declare that for any tmp |t| in |at|, we will have
      seen a use of |newUse|.  So, merge that info into |t|'s accumulated
      use info. */
   switch (at->tag) {
      case Iex_GSPTR:
      case Iex_VECRET:
      case Iex_Const:
         return;
      case Iex_RdTmp: {
         IRTemp t = at->Iex.RdTmp.tmp;
         tl_assert(t < tyenvUsed); // "is an original tmp"
         // The "max" operation in the lattice
         if (newUse > useEnv[t]) useEnv[t] = newUse;
         return;
      }
      default:
         // We should never get here -- it implies non-flat IR
         VG_(tool_panic)("noteTmpUsesIn");
   }
}
static void preInstrumentationAnalysis ( /*OUT*/HowUsed** useEnvP,
                                         /*OUT*/Bool* hasBogusLiteralsP,
                                         IRSB* sb_in )
{
   const UInt nOrigTmps = (UInt)sb_in->tyenv->types_used;

   // We've seen no bogus literals so far.
   Bool bogus = False;

   // This is calloc'd, so implicitly all entries are initialised to HuUnU.
   HowUsed* useEnv = VG_(calloc)("mc.preInstrumentationAnalysis.1",
                                 nOrigTmps, sizeof(HowUsed));

   // Firstly, roll in contributions from the final dst address.
   bogus = isBogusAtom(sb_in->next);
   noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, sb_in->next);

   // Now work backwards through the stmts.
   for (Int i = sb_in->stmts_used-1; i >= 0; i--) {
      IRStmt* st = sb_in->stmts[i];

      // Deal with literals.
      if (LIKELY(!bogus)) {
         bogus = containsBogusLiterals(st);
      }

      // Deal with tmp uses.
      switch (st->tag) {

         case Ist_WrTmp: {
            IRTemp  dst = st->Ist.WrTmp.tmp;
            IRExpr* rhs = st->Ist.WrTmp.data;
            // This is the one place where we have to consider all possible
            // tags for |rhs|, and can't just assume it is a tmp or a const.
            switch (rhs->tag) {
               case Iex_RdTmp:
                  // just propagate demand for |dst| into this tmp use.
                  noteTmpUsesIn(useEnv, nOrigTmps, useEnv[dst], rhs);
                  break;
               case Iex_Unop:
                  noteTmpUsesIn(useEnv, nOrigTmps, HuOth, rhs->Iex.Unop.arg);
                  break;
               case Iex_Binop:
                  if (rhs->Iex.Binop.op == Iop_Add64
                      || rhs->Iex.Binop.op == Iop_Add32) {
                     // propagate demand for |dst| through to the operands.
                     noteTmpUsesIn(useEnv, nOrigTmps,
                                   useEnv[dst], rhs->Iex.Binop.arg1);
                     noteTmpUsesIn(useEnv, nOrigTmps,
                                   useEnv[dst], rhs->Iex.Binop.arg2);
                  } else {
                     // just say that the operands are used in some unknown way.
                     noteTmpUsesIn(useEnv, nOrigTmps,
                                   HuOth, rhs->Iex.Binop.arg1);
                     noteTmpUsesIn(useEnv, nOrigTmps,
                                   HuOth, rhs->Iex.Binop.arg2);
                  }
                  break;
               case Iex_Triop: {
                  // All operands are used in some unknown way.
                  IRTriop* tri = rhs->Iex.Triop.details;
                  noteTmpUsesIn(useEnv, nOrigTmps, HuOth, tri->arg1);
                  noteTmpUsesIn(useEnv, nOrigTmps, HuOth, tri->arg2);
                  noteTmpUsesIn(useEnv, nOrigTmps, HuOth, tri->arg3);
                  break;
               }
               case Iex_Qop: {
                  // All operands are used in some unknown way.
                  IRQop* qop = rhs->Iex.Qop.details;
                  noteTmpUsesIn(useEnv, nOrigTmps, HuOth, qop->arg1);
                  noteTmpUsesIn(useEnv, nOrigTmps, HuOth, qop->arg2);
                  noteTmpUsesIn(useEnv, nOrigTmps, HuOth, qop->arg3);
                  noteTmpUsesIn(useEnv, nOrigTmps, HuOth, qop->arg4);
                  break;
               }
               case Iex_Load:
                  // The address will be checked (== PCasted).
                  noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, rhs->Iex.Load.addr);
                  break;
               case Iex_ITE:
                  // The condition is PCasted, the then- and else-values
                  // aren't.
                  noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, rhs->Iex.ITE.cond);
                  noteTmpUsesIn(useEnv, nOrigTmps, HuOth, rhs->Iex.ITE.iftrue);
                  noteTmpUsesIn(useEnv, nOrigTmps, HuOth, rhs->Iex.ITE.iffalse);
                  break;
               case Iex_CCall:
                  // The args are used in unknown ways.
                  for (IRExpr** args = rhs->Iex.CCall.args; *args; args++) {
                     noteTmpUsesIn(useEnv, nOrigTmps, HuOth, *args);
                  }
                  break;
               case Iex_GetI:
                  // The index will be checked/PCasted (see do_shadow_GETI)
                  noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, rhs->Iex.GetI.ix);
                  break;
               case Iex_Const:
               case Iex_Get:
                  break;
               default:
                  VG_(tool_panic)("preInstrumentationAnalysis:"
                                  " unhandled IRExpr");
            }
            break;
         }
         case Ist_Store:
            // The address will be checked (== PCasted).  The data will be
            // used in some unknown way.
            noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, st->Ist.Store.addr);
            noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.Store.data);
            break;

         case Ist_Exit:
            // The guard will be checked (== PCasted)
            noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, st->Ist.Exit.guard);
            break;

         case Ist_Put:
            noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.Put.data);
            break;

         case Ist_PutI: {
            IRPutI* putI = st->Ist.PutI.details;
            // The index will be checked/PCasted (see do_shadow_PUTI).  The
            // data will be used in an unknown way.
            noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, putI->ix);
            noteTmpUsesIn(useEnv, nOrigTmps, HuOth, putI->data);
            break;
         }

         case Ist_Dirty: {
            IRDirty* d = st->Ist.Dirty.details;
            // The guard will be checked (== PCasted)
            noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, d->guard);
            // The args will be used in unknown ways.
            for (IRExpr** args = d->args; *args; args++) {
               noteTmpUsesIn(useEnv, nOrigTmps, HuOth, *args);
            }
            break;
         }

         case Ist_CAS: {
            IRCAS* cas = st->Ist.CAS.details;
            // Address will be pcasted, everything else used as unknown
            noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, cas->addr);
            noteTmpUsesIn(useEnv, nOrigTmps, HuOth, cas->expdLo);
            noteTmpUsesIn(useEnv, nOrigTmps, HuOth, cas->dataLo);
            if (cas->expdHi)
               noteTmpUsesIn(useEnv, nOrigTmps, HuOth, cas->expdHi);
            if (cas->dataHi)
               noteTmpUsesIn(useEnv, nOrigTmps, HuOth, cas->dataHi);
            break;
         }

         case Ist_AbiHint:
            // Both exprs are used in unknown ways.  TODO: can we safely
            // just ignore AbiHints?
            noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.AbiHint.base);
            noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.AbiHint.nia);
            break;

         case Ist_StoreG: {
            // We might be able to do better, and use HuPCa for the addr.
            // It's not immediately obvious that we can, because the address
            // is regarded as "used" only when the guard is true.
            IRStoreG* sg = st->Ist.StoreG.details;
            noteTmpUsesIn(useEnv, nOrigTmps, HuOth, sg->addr);
            noteTmpUsesIn(useEnv, nOrigTmps, HuOth, sg->data);
            noteTmpUsesIn(useEnv, nOrigTmps, HuOth, sg->guard);
            break;
         }

         case Ist_LoadG: {
            // Per similar comments to Ist_StoreG .. not sure whether this
            // is really optimal.
            IRLoadG* lg = st->Ist.LoadG.details;
            noteTmpUsesIn(useEnv, nOrigTmps, HuOth, lg->addr);
            noteTmpUsesIn(useEnv, nOrigTmps, HuOth, lg->alt);
            noteTmpUsesIn(useEnv, nOrigTmps, HuOth, lg->guard);
            break;
         }

         case Ist_LLSC:
            noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, st->Ist.LLSC.addr);
            if (st->Ist.LLSC.storedata)
               noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.LLSC.storedata);
            break;

         case Ist_MBE:
         case Ist_IMark:
         case Ist_NoOp:
            break;

         default:
            ppIRStmt(st);
            VG_(tool_panic)("preInstrumentationAnalysis: unhandled IRStmt");
      }

   } // Now work backwards through the stmts.

   // Return the computed use env and the bogus-atom flag.
   tl_assert(*useEnvP == NULL);
   *useEnvP = useEnv;

   tl_assert(*hasBogusLiteralsP == False);
   *hasBogusLiteralsP = bogus;
}
IRSB* MC_(instrument) ( VgCallbackClosure* closure,
                        IRSB* sb_in,
                        const VexGuestLayout* layout,
                        const VexGuestExtents* vge,
                        const VexArchInfo* archinfo_host,
                        IRType gWordTy, IRType hWordTy )
{
   Bool    verboze = 0||False;
   Int     i, j, first_stmt;
   IRStmt* st;
   MCEnv   mce;
   IRSB*   sb_out;

   if (gWordTy != hWordTy) {
      /* We don't currently support this case. */
      VG_(tool_panic)("host/guest word size mismatch");
   }

   /* Check we're not completely nuts */
   tl_assert(sizeof(UWord)  == sizeof(void*));
   tl_assert(sizeof(Word)   == sizeof(void*));
   tl_assert(sizeof(Addr)   == sizeof(void*));
   tl_assert(sizeof(ULong)  == 8);
   tl_assert(sizeof(Long)   == 8);
   tl_assert(sizeof(UInt)   == 4);
   tl_assert(sizeof(Int)    == 4);

   tl_assert(MC_(clo_mc_level) >= 1 && MC_(clo_mc_level) <= 3);

   sb_out = deepCopyIRSBExceptStmts(sb_in);

   /* Set up the running environment.  Both .sb and .tmpMap are
      modified as we go along.  Note that tmps are added to both
      .sb->tyenv and .tmpMap together, so the valid index-set for
      those two arrays should always be identical. */
   VG_(memset)(&mce, 0, sizeof(mce));
   mce.sb         = sb_out;
   mce.trace      = verboze;
   mce.layout     = layout;
   mce.hWordTy    = hWordTy;
   mce.tmpHowUsed = NULL;

   /* BEGIN decide on expense levels for instrumentation. */
   /* Initially, select the cheap version of everything for which we have an
      alternative. */
   DetailLevelByOp__set_all( &mce.dlbo, DLcheap );

   /* Take account of the --expensive-definedness-checks= flag. */
   if (MC_(clo_expensive_definedness_checks) == EdcNO) {
      /* We just selected 'cheap for everything', so we don't need to do
         anything here.  mce.tmpHowUsed remains NULL. */
   }
   else if (MC_(clo_expensive_definedness_checks) == EdcYES) {
      /* Select 'expensive for everything'.  mce.tmpHowUsed remains NULL. */
      DetailLevelByOp__set_all( &mce.dlbo, DLexpensive );
   }
   else {
      tl_assert(MC_(clo_expensive_definedness_checks) == EdcAUTO);
      /* We'll make our own selection, based on known per-target constraints
         and also on analysis of the block to be instrumented.  First, set
         up default values for detail levels.

         On x86 and amd64, we'll routinely encounter code optimised by LLVM
         5 and above.  Enable accurate interpretation of the following.
         LLVM uses adds for some bitfield inserts, and we get a lot of false
         errors if the cheap interpretation is used, alas.  Could solve this
         much better if we knew which of such adds came from x86/amd64 LEA
         instructions, since these are the only ones really needing the
         expensive interpretation, but that would require some way to tag
         them in the _toIR.c front ends, which is a lot of faffing around.
         So for now we use preInstrumentationAnalysis() to detect adds which
         are used only to construct memory addresses, which is an
         approximation to the above, and is self-contained. */
#     if defined(VGA_x86)
      mce.dlbo.dl_Add32           = DLauto;
      mce.dlbo.dl_CmpEQ16_CmpNE16 = DLexpensive;
      mce.dlbo.dl_CmpEQ32_CmpNE32 = DLexpensive;
#     elif defined(VGA_amd64)
      mce.dlbo.dl_Add32           = DLexpensive;
      mce.dlbo.dl_Add64           = DLauto;
      mce.dlbo.dl_CmpEQ16_CmpNE16 = DLexpensive;
      mce.dlbo.dl_CmpEQ32_CmpNE32 = DLexpensive;
      mce.dlbo.dl_CmpEQ64_CmpNE64 = DLexpensive;
#     elif defined(VGA_ppc64le)
      // Needed by (at least) set_AV_CR6() in the front end.
      mce.dlbo.dl_CmpEQ64_CmpNE64 = DLexpensive;
#     elif defined(VGA_arm64)
      mce.dlbo.dl_CmpEQ32_CmpNE32 = DLexpensive;
      mce.dlbo.dl_CmpEQ64_CmpNE64 = DLexpensive;
#     elif defined(VGA_arm)
      mce.dlbo.dl_CmpEQ32_CmpNE32 = DLexpensive;
#     endif

      /* preInstrumentationAnalysis() will allocate &mce.tmpHowUsed and then
         fill it in. */
      Bool hasBogusLiterals = False;
      preInstrumentationAnalysis( &mce.tmpHowUsed, &hasBogusLiterals, sb_in );

      if (hasBogusLiterals) {
         /* This happens very rarely.  In this case just select expensive
            for everything, and throw away the tmp-use analysis results. */
         DetailLevelByOp__set_all( &mce.dlbo, DLexpensive );
         VG_(free)( mce.tmpHowUsed );
         mce.tmpHowUsed = NULL;
      } else {
         /* Nothing.  mce.tmpHowUsed contains tmp-use analysis results,
            which will be used for some subset of Iop_{Add,Sub}{32,64},
            based on which ones are set to DLauto for this target. */
      }
   }

   DetailLevelByOp__check_sanity( &mce.dlbo );

   if (0) {
      // Debug printing: which tmps have been identified as PCast-only use
      if (mce.tmpHowUsed) {
         VG_(printf)("Cheapies: ");
         for (UInt q = 0; q < sb_in->tyenv->types_used; q++) {
            if (mce.tmpHowUsed[q] == HuPCa) {
               VG_(printf)("t%u ", q);
            }
         }
         VG_(printf)("\n");
      }

      // Debug printing: number of ops by detail level
      UChar nCheap     = DetailLevelByOp__count( &mce.dlbo, DLcheap );
      UChar nAuto      = DetailLevelByOp__count( &mce.dlbo, DLauto );
      UChar nExpensive = DetailLevelByOp__count( &mce.dlbo, DLexpensive );
      tl_assert(nCheap + nAuto + nExpensive == 8);

      VG_(printf)("%u,%u,%u ", nCheap, nAuto, nExpensive);
   }

   /* END decide on expense levels for instrumentation. */
   /* Initialise the running tmp environment. */

   mce.tmpMap = VG_(newXA)( VG_(malloc), "mc.MC_(instrument).1", VG_(free),
                            sizeof(TempMapEnt));
   VG_(hintSizeXA) (mce.tmpMap, sb_in->tyenv->types_used);
   for (i = 0; i < sb_in->tyenv->types_used; i++) {
      TempMapEnt ent;
      ent.kind    = Orig;
      ent.shadowV = IRTemp_INVALID;
      ent.shadowB = IRTemp_INVALID;
      VG_(addToXA)( mce.tmpMap, &ent );
   }
   tl_assert( VG_(sizeXA)( mce.tmpMap ) == sb_in->tyenv->types_used );

   /* Finally, begin instrumentation. */
   /* Copy verbatim any IR preamble preceding the first IMark */

   tl_assert(mce.sb == sb_out);
   tl_assert(mce.sb != sb_in);

   i = 0;
   while (i < sb_in->stmts_used && sb_in->stmts[i]->tag != Ist_IMark) {

      st = sb_in->stmts[i];
      tl_assert(isFlatIRStmt(st));

      stmt( 'C', &mce, sb_in->stmts[i] );
      i++;
   }
   /* Nasty problem.  IR optimisation of the pre-instrumented IR may
      cause the IR following the preamble to contain references to IR
      temporaries defined in the preamble.  Because the preamble isn't
      instrumented, these temporaries don't have any shadows.
      Nevertheless uses of them following the preamble will cause
      memcheck to generate references to their shadows.  End effect is
      to cause IR sanity check failures, due to references to
      non-existent shadows.  This is only evident for the complex
      preambles used for function wrapping on TOC-afflicted platforms
      (ppc64-linux).

      The following loop therefore scans the preamble looking for
      assignments to temporaries.  For each one found it creates an
      assignment to the corresponding (V) shadow temp, marking it as
      'defined'.  This is the same resulting IR as if the main
      instrumentation loop before had been applied to the statement
      'tmp = CONSTANT'.

      Similarly, if origin tracking is enabled, we must generate an
      assignment for the corresponding origin (B) shadow, claiming
      no-origin, as appropriate for a defined value. */
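   /* For instance (purely illustrative; the tmp names and offset are
      invented): if the preamble contains
         t5 = GET:I64(152)
      then the loop below generates, for t5's V shadow tmp,
         shadow-of-t5 = 0x0:I64    (all V bits zero, ie "defined")
      and, if origin tracking is enabled (MC_(clo_mc_level) == 3),
         origin-of-t5 = 0x0:I32    (no origin)
      so that later references to t5's shadows are well defined. */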
   for (j = 0; j < i; j++) {
      if (sb_in->stmts[j]->tag == Ist_WrTmp) {
         /* findShadowTmpV checks its arg is an original tmp;
            no need to assert that here. */
         IRTemp tmp_o = sb_in->stmts[j]->Ist.WrTmp.tmp;
         IRTemp tmp_v = findShadowTmpV(&mce, tmp_o);
         IRType ty_v  = typeOfIRTemp(sb_out->tyenv, tmp_v);
         assign( 'V', &mce, tmp_v, definedOfType( ty_v ) );
         if (MC_(clo_mc_level) == 3) {
            IRTemp tmp_b = findShadowTmpB(&mce, tmp_o);
            tl_assert(typeOfIRTemp(sb_out->tyenv, tmp_b) == Ity_I32);
            assign( 'B', &mce, tmp_b, mkU32(0)/* UNKNOWN ORIGIN */);
         }
         if (0)
            VG_(printf)("create shadow tmp(s) for preamble tmp [%d] ty ", j);
      }
   }
   /* Iterate over the remaining stmts to generate instrumentation. */

   tl_assert(sb_in->stmts_used > 0);
   tl_assert(i < sb_in->stmts_used);
   tl_assert(sb_in->stmts[i]->tag == Ist_IMark);

   for (/* use current i*/; i < sb_in->stmts_used; i++) {

      st = sb_in->stmts[i];
      first_stmt = sb_out->stmts_used;

      if (MC_(clo_mc_level) == 3) {
         /* See comments on case Ist_CAS below. */
         if (st->tag != Ist_CAS)
            schemeS( &mce, st );
      }

      /* Generate instrumentation code for each stmt ... */

      switch (st->tag) {

         case Ist_WrTmp: {
            IRTemp dst = st->Ist.WrTmp.tmp;
            tl_assert(dst < (UInt)sb_in->tyenv->types_used);
            HowUsed hu = mce.tmpHowUsed ? mce.tmpHowUsed[dst]
                                        : HuOth/*we don't know, so play safe*/;
            assign( 'V', &mce, findShadowTmpV(&mce, st->Ist.WrTmp.tmp),
                               expr2vbits( &mce, st->Ist.WrTmp.data, hu ));
            break;
         }

         case Ist_Put:
            do_shadow_PUT( &mce,
                           st->Ist.Put.offset,
                           st->Ist.Put.data,
                           NULL /* shadow atom */, NULL /* guard */ );
            break;

         case Ist_PutI:
            do_shadow_PUTI( &mce, st->Ist.PutI.details);
            break;

         case Ist_Store:
            do_shadow_Store( &mce, st->Ist.Store.end,
                                   st->Ist.Store.addr, 0/* addr bias */,
                                   st->Ist.Store.data,
                                   NULL /* shadow data */,
                                   NULL /* guard */ );
            break;

         case Ist_StoreG:
            do_shadow_StoreG( &mce, st->Ist.StoreG.details );
            break;

         case Ist_LoadG:
            do_shadow_LoadG( &mce, st->Ist.LoadG.details );
            break;

         case Ist_Exit:
            complainIfUndefined( &mce, st->Ist.Exit.guard, NULL );
            break;

         case Ist_NoOp:
         case Ist_IMark:
         case Ist_MBE:
            break;

         case Ist_Dirty:
            do_shadow_Dirty( &mce, st->Ist.Dirty.details );
            break;

         case Ist_AbiHint:
            do_AbiHint( &mce, st->Ist.AbiHint.base,
                              st->Ist.AbiHint.len,
                              st->Ist.AbiHint.nia );
            break;

         case Ist_CAS:
            do_shadow_CAS( &mce, st->Ist.CAS.details );
            /* Note, do_shadow_CAS copies the CAS itself to the output
               block, because it needs to add instrumentation both
               before and after it.  Hence skip the copy below.  Also
               skip the origin-tracking stuff (call to schemeS) above,
               since that's all tangled up with it too; do_shadow_CAS
               does it all. */
            break;

         case Ist_LLSC:
            do_shadow_LLSC( &mce,
                            st->Ist.LLSC.end,
                            st->Ist.LLSC.result,
                            st->Ist.LLSC.addr,
                            st->Ist.LLSC.storedata );
            break;

         default:
            ppIRStmt(st);
            VG_(tool_panic)("memcheck: unhandled IRStmt");

      } /* switch (st->tag) */
      if (verboze) {
         for (j = first_stmt; j < sb_out->stmts_used; j++) {
            ppIRStmt(sb_out->stmts[j]);
            VG_(printf)("\n");
         }
      }

      /* ... and finally copy the stmt itself to the output.  Except,
         skip the copy of IRCASs; see comments on case Ist_CAS
         above. */
      if (st->tag != Ist_CAS)
         stmt('C', &mce, st);
   }

   /* Now we need to complain if the jump target is undefined. */
   first_stmt = sb_out->stmts_used;

   if (verboze) {
      VG_(printf)("sb_in->next = ");
      ppIRExpr(sb_in->next);
      VG_(printf)("\n\n");
   }

   complainIfUndefined( &mce, sb_in->next, NULL );

   if (verboze) {
      for (j = first_stmt; j < sb_out->stmts_used; j++) {
         ppIRStmt(sb_out->stmts[j]);
         VG_(printf)("\n");
      }
   }

   /* If this fails, there's been some serious snafu with tmp management,
      that should be investigated. */
   tl_assert( VG_(sizeXA)( mce.tmpMap ) == mce.sb->tyenv->types_used );
   VG_(deleteXA)( mce.tmpMap );

   if (mce.tmpHowUsed) {
      VG_(free)( mce.tmpHowUsed );
   }

   tl_assert(mce.sb == sb_out);
   return sb_out;
}
/*--------------------------------------------------------------------*/
/*--- end                                           mc_translate.c ---*/
/*--------------------------------------------------------------------*/