2 /*--------------------------------------------------------------------*/
3 /*--- Cachegrind: everything but the simulation itself. ---*/
5 /*--------------------------------------------------------------------*/
8 This file is part of Cachegrind, a high-precision tracing profiler
11 Copyright (C) 2002-2023 Nicholas Nethercote
14 This program is free software; you can redistribute it and/or
15 modify it under the terms of the GNU General Public License as
16 published by the Free Software Foundation; either version 2 of the
17 License, or (at your option) any later version.
19 This program is distributed in the hope that it will be useful, but
20 WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 General Public License for more details.
24 You should have received a copy of the GNU General Public License
25 along with this program; if not, see <http://www.gnu.org/licenses/>.
27 The GNU General Public License is contained in the file COPYING.
30 #include "pub_tool_basics.h"
31 #include "pub_tool_debuginfo.h"
32 #include "pub_tool_libcbase.h"
33 #include "pub_tool_libcassert.h"
34 #include "pub_tool_libcfile.h"
35 #include "pub_tool_libcprint.h"
36 #include "pub_tool_libcproc.h"
37 #include "pub_tool_mallocfree.h"
38 #include "pub_tool_options.h"
39 #include "pub_tool_oset.h"
40 #include "pub_tool_tooliface.h"
41 #include "pub_tool_transtab.h"
42 #include "pub_tool_xarray.h"
43 #include "pub_tool_clientstate.h"
44 #include "pub_tool_machine.h" // VG_(fnptr_to_fnentry)
46 #include "cachegrind.h"
49 #include "cg_branchpred.c"
51 /*------------------------------------------------------------*/
53 /*------------------------------------------------------------*/
55 /* Set to 1 for very verbose debugging */
58 /*------------------------------------------------------------*/
60 /*------------------------------------------------------------*/
62 static Bool clo_cache_sim
= False
; /* do cache simulation? */
63 static Bool clo_branch_sim
= False
; /* do branch simulation? */
64 static Bool clo_instr_at_start
= True
; /* instrument at startup? */
65 static const HChar
* clo_cachegrind_out_file
= "cachegrind.out.%p";
67 /*------------------------------------------------------------*/
68 /*--- Cachesim configuration ---*/
69 /*------------------------------------------------------------*/
71 static Int min_line_size
= 0; /* min of L1 and LL cache line sizes */
73 /*------------------------------------------------------------*/
74 /*--- Types and Data Structures ---*/
75 /*------------------------------------------------------------*/
79 ULong a
; /* total # memory accesses of this kind */
80 ULong m1
; /* misses in the first level cache */
81 ULong mL
; /* misses in the second level cache */
87 ULong b
; /* total # branches of this kind */
88 ULong mp
; /* number of branches mispredicted */
//------------------------------------------------------------
// Primary data structure #1: CC table
// - Holds the per-source-line hit/miss stats, grouped by file/function/line.
// - an ordered set of CCs.  CC indexing done by file/function/line (as
//   determined from the instrAddr).
// - Traversed for dumping stats at end in file/func/line hierarchy.
107 CodeLoc loc
; /* Source location that these counts pertain to */
108 CacheCC Ir
; /* Insn read counts */
109 CacheCC Dr
; /* Data read counts */
110 CacheCC Dw
; /* Data write/modify counts */
111 BranchCC Bc
; /* Conditional branch counts */
112 BranchCC Bi
; /* Indirect branch counts */
115 // First compare file, then fn, then line.
116 static Word
cmp_CodeLoc_LineCC(const void *vloc
, const void *vcc
)
119 const CodeLoc
* a
= (const CodeLoc
*)vloc
;
120 const CodeLoc
* b
= &(((const LineCC
*)vcc
)->loc
);
122 res
= VG_(strcmp
)(a
->file
, b
->file
);
126 res
= VG_(strcmp
)(a
->fn
, b
->fn
);
130 return a
->line
- b
->line
;
133 static OSet
* CC_table
;
//------------------------------------------------------------
// Primary data structure #2: InstrInfo table
// - Holds the cached info about each instr that is used for simulation.
// - table(SB_start_addr, list(InstrInfo))
// - For each SB, each InstrInfo in the list holds info about the
//   instruction (instrLen, instrAddr, etc), plus a pointer to its line
//   CC.  This node is what's passed to the simulation function.
// - When SBs are discarded the relevant list(instr_details) is freed.
144 typedef struct _InstrInfo InstrInfo
;
148 LineCC
* parent
; // parent line-CC
151 typedef struct _SB_info SB_info
;
153 Addr SB_addr
; // key; MUST BE FIRST
158 static OSet
* instrInfoTable
;
160 //------------------------------------------------------------
161 // Secondary data structure: string table
162 // - holds strings, avoiding dups
163 // - used for filenames and function names, each of which will be
164 // pointed to by one or more CCs.
165 // - it also allows equality checks just by pointer comparison, which
166 // is good when printing the output file at the end.
168 static OSet
* stringTable
;
170 //------------------------------------------------------------
172 static Int distinct_files
= 0;
173 static Int distinct_fns
= 0;
174 static Int distinct_lines
= 0;
175 static Int distinct_instrsGen
= 0;
176 static Int distinct_instrsNoX
= 0;
178 static Int full_debugs
= 0;
179 static Int file_line_debugs
= 0;
180 static Int fn_debugs
= 0;
181 static Int no_debugs
= 0;
183 //------------------------------------------------------------
184 // Instrumentation control
185 static Bool instr_enabled
= True
;
187 /*------------------------------------------------------------*/
188 /*--- String table operations ---*/
189 /*------------------------------------------------------------*/
/* OSet comparison callback for the string table.  Each element is an
   HChar* pointing at a permanent string; compare the pointed-to
   strings with strcmp.  (Braces lost in extraction — body is the
   single return statement below.) */
191 static Word
stringCmp( const void* key
, const void* elem
)
193 return VG_(strcmp
)(*(const HChar
*const *)key
, *(const HChar
*const *)elem
);
// Get a permanent string; either pull it out of the string table if it's
// been encountered before, or dup it and put it into the string table.
// NOTE(review): the conditional between the lookup and the
// allocate/strdup/insert sequence, and the return statements, are not
// visible in this extract — confirm against the full cg_main.c.
198 static HChar
* get_perm_string(const HChar
* s
)
200 HChar
** s_ptr
= VG_(OSetGen_Lookup
)(stringTable
, &s
);
204 HChar
** s_node
= VG_(OSetGen_AllocNode
)(stringTable
, sizeof(HChar
*));
205 *s_node
= VG_(strdup
)("cg.main.gps.1", s
);
206 VG_(OSetGen_Insert
)(stringTable
, s_node
);
211 /*------------------------------------------------------------*/
212 /*--- CC table operations ---*/
213 /*------------------------------------------------------------*/
215 static void get_debug_info(Addr instr_addr
, const HChar
**dir
,
216 const HChar
**file
, const HChar
**fn
, UInt
* line
)
218 DiEpoch ep
= VG_(current_DiEpoch
)();
219 Bool found_file_line
= VG_(get_filename_linenum
)(
225 Bool found_fn
= VG_(get_fnname
)(ep
, instr_addr
, fn
);
227 if (!found_file_line
) {
235 if (found_file_line
) {
236 if (found_fn
) full_debugs
++;
237 else file_line_debugs
++;
239 if (found_fn
) fn_debugs
++;
244 // Do a three step traversal: by file, then fn, then line.
245 // Returns a pointer to the line CC, creates a new one if necessary.
246 static LineCC
* get_lineCC(Addr origAddr
)
248 const HChar
*fn
, *file
, *dir
;
253 get_debug_info(origAddr
, &dir
, &file
, &fn
, &line
);
255 // Form an absolute pathname if a directory is available
256 HChar absfile
[VG_(strlen
)(dir
) + 1 + VG_(strlen
)(file
) + 1];
259 VG_(sprintf
)(absfile
, "%s/%s", dir
, file
);
261 VG_(sprintf
)(absfile
, "%s", file
);
268 lineCC
= VG_(OSetGen_Lookup
)(CC_table
, &loc
);
270 // Allocate and zero a new node.
271 lineCC
= VG_(OSetGen_AllocNode
)(CC_table
, sizeof(LineCC
));
272 lineCC
->loc
.file
= get_perm_string(loc
.file
);
273 lineCC
->loc
.fn
= get_perm_string(loc
.fn
);
274 lineCC
->loc
.line
= loc
.line
;
288 VG_(OSetGen_Insert
)(CC_table
, lineCC
);
294 /*------------------------------------------------------------*/
295 /*--- Cache simulation functions ---*/
296 /*------------------------------------------------------------*/
298 /* A common case for an instruction read event is that the
299 * bytes read belong to the same cache line in both L1I and LL
300 * (if cache line sizes of L1 and LL are the same).
301 * As this can be detected at instrumentation time, and results
302 * in faster simulation, special-casing is benefical.
304 * Abbreviations used in var/function names:
305 * IrNoX - instruction read does not cross cache lines
306 * IrGen - generic instruction read; not detected as IrNoX
307 * Ir - not known / not important whether it is an IrNoX
310 // Only used with --cache-sim=no.
312 void log_1Ir(InstrInfo
* n
)
317 // Only used with --cache-sim=no.
319 void log_2Ir(InstrInfo
* n
, InstrInfo
* n2
)
325 // Only used with --cache-sim=no.
327 void log_3Ir(InstrInfo
* n
, InstrInfo
* n2
, InstrInfo
* n3
)
334 // Generic case for instruction reads: may cross cache lines.
335 // All other Ir handlers expect IrNoX instruction reads.
337 void log_1IrGen_0D_cache_access(InstrInfo
* n
)
339 //VG_(printf)("1IrGen_0D : CCaddr=0x%010lx, iaddr=0x%010lx, isize=%lu\n",
340 // n, n->instr_addr, n->instr_len);
341 cachesim_I1_doref_Gen(n
->instr_addr
, n
->instr_len
,
342 &n
->parent
->Ir
.m1
, &n
->parent
->Ir
.mL
);
347 void log_1IrNoX_0D_cache_access(InstrInfo
* n
)
349 //VG_(printf)("1IrNoX_0D : CCaddr=0x%010lx, iaddr=0x%010lx, isize=%lu\n",
350 // n, n->instr_addr, n->instr_len);
351 cachesim_I1_doref_NoX(n
->instr_addr
, n
->instr_len
,
352 &n
->parent
->Ir
.m1
, &n
->parent
->Ir
.mL
);
357 void log_2IrNoX_0D_cache_access(InstrInfo
* n
, InstrInfo
* n2
)
359 //VG_(printf)("2IrNoX_0D : CC1addr=0x%010lx, i1addr=0x%010lx, i1size=%lu\n"
360 // " CC2addr=0x%010lx, i2addr=0x%010lx, i2size=%lu\n",
361 // n, n->instr_addr, n->instr_len,
362 // n2, n2->instr_addr, n2->instr_len);
363 cachesim_I1_doref_NoX(n
->instr_addr
, n
->instr_len
,
364 &n
->parent
->Ir
.m1
, &n
->parent
->Ir
.mL
);
366 cachesim_I1_doref_NoX(n2
->instr_addr
, n2
->instr_len
,
367 &n2
->parent
->Ir
.m1
, &n2
->parent
->Ir
.mL
);
372 void log_3IrNoX_0D_cache_access(InstrInfo
* n
, InstrInfo
* n2
, InstrInfo
* n3
)
374 //VG_(printf)("3IrNoX_0D : CC1addr=0x%010lx, i1addr=0x%010lx, i1size=%lu\n"
375 // " CC2addr=0x%010lx, i2addr=0x%010lx, i2size=%lu\n"
376 // " CC3addr=0x%010lx, i3addr=0x%010lx, i3size=%lu\n",
377 // n, n->instr_addr, n->instr_len,
378 // n2, n2->instr_addr, n2->instr_len,
379 // n3, n3->instr_addr, n3->instr_len);
380 cachesim_I1_doref_NoX(n
->instr_addr
, n
->instr_len
,
381 &n
->parent
->Ir
.m1
, &n
->parent
->Ir
.mL
);
383 cachesim_I1_doref_NoX(n2
->instr_addr
, n2
->instr_len
,
384 &n2
->parent
->Ir
.m1
, &n2
->parent
->Ir
.mL
);
386 cachesim_I1_doref_NoX(n3
->instr_addr
, n3
->instr_len
,
387 &n3
->parent
->Ir
.m1
, &n3
->parent
->Ir
.mL
);
392 void log_1IrNoX_1Dr_cache_access(InstrInfo
* n
, Addr data_addr
, Word data_size
)
394 //VG_(printf)("1IrNoX_1Dr: CCaddr=0x%010lx, iaddr=0x%010lx, isize=%lu\n"
395 // " daddr=0x%010lx, dsize=%lu\n",
396 // n, n->instr_addr, n->instr_len, data_addr, data_size);
397 cachesim_I1_doref_NoX(n
->instr_addr
, n
->instr_len
,
398 &n
->parent
->Ir
.m1
, &n
->parent
->Ir
.mL
);
401 cachesim_D1_doref(data_addr
, data_size
,
402 &n
->parent
->Dr
.m1
, &n
->parent
->Dr
.mL
);
407 void log_1IrNoX_1Dw_cache_access(InstrInfo
* n
, Addr data_addr
, Word data_size
)
409 //VG_(printf)("1IrNoX_1Dw: CCaddr=0x%010lx, iaddr=0x%010lx, isize=%lu\n"
410 // " daddr=0x%010lx, dsize=%lu\n",
411 // n, n->instr_addr, n->instr_len, data_addr, data_size);
412 cachesim_I1_doref_NoX(n
->instr_addr
, n
->instr_len
,
413 &n
->parent
->Ir
.m1
, &n
->parent
->Ir
.mL
);
416 cachesim_D1_doref(data_addr
, data_size
,
417 &n
->parent
->Dw
.m1
, &n
->parent
->Dw
.mL
);
421 /* Note that addEvent_D_guarded assumes that log_0Ir_1Dr_cache_access
422 and log_0Ir_1Dw_cache_access have exactly the same prototype. If
423 you change them, you must change addEvent_D_guarded too. */
425 void log_0Ir_1Dr_cache_access(InstrInfo
* n
, Addr data_addr
, Word data_size
)
427 //VG_(printf)("0Ir_1Dr: CCaddr=0x%010lx, daddr=0x%010lx, dsize=%lu\n",
428 // n, data_addr, data_size);
429 cachesim_D1_doref(data_addr
, data_size
,
430 &n
->parent
->Dr
.m1
, &n
->parent
->Dr
.mL
);
434 /* See comment on log_0Ir_1Dr_cache_access. */
436 void log_0Ir_1Dw_cache_access(InstrInfo
* n
, Addr data_addr
, Word data_size
)
438 //VG_(printf)("0Ir_1Dw: CCaddr=0x%010lx, daddr=0x%010lx, dsize=%lu\n",
439 // n, data_addr, data_size);
440 cachesim_D1_doref(data_addr
, data_size
,
441 &n
->parent
->Dw
.m1
, &n
->parent
->Dw
.mL
);
/* For branches, we consult two different predictors, one which
   predicts taken/untaken for conditional branches, and the other
   which predicts the branch target address for indirect branches
   (jump-to-register style ones). */
451 void log_cond_branch(InstrInfo
* n
, Word taken
)
453 //VG_(printf)("cbrnch: CCaddr=0x%010lx, taken=0x%010lx\n",
457 += (1 & do_cond_branch_predict(n
->instr_addr
, taken
));
461 void log_ind_branch(InstrInfo
* n
, UWord actual_dst
)
463 //VG_(printf)("ibrnch: CCaddr=0x%010lx, dst=0x%010lx\n",
467 += (1 & do_ind_branch_predict(n
->instr_addr
, actual_dst
));
471 /*------------------------------------------------------------*/
472 /*--- Instrumentation types and structures ---*/
473 /*------------------------------------------------------------*/
/* Maintain an ordered list of memory events which are outstanding, in
   the sense that no IR has yet been generated to do the relevant
   helper calls.  The BB is scanned top to bottom and memory events
   are added to the end of the list, merging with the most recent
   notified event where possible (Dw immediately following Dr and
   having the same size and EA can be merged).

   This merging is done so that for architectures which have
   load-op-store instructions (x86, amd64), the insn is treated as if
   it makes just one memory reference (a modify), rather than two (a
   read followed by a write at the same address).

   At various points the list will need to be flushed, that is, IR
   generated from it.  That must happen before any possible exit from
   the block (the end, or an IRStmt_Exit).  Flushing also takes place
   when there is no space to add a new event.

   If we require the simulation statistics to be up to date with
   respect to possible memory exceptions, then the list would have to
   be flushed before each memory reference.  That would however lose
   performance by inhibiting event-merging during flushing.

   Flushing the list consists of walking it start to end and emitting
   instrumentation IR for each event, in the order in which they
   appear.  It may be possible to emit a single call for two adjacent
   events in order to reduce the number of helper function calls made.
   For example, it could well be profitable to handle two adjacent Ir
   events with a single helper call. */
510 Ev_IrNoX
, // Instruction read not crossing cache lines
511 Ev_IrGen
, // Generic Ir, not being detected as IrNoX
514 Ev_Dm
, // Data modify (read then write)
515 Ev_Bc
, // branch conditional
516 Ev_Bi
// branch indirect (to unknown destination)
542 IRAtom
* taken
; /* :: Ity_I1 */
551 static void init_Event ( Event
* ev
) {
552 VG_(memset
)(ev
, 0, sizeof(Event
));
555 static IRAtom
* get_Event_dea ( Event
* ev
) {
557 case Ev_Dr
: return ev
->Ev
.Dr
.ea
;
558 case Ev_Dw
: return ev
->Ev
.Dw
.ea
;
559 case Ev_Dm
: return ev
->Ev
.Dm
.ea
;
560 default: tl_assert(0);
564 static Int
get_Event_dszB ( Event
* ev
) {
566 case Ev_Dr
: return ev
->Ev
.Dr
.szB
;
567 case Ev_Dw
: return ev
->Ev
.Dw
.szB
;
568 case Ev_Dm
: return ev
->Ev
.Dm
.szB
;
569 default: tl_assert(0);
574 /* Up to this many unnotified events are allowed. Number is
575 arbitrary. Larger numbers allow more event merging to occur, but
576 potentially induce more spilling due to extending live ranges of
577 address temporaries. */
581 /* A struct which holds all the running state during instrumentation.
582 Mostly to avoid passing loads of parameters everywhere. */
585 /* The current outstanding-memory-event list. */
586 Event events
[N_EVENTS
];
589 /* The array of InstrInfo bins for the BB. */
592 /* Number InstrInfo bins 'used' so far. */
595 /* The output SB being constructed. */
601 /*------------------------------------------------------------*/
602 /*--- Instrumentation main ---*/
603 /*------------------------------------------------------------*/
605 // Note that origAddr is the real origAddr, not the address of the first
606 // instruction in the block (they can be different due to redirection).
608 SB_info
* get_SB_info(IRSB
* sbIn
, Addr origAddr
)
614 // Count number of original instrs in SB
616 for (i
= 0; i
< sbIn
->stmts_used
; i
++) {
618 if (Ist_IMark
== st
->tag
) n_instrs
++;
621 // Check that we don't have an entry for this BB in the instr-info table.
622 // If this assertion fails, there has been some screwup: some
623 // translations must have been discarded but Cachegrind hasn't discarded
624 // the corresponding entries in the instr-info table.
625 sbInfo
= VG_(OSetGen_Lookup
)(instrInfoTable
, &origAddr
);
626 tl_assert(NULL
== sbInfo
);
628 // BB never translated before (at this address, at least; could have
629 // been unloaded and then reloaded elsewhere in memory)
630 sbInfo
= VG_(OSetGen_AllocNode
)(instrInfoTable
,
631 sizeof(SB_info
) + n_instrs
*sizeof(InstrInfo
));
632 sbInfo
->SB_addr
= origAddr
;
633 sbInfo
->n_instrs
= n_instrs
;
634 VG_(OSetGen_Insert
)( instrInfoTable
, sbInfo
);
640 static void showEvent ( Event
* ev
)
644 VG_(printf
)("IrGen %p\n", ev
->inode
);
647 VG_(printf
)("IrNoX %p\n", ev
->inode
);
650 VG_(printf
)("Dr %p %d EA=", ev
->inode
, ev
->Ev
.Dr
.szB
);
651 ppIRExpr(ev
->Ev
.Dr
.ea
);
655 VG_(printf
)("Dw %p %d EA=", ev
->inode
, ev
->Ev
.Dw
.szB
);
656 ppIRExpr(ev
->Ev
.Dw
.ea
);
660 VG_(printf
)("Dm %p %d EA=", ev
->inode
, ev
->Ev
.Dm
.szB
);
661 ppIRExpr(ev
->Ev
.Dm
.ea
);
665 VG_(printf
)("Bc %p GA=", ev
->inode
);
666 ppIRExpr(ev
->Ev
.Bc
.taken
);
670 VG_(printf
)("Bi %p DST=", ev
->inode
);
671 ppIRExpr(ev
->Ev
.Bi
.dst
);
680 // Reserve and initialise an InstrInfo for the first mention of a new insn.
682 InstrInfo
* setup_InstrInfo ( CgState
* cgs
, Addr instr_addr
, UInt instr_len
)
685 tl_assert(cgs
->sbInfo_i
>= 0);
686 tl_assert(cgs
->sbInfo_i
< cgs
->sbInfo
->n_instrs
);
687 i_node
= &cgs
->sbInfo
->instrs
[ cgs
->sbInfo_i
];
688 i_node
->instr_addr
= instr_addr
;
689 i_node
->instr_len
= instr_len
;
690 i_node
->parent
= get_lineCC(instr_addr
);
696 /* Generate code for all outstanding memory events, and mark the queue
697 empty. Code is generated into cgs->bbOut, and this activity
698 'consumes' slots in cgs->sbInfo. */
700 static void flushEvents ( CgState
* cgs
)
703 const HChar
* helperName
;
713 while (i
< cgs
->events_used
) {
720 /* generate IR to notify event i and possibly the ones
721 immediately following it. */
722 tl_assert(i
>= 0 && i
< cgs
->events_used
);
724 ev
= &cgs
->events
[i
];
725 ev2
= ( i
< cgs
->events_used
-1 ? &cgs
->events
[i
+1] : NULL
);
726 ev3
= ( i
< cgs
->events_used
-2 ? &cgs
->events
[i
+2] : NULL
);
729 VG_(printf
)(" flush ");
733 i_node_expr
= mkIRExpr_HWord( (HWord
)ev
->inode
);
735 /* Decide on helper fn to call and args to pass it, and advance
739 /* Merge an IrNoX with a following Dr/Dm. */
740 if (ev2
&& (ev2
->tag
== Ev_Dr
|| ev2
->tag
== Ev_Dm
)) {
741 /* Why is this true? It's because we're merging an Ir
742 with a following Dr or Dm. The Ir derives from the
743 instruction's IMark and the Dr/Dm from data
744 references which follow it. In short it holds
745 because each insn starts with an IMark, hence an
746 Ev_Ir, and so these Dr/Dm must pertain to the
747 immediately preceding Ir. Same applies to analogous
748 assertions in the subsequent cases. */
749 tl_assert(ev2
->inode
== ev
->inode
);
750 helperName
= "log_1IrNoX_1Dr_cache_access";
751 helperAddr
= &log_1IrNoX_1Dr_cache_access
;
752 argv
= mkIRExprVec_3( i_node_expr
,
754 mkIRExpr_HWord( get_Event_dszB(ev2
) ) );
758 /* Merge an IrNoX with a following Dw. */
760 if (ev2
&& ev2
->tag
== Ev_Dw
) {
761 tl_assert(ev2
->inode
== ev
->inode
);
762 helperName
= "log_1IrNoX_1Dw_cache_access";
763 helperAddr
= &log_1IrNoX_1Dw_cache_access
;
764 argv
= mkIRExprVec_3( i_node_expr
,
766 mkIRExpr_HWord( get_Event_dszB(ev2
) ) );
770 /* Merge an IrNoX with two following IrNoX's. */
772 if (ev2
&& ev3
&& ev2
->tag
== Ev_IrNoX
&& ev3
->tag
== Ev_IrNoX
)
775 helperName
= "log_3IrNoX_0D_cache_access";
776 helperAddr
= &log_3IrNoX_0D_cache_access
;
778 helperName
= "log_3Ir";
779 helperAddr
= &log_3Ir
;
781 argv
= mkIRExprVec_3( i_node_expr
,
782 mkIRExpr_HWord( (HWord
)ev2
->inode
),
783 mkIRExpr_HWord( (HWord
)ev3
->inode
) );
787 /* Merge an IrNoX with one following IrNoX. */
789 if (ev2
&& ev2
->tag
== Ev_IrNoX
) {
791 helperName
= "log_2IrNoX_0D_cache_access";
792 helperAddr
= &log_2IrNoX_0D_cache_access
;
794 helperName
= "log_2Ir";
795 helperAddr
= &log_2Ir
;
797 argv
= mkIRExprVec_2( i_node_expr
,
798 mkIRExpr_HWord( (HWord
)ev2
->inode
) );
802 /* No merging possible; emit as-is. */
805 helperName
= "log_1IrNoX_0D_cache_access";
806 helperAddr
= &log_1IrNoX_0D_cache_access
;
808 helperName
= "log_1Ir";
809 helperAddr
= &log_1Ir
;
811 argv
= mkIRExprVec_1( i_node_expr
);
818 helperName
= "log_1IrGen_0D_cache_access";
819 helperAddr
= &log_1IrGen_0D_cache_access
;
821 helperName
= "log_1Ir";
822 helperAddr
= &log_1Ir
;
824 argv
= mkIRExprVec_1( i_node_expr
);
830 /* Data read or modify */
831 helperName
= "log_0Ir_1Dr_cache_access";
832 helperAddr
= &log_0Ir_1Dr_cache_access
;
833 argv
= mkIRExprVec_3( i_node_expr
,
835 mkIRExpr_HWord( get_Event_dszB(ev
) ) );
841 helperName
= "log_0Ir_1Dw_cache_access";
842 helperAddr
= &log_0Ir_1Dw_cache_access
;
843 argv
= mkIRExprVec_3( i_node_expr
,
845 mkIRExpr_HWord( get_Event_dszB(ev
) ) );
850 /* Conditional branch */
851 helperName
= "log_cond_branch";
852 helperAddr
= &log_cond_branch
;
853 argv
= mkIRExprVec_2( i_node_expr
, ev
->Ev
.Bc
.taken
);
858 /* Branch to an unknown destination */
859 helperName
= "log_ind_branch";
860 helperAddr
= &log_ind_branch
;
861 argv
= mkIRExprVec_2( i_node_expr
, ev
->Ev
.Bi
.dst
);
869 /* Add the helper. */
870 tl_assert(helperName
);
871 tl_assert(helperAddr
);
873 di
= unsafeIRDirty_0_N( regparms
,
874 helperName
, VG_(fnptr_to_fnentry
)( helperAddr
),
876 addStmtToIRSB( cgs
->sbOut
, IRStmt_Dirty(di
) );
879 cgs
->events_used
= 0;
882 static void addEvent_Ir ( CgState
* cgs
, InstrInfo
* inode
)
885 if (cgs
->events_used
== N_EVENTS
)
887 tl_assert(cgs
->events_used
>= 0 && cgs
->events_used
< N_EVENTS
);
888 evt
= &cgs
->events
[cgs
->events_used
];
891 if (cachesim_is_IrNoX(inode
->instr_addr
, inode
->instr_len
)) {
893 distinct_instrsNoX
++;
896 distinct_instrsGen
++;
902 void addEvent_Dr ( CgState
* cgs
, InstrInfo
* inode
, Int datasize
, IRAtom
* ea
)
904 tl_assert(isIRAtom(ea
));
909 tl_assert(datasize
>= 1 && datasize
<= min_line_size
);
911 if (cgs
->events_used
== N_EVENTS
) {
914 tl_assert(cgs
->events_used
>= 0 && cgs
->events_used
< N_EVENTS
);
915 Event
* evt
= &cgs
->events
[cgs
->events_used
];
919 evt
->Ev
.Dr
.szB
= datasize
;
925 void addEvent_Dw ( CgState
* cgs
, InstrInfo
* inode
, Int datasize
, IRAtom
* ea
)
927 tl_assert(isIRAtom(ea
));
932 tl_assert(datasize
>= 1 && datasize
<= min_line_size
);
934 /* Is it possible to merge this write with the preceding read? */
935 if (cgs
->events_used
> 0) {
936 Event
* lastEvt
= &cgs
->events
[cgs
->events_used
-1];
937 if ( lastEvt
->tag
== Ev_Dr
938 && lastEvt
->Ev
.Dr
.szB
== datasize
939 && lastEvt
->inode
== inode
940 && eqIRAtom(lastEvt
->Ev
.Dr
.ea
, ea
))
942 lastEvt
->tag
= Ev_Dm
;
947 /* No. Add as normal. */
948 if (cgs
->events_used
== N_EVENTS
)
950 tl_assert(cgs
->events_used
>= 0 && cgs
->events_used
< N_EVENTS
);
951 Event
* evt
= &cgs
->events
[cgs
->events_used
];
955 evt
->Ev
.Dw
.szB
= datasize
;
961 void addEvent_D_guarded ( CgState
* cgs
, InstrInfo
* inode
,
962 Int datasize
, IRAtom
* ea
, IRAtom
* guard
,
965 tl_assert(isIRAtom(ea
));
967 tl_assert(isIRAtom(guard
));
972 tl_assert(datasize
>= 1 && datasize
<= min_line_size
);
974 /* Adding guarded memory actions and merging them with the existing
975 queue is too complex. Simply flush the queue and add this
976 action immediately. Since guarded loads and stores are pretty
977 rare, this is not thought likely to cause any noticeable
978 performance loss as a result of the loss of event-merging
980 tl_assert(cgs
->events_used
>= 0);
982 tl_assert(cgs
->events_used
== 0);
983 /* Same as case Ev_Dw / case Ev_Dr in flushEvents, except with guard */
985 const HChar
* helperName
;
990 i_node_expr
= mkIRExpr_HWord( (HWord
)inode
);
991 helperName
= isWrite
? "log_0Ir_1Dw_cache_access"
992 : "log_0Ir_1Dr_cache_access";
993 helperAddr
= isWrite
? &log_0Ir_1Dw_cache_access
994 : &log_0Ir_1Dr_cache_access
;
995 argv
= mkIRExprVec_3( i_node_expr
,
996 ea
, mkIRExpr_HWord( datasize
) );
998 di
= unsafeIRDirty_0_N(
1000 helperName
, VG_(fnptr_to_fnentry
)( helperAddr
),
1003 addStmtToIRSB( cgs
->sbOut
, IRStmt_Dirty(di
) );
1008 void addEvent_Bc ( CgState
* cgs
, InstrInfo
* inode
, IRAtom
* guard
)
1011 tl_assert(isIRAtom(guard
));
1012 tl_assert(typeOfIRExpr(cgs
->sbOut
->tyenv
, guard
)
1013 == (sizeof(RegWord
)==4 ? Ity_I32
: Ity_I64
));
1014 if (!clo_branch_sim
)
1016 if (cgs
->events_used
== N_EVENTS
)
1018 tl_assert(cgs
->events_used
>= 0 && cgs
->events_used
< N_EVENTS
);
1019 evt
= &cgs
->events
[cgs
->events_used
];
1023 evt
->Ev
.Bc
.taken
= guard
;
1028 void addEvent_Bi ( CgState
* cgs
, InstrInfo
* inode
, IRAtom
* whereTo
)
1031 tl_assert(isIRAtom(whereTo
));
1032 tl_assert(typeOfIRExpr(cgs
->sbOut
->tyenv
, whereTo
)
1033 == (sizeof(RegWord
)==4 ? Ity_I32
: Ity_I64
));
1034 if (!clo_branch_sim
)
1036 if (cgs
->events_used
== N_EVENTS
)
1038 tl_assert(cgs
->events_used
>= 0 && cgs
->events_used
< N_EVENTS
);
1039 evt
= &cgs
->events
[cgs
->events_used
];
1043 evt
->Ev
.Bi
.dst
= whereTo
;
1047 ////////////////////////////////////////////////////////////
1051 IRSB
* cg_instrument ( VgCallbackClosure
* closure
,
1053 const VexGuestLayout
* layout
,
1054 const VexGuestExtents
* vge
,
1055 const VexArchInfo
* archinfo_host
,
1056 IRType gWordTy
, IRType hWordTy
)
1061 Addr cia
; /* address of current insn */
1063 IRTypeEnv
* tyenv
= sbIn
->tyenv
;
1064 InstrInfo
* curr_inode
= NULL
;
1066 if (gWordTy
!= hWordTy
) {
1067 /* We don't currently support this case. */
1068 VG_(tool_panic
)("host/guest word size mismatch");
1071 if (!instr_enabled
) {
1076 cgs
.sbOut
= deepCopyIRSBExceptStmts(sbIn
);
1078 // Copy verbatim any IR preamble preceding the first IMark
1080 while (i
< sbIn
->stmts_used
&& sbIn
->stmts
[i
]->tag
!= Ist_IMark
) {
1081 addStmtToIRSB( cgs
.sbOut
, sbIn
->stmts
[i
] );
1085 // Get the first statement, and initial cia from it
1086 tl_assert(sbIn
->stmts_used
> 0);
1087 tl_assert(i
< sbIn
->stmts_used
);
1088 st
= sbIn
->stmts
[i
];
1089 tl_assert(Ist_IMark
== st
->tag
);
1091 cia
= st
->Ist
.IMark
.addr
;
1092 isize
= st
->Ist
.IMark
.len
;
1093 // If Vex fails to decode an instruction, the size will be zero.
1094 // Pretend otherwise.
1095 if (isize
== 0) isize
= VG_MIN_INSTR_SZB
;
1097 // Set up running state and get block info
1098 tl_assert(closure
->readdr
== vge
->base
[0]);
1099 cgs
.events_used
= 0;
1100 cgs
.sbInfo
= get_SB_info(sbIn
, (Addr
)closure
->readdr
);
1104 VG_(printf
)("\n\n---------- cg_instrument ----------\n");
1106 // Traverse the block, initialising inodes, adding events and flushing as
1108 for (/*use current i*/; i
< sbIn
->stmts_used
; i
++) {
1110 st
= sbIn
->stmts
[i
];
1111 tl_assert(isFlatIRStmt(st
));
1122 cia
= st
->Ist
.IMark
.addr
;
1123 isize
= st
->Ist
.IMark
.len
;
1125 // If Vex fails to decode an instruction, the size will be zero.
1126 // Pretend otherwise.
1127 if (isize
== 0) isize
= VG_MIN_INSTR_SZB
;
1129 // Sanity-check size.
1130 tl_assert( (VG_MIN_INSTR_SZB
<= isize
&& isize
<= VG_MAX_INSTR_SZB
)
1131 || VG_CLREQ_SZB
== isize
);
1133 // Get space for and init the inode, record it as the current one.
1134 // Subsequent Dr/Dw/Dm events from the same instruction will
1136 curr_inode
= setup_InstrInfo(&cgs
, cia
, isize
);
1138 addEvent_Ir( &cgs
, curr_inode
);
1142 IRExpr
* data
= st
->Ist
.WrTmp
.data
;
1143 if (data
->tag
== Iex_Load
) {
1144 IRExpr
* aexpr
= data
->Iex
.Load
.addr
;
1145 // Note also, endianness info is ignored. I guess
1146 // that's not interesting.
1147 addEvent_Dr( &cgs
, curr_inode
, sizeofIRType(data
->Iex
.Load
.ty
),
1154 IRExpr
* data
= st
->Ist
.Store
.data
;
1155 IRExpr
* aexpr
= st
->Ist
.Store
.addr
;
1156 addEvent_Dw( &cgs
, curr_inode
,
1157 sizeofIRType(typeOfIRExpr(tyenv
, data
)), aexpr
);
1162 IRStoreG
* sg
= st
->Ist
.StoreG
.details
;
1163 IRExpr
* data
= sg
->data
;
1164 IRExpr
* addr
= sg
->addr
;
1165 IRType type
= typeOfIRExpr(tyenv
, data
);
1166 tl_assert(type
!= Ity_INVALID
);
1167 addEvent_D_guarded( &cgs
, curr_inode
,
1168 sizeofIRType(type
), addr
, sg
->guard
,
1174 IRLoadG
* lg
= st
->Ist
.LoadG
.details
;
1175 IRType type
= Ity_INVALID
; /* loaded type */
1176 IRType typeWide
= Ity_INVALID
; /* after implicit widening */
1177 IRExpr
* addr
= lg
->addr
;
1178 typeOfIRLoadGOp(lg
->cvt
, &typeWide
, &type
);
1179 tl_assert(type
!= Ity_INVALID
);
1180 addEvent_D_guarded( &cgs
, curr_inode
,
1181 sizeofIRType(type
), addr
, lg
->guard
,
1182 False
/*!isWrite*/ );
1188 IRDirty
* d
= st
->Ist
.Dirty
.details
;
1189 if (d
->mFx
!= Ifx_None
) {
1190 /* This dirty helper accesses memory. Collect the details. */
1191 tl_assert(d
->mAddr
!= NULL
);
1192 tl_assert(d
->mSize
!= 0);
1193 dataSize
= d
->mSize
;
1194 // Large (eg. 28B, 108B, 512B on x86) data-sized
1195 // instructions will be done inaccurately, but they're
1196 // very rare and this avoids errors from hitting more
1197 // than two cache lines in the simulation.
1198 if (dataSize
> min_line_size
)
1199 dataSize
= min_line_size
;
1200 if (d
->mFx
== Ifx_Read
|| d
->mFx
== Ifx_Modify
)
1201 addEvent_Dr( &cgs
, curr_inode
, dataSize
, d
->mAddr
);
1202 if (d
->mFx
== Ifx_Write
|| d
->mFx
== Ifx_Modify
)
1203 addEvent_Dw( &cgs
, curr_inode
, dataSize
, d
->mAddr
);
1205 tl_assert(d
->mAddr
== NULL
);
1206 tl_assert(d
->mSize
== 0);
1212 /* We treat it as a read and a write of the location. I
1213 think that is the same behaviour as it was before IRCAS
1214 was introduced, since prior to that point, the Vex
1215 front ends would translate a lock-prefixed instruction
1216 into a (normal) read followed by a (normal) write. */
1218 IRCAS
* cas
= st
->Ist
.CAS
.details
;
1219 tl_assert(cas
->addr
!= NULL
);
1220 tl_assert(cas
->dataLo
!= NULL
);
1221 dataSize
= sizeofIRType(typeOfIRExpr(tyenv
, cas
->dataLo
));
1222 if (cas
->dataHi
!= NULL
)
1223 dataSize
*= 2; /* since it's a doubleword-CAS */
1224 /* I don't think this can ever happen, but play safe. */
1225 if (dataSize
> min_line_size
)
1226 dataSize
= min_line_size
;
1227 addEvent_Dr( &cgs
, curr_inode
, dataSize
, cas
->addr
);
1228 addEvent_Dw( &cgs
, curr_inode
, dataSize
, cas
->addr
);
1234 if (st
->Ist
.LLSC
.storedata
== NULL
) {
1236 dataTy
= typeOfIRTemp(tyenv
, st
->Ist
.LLSC
.result
);
1237 addEvent_Dr( &cgs
, curr_inode
,
1238 sizeofIRType(dataTy
), st
->Ist
.LLSC
.addr
);
1239 /* flush events before LL, should help SC to succeed */
1240 flushEvents( &cgs
);
1243 dataTy
= typeOfIRExpr(tyenv
, st
->Ist
.LLSC
.storedata
);
1244 addEvent_Dw( &cgs
, curr_inode
,
1245 sizeofIRType(dataTy
), st
->Ist
.LLSC
.addr
);
1251 // call branch predictor only if this is a branch in guest code
1252 if ( (st
->Ist
.Exit
.jk
== Ijk_Boring
) ||
1253 (st
->Ist
.Exit
.jk
== Ijk_Call
) ||
1254 (st
->Ist
.Exit
.jk
== Ijk_Ret
) )
1256 /* Stuff to widen the guard expression to a host word, so
1257 we can pass it to the branch predictor simulation
1258 functions easily. */
1262 IRType tyW
= hWordTy
;
1263 IROp widen
= tyW
==Ity_I32
? Iop_1Uto32
: Iop_1Uto64
;
1264 IROp opXOR
= tyW
==Ity_I32
? Iop_Xor32
: Iop_Xor64
;
1265 IRTemp guard1
= newIRTemp(cgs
.sbOut
->tyenv
, Ity_I1
);
1266 IRTemp guardW
= newIRTemp(cgs
.sbOut
->tyenv
, tyW
);
1267 IRTemp guard
= newIRTemp(cgs
.sbOut
->tyenv
, tyW
);
1268 IRExpr
* one
= tyW
==Ity_I32
? IRExpr_Const(IRConst_U32(1))
1269 : IRExpr_Const(IRConst_U64(1));
1271 /* First we need to figure out whether the side exit got
1272 inverted by the ir optimiser. To do that, figure out
1273 the next (fallthrough) instruction's address and the
1274 side exit address and see if they are the same. */
1277 /* Side exit address */
1278 dst
= st
->Ist
.Exit
.dst
;
1279 if (tyW
== Ity_I32
) {
1280 tl_assert(dst
->tag
== Ico_U32
);
1283 tl_assert(tyW
== Ity_I64
);
1284 tl_assert(dst
->tag
== Ico_U64
);
1288 inverted
= nia
== sea
;
1290 /* Widen the guard expression. */
1291 addStmtToIRSB( cgs
.sbOut
,
1292 IRStmt_WrTmp( guard1
, st
->Ist
.Exit
.guard
));
1293 addStmtToIRSB( cgs
.sbOut
,
1294 IRStmt_WrTmp( guardW
,
1296 IRExpr_RdTmp(guard1
))) );
1297 /* If the exit is inverted, invert the sense of the guard. */
1302 inverted
? IRExpr_Binop(opXOR
, IRExpr_RdTmp(guardW
), one
)
1303 : IRExpr_RdTmp(guardW
)
1305 /* And post the event. */
1306 addEvent_Bc( &cgs
, curr_inode
, IRExpr_RdTmp(guard
) );
1309 /* We may never reach the next statement, so need to flush
1310 all outstanding transactions now. */
1311 flushEvents( &cgs
);
1321 /* Copy the original statement */
1322 addStmtToIRSB( cgs
.sbOut
, st
);
1330 /* Deal with branches to unknown destinations. Except ignore ones
1331 which are function returns as we assume the return stack
1332 predictor never mispredicts. */
1333 if ((sbIn
->jumpkind
== Ijk_Boring
) || (sbIn
->jumpkind
== Ijk_Call
)) {
1334 if (0) { ppIRExpr( sbIn
->next
); VG_(printf
)("\n"); }
1335 switch (sbIn
->next
->tag
) {
1337 break; /* boring - branch to known address */
1339 /* looks like an indirect branch (branch to unknown) */
1340 addEvent_Bi( &cgs
, curr_inode
, sbIn
->next
);
1343 /* shouldn't happen - if the incoming IR is properly
1344 flattened, should only have tmp and const cases to
1350 /* At the end of the bb. Flush outstandings. */
1351 flushEvents( &cgs
);
1353 /* done. stay sane ... */
1354 tl_assert(cgs
.sbInfo_i
== cgs
.sbInfo
->n_instrs
);
1357 VG_(printf
)( "goto {");
1358 ppIRJumpKind(sbIn
->jumpkind
);
1360 ppIRExpr( sbIn
->next
);
1361 VG_(printf
)( "}\n");
1367 /*------------------------------------------------------------*/
1368 /*--- Cache configuration ---*/
1369 /*------------------------------------------------------------*/
1371 static cache_t clo_I1_cache
= UNDEFINED_CACHE
;
1372 static cache_t clo_D1_cache
= UNDEFINED_CACHE
;
1373 static cache_t clo_LL_cache
= UNDEFINED_CACHE
;
1375 /*------------------------------------------------------------*/
1376 /*--- cg_fini() and related function ---*/
1377 /*------------------------------------------------------------*/
1379 // Total reads/writes/misses. Calculated during CC traversal at the end.
1381 static CacheCC Ir_total
;
1382 static CacheCC Dr_total
;
1383 static CacheCC Dw_total
;
1384 static BranchCC Bc_total
;
1385 static BranchCC Bi_total
;
1387 static void fprint_CC_table_and_calc_totals(void)
1391 HChar
*currFile
= NULL
;
1392 const HChar
*currFn
= NULL
;
1395 // Setup output filename. Nb: it's important to do this now, ie. as late
1396 // as possible. If we do it at start-up and the program forks and the
1397 // output file format string contains a %p (pid) specifier, both the
1398 // parent and child will incorrectly write to the same file; this
1399 // happened in 3.3.0.
1400 HChar
* cachegrind_out_file
=
1401 VG_(expand_file_name
)("--cachegrind-out-file", clo_cachegrind_out_file
);
1403 fp
= VG_(fopen
)(cachegrind_out_file
, VKI_O_CREAT
|VKI_O_TRUNC
|VKI_O_WRONLY
,
1404 VKI_S_IRUSR
|VKI_S_IWUSR
);
1406 // If the file can't be opened for whatever reason (conflict
1407 // between multiple cachegrinded processes?), give up now.
1408 VG_(umsg
)("error: can't open output data file '%s'\n",
1409 cachegrind_out_file
);
1410 VG_(umsg
)(" ... so detailed results will be missing.\n");
1411 VG_(free
)(cachegrind_out_file
);
1414 VG_(free
)(cachegrind_out_file
);
1417 if (clo_cache_sim
) {
1418 // "desc:" lines (giving I1/D1/LL cache configuration). The spaces after
1419 // the 2nd colon makes cg_annotate's output look nicer.
1420 VG_(fprintf
)(fp
, "desc: I1 cache: %s\n"
1421 "desc: D1 cache: %s\n"
1422 "desc: LL cache: %s\n",
1423 I1
.desc_line
, D1
.desc_line
, LL
.desc_line
);
1427 VG_(fprintf
)(fp
, "cmd: %s", VG_(args_the_exename
));
1428 for (i
= 0; i
< VG_(sizeXA
)( VG_(args_for_client
) ); i
++) {
1429 HChar
* arg
= * (HChar
**) VG_(indexXA
)( VG_(args_for_client
), i
);
1430 VG_(fprintf
)(fp
, " %s", arg
);
1433 if (clo_cache_sim
&& clo_branch_sim
) {
1434 VG_(fprintf
)(fp
, "\nevents: Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw "
1437 else if (clo_cache_sim
&& !clo_branch_sim
) {
1438 VG_(fprintf
)(fp
, "\nevents: Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw "
1441 else if (!clo_cache_sim
&& clo_branch_sim
) {
1442 VG_(fprintf
)(fp
, "\nevents: Ir Bc Bcm Bi Bim\n");
1445 VG_(fprintf
)(fp
, "\nevents: Ir\n");
1448 // Traverse every lineCC
1449 VG_(OSetGen_ResetIter
)(CC_table
);
1450 while ( (lineCC
= VG_(OSetGen_Next
)(CC_table
)) ) {
1451 Bool just_hit_a_new_file
= False
;
1452 // If we've hit a new file, print a "fl=" line. Note that because
1453 // each string is stored exactly once in the string table, we can use
1454 // pointer comparison rather than strcmp() to test for equality, which
1455 // is good because most of the time the comparisons are equal and so
1456 // the whole strings would have to be checked.
1457 if ( lineCC
->loc
.file
!= currFile
) {
1458 currFile
= lineCC
->loc
.file
;
1459 VG_(fprintf
)(fp
, "fl=%s\n", currFile
);
1461 just_hit_a_new_file
= True
;
1463 // If we've hit a new function, print a "fn=" line. We know to do
1464 // this when the function name changes, and also every time we hit a
1465 // new file (in which case the new function name might be the same as
1466 // in the old file, hence the just_hit_a_new_file test).
1467 if ( just_hit_a_new_file
|| lineCC
->loc
.fn
!= currFn
) {
1468 currFn
= lineCC
->loc
.fn
;
1469 VG_(fprintf
)(fp
, "fn=%s\n", currFn
);
1474 if (clo_cache_sim
&& clo_branch_sim
) {
1475 VG_(fprintf
)(fp
, "%d %llu %llu %llu"
1478 " %llu %llu %llu %llu\n",
1480 lineCC
->Ir
.a
, lineCC
->Ir
.m1
, lineCC
->Ir
.mL
,
1481 lineCC
->Dr
.a
, lineCC
->Dr
.m1
, lineCC
->Dr
.mL
,
1482 lineCC
->Dw
.a
, lineCC
->Dw
.m1
, lineCC
->Dw
.mL
,
1483 lineCC
->Bc
.b
, lineCC
->Bc
.mp
,
1484 lineCC
->Bi
.b
, lineCC
->Bi
.mp
);
1486 else if (clo_cache_sim
&& !clo_branch_sim
) {
1487 VG_(fprintf
)(fp
, "%d %llu %llu %llu"
1489 " %llu %llu %llu\n",
1491 lineCC
->Ir
.a
, lineCC
->Ir
.m1
, lineCC
->Ir
.mL
,
1492 lineCC
->Dr
.a
, lineCC
->Dr
.m1
, lineCC
->Dr
.mL
,
1493 lineCC
->Dw
.a
, lineCC
->Dw
.m1
, lineCC
->Dw
.mL
);
1495 else if (!clo_cache_sim
&& clo_branch_sim
) {
1496 VG_(fprintf
)(fp
, "%d %llu"
1497 " %llu %llu %llu %llu\n",
1500 lineCC
->Bc
.b
, lineCC
->Bc
.mp
,
1501 lineCC
->Bi
.b
, lineCC
->Bi
.mp
);
1504 VG_(fprintf
)(fp
, "%d %llu\n",
1509 // Update summary stats
1510 Ir_total
.a
+= lineCC
->Ir
.a
;
1511 Ir_total
.m1
+= lineCC
->Ir
.m1
;
1512 Ir_total
.mL
+= lineCC
->Ir
.mL
;
1513 Dr_total
.a
+= lineCC
->Dr
.a
;
1514 Dr_total
.m1
+= lineCC
->Dr
.m1
;
1515 Dr_total
.mL
+= lineCC
->Dr
.mL
;
1516 Dw_total
.a
+= lineCC
->Dw
.a
;
1517 Dw_total
.m1
+= lineCC
->Dw
.m1
;
1518 Dw_total
.mL
+= lineCC
->Dw
.mL
;
1519 Bc_total
.b
+= lineCC
->Bc
.b
;
1520 Bc_total
.mp
+= lineCC
->Bc
.mp
;
1521 Bi_total
.b
+= lineCC
->Bi
.b
;
1522 Bi_total
.mp
+= lineCC
->Bi
.mp
;
1527 // Summary stats must come after rest of table, since we calculate them
1528 // during traversal.
1529 if (clo_cache_sim
&& clo_branch_sim
) {
1530 VG_(fprintf
)(fp
, "summary:"
1534 " %llu %llu %llu %llu\n",
1535 Ir_total
.a
, Ir_total
.m1
, Ir_total
.mL
,
1536 Dr_total
.a
, Dr_total
.m1
, Dr_total
.mL
,
1537 Dw_total
.a
, Dw_total
.m1
, Dw_total
.mL
,
1538 Bc_total
.b
, Bc_total
.mp
,
1539 Bi_total
.b
, Bi_total
.mp
);
1541 else if (clo_cache_sim
&& !clo_branch_sim
) {
1542 VG_(fprintf
)(fp
, "summary:"
1545 " %llu %llu %llu\n",
1546 Ir_total
.a
, Ir_total
.m1
, Ir_total
.mL
,
1547 Dr_total
.a
, Dr_total
.m1
, Dr_total
.mL
,
1548 Dw_total
.a
, Dw_total
.m1
, Dw_total
.mL
);
1550 else if (!clo_cache_sim
&& clo_branch_sim
) {
1551 VG_(fprintf
)(fp
, "summary:"
1553 " %llu %llu %llu %llu\n",
1555 Bc_total
.b
, Bc_total
.mp
,
1556 Bi_total
.b
, Bi_total
.mp
);
1559 VG_(fprintf
)(fp
, "summary:"
1567 static UInt
ULong_width(ULong n
)
1575 return w
+ (w
-1)/3; // add space for commas
1578 static void cg_fini(Int exitcode
)
1580 static HChar fmt
[128]; // OK; large enough
1584 ULong LL_total_m
, LL_total_mr
, LL_total_mw
,
1585 LL_total
, LL_total_r
, LL_total_w
;
1588 fprint_CC_table_and_calc_totals();
1590 if (VG_(clo_verbosity
) == 0)
1593 // Nb: this isn't called "MAX" because that overshadows a global on Darwin.
1594 #define CG_MAX(a, b) ((a) >= (b) ? (a) : (b))
1596 /* I cache results. Use the I_refs value to determine the first column
1598 l1
= ULong_width(Ir_total
.a
);
1599 l2
= ULong_width(CG_MAX(Dr_total
.a
, Bc_total
.b
));
1600 l3
= ULong_width(CG_MAX(Dw_total
.a
, Bi_total
.b
));
1602 /* Make format string, getting width right for numbers */
1603 VG_(sprintf
)(fmt
, "%%s %%,%dllu\n", l1
);
1605 /* Always print this */
1606 VG_(umsg
)(fmt
, "I refs: ", Ir_total
.a
);
1608 /* If cache profiling is enabled, show D access numbers and all
1610 if (clo_cache_sim
) {
1611 VG_(umsg
)(fmt
, "I1 misses: ", Ir_total
.m1
);
1612 VG_(umsg
)(fmt
, "LLi misses: ", Ir_total
.mL
);
1614 if (0 == Ir_total
.a
) Ir_total
.a
= 1;
1615 VG_(umsg
)("I1 miss rate: %*.2f%%\n", l1
,
1616 Ir_total
.m1
* 100.0 / Ir_total
.a
);
1617 VG_(umsg
)("LLi miss rate: %*.2f%%\n", l1
,
1618 Ir_total
.mL
* 100.0 / Ir_total
.a
);
1621 /* D cache results. Use the D_refs.rd and D_refs.wr values to
1622 * determine the width of columns 2 & 3. */
1623 D_total
.a
= Dr_total
.a
+ Dw_total
.a
;
1624 D_total
.m1
= Dr_total
.m1
+ Dw_total
.m1
;
1625 D_total
.mL
= Dr_total
.mL
+ Dw_total
.mL
;
1627 /* Make format string, getting width right for numbers */
1628 VG_(sprintf
)(fmt
, "%%s %%,%dllu (%%,%dllu rd + %%,%dllu wr)\n",
1631 VG_(umsg
)(fmt
, "D refs: ",
1632 D_total
.a
, Dr_total
.a
, Dw_total
.a
);
1633 VG_(umsg
)(fmt
, "D1 misses: ",
1634 D_total
.m1
, Dr_total
.m1
, Dw_total
.m1
);
1635 VG_(umsg
)(fmt
, "LLd misses: ",
1636 D_total
.mL
, Dr_total
.mL
, Dw_total
.mL
);
1638 if (0 == D_total
.a
) D_total
.a
= 1;
1639 if (0 == Dr_total
.a
) Dr_total
.a
= 1;
1640 if (0 == Dw_total
.a
) Dw_total
.a
= 1;
1641 VG_(umsg
)("D1 miss rate: %*.1f%% (%*.1f%% + %*.1f%% )\n",
1642 l1
, D_total
.m1
* 100.0 / D_total
.a
,
1643 l2
, Dr_total
.m1
* 100.0 / Dr_total
.a
,
1644 l3
, Dw_total
.m1
* 100.0 / Dw_total
.a
);
1645 VG_(umsg
)("LLd miss rate: %*.1f%% (%*.1f%% + %*.1f%% )\n",
1646 l1
, D_total
.mL
* 100.0 / D_total
.a
,
1647 l2
, Dr_total
.mL
* 100.0 / Dr_total
.a
,
1648 l3
, Dw_total
.mL
* 100.0 / Dw_total
.a
);
1651 /* LL overall results */
1653 LL_total
= Dr_total
.m1
+ Dw_total
.m1
+ Ir_total
.m1
;
1654 LL_total_r
= Dr_total
.m1
+ Ir_total
.m1
;
1655 LL_total_w
= Dw_total
.m1
;
1656 VG_(umsg
)(fmt
, "LL refs: ",
1657 LL_total
, LL_total_r
, LL_total_w
);
1659 LL_total_m
= Dr_total
.mL
+ Dw_total
.mL
+ Ir_total
.mL
;
1660 LL_total_mr
= Dr_total
.mL
+ Ir_total
.mL
;
1661 LL_total_mw
= Dw_total
.mL
;
1662 VG_(umsg
)(fmt
, "LL misses: ",
1663 LL_total_m
, LL_total_mr
, LL_total_mw
);
1665 VG_(umsg
)("LL miss rate: %*.1f%% (%*.1f%% + %*.1f%% )\n",
1666 l1
, LL_total_m
* 100.0 / (Ir_total
.a
+ D_total
.a
),
1667 l2
, LL_total_mr
* 100.0 / (Ir_total
.a
+ Dr_total
.a
),
1668 l3
, LL_total_mw
* 100.0 / Dw_total
.a
);
1671 /* If branch profiling is enabled, show branch overall results. */
1672 if (clo_branch_sim
) {
1673 /* Make format string, getting width right for numbers */
1674 VG_(sprintf
)(fmt
, "%%s %%,%dllu (%%,%dllu cond + %%,%dllu ind)\n",
1677 if (0 == Bc_total
.b
) Bc_total
.b
= 1;
1678 if (0 == Bi_total
.b
) Bi_total
.b
= 1;
1679 B_total
.b
= Bc_total
.b
+ Bi_total
.b
;
1680 B_total
.mp
= Bc_total
.mp
+ Bi_total
.mp
;
1683 VG_(umsg
)(fmt
, "Branches: ",
1684 B_total
.b
, Bc_total
.b
, Bi_total
.b
);
1686 VG_(umsg
)(fmt
, "Mispredicts: ",
1687 B_total
.mp
, Bc_total
.mp
, Bi_total
.mp
);
1689 VG_(umsg
)("Mispred rate: %*.1f%% (%*.1f%% + %*.1f%% )\n",
1690 l1
, B_total
.mp
* 100.0 / B_total
.b
,
1691 l2
, Bc_total
.mp
* 100.0 / Bc_total
.b
,
1692 l3
, Bi_total
.mp
* 100.0 / Bi_total
.b
);
1696 if (VG_(clo_stats
)) {
1697 Int debug_lookups
= full_debugs
+ fn_debugs
+
1698 file_line_debugs
+ no_debugs
;
1701 VG_(dmsg
)("cachegrind: distinct files : %d\n", distinct_files
);
1702 VG_(dmsg
)("cachegrind: distinct functions : %d\n", distinct_fns
);
1703 VG_(dmsg
)("cachegrind: distinct lines : %d\n", distinct_lines
);
1704 VG_(dmsg
)("cachegrind: distinct instrs NoX: %d\n", distinct_instrsNoX
);
1705 VG_(dmsg
)("cachegrind: distinct instrs Gen: %d\n", distinct_instrsGen
);
1706 VG_(dmsg
)("cachegrind: debug lookups : %d\n", debug_lookups
);
1708 VG_(dmsg
)("cachegrind: with full info:%6.1f%% (%d)\n",
1709 full_debugs
* 100.0 / debug_lookups
, full_debugs
);
1710 VG_(dmsg
)("cachegrind: with file/line info:%6.1f%% (%d)\n",
1711 file_line_debugs
* 100.0 / debug_lookups
, file_line_debugs
);
1712 VG_(dmsg
)("cachegrind: with fn name info:%6.1f%% (%d)\n",
1713 fn_debugs
* 100.0 / debug_lookups
, fn_debugs
);
1714 VG_(dmsg
)("cachegrind: with zero info:%6.1f%% (%d)\n",
1715 no_debugs
* 100.0 / debug_lookups
, no_debugs
);
1717 VG_(dmsg
)("cachegrind: string table size: %u\n",
1718 VG_(OSetGen_Size
)(stringTable
));
1719 VG_(dmsg
)("cachegrind: CC table size: %u\n",
1720 VG_(OSetGen_Size
)(CC_table
));
1721 VG_(dmsg
)("cachegrind: InstrInfo table size: %u\n",
1722 VG_(OSetGen_Size
)(instrInfoTable
));
1726 /*--------------------------------------------------------------------*/
1727 /*--- Discarding BB info ---*/
1728 /*--------------------------------------------------------------------*/
1730 // Called when a translation is removed from the translation cache for
1731 // any reason at all: to free up space, because the guest code was
1732 // unmapped or modified, or for any arbitrary reason.
/* Callback for VG_(needs_superblock_discards): a translation is being
   dropped from the translation cache, so remove and free our matching
   SB_info.  Keyed on vge.base[0] (the original address).
   NOTE(review): extraction elided the DEBUG guard and the if/else around
   the sbInfo lookup here — the visible lines are not the full body. */
1734 void cg_discard_superblock_info ( Addr orig_addr64
, VexGuestExtents vge
)
1736 Addr orig_addr
= vge
.base
[0];
/* A discarded translation always covers at least one guest extent. */
1738 tl_assert(vge
.n_used
> 0);
1741 VG_(printf
)( "discard_basic_block_info: %p, %p, %llu\n",
1743 (void*)vge
.base
[0], (ULong
)vge
.len
[0]);
1745 // Get SB info, remove from table, free SB info. Simple! Unless
1746 // instrumentation is currently disabled, in which case we won't have an SB
1747 // info. Note that we use orig_addr, not the first instruction address in
1749 SB_info
* sbInfo
= VG_(OSetGen_Remove
)(instrInfoTable
, &orig_addr
);
/* If an SB_info existed, instrumentation must have been on when it was
   created; if none existed, instrumentation must be off now. */
1751 tl_assert(instr_enabled
);
1752 VG_(OSetGen_FreeNode
)(instrInfoTable
, sbInfo
);
1754 tl_assert(!instr_enabled
);
1758 /*--------------------------------------------------------------------*/
1759 /*--- Command line processing ---*/
1760 /*--------------------------------------------------------------------*/
/* Parse one Cachegrind command-line option.  Cache-geometry options are
   delegated to VG_(str_clo_cache_opt); the remaining options set the
   clo_* file-scope flags.  Returns True iff `arg` was recognised.
   NOTE(review): extraction elided the str_clo_cache_opt argument tail and
   the final return statements. */
1762 static Bool
cg_process_cmd_line_option(const HChar
* arg
)
1764 if (VG_(str_clo_cache_opt
)(arg
,
/* --cachegrind-out-file=<file>: output file name template. */
1769 else if VG_STR_CLO( arg
, "--cachegrind-out-file", clo_cachegrind_out_file
) {}
/* --cache-sim=yes|no: enable the cache simulation. */
1770 else if VG_BOOL_CLO(arg
, "--cache-sim", clo_cache_sim
) {}
/* --branch-sim=yes|no: enable the branch-predictor simulation. */
1771 else if VG_BOOL_CLO(arg
, "--branch-sim", clo_branch_sim
) {}
/* --instr-at-start=yes|no: whether instrumentation starts enabled. */
1772 else if VG_BOOL_CLO(arg
, "--instr-at-start", clo_instr_at_start
) {}
/* Print the tool-specific part of --help output: Cachegrind's own options
   plus the generic cache-configuration options. */
1779 static void cg_print_usage(void)
1782 " --cachegrind-out-file=<file> output file name [cachegrind.out.%%p]\n"
1783 " --cache-sim=yes|no collect cache stats? [no]\n"
1784 " --branch-sim=yes|no collect branch prediction stats? [no]\n"
1785 " --instr-at-start=yes|no instrument at start? [yes]\n"
/* Append the shared --I1/--D1/--LL cache option help. */
1787 VG_(print_cache_clo_opts
)();
1790 static void cg_print_debug_usage(void)
1797 /*--------------------------------------------------------------------*/
1798 /*--- Client requests ---*/
1799 /*--------------------------------------------------------------------*/
/* Turn instrumentation on or off at runtime, on behalf of the
   CACHEGRIND_START/STOP_INSTRUMENTATION client requests.  Either way the
   translation cache is flushed so existing code is re-translated in the
   new mode; a warning is emitted if the requested state already holds.
   NOTE(review): extraction elided the enclosing if (enable)/else braces. */
1801 static void set_instr_enabled(Bool enable
)
1804 // Enable instrumentation.
1805 if (!instr_enabled
) {
1806 // Discard first, then update `instr_enabled`;
1807 // `cg_discard_superblock_info` relies on that.
1808 VG_(discard_translations_safely
)((Addr
)0x1000, ~(SizeT
)0xfff, "cachegrind");
1809 instr_enabled
= True
;
/* Already enabled: warn rather than silently ignoring the request. */
1811 VG_(dmsg
)("warning: CACHEGRIND_START_INSTRUMENTATION called,\n");
1812 VG_(dmsg
)(" but instrumentation is already enabled\n");
1815 // Disable instrumentation.
1816 if (instr_enabled
) {
1817 // Discard first, then update `instr_enabled`;
1818 // `cg_discard_superblock_info` relies on that.
1819 VG_(discard_translations_safely
)((Addr
)0x1000, ~(SizeT
)0xfff, "cachegrind");
1820 instr_enabled
= False
;
/* Already disabled: warn rather than silently ignoring the request. */
1822 VG_(dmsg
)("warning: CACHEGRIND_STOP_INSTRUMENTATION called,\n");
1823 VG_(dmsg
)(" but instrumentation is already disabled\n");
/* Client-request dispatcher.  Ignores requests not addressed to this tool
   (tool code 'C','G') unless they are GDB monitor commands; handles the
   START/STOP_INSTRUMENTATION requests via set_instr_enabled(), and warns
   about unrecognised Cachegrind request codes.
   NOTE(review): extraction elided the switch header, breaks/returns and
   the default arm here. */
1828 static Bool
cg_handle_client_request(ThreadId tid
, UWord
*args
, UWord
*ret
)
1830 if (!VG_IS_TOOL_USERREQ('C', 'G', args
[0])
1831 && VG_USERREQ__GDB_MONITOR_COMMAND
!= args
[0])
/* Start instrumenting (flushes translations; see set_instr_enabled). */
1835 case VG_USERREQ__CG_START_INSTRUMENTATION
:
1836 set_instr_enabled(True
);
/* Stop instrumenting. */
1840 case VG_USERREQ__CG_STOP_INSTRUMENTATION
:
1841 set_instr_enabled(False
);
/* Unknown 'CG' request code: warn the user but do not abort. */
1846 VG_(message
)(Vg_UserMsg
,
1847 "Warning: unknown cachegrind client request code %llx\n",
1853 /*--------------------------------------------------------------------*/
1855 /*--------------------------------------------------------------------*/
1857 static void cg_post_clo_init(void); /* just below */
/* Tool registration, run before command-line processing: sets the tool's
   identity strings, the default register-update policy, and registers the
   basic callbacks plus the "needs" Cachegrind uses (superblock discards,
   command-line options, client requests).
   NOTE(review): extraction elided some registration argument lines
   (e.g. cg_instrument/cg_fini in VG_(basic_tool_funcs)). */
1859 static void cg_pre_clo_init(void)
1861 VG_(details_name
) ("Cachegrind");
1862 VG_(details_version
) (NULL
);
1863 VG_(details_description
) ("a high-precision tracing profiler");
1864 VG_(details_copyright_author
)(
1865 "Copyright (C) 2002-2024, and GNU GPL'd, by Nicholas Nethercote et al.");
1866 VG_(details_bug_reports_to
) (VG_BUGS_TO
);
1867 VG_(details_avg_translation_sizeB
) ( 500 );
/* Default to sp-at-mem-access register updates; user may override. */
1869 VG_(clo_vex_control
).iropt_register_updates_default
1870 = VG_(clo_px_file_backed
)
1871 = VexRegUpdSpAtMemAccess
; // overridable by the user.
1873 VG_(basic_tool_funcs
) (cg_post_clo_init
,
/* Must be told when translations are discarded, to free SB_info. */
1877 VG_(needs_superblock_discards
)(cg_discard_superblock_info
);
1878 VG_(needs_command_line_options
)(cg_process_cmd_line_option
,
1880 cg_print_debug_usage
);
1881 VG_(needs_client_requests
)(cg_handle_client_request
);
/* Post-command-line initialisation: creates the three OSet tables
   (per-line CCs keyed on loc, instr-info, string table), and — when
   --cache-sim=yes — configures the I1/D1/LL geometries, derives
   min_line_size, sanity-checks it against the largest guest register,
   and initialises the simulator.  Finally honours --instr-at-start=no.
   NOTE(review): extraction elided the OSetGen_Create argument tails,
   closing braces and the VG_(exit) on the fatal path. */
1884 static void cg_post_clo_init(void)
1886 cache_t I1c
, D1c
, LLc
;
/* Per-source-line cost-centre table, keyed on the CodeLoc member. */
1889 VG_(OSetGen_Create
)(offsetof(LineCC
, loc
),
1891 VG_(malloc
), "cg.main.cpci.1",
/* Per-superblock instruction-info table, keyed at offset 0. */
1894 VG_(OSetGen_Create
)(/*keyOff*/0,
1896 VG_(malloc
), "cg.main.cpci.2",
/* Interned string table (filenames/function names), keyed at offset 0. */
1899 VG_(OSetGen_Create
)(/*keyOff*/0,
1901 VG_(malloc
), "cg.main.cpci.3",
1904 if (clo_cache_sim
) {
1905 VG_(post_clo_init_configure_caches
)(&I1c
, &D1c
, &LLc
,
1910 // min_line_size is used to make sure that we never feed
1911 // accesses to the simulator straddling more than two
1912 // cache lines at any cache level
1913 min_line_size
= (I1c
.line_size
< D1c
.line_size
) ? I1c
.line_size
: D1c
.line_size
;
1914 min_line_size
= (LLc
.line_size
< min_line_size
) ? LLc
.line_size
: min_line_size
;
1916 Int largest_load_or_store_size
1917 = VG_(machine_get_size_of_largest_guest_register
)();
1918 if (min_line_size
< largest_load_or_store_size
) {
1919 /* We can't continue, because the cache simulation might
1920 straddle more than 2 lines, and it will assert. So let's
1921 just stop before we start. */
1922 VG_(umsg
)("Cachegrind: cannot continue: the minimum line size (%d)\n",
1923 (Int
)min_line_size
);
1924 VG_(umsg
)(" must be equal to or larger than the maximum register size (%d)\n",
1925 largest_load_or_store_size
);
1926 VG_(umsg
)(" but it is not. Exiting now.\n");
/* Geometry accepted: hand it to the cache simulator. */
1930 cachesim_initcaches(I1c
, D1c
, LLc
);
1933 // When instrumentation client requests are enabled, we start with
1934 // instrumentation off.
1935 if (!clo_instr_at_start
) {
1936 instr_enabled
= False
;
1940 VG_DETERMINE_INTERFACE_VERSION(cg_pre_clo_init
)
1942 /*--------------------------------------------------------------------*/
1944 /*--------------------------------------------------------------------*/