/*--------------------------------------------------------------------*/
/*--- Cache simulation.                                            ---*/
/*--------------------------------------------------------------------*/

   This file is part of Callgrind, a Valgrind tool for call graph
   profiling programs.
   Copyright (C) 2003-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de)

   This tool is derived from and contains code from Cachegrind
   Copyright (C) 2002-2017 Nicholas Nethercote (njn@valgrind.org)

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.

   The GNU General Public License is contained in the file COPYING.
 - simulates a write-allocate cache
 - (block --> set) hash function uses simple bit selection
 - handling of references straddling two cache blocks:
     - counts as only one cache access (not two)
     - both blocks hit --> one hit
     - one block hits, the other misses --> one miss
     - both blocks miss --> one miss (not two)
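
/* Worked example of the straddling policy above (a sketch with
 * illustrative numbers, not a configuration taken from this file):
 * with 64 B lines, an 8-byte access at address 0x103c touches
 * block 0x40 (bytes 0x1000-0x103f) and block 0x41 (bytes
 * 0x1040-0x107f). If block 0x40 hits and block 0x41 misses, the
 * access is counted as exactly one miss, not one hit plus one miss.
 */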
/* Cache configuration */

/* additional structures for cache use info, separated
 * according to usage frequency:
 * - line_loaded : pointer to cost center of instruction
 *                 which loaded the line into cache.
 *                 Needed to increment counters when line is evicted.
 * - line_use    : updated on every access
 */

   UInt mask;            /* e.g. for 64 byte line size: 1 bit per 2 bytes */

   line_use* dep_use;    /* pointer to higher-level cache block for this memline */

   int  line_size;       /* bytes */
   Bool sectored;        /* prefetch nearside cacheline on read */

   HChar desc_line[128]; // large enough

/* States of flat caches in our model.
 * We use a 2-level hierarchy. */
static cache_t2 I1, D1, LL;
/* Lower bits of cache tags are used as flags for a cache line */
#define CACHELINE_FLAGMASK (MIN_LINE_SIZE-1)
#define CACHELINE_DIRTY    1

/* Cache simulator options */
static Bool clo_simulate_writeback = False;
static Bool clo_simulate_hwpref    = False;
static Bool clo_simulate_sectors   = False;
static Bool clo_collect_cacheuse   = False;
/* The following global variables are set up beforehand by setup_bbcc():
 * - Addr   CLG_(bb_base)    (instruction start address of original BB)
 * - ULong* CLG_(cost_base)  (start of cost array for BB)
 */

ULong* CLG_(cost_base);

static InstrInfo* current_ii;
/* Cache use offsets */
/* The offsets are only correct because all per-instruction event sets
 * get the "Use" set added first! */
static Int off_I1_AcCost = 0;
static Int off_I1_SpLoss = 1;
static Int off_D1_AcCost = 0;
static Int off_D1_SpLoss = 1;
static Int off_LL_AcCost = 2;
static Int off_LL_SpLoss = 3;
/* Cache access types */
typedef enum { Read = 0, Write = CACHELINE_DIRTY } RefType;

/* Result of a reference into a flat cache */
typedef enum { Hit = 0, Miss, MissDirty } CacheResult;

/* Result of a reference into a hierarchical cache model */
typedef enum {
   L1_Hit,
   LL_Hit,
   MemAccess,
   WriteBackMemAccess } CacheModelResult;

typedef CacheModelResult (*simcall_type)(Addr, UChar);

static struct {
   simcall_type I1_Read;
   simcall_type D1_Read;
   simcall_type D1_Write;
} simulator;
/*------------------------------------------------------------*/
/*--- Cache Simulator Initialization                       ---*/
/*------------------------------------------------------------*/

static void cachesim_clearcache(cache_t2* c)
{
   Int i;

   for (i = 0; i < c->sets * c->assoc; i++)
      c->tags[i] = 0;
   if (c->use) {
      for (i = 0; i < c->sets * c->assoc; i++) {
         c->loaded[i].memline  = 0;
         c->loaded[i].use_base = 0;
         c->loaded[i].dep_use  = 0;
         c->loaded[i].iaddr    = 0;

         c->tags[i] = i % c->assoc; /* init lower bits as pointer */
      }
   }
}
static void cacheuse_initcache(cache_t2* c);

/* By this point, the size/assoc/line_size has been checked. */
static void cachesim_initcache(cache_t config, cache_t2* c)
{
   c->size      = config.size;
   c->assoc     = config.assoc;
   c->line_size = config.line_size;
   c->sectored  = False; // FIXME

   c->sets           = (c->size / c->line_size) / c->assoc;
   c->sets_min_1     = c->sets - 1;
   c->line_size_bits = VG_(log2)(c->line_size);
   c->tag_shift      = c->line_size_bits + VG_(log2)(c->sets);
   c->tag_mask       = ~((1u << c->tag_shift) - 1);

   /* Can bits in tag entries be used for flags?
    * Should always be true, as MIN_LINE_SIZE >= 16 */
   CLG_ASSERT( (c->tag_mask & CACHELINE_FLAGMASK) == 0);

   if (c->assoc == 1) {
      VG_(sprintf)(c->desc_line, "%d B, %d B, direct-mapped%s",
                   c->size, c->line_size,
                   c->sectored ? ", sectored" : "");
   } else {
      VG_(sprintf)(c->desc_line, "%d B, %d B, %d-way associative%s",
                   c->size, c->line_size, c->assoc,
                   c->sectored ? ", sectored" : "");
   }

   c->tags = (UWord*) CLG_MALLOC("cl.sim.cs_ic.1",
                                 sizeof(UWord) * c->sets * c->assoc);
   if (clo_collect_cacheuse)
      cacheuse_initcache(c);

   cachesim_clearcache(c);
}
static void print_cache(cache_t2* c)
{
   UInt set, way, i;

   /* Note initialisation and update of 'i'. */
   for (i = 0, set = 0; set < c->sets; set++) {
      for (way = 0; way < c->assoc; way++, i++) {
         VG_(printf)("%8x ", c->tags[i]);
      }
   }
}
/*------------------------------------------------------------*/
/*--- Simple Cache Simulation                              ---*/
/*------------------------------------------------------------*/

/*
 * Model: single inclusive, 2-level cache hierarchy (L1/LL)
 *        with write-allocate.
 *
 * For simple cache hit/miss counts, we do not have to
 * maintain the dirty state of lines (no need to distinguish
 * read/write references), and the resulting counts are the
 * same for write-through and write-back caches.
 *
 * Simulator functions:
 *  CacheModelResult cachesim_I1_ref(Addr a, UChar size)
 *  CacheModelResult cachesim_D1_ref(Addr a, UChar size)
 */
__attribute__((always_inline))
static __inline__
CacheResult cachesim_setref(cache_t2* c, UInt set_no, UWord tag)
{
   int i, j;
   UWord *set;

   set = &(c->tags[set_no * c->assoc]);

   /* This loop is unrolled for just the first case, which is the most */
   /* common.  We can't unroll any further because it would screw up   */
   /* if we have a direct-mapped (1-way) cache.                        */
   if (tag == set[0])
      return Hit;

   /* If the tag is one other than the MRU, move it into the MRU spot */
   /* and shuffle the rest down.                                      */
   for (i = 1; i < c->assoc; i++) {
      if (tag == set[i]) {
         for (j = i; j > 0; j--) {
            set[j] = set[j - 1];
         }
         set[0] = tag;
         return Hit;
      }
   }

   /* A miss; install this tag as MRU, shuffle rest down. */
   for (j = c->assoc - 1; j > 0; j--) {
      set[j] = set[j - 1];
   }
   set[0] = tag;
   return Miss;
}
__attribute__((always_inline))
static __inline__
CacheResult cachesim_ref(cache_t2* c, Addr a, UChar size)
{
   UWord block1 =  a         >> c->line_size_bits;
   UWord block2 = (a+size-1) >> c->line_size_bits;
   UInt  set1   = block1 & c->sets_min_1;
   /* the tag does not need to include bits specifying the set,
    * but it can, and this saves instructions */
   UWord tag1   = block1;

   /* Access entirely within line. */
   if (block1 == block2)
      return cachesim_setref(c, set1, tag1);

   /* Access straddles two lines. */
   else if (block1 + 1 == block2) {
      UInt  set2 = block2 & c->sets_min_1;
      UWord tag2 = block2;

      /* the call updates cache structures as side effect */
      CacheResult res1 = cachesim_setref(c, set1, tag1);
      CacheResult res2 = cachesim_setref(c, set2, tag2);
      return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;

   } else {
      VG_(printf)("addr: %lx  size: %u  blocks: %lu %lu",
                  a, size, block1, block2);
      VG_(tool_panic)("item straddles more than two cache sets");
   }
   return Hit;
}
static
CacheModelResult cachesim_I1_ref(Addr a, UChar size)
{
   if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
   if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
   return MemAccess;
}

static
CacheModelResult cachesim_D1_ref(Addr a, UChar size)
{
   if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
   if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
   return MemAccess;
}
/*------------------------------------------------------------*/
/*--- Write Back Cache Simulation                          ---*/
/*------------------------------------------------------------*/

/*
 * More complex model: L1 write-through, LL write-back.
 * This needs to distinguish between read and write references.
 *
 * Simulator functions:
 *  CacheModelResult cachesim_I1_Read(Addr a, UChar size)
 *  CacheModelResult cachesim_D1_Read(Addr a, UChar size)
 *  CacheModelResult cachesim_D1_Write(Addr a, UChar size)
 */

/*
 * With write-back, a result can be a miss evicting a dirty line.
 * The dirty state of a cache line is stored in bit 0 of the tag for
 * this cache line (CACHELINE_DIRTY = 1). By OR'ing the reference
 * type (Read/Write) into the tag, the line gets dirty on a write.
 */
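
/* Example of the tag/flag packing described above (illustrative tag
 * value): because MIN_LINE_SIZE >= 16, the low tag bits are always
 * zero address-wise and can carry flags. For a clean line,
 *   set[i]                    == 0x12340000   (clean)
 *   set[i] | Write            == 0x12340001   (now dirty)
 *   set[i] & ~CACHELINE_DIRTY == 0x12340000   (tag compare)
 * Read == 0 leaves the flag unchanged; Write == CACHELINE_DIRTY sets it.
 */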
__attribute__((always_inline))
static __inline__
CacheResult cachesim_setref_wb(cache_t2* c, RefType ref, UInt set_no, UWord tag)
{
   int i, j;
   UWord *set, tmp_tag;

   set = &(c->tags[set_no * c->assoc]);

   /* This loop is unrolled for just the first case, which is the most */
   /* common.  We can't unroll any further because it would screw up   */
   /* if we have a direct-mapped (1-way) cache.                        */
   if (tag == (set[0] & ~CACHELINE_DIRTY)) {
      set[0] |= ref;
      return Hit;
   }
   /* If the tag is one other than the MRU, move it into the MRU spot */
   /* and shuffle the rest down.                                      */
   for (i = 1; i < c->assoc; i++) {
      if (tag == (set[i] & ~CACHELINE_DIRTY)) {
         tmp_tag = set[i] | ref; // update dirty flag
         for (j = i; j > 0; j--) {
            set[j] = set[j - 1];
         }
         set[0] = tmp_tag;
         return Hit;
      }
   }

   /* A miss; install this tag as MRU, shuffle rest down. */
   tmp_tag = set[c->assoc - 1];
   for (j = c->assoc - 1; j > 0; j--) {
      set[j] = set[j - 1];
   }
   set[0] = tag | ref;

   return (tmp_tag & CACHELINE_DIRTY) ? MissDirty : Miss;
}
__attribute__((always_inline))
static __inline__
CacheResult cachesim_ref_wb(cache_t2* c, RefType ref, Addr a, UChar size)
{
   UInt  set1 = ( a         >> c->line_size_bits) & (c->sets_min_1);
   UInt  set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
   UWord tag  = a & c->tag_mask;

   /* Access entirely within line. */
   if (set1 == set2)
      return cachesim_setref_wb(c, ref, set1, tag);

   /* Access straddles two lines. */
   /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
   else if (((set1 + 1) & (c->sets_min_1)) == set2) {
      UWord tag2 = (a+size-1) & c->tag_mask;

      /* the call updates cache structures as side effect */
      CacheResult res1 = cachesim_setref_wb(c, ref, set1, tag);
      CacheResult res2 = cachesim_setref_wb(c, ref, set2, tag2);

      if ((res1 == MissDirty) || (res2 == MissDirty)) return MissDirty;
      return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;

   } else {
      VG_(printf)("addr: %lx  size: %u  sets: %u %u", a, size, set1, set2);
      VG_(tool_panic)("item straddles more than two cache sets");
   }
   return Hit;
}
static
CacheModelResult cachesim_I1_Read(Addr a, UChar size)
{
   if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
   switch( cachesim_ref_wb( &LL, Read, a, size) ) {
      case Hit:  return LL_Hit;
      case Miss: return MemAccess;
      default: break;
   }
   return WriteBackMemAccess;
}

static
CacheModelResult cachesim_D1_Read(Addr a, UChar size)
{
   if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
   switch( cachesim_ref_wb( &LL, Read, a, size) ) {
      case Hit:  return LL_Hit;
      case Miss: return MemAccess;
      default: break;
   }
   return WriteBackMemAccess;
}

static
CacheModelResult cachesim_D1_Write(Addr a, UChar size)
{
   if ( cachesim_ref( &D1, a, size) == Hit ) {
      /* Even for a L1 hit, the write-through L1 passes
       * the write to the LL to make the LL line dirty.
       * But this causes no latency, so return the hit. */
      cachesim_ref_wb( &LL, Write, a, size);
      return L1_Hit;
   }
   switch( cachesim_ref_wb( &LL, Write, a, size) ) {
      case Hit:  return LL_Hit;
      case Miss: return MemAccess;
      default: break;
   }
   return WriteBackMemAccess;
}
/*------------------------------------------------------------*/
/*--- Hardware Prefetch Simulation                         ---*/
/*------------------------------------------------------------*/

static ULong prefetch_up   = 0;
static ULong prefetch_down = 0;

#define PF_PAGEBITS 12

static UInt pf_lastblock[PF_STREAMS];
static Int  pf_seqblocks[PF_STREAMS];

static
void prefetch_clear(void)
{
   int i;
   for (i = 0; i < PF_STREAMS; i++)
      pf_lastblock[i] = pf_seqblocks[i] = 0;
}

/*
 * HW prefetch emulation:
 * Start prefetching when sequential access to 3 memory blocks is detected.
 * One stream can be detected per 4k page.
 */
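
/* Example trace of the detection logic below (one stream; blocks are
 * LL-line-sized, the block numbers are made up): accesses to blocks
 * 100, 101, 102, 103 give
 *   100 -> lastblock = 100, seqblocks = 0
 *   101 -> +1 step, seqblocks = 1
 *   102 -> +1 step, seqblocks = 2  -> >= 2: prefetch 5 blocks ahead
 *   103 -> +1 step, seqblocks = 3  -> prefetch 5 blocks ahead again
 * A non-sequential access resets seqblocks to 0 for that stream.
 */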
static __inline__
void prefetch_LL_doref(Addr a)
{
   UInt stream = (a >> PF_PAGEBITS) % PF_STREAMS;
   UInt block  = ( a >> LL.line_size_bits);

   if (block != pf_lastblock[stream]) {
      if (pf_seqblocks[stream] == 0) {
         if (pf_lastblock[stream] +1 == block) pf_seqblocks[stream]++;
         else if (pf_lastblock[stream] -1 == block) pf_seqblocks[stream]--;
      }
      else if (pf_seqblocks[stream] > 0) {
         if (pf_lastblock[stream] +1 == block) {
            pf_seqblocks[stream]++;
            if (pf_seqblocks[stream] >= 2) {
               prefetch_up++;
               cachesim_ref(&LL, a + 5 * LL.line_size, 1);
            }
         }
         else pf_seqblocks[stream] = 0;
      }
      else if (pf_seqblocks[stream] < 0) {
         if (pf_lastblock[stream] -1 == block) {
            pf_seqblocks[stream]--;
            if (pf_seqblocks[stream] <= -2) {
               prefetch_down++;
               cachesim_ref(&LL, a - 5 * LL.line_size, 1);
            }
         }
         else pf_seqblocks[stream] = 0;
      }
      pf_lastblock[stream] = block;
   }
}
/* simple model with hardware prefetch */

static
CacheModelResult prefetch_I1_ref(Addr a, UChar size)
{
   if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
   prefetch_LL_doref(a);
   if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
   return MemAccess;
}

static
CacheModelResult prefetch_D1_ref(Addr a, UChar size)
{
   if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
   prefetch_LL_doref(a);
   if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
   return MemAccess;
}
/* complex model with hardware prefetch */

static
CacheModelResult prefetch_I1_Read(Addr a, UChar size)
{
   if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
   prefetch_LL_doref(a);
   switch( cachesim_ref_wb( &LL, Read, a, size) ) {
      case Hit:  return LL_Hit;
      case Miss: return MemAccess;
      default: break;
   }
   return WriteBackMemAccess;
}

static
CacheModelResult prefetch_D1_Read(Addr a, UChar size)
{
   if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
   prefetch_LL_doref(a);
   switch( cachesim_ref_wb( &LL, Read, a, size) ) {
      case Hit:  return LL_Hit;
      case Miss: return MemAccess;
      default: break;
   }
   return WriteBackMemAccess;
}

static
CacheModelResult prefetch_D1_Write(Addr a, UChar size)
{
   prefetch_LL_doref(a);
   if ( cachesim_ref( &D1, a, size) == Hit ) {
      /* Even for a L1 hit, the write-through L1 passes
       * the write to the LL to make the LL line dirty.
       * But this causes no latency, so return the hit. */
      cachesim_ref_wb( &LL, Write, a, size);
      return L1_Hit;
   }
   switch( cachesim_ref_wb( &LL, Write, a, size) ) {
      case Hit:  return LL_Hit;
      case Miss: return MemAccess;
      default: break;
   }
   return WriteBackMemAccess;
}
/*------------------------------------------------------------*/
/*--- Cache Simulation with use metric collection          ---*/
/*------------------------------------------------------------*/

/* can not be combined with write-back or prefetch */

static
void cacheuse_initcache(cache_t2* c)
{
   int i;
   unsigned int start_mask, start_val;
   unsigned int end_mask, end_val;

   c->use    = CLG_MALLOC("cl.sim.cu_ic.1",
                          sizeof(line_use) * c->sets * c->assoc);
   c->loaded = CLG_MALLOC("cl.sim.cu_ic.2",
                          sizeof(line_loaded) * c->sets * c->assoc);
   c->line_start_mask = CLG_MALLOC("cl.sim.cu_ic.3",
                                   sizeof(int) * c->line_size);
   c->line_end_mask = CLG_MALLOC("cl.sim.cu_ic.4",
                                 sizeof(int) * c->line_size);

   c->line_size_mask = c->line_size - 1;

   /* Meaning of line_start_mask/line_end_mask:
    * Example: for a given cache line, you get an access starting at
    * byte offset 5 with length 4, i.e. bytes 5 - 8 are touched. For
    * a cache line size of 32, you have 1 bit per byte in the mask:
    *
    *   bit31   bit8 bit5  bit 0
    *       |      |  |    |
    *       11..111111100000   line_start_mask[5]
    *       00..000111111111   line_end_mask[(5+4)-1]
    *
    *  use_mask |= line_start_mask[5] & line_end_mask[8]
    */
   start_val = end_val = ~0;
   if (c->line_size < 32) {
      int bits_per_byte = 32 / c->line_size;
      start_mask = (1 << bits_per_byte) - 1;
      end_mask   = start_mask << (32 - bits_per_byte);
      for (i = 0; i < c->line_size; i++) {
         c->line_start_mask[i] = start_val;
         start_val  = start_val & ~start_mask;
         start_mask = start_mask << bits_per_byte;

         c->line_end_mask[c->line_size - i - 1] = end_val;
         end_val  = end_val & ~end_mask;
         end_mask = end_mask >> bits_per_byte;
      }
   }
   else {
      int bytes_per_bit = c->line_size / 32;
      start_mask = 1;
      end_mask   = 1u << 31;
      for (i = 0; i < c->line_size; i++) {
         c->line_start_mask[i] = start_val;
         c->line_end_mask[c->line_size - i - 1] = end_val;
         if ( ((i+1) % bytes_per_bit) == 0) {
            start_val &= ~start_mask;
            end_val   &= ~end_mask;
            start_mask <<= 1;
            end_mask   >>= 1;
         }
      }
   }

   CLG_DEBUG(6, "Config %s:\n", c->desc_line);
   for (i = 0; i < c->line_size; i++) {
      CLG_DEBUG(6, " [%2d]: start mask %8x, end mask %8x\n",
                i, (UInt)c->line_start_mask[i], (UInt)c->line_end_mask[i]);
   }

   /* We use lower tag bits as offset pointers to cache use info.
    * I.e. some cache parameters don't work.
    */
   if ( (1 << c->tag_shift) < c->assoc) {
      VG_(message)(Vg_DebugMsg,
                   "error: Use associativity < %d for cache use statistics!\n",
                   (1 << c->tag_shift) );
      VG_(tool_panic)("Unsupported cache configuration");
   }
}
/* for I1/D1 caches */
#define CACHEUSE(L) \
static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \
   UInt set1 = ( a         >> L.line_size_bits) & (L.sets_min_1); \
   UInt set2 = ((a+size-1) >> L.line_size_bits) & (L.sets_min_1); \
   UWord tag = a & L.tag_mask; \
   UWord *set, tmp_tag; \
   CLG_DEBUG(6,"%s.Acc(Addr %#lx, size %d): Sets [%u/%u]\n", \
             L.name, a, size, set1, set2); \
   /* First case: word entirely within line. */ \
   if (set1 == set2) { \
      set = &(L.tags[set1 * L.assoc]); \
      use_mask = L.line_start_mask[a & L.line_size_mask] & \
                 L.line_end_mask[(a+size-1) & L.line_size_mask]; \
      /* This loop is unrolled for just the first case, which is the most */\
      /* common.  We can't unroll any further because it would screw up   */\
      /* if we have a direct-mapped (1-way) cache.                        */\
      if (tag == (set[0] & L.tag_mask)) { \
         idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask); \
         L.use[idx].count ++; \
         L.use[idx].mask |= use_mask; \
         CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\
                   idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
                   use_mask, L.use[idx].mask, L.use[idx].count); \
      /* If the tag is one other than the MRU, move it into the MRU spot */\
      /* and shuffle the rest down.                                      */\
      for (i = 1; i < L.assoc; i++) { \
         if (tag == (set[i] & L.tag_mask)) { \
            for (j = i; j > 0; j--) { \
               set[j] = set[j - 1]; \
            idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask); \
            L.use[idx].count ++; \
            L.use[idx].mask |= use_mask; \
            CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\
                      i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
                      use_mask, L.use[idx].mask, L.use[idx].count); \
      /* A miss; install this tag as MRU, shuffle rest down. */ \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \
      for (j = L.assoc - 1; j > 0; j--) { \
         set[j] = set[j - 1]; \
      set[0] = tag | tmp_tag; \
      idx = (set1 * L.assoc) + tmp_tag; \
      return update_##L##_use(&L, idx, \
                              use_mask, a &~ L.line_size_mask); \
   /* Second case: word straddles two lines. */ \
   /* Nb: this is a fast way of doing ((set1+1) % L.sets) */ \
   } else if (((set1 + 1) & (L.sets_min_1)) == set2) { \
      Int miss1=0, miss2=0; /* 0: L1 hit, 1: L1 miss, 2: LL miss */ \
      set = &(L.tags[set1 * L.assoc]); \
      use_mask = L.line_start_mask[a & L.line_size_mask]; \
      if (tag == (set[0] & L.tag_mask)) { \
         idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask); \
         L.use[idx].count ++; \
         L.use[idx].mask |= use_mask; \
         CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\
                   idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
                   use_mask, L.use[idx].mask, L.use[idx].count); \
      for (i = 1; i < L.assoc; i++) { \
         if (tag == (set[i] & L.tag_mask)) { \
            for (j = i; j > 0; j--) { \
               set[j] = set[j - 1]; \
            idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask); \
            L.use[idx].count ++; \
            L.use[idx].mask |= use_mask; \
            CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\
                      i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
                      use_mask, L.use[idx].mask, L.use[idx].count); \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \
      for (j = L.assoc - 1; j > 0; j--) { \
         set[j] = set[j - 1]; \
      set[0] = tag | tmp_tag; \
      idx = (set1 * L.assoc) + tmp_tag; \
      miss1 = update_##L##_use(&L, idx, \
                               use_mask, a &~ L.line_size_mask); \
      set = &(L.tags[set2 * L.assoc]); \
      use_mask = L.line_end_mask[(a+size-1) & L.line_size_mask]; \
      tag2 = (a+size-1) & L.tag_mask; \
      if (tag2 == (set[0] & L.tag_mask)) { \
         idx = (set2 * L.assoc) + (set[0] & ~L.tag_mask); \
         L.use[idx].count ++; \
         L.use[idx].mask |= use_mask; \
         CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\
                   idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
                   use_mask, L.use[idx].mask, L.use[idx].count); \
      for (i = 1; i < L.assoc; i++) { \
         if (tag2 == (set[i] & L.tag_mask)) { \
            for (j = i; j > 0; j--) { \
               set[j] = set[j - 1]; \
            idx = (set2 * L.assoc) + (tmp_tag & ~L.tag_mask); \
            L.use[idx].count ++; \
            L.use[idx].mask |= use_mask; \
            CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\
                      i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
                      use_mask, L.use[idx].mask, L.use[idx].count); \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \
      for (j = L.assoc - 1; j > 0; j--) { \
         set[j] = set[j - 1]; \
      set[0] = tag2 | tmp_tag; \
      idx = (set2 * L.assoc) + tmp_tag; \
      miss2 = update_##L##_use(&L, idx, \
                               use_mask, (a+size-1) &~ L.line_size_mask); \
      return (miss1==MemAccess || miss2==MemAccess) ? MemAccess : LL_Hit; \
   VG_(printf)("addr: %#lx  size: %u  sets: %u %u", a, size, set1, set2); \
   VG_(tool_panic)("item straddles more than two cache sets"); \
/* logarithmic bitcounting algorithm, see
 * http://graphics.stanford.edu/~seander/bithacks.html
 */
static __inline__ unsigned int countBits(unsigned int bits)
{
   unsigned int c; // store the total here
   const int S[] = {1, 2, 4, 8, 16}; // Magic Binary Numbers
   const int B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF};

   c = bits;
   c = ((c >> S[0]) & B[0]) + (c & B[0]);
   c = ((c >> S[1]) & B[1]) + (c & B[1]);
   c = ((c >> S[2]) & B[2]) + (c & B[2]);
   c = ((c >> S[3]) & B[3]) + (c & B[3]);
   c = ((c >> S[4]) & B[4]) + (c & B[4]);
   return c;
}
static void update_LL_use(int idx, Addr memline)
{
   line_loaded* loaded = &(LL.loaded[idx]);
   line_use* use = &(LL.use[idx]);
   int i = ((32 - countBits(use->mask)) * LL.line_size) >> 5;

   CLG_DEBUG(2, " LL.miss [%d]: at %#lx accessing memline %#lx\n",
             idx, CLG_(bb_base) + current_ii->instr_offset, memline);
   if (use->count > 0) {
      CLG_DEBUG(2, "   old: used %u, loss bits %d (%08x) [line %#lx from %#lx]\n",
                use->count, i, use->mask, loaded->memline, loaded->iaddr);
      CLG_DEBUG(2, "   collect: %d, use_base %p\n",
                CLG_(current_state).collect, loaded->use_base);

      if (CLG_(current_state).collect && loaded->use_base) {
         (loaded->use_base)[off_LL_AcCost] += 1000 / use->count;
         (loaded->use_base)[off_LL_SpLoss] += i;
      }
   }

   use->count = 0;
   use->mask  = 0;

   loaded->memline = memline;
   loaded->iaddr   = CLG_(bb_base) + current_ii->instr_offset;
   loaded->use_base = (CLG_(current_state).nonskipped) ?
                      CLG_(current_state).nonskipped->skipped :
                      CLG_(cost_base) + current_ii->cost_offset;
}
static
CacheModelResult cacheuse_LL_access(Addr memline, line_loaded* l1_loaded)
{
   UInt setNo = (memline >> LL.line_size_bits) & (LL.sets_min_1);
   UWord* set = &(LL.tags[setNo * LL.assoc]);
   UWord tag  = memline & LL.tag_mask;

   int i, j, idx;
   UWord tmp_tag;

   CLG_DEBUG(6,"LL.Acc(Memline %#lx): Set %u\n", memline, setNo);

   if (tag == (set[0] & LL.tag_mask)) {
      idx = (setNo * LL.assoc) + (set[0] & ~LL.tag_mask);
      l1_loaded->dep_use = &(LL.use[idx]);

      CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): => %08x, count %u\n",
                idx, LL.loaded[idx].memline, LL.loaded[idx].iaddr,
                LL.use[idx].mask, LL.use[idx].count);
      return LL_Hit;
   }
   for (i = 1; i < LL.assoc; i++) {
      if (tag == (set[i] & LL.tag_mask)) {
         tmp_tag = set[i];
         for (j = i; j > 0; j--) {
            set[j] = set[j - 1];
         }
         set[0] = tmp_tag;
         idx = (setNo * LL.assoc) + (tmp_tag & ~LL.tag_mask);
         l1_loaded->dep_use = &(LL.use[idx]);

         CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): => %08x, count %u\n",
                   i, idx, LL.loaded[idx].memline, LL.loaded[idx].iaddr,
                   LL.use[idx].mask, LL.use[idx].count);
         return LL_Hit;
      }
   }

   /* A miss; install this tag as MRU, shuffle rest down. */
   tmp_tag = set[LL.assoc - 1] & ~LL.tag_mask;
   for (j = LL.assoc - 1; j > 0; j--) {
      set[j] = set[j - 1];
   }
   set[0] = tag | tmp_tag;
   idx = (setNo * LL.assoc) + tmp_tag;
   l1_loaded->dep_use = &(LL.use[idx]);

   update_LL_use(idx, memline);

   return MemAccess;
}
#define UPDATE_USE(L)                                                         \
                                                                              \
static CacheModelResult update##_##L##_use(cache_t2* cache, int idx,          \
                                           UInt mask, Addr memline)           \
{                                                                             \
   line_loaded* loaded = &(cache->loaded[idx]);                               \
   line_use* use = &(cache->use[idx]);                                        \
   int c = ((32 - countBits(use->mask)) * cache->line_size) >> 5;             \
                                                                              \
   CLG_DEBUG(2, " %s.miss [%d]: at %#lx accessing memline %#lx (mask %08x)\n", \
             cache->name, idx, CLG_(bb_base) + current_ii->instr_offset, memline, mask); \
   if (use->count > 0) {                                                      \
      CLG_DEBUG(2, "   old: used %u, loss bits %d (%08x) [line %#lx from %#lx]\n", \
                use->count, c, use->mask, loaded->memline, loaded->iaddr);    \
      CLG_DEBUG(2, "   collect: %d, use_base %p\n",                           \
                CLG_(current_state).collect, loaded->use_base);               \
                                                                              \
      if (CLG_(current_state).collect && loaded->use_base) {                  \
         (loaded->use_base)[off_##L##_AcCost] += 1000 / use->count;           \
         (loaded->use_base)[off_##L##_SpLoss] += c;                           \
                                                                              \
         /* FIXME (?): L1/LL line sizes must be equal! */                     \
         loaded->dep_use->mask  |= use->mask;                                 \
         loaded->dep_use->count += use->count;                                \
      }                                                                       \
   }                                                                          \
                                                                              \
   use->count = 1;                                                            \
   use->mask  = mask;                                                         \
   loaded->memline = memline;                                                 \
   loaded->iaddr   = CLG_(bb_base) + current_ii->instr_offset;                \
   loaded->use_base = (CLG_(current_state).nonskipped) ?                      \
                      CLG_(current_state).nonskipped->skipped :               \
                      CLG_(cost_base) + current_ii->cost_offset;              \
                                                                              \
   if (memline == 0) return LL_Hit;                                           \
   return cacheuse_LL_access(memline, loaded);                                \
}

UPDATE_USE(I1);
UPDATE_USE(D1);

CACHEUSE(I1);
CACHEUSE(D1);
static
void cacheuse_finish(void)
{
   int i;
   InstrInfo ii = { 0,0,0,0 };

   if (!CLG_(current_state).collect) return;

   current_ii = &ii; /* needs to be set for update_XX_use */

   /* update usage counters */
   for (i = 0; i < I1.sets * I1.assoc; i++)
      if (I1.loaded[i].use_base)
         update_I1_use( &I1, i, 0, 0);

   for (i = 0; i < D1.sets * D1.assoc; i++)
      if (D1.loaded[i].use_base)
         update_D1_use( &D1, i, 0, 0);

   for (i = 0; i < LL.sets * LL.assoc; i++)
      if (LL.loaded[i].use_base)
         update_LL_use(i, 0);

   current_ii = 0;
}
/*------------------------------------------------------------*/
/*--- Helper functions called by instrumented code         ---*/
/*------------------------------------------------------------*/

static __inline__
void inc_costs(CacheModelResult r, ULong* c1, ULong* c2)
{
   switch (r) {
      case WriteBackMemAccess:
         if (clo_simulate_writeback) {
            c1[3]++;
            c2[3]++;
         }
         /* fall through */

static
const HChar* cacheRes(CacheModelResult r)
{
   switch (r) {
      case L1_Hit:             return "L1 Hit ";
      case LL_Hit:             return "LL Hit ";
      case MemAccess:          return "LL Miss";
      case WriteBackMemAccess: return "LL Miss (dirty)";
   }
static void log_1I0D(InstrInfo* ii)
{
   CacheModelResult IrRes;

   current_ii = ii;
   IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);

   CLG_DEBUG(6, "log_1I0D:  Ir  %#lx/%u => %s\n",
             CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes));

   if (CLG_(current_state).collect) {
      ULong* cost_Ir;

      if (CLG_(current_state).nonskipped)
         cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
      else
         cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];

      inc_costs(IrRes, cost_Ir,
                CLG_(current_state).cost + fullOffset(EG_IR) );
   }
}
static void log_2I0D(InstrInfo* ii1, InstrInfo* ii2)
{
   CacheModelResult Ir1Res, Ir2Res;
   ULong *global_cost_Ir;

   current_ii = ii1;
   Ir1Res = (*simulator.I1_Read)(CLG_(bb_base) + ii1->instr_offset, ii1->instr_size);
   current_ii = ii2;
   Ir2Res = (*simulator.I1_Read)(CLG_(bb_base) + ii2->instr_offset, ii2->instr_size);

   CLG_DEBUG(6, "log_2I0D: Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s\n",
             CLG_(bb_base) + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res),
             CLG_(bb_base) + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res) );

   if (!CLG_(current_state).collect) return;

   global_cost_Ir = CLG_(current_state).cost + fullOffset(EG_IR);
   if (CLG_(current_state).nonskipped) {
      ULong* skipped_cost_Ir =
         CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);

      inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir);
      inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir);
   }
   else {
      inc_costs(Ir1Res, global_cost_Ir,
                CLG_(cost_base) + ii1->cost_offset + ii1->eventset->offset[EG_IR]);
      inc_costs(Ir2Res, global_cost_Ir,
                CLG_(cost_base) + ii2->cost_offset + ii2->eventset->offset[EG_IR]);
   }
}
static void log_3I0D(InstrInfo* ii1, InstrInfo* ii2, InstrInfo* ii3)
{
   CacheModelResult Ir1Res, Ir2Res, Ir3Res;
   ULong *global_cost_Ir;

   current_ii = ii1;
   Ir1Res = (*simulator.I1_Read)(CLG_(bb_base) + ii1->instr_offset, ii1->instr_size);
   current_ii = ii2;
   Ir2Res = (*simulator.I1_Read)(CLG_(bb_base) + ii2->instr_offset, ii2->instr_size);
   current_ii = ii3;
   Ir3Res = (*simulator.I1_Read)(CLG_(bb_base) + ii3->instr_offset, ii3->instr_size);

   CLG_DEBUG(6, "log_3I0D: Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s, Ir3 %#lx/%u => %s\n",
             CLG_(bb_base) + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res),
             CLG_(bb_base) + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res),
             CLG_(bb_base) + ii3->instr_offset, ii3->instr_size, cacheRes(Ir3Res) );

   if (!CLG_(current_state).collect) return;

   global_cost_Ir = CLG_(current_state).cost + fullOffset(EG_IR);
   if (CLG_(current_state).nonskipped) {
      ULong* skipped_cost_Ir =
         CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
      inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir);
      inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir);
      inc_costs(Ir3Res, global_cost_Ir, skipped_cost_Ir);
   }
   else {
      inc_costs(Ir1Res, global_cost_Ir,
                CLG_(cost_base) + ii1->cost_offset + ii1->eventset->offset[EG_IR]);
      inc_costs(Ir2Res, global_cost_Ir,
                CLG_(cost_base) + ii2->cost_offset + ii2->eventset->offset[EG_IR]);
      inc_costs(Ir3Res, global_cost_Ir,
                CLG_(cost_base) + ii3->cost_offset + ii3->eventset->offset[EG_IR]);
   }
}
/* Instruction doing a read access */

static void log_1I1Dr(InstrInfo* ii, Addr data_addr, Word data_size)
{
   CacheModelResult IrRes, DrRes;

   current_ii = ii;
   IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);
   DrRes = (*simulator.D1_Read)(data_addr, data_size);

   CLG_DEBUG(6, "log_1I1Dr: Ir  %#lx/%u => %s, Dr  %#lx/%ld => %s\n",
             CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes),
             data_addr, data_size, cacheRes(DrRes));

   if (CLG_(current_state).collect) {
      ULong *cost_Ir, *cost_Dr;

      if (CLG_(current_state).nonskipped) {
         cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
         cost_Dr = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DR);
      }
      else {
         cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];
         cost_Dr = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DR];
      }

      inc_costs(IrRes, cost_Ir,
                CLG_(current_state).cost + fullOffset(EG_IR) );
      inc_costs(DrRes, cost_Dr,
                CLG_(current_state).cost + fullOffset(EG_DR) );
   }
}
/* Note that addEvent_D_guarded assumes that log_0I1Dr and log_0I1Dw
   have exactly the same prototype.  If you change them, you must
   change addEvent_D_guarded too. */

static void log_0I1Dr(InstrInfo* ii, Addr data_addr, Word data_size)
{
   CacheModelResult DrRes;

   current_ii = ii;
   DrRes = (*simulator.D1_Read)(data_addr, data_size);

   CLG_DEBUG(6, "log_0I1Dr: Dr  %#lx/%ld => %s\n",
             data_addr, data_size, cacheRes(DrRes));

   if (CLG_(current_state).collect) {
      ULong *cost_Dr;

      if (CLG_(current_state).nonskipped)
         cost_Dr = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DR);
      else
         cost_Dr = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DR];

      inc_costs(DrRes, cost_Dr,
                CLG_(current_state).cost + fullOffset(EG_DR) );
   }
}
/* Instruction doing a write access */

static void log_1I1Dw(InstrInfo* ii, Addr data_addr, Word data_size)
{
   CacheModelResult IrRes, DwRes;

   current_ii = ii;
   IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);
   DwRes = (*simulator.D1_Write)(data_addr, data_size);

   CLG_DEBUG(6, "log_1I1Dw: Ir  %#lx/%u => %s, Dw  %#lx/%ld => %s\n",
             CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes),
             data_addr, data_size, cacheRes(DwRes));

   if (CLG_(current_state).collect) {
      ULong *cost_Ir, *cost_Dw;

      if (CLG_(current_state).nonskipped) {
         cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
         cost_Dw = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DW);
      }
      else {
         cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];
         cost_Dw = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DW];
      }

      inc_costs(IrRes, cost_Ir,
                CLG_(current_state).cost + fullOffset(EG_IR) );
      inc_costs(DwRes, cost_Dw,
                CLG_(current_state).cost + fullOffset(EG_DW) );
   }
}
/* See comment on log_0I1Dr. */

static void log_0I1Dw(InstrInfo* ii, Addr data_addr, Word data_size)
{
   CacheModelResult DwRes;

   current_ii = ii;
   DwRes = (*simulator.D1_Write)(data_addr, data_size);

   CLG_DEBUG(6, "log_0I1Dw: Dw  %#lx/%ld => %s\n",
             data_addr, data_size, cacheRes(DwRes));

   if (CLG_(current_state).collect) {
      ULong *cost_Dw;

      if (CLG_(current_state).nonskipped)
         cost_Dw = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DW);
      else
         cost_Dw = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DW];

      inc_costs(DwRes, cost_Dw,
                CLG_(current_state).cost + fullOffset(EG_DW) );
   }
}
/*------------------------------------------------------------*/
/*--- Cache configuration                                  ---*/
/*------------------------------------------------------------*/

static cache_t clo_I1_cache = UNDEFINED_CACHE;
static cache_t clo_D1_cache = UNDEFINED_CACHE;
static cache_t clo_LL_cache = UNDEFINED_CACHE;

/* Initialize and clear simulator state */
static void cachesim_post_clo_init(void)
{
   /* Cache configurations. */
   cache_t I1c, D1c, LLc;
   /* Initialize access handlers */
   if (!CLG_(clo).simulate_cache) {
      CLG_(cachesim).log_1I0D       = 0;
      CLG_(cachesim).log_1I0D_name  = "(no function)";
      CLG_(cachesim).log_2I0D       = 0;
      CLG_(cachesim).log_2I0D_name  = "(no function)";
      CLG_(cachesim).log_3I0D       = 0;
      CLG_(cachesim).log_3I0D_name  = "(no function)";

      CLG_(cachesim).log_1I1Dr      = 0;
      CLG_(cachesim).log_1I1Dr_name = "(no function)";
      CLG_(cachesim).log_1I1Dw      = 0;
      CLG_(cachesim).log_1I1Dw_name = "(no function)";

      CLG_(cachesim).log_0I1Dr      = 0;
      CLG_(cachesim).log_0I1Dr_name = "(no function)";
      CLG_(cachesim).log_0I1Dw      = 0;
      CLG_(cachesim).log_0I1Dw_name = "(no function)";
      return;
   }
   /* Configuration of caches is only needed with real cache simulation */
   VG_(post_clo_init_configure_caches)(&I1c, &D1c, &LLc,
                                       &clo_I1_cache, &clo_D1_cache, &clo_LL_cache);

   // min_line_size is used to make sure that we never feed
   // accesses to the simulator straddling more than two
   // cache lines at any cache level
   CLG_(min_line_size) = (I1c.line_size < D1c.line_size)
                         ? I1c.line_size : D1c.line_size;
   CLG_(min_line_size) = (LLc.line_size < CLG_(min_line_size))
                         ? LLc.line_size : CLG_(min_line_size);

   Int largest_load_or_store_size
      = VG_(machine_get_size_of_largest_guest_register)();
   if (CLG_(min_line_size) < largest_load_or_store_size) {
      /* We can't continue, because the cache simulation might
         straddle more than 2 lines, and it will assert.  So let's
         just stop before we start. */
      VG_(umsg)("Callgrind: cannot continue: the minimum line size (%d)\n",
                (Int)CLG_(min_line_size));
      VG_(umsg)("  must be equal to or larger than the maximum register size (%d)\n",
                largest_load_or_store_size );
      VG_(umsg)("  but it is not.  Exiting now.\n");
      VG_(exit)(1);
   }
1350 /* the other cache simulators use the standard helpers
1351 * with dispatching via simulator struct */
1353 CLG_(cachesim
).log_1I0D
= log_1I0D
;
1354 CLG_(cachesim
).log_1I0D_name
= "log_1I0D";
1355 CLG_(cachesim
).log_2I0D
= log_2I0D
;
1356 CLG_(cachesim
).log_2I0D_name
= "log_2I0D";
1357 CLG_(cachesim
).log_3I0D
= log_3I0D
;
1358 CLG_(cachesim
).log_3I0D_name
= "log_3I0D";
1360 CLG_(cachesim
).log_1I1Dr
= log_1I1Dr
;
1361 CLG_(cachesim
).log_1I1Dw
= log_1I1Dw
;
1362 CLG_(cachesim
).log_1I1Dr_name
= "log_1I1Dr";
1363 CLG_(cachesim
).log_1I1Dw_name
= "log_1I1Dw";
1365 CLG_(cachesim
).log_0I1Dr
= log_0I1Dr
;
1366 CLG_(cachesim
).log_0I1Dw
= log_0I1Dw
;
1367 CLG_(cachesim
).log_0I1Dr_name
= "log_0I1Dr";
1368 CLG_(cachesim
).log_0I1Dw_name
= "log_0I1Dw";
   if (clo_collect_cacheuse) {

      /* Output warning for not supported option combinations */
      if (clo_simulate_hwpref) {
         VG_(message)(Vg_DebugMsg,
                      "warning: prefetch simulation can not be "
                      "used with cache usage\n");
         clo_simulate_hwpref = False;
      }

      if (clo_simulate_writeback) {
         VG_(message)(Vg_DebugMsg,
                      "warning: write-back simulation can not be "
                      "used with cache usage\n");
         clo_simulate_writeback = False;
      }

      simulator.I1_Read  = cacheuse_I1_doRead;
      simulator.D1_Read  = cacheuse_D1_doRead;
      simulator.D1_Write = cacheuse_D1_doRead;
      return;
   }

   if (clo_simulate_hwpref) {
      prefetch_clear();

      if (clo_simulate_writeback) {
         simulator.I1_Read  = prefetch_I1_Read;
         simulator.D1_Read  = prefetch_D1_Read;
         simulator.D1_Write = prefetch_D1_Write;
      }
      else {
         simulator.I1_Read  = prefetch_I1_ref;
         simulator.D1_Read  = prefetch_D1_ref;
         simulator.D1_Write = prefetch_D1_ref;
      }
      return;
   }

   if (clo_simulate_writeback) {
      simulator.I1_Read  = cachesim_I1_Read;
      simulator.D1_Read  = cachesim_D1_Read;
      simulator.D1_Write = cachesim_D1_Write;
   }
   else {
      simulator.I1_Read  = cachesim_I1_ref;
      simulator.D1_Read  = cachesim_D1_ref;
      simulator.D1_Write = cachesim_D1_ref;
   }
}
/* Clear simulator state. Has to be initialized before. */
static
void cachesim_clear(void)
{
   cachesim_clearcache(&I1);
   cachesim_clearcache(&D1);
   cachesim_clearcache(&LL);

   prefetch_clear();
}

static void cachesim_dump_desc(VgFile *fp)
{
   VG_(fprintf)(fp, "\ndesc: I1 cache: %s\n", I1.desc_line);
   VG_(fprintf)(fp, "desc: D1 cache: %s\n", D1.desc_line);
   VG_(fprintf)(fp, "desc: LL cache: %s\n", LL.desc_line);
}
static
void cachesim_print_opts(void)
{
   VG_(printf)(
"\n   cache simulator options (does cache simulation if used):\n"
"    --simulate-wb=no|yes      Count write-back events [no]\n"
"    --simulate-hwpref=no|yes  Simulate hardware prefetch [no]\n"
#if CLG_EXPERIMENTAL
"    --simulate-sectors=no|yes Simulate sectored behaviour [no]\n"
#endif
"    --cacheuse=no|yes         Collect cache block use [no]\n");
   VG_(print_cache_clo_opts)();
}
/* Check for command line option for cache configuration.
 * Return False if unknown and not handled.
 *
 * Called from CLG_(process_cmd_line_option)() in clo.c
 */
static Bool cachesim_parse_opt(const HChar* arg)
{
   if      VG_BOOL_CLO(arg, "--simulate-wb",      clo_simulate_writeback) {}
   else if VG_BOOL_CLO(arg, "--simulate-hwpref",  clo_simulate_hwpref)    {}
   else if VG_BOOL_CLO(arg, "--simulate-sectors", clo_simulate_sectors)   {}

   else if VG_BOOL_CLO(arg, "--cacheuse", clo_collect_cacheuse) {
      if (clo_collect_cacheuse) {
         /* Use counters only make sense with fine-grained dumping */
         CLG_(clo).dump_instr = True;
      }
   }

   else if (VG_(str_clo_cache_opt)(arg,
                                   &clo_I1_cache, &clo_D1_cache, &clo_LL_cache)) {}

   else
      return False;

   return True;
}
static
void cachesim_printstat(Int l1, Int l2, Int l3)
{
   FullCost total = CLG_(total_cost), D_total = 0;
   ULong LL_total_m, LL_total_mr, LL_total_mw,
         LL_total, LL_total_r, LL_total_w;

   if ((VG_(clo_verbosity) > 1) && clo_simulate_hwpref) {
      VG_(message)(Vg_DebugMsg, "Prefetch Up:   %llu\n",
                   prefetch_up);
      VG_(message)(Vg_DebugMsg, "Prefetch Down: %llu\n",
                   prefetch_down);
      VG_(message)(Vg_DebugMsg, "\n");
   }

   VG_(message)(Vg_UserMsg, "I1  misses:    %'*llu\n", l1,
                total[fullOffset(EG_IR) +1]);

   VG_(message)(Vg_UserMsg, "LLi misses:    %'*llu\n", l1,
                total[fullOffset(EG_IR) +2]);

   if (0 == total[fullOffset(EG_IR)])
      total[fullOffset(EG_IR)] = 1;

   VG_(message)(Vg_UserMsg, "I1  miss rate: %*.2f%%\n", l1,
                total[fullOffset(EG_IR)+1] * 100.0 / total[fullOffset(EG_IR)]);

   VG_(message)(Vg_UserMsg, "LLi miss rate: %*.2f%%\n", l1,
                total[fullOffset(EG_IR)+2] * 100.0 / total[fullOffset(EG_IR)]);

   VG_(message)(Vg_UserMsg, "\n");
   /* D cache results.
      Use the D_refs.rd and D_refs.wr values to determine the
      width of columns 2 & 3. */

   D_total = CLG_(get_eventset_cost)( CLG_(sets).full );
   CLG_(init_cost)( CLG_(sets).full, D_total);
   // we only use the first 3 values of D_total, adding up Dr and Dw costs
   CLG_(copy_cost)( CLG_(get_event_set)(EG_DR), D_total, total + fullOffset(EG_DR) );
   CLG_(add_cost) ( CLG_(get_event_set)(EG_DW), D_total, total + fullOffset(EG_DW) );

   VG_(message)(Vg_UserMsg, "D   refs:      %'*llu  (%'*llu rd + %'*llu wr)\n",
                l1, D_total[0],
                l2, total[fullOffset(EG_DR)],
                l3, total[fullOffset(EG_DW)]);

   VG_(message)(Vg_UserMsg, "D1  misses:    %'*llu  (%'*llu rd + %'*llu wr)\n",
                l1, D_total[1],
                l2, total[fullOffset(EG_DR)+1],
                l3, total[fullOffset(EG_DW)+1]);

   VG_(message)(Vg_UserMsg, "LLd misses:    %'*llu  (%'*llu rd + %'*llu wr)\n",
                l1, D_total[2],
                l2, total[fullOffset(EG_DR)+2],
                l3, total[fullOffset(EG_DW)+2]);

   if (0 == D_total[0])               D_total[0] = 1;
   if (0 == total[fullOffset(EG_DR)]) total[fullOffset(EG_DR)] = 1;
   if (0 == total[fullOffset(EG_DW)]) total[fullOffset(EG_DW)] = 1;

   VG_(message)(Vg_UserMsg, "D1  miss rate: %*.1f%% (%*.1f%%   + %*.1f%%  )\n",
                l1, D_total[1] * 100.0 / D_total[0],
                l2, total[fullOffset(EG_DR)+1] * 100.0 / total[fullOffset(EG_DR)],
                l3, total[fullOffset(EG_DW)+1] * 100.0 / total[fullOffset(EG_DW)]);

   VG_(message)(Vg_UserMsg, "LLd miss rate: %*.1f%% (%*.1f%%   + %*.1f%%  )\n",
                l1, D_total[2] * 100.0 / D_total[0],
                l2, total[fullOffset(EG_DR)+2] * 100.0 / total[fullOffset(EG_DR)],
                l3, total[fullOffset(EG_DW)+2] * 100.0 / total[fullOffset(EG_DW)]);
   VG_(message)(Vg_UserMsg, "\n");
   /* LL overall results */

   LL_total =
      total[fullOffset(EG_DR) +1] +
      total[fullOffset(EG_DW) +1] +
      total[fullOffset(EG_IR) +1];
   LL_total_r =
      total[fullOffset(EG_DR) +1] +
      total[fullOffset(EG_IR) +1];
   LL_total_w = total[fullOffset(EG_DW) +1];
   VG_(message)(Vg_UserMsg, "LL refs:       %'*llu  (%'*llu rd + %'*llu wr)\n",
                l1, LL_total, l2, LL_total_r, l3, LL_total_w);

   LL_total_m =
      total[fullOffset(EG_DR) +2] +
      total[fullOffset(EG_DW) +2] +
      total[fullOffset(EG_IR) +2];
   LL_total_mr =
      total[fullOffset(EG_DR) +2] +
      total[fullOffset(EG_IR) +2];
   LL_total_mw = total[fullOffset(EG_DW) +2];
   VG_(message)(Vg_UserMsg, "LL misses:     %'*llu  (%'*llu rd + %'*llu wr)\n",
                l1, LL_total_m, l2, LL_total_mr, l3, LL_total_mw);

   VG_(message)(Vg_UserMsg, "LL miss rate:  %*.1f%% (%*.1f%%   + %*.1f%%  )\n",
                l1, LL_total_m  * 100.0 / (total[fullOffset(EG_IR)] + D_total[0]),
                l2, LL_total_mr * 100.0 / (total[fullOffset(EG_IR)] + total[fullOffset(EG_DR)]),
                l3, LL_total_mw * 100.0 / total[fullOffset(EG_DW)]);
}
/*------------------------------------------------------------*/
/*--- Setup for Event set.                                 ---*/
/*------------------------------------------------------------*/

struct event_sets CLG_(sets);

void CLG_(init_eventsets)(void)
{
   // Event groups from which the event sets are composed.
   // The "Use" group is only used with "cacheuse" simulation.
   if (clo_collect_cacheuse)
      CLG_(register_event_group4)(EG_USE,
                                  "AcCost1", "SpLoss1", "AcCost2", "SpLoss2");

   if (!CLG_(clo).simulate_cache)
      CLG_(register_event_group)(EG_IR, "Ir");
   else if (!clo_simulate_writeback) {
      CLG_(register_event_group3)(EG_IR, "Ir", "I1mr", "ILmr");
      CLG_(register_event_group3)(EG_DR, "Dr", "D1mr", "DLmr");
      CLG_(register_event_group3)(EG_DW, "Dw", "D1mw", "DLmw");
   }
   else { // clo_simulate_writeback
      CLG_(register_event_group4)(EG_IR, "Ir", "I1mr", "ILmr", "ILdmr");
      CLG_(register_event_group4)(EG_DR, "Dr", "D1mr", "DLmr", "DLdmr");
      CLG_(register_event_group4)(EG_DW, "Dw", "D1mw", "DLmw", "DLdmw");
   }

   if (CLG_(clo).simulate_branch) {
      CLG_(register_event_group2)(EG_BC, "Bc", "Bcm");
      CLG_(register_event_group2)(EG_BI, "Bi", "Bim");
   }

   if (CLG_(clo).collect_bus)
      CLG_(register_event_group)(EG_BUS, "Ge");

   if (CLG_(clo).collect_alloc)
      CLG_(register_event_group2)(EG_ALLOC, "allocCount", "allocSize");

   if (CLG_(clo).collect_systime != systime_no) {
      if (CLG_(clo).collect_systime == systime_nsec)
         CLG_(register_event_group3)(EG_SYS, "sysCount", "sysTime", "sysCpuTime");
      else
         CLG_(register_event_group2)(EG_SYS, "sysCount", "sysTime");
   }

   // event set used as base for instruction self cost
   CLG_(sets).base = CLG_(get_event_set2)(EG_USE, EG_IR);

   // event set comprising all event groups, used for inclusive cost
   CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).base, EG_DR, EG_DW);
   CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).full, EG_BC, EG_BI);
   CLG_(sets).full = CLG_(add_event_group) (CLG_(sets).full, EG_BUS);
   CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).full, EG_ALLOC, EG_SYS);

   CLG_DEBUG(1, "EventSets:\n");
   CLG_(print_eventset)(-2, CLG_(sets).base);
   CLG_(print_eventset)(-2, CLG_(sets).full);
   /* Not-existing events are silently ignored */
   CLG_(dumpmap) = CLG_(get_eventmapping)(CLG_(sets).full);
   CLG_(append_event)(CLG_(dumpmap), "Ir");
   CLG_(append_event)(CLG_(dumpmap), "Dr");
   CLG_(append_event)(CLG_(dumpmap), "Dw");
   CLG_(append_event)(CLG_(dumpmap), "I1mr");
   CLG_(append_event)(CLG_(dumpmap), "D1mr");
   CLG_(append_event)(CLG_(dumpmap), "D1mw");
   CLG_(append_event)(CLG_(dumpmap), "ILmr");
   CLG_(append_event)(CLG_(dumpmap), "DLmr");
   CLG_(append_event)(CLG_(dumpmap), "DLmw");
   CLG_(append_event)(CLG_(dumpmap), "ILdmr");
   CLG_(append_event)(CLG_(dumpmap), "DLdmr");
   CLG_(append_event)(CLG_(dumpmap), "DLdmw");
   CLG_(append_event)(CLG_(dumpmap), "Bc");
   CLG_(append_event)(CLG_(dumpmap), "Bcm");
   CLG_(append_event)(CLG_(dumpmap), "Bi");
   CLG_(append_event)(CLG_(dumpmap), "Bim");
   CLG_(append_event)(CLG_(dumpmap), "AcCost1");
   CLG_(append_event)(CLG_(dumpmap), "SpLoss1");
   CLG_(append_event)(CLG_(dumpmap), "AcCost2");
   CLG_(append_event)(CLG_(dumpmap), "SpLoss2");
   CLG_(append_event)(CLG_(dumpmap), "Ge");
   CLG_(append_event)(CLG_(dumpmap), "allocCount");
   CLG_(append_event)(CLG_(dumpmap), "allocSize");
   CLG_(append_event)(CLG_(dumpmap), "sysCount");
   CLG_(append_event)(CLG_(dumpmap), "sysTime");
   CLG_(append_event)(CLG_(dumpmap), "sysCpuTime");
}
/* this is called at dump time for every instruction executed */
static void cachesim_add_icost(SimCost cost, BBCC* bbcc,
                               InstrInfo* ii, ULong exe_count)
{
   if (!CLG_(clo).simulate_cache)
      cost[ fullOffset(EG_IR) ] += exe_count;

   if (ii->eventset)
      CLG_(add_and_zero_cost2)( CLG_(sets).full, cost,
                                ii->eventset, bbcc->cost + ii->cost_offset);
}

static
void cachesim_finish(void)
{
   if (clo_collect_cacheuse)
      cacheuse_finish();
}
/*------------------------------------------------------------*/
/*--- The simulator defined in this file                   ---*/
/*------------------------------------------------------------*/

struct cachesim_if CLG_(cachesim) = {
  .print_opts    = cachesim_print_opts,
  .parse_opt     = cachesim_parse_opt,
  .post_clo_init = cachesim_post_clo_init,
  .clear         = cachesim_clear,
  .dump_desc     = cachesim_dump_desc,
  .printstat     = cachesim_printstat,
  .add_icost     = cachesim_add_icost,
  .finish        = cachesim_finish,

  /* these will be set by cachesim_post_clo_init */
  .log_1I0D_name  = "(no function)",
  .log_2I0D_name  = "(no function)",
  .log_3I0D_name  = "(no function)",

  .log_1I1Dr_name = "(no function)",
  .log_1I1Dw_name = "(no function)",

  .log_0I1Dr_name = "(no function)",
  .log_0I1Dw_name = "(no function)",
};

/*--------------------------------------------------------------------*/
/*--- end                                                 ct_sim.c ---*/
/*--------------------------------------------------------------------*/