1 /*--------------------------------------------------------------------*/
2 /*--- Cache simulation. ---*/
4 /*--------------------------------------------------------------------*/
7 This file is part of Callgrind, a Valgrind tool for call graph
10 Copyright (C) 2003-2013, Josef Weidendorfer (Josef.Weidendorfer@gmx.de)
12 This tool is derived from and contains code from Cachegrind
13 Copyright (C) 2002-2013 Nicholas Nethercote (njn@valgrind.org)
15 This program is free software; you can redistribute it and/or
16 modify it under the terms of the GNU General Public License as
17 published by the Free Software Foundation; either version 2 of the
18 License, or (at your option) any later version.
20 This program is distributed in the hope that it will be useful, but
21 WITHOUT ANY WARRANTY; without even the implied warranty of
22 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23 General Public License for more details.
25 You should have received a copy of the GNU General Public License
26 along with this program; if not, write to the Free Software
27 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
30 The GNU General Public License is contained in the file COPYING.
37 - simulates a write-allocate cache
38 - (block --> set) hash function uses simple bit selection
39 - handling of references straddling two cache blocks:
40 - counts as only one cache access (not two)
41 - both blocks hit --> one hit
42 - one block hits, the other misses --> one miss
43 - both blocks miss --> one miss (not two)
46 /* Cache configuration */
49 /* additional structures for cache use info, separated
50 * according to usage frequency:
51 * - line_loaded : pointer to cost center of instruction
52 * which loaded the line into cache.
53 * Needed to increment counters when line is evicted.
54 * - line_use : updated on every access
58 UInt mask
; /* e.g. for 64Byte line size 1bit/2Byte */
63 line_use
* dep_use
; /* point to higher-level cacheblock for this memline */
72 int line_size
; /* bytes */
73 Bool sectored
; /* prefetch nearside cacheline on read */
79 HChar desc_line
[128]; // large enough
91 * States of flat caches in our model.
92 * We use a 2-level hierarchy,
94 static cache_t2 I1
, D1
, LL
;
96 /* Lower bits of cache tags are used as flags for a cache line */
97 #define CACHELINE_FLAGMASK (MIN_LINE_SIZE-1)
98 #define CACHELINE_DIRTY 1
101 /* Cache simulator Options */
102 static Bool clo_simulate_writeback
= False
;
103 static Bool clo_simulate_hwpref
= False
;
104 static Bool clo_simulate_sectors
= False
;
105 static Bool clo_collect_cacheuse
= False
;
107 /* The following global vars are set up beforehand by setup_bbcc():
109 * - Addr CLG_(bb_base) (instruction start address of original BB)
110 * - ULong* CLG_(cost_base) (start of cost array for BB)
114 ULong
* CLG_(cost_base
);
116 static InstrInfo
* current_ii
;
118 /* Cache use offsets */
119 /* The offsets are only correct because all per-instruction event sets get
120 * the "Use" set added first !
122 static Int off_I1_AcCost
= 0;
123 static Int off_I1_SpLoss
= 1;
124 static Int off_D1_AcCost
= 0;
125 static Int off_D1_SpLoss
= 1;
126 static Int off_LL_AcCost
= 2;
127 static Int off_LL_SpLoss
= 3;
129 /* Cache access types */
130 typedef enum { Read
= 0, Write
= CACHELINE_DIRTY
} RefType
;
132 /* Result of a reference into a flat cache */
133 typedef enum { Hit
= 0, Miss
, MissDirty
} CacheResult
;
135 /* Result of a reference into a hierarchical cache model */
140 WriteBackMemAccess
} CacheModelResult
;
142 typedef CacheModelResult (*simcall_type
)(Addr
, UChar
);
145 simcall_type I1_Read
;
146 simcall_type D1_Read
;
147 simcall_type D1_Write
;
150 /*------------------------------------------------------------*/
151 /*--- Cache Simulator Initialization ---*/
152 /*------------------------------------------------------------*/
154 static void cachesim_clearcache(cache_t2
* c
)
158 for (i
= 0; i
< c
->sets
* c
->assoc
; i
++)
161 for (i
= 0; i
< c
->sets
* c
->assoc
; i
++) {
162 c
->loaded
[i
].memline
= 0;
163 c
->loaded
[i
].use_base
= 0;
164 c
->loaded
[i
].dep_use
= 0;
165 c
->loaded
[i
].iaddr
= 0;
168 c
->tags
[i
] = i
% c
->assoc
; /* init lower bits as pointer */
173 static void cacheuse_initcache(cache_t2
* c
);
175 /* By this point, the size/assoc/line_size has been checked. */
176 static void cachesim_initcache(cache_t config
, cache_t2
* c
)
178 c
->size
= config
.size
;
179 c
->assoc
= config
.assoc
;
180 c
->line_size
= config
.line_size
;
181 c
->sectored
= False
; // FIXME
183 c
->sets
= (c
->size
/ c
->line_size
) / c
->assoc
;
184 c
->sets_min_1
= c
->sets
- 1;
185 c
->line_size_bits
= VG_(log2
)(c
->line_size
);
186 c
->tag_shift
= c
->line_size_bits
+ VG_(log2
)(c
->sets
);
187 c
->tag_mask
= ~((1u<<c
->tag_shift
)-1);
189 /* Can bits in tag entries be used for flags?
190 * Should always be true as MIN_LINE_SIZE >= 16 */
191 CLG_ASSERT( (c
->tag_mask
& CACHELINE_FLAGMASK
) == 0);
194 VG_(sprintf
)(c
->desc_line
, "%d B, %d B, direct-mapped%s",
195 c
->size
, c
->line_size
,
196 c
->sectored
? ", sectored":"");
198 VG_(sprintf
)(c
->desc_line
, "%d B, %d B, %d-way associative%s",
199 c
->size
, c
->line_size
, c
->assoc
,
200 c
->sectored
? ", sectored":"");
203 c
->tags
= (UWord
*) CLG_MALLOC("cl.sim.cs_ic.1",
204 sizeof(UWord
) * c
->sets
* c
->assoc
);
205 if (clo_collect_cacheuse
)
206 cacheuse_initcache(c
);
209 cachesim_clearcache(c
);
214 static void print_cache(cache_t2
* c
)
218 /* Note initialisation and update of 'i'. */
219 for (i
= 0, set
= 0; set
< c
->sets
; set
++) {
220 for (way
= 0; way
< c
->assoc
; way
++, i
++) {
221 VG_(printf
)("%8x ", c
->tags
[i
]);
229 /*------------------------------------------------------------*/
230 /*--- Simple Cache Simulation ---*/
231 /*------------------------------------------------------------*/
234 * Model: single inclusive, 2-level cache hierarchy (L1/LL)
235 * with write-allocate
237 * For simple cache hit/miss counts, we do not have to
238 * maintain the dirty state of lines (no need to distinguish
239 * read/write references), and the resulting counts are the
240 * same for write-through and write-back caches.
242 * Simulator functions:
243 * CacheModelResult cachesim_I1_ref(Addr a, UChar size)
244 * CacheModelResult cachesim_D1_ref(Addr a, UChar size)
246 __attribute__((always_inline
))
248 CacheResult
cachesim_setref(cache_t2
* c
, UInt set_no
, UWord tag
)
253 set
= &(c
->tags
[set_no
* c
->assoc
]);
255 /* This loop is unrolled for just the first case, which is the most */
256 /* common. We can't unroll any further because it would screw up */
257 /* if we have a direct-mapped (1-way) cache. */
261 /* If the tag is one other than the MRU, move it into the MRU spot */
262 /* and shuffle the rest down. */
263 for (i
= 1; i
< c
->assoc
; i
++) {
265 for (j
= i
; j
> 0; j
--) {
273 /* A miss; install this tag as MRU, shuffle rest down. */
274 for (j
= c
->assoc
- 1; j
> 0; j
--) {
282 __attribute__((always_inline
))
284 CacheResult
cachesim_ref(cache_t2
* c
, Addr a
, UChar size
)
286 UWord block1
= a
>> c
->line_size_bits
;
287 UWord block2
= (a
+size
-1) >> c
->line_size_bits
;
288 UInt set1
= block1
& c
->sets_min_1
;
289 /* the tag does not need to include bits specifying the set,
290 * but it can, and this saves instructions */
293 /* Access entirely within line. */
294 if (block1
== block2
)
295 return cachesim_setref(c
, set1
, tag1
);
297 /* Access straddles two lines. */
298 else if (block1
+ 1 == block2
) {
299 UInt set2
= block2
& c
->sets_min_1
;
302 /* the call updates cache structures as side effect */
303 CacheResult res1
= cachesim_setref(c
, set1
, tag1
);
304 CacheResult res2
= cachesim_setref(c
, set2
, tag2
);
305 return ((res1
== Miss
) || (res2
== Miss
)) ? Miss
: Hit
;
308 VG_(printf
)("addr: %lx size: %u blocks: %ld %ld",
309 a
, size
, block1
, block2
);
310 VG_(tool_panic
)("item straddles more than two cache sets");
316 CacheModelResult
cachesim_I1_ref(Addr a
, UChar size
)
318 if ( cachesim_ref( &I1
, a
, size
) == Hit
) return L1_Hit
;
319 if ( cachesim_ref( &LL
, a
, size
) == Hit
) return LL_Hit
;
324 CacheModelResult
cachesim_D1_ref(Addr a
, UChar size
)
326 if ( cachesim_ref( &D1
, a
, size
) == Hit
) return L1_Hit
;
327 if ( cachesim_ref( &LL
, a
, size
) == Hit
) return LL_Hit
;
332 /*------------------------------------------------------------*/
333 /*--- Write Back Cache Simulation ---*/
334 /*------------------------------------------------------------*/
337 * More complex model: L1 Write-through, LL Write-back
338 * This needs to distinguish among read and write references.
340 * Simulator functions:
341 * CacheModelResult cachesim_I1_Read(Addr a, UChar size)
342 * CacheModelResult cachesim_D1_Read(Addr a, UChar size)
343 * CacheModelResult cachesim_D1_Write(Addr a, UChar size)
347 * With write-back, result can be a miss evicting a dirty line
348 * The dirty state of a cache line is stored in Bit0 of the tag for
349 * this cache line (CACHELINE_DIRTY = 1). By OR'ing the reference
350 * type (Read/Write), the line gets dirty on a write.
352 __attribute__((always_inline
))
354 CacheResult
cachesim_setref_wb(cache_t2
* c
, RefType ref
, UInt set_no
, UWord tag
)
359 set
= &(c
->tags
[set_no
* c
->assoc
]);
361 /* This loop is unrolled for just the first case, which is the most */
362 /* common. We can't unroll any further because it would screw up */
363 /* if we have a direct-mapped (1-way) cache. */
364 if (tag
== (set
[0] & ~CACHELINE_DIRTY
)) {
368 /* If the tag is one other than the MRU, move it into the MRU spot */
369 /* and shuffle the rest down. */
370 for (i
= 1; i
< c
->assoc
; i
++) {
371 if (tag
== (set
[i
] & ~CACHELINE_DIRTY
)) {
372 tmp_tag
= set
[i
] | ref
; // update dirty flag
373 for (j
= i
; j
> 0; j
--) {
381 /* A miss; install this tag as MRU, shuffle rest down. */
382 tmp_tag
= set
[c
->assoc
- 1];
383 for (j
= c
->assoc
- 1; j
> 0; j
--) {
388 return (tmp_tag
& CACHELINE_DIRTY
) ? MissDirty
: Miss
;
391 __attribute__((always_inline
))
393 CacheResult
cachesim_ref_wb(cache_t2
* c
, RefType ref
, Addr a
, UChar size
)
395 UInt set1
= ( a
>> c
->line_size_bits
) & (c
->sets_min_1
);
396 UInt set2
= ((a
+size
-1) >> c
->line_size_bits
) & (c
->sets_min_1
);
397 UWord tag
= a
& c
->tag_mask
;
399 /* Access entirely within line. */
401 return cachesim_setref_wb(c
, ref
, set1
, tag
);
403 /* Access straddles two lines. */
404 /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
405 else if (((set1
+ 1) & (c
->sets_min_1
)) == set2
) {
406 UWord tag2
= (a
+size
-1) & c
->tag_mask
;
408 /* the call updates cache structures as side effect */
409 CacheResult res1
= cachesim_setref_wb(c
, ref
, set1
, tag
);
410 CacheResult res2
= cachesim_setref_wb(c
, ref
, set2
, tag2
);
412 if ((res1
== MissDirty
) || (res2
== MissDirty
)) return MissDirty
;
413 return ((res1
== Miss
) || (res2
== Miss
)) ? Miss
: Hit
;
416 VG_(printf
)("addr: %lx size: %u sets: %d %d", a
, size
, set1
, set2
);
417 VG_(tool_panic
)("item straddles more than two cache sets");
424 CacheModelResult
cachesim_I1_Read(Addr a
, UChar size
)
426 if ( cachesim_ref( &I1
, a
, size
) == Hit
) return L1_Hit
;
427 switch( cachesim_ref_wb( &LL
, Read
, a
, size
) ) {
428 case Hit
: return LL_Hit
;
429 case Miss
: return MemAccess
;
432 return WriteBackMemAccess
;
436 CacheModelResult
cachesim_D1_Read(Addr a
, UChar size
)
438 if ( cachesim_ref( &D1
, a
, size
) == Hit
) return L1_Hit
;
439 switch( cachesim_ref_wb( &LL
, Read
, a
, size
) ) {
440 case Hit
: return LL_Hit
;
441 case Miss
: return MemAccess
;
444 return WriteBackMemAccess
;
448 CacheModelResult
cachesim_D1_Write(Addr a
, UChar size
)
450 if ( cachesim_ref( &D1
, a
, size
) == Hit
) {
451 /* Even for an L1 hit, the write-through L1 passes
452 * the write to the LL to make the LL line dirty.
453 * But this causes no latency, so return the hit.
455 cachesim_ref_wb( &LL
, Write
, a
, size
);
458 switch( cachesim_ref_wb( &LL
, Write
, a
, size
) ) {
459 case Hit
: return LL_Hit
;
460 case Miss
: return MemAccess
;
463 return WriteBackMemAccess
;
467 /*------------------------------------------------------------*/
468 /*--- Hardware Prefetch Simulation ---*/
469 /*------------------------------------------------------------*/
471 static ULong prefetch_up
= 0;
472 static ULong prefetch_down
= 0;
475 #define PF_PAGEBITS 12
477 static UInt pf_lastblock
[PF_STREAMS
];
478 static Int pf_seqblocks
[PF_STREAMS
];
481 void prefetch_clear(void)
484 for(i
=0;i
<PF_STREAMS
;i
++)
485 pf_lastblock
[i
] = pf_seqblocks
[i
] = 0;
489 * HW Prefetch emulation
490 * Start prefetching when detecting sequential access to 3 memory blocks.
491 * One stream can be detected per 4k page.
494 void prefetch_LL_doref(Addr a
)
496 UInt stream
= (a
>> PF_PAGEBITS
) % PF_STREAMS
;
497 UInt block
= ( a
>> LL
.line_size_bits
);
499 if (block
!= pf_lastblock
[stream
]) {
500 if (pf_seqblocks
[stream
] == 0) {
501 if (pf_lastblock
[stream
] +1 == block
) pf_seqblocks
[stream
]++;
502 else if (pf_lastblock
[stream
] -1 == block
) pf_seqblocks
[stream
]--;
504 else if (pf_seqblocks
[stream
] >0) {
505 if (pf_lastblock
[stream
] +1 == block
) {
506 pf_seqblocks
[stream
]++;
507 if (pf_seqblocks
[stream
] >= 2) {
509 cachesim_ref(&LL
, a
+ 5 * LL
.line_size
,1);
512 else pf_seqblocks
[stream
] = 0;
514 else if (pf_seqblocks
[stream
] <0) {
515 if (pf_lastblock
[stream
] -1 == block
) {
516 pf_seqblocks
[stream
]--;
517 if (pf_seqblocks
[stream
] <= -2) {
519 cachesim_ref(&LL
, a
- 5 * LL
.line_size
,1);
522 else pf_seqblocks
[stream
] = 0;
524 pf_lastblock
[stream
] = block
;
528 /* simple model with hardware prefetch */
531 CacheModelResult
prefetch_I1_ref(Addr a
, UChar size
)
533 if ( cachesim_ref( &I1
, a
, size
) == Hit
) return L1_Hit
;
534 prefetch_LL_doref(a
);
535 if ( cachesim_ref( &LL
, a
, size
) == Hit
) return LL_Hit
;
540 CacheModelResult
prefetch_D1_ref(Addr a
, UChar size
)
542 if ( cachesim_ref( &D1
, a
, size
) == Hit
) return L1_Hit
;
543 prefetch_LL_doref(a
);
544 if ( cachesim_ref( &LL
, a
, size
) == Hit
) return LL_Hit
;
549 /* complex model with hardware prefetch */
552 CacheModelResult
prefetch_I1_Read(Addr a
, UChar size
)
554 if ( cachesim_ref( &I1
, a
, size
) == Hit
) return L1_Hit
;
555 prefetch_LL_doref(a
);
556 switch( cachesim_ref_wb( &LL
, Read
, a
, size
) ) {
557 case Hit
: return LL_Hit
;
558 case Miss
: return MemAccess
;
561 return WriteBackMemAccess
;
565 CacheModelResult
prefetch_D1_Read(Addr a
, UChar size
)
567 if ( cachesim_ref( &D1
, a
, size
) == Hit
) return L1_Hit
;
568 prefetch_LL_doref(a
);
569 switch( cachesim_ref_wb( &LL
, Read
, a
, size
) ) {
570 case Hit
: return LL_Hit
;
571 case Miss
: return MemAccess
;
574 return WriteBackMemAccess
;
578 CacheModelResult
prefetch_D1_Write(Addr a
, UChar size
)
580 prefetch_LL_doref(a
);
581 if ( cachesim_ref( &D1
, a
, size
) == Hit
) {
582 /* Even for an L1 hit, the write-through L1 passes
583 * the write to the LL to make the LL line dirty.
584 * But this causes no latency, so return the hit.
586 cachesim_ref_wb( &LL
, Write
, a
, size
);
589 switch( cachesim_ref_wb( &LL
, Write
, a
, size
) ) {
590 case Hit
: return LL_Hit
;
591 case Miss
: return MemAccess
;
594 return WriteBackMemAccess
;
598 /*------------------------------------------------------------*/
599 /*--- Cache Simulation with use metric collection ---*/
600 /*------------------------------------------------------------*/
602 /* can not be combined with write-back or prefetch */
605 void cacheuse_initcache(cache_t2
* c
)
608 unsigned int start_mask
, start_val
;
609 unsigned int end_mask
, end_val
;
611 c
->use
= CLG_MALLOC("cl.sim.cu_ic.1",
612 sizeof(line_use
) * c
->sets
* c
->assoc
);
613 c
->loaded
= CLG_MALLOC("cl.sim.cu_ic.2",
614 sizeof(line_loaded
) * c
->sets
* c
->assoc
);
615 c
->line_start_mask
= CLG_MALLOC("cl.sim.cu_ic.3",
616 sizeof(int) * c
->line_size
);
617 c
->line_end_mask
= CLG_MALLOC("cl.sim.cu_ic.4",
618 sizeof(int) * c
->line_size
);
620 c
->line_size_mask
= c
->line_size
-1;
622 /* Meaning of line_start_mask/line_end_mask
623 * Example: for a given cache line, an access starting at
624 * byte offset 5 with length 4 touches bytes 5 - 8. For a cache
625 * line size of 32, you have 1 bit per byte in the mask:
627 * bit31 bit8 bit5 bit 0
629 * 11..111111100000 line_start_mask[5]
630 * 00..000111111111 line_end_mask[(5+4)-1]
632 * use_mask |= line_start_mask[5] & line_end_mask[8]
635 start_val
= end_val
= ~0;
636 if (c
->line_size
< 32) {
637 int bits_per_byte
= 32/c
->line_size
;
638 start_mask
= (1<<bits_per_byte
)-1;
639 end_mask
= start_mask
<< (32-bits_per_byte
);
640 for(i
=0;i
<c
->line_size
;i
++) {
641 c
->line_start_mask
[i
] = start_val
;
642 start_val
= start_val
& ~start_mask
;
643 start_mask
= start_mask
<< bits_per_byte
;
645 c
->line_end_mask
[c
->line_size
-i
-1] = end_val
;
646 end_val
= end_val
& ~end_mask
;
647 end_mask
= end_mask
>> bits_per_byte
;
651 int bytes_per_bit
= c
->line_size
/32;
654 for(i
=0;i
<c
->line_size
;i
++) {
655 c
->line_start_mask
[i
] = start_val
;
656 c
->line_end_mask
[c
->line_size
-i
-1] = end_val
;
657 if ( ((i
+1)%bytes_per_bit
) == 0) {
658 start_val
&= ~start_mask
;
659 end_val
&= ~end_mask
;
666 CLG_DEBUG(6, "Config %s:\n", c
->desc_line
);
667 for(i
=0;i
<c
->line_size
;i
++) {
668 CLG_DEBUG(6, " [%2d]: start mask %8x, end mask %8x\n",
669 i
, c
->line_start_mask
[i
], c
->line_end_mask
[i
]);
672 /* We use lower tag bits as offset pointers to cache use info.
673 * I.e. some cache configurations are not supported.
675 if ( (1<<c
->tag_shift
) < c
->assoc
) {
676 VG_(message
)(Vg_DebugMsg
,
677 "error: Use associativity < %d for cache use statistics!\n",
679 VG_(tool_panic
)("Unsupported cache configuration");
684 /* for I1/D1 caches */
685 #define CACHEUSE(L) \
687 static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \
689 UInt set1 = ( a >> L.line_size_bits) & (L.sets_min_1); \
690 UInt set2 = ((a+size-1) >> L.line_size_bits) & (L.sets_min_1); \
691 UWord tag = a & L.tag_mask; \
694 UWord *set, tmp_tag; \
697 CLG_DEBUG(6,"%s.Acc(Addr %#lx, size %d): Sets [%d/%d]\n", \
698 L.name, a, size, set1, set2); \
700 /* First case: word entirely within line. */ \
701 if (set1 == set2) { \
703 set = &(L.tags[set1 * L.assoc]); \
704 use_mask = L.line_start_mask[a & L.line_size_mask] & \
705 L.line_end_mask[(a+size-1) & L.line_size_mask]; \
707 /* This loop is unrolled for just the first case, which is the most */\
708 /* common. We can't unroll any further because it would screw up */\
709 /* if we have a direct-mapped (1-way) cache. */\
710 if (tag == (set[0] & L.tag_mask)) { \
711 idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask); \
712 L.use[idx].count ++; \
713 L.use[idx].mask |= use_mask; \
714 CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
715 idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
716 use_mask, L.use[idx].mask, L.use[idx].count); \
719 /* If the tag is one other than the MRU, move it into the MRU spot */\
720 /* and shuffle the rest down. */\
721 for (i = 1; i < L.assoc; i++) { \
722 if (tag == (set[i] & L.tag_mask)) { \
724 for (j = i; j > 0; j--) { \
725 set[j] = set[j - 1]; \
728 idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask); \
729 L.use[idx].count ++; \
730 L.use[idx].mask |= use_mask; \
731 CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
732 i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
733 use_mask, L.use[idx].mask, L.use[idx].count); \
738 /* A miss; install this tag as MRU, shuffle rest down. */ \
739 tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \
740 for (j = L.assoc - 1; j > 0; j--) { \
741 set[j] = set[j - 1]; \
743 set[0] = tag | tmp_tag; \
744 idx = (set1 * L.assoc) + tmp_tag; \
745 return update_##L##_use(&L, idx, \
746 use_mask, a &~ L.line_size_mask); \
748 /* Second case: word straddles two lines. */ \
749 /* Nb: this is a fast way of doing ((set1+1) % L.sets) */ \
750 } else if (((set1 + 1) & (L.sets_min_1)) == set2) { \
751 Int miss1=0, miss2=0; /* 0: L1 hit, 1:L1 miss, 2:LL miss */ \
752 set = &(L.tags[set1 * L.assoc]); \
753 use_mask = L.line_start_mask[a & L.line_size_mask]; \
754 if (tag == (set[0] & L.tag_mask)) { \
755 idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask); \
756 L.use[idx].count ++; \
757 L.use[idx].mask |= use_mask; \
758 CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
759 idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
760 use_mask, L.use[idx].mask, L.use[idx].count); \
763 for (i = 1; i < L.assoc; i++) { \
764 if (tag == (set[i] & L.tag_mask)) { \
766 for (j = i; j > 0; j--) { \
767 set[j] = set[j - 1]; \
770 idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask); \
771 L.use[idx].count ++; \
772 L.use[idx].mask |= use_mask; \
773 CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
774 i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
775 use_mask, L.use[idx].mask, L.use[idx].count); \
779 tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \
780 for (j = L.assoc - 1; j > 0; j--) { \
781 set[j] = set[j - 1]; \
783 set[0] = tag | tmp_tag; \
784 idx = (set1 * L.assoc) + tmp_tag; \
785 miss1 = update_##L##_use(&L, idx, \
786 use_mask, a &~ L.line_size_mask); \
788 set = &(L.tags[set2 * L.assoc]); \
789 use_mask = L.line_end_mask[(a+size-1) & L.line_size_mask]; \
790 tag2 = (a+size-1) & L.tag_mask; \
791 if (tag2 == (set[0] & L.tag_mask)) { \
792 idx = (set2 * L.assoc) + (set[0] & ~L.tag_mask); \
793 L.use[idx].count ++; \
794 L.use[idx].mask |= use_mask; \
795 CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
796 idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
797 use_mask, L.use[idx].mask, L.use[idx].count); \
800 for (i = 1; i < L.assoc; i++) { \
801 if (tag2 == (set[i] & L.tag_mask)) { \
803 for (j = i; j > 0; j--) { \
804 set[j] = set[j - 1]; \
807 idx = (set2 * L.assoc) + (tmp_tag & ~L.tag_mask); \
808 L.use[idx].count ++; \
809 L.use[idx].mask |= use_mask; \
810 CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
811 i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
812 use_mask, L.use[idx].mask, L.use[idx].count); \
816 tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \
817 for (j = L.assoc - 1; j > 0; j--) { \
818 set[j] = set[j - 1]; \
820 set[0] = tag2 | tmp_tag; \
821 idx = (set2 * L.assoc) + tmp_tag; \
822 miss2 = update_##L##_use(&L, idx, \
823 use_mask, (a+size-1) &~ L.line_size_mask); \
824 return (miss1==MemAccess || miss2==MemAccess) ? MemAccess:LL_Hit; \
827 VG_(printf)("addr: %#lx size: %u sets: %d %d", a, size, set1, set2); \
828 VG_(tool_panic)("item straddles more than two cache sets"); \
834 /* logarithmic bitcounting algorithm, see
835 * http://graphics.stanford.edu/~seander/bithacks.html
837 static __inline__
unsigned int countBits(unsigned int bits
)
839 unsigned int c
; // store the total here
840 const int S
[] = {1, 2, 4, 8, 16}; // Magic Binary Numbers
841 const int B
[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF};
844 c
= ((c
>> S
[0]) & B
[0]) + (c
& B
[0]);
845 c
= ((c
>> S
[1]) & B
[1]) + (c
& B
[1]);
846 c
= ((c
>> S
[2]) & B
[2]) + (c
& B
[2]);
847 c
= ((c
>> S
[3]) & B
[3]) + (c
& B
[3]);
848 c
= ((c
>> S
[4]) & B
[4]) + (c
& B
[4]);
852 static void update_LL_use(int idx
, Addr memline
)
854 line_loaded
* loaded
= &(LL
.loaded
[idx
]);
855 line_use
* use
= &(LL
.use
[idx
]);
856 int i
= ((32 - countBits(use
->mask
)) * LL
.line_size
)>>5;
858 CLG_DEBUG(2, " LL.miss [%d]: at %#lx accessing memline %#lx\n",
859 idx
, CLG_(bb_base
) + current_ii
->instr_offset
, memline
);
861 CLG_DEBUG(2, " old: used %d, loss bits %d (%08x) [line %#lx from %#lx]\n",
862 use
->count
, i
, use
->mask
, loaded
->memline
, loaded
->iaddr
);
863 CLG_DEBUG(2, " collect: %d, use_base %p\n",
864 CLG_(current_state
).collect
, loaded
->use_base
);
866 if (CLG_(current_state
).collect
&& loaded
->use_base
) {
867 (loaded
->use_base
)[off_LL_AcCost
] += 1000 / use
->count
;
868 (loaded
->use_base
)[off_LL_SpLoss
] += i
;
875 loaded
->memline
= memline
;
876 loaded
->iaddr
= CLG_(bb_base
) + current_ii
->instr_offset
;
877 loaded
->use_base
= (CLG_(current_state
).nonskipped
) ?
878 CLG_(current_state
).nonskipped
->skipped
:
879 CLG_(cost_base
) + current_ii
->cost_offset
;
883 CacheModelResult
cacheuse_LL_access(Addr memline
, line_loaded
* l1_loaded
)
885 UInt setNo
= (memline
>> LL
.line_size_bits
) & (LL
.sets_min_1
);
886 UWord
* set
= &(LL
.tags
[setNo
* LL
.assoc
]);
887 UWord tag
= memline
& LL
.tag_mask
;
892 CLG_DEBUG(6,"LL.Acc(Memline %#lx): Set %d\n", memline
, setNo
);
894 if (tag
== (set
[0] & LL
.tag_mask
)) {
895 idx
= (setNo
* LL
.assoc
) + (set
[0] & ~LL
.tag_mask
);
896 l1_loaded
->dep_use
= &(LL
.use
[idx
]);
898 CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
899 idx
, LL
.loaded
[idx
].memline
, LL
.loaded
[idx
].iaddr
,
900 LL
.use
[idx
].mask
, LL
.use
[idx
].count
);
903 for (i
= 1; i
< LL
.assoc
; i
++) {
904 if (tag
== (set
[i
] & LL
.tag_mask
)) {
906 for (j
= i
; j
> 0; j
--) {
910 idx
= (setNo
* LL
.assoc
) + (tmp_tag
& ~LL
.tag_mask
);
911 l1_loaded
->dep_use
= &(LL
.use
[idx
]);
913 CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
914 i
, idx
, LL
.loaded
[idx
].memline
, LL
.loaded
[idx
].iaddr
,
915 LL
.use
[idx
].mask
, LL
.use
[idx
].count
);
920 /* A miss; install this tag as MRU, shuffle rest down. */
921 tmp_tag
= set
[LL
.assoc
- 1] & ~LL
.tag_mask
;
922 for (j
= LL
.assoc
- 1; j
> 0; j
--) {
925 set
[0] = tag
| tmp_tag
;
926 idx
= (setNo
* LL
.assoc
) + tmp_tag
;
927 l1_loaded
->dep_use
= &(LL
.use
[idx
]);
929 update_LL_use(idx
, memline
);
937 #define UPDATE_USE(L) \
939 static CacheModelResult update##_##L##_use(cache_t2* cache, int idx, \
940 UInt mask, Addr memline) \
942 line_loaded* loaded = &(cache->loaded[idx]); \
943 line_use* use = &(cache->use[idx]); \
944 int c = ((32 - countBits(use->mask)) * cache->line_size)>>5; \
946 CLG_DEBUG(2, " %s.miss [%d]: at %#lx accessing memline %#lx (mask %08x)\n", \
947 cache->name, idx, CLG_(bb_base) + current_ii->instr_offset, memline, mask); \
948 if (use->count>0) { \
949 CLG_DEBUG(2, " old: used %d, loss bits %d (%08x) [line %#lx from %#lx]\n",\
950 use->count, c, use->mask, loaded->memline, loaded->iaddr); \
951 CLG_DEBUG(2, " collect: %d, use_base %p\n", \
952 CLG_(current_state).collect, loaded->use_base); \
954 if (CLG_(current_state).collect && loaded->use_base) { \
955 (loaded->use_base)[off_##L##_AcCost] += 1000 / use->count; \
956 (loaded->use_base)[off_##L##_SpLoss] += c; \
958 /* FIXME (?): L1/LL line sizes must be equal ! */ \
959 loaded->dep_use->mask |= use->mask; \
960 loaded->dep_use->count += use->count; \
966 loaded->memline = memline; \
967 loaded->iaddr = CLG_(bb_base) + current_ii->instr_offset; \
968 loaded->use_base = (CLG_(current_state).nonskipped) ? \
969 CLG_(current_state).nonskipped->skipped : \
970 CLG_(cost_base) + current_ii->cost_offset; \
972 if (memline == 0) return LL_Hit; \
973 return cacheuse_LL_access(memline, loaded); \
984 void cacheuse_finish(void)
987 InstrInfo ii
= { 0,0,0,0 };
989 if (!CLG_(current_state
).collect
) return;
992 current_ii
= &ii
; /* needs to be set for update_XX_use */
995 /* update usage counters */
997 for (i
= 0; i
< I1
.sets
* I1
.assoc
; i
++)
998 if (I1
.loaded
[i
].use_base
)
999 update_I1_use( &I1
, i
, 0,0);
1002 for (i
= 0; i
< D1
.sets
* D1
.assoc
; i
++)
1003 if (D1
.loaded
[i
].use_base
)
1004 update_D1_use( &D1
, i
, 0,0);
1007 for (i
= 0; i
< LL
.sets
* LL
.assoc
; i
++)
1008 if (LL
.loaded
[i
].use_base
)
1009 update_LL_use(i
, 0);
1016 /*------------------------------------------------------------*/
1017 /*--- Helper functions called by instrumented code ---*/
1018 /*------------------------------------------------------------*/
1022 void inc_costs(CacheModelResult r
, ULong
* c1
, ULong
* c2
)
1025 case WriteBackMemAccess
:
1026 if (clo_simulate_writeback
) {
1049 const HChar
* cacheRes(CacheModelResult r
)
1052 case L1_Hit
: return "L1 Hit ";
1053 case LL_Hit
: return "LL Hit ";
1054 case MemAccess
: return "LL Miss";
1055 case WriteBackMemAccess
: return "LL Miss (dirty)";
1063 static void log_1I0D(InstrInfo
* ii
)
1065 CacheModelResult IrRes
;
1068 IrRes
= (*simulator
.I1_Read
)(CLG_(bb_base
) + ii
->instr_offset
, ii
->instr_size
);
1070 CLG_DEBUG(6, "log_1I0D: Ir %#lx/%u => %s\n",
1071 CLG_(bb_base
) + ii
->instr_offset
, ii
->instr_size
, cacheRes(IrRes
));
1073 if (CLG_(current_state
).collect
) {
1076 if (CLG_(current_state
).nonskipped
)
1077 cost_Ir
= CLG_(current_state
).nonskipped
->skipped
+ fullOffset(EG_IR
);
1079 cost_Ir
= CLG_(cost_base
) + ii
->cost_offset
+ ii
->eventset
->offset
[EG_IR
];
1081 inc_costs(IrRes
, cost_Ir
,
1082 CLG_(current_state
).cost
+ fullOffset(EG_IR
) );
1087 static void log_2I0D(InstrInfo
* ii1
, InstrInfo
* ii2
)
1089 CacheModelResult Ir1Res
, Ir2Res
;
1090 ULong
*global_cost_Ir
;
1093 Ir1Res
= (*simulator
.I1_Read
)(CLG_(bb_base
) + ii1
->instr_offset
, ii1
->instr_size
);
1095 Ir2Res
= (*simulator
.I1_Read
)(CLG_(bb_base
) + ii2
->instr_offset
, ii2
->instr_size
);
1097 CLG_DEBUG(6, "log_2I0D: Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s\n",
1098 CLG_(bb_base
) + ii1
->instr_offset
, ii1
->instr_size
, cacheRes(Ir1Res
),
1099 CLG_(bb_base
) + ii2
->instr_offset
, ii2
->instr_size
, cacheRes(Ir2Res
) );
1101 if (!CLG_(current_state
).collect
) return;
1103 global_cost_Ir
= CLG_(current_state
).cost
+ fullOffset(EG_IR
);
1104 if (CLG_(current_state
).nonskipped
) {
1105 ULong
* skipped_cost_Ir
=
1106 CLG_(current_state
).nonskipped
->skipped
+ fullOffset(EG_IR
);
1108 inc_costs(Ir1Res
, global_cost_Ir
, skipped_cost_Ir
);
1109 inc_costs(Ir2Res
, global_cost_Ir
, skipped_cost_Ir
);
1113 inc_costs(Ir1Res
, global_cost_Ir
,
1114 CLG_(cost_base
) + ii1
->cost_offset
+ ii1
->eventset
->offset
[EG_IR
]);
1115 inc_costs(Ir2Res
, global_cost_Ir
,
1116 CLG_(cost_base
) + ii2
->cost_offset
+ ii2
->eventset
->offset
[EG_IR
]);
1120 static void log_3I0D(InstrInfo
* ii1
, InstrInfo
* ii2
, InstrInfo
* ii3
)
1122 CacheModelResult Ir1Res
, Ir2Res
, Ir3Res
;
1123 ULong
*global_cost_Ir
;
1126 Ir1Res
= (*simulator
.I1_Read
)(CLG_(bb_base
) + ii1
->instr_offset
, ii1
->instr_size
);
1128 Ir2Res
= (*simulator
.I1_Read
)(CLG_(bb_base
) + ii2
->instr_offset
, ii2
->instr_size
);
1130 Ir3Res
= (*simulator
.I1_Read
)(CLG_(bb_base
) + ii3
->instr_offset
, ii3
->instr_size
);
1132 CLG_DEBUG(6, "log_3I0D: Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s, Ir3 %#lx/%u => %s\n",
1133 CLG_(bb_base
) + ii1
->instr_offset
, ii1
->instr_size
, cacheRes(Ir1Res
),
1134 CLG_(bb_base
) + ii2
->instr_offset
, ii2
->instr_size
, cacheRes(Ir2Res
),
1135 CLG_(bb_base
) + ii3
->instr_offset
, ii3
->instr_size
, cacheRes(Ir3Res
) );
1137 if (!CLG_(current_state
).collect
) return;
1139 global_cost_Ir
= CLG_(current_state
).cost
+ fullOffset(EG_IR
);
1140 if (CLG_(current_state
).nonskipped
) {
1141 ULong
* skipped_cost_Ir
=
1142 CLG_(current_state
).nonskipped
->skipped
+ fullOffset(EG_IR
);
1143 inc_costs(Ir1Res
, global_cost_Ir
, skipped_cost_Ir
);
1144 inc_costs(Ir2Res
, global_cost_Ir
, skipped_cost_Ir
);
1145 inc_costs(Ir3Res
, global_cost_Ir
, skipped_cost_Ir
);
1149 inc_costs(Ir1Res
, global_cost_Ir
,
1150 CLG_(cost_base
) + ii1
->cost_offset
+ ii1
->eventset
->offset
[EG_IR
]);
1151 inc_costs(Ir2Res
, global_cost_Ir
,
1152 CLG_(cost_base
) + ii2
->cost_offset
+ ii2
->eventset
->offset
[EG_IR
]);
1153 inc_costs(Ir3Res
, global_cost_Ir
,
1154 CLG_(cost_base
) + ii3
->cost_offset
+ ii3
->eventset
->offset
[EG_IR
]);
1157 /* Instruction doing a read access */
/* Log handler for one instruction fetch combined with one data read.
 * The fetch of ii goes through simulator.I1_Read, the access to
 * data_addr/data_size through simulator.D1_Read. Both results are
 * traced with CLG_DEBUG; costs are only updated when
 * CLG_(current_state).collect is set. */
1160 static void log_1I1Dr(InstrInfo
* ii
, Addr data_addr
, Word data_size
)
1162 CacheModelResult IrRes
, DrRes
;
/* Instruction fetch first, then the data read. */
1165 IrRes
= (*simulator
.I1_Read
)(CLG_(bb_base
) + ii
->instr_offset
, ii
->instr_size
);
1166 DrRes
= (*simulator
.D1_Read
)(data_addr
, data_size
);
1168 CLG_DEBUG(6, "log_1I1Dr: Ir %#lx/%u => %s, Dr %#lx/%lu => %s\n",
1169 CLG_(bb_base
) + ii
->instr_offset
, ii
->instr_size
, cacheRes(IrRes
),
1170 data_addr
, data_size
, cacheRes(DrRes
));
1172 if (CLG_(current_state
).collect
) {
1173 ULong
*cost_Ir
, *cost_Dr
;
/* Targets inside the 'skipped' array of the nonskipped context. */
1175 if (CLG_(current_state
).nonskipped
) {
1176 cost_Ir
= CLG_(current_state
).nonskipped
->skipped
+ fullOffset(EG_IR
);
1177 cost_Dr
= CLG_(current_state
).nonskipped
->skipped
+ fullOffset(EG_DR
);
/* Targets inside the instruction's own cost area, using the offsets
 * of its event set for EG_IR and EG_DR. */
1180 cost_Ir
= CLG_(cost_base
) + ii
->cost_offset
+ ii
->eventset
->offset
[EG_IR
];
1181 cost_Dr
= CLG_(cost_base
) + ii
->cost_offset
+ ii
->eventset
->offset
[EG_DR
];
/* Each result is passed to inc_costs() with the chosen target and the
 * matching slot of the current state's cost array. */
1184 inc_costs(IrRes
, cost_Ir
,
1185 CLG_(current_state
).cost
+ fullOffset(EG_IR
) );
1186 inc_costs(DrRes
, cost_Dr
,
1187 CLG_(current_state
).cost
+ fullOffset(EG_DR
) );
1192 /* Note that addEvent_D_guarded assumes that log_0I1Dr and log_0I1Dw
1193 have exactly the same prototype. If you change them, you must
1194 change addEvent_D_guarded too. */
/* Log handler for a data read that is not paired with an instruction
 * fetch: only simulator.D1_Read is consulted for data_addr/data_size.
 * The result is traced with CLG_DEBUG; costs are only updated when
 * CLG_(current_state).collect is set. */
1196 static void log_0I1Dr(InstrInfo
* ii
, Addr data_addr
, Word data_size
)
1198 CacheModelResult DrRes
;
1201 DrRes
= (*simulator
.D1_Read
)(data_addr
, data_size
);
1203 CLG_DEBUG(6, "log_0I1Dr: Dr %#lx/%lu => %s\n",
1204 data_addr
, data_size
, cacheRes(DrRes
));
1206 if (CLG_(current_state
).collect
) {
/* Target: EG_DR slot of the nonskipped context's 'skipped' array ... */
1209 if (CLG_(current_state
).nonskipped
)
1210 cost_Dr
= CLG_(current_state
).nonskipped
->skipped
+ fullOffset(EG_DR
);
/* ... or the instruction's own EG_DR slot. */
1212 cost_Dr
= CLG_(cost_base
) + ii
->cost_offset
+ ii
->eventset
->offset
[EG_DR
];
/* The EG_DR slot of the current state's cost array is passed as well. */
1214 inc_costs(DrRes
, cost_Dr
,
1215 CLG_(current_state
).cost
+ fullOffset(EG_DR
) );
1220 /* Instruction doing a write access */
/* Log handler for one instruction fetch combined with one data write.
 * The fetch of ii goes through simulator.I1_Read, the access to
 * data_addr/data_size through simulator.D1_Write. Both results are
 * traced with CLG_DEBUG; costs are only updated when
 * CLG_(current_state).collect is set. */
1223 static void log_1I1Dw(InstrInfo
* ii
, Addr data_addr
, Word data_size
)
1225 CacheModelResult IrRes
, DwRes
;
/* Instruction fetch first, then the data write. */
1228 IrRes
= (*simulator
.I1_Read
)(CLG_(bb_base
) + ii
->instr_offset
, ii
->instr_size
);
1229 DwRes
= (*simulator
.D1_Write
)(data_addr
, data_size
);
1231 CLG_DEBUG(6, "log_1I1Dw: Ir %#lx/%u => %s, Dw %#lx/%lu => %s\n",
1232 CLG_(bb_base
) + ii
->instr_offset
, ii
->instr_size
, cacheRes(IrRes
),
1233 data_addr
, data_size
, cacheRes(DwRes
));
1235 if (CLG_(current_state
).collect
) {
1236 ULong
*cost_Ir
, *cost_Dw
;
/* Targets inside the 'skipped' array of the nonskipped context. */
1238 if (CLG_(current_state
).nonskipped
) {
1239 cost_Ir
= CLG_(current_state
).nonskipped
->skipped
+ fullOffset(EG_IR
);
1240 cost_Dw
= CLG_(current_state
).nonskipped
->skipped
+ fullOffset(EG_DW
);
/* Targets inside the instruction's own cost area, using the offsets
 * of its event set for EG_IR and EG_DW. */
1243 cost_Ir
= CLG_(cost_base
) + ii
->cost_offset
+ ii
->eventset
->offset
[EG_IR
];
1244 cost_Dw
= CLG_(cost_base
) + ii
->cost_offset
+ ii
->eventset
->offset
[EG_DW
];
/* Each result is passed to inc_costs() with the chosen target and the
 * matching slot of the current state's cost array. */
1247 inc_costs(IrRes
, cost_Ir
,
1248 CLG_(current_state
).cost
+ fullOffset(EG_IR
) );
1249 inc_costs(DwRes
, cost_Dw
,
1250 CLG_(current_state
).cost
+ fullOffset(EG_DW
) );
1254 /* See comment on log_0I1Dr. */
/* Log handler for a data write that is not paired with an instruction
 * fetch: only simulator.D1_Write is consulted for data_addr/data_size.
 * The result is traced with CLG_DEBUG; costs are only updated when
 * CLG_(current_state).collect is set. */
1256 static void log_0I1Dw(InstrInfo
* ii
, Addr data_addr
, Word data_size
)
1258 CacheModelResult DwRes
;
1261 DwRes
= (*simulator
.D1_Write
)(data_addr
, data_size
);
1263 CLG_DEBUG(6, "log_0I1Dw: Dw %#lx/%lu => %s\n",
1264 data_addr
, data_size
, cacheRes(DwRes
));
1266 if (CLG_(current_state
).collect
) {
/* Target: EG_DW slot of the nonskipped context's 'skipped' array ... */
1269 if (CLG_(current_state
).nonskipped
)
1270 cost_Dw
= CLG_(current_state
).nonskipped
->skipped
+ fullOffset(EG_DW
);
/* ... or the instruction's own EG_DW slot. */
1272 cost_Dw
= CLG_(cost_base
) + ii
->cost_offset
+ ii
->eventset
->offset
[EG_DW
];
/* The EG_DW slot of the current state's cost array is passed as well. */
1274 inc_costs(DwRes
, cost_Dw
,
1275 CLG_(current_state
).cost
+ fullOffset(EG_DW
) );
1281 /*------------------------------------------------------------*/
1282 /*--- Cache configuration ---*/
1283 /*------------------------------------------------------------*/
1285 static cache_t clo_I1_cache
= UNDEFINED_CACHE
;
1286 static cache_t clo_D1_cache
= UNDEFINED_CACHE
;
1287 static cache_t clo_LL_cache
= UNDEFINED_CACHE
;
1289 /* Initialize and clear simulator state */
/* Set up the simulator once the options are known.
 * - Without cache simulation, every CLG_(cachesim).log_* handler is set
 *   to 0 and its name to "(no function)".
 * - Otherwise the cache configurations I1c/D1c/LLc are obtained,
 *   CLG_(min_line_size) is derived from their line sizes and checked
 *   against the largest guest register size, the I1/D1/LL caches are
 *   initialized, the log_* handlers of this file are installed, and
 *   the simulator.I1_Read/D1_Read/D1_Write functions are chosen from
 *   clo_collect_cacheuse, clo_simulate_hwpref and
 *   clo_simulate_writeback. */
1290 static void cachesim_post_clo_init(void)
1292 /* Cache configurations. */
1293 cache_t I1c
, D1c
, LLc
;
1295 /* Initialize access handlers */
1296 if (!CLG_(clo
).simulate_cache
) {
1297 CLG_(cachesim
).log_1I0D
= 0;
1298 CLG_(cachesim
).log_1I0D_name
= "(no function)";
1299 CLG_(cachesim
).log_2I0D
= 0;
1300 CLG_(cachesim
).log_2I0D_name
= "(no function)";
1301 CLG_(cachesim
).log_3I0D
= 0;
1302 CLG_(cachesim
).log_3I0D_name
= "(no function)";
1304 CLG_(cachesim
).log_1I1Dr
= 0;
1305 CLG_(cachesim
).log_1I1Dr_name
= "(no function)";
1306 CLG_(cachesim
).log_1I1Dw
= 0;
1307 CLG_(cachesim
).log_1I1Dw_name
= "(no function)";
1309 CLG_(cachesim
).log_0I1Dr
= 0;
1310 CLG_(cachesim
).log_0I1Dr_name
= "(no function)";
1311 CLG_(cachesim
).log_0I1Dw
= 0;
1312 CLG_(cachesim
).log_0I1Dw_name
= "(no function)";
1316 /* Configuration of caches only needed with real cache simulation */
1317 VG_(post_clo_init_configure_caches
)(&I1c
, &D1c
, &LLc
,
1326 // min_line_size is used to make sure that we never feed
1327 // accesses to the simulator straddling more than two
1328 // cache lines at any cache level
/* First the smaller of the I1 and D1 line sizes ... */
1329 CLG_(min_line_size
) = (I1c
.line_size
< D1c
.line_size
)
1330 ? I1c
.line_size
: D1c
.line_size
;
/* ... then the LL line size is folded in, leaving the minimum of all
 * three. */
1331 CLG_(min_line_size
) = (LLc
.line_size
< CLG_(min_line_size
))
1332 ? LLc
.line_size
: CLG_(min_line_size
);
1334 Int largest_load_or_store_size
1335 = VG_(machine_get_size_of_largest_guest_register
)();
1336 if (CLG_(min_line_size
) < largest_load_or_store_size
) {
1337 /* We can't continue, because the cache simulation might
1338 straddle more than 2 lines, and it will assert. So let's
1339 just stop before we start. */
1340 VG_(umsg
)("Callgrind: cannot continue: the minimum line size (%d)\n",
1341 (Int
)CLG_(min_line_size
));
1342 VG_(umsg
)(" must be equal to or larger than the maximum register size (%d)\n",
1343 largest_load_or_store_size
);
1344 VG_(umsg
)(" but it is not. Exiting now.\n");
/* Build the I1, D1 and LL cache structures from the configurations. */
1348 cachesim_initcache(I1c
, &I1
);
1349 cachesim_initcache(D1c
, &D1
);
1350 cachesim_initcache(LLc
, &LL
);
1352 /* the other cache simulators use the standard helpers
1353 * with dispatching via simulator struct */
1355 CLG_(cachesim
).log_1I0D
= log_1I0D
;
1356 CLG_(cachesim
).log_1I0D_name
= "log_1I0D";
1357 CLG_(cachesim
).log_2I0D
= log_2I0D
;
1358 CLG_(cachesim
).log_2I0D_name
= "log_2I0D";
1359 CLG_(cachesim
).log_3I0D
= log_3I0D
;
1360 CLG_(cachesim
).log_3I0D_name
= "log_3I0D";
1362 CLG_(cachesim
).log_1I1Dr
= log_1I1Dr
;
1363 CLG_(cachesim
).log_1I1Dw
= log_1I1Dw
;
1364 CLG_(cachesim
).log_1I1Dr_name
= "log_1I1Dr";
1365 CLG_(cachesim
).log_1I1Dw_name
= "log_1I1Dw";
1367 CLG_(cachesim
).log_0I1Dr
= log_0I1Dr
;
1368 CLG_(cachesim
).log_0I1Dw
= log_0I1Dw
;
1369 CLG_(cachesim
).log_0I1Dr_name
= "log_0I1Dr";
1370 CLG_(cachesim
).log_0I1Dw_name
= "log_0I1Dw";
/* Cache-use collection: prefetch and write-back simulation are
 * reported as unsupported and switched off, then the cacheuse_*
 * access functions are selected. */
1372 if (clo_collect_cacheuse
) {
1374 /* Output warning for not supported option combinations */
1375 if (clo_simulate_hwpref
) {
1376 VG_(message
)(Vg_DebugMsg
,
1377 "warning: prefetch simulation can not be "
1378 "used with cache usage\n");
1379 clo_simulate_hwpref
= False
;
1382 if (clo_simulate_writeback
) {
1383 VG_(message
)(Vg_DebugMsg
,
1384 "warning: write-back simulation can not be "
1385 "used with cache usage\n");
1386 clo_simulate_writeback
= False
;
/* Data writes use the same cacheuse routine as data reads. */
1389 simulator
.I1_Read
= cacheuse_I1_doRead
;
1390 simulator
.D1_Read
= cacheuse_D1_doRead
;
1391 simulator
.D1_Write
= cacheuse_D1_doRead
;
/* Hardware prefetch simulation: prefetch_* access functions. */
1395 if (clo_simulate_hwpref
) {
1398 if (clo_simulate_writeback
) {
1399 simulator
.I1_Read
= prefetch_I1_Read
;
1400 simulator
.D1_Read
= prefetch_D1_Read
;
1401 simulator
.D1_Write
= prefetch_D1_Write
;
/* *_ref variants: data reads and writes share one function. */
1404 simulator
.I1_Read
= prefetch_I1_ref
;
1405 simulator
.D1_Read
= prefetch_D1_ref
;
1406 simulator
.D1_Write
= prefetch_D1_ref
;
/* Plain cache simulation: cachesim_* access functions. */
1412 if (clo_simulate_writeback
) {
1413 simulator
.I1_Read
= cachesim_I1_Read
;
1414 simulator
.D1_Read
= cachesim_D1_Read
;
1415 simulator
.D1_Write
= cachesim_D1_Write
;
/* *_ref variants: data reads and writes share one function. */
1418 simulator
.I1_Read
= cachesim_I1_ref
;
1419 simulator
.D1_Read
= cachesim_D1_ref
;
1420 simulator
.D1_Write
= cachesim_D1_ref
;
1425 /* Clear the simulator state, which must have been initialized beforehand. */
1427 void cachesim_clear(void)
1429 cachesim_clearcache(&I1
);
1430 cachesim_clearcache(&D1
);
1431 cachesim_clearcache(&LL
);
1437 static void cachesim_dump_desc(VgFile
*fp
)
1439 VG_(fprintf
)(fp
, "\ndesc: I1 cache: %s\n", I1
.desc_line
);
1440 VG_(fprintf
)(fp
, "desc: D1 cache: %s\n", D1
.desc_line
);
1441 VG_(fprintf
)(fp
, "desc: LL cache: %s\n", LL
.desc_line
);
/* Help text for the cache simulator options (--simulate-wb,
 * --simulate-hwpref, --cacheuse), followed by a call to
 * VG_(print_cache_clo_opts)(). The --simulate-sectors line sits under
 * "#if CLG_EXPERIMENTAL". */
1445 void cachesim_print_opts(void)
1448 "\n cache simulator options (does cache simulation if used):\n"
1449 " --simulate-wb=no|yes Count write-back events [no]\n"
1450 " --simulate-hwpref=no|yes Simulate hardware prefetch [no]\n"
1451 #if CLG_EXPERIMENTAL
1452 " --simulate-sectors=no|yes Simulate sectored behaviour [no]\n"
1454 " --cacheuse=no|yes Collect cache block use [no]\n");
1455 VG_(print_cache_clo_opts
)();
1458 /* Check for command line option for cache configuration.
1459 * Return False if unknown and not handled.
1461 * Called from CLG_(process_cmd_line_option)() in clo.c
/* Match arg against the simulator's boolean options with VG_BOOL_CLO:
 *   --simulate-wb       -> clo_simulate_writeback
 *   --simulate-hwpref   -> clo_simulate_hwpref
 *   --simulate-sectors  -> clo_simulate_sectors
 *   --cacheuse          -> clo_collect_cacheuse
 * Anything else is offered to VG_(str_clo_cache_opt). */
1463 static Bool
cachesim_parse_opt(const HChar
* arg
)
1465 if VG_BOOL_CLO(arg
, "--simulate-wb", clo_simulate_writeback
) {}
1466 else if VG_BOOL_CLO(arg
, "--simulate-hwpref", clo_simulate_hwpref
) {}
1467 else if VG_BOOL_CLO(arg
, "--simulate-sectors", clo_simulate_sectors
) {}
/* Enabling --cacheuse also turns on CLG_(clo).dump_instr. */
1469 else if VG_BOOL_CLO(arg
, "--cacheuse", clo_collect_cacheuse
) {
1470 if (clo_collect_cacheuse
) {
1471 /* Use counters only make sense with fine dumping */
1472 CLG_(clo
).dump_instr
= True
;
1476 else if (VG_(str_clo_cache_opt
)(arg
,
/* Print the cache simulation summary as messages: instruction-side
 * misses and miss rates, data-side references, misses and miss rates
 * split into reads and writes, and the combined LL figures.
 * l1, l2 and l3 are supplied as the '*' field widths of the first,
 * second and third printed column.
 * NOTE(review): 'total' is initialised from CLG_(total_cost), so the
 * zero-to-one substitutions below presumably write into the shared
 * totals -- confirm this is intended. */
1488 void cachesim_printstat(Int l1
, Int l2
, Int l3
)
1490 FullCost total
= CLG_(total_cost
), D_total
= 0;
1491 ULong LL_total_m
, LL_total_mr
, LL_total_mw
,
1492 LL_total
, LL_total_r
, LL_total_w
;
/* Prefetch figures go out as debug messages, and only with verbosity
 * above 1 and prefetch simulation enabled. */
1494 if ((VG_(clo_verbosity
) >1) && clo_simulate_hwpref
) {
1495 VG_(message
)(Vg_DebugMsg
, "Prefetch Up: %llu\n",
1497 VG_(message
)(Vg_DebugMsg
, "Prefetch Down: %llu\n",
1499 VG_(message
)(Vg_DebugMsg
, "\n");
/* Instruction side: the I1 miss count is read at fullOffset(EG_IR)+1,
 * the LL miss count at fullOffset(EG_IR)+2. */
1502 VG_(message
)(Vg_UserMsg
, "I1 misses: %'*llu\n", l1
,
1503 total
[fullOffset(EG_IR
) +1]);
1505 VG_(message
)(Vg_UserMsg
, "LLi misses: %'*llu\n", l1
,
1506 total
[fullOffset(EG_IR
) +2]);
/* A zero instruction count is replaced by 1 so the rate divisions
 * below never divide by zero. */
1508 if (0 == total
[fullOffset(EG_IR
)])
1509 total
[fullOffset(EG_IR
)] = 1;
1511 VG_(message
)(Vg_UserMsg
, "I1 miss rate: %*.2f%%\n", l1
,
1512 total
[fullOffset(EG_IR
)+1] * 100.0 / total
[fullOffset(EG_IR
)]);
1514 VG_(message
)(Vg_UserMsg
, "LLi miss rate: %*.2f%%\n", l1
,
1515 total
[fullOffset(EG_IR
)+2] * 100.0 / total
[fullOffset(EG_IR
)]);
1517 VG_(message
)(Vg_UserMsg
, "\n");
1520 Use the D_refs.rd and D_refs.wr values to determine the
1521 * width of columns 2 & 3. */
/* D_total: a cost array obtained for the full event set, initialised
 * with CLG_(init_cost), then given the Dr costs (copy) plus the Dw
 * costs (add). */
1523 D_total
= CLG_(get_eventset_cost
)( CLG_(sets
).full
);
1524 CLG_(init_cost
)( CLG_(sets
).full
, D_total
);
1525 // we only use the first 3 values of D_total, adding up Dr and Dw costs
1526 CLG_(copy_cost
)( CLG_(get_event_set
)(EG_DR
), D_total
, total
+ fullOffset(EG_DR
) );
1527 CLG_(add_cost
) ( CLG_(get_event_set
)(EG_DW
), D_total
, total
+ fullOffset(EG_DW
) );
1529 VG_(message
)(Vg_UserMsg
, "D refs: %'*llu (%'*llu rd + %'*llu wr)\n",
1531 l2
, total
[fullOffset(EG_DR
)],
1532 l3
, total
[fullOffset(EG_DW
)]);
1534 VG_(message
)(Vg_UserMsg
, "D1 misses: %'*llu (%'*llu rd + %'*llu wr)\n",
1536 l2
, total
[fullOffset(EG_DR
)+1],
1537 l3
, total
[fullOffset(EG_DW
)+1]);
1539 VG_(message
)(Vg_UserMsg
, "LLd misses: %'*llu (%'*llu rd + %'*llu wr)\n",
1541 l2
, total
[fullOffset(EG_DR
)+2],
1542 l3
, total
[fullOffset(EG_DW
)+2]);
/* Zero reference counts are replaced by 1 before they are used as
 * denominators. */
1544 if (0 == D_total
[0]) D_total
[0] = 1;
1545 if (0 == total
[fullOffset(EG_DR
)]) total
[fullOffset(EG_DR
)] = 1;
1546 if (0 == total
[fullOffset(EG_DW
)]) total
[fullOffset(EG_DW
)] = 1;
1548 VG_(message
)(Vg_UserMsg
, "D1 miss rate: %*.1f%% (%*.1f%% + %*.1f%% )\n",
1549 l1
, D_total
[1] * 100.0 / D_total
[0],
1550 l2
, total
[fullOffset(EG_DR
)+1] * 100.0 / total
[fullOffset(EG_DR
)],
1551 l3
, total
[fullOffset(EG_DW
)+1] * 100.0 / total
[fullOffset(EG_DW
)]);
1553 VG_(message
)(Vg_UserMsg
, "LLd miss rate: %*.1f%% (%*.1f%% + %*.1f%% )\n",
1554 l1
, D_total
[2] * 100.0 / D_total
[0],
1555 l2
, total
[fullOffset(EG_DR
)+2] * 100.0 / total
[fullOffset(EG_DR
)],
1556 l3
, total
[fullOffset(EG_DW
)+2] * 100.0 / total
[fullOffset(EG_DW
)]);
1557 VG_(message
)(Vg_UserMsg
, "\n");
1561 /* LL overall results */
/* The "LL refs" figures are sums of the first-level miss counts
 * (offset +1) of the Dr, Dw and Ir groups. */
1564 total
[fullOffset(EG_DR
) +1] +
1565 total
[fullOffset(EG_DW
) +1] +
1566 total
[fullOffset(EG_IR
) +1];
1568 total
[fullOffset(EG_DR
) +1] +
1569 total
[fullOffset(EG_IR
) +1];
1570 LL_total_w
= total
[fullOffset(EG_DW
) +1];
1571 VG_(message
)(Vg_UserMsg
, "LL refs: %'*llu (%'*llu rd + %'*llu wr)\n",
1572 l1
, LL_total
, l2
, LL_total_r
, l3
, LL_total_w
);
/* The "LL misses" figures are the same sums taken over the LL miss
 * counts (offset +2). */
1575 total
[fullOffset(EG_DR
) +2] +
1576 total
[fullOffset(EG_DW
) +2] +
1577 total
[fullOffset(EG_IR
) +2];
1579 total
[fullOffset(EG_DR
) +2] +
1580 total
[fullOffset(EG_IR
) +2];
1581 LL_total_mw
= total
[fullOffset(EG_DW
) +2];
1582 VG_(message
)(Vg_UserMsg
, "LL misses: %'*llu (%'*llu rd + %'*llu wr)\n",
1583 l1
, LL_total_m
, l2
, LL_total_mr
, l3
, LL_total_mw
);
/* LL miss rates are taken relative to the reference counts
 * (Ir + D_total[0], Ir + Dr, and Dw respectively). */
1585 VG_(message
)(Vg_UserMsg
, "LL miss rate: %*.1f%% (%*.1f%% + %*.1f%% )\n",
1586 l1
, LL_total_m
* 100.0 / (total
[fullOffset(EG_IR
)] + D_total
[0]),
1587 l2
, LL_total_mr
* 100.0 / (total
[fullOffset(EG_IR
)] + total
[fullOffset(EG_DR
)]),
1588 l3
, LL_total_mw
* 100.0 / total
[fullOffset(EG_DW
)]);
1592 /*------------------------------------------------------------*/
1593 /*--- Setup for Event set. ---*/
1594 /*------------------------------------------------------------*/
/* Storage for the event sets. Its 'base' and 'full' members are
 * assigned in CLG_(init_eventsets). */
1596 struct event_sets
CLG_(sets
);
/* Register the event groups selected by the options, build
 * CLG_(sets).base and CLG_(sets).full from them, and create
 * CLG_(dumpmap) from the full set with the event names appended in
 * the order listed at the end. */
1598 void CLG_(init_eventsets
)()
1600 // Event groups from which the event sets are composed
1601 // the "Use" group only is used with "cacheuse" simulation
1602 if (clo_collect_cacheuse
)
1603 CLG_(register_event_group4
)(EG_USE
,
1604 "AcCost1", "SpLoss1", "AcCost2", "SpLoss2");
/* Instruction/data groups: "Ir" alone without cache simulation, three
 * events per group with it, and a fourth (ILdmr/DLdmr/DLdmw) when
 * write-back simulation is on. */
1606 if (!CLG_(clo
).simulate_cache
)
1607 CLG_(register_event_group
)(EG_IR
, "Ir");
1608 else if (!clo_simulate_writeback
) {
1609 CLG_(register_event_group3
)(EG_IR
, "Ir", "I1mr", "ILmr");
1610 CLG_(register_event_group3
)(EG_DR
, "Dr", "D1mr", "DLmr");
1611 CLG_(register_event_group3
)(EG_DW
, "Dw", "D1mw", "DLmw");
1613 else { // clo_simulate_writeback
1614 CLG_(register_event_group4
)(EG_IR
, "Ir", "I1mr", "ILmr", "ILdmr");
1615 CLG_(register_event_group4
)(EG_DR
, "Dr", "D1mr", "DLmr", "DLdmr");
1616 CLG_(register_event_group4
)(EG_DW
, "Dw", "D1mw", "DLmw", "DLdmw");
/* Optional groups, each tied to its own option. */
1619 if (CLG_(clo
).simulate_branch
) {
1620 CLG_(register_event_group2
)(EG_BC
, "Bc", "Bcm");
1621 CLG_(register_event_group2
)(EG_BI
, "Bi", "Bim");
1624 if (CLG_(clo
).collect_bus
)
1625 CLG_(register_event_group
)(EG_BUS
, "Ge");
1627 if (CLG_(clo
).collect_alloc
)
1628 CLG_(register_event_group2
)(EG_ALLOC
, "allocCount", "allocSize");
1630 if (CLG_(clo
).collect_systime
)
1631 CLG_(register_event_group2
)(EG_SYS
, "sysCount", "sysTime");
1633 // event set used as base for instruction self cost
1634 CLG_(sets
).base
= CLG_(get_event_set2
)(EG_USE
, EG_IR
);
1636 // event set comprising all event groups, used for inclusive cost
1637 CLG_(sets
).full
= CLG_(add_event_group2
)(CLG_(sets
).base
, EG_DR
, EG_DW
);
1638 CLG_(sets
).full
= CLG_(add_event_group2
)(CLG_(sets
).full
, EG_BC
, EG_BI
);
1639 CLG_(sets
).full
= CLG_(add_event_group
) (CLG_(sets
).full
, EG_BUS
);
1640 CLG_(sets
).full
= CLG_(add_event_group2
)(CLG_(sets
).full
, EG_ALLOC
, EG_SYS
);
/* Debug output of both event sets.
 * NOTE(review): confirm whether the two print_eventset calls are meant
 * to run only when debugging is enabled. */
1643 CLG_DEBUG(1, "EventSets:\n");
1644 CLG_(print_eventset
)(-2, CLG_(sets
).base
);
1645 CLG_(print_eventset
)(-2, CLG_(sets
).full
);
1648 /* Not-existing events are silently ignored */
1649 CLG_(dumpmap
) = CLG_(get_eventmapping
)(CLG_(sets
).full
);
1650 CLG_(append_event
)(CLG_(dumpmap
), "Ir");
1651 CLG_(append_event
)(CLG_(dumpmap
), "Dr");
1652 CLG_(append_event
)(CLG_(dumpmap
), "Dw");
1653 CLG_(append_event
)(CLG_(dumpmap
), "I1mr");
1654 CLG_(append_event
)(CLG_(dumpmap
), "D1mr");
1655 CLG_(append_event
)(CLG_(dumpmap
), "D1mw");
1656 CLG_(append_event
)(CLG_(dumpmap
), "ILmr");
1657 CLG_(append_event
)(CLG_(dumpmap
), "DLmr");
1658 CLG_(append_event
)(CLG_(dumpmap
), "DLmw");
1659 CLG_(append_event
)(CLG_(dumpmap
), "ILdmr");
1660 CLG_(append_event
)(CLG_(dumpmap
), "DLdmr");
1661 CLG_(append_event
)(CLG_(dumpmap
), "DLdmw");
1662 CLG_(append_event
)(CLG_(dumpmap
), "Bc");
1663 CLG_(append_event
)(CLG_(dumpmap
), "Bcm");
1664 CLG_(append_event
)(CLG_(dumpmap
), "Bi");
1665 CLG_(append_event
)(CLG_(dumpmap
), "Bim");
1666 CLG_(append_event
)(CLG_(dumpmap
), "AcCost1");
1667 CLG_(append_event
)(CLG_(dumpmap
), "SpLoss1");
1668 CLG_(append_event
)(CLG_(dumpmap
), "AcCost2");
1669 CLG_(append_event
)(CLG_(dumpmap
), "SpLoss2");
1670 CLG_(append_event
)(CLG_(dumpmap
), "Ge");
1671 CLG_(append_event
)(CLG_(dumpmap
), "allocCount");
1672 CLG_(append_event
)(CLG_(dumpmap
), "allocSize");
1673 CLG_(append_event
)(CLG_(dumpmap
), "sysCount");
1674 CLG_(append_event
)(CLG_(dumpmap
), "sysTime");
1678 /* this is called at dump time for every instruction executed */
/* Fold the cost of one instruction into 'cost'.
 * Without cache simulation, exe_count is added to the EG_IR slot of
 * 'cost'. The instruction's own counters, found at
 * bbcc->cost + ii->cost_offset and laid out per ii->eventset, are
 * handed to CLG_(add_and_zero_cost2) together with the full event set
 * and 'cost'. */
1679 static void cachesim_add_icost(SimCost cost
, BBCC
* bbcc
,
1680 InstrInfo
* ii
, ULong exe_count
)
1682 if (!CLG_(clo
).simulate_cache
)
1683 cost
[ fullOffset(EG_IR
) ] += exe_count
;
1686 CLG_(add_and_zero_cost2
)( CLG_(sets
).full
, cost
,
1687 ii
->eventset
, bbcc
->cost
+ ii
->cost_offset
);
/* Finish hook of the simulator (installed as .finish in
 * CLG_(cachesim)). It tests clo_collect_cacheuse.
 * NOTE(review): the action taken under that test is not visible here
 * -- verify before relying on it. */
1691 void cachesim_finish(void)
1693 if (clo_collect_cacheuse
)
1697 /*------------------------------------------------------------*/
1698 /*--- The simulator defined in this file ---*/
1699 /*------------------------------------------------------------*/
/* Interface table of this simulator. It wires up the hooks for
 * printing and parsing options, post-option initialisation, clearing,
 * description dump, statistics output, per-instruction cost merging
 * and finishing. All log_*_name entries start as "(no function)". */
1701 struct cachesim_if
CLG_(cachesim
) = {
1702 .print_opts
= cachesim_print_opts
,
1703 .parse_opt
= cachesim_parse_opt
,
1704 .post_clo_init
= cachesim_post_clo_init
,
1705 .clear
= cachesim_clear
,
1706 .dump_desc
= cachesim_dump_desc
,
1707 .printstat
= cachesim_printstat
,
1708 .add_icost
= cachesim_add_icost
,
1709 .finish
= cachesim_finish
,
1711 /* these will be set by cachesim_post_clo_init */
1722 .log_1I0D_name
= "(no function)",
1723 .log_2I0D_name
= "(no function)",
1724 .log_3I0D_name
= "(no function)",
1726 .log_1I1Dr_name
= "(no function)",
1727 .log_1I1Dw_name
= "(no function)",
1729 .log_0I1Dr_name
= "(no function)",
1730 .log_0I1Dw_name
= "(no function)",
1734 /*--------------------------------------------------------------------*/
1735 /*--- end ct_sim.c ---*/
1736 /*--------------------------------------------------------------------*/