callgrind/sim.c

   1 /*--------------------------------------------------------------------*/
   2 /*--- Cache simulation.                                            ---*/
   3 /*---                                                        sim.c ---*/
   4 /*--------------------------------------------------------------------*/
   5
   6 /*
   7    This file is part of Callgrind, a Valgrind tool for call graph
   8    profiling programs.
   9
  10    Copyright (C) 2003-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de)
  11
  12    This tool is derived from and contains code from Cachegrind
  13    Copyright (C) 2002-2017 Nicholas Nethercote (njn@valgrind.org)
  14
  15    This program is free software; you can redistribute it and/or
  16    modify it under the terms of the GNU General Public License as
  17    published by the Free Software Foundation; either version 2 of the
  18    License, or (at your option) any later version.
  19
  20    This program is distributed in the hope that it will be useful, but
  21    WITHOUT ANY WARRANTY; without even the implied warranty of
  22    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  23    General Public License for more details.
  24
  25    You should have received a copy of the GNU General Public License
  26    along with this program; if not, see <http://www.gnu.org/licenses/>.
  27
  28    The GNU General Public License is contained in the file COPYING.
  29 */
  30
  31 #include "global.h"
  32
  33
  34 /* Notes:
  35   - simulates a write-allocate cache
  36   - (block --> set) hash function uses simple bit selection
  37   - handling of references straddling two cache blocks:
  38       - counts as only one cache access (not two)
  39       - both blocks hit                  --> one hit
  40       - one block hits, the other misses --> one miss
  41       - both blocks miss                 --> one miss (not two)
  42 */
  43
  44 /* Cache configuration */
  45 #include "cg_arch.c"
  46
  47 /* additional structures for cache use info, separated
  48  * according usage frequency:
  49  * - line_loaded : pointer to cost center of instruction
  50  *                 which loaded the line into cache.
  51  *                 Needed to increment counters when line is evicted.
  52  * - line_use    : updated on every access
  53  */
  54 typedef struct {
  55   UInt count;
  56   UInt mask; /* e.g. for 64Byte line size 1bit/2Byte */
  57 } line_use;
  58
  59 typedef struct {
  60   Addr memline, iaddr;
  61   line_use* dep_use; /* point to higher-level cacheblock for this memline */
  62   ULong* use_base;
  63 } line_loaded;
  64
  65 /* Cache state */
  66 typedef struct {
  67    const HChar* name;
  68    int          size;                   /* bytes */
  69    int          assoc;
  70    int          line_size;              /* bytes */
  71    Bool         sectored;  /* prefetch nearside cacheline on read */
  72    int          sets;
  73    int          sets_min_1;
  74    int          line_size_bits;
  75    int          tag_shift;
  76    UWord        tag_mask;
  77    HChar        desc_line[128];    // large enough
  78    UWord*       tags;
  79
  80   /* for cache use */
  81    int          line_size_mask;
  82    int*         line_start_mask;
  83    int*         line_end_mask;
  84    line_loaded* loaded;
  85    line_use*    use;
  86 } cache_t2;
  87
  88 /*
  89  * States of flat caches in our model.
  90  * We use a 2-level hierarchy,
  91  */
  92 static cache_t2 I1, D1, LL;
  93
  94 /* Lower bits of cache tags are used as flags for a cache line */
  95 #define CACHELINE_FLAGMASK (MIN_LINE_SIZE-1)
  96 #define CACHELINE_DIRTY    1
  97
  98
  99 /* Cache simulator Options */
 100 static Bool clo_simulate_writeback = False;
 101 static Bool clo_simulate_hwpref = False;
 102 static Bool clo_simulate_sectors = False;
 103 static Bool clo_collect_cacheuse = False;
 104
 105 /* Following global vars are setup before by setup_bbcc():
 106  *
 107  * - Addr   CLG_(bb_base)     (instruction start address of original BB)
 108  * - ULong* CLG_(cost_base)   (start of cost array for BB)
 109  */
 110
 111 Addr   CLG_(bb_base);
 112 ULong* CLG_(cost_base);
 113
 114 static InstrInfo* current_ii;
 115
 116 /* Cache use offsets */
 117 /* The offsets are only correct because all per-instruction event sets get
 118  * the "Use" set added first !
 119  */
 120 static Int off_I1_AcCost  = 0;
 121 static Int off_I1_SpLoss  = 1;
 122 static Int off_D1_AcCost  = 0;
 123 static Int off_D1_SpLoss  = 1;
 124 static Int off_LL_AcCost  = 2;
 125 static Int off_LL_SpLoss  = 3;
 126
 127 /* Cache access types */
 128 typedef enum { Read = 0, Write = CACHELINE_DIRTY } RefType;
 129
 130 /* Result of a reference into a flat cache */
 131 typedef enum { Hit  = 0, Miss, MissDirty } CacheResult;
 132
 133 /* Result of a reference into a hierarchical cache model */
 134 typedef enum {
 135     L1_Hit,
 136     LL_Hit,
 137     MemAccess,
 138     WriteBackMemAccess } CacheModelResult;
 139
 140 typedef CacheModelResult (*simcall_type)(Addr, UChar);
 141
 142 static struct {
 143     simcall_type I1_Read;
 144     simcall_type D1_Read;
 145     simcall_type D1_Write;
 146 } simulator;
 147
 148 /*------------------------------------------------------------*/
 149 /*--- Cache Simulator Initialization                       ---*/
 150 /*------------------------------------------------------------*/
 151
 152 static void cachesim_clearcache(cache_t2* c)
 153 {
 154   Int i;
 155
 156   for (i = 0; i < c->sets * c->assoc; i++)
 157     c->tags[i] = 0;
 158   if (c->use) {
 159     for (i = 0; i < c->sets * c->assoc; i++) {
 160       c->loaded[i].memline  = 0;
 161       c->loaded[i].use_base = 0;
 162       c->loaded[i].dep_use = 0;
 163       c->loaded[i].iaddr = 0;
 164       c->use[i].mask    = 0;
 165       c->use[i].count   = 0;
 166       c->tags[i] = i % c->assoc; /* init lower bits as pointer */
 167     }
 168   }
 169 }
 170
 171 static void cacheuse_initcache(cache_t2* c);
 172
 173 /* By this point, the size/assoc/line_size has been checked. */
 174 static void cachesim_initcache(cache_t config, cache_t2* c)
 175 {
 176    c->size      = config.size;
 177    c->assoc     = config.assoc;
 178    c->line_size = config.line_size;
 179    c->sectored  = False; // FIXME
 180
 181    c->sets           = (c->size / c->line_size) / c->assoc;
 182    c->sets_min_1     = c->sets - 1;
 183    c->line_size_bits = VG_(log2)(c->line_size);
 184    c->tag_shift     = c->line_size_bits + VG_(log2)(c->sets);
 185    c->tag_mask       = ~((1u<<c->tag_shift)-1);
 186
 187    /* Can bits in tag entries be used for flags?
 188     * Should be always true as MIN_LINE_SIZE >= 16 */
 189    CLG_ASSERT( (c->tag_mask & CACHELINE_FLAGMASK) == 0);
 190
 191    if (c->assoc == 1) {
 192       VG_(sprintf)(c->desc_line, "%d B, %d B, direct-mapped%s",
 193                    c->size, c->line_size,
 194                    c->sectored ? ", sectored":"");
 195    } else {
 196       VG_(sprintf)(c->desc_line, "%d B, %d B, %d-way associative%s",
 197                    c->size, c->line_size, c->assoc,
 198                    c->sectored ? ", sectored":"");
 199    }
 200
 201    c->tags = (UWord*) CLG_MALLOC("cl.sim.cs_ic.1",
 202                                  sizeof(UWord) * c->sets * c->assoc);
 203    if (clo_collect_cacheuse)
 204        cacheuse_initcache(c);
 205    else
 206      c->use = 0;
 207    cachesim_clearcache(c);
 208 }
 209
 210
 211 #if 0
 212 static void print_cache(cache_t2* c)
 213 {
 214    UInt set, way, i;
 215
 216    /* Note initialisation and update of 'i'. */
 217    for (i = 0, set = 0; set < c->sets; set++) {
 218       for (way = 0; way < c->assoc; way++, i++) {
 219          VG_(printf)("%8x ", c->tags[i]);
 220       }
 221       VG_(printf)("\n");
 222    }
 223 }
 224 #endif
 225
 226
 227 /*------------------------------------------------------------*/
 228 /*--- Simple Cache Simulation                              ---*/
 229 /*------------------------------------------------------------*/
 230
 231 /*
 232  * Model: single inclusive, 2-level cache hierarchy (L1/LL)
 233  *        with write-allocate
 234  *
 235  * For simple cache hit/miss counts, we do not have to
 236  * maintain the dirty state of lines (no need to distinguish
 237  * read/write references), and the resulting counts are the
 238  * same for write-through and write-back caches.
 239  *
 240  * Simulator functions:
 241  *  CacheModelResult cachesim_I1_ref(Addr a, UChar size)
 242  *  CacheModelResult cachesim_D1_ref(Addr a, UChar size)
 243  */
 244 __attribute__((always_inline))
 245 static __inline__
 246 CacheResult cachesim_setref(cache_t2* c, UInt set_no, UWord tag)
 247 {
 248     int i, j;
 249     UWord *set;
 250
 251     set = &(c->tags[set_no * c->assoc]);
 252
 253     /* This loop is unrolled for just the first case, which is the most */
 254     /* common.  We can't unroll any further because it would screw up   */
 255     /* if we have a direct-mapped (1-way) cache.                        */
 256     if (tag == set[0])
 257         return Hit;
 258
 259     /* If the tag is one other than the MRU, move it into the MRU spot  */
 260     /* and shuffle the rest down.                                       */
 261     for (i = 1; i < c->assoc; i++) {
 262         if (tag == set[i]) {
 263             for (j = i; j > 0; j--) {
 264                 set[j] = set[j - 1];
 265             }
 266             set[0] = tag;
 267             return Hit;
 268         }
 269     }
 270
 271     /* A miss;  install this tag as MRU, shuffle rest down. */
 272     for (j = c->assoc - 1; j > 0; j--) {
 273         set[j] = set[j - 1];
 274     }
 275     set[0] = tag;
 276
 277     return Miss;
 278 }
 279
 280 __attribute__((always_inline))
 281 static __inline__
 282 CacheResult cachesim_ref(cache_t2* c, Addr a, UChar size)
 283 {
 284     UWord block1 =  a         >> c->line_size_bits;
 285     UWord block2 = (a+size-1) >> c->line_size_bits;
 286     UInt  set1   = block1 & c->sets_min_1;
 287     /* the tag does not need to include bits specifying the set,
 288      * but it can, and this saves instructions */
 289     UWord tag1   = block1;
 290
 291     /* Access entirely within line. */
 292     if (block1 == block2)
 293         return cachesim_setref(c, set1, tag1);
 294
 295     /* Access straddles two lines. */
 296     else if (block1 + 1 == block2) {
 297         UInt  set2 = block2 & c->sets_min_1;
 298         UWord tag2 = block2;
 299
 300         /* the call updates cache structures as side effect */
 301         CacheResult res1 =  cachesim_setref(c, set1, tag1);
 302         CacheResult res2 =  cachesim_setref(c, set2, tag2);
 303         return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;
 304
 305    } else {
 306        VG_(printf)("addr: %lx  size: %u  blocks: %lu %lu",
 307                    a, size, block1, block2);
 308        VG_(tool_panic)("item straddles more than two cache sets");
 309    }
 310    return Hit;
 311 }
 312
 313 static
 314 CacheModelResult cachesim_I1_ref(Addr a, UChar size)
 315 {
 316     if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
 317     if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
 318     return MemAccess;
 319 }
 320
 321 static
 322 CacheModelResult cachesim_D1_ref(Addr a, UChar size)
 323 {
 324     if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
 325     if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
 326     return MemAccess;
 327 }
 328
 329
 330 /*------------------------------------------------------------*/
 331 /*--- Write Back Cache Simulation                          ---*/
 332 /*------------------------------------------------------------*/
 333
 334 /*
 335  * More complex model: L1 Write-through, LL Write-back
 336  * This needs to distinguish among read and write references.
 337  *
 338  * Simulator functions:
 339  *  CacheModelResult cachesim_I1_Read(Addr a, UChar size)
 340  *  CacheModelResult cachesim_D1_Read(Addr a, UChar size)
 341  *  CacheModelResult cachesim_D1_Write(Addr a, UChar size)
 342  */
 343
 344 /*
 345  * With write-back, result can be a miss evicting a dirty line
 346  * The dirty state of a cache line is stored in Bit0 of the tag for
 347  * this cache line (CACHELINE_DIRTY = 1). By OR'ing the reference
 348  * type (Read/Write), the line gets dirty on a write.
 349  */
 350 __attribute__((always_inline))
 351 static __inline__
 352 CacheResult cachesim_setref_wb(cache_t2* c, RefType ref, UInt set_no, UWord tag)
 353 {
 354     int i, j;
 355     UWord *set, tmp_tag;
 356
 357     set = &(c->tags[set_no * c->assoc]);
 358
 359     /* This loop is unrolled for just the first case, which is the most */
 360     /* common.  We can't unroll any further because it would screw up   */
 361     /* if we have a direct-mapped (1-way) cache.                        */
 362     if (tag == (set[0] & ~CACHELINE_DIRTY)) {
 363         set[0] |= ref;
 364         return Hit;
 365     }
 366     /* If the tag is one other than the MRU, move it into the MRU spot  */
 367     /* and shuffle the rest down.                                       */
 368     for (i = 1; i < c->assoc; i++) {
 369         if (tag == (set[i] & ~CACHELINE_DIRTY)) {
 370             tmp_tag = set[i] | ref; // update dirty flag
 371             for (j = i; j > 0; j--) {
 372                 set[j] = set[j - 1];
 373             }
 374             set[0] = tmp_tag;
 375             return Hit;
 376         }
 377     }
 378
 379     /* A miss;  install this tag as MRU, shuffle rest down. */
 380     tmp_tag = set[c->assoc - 1];
 381     for (j = c->assoc - 1; j > 0; j--) {
 382         set[j] = set[j - 1];
 383     }
 384     set[0] = tag | ref;
 385
 386     return (tmp_tag & CACHELINE_DIRTY) ? MissDirty : Miss;
 387 }
 388
 389 __attribute__((always_inline))
 390 static __inline__
 391 CacheResult cachesim_ref_wb(cache_t2* c, RefType ref, Addr a, UChar size)
 392 {
 393     UInt set1 = ( a         >> c->line_size_bits) & (c->sets_min_1);
 394     UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
 395     UWord tag = a & c->tag_mask;
 396
 397     /* Access entirely within line. */
 398     if (set1 == set2)
 399         return cachesim_setref_wb(c, ref, set1, tag);
 400
 401     /* Access straddles two lines. */
 402     /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
 403     else if (((set1 + 1) & (c->sets_min_1)) == set2) {
 404         UWord tag2  = (a+size-1) & c->tag_mask;
 405
 406         /* the call updates cache structures as side effect */
 407         CacheResult res1 =  cachesim_setref_wb(c, ref, set1, tag);
 408         CacheResult res2 =  cachesim_setref_wb(c, ref, set2, tag2);
 409
 410         if ((res1 == MissDirty) || (res2 == MissDirty)) return MissDirty;
 411         return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;
 412
 413    } else {
 414        VG_(printf)("addr: %lx  size: %u  sets: %u %u", a, size, set1, set2);
 415        VG_(tool_panic)("item straddles more than two cache sets");
 416    }
 417    return Hit;
 418 }
 419
 420
 421 static
 422 CacheModelResult cachesim_I1_Read(Addr a, UChar size)
 423 {
 424     if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
 425     switch( cachesim_ref_wb( &LL, Read, a, size) ) {
 426         case Hit: return LL_Hit;
 427         case Miss: return MemAccess;
 428         default: break;
 429     }
 430     return WriteBackMemAccess;
 431 }
 432
 433 static
 434 CacheModelResult cachesim_D1_Read(Addr a, UChar size)
 435 {
 436     if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
 437     switch( cachesim_ref_wb( &LL, Read, a, size) ) {
 438         case Hit: return LL_Hit;
 439         case Miss: return MemAccess;
 440         default: break;
 441     }
 442     return WriteBackMemAccess;
 443 }
 444
 445 static
 446 CacheModelResult cachesim_D1_Write(Addr a, UChar size)
 447 {
 448     if ( cachesim_ref( &D1, a, size) == Hit ) {
 449         /* Even for a L1 hit, the write-trough L1 passes
 450          * the write to the LL to make the LL line dirty.
 451          * But this causes no latency, so return the hit.
 452          */
 453         cachesim_ref_wb( &LL, Write, a, size);
 454         return L1_Hit;
 455     }
 456     switch( cachesim_ref_wb( &LL, Write, a, size) ) {
 457         case Hit: return LL_Hit;
 458         case Miss: return MemAccess;
 459         default: break;
 460     }
 461     return WriteBackMemAccess;
 462 }
 463
 464
 465 /*------------------------------------------------------------*/
 466 /*--- Hardware Prefetch Simulation                         ---*/
 467 /*------------------------------------------------------------*/
 468
 469 static ULong prefetch_up = 0;
 470 static ULong prefetch_down = 0;
 471
 472 #define PF_STREAMS  8
 473 #define PF_PAGEBITS 12
 474
 475 static UInt pf_lastblock[PF_STREAMS];
 476 static Int  pf_seqblocks[PF_STREAMS];
 477
 478 static
 479 void prefetch_clear(void)
 480 {
 481   int i;
 482   for(i=0;i<PF_STREAMS;i++)
 483     pf_lastblock[i] = pf_seqblocks[i] = 0;
 484 }
 485
 486 /*
 487  * HW Prefetch emulation
 488  * Start prefetching when detecting sequential access to 3 memory blocks.
 489  * One stream can be detected per 4k page.
 490  */
 491 static __inline__
 492 void prefetch_LL_doref(Addr a)
 493 {
 494   UInt stream = (a >> PF_PAGEBITS) % PF_STREAMS;
 495   UInt block = ( a >> LL.line_size_bits);
 496
 497   if (block != pf_lastblock[stream]) {
 498     if (pf_seqblocks[stream] == 0) {
 499       if (pf_lastblock[stream] +1 == block) pf_seqblocks[stream]++;
 500       else if (pf_lastblock[stream] -1 == block) pf_seqblocks[stream]--;
 501     }
 502     else if (pf_seqblocks[stream] >0) {
 503       if (pf_lastblock[stream] +1 == block) {
 504         pf_seqblocks[stream]++;
 505         if (pf_seqblocks[stream] >= 2) {
 506           prefetch_up++;
 507           cachesim_ref(&LL, a + 5 * LL.line_size,1);
 508         }
 509       }
 510       else pf_seqblocks[stream] = 0;
 511     }
 512     else if (pf_seqblocks[stream] <0) {
 513       if (pf_lastblock[stream] -1 == block) {
 514         pf_seqblocks[stream]--;
 515         if (pf_seqblocks[stream] <= -2) {
 516           prefetch_down++;
 517           cachesim_ref(&LL, a - 5 * LL.line_size,1);
 518         }
 519       }
 520       else pf_seqblocks[stream] = 0;
 521     }
 522     pf_lastblock[stream] = block;
 523   }
 524 }
 525
 526 /* simple model with hardware prefetch */
 527
 528 static
 529 CacheModelResult prefetch_I1_ref(Addr a, UChar size)
 530 {
 531     if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
 532     prefetch_LL_doref(a);
 533     if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
 534     return MemAccess;
 535 }
 536
 537 static
 538 CacheModelResult prefetch_D1_ref(Addr a, UChar size)
 539 {
 540     if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
 541     prefetch_LL_doref(a);
 542     if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
 543     return MemAccess;
 544 }
 545
 546
 547 /* complex model with hardware prefetch */
 548
 549 static
 550 CacheModelResult prefetch_I1_Read(Addr a, UChar size)
 551 {
 552     if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
 553     prefetch_LL_doref(a);
 554     switch( cachesim_ref_wb( &LL, Read, a, size) ) {
 555         case Hit: return LL_Hit;
 556         case Miss: return MemAccess;
 557         default: break;
 558     }
 559     return WriteBackMemAccess;
 560 }
 561
 562 static
 563 CacheModelResult prefetch_D1_Read(Addr a, UChar size)
 564 {
 565     if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
 566     prefetch_LL_doref(a);
 567     switch( cachesim_ref_wb( &LL, Read, a, size) ) {
 568         case Hit: return LL_Hit;
 569         case Miss: return MemAccess;
 570         default: break;
 571     }
 572     return WriteBackMemAccess;
 573 }
 574
 575 static
 576 CacheModelResult prefetch_D1_Write(Addr a, UChar size)
 577 {
 578     prefetch_LL_doref(a);
 579     if ( cachesim_ref( &D1, a, size) == Hit ) {
 580         /* Even for a L1 hit, the write-trough L1 passes
 581          * the write to the LL to make the LL line dirty.
 582          * But this causes no latency, so return the hit.
 583          */
 584         cachesim_ref_wb( &LL, Write, a, size);
 585         return L1_Hit;
 586     }
 587     switch( cachesim_ref_wb( &LL, Write, a, size) ) {
 588         case Hit: return LL_Hit;
 589         case Miss: return MemAccess;
 590         default: break;
 591     }
 592     return WriteBackMemAccess;
 593 }
 594
 595
 596 /*------------------------------------------------------------*/
 597 /*--- Cache Simulation with use metric collection          ---*/
 598 /*------------------------------------------------------------*/
 599
 600 /* can not be combined with write-back or prefetch */
 601
 602 static
 603 void cacheuse_initcache(cache_t2* c)
 604 {
 605     int i;
 606     unsigned int start_mask, start_val;
 607     unsigned int end_mask, end_val;
 608
 609     c->use    = CLG_MALLOC("cl.sim.cu_ic.1",
 610                            sizeof(line_use) * c->sets * c->assoc);
 611     c->loaded = CLG_MALLOC("cl.sim.cu_ic.2",
 612                            sizeof(line_loaded) * c->sets * c->assoc);
 613     c->line_start_mask = CLG_MALLOC("cl.sim.cu_ic.3",
 614                                     sizeof(int) * c->line_size);
 615     c->line_end_mask = CLG_MALLOC("cl.sim.cu_ic.4",
 616                                   sizeof(int) * c->line_size);
 617
 618     c->line_size_mask = c->line_size-1;
 619
 620     /* Meaning of line_start_mask/line_end_mask
 621      * Example: for a given cache line, you get an access starting at
 622      * byte offset 5, length 4, byte 5 - 8 was touched. For a cache
 623      * line size of 32, you have 1 bit per byte in the mask:
 624      *
 625      *   bit31   bit8 bit5  bit 0
 626      *       |      |  |    |
 627      *       11..111111100000   line_start_mask[5]
 628      *       00..000111111111   line_end_mask[(5+4)-1]
 629      *
 630      *  use_mask |= line_start_mask[5] && line_end_mask[8]
 631      *
 632      */
 633     start_val = end_val = ~0;
 634     if (c->line_size < 32) {
 635         int bits_per_byte = 32/c->line_size;
 636         start_mask = (1<<bits_per_byte)-1;
 637         end_mask   = start_mask << (32-bits_per_byte);
 638         for(i=0;i<c->line_size;i++) {
 639             c->line_start_mask[i] = start_val;
 640             start_val  = start_val & ~start_mask;
 641             start_mask = start_mask << bits_per_byte;
 642
 643             c->line_end_mask[c->line_size-i-1] = end_val;
 644             end_val  = end_val & ~end_mask;
 645             end_mask = end_mask >> bits_per_byte;
 646         }
 647     }
 648     else {
 649         int bytes_per_bit = c->line_size/32;
 650         start_mask = 1;
 651         end_mask   = 1u << 31;
 652         for(i=0;i<c->line_size;i++) {
 653             c->line_start_mask[i] = start_val;
 654             c->line_end_mask[c->line_size-i-1] = end_val;
 655             if ( ((i+1)%bytes_per_bit) == 0) {
 656                 start_val   &= ~start_mask;
 657                 end_val     &= ~end_mask;
 658                 start_mask <<= 1;
 659                 end_mask   >>= 1;
 660             }
 661         }
 662     }
 663
 664     CLG_DEBUG(6, "Config %s:\n", c->desc_line);
 665     for(i=0;i<c->line_size;i++) {
 666         CLG_DEBUG(6, " [%2d]: start mask %8x, end mask %8x\n",
 667                   i, (UInt)c->line_start_mask[i], (UInt)c->line_end_mask[i]);
 668     }
 669
 670     /* We use lower tag bits as offset pointers to cache use info.
 671      * I.e. some cache parameters don't work.
 672      */
 673     if ( (1<<c->tag_shift) < c->assoc) {
 674         VG_(message)(Vg_DebugMsg,
 675                      "error: Use associativity < %d for cache use statistics!\n",
 676                      (1<<c->tag_shift) );
 677         VG_(tool_panic)("Unsupported cache configuration");
 678     }
 679 }
 680
 681
 682 /* for I1/D1 caches */
 683 #define CACHEUSE(L)                                                         \
 684                                                                             \
 685 static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size)         \
 686 {                                                                           \
 687    UInt set1 = ( a         >> L.line_size_bits) & (L.sets_min_1);           \
 688    UInt set2 = ((a+size-1) >> L.line_size_bits) & (L.sets_min_1);           \
 689    UWord tag  = a & L.tag_mask;                                             \
 690    UWord tag2;                                                              \
 691    int i, j, idx;                                                           \
 692    UWord *set, tmp_tag;                                                     \
 693    UInt use_mask;                                                           \
 694                                                                             \
 695    CLG_DEBUG(6,"%s.Acc(Addr %#lx, size %d): Sets [%u/%u]\n",                \
 696             L.name, a, size, set1, set2);                                   \
 697                                                                             \
 698    /* First case: word entirely within line. */                             \
 699    if (set1 == set2) {                                                      \
 700                                                                             \
 701       set = &(L.tags[set1 * L.assoc]);                                      \
 702       use_mask = L.line_start_mask[a & L.line_size_mask] &                  \
 703                  L.line_end_mask[(a+size-1) & L.line_size_mask];            \
 704                                                                             \
 705       /* This loop is unrolled for just the first case, which is the most */\
 706       /* common.  We can't unroll any further because it would screw up   */\
 707       /* if we have a direct-mapped (1-way) cache.                        */\
 708       if (tag == (set[0] & L.tag_mask)) {                                   \
 709         idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask);                    \
 710         L.use[idx].count ++;                                                \
 711         L.use[idx].mask |= use_mask;                                        \
 712         CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\
 713                  idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,          \
 714                  use_mask, L.use[idx].mask, L.use[idx].count);              \
 715         return L1_Hit;                                                      \
 716       }                                                                     \
 717       /* If the tag is one other than the MRU, move it into the MRU spot  */\
 718       /* and shuffle the rest down.                                       */\
 719       for (i = 1; i < L.assoc; i++) {                                       \
 720          if (tag == (set[i] & L.tag_mask)) {                                \
 721             tmp_tag = set[i];                                               \
 722             for (j = i; j > 0; j--) {                                       \
 723                set[j] = set[j - 1];                                         \
 724             }                                                               \
 725             set[0] = tmp_tag;                                               \
 726             idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
 727             L.use[idx].count ++;                                            \
 728             L.use[idx].mask |= use_mask;                                    \
 729         CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\
 730                  i, idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,       \
 731                  use_mask, L.use[idx].mask, L.use[idx].count);              \
 732             return L1_Hit;                                                  \
 733          }                                                                  \
 734       }                                                                     \
 735                                                                             \
 736       /* A miss;  install this tag as MRU, shuffle rest down. */            \
 737       tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                             \
 738       for (j = L.assoc - 1; j > 0; j--) {                                   \
 739          set[j] = set[j - 1];                                               \
 740       }                                                                     \
 741       set[0] = tag | tmp_tag;                                               \
 742       idx = (set1 * L.assoc) + tmp_tag;                                     \
 743       return update_##L##_use(&L, idx,                                      \
 744                        use_mask, a &~ L.line_size_mask);                    \
 745                                                                             \
 746    /* Second case: word straddles two lines. */                             \
 747    /* Nb: this is a fast way of doing ((set1+1) % L.sets) */                \
 748    } else if (((set1 + 1) & (L.sets_min_1)) == set2) {                      \
 749       Int miss1=0, miss2=0; /* 0: L1 hit, 1:L1 miss, 2:LL miss */           \
 750       set = &(L.tags[set1 * L.assoc]);                                      \
 751       use_mask = L.line_start_mask[a & L.line_size_mask];                   \
 752       if (tag == (set[0] & L.tag_mask)) {                                   \
 753          idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask);                   \
 754          L.use[idx].count ++;                                               \
 755          L.use[idx].mask |= use_mask;                                       \
 756         CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\
 757                  idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,          \
 758                  use_mask, L.use[idx].mask, L.use[idx].count);              \
 759          goto block2;                                                       \
 760       }                                                                     \
 761       for (i = 1; i < L.assoc; i++) {                                       \
 762          if (tag == (set[i] & L.tag_mask)) {                                \
 763             tmp_tag = set[i];                                               \
 764             for (j = i; j > 0; j--) {                                       \
 765                set[j] = set[j - 1];                                         \
 766             }                                                               \
 767             set[0] = tmp_tag;                                               \
 768             idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
 769             L.use[idx].count ++;                                            \
 770             L.use[idx].mask |= use_mask;                                    \
 771         CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\
 772                  i, idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,       \
 773                  use_mask, L.use[idx].mask, L.use[idx].count);              \
 774             goto block2;                                                    \
 775          }                                                                  \
 776       }                                                                     \
 777       tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                             \
 778       for (j = L.assoc - 1; j > 0; j--) {                                   \
 779          set[j] = set[j - 1];                                               \
 780       }                                                                     \
 781       set[0] = tag | tmp_tag;                                               \
 782       idx = (set1 * L.assoc) + tmp_tag;                                     \
 783       miss1 = update_##L##_use(&L, idx,                                     \
 784                        use_mask, a &~ L.line_size_mask);                    \
 785 block2:                                                                     \
 786       set = &(L.tags[set2 * L.assoc]);                                      \
 787       use_mask = L.line_end_mask[(a+size-1) & L.line_size_mask];            \
 788       tag2  = (a+size-1) & L.tag_mask;                                      \
 789       if (tag2 == (set[0] & L.tag_mask)) {                                  \
 790          idx = (set2 * L.assoc) + (set[0] & ~L.tag_mask);                   \
 791          L.use[idx].count ++;                                               \
 792          L.use[idx].mask |= use_mask;                                       \
 793         CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\
 794                  idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,          \
 795                  use_mask, L.use[idx].mask, L.use[idx].count);              \
 796          return miss1;                                                      \
 797       }                                                                     \
 798       for (i = 1; i < L.assoc; i++) {                                       \
 799          if (tag2 == (set[i] & L.tag_mask)) {                               \
 800             tmp_tag = set[i];                                               \
 801             for (j = i; j > 0; j--) {                                       \
 802                set[j] = set[j - 1];                                         \
 803             }                                                               \
 804             set[0] = tmp_tag;                                               \
 805             idx = (set2 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
 806             L.use[idx].count ++;                                            \
 807             L.use[idx].mask |= use_mask;                                    \
 808         CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\
 809                  i, idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,       \
 810                  use_mask, L.use[idx].mask, L.use[idx].count);              \
 811             return miss1;                                                   \
 812          }                                                                  \
 813       }                                                                     \
 814       tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                             \
 815       for (j = L.assoc - 1; j > 0; j--) {                                   \
 816          set[j] = set[j - 1];                                               \
 817       }                                                                     \
 818       set[0] = tag2 | tmp_tag;                                              \
 819       idx = (set2 * L.assoc) + tmp_tag;                                     \
 820       miss2 = update_##L##_use(&L, idx,                                     \
 821                        use_mask, (a+size-1) &~ L.line_size_mask);           \
 822       return (miss1==MemAccess || miss2==MemAccess) ? MemAccess:LL_Hit;     \
 823                                                                             \
 824    } else {                                                                 \
 825        VG_(printf)("addr: %#lx  size: %u  sets: %u %u", a, size, set1, set2); \
 826        VG_(tool_panic)("item straddles more than two cache sets");          \
 827    }                                                                        \
 828    return 0;                                                                \
 829 }
 830
 831
 832 /* logarithmic bitcounting algorithm, see
 833  * http://graphics.stanford.edu/~seander/bithacks.html
 834  */
 835 static __inline__ unsigned int countBits(unsigned int bits)
 836 {
 837   unsigned int c; // store the total here
 838   const int S[] = {1, 2, 4, 8, 16}; // Magic Binary Numbers
 839   const int B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF};
 840
 841   c = bits;
 842   c = ((c >> S[0]) & B[0]) + (c & B[0]);
 843   c = ((c >> S[1]) & B[1]) + (c & B[1]);
 844   c = ((c >> S[2]) & B[2]) + (c & B[2]);
 845   c = ((c >> S[3]) & B[3]) + (c & B[3]);
 846   c = ((c >> S[4]) & B[4]) + (c & B[4]);
 847   return c;
 848 }
 849
 850 static void update_LL_use(int idx, Addr memline)
 851 {
 852   line_loaded* loaded = &(LL.loaded[idx]);
 853   line_use* use = &(LL.use[idx]);
 854   int i = ((32 - countBits(use->mask)) * LL.line_size)>>5;
 855
 856   CLG_DEBUG(2, " LL.miss [%d]: at %#lx accessing memline %#lx\n",
 857            idx, CLG_(bb_base) + current_ii->instr_offset, memline);
 858   if (use->count>0) {
 859     CLG_DEBUG(2, "   old: used %u, loss bits %d (%08x) [line %#lx from %#lx]\n",
 860              use->count, i, use->mask, loaded->memline, loaded->iaddr);
 861     CLG_DEBUG(2, "   collect: %d, use_base %p\n",
 862              CLG_(current_state).collect, loaded->use_base);
 863
 864     if (CLG_(current_state).collect && loaded->use_base) {
 865       (loaded->use_base)[off_LL_AcCost] += 1000 / use->count;
 866       (loaded->use_base)[off_LL_SpLoss] += i;
 867     }
 868    }
 869
 870    use->count = 0;
 871    use->mask  = 0;
 872
 873   loaded->memline = memline;
 874   loaded->iaddr   = CLG_(bb_base) + current_ii->instr_offset;
 875   loaded->use_base = (CLG_(current_state).nonskipped) ?
 876     CLG_(current_state).nonskipped->skipped :
 877     CLG_(cost_base) + current_ii->cost_offset;
 878 }
 879
 880 static
 881 CacheModelResult cacheuse_LL_access(Addr memline, line_loaded* l1_loaded)
 882 {
 883    UInt setNo = (memline >> LL.line_size_bits) & (LL.sets_min_1);
 884    UWord* set = &(LL.tags[setNo * LL.assoc]);
 885    UWord tag  = memline & LL.tag_mask;
 886
 887    int i, j, idx;
 888    UWord tmp_tag;
 889
 890    CLG_DEBUG(6,"LL.Acc(Memline %#lx): Set %u\n", memline, setNo);
 891
 892    if (tag == (set[0] & LL.tag_mask)) {
 893      idx = (setNo * LL.assoc) + (set[0] & ~LL.tag_mask);
 894      l1_loaded->dep_use = &(LL.use[idx]);
 895
 896      CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): => %08x, count %u\n",
 897                  idx, LL.loaded[idx].memline,  LL.loaded[idx].iaddr,
 898                  LL.use[idx].mask, LL.use[idx].count);
 899      return LL_Hit;
 900    }
 901    for (i = 1; i < LL.assoc; i++) {
 902      if (tag == (set[i] & LL.tag_mask)) {
 903        tmp_tag = set[i];
 904        for (j = i; j > 0; j--) {
 905          set[j] = set[j - 1];
 906        }
 907        set[0] = tmp_tag;
 908        idx = (setNo * LL.assoc) + (tmp_tag & ~LL.tag_mask);
 909        l1_loaded->dep_use = &(LL.use[idx]);
 910
 911         CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): => %08x, count %u\n",
 912                  i, idx, LL.loaded[idx].memline,  LL.loaded[idx].iaddr,
 913                  LL.use[idx].mask, LL.use[idx].count);
 914         return LL_Hit;
 915      }
 916    }
 917
 918    /* A miss;  install this tag as MRU, shuffle rest down. */
 919    tmp_tag = set[LL.assoc - 1] & ~LL.tag_mask;
 920    for (j = LL.assoc - 1; j > 0; j--) {
 921      set[j] = set[j - 1];
 922    }
 923    set[0] = tag | tmp_tag;
 924    idx = (setNo * LL.assoc) + tmp_tag;
 925    l1_loaded->dep_use = &(LL.use[idx]);
 926
 927    update_LL_use(idx, memline);
 928
 929    return MemAccess;
 930 }
 931
 932
 933
 934
 935 #define UPDATE_USE(L)                                                \
 936                                                                      \
 937 static CacheModelResult update##_##L##_use(cache_t2* cache, int idx, \
 938                                UInt mask, Addr memline)              \
 939 {                                                                    \
 940   line_loaded* loaded = &(cache->loaded[idx]);                       \
 941   line_use* use = &(cache->use[idx]);                                \
 942   int c = ((32 - countBits(use->mask)) * cache->line_size)>>5;       \
 943                                                                      \
 944   CLG_DEBUG(2, " %s.miss [%d]: at %#lx accessing memline %#lx (mask %08x)\n", \
 945            cache->name, idx, CLG_(bb_base) + current_ii->instr_offset, memline, mask); \
 946   if (use->count>0) {                                                \
 947     CLG_DEBUG(2, "   old: used %u, loss bits %d (%08x) [line %#lx from %#lx]\n",\
 948              use->count, c, use->mask, loaded->memline, loaded->iaddr); \
 949     CLG_DEBUG(2, "   collect: %d, use_base %p\n", \
 950              CLG_(current_state).collect, loaded->use_base);         \
 951                                                                      \
 952     if (CLG_(current_state).collect && loaded->use_base) {           \
 953       (loaded->use_base)[off_##L##_AcCost] += 1000 / use->count;     \
 954       (loaded->use_base)[off_##L##_SpLoss] += c;                     \
 955                                                                      \
 956       /* FIXME (?): L1/LL line sizes must be equal ! */              \
 957       loaded->dep_use->mask |= use->mask;                            \
 958       loaded->dep_use->count += use->count;                          \
 959     }                                                                \
 960   }                                                                  \
 961                                                                      \
 962   use->count = 1;                                                    \
 963   use->mask  = mask;                                                 \
 964   loaded->memline = memline;                                         \
 965   loaded->iaddr   = CLG_(bb_base) + current_ii->instr_offset;        \
 966   loaded->use_base = (CLG_(current_state).nonskipped) ?              \
 967     CLG_(current_state).nonskipped->skipped :                        \
 968     CLG_(cost_base) + current_ii->cost_offset;                       \
 969                                                                      \
 970   if (memline == 0) return LL_Hit;                                   \
 971   return cacheuse_LL_access(memline, loaded);                        \
 972 }
 973
 974 UPDATE_USE(I1);
 975 UPDATE_USE(D1);
 976
 977 CACHEUSE(I1);
 978 CACHEUSE(D1);
 979
 980
 981 static
 982 void cacheuse_finish(void)
 983 {
 984   int i;
 985   InstrInfo ii = { 0,0,0,0 };
 986
 987   if (!CLG_(current_state).collect) return;
 988
 989   CLG_(bb_base) = 0;
 990   current_ii = &ii; /* needs to be set for update_XX_use */
 991   CLG_(cost_base) = 0;
 992
 993   /* update usage counters */
 994   if (I1.use)
 995     for (i = 0; i < I1.sets * I1.assoc; i++)
 996       if (I1.loaded[i].use_base)
 997         update_I1_use( &I1, i, 0,0);
 998
 999   if (D1.use)
1000     for (i = 0; i < D1.sets * D1.assoc; i++)
1001       if (D1.loaded[i].use_base)
1002         update_D1_use( &D1, i, 0,0);
1003
1004   if (LL.use)
1005     for (i = 0; i < LL.sets * LL.assoc; i++)
1006       if (LL.loaded[i].use_base)
1007         update_LL_use(i, 0);
1008
1009   current_ii = 0;
1010 }
1011
1012
1013
1014 /*------------------------------------------------------------*/
1015 /*--- Helper functions called by instrumented code         ---*/
1016 /*------------------------------------------------------------*/
1017
1018
1019 static __inline__
1020 void inc_costs(CacheModelResult r, ULong* c1, ULong* c2)
1021 {
1022     switch(r) {
1023         case WriteBackMemAccess:
1024             if (clo_simulate_writeback) {
1025                 c1[3]++;
1026                 c2[3]++;
1027             }
1028             // fall through
1029
1030         case MemAccess:
1031             c1[2]++;
1032             c2[2]++;
1033             // fall through
1034
1035         case LL_Hit:
1036             c1[1]++;
1037             c2[1]++;
1038             // fall through
1039
1040         default:
1041             c1[0]++;
1042             c2[0]++;
1043     }
1044 }
1045
1046 static
1047 const HChar* cacheRes(CacheModelResult r)
1048 {
1049     switch(r) {
1050     case L1_Hit:    return "L1 Hit ";
1051     case LL_Hit:    return "LL Hit ";
1052     case MemAccess: return "LL Miss";
1053     case WriteBackMemAccess: return "LL Miss (dirty)";
1054     default:
1055         tl_assert(0);
1056     }
1057     return "??";
1058 }
1059
1060 VG_REGPARM(1)
1061 static void log_1I0D(InstrInfo* ii)
1062 {
1063     CacheModelResult IrRes;
1064
1065     current_ii = ii;
1066     IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);
1067
1068     CLG_DEBUG(6, "log_1I0D:  Ir  %#lx/%u => %s\n",
1069               CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes));
1070
1071     if (CLG_(current_state).collect) {
1072         ULong* cost_Ir;
1073
1074         if (CLG_(current_state).nonskipped)
1075             cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
1076         else
1077             cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];
1078
1079         inc_costs(IrRes, cost_Ir,
1080                   CLG_(current_state).cost + fullOffset(EG_IR) );
1081     }
1082 }
1083
1084 VG_REGPARM(2)
1085 static void log_2I0D(InstrInfo* ii1, InstrInfo* ii2)
1086 {
1087     CacheModelResult Ir1Res, Ir2Res;
1088     ULong *global_cost_Ir;
1089
1090     current_ii = ii1;
1091     Ir1Res = (*simulator.I1_Read)(CLG_(bb_base) + ii1->instr_offset, ii1->instr_size);
1092     current_ii = ii2;
1093     Ir2Res = (*simulator.I1_Read)(CLG_(bb_base) + ii2->instr_offset, ii2->instr_size);
1094
1095     CLG_DEBUG(6, "log_2I0D:  Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s\n",
1096               CLG_(bb_base) + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res),
1097               CLG_(bb_base) + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res) );
1098
1099     if (!CLG_(current_state).collect) return;
1100
1101     global_cost_Ir = CLG_(current_state).cost + fullOffset(EG_IR);
1102     if (CLG_(current_state).nonskipped) {
1103         ULong* skipped_cost_Ir =
1104             CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
1105
1106         inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir);
1107         inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir);
1108         return;
1109     }
1110
1111     inc_costs(Ir1Res, global_cost_Ir,
1112               CLG_(cost_base) + ii1->cost_offset + ii1->eventset->offset[EG_IR]);
1113     inc_costs(Ir2Res, global_cost_Ir,
1114               CLG_(cost_base) + ii2->cost_offset + ii2->eventset->offset[EG_IR]);
1115 }
1116
1117 VG_REGPARM(3)
1118 static void log_3I0D(InstrInfo* ii1, InstrInfo* ii2, InstrInfo* ii3)
1119 {
1120     CacheModelResult Ir1Res, Ir2Res, Ir3Res;
1121     ULong *global_cost_Ir;
1122
1123     current_ii = ii1;
1124     Ir1Res = (*simulator.I1_Read)(CLG_(bb_base) + ii1->instr_offset, ii1->instr_size);
1125     current_ii = ii2;
1126     Ir2Res = (*simulator.I1_Read)(CLG_(bb_base) + ii2->instr_offset, ii2->instr_size);
1127     current_ii = ii3;
1128     Ir3Res = (*simulator.I1_Read)(CLG_(bb_base) + ii3->instr_offset, ii3->instr_size);
1129
1130     CLG_DEBUG(6, "log_3I0D:  Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s, Ir3 %#lx/%u => %s\n",
1131               CLG_(bb_base) + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res),
1132               CLG_(bb_base) + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res),
1133               CLG_(bb_base) + ii3->instr_offset, ii3->instr_size, cacheRes(Ir3Res) );
1134
1135     if (!CLG_(current_state).collect) return;
1136
1137     global_cost_Ir = CLG_(current_state).cost + fullOffset(EG_IR);
1138     if (CLG_(current_state).nonskipped) {
1139         ULong* skipped_cost_Ir =
1140             CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
1141         inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir);
1142         inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir);
1143         inc_costs(Ir3Res, global_cost_Ir, skipped_cost_Ir);
1144         return;
1145     }
1146
1147     inc_costs(Ir1Res, global_cost_Ir,
1148               CLG_(cost_base) + ii1->cost_offset + ii1->eventset->offset[EG_IR]);
1149     inc_costs(Ir2Res, global_cost_Ir,
1150               CLG_(cost_base) + ii2->cost_offset + ii2->eventset->offset[EG_IR]);
1151     inc_costs(Ir3Res, global_cost_Ir,
1152               CLG_(cost_base) + ii3->cost_offset + ii3->eventset->offset[EG_IR]);
1153 }
1154
1155 /* Instruction doing a read access */
1156
1157 VG_REGPARM(3)
1158 static void log_1I1Dr(InstrInfo* ii, Addr data_addr, Word data_size)
1159 {
1160     CacheModelResult IrRes, DrRes;
1161
1162     current_ii = ii;
1163     IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);
1164     DrRes = (*simulator.D1_Read)(data_addr, data_size);
1165
1166     CLG_DEBUG(6, "log_1I1Dr: Ir  %#lx/%u => %s, Dr  %#lx/%ld => %s\n",
1167               CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes),
1168               data_addr, data_size, cacheRes(DrRes));
1169
1170     if (CLG_(current_state).collect) {
1171         ULong *cost_Ir, *cost_Dr;
1172
1173         if (CLG_(current_state).nonskipped) {
1174             cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
1175             cost_Dr = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DR);
1176         }
1177         else {
1178             cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];
1179             cost_Dr = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DR];
1180         }
1181
1182         inc_costs(IrRes, cost_Ir,
1183                   CLG_(current_state).cost + fullOffset(EG_IR) );
1184         inc_costs(DrRes, cost_Dr,
1185                   CLG_(current_state).cost + fullOffset(EG_DR) );
1186     }
1187 }
1188
1189
1190 /* Note that addEvent_D_guarded assumes that log_0I1Dr and log_0I1Dw
1191    have exactly the same prototype.  If you change them, you must
1192    change addEvent_D_guarded too. */
1193 VG_REGPARM(3)
1194 static void log_0I1Dr(InstrInfo* ii, Addr data_addr, Word data_size)
1195 {
1196     CacheModelResult DrRes;
1197
1198     current_ii = ii;
1199     DrRes = (*simulator.D1_Read)(data_addr, data_size);
1200
1201     CLG_DEBUG(6, "log_0I1Dr: Dr  %#lx/%ld => %s\n",
1202               data_addr, data_size, cacheRes(DrRes));
1203
1204     if (CLG_(current_state).collect) {
1205         ULong *cost_Dr;
1206
1207         if (CLG_(current_state).nonskipped)
1208             cost_Dr = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DR);
1209         else
1210             cost_Dr = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DR];
1211
1212         inc_costs(DrRes, cost_Dr,
1213                   CLG_(current_state).cost + fullOffset(EG_DR) );
1214     }
1215 }
1216
1217
1218 /* Instruction doing a write access */
1219
1220 VG_REGPARM(3)
1221 static void log_1I1Dw(InstrInfo* ii, Addr data_addr, Word data_size)
1222 {
1223     CacheModelResult IrRes, DwRes;
1224
1225     current_ii = ii;
1226     IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);
1227     DwRes = (*simulator.D1_Write)(data_addr, data_size);
1228
1229     CLG_DEBUG(6, "log_1I1Dw: Ir  %#lx/%u => %s, Dw  %#lx/%ld => %s\n",
1230               CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes),
1231               data_addr, data_size, cacheRes(DwRes));
1232
1233     if (CLG_(current_state).collect) {
1234         ULong *cost_Ir, *cost_Dw;
1235
1236         if (CLG_(current_state).nonskipped) {
1237             cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
1238             cost_Dw = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DW);
1239         }
1240         else {
1241             cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];
1242             cost_Dw = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DW];
1243         }
1244
1245         inc_costs(IrRes, cost_Ir,
1246                   CLG_(current_state).cost + fullOffset(EG_IR) );
1247         inc_costs(DwRes, cost_Dw,
1248                   CLG_(current_state).cost + fullOffset(EG_DW) );
1249     }
1250 }
1251
1252 /* See comment on log_0I1Dr. */
1253 VG_REGPARM(3)
1254 static void log_0I1Dw(InstrInfo* ii, Addr data_addr, Word data_size)
1255 {
1256     CacheModelResult DwRes;
1257
1258     current_ii = ii;
1259     DwRes = (*simulator.D1_Write)(data_addr, data_size);
1260
1261     CLG_DEBUG(6, "log_0I1Dw: Dw  %#lx/%ld => %s\n",
1262               data_addr, data_size, cacheRes(DwRes));
1263
1264     if (CLG_(current_state).collect) {
1265         ULong *cost_Dw;
1266
1267         if (CLG_(current_state).nonskipped)
1268             cost_Dw = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DW);
1269         else
1270             cost_Dw = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DW];
1271
1272         inc_costs(DwRes, cost_Dw,
1273                   CLG_(current_state).cost + fullOffset(EG_DW) );
1274     }
1275 }
1276
1277
1278
1279 /*------------------------------------------------------------*/
1280 /*--- Cache configuration                                  ---*/
1281 /*------------------------------------------------------------*/
1282
1283 static cache_t clo_I1_cache = UNDEFINED_CACHE;
1284 static cache_t clo_D1_cache = UNDEFINED_CACHE;
1285 static cache_t clo_LL_cache = UNDEFINED_CACHE;
1286
1287 /* Initialize and clear simulator state */
1288 static void cachesim_post_clo_init(void)
1289 {
1290   /* Cache configurations. */
1291   cache_t  I1c, D1c, LLc;
1292
1293   /* Initialize access handlers */
1294   if (!CLG_(clo).simulate_cache) {
1295     CLG_(cachesim).log_1I0D  = 0;
1296     CLG_(cachesim).log_1I0D_name = "(no function)";
1297     CLG_(cachesim).log_2I0D  = 0;
1298     CLG_(cachesim).log_2I0D_name = "(no function)";
1299     CLG_(cachesim).log_3I0D  = 0;
1300     CLG_(cachesim).log_3I0D_name = "(no function)";
1301
1302     CLG_(cachesim).log_1I1Dr = 0;
1303     CLG_(cachesim).log_1I1Dr_name = "(no function)";
1304     CLG_(cachesim).log_1I1Dw = 0;
1305     CLG_(cachesim).log_1I1Dw_name = "(no function)";
1306
1307     CLG_(cachesim).log_0I1Dr = 0;
1308     CLG_(cachesim).log_0I1Dr_name = "(no function)";
1309     CLG_(cachesim).log_0I1Dw = 0;
1310     CLG_(cachesim).log_0I1Dw_name = "(no function)";
1311     return;
1312   }
1313
1314   /* Configuration of caches only needed with real cache simulation */
1315   VG_(post_clo_init_configure_caches)(&I1c, &D1c, &LLc,
1316                                       &clo_I1_cache,
1317                                       &clo_D1_cache,
1318                                       &clo_LL_cache);
1319
1320   I1.name = "I1";
1321   D1.name = "D1";
1322   LL.name = "LL";
1323
1324   // min_line_size is used to make sure that we never feed
1325   // accesses to the simulator straddling more than two
1326   // cache lines at any cache level
1327   CLG_(min_line_size) = (I1c.line_size < D1c.line_size)
1328                            ? I1c.line_size : D1c.line_size;
1329   CLG_(min_line_size) = (LLc.line_size < CLG_(min_line_size))
1330                            ? LLc.line_size : CLG_(min_line_size);
1331
1332   Int largest_load_or_store_size
1333      = VG_(machine_get_size_of_largest_guest_register)();
1334   if (CLG_(min_line_size) < largest_load_or_store_size) {
1335      /* We can't continue, because the cache simulation might
1336         straddle more than 2 lines, and it will assert.  So let's
1337         just stop before we start. */
1338      VG_(umsg)("Callgrind: cannot continue: the minimum line size (%d)\n",
1339                (Int)CLG_(min_line_size));
1340      VG_(umsg)("  must be equal to or larger than the maximum register size (%d)\n",
1341                largest_load_or_store_size );
1342      VG_(umsg)("  but it is not.  Exiting now.\n");
1343      VG_(exit)(1);
1344   }
1345
1346   cachesim_initcache(I1c, &I1);
1347   cachesim_initcache(D1c, &D1);
1348   cachesim_initcache(LLc, &LL);
1349
1350   /* the other cache simulators use the standard helpers
1351    * with dispatching via simulator struct */
1352
1353   CLG_(cachesim).log_1I0D  = log_1I0D;
1354   CLG_(cachesim).log_1I0D_name  = "log_1I0D";
1355   CLG_(cachesim).log_2I0D  = log_2I0D;
1356   CLG_(cachesim).log_2I0D_name  = "log_2I0D";
1357   CLG_(cachesim).log_3I0D  = log_3I0D;
1358   CLG_(cachesim).log_3I0D_name  = "log_3I0D";
1359
1360   CLG_(cachesim).log_1I1Dr = log_1I1Dr;
1361   CLG_(cachesim).log_1I1Dw = log_1I1Dw;
1362   CLG_(cachesim).log_1I1Dr_name = "log_1I1Dr";
1363   CLG_(cachesim).log_1I1Dw_name = "log_1I1Dw";
1364
1365   CLG_(cachesim).log_0I1Dr = log_0I1Dr;
1366   CLG_(cachesim).log_0I1Dw = log_0I1Dw;
1367   CLG_(cachesim).log_0I1Dr_name = "log_0I1Dr";
1368   CLG_(cachesim).log_0I1Dw_name = "log_0I1Dw";
1369
1370   if (clo_collect_cacheuse) {
1371
1372       /* Output warning for not supported option combinations */
1373       if (clo_simulate_hwpref) {
1374           VG_(message)(Vg_DebugMsg,
1375                        "warning: prefetch simulation can not be "
1376                        "used with cache usage\n");
1377           clo_simulate_hwpref = False;
1378       }
1379
1380       if (clo_simulate_writeback) {
1381           VG_(message)(Vg_DebugMsg,
1382                        "warning: write-back simulation can not be "
1383                        "used with cache usage\n");
1384           clo_simulate_writeback = False;
1385       }
1386
1387       simulator.I1_Read  = cacheuse_I1_doRead;
1388       simulator.D1_Read  = cacheuse_D1_doRead;
1389       simulator.D1_Write = cacheuse_D1_doRead;
1390       return;
1391   }
1392
1393   if (clo_simulate_hwpref) {
1394     prefetch_clear();
1395
1396     if (clo_simulate_writeback) {
1397       simulator.I1_Read  = prefetch_I1_Read;
1398       simulator.D1_Read  = prefetch_D1_Read;
1399       simulator.D1_Write = prefetch_D1_Write;
1400     }
1401     else {
1402       simulator.I1_Read  = prefetch_I1_ref;
1403       simulator.D1_Read  = prefetch_D1_ref;
1404       simulator.D1_Write = prefetch_D1_ref;
1405     }
1406
1407     return;
1408   }
1409
1410   if (clo_simulate_writeback) {
1411       simulator.I1_Read  = cachesim_I1_Read;
1412       simulator.D1_Read  = cachesim_D1_Read;
1413       simulator.D1_Write = cachesim_D1_Write;
1414   }
1415   else {
1416       simulator.I1_Read  = cachesim_I1_ref;
1417       simulator.D1_Read  = cachesim_D1_ref;
1418       simulator.D1_Write = cachesim_D1_ref;
1419   }
1420 }
1421
1422
1423 /* Clear simulator state. Has to be initialized before */
1424 static
1425 void cachesim_clear(void)
1426 {
1427   cachesim_clearcache(&I1);
1428   cachesim_clearcache(&D1);
1429   cachesim_clearcache(&LL);
1430
1431   prefetch_clear();
1432 }
1433
1434
1435 static void cachesim_dump_desc(VgFile *fp)
1436 {
1437   VG_(fprintf)(fp, "\ndesc: I1 cache: %s\n", I1.desc_line);
1438   VG_(fprintf)(fp, "desc: D1 cache: %s\n", D1.desc_line);
1439   VG_(fprintf)(fp, "desc: LL cache: %s\n", LL.desc_line);
1440 }
1441
1442 static
1443 void cachesim_print_opts(void)
1444 {
1445   VG_(printf)(
1446 "\n   cache simulator options (does cache simulation if used):\n"
1447 "    --simulate-wb=no|yes      Count write-back events [no]\n"
1448 "    --simulate-hwpref=no|yes  Simulate hardware prefetch [no]\n"
1449 #if CLG_EXPERIMENTAL
1450 "    --simulate-sectors=no|yes Simulate sectored behaviour [no]\n"
1451 #endif
1452 "    --cacheuse=no|yes         Collect cache block use [no]\n");
1453   VG_(print_cache_clo_opts)();
1454 }
1455
1456 /* Check for command line option for cache configuration.
1457  * Return False if unknown and not handled.
1458  *
1459  * Called from CLG_(process_cmd_line_option)() in clo.c
1460  */
1461 static Bool cachesim_parse_opt(const HChar* arg)
1462 {
1463    if      VG_BOOL_CLO(arg, "--simulate-wb",      clo_simulate_writeback) {}
1464    else if VG_BOOL_CLO(arg, "--simulate-hwpref",  clo_simulate_hwpref)    {}
1465    else if VG_BOOL_CLO(arg, "--simulate-sectors", clo_simulate_sectors)   {}
1466
1467    else if VG_BOOL_CLO(arg, "--cacheuse", clo_collect_cacheuse) {
1468       if (clo_collect_cacheuse) {
1469          /* Use counters only make sense with fine dumping */
1470          CLG_(clo).dump_instr = True;
1471       }
1472    }
1473
1474    else if (VG_(str_clo_cache_opt)(arg,
1475                                    &clo_I1_cache,
1476                                    &clo_D1_cache,
1477                                    &clo_LL_cache)) {}
1478
1479    else
1480      return False;
1481
1482   return True;
1483 }
1484
1485 static
1486 void cachesim_printstat(Int l1, Int l2, Int l3)
1487 {
1488   FullCost total = CLG_(total_cost), D_total = 0;
1489   ULong LL_total_m, LL_total_mr, LL_total_mw,
1490     LL_total, LL_total_r, LL_total_w;
1491
1492   if ((VG_(clo_verbosity) >1) && clo_simulate_hwpref) {
1493     VG_(message)(Vg_DebugMsg, "Prefetch Up:       %llu\n",
1494                  prefetch_up);
1495     VG_(message)(Vg_DebugMsg, "Prefetch Down:     %llu\n",
1496                  prefetch_down);
1497     VG_(message)(Vg_DebugMsg, "\n");
1498   }
1499
1500   VG_(message)(Vg_UserMsg, "I1  misses:    %'*llu\n", l1,
1501                total[fullOffset(EG_IR) +1]);
1502
1503   VG_(message)(Vg_UserMsg, "LLi misses:    %'*llu\n", l1,
1504                total[fullOffset(EG_IR) +2]);
1505
1506   if (0 == total[fullOffset(EG_IR)])
1507     total[fullOffset(EG_IR)] = 1;
1508
1509   VG_(message)(Vg_UserMsg, "I1  miss rate: %*.2f%%\n", l1,
1510                total[fullOffset(EG_IR)+1] * 100.0 / total[fullOffset(EG_IR)]);
1511
1512   VG_(message)(Vg_UserMsg, "LLi miss rate: %*.2f%%\n", l1,
1513                total[fullOffset(EG_IR)+2] * 100.0 / total[fullOffset(EG_IR)]);
1514
1515   VG_(message)(Vg_UserMsg, "\n");
1516
1517   /* D cache results.
1518      Use the D_refs.rd and D_refs.wr values to determine the
1519    * width of columns 2 & 3. */
1520
1521   D_total = CLG_(get_eventset_cost)( CLG_(sets).full );
1522   CLG_(init_cost)( CLG_(sets).full, D_total);
1523   // we only use the first 3 values of D_total, adding up Dr and Dw costs
1524   CLG_(copy_cost)( CLG_(get_event_set)(EG_DR), D_total, total + fullOffset(EG_DR) );
1525   CLG_(add_cost) ( CLG_(get_event_set)(EG_DW), D_total, total + fullOffset(EG_DW) );
1526
1527   VG_(message)(Vg_UserMsg, "D   refs:      %'*llu  (%'*llu rd + %'*llu wr)\n",
1528                l1, D_total[0],
1529                l2, total[fullOffset(EG_DR)],
1530                l3, total[fullOffset(EG_DW)]);
1531
1532   VG_(message)(Vg_UserMsg, "D1  misses:    %'*llu  (%'*llu rd + %'*llu wr)\n",
1533                l1, D_total[1],
1534                l2, total[fullOffset(EG_DR)+1],
1535                l3, total[fullOffset(EG_DW)+1]);
1536
1537   VG_(message)(Vg_UserMsg, "LLd misses:    %'*llu  (%'*llu rd + %'*llu wr)\n",
1538                l1, D_total[2],
1539                l2, total[fullOffset(EG_DR)+2],
1540                l3, total[fullOffset(EG_DW)+2]);
1541
1542   if (0 == D_total[0])   D_total[0] = 1;
1543   if (0 == total[fullOffset(EG_DR)]) total[fullOffset(EG_DR)] = 1;
1544   if (0 == total[fullOffset(EG_DW)]) total[fullOffset(EG_DW)] = 1;
1545
1546   VG_(message)(Vg_UserMsg, "D1  miss rate: %*.1f%% (%*.1f%%   + %*.1f%%  )\n",
1547            l1, D_total[1] * 100.0 / D_total[0],
1548            l2, total[fullOffset(EG_DR)+1] * 100.0 / total[fullOffset(EG_DR)],
1549            l3, total[fullOffset(EG_DW)+1] * 100.0 / total[fullOffset(EG_DW)]);
1550
1551   VG_(message)(Vg_UserMsg, "LLd miss rate: %*.1f%% (%*.1f%%   + %*.1f%%  )\n",
1552            l1, D_total[2] * 100.0 / D_total[0],
1553            l2, total[fullOffset(EG_DR)+2] * 100.0 / total[fullOffset(EG_DR)],
1554            l3, total[fullOffset(EG_DW)+2] * 100.0 / total[fullOffset(EG_DW)]);
1555   VG_(message)(Vg_UserMsg, "\n");
1556
1557
1558
1559   /* LL overall results */
1560
1561   LL_total   =
1562     total[fullOffset(EG_DR) +1] +
1563     total[fullOffset(EG_DW) +1] +
1564     total[fullOffset(EG_IR) +1];
1565   LL_total_r =
1566     total[fullOffset(EG_DR) +1] +
1567     total[fullOffset(EG_IR) +1];
1568   LL_total_w = total[fullOffset(EG_DW) +1];
1569   VG_(message)(Vg_UserMsg, "LL refs:       %'*llu  (%'*llu rd + %'*llu wr)\n",
1570                l1, LL_total, l2, LL_total_r, l3, LL_total_w);
1571
1572   LL_total_m  =
1573     total[fullOffset(EG_DR) +2] +
1574     total[fullOffset(EG_DW) +2] +
1575     total[fullOffset(EG_IR) +2];
1576   LL_total_mr =
1577     total[fullOffset(EG_DR) +2] +
1578     total[fullOffset(EG_IR) +2];
1579   LL_total_mw = total[fullOffset(EG_DW) +2];
1580   VG_(message)(Vg_UserMsg, "LL misses:     %'*llu  (%'*llu rd + %'*llu wr)\n",
1581                l1, LL_total_m, l2, LL_total_mr, l3, LL_total_mw);
1582
1583   VG_(message)(Vg_UserMsg, "LL miss rate:  %*.1f%% (%*.1f%%   + %*.1f%%  )\n",
1584           l1, LL_total_m  * 100.0 / (total[fullOffset(EG_IR)] + D_total[0]),
1585           l2, LL_total_mr * 100.0 / (total[fullOffset(EG_IR)] + total[fullOffset(EG_DR)]),
1586           l3, LL_total_mw * 100.0 / total[fullOffset(EG_DW)]);
1587 }
1588
1589
1590 /*------------------------------------------------------------*/
1591 /*--- Setup for Event set.                                 ---*/
1592 /*------------------------------------------------------------*/
1593
1594 struct event_sets CLG_(sets);
1595
1596 void CLG_(init_eventsets)()
1597 {
1598     // Event groups from which the event sets are composed
1599     // the "Use" group only is used with "cacheuse" simulation
1600     if (clo_collect_cacheuse)
1601         CLG_(register_event_group4)(EG_USE,
1602                                     "AcCost1", "SpLoss1", "AcCost2", "SpLoss2");
1603
1604     if (!CLG_(clo).simulate_cache)
1605         CLG_(register_event_group)(EG_IR, "Ir");
1606     else if (!clo_simulate_writeback) {
1607         CLG_(register_event_group3)(EG_IR, "Ir", "I1mr", "ILmr");
1608         CLG_(register_event_group3)(EG_DR, "Dr", "D1mr", "DLmr");
1609         CLG_(register_event_group3)(EG_DW, "Dw", "D1mw", "DLmw");
1610     }
1611     else { // clo_simulate_writeback
1612         CLG_(register_event_group4)(EG_IR, "Ir", "I1mr", "ILmr", "ILdmr");
1613         CLG_(register_event_group4)(EG_DR, "Dr", "D1mr", "DLmr", "DLdmr");
1614         CLG_(register_event_group4)(EG_DW, "Dw", "D1mw", "DLmw", "DLdmw");
1615     }
1616
1617     if (CLG_(clo).simulate_branch) {
1618         CLG_(register_event_group2)(EG_BC, "Bc", "Bcm");
1619         CLG_(register_event_group2)(EG_BI, "Bi", "Bim");
1620     }
1621
1622     if (CLG_(clo).collect_bus)
1623         CLG_(register_event_group)(EG_BUS, "Ge");
1624
1625     if (CLG_(clo).collect_alloc)
1626         CLG_(register_event_group2)(EG_ALLOC, "allocCount", "allocSize");
1627
1628     if (CLG_(clo).collect_systime != systime_no) {
1629        if (CLG_(clo).collect_systime == systime_nsec)
1630           CLG_(register_event_group3)(EG_SYS, "sysCount", "sysTime", "sysCpuTime");
1631        else
1632           CLG_(register_event_group2)(EG_SYS, "sysCount", "sysTime");
1633     }
1634
1635     // event set used as base for instruction self cost
1636     CLG_(sets).base = CLG_(get_event_set2)(EG_USE, EG_IR);
1637
1638     // event set comprising all event groups, used for inclusive cost
1639     CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).base, EG_DR, EG_DW);
1640     CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).full, EG_BC, EG_BI);
1641     CLG_(sets).full = CLG_(add_event_group) (CLG_(sets).full, EG_BUS);
1642     CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).full, EG_ALLOC, EG_SYS);
1643
1644     CLG_DEBUGIF(1) {
1645         CLG_DEBUG(1, "EventSets:\n");
1646         CLG_(print_eventset)(-2, CLG_(sets).base);
1647         CLG_(print_eventset)(-2, CLG_(sets).full);
1648     }
1649
1650     /* Not-existing events are silently ignored */
1651     CLG_(dumpmap) = CLG_(get_eventmapping)(CLG_(sets).full);
1652     CLG_(append_event)(CLG_(dumpmap), "Ir");
1653     CLG_(append_event)(CLG_(dumpmap), "Dr");
1654     CLG_(append_event)(CLG_(dumpmap), "Dw");
1655     CLG_(append_event)(CLG_(dumpmap), "I1mr");
1656     CLG_(append_event)(CLG_(dumpmap), "D1mr");
1657     CLG_(append_event)(CLG_(dumpmap), "D1mw");
1658     CLG_(append_event)(CLG_(dumpmap), "ILmr");
1659     CLG_(append_event)(CLG_(dumpmap), "DLmr");
1660     CLG_(append_event)(CLG_(dumpmap), "DLmw");
1661     CLG_(append_event)(CLG_(dumpmap), "ILdmr");
1662     CLG_(append_event)(CLG_(dumpmap), "DLdmr");
1663     CLG_(append_event)(CLG_(dumpmap), "DLdmw");
1664     CLG_(append_event)(CLG_(dumpmap), "Bc");
1665     CLG_(append_event)(CLG_(dumpmap), "Bcm");
1666     CLG_(append_event)(CLG_(dumpmap), "Bi");
1667     CLG_(append_event)(CLG_(dumpmap), "Bim");
1668     CLG_(append_event)(CLG_(dumpmap), "AcCost1");
1669     CLG_(append_event)(CLG_(dumpmap), "SpLoss1");
1670     CLG_(append_event)(CLG_(dumpmap), "AcCost2");
1671     CLG_(append_event)(CLG_(dumpmap), "SpLoss2");
1672     CLG_(append_event)(CLG_(dumpmap), "Ge");
1673     CLG_(append_event)(CLG_(dumpmap), "allocCount");
1674     CLG_(append_event)(CLG_(dumpmap), "allocSize");
1675     CLG_(append_event)(CLG_(dumpmap), "sysCount");
1676     CLG_(append_event)(CLG_(dumpmap), "sysTime");
1677     CLG_(append_event)(CLG_(dumpmap), "sysCpuTime");
1678 }
1679
1680
1681 /* this is called at dump time for every instruction executed */
1682 static void cachesim_add_icost(SimCost cost, BBCC* bbcc,
1683                                InstrInfo* ii, ULong exe_count)
1684 {
1685     if (!CLG_(clo).simulate_cache)
1686         cost[ fullOffset(EG_IR) ] += exe_count;
1687
1688     if (ii->eventset)
1689         CLG_(add_and_zero_cost2)( CLG_(sets).full, cost,
1690                                   ii->eventset, bbcc->cost + ii->cost_offset);
1691 }
1692
1693 static
1694 void cachesim_finish(void)
1695 {
1696   if (clo_collect_cacheuse)
1697     cacheuse_finish();
1698 }
1699
1700 /*------------------------------------------------------------*/
1701 /*--- The simulator defined in this file                   ---*/
1702 /*------------------------------------------------------------*/
1703
1704 struct cachesim_if CLG_(cachesim) = {
1705   .print_opts    = cachesim_print_opts,
1706   .parse_opt     = cachesim_parse_opt,
1707   .post_clo_init = cachesim_post_clo_init,
1708   .clear         = cachesim_clear,
1709   .dump_desc     = cachesim_dump_desc,
1710   .printstat     = cachesim_printstat,
1711   .add_icost     = cachesim_add_icost,
1712   .finish        = cachesim_finish,
1713
1714   /* these will be set by cachesim_post_clo_init */
1715   .log_1I0D        = 0,
1716   .log_2I0D        = 0,
1717   .log_3I0D        = 0,
1718
1719   .log_1I1Dr       = 0,
1720   .log_1I1Dw       = 0,
1721
1722   .log_0I1Dr       = 0,
1723   .log_0I1Dw       = 0,
1724
1725   .log_1I0D_name = "(no function)",
1726   .log_2I0D_name = "(no function)",
1727   .log_3I0D_name = "(no function)",
1728
1729   .log_1I1Dr_name = "(no function)",
1730   .log_1I1Dw_name = "(no function)",
1731
1732   .log_0I1Dr_name = "(no function)",
1733   .log_0I1Dw_name = "(no function)",
1734 };
1735
1736
1737 /*--------------------------------------------------------------------*/
1738 /*--- end                                                 ct_sim.c ---*/
1739 /*--------------------------------------------------------------------*/