callgrind/sim.c

   1 /*--------------------------------------------------------------------*/
   2 /*--- Cache simulation.                                            ---*/
   3 /*---                                                        sim.c ---*/
   4 /*--------------------------------------------------------------------*/
   5
   6 /*
   7    This file is part of Callgrind, a Valgrind tool for call graph
   8    profiling programs.
   9
  10    Copyright (C) 2003-2013, Josef Weidendorfer (Josef.Weidendorfer@gmx.de)
  11
  12    This tool is derived from and contains code from Cachegrind
  13    Copyright (C) 2002-2013 Nicholas Nethercote (njn@valgrind.org)
  14
  15    This program is free software; you can redistribute it and/or
  16    modify it under the terms of the GNU General Public License as
  17    published by the Free Software Foundation; either version 2 of the
  18    License, or (at your option) any later version.
  19
  20    This program is distributed in the hope that it will be useful, but
  21    WITHOUT ANY WARRANTY; without even the implied warranty of
  22    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  23    General Public License for more details.
  24
  25    You should have received a copy of the GNU General Public License
  26    along with this program; if not, write to the Free Software
  27    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  28    02111-1307, USA.
  29
  30    The GNU General Public License is contained in the file COPYING.
  31 */
  32
  33 #include "global.h"
  34
  35
  36 /* Notes:
  37   - simulates a write-allocate cache
  38   - (block --> set) hash function uses simple bit selection
  39   - handling of references straddling two cache blocks:
  40       - counts as only one cache access (not two)
  41       - both blocks hit                  --> one hit
  42       - one block hits, the other misses --> one miss
  43       - both blocks miss                 --> one miss (not two)
  44 */
  45
  46 /* Cache configuration */
  47 #include "cg_arch.c"
  48
/* Additional structures for cache use info, separated
 * according to usage frequency:
 * - line_loaded : pointer to the cost center of the instruction
 *                 which loaded the line into the cache.
 *                 Needed to increment counters when the line is evicted.
 * - line_use    : updated on every access
 */
  56 typedef struct {
  57   UInt count;
  58   UInt mask; /* e.g. for 64Byte line size 1bit/2Byte */
  59 } line_use;
  60
  61 typedef struct {
  62   Addr memline, iaddr;
  63   line_use* dep_use; /* point to higher-level cacheblock for this memline */
  64   ULong* use_base;
  65 } line_loaded;
  66
  67 /* Cache state */
  68 typedef struct {
  69    const HChar* name;
  70    int          size;                   /* bytes */
  71    int          assoc;
  72    int          line_size;              /* bytes */
  73    Bool         sectored;  /* prefetch nearside cacheline on read */
  74    int          sets;
  75    int          sets_min_1;
  76    int          line_size_bits;
  77    int          tag_shift;
  78    UWord        tag_mask;
  79    HChar        desc_line[128];    // large enough
  80    UWord*       tags;
  81
  82   /* for cache use */
  83    int          line_size_mask;
  84    int*         line_start_mask;
  85    int*         line_end_mask;
  86    line_loaded* loaded;
  87    line_use*    use;
  88 } cache_t2;
  89
  90 /*
  91  * States of flat caches in our model.
  92  * We use a 2-level hierarchy,
  93  */
  94 static cache_t2 I1, D1, LL;
  95
  96 /* Lower bits of cache tags are used as flags for a cache line */
  97 #define CACHELINE_FLAGMASK (MIN_LINE_SIZE-1)
  98 #define CACHELINE_DIRTY    1
  99
 100
 101 /* Cache simulator Options */
 102 static Bool clo_simulate_writeback = False;
 103 static Bool clo_simulate_hwpref = False;
 104 static Bool clo_simulate_sectors = False;
 105 static Bool clo_collect_cacheuse = False;
 106
 107 /* Following global vars are setup before by setup_bbcc():
 108  *
 109  * - Addr   CLG_(bb_base)     (instruction start address of original BB)
 110  * - ULong* CLG_(cost_base)   (start of cost array for BB)
 111  */
 112
 113 Addr   CLG_(bb_base);
 114 ULong* CLG_(cost_base);
 115
 116 static InstrInfo* current_ii;
 117
 118 /* Cache use offsets */
 119 /* The offsets are only correct because all per-instruction event sets get
 120  * the "Use" set added first !
 121  */
 122 static Int off_I1_AcCost  = 0;
 123 static Int off_I1_SpLoss  = 1;
 124 static Int off_D1_AcCost  = 0;
 125 static Int off_D1_SpLoss  = 1;
 126 static Int off_LL_AcCost  = 2;
 127 static Int off_LL_SpLoss  = 3;
 128
 129 /* Cache access types */
 130 typedef enum { Read = 0, Write = CACHELINE_DIRTY } RefType;
 131
 132 /* Result of a reference into a flat cache */
 133 typedef enum { Hit  = 0, Miss, MissDirty } CacheResult;
 134
 135 /* Result of a reference into a hierarchical cache model */
 136 typedef enum {
 137     L1_Hit,
 138     LL_Hit,
 139     MemAccess,
 140     WriteBackMemAccess } CacheModelResult;
 141
 142 typedef CacheModelResult (*simcall_type)(Addr, UChar);
 143
 144 static struct {
 145     simcall_type I1_Read;
 146     simcall_type D1_Read;
 147     simcall_type D1_Write;
 148 } simulator;
 149
 150 /*------------------------------------------------------------*/
 151 /*--- Cache Simulator Initialization                       ---*/
 152 /*------------------------------------------------------------*/
 153
 154 static void cachesim_clearcache(cache_t2* c)
 155 {
 156   Int i;
 157
 158   for (i = 0; i < c->sets * c->assoc; i++)
 159     c->tags[i] = 0;
 160   if (c->use) {
 161     for (i = 0; i < c->sets * c->assoc; i++) {
 162       c->loaded[i].memline  = 0;
 163       c->loaded[i].use_base = 0;
 164       c->loaded[i].dep_use = 0;
 165       c->loaded[i].iaddr = 0;
 166       c->use[i].mask    = 0;
 167       c->use[i].count   = 0;
 168       c->tags[i] = i % c->assoc; /* init lower bits as pointer */
 169     }
 170   }
 171 }
 172
 173 static void cacheuse_initcache(cache_t2* c);
 174
 175 /* By this point, the size/assoc/line_size has been checked. */
 176 static void cachesim_initcache(cache_t config, cache_t2* c)
 177 {
 178    c->size      = config.size;
 179    c->assoc     = config.assoc;
 180    c->line_size = config.line_size;
 181    c->sectored  = False; // FIXME
 182
 183    c->sets           = (c->size / c->line_size) / c->assoc;
 184    c->sets_min_1     = c->sets - 1;
 185    c->line_size_bits = VG_(log2)(c->line_size);
 186    c->tag_shift     = c->line_size_bits + VG_(log2)(c->sets);
 187    c->tag_mask       = ~((1u<<c->tag_shift)-1);
 188
 189    /* Can bits in tag entries be used for flags?
 190     * Should be always true as MIN_LINE_SIZE >= 16 */
 191    CLG_ASSERT( (c->tag_mask & CACHELINE_FLAGMASK) == 0);
 192
 193    if (c->assoc == 1) {
 194       VG_(sprintf)(c->desc_line, "%d B, %d B, direct-mapped%s",
 195                    c->size, c->line_size,
 196                    c->sectored ? ", sectored":"");
 197    } else {
 198       VG_(sprintf)(c->desc_line, "%d B, %d B, %d-way associative%s",
 199                    c->size, c->line_size, c->assoc,
 200                    c->sectored ? ", sectored":"");
 201    }
 202
 203    c->tags = (UWord*) CLG_MALLOC("cl.sim.cs_ic.1",
 204                                  sizeof(UWord) * c->sets * c->assoc);
 205    if (clo_collect_cacheuse)
 206        cacheuse_initcache(c);
 207    else
 208      c->use = 0;
 209    cachesim_clearcache(c);
 210 }
 211
 212
 213 #if 0
 214 static void print_cache(cache_t2* c)
 215 {
 216    UInt set, way, i;
 217
 218    /* Note initialisation and update of 'i'. */
 219    for (i = 0, set = 0; set < c->sets; set++) {
 220       for (way = 0; way < c->assoc; way++, i++) {
 221          VG_(printf)("%8x ", c->tags[i]);
 222       }
 223       VG_(printf)("\n");
 224    }
 225 }
 226 #endif
 227
 228
 229 /*------------------------------------------------------------*/
 230 /*--- Simple Cache Simulation                              ---*/
 231 /*------------------------------------------------------------*/
 232
 233 /*
 234  * Model: single inclusive, 2-level cache hierarchy (L1/LL)
 235  *        with write-allocate
 236  *
 237  * For simple cache hit/miss counts, we do not have to
 238  * maintain the dirty state of lines (no need to distinguish
 239  * read/write references), and the resulting counts are the
 240  * same for write-through and write-back caches.
 241  *
 242  * Simulator functions:
 243  *  CacheModelResult cachesim_I1_ref(Addr a, UChar size)
 244  *  CacheModelResult cachesim_D1_ref(Addr a, UChar size)
 245  */
 246 __attribute__((always_inline))
 247 static __inline__
 248 CacheResult cachesim_setref(cache_t2* c, UInt set_no, UWord tag)
 249 {
 250     int i, j;
 251     UWord *set;
 252
 253     set = &(c->tags[set_no * c->assoc]);
 254
 255     /* This loop is unrolled for just the first case, which is the most */
 256     /* common.  We can't unroll any further because it would screw up   */
 257     /* if we have a direct-mapped (1-way) cache.                        */
 258     if (tag == set[0])
 259         return Hit;
 260
 261     /* If the tag is one other than the MRU, move it into the MRU spot  */
 262     /* and shuffle the rest down.                                       */
 263     for (i = 1; i < c->assoc; i++) {
 264         if (tag == set[i]) {
 265             for (j = i; j > 0; j--) {
 266                 set[j] = set[j - 1];
 267             }
 268             set[0] = tag;
 269             return Hit;
 270         }
 271     }
 272
 273     /* A miss;  install this tag as MRU, shuffle rest down. */
 274     for (j = c->assoc - 1; j > 0; j--) {
 275         set[j] = set[j - 1];
 276     }
 277     set[0] = tag;
 278
 279     return Miss;
 280 }
 281
 282 __attribute__((always_inline))
 283 static __inline__
 284 CacheResult cachesim_ref(cache_t2* c, Addr a, UChar size)
 285 {
 286     UWord block1 =  a         >> c->line_size_bits;
 287     UWord block2 = (a+size-1) >> c->line_size_bits;
 288     UInt  set1   = block1 & c->sets_min_1;
 289     /* the tag does not need to include bits specifying the set,
 290      * but it can, and this saves instructions */
 291     UWord tag1   = block1;
 292
 293     /* Access entirely within line. */
 294     if (block1 == block2)
 295         return cachesim_setref(c, set1, tag1);
 296
 297     /* Access straddles two lines. */
 298     else if (block1 + 1 == block2) {
 299         UInt  set2 = block2 & c->sets_min_1;
 300         UWord tag2 = block2;
 301
 302         /* the call updates cache structures as side effect */
 303         CacheResult res1 =  cachesim_setref(c, set1, tag1);
 304         CacheResult res2 =  cachesim_setref(c, set2, tag2);
 305         return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;
 306
 307    } else {
 308        VG_(printf)("addr: %lx  size: %u  blocks: %ld %ld",
 309                    a, size, block1, block2);
 310        VG_(tool_panic)("item straddles more than two cache sets");
 311    }
 312    return Hit;
 313 }
 314
 315 static
 316 CacheModelResult cachesim_I1_ref(Addr a, UChar size)
 317 {
 318     if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
 319     if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
 320     return MemAccess;
 321 }
 322
 323 static
 324 CacheModelResult cachesim_D1_ref(Addr a, UChar size)
 325 {
 326     if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
 327     if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
 328     return MemAccess;
 329 }
 330
 331
 332 /*------------------------------------------------------------*/
 333 /*--- Write Back Cache Simulation                          ---*/
 334 /*------------------------------------------------------------*/
 335
 336 /*
 337  * More complex model: L1 Write-through, LL Write-back
 338  * This needs to distinguish among read and write references.
 339  *
 340  * Simulator functions:
 341  *  CacheModelResult cachesim_I1_Read(Addr a, UChar size)
 342  *  CacheModelResult cachesim_D1_Read(Addr a, UChar size)
 343  *  CacheModelResult cachesim_D1_Write(Addr a, UChar size)
 344  */
 345
 346 /*
 347  * With write-back, result can be a miss evicting a dirty line
 348  * The dirty state of a cache line is stored in Bit0 of the tag for
 349  * this cache line (CACHELINE_DIRTY = 1). By OR'ing the reference
 350  * type (Read/Write), the line gets dirty on a write.
 351  */
 352 __attribute__((always_inline))
 353 static __inline__
 354 CacheResult cachesim_setref_wb(cache_t2* c, RefType ref, UInt set_no, UWord tag)
 355 {
 356     int i, j;
 357     UWord *set, tmp_tag;
 358
 359     set = &(c->tags[set_no * c->assoc]);
 360
 361     /* This loop is unrolled for just the first case, which is the most */
 362     /* common.  We can't unroll any further because it would screw up   */
 363     /* if we have a direct-mapped (1-way) cache.                        */
 364     if (tag == (set[0] & ~CACHELINE_DIRTY)) {
 365         set[0] |= ref;
 366         return Hit;
 367     }
 368     /* If the tag is one other than the MRU, move it into the MRU spot  */
 369     /* and shuffle the rest down.                                       */
 370     for (i = 1; i < c->assoc; i++) {
 371         if (tag == (set[i] & ~CACHELINE_DIRTY)) {
 372             tmp_tag = set[i] | ref; // update dirty flag
 373             for (j = i; j > 0; j--) {
 374                 set[j] = set[j - 1];
 375             }
 376             set[0] = tmp_tag;
 377             return Hit;
 378         }
 379     }
 380
 381     /* A miss;  install this tag as MRU, shuffle rest down. */
 382     tmp_tag = set[c->assoc - 1];
 383     for (j = c->assoc - 1; j > 0; j--) {
 384         set[j] = set[j - 1];
 385     }
 386     set[0] = tag | ref;
 387
 388     return (tmp_tag & CACHELINE_DIRTY) ? MissDirty : Miss;
 389 }
 390
 391 __attribute__((always_inline))
 392 static __inline__
 393 CacheResult cachesim_ref_wb(cache_t2* c, RefType ref, Addr a, UChar size)
 394 {
 395     UInt set1 = ( a         >> c->line_size_bits) & (c->sets_min_1);
 396     UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
 397     UWord tag = a & c->tag_mask;
 398
 399     /* Access entirely within line. */
 400     if (set1 == set2)
 401         return cachesim_setref_wb(c, ref, set1, tag);
 402
 403     /* Access straddles two lines. */
 404     /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
 405     else if (((set1 + 1) & (c->sets_min_1)) == set2) {
 406         UWord tag2  = (a+size-1) & c->tag_mask;
 407
 408         /* the call updates cache structures as side effect */
 409         CacheResult res1 =  cachesim_setref_wb(c, ref, set1, tag);
 410         CacheResult res2 =  cachesim_setref_wb(c, ref, set2, tag2);
 411
 412         if ((res1 == MissDirty) || (res2 == MissDirty)) return MissDirty;
 413         return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;
 414
 415    } else {
 416        VG_(printf)("addr: %lx  size: %u  sets: %d %d", a, size, set1, set2);
 417        VG_(tool_panic)("item straddles more than two cache sets");
 418    }
 419    return Hit;
 420 }
 421
 422
 423 static
 424 CacheModelResult cachesim_I1_Read(Addr a, UChar size)
 425 {
 426     if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
 427     switch( cachesim_ref_wb( &LL, Read, a, size) ) {
 428         case Hit: return LL_Hit;
 429         case Miss: return MemAccess;
 430         default: break;
 431     }
 432     return WriteBackMemAccess;
 433 }
 434
 435 static
 436 CacheModelResult cachesim_D1_Read(Addr a, UChar size)
 437 {
 438     if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
 439     switch( cachesim_ref_wb( &LL, Read, a, size) ) {
 440         case Hit: return LL_Hit;
 441         case Miss: return MemAccess;
 442         default: break;
 443     }
 444     return WriteBackMemAccess;
 445 }
 446
 447 static
 448 CacheModelResult cachesim_D1_Write(Addr a, UChar size)
 449 {
 450     if ( cachesim_ref( &D1, a, size) == Hit ) {
 451         /* Even for a L1 hit, the write-trough L1 passes
 452          * the write to the LL to make the LL line dirty.
 453          * But this causes no latency, so return the hit.
 454          */
 455         cachesim_ref_wb( &LL, Write, a, size);
 456         return L1_Hit;
 457     }
 458     switch( cachesim_ref_wb( &LL, Write, a, size) ) {
 459         case Hit: return LL_Hit;
 460         case Miss: return MemAccess;
 461         default: break;
 462     }
 463     return WriteBackMemAccess;
 464 }
 465
 466
 467 /*------------------------------------------------------------*/
 468 /*--- Hardware Prefetch Simulation                         ---*/
 469 /*------------------------------------------------------------*/
 470
 471 static ULong prefetch_up = 0;
 472 static ULong prefetch_down = 0;
 473
 474 #define PF_STREAMS  8
 475 #define PF_PAGEBITS 12
 476
 477 static UInt pf_lastblock[PF_STREAMS];
 478 static Int  pf_seqblocks[PF_STREAMS];
 479
 480 static
 481 void prefetch_clear(void)
 482 {
 483   int i;
 484   for(i=0;i<PF_STREAMS;i++)
 485     pf_lastblock[i] = pf_seqblocks[i] = 0;
 486 }
 487
 488 /*
 489  * HW Prefetch emulation
 490  * Start prefetching when detecting sequential access to 3 memory blocks.
 491  * One stream can be detected per 4k page.
 492  */
 493 static __inline__
 494 void prefetch_LL_doref(Addr a)
 495 {
 496   UInt stream = (a >> PF_PAGEBITS) % PF_STREAMS;
 497   UInt block = ( a >> LL.line_size_bits);
 498
 499   if (block != pf_lastblock[stream]) {
 500     if (pf_seqblocks[stream] == 0) {
 501       if (pf_lastblock[stream] +1 == block) pf_seqblocks[stream]++;
 502       else if (pf_lastblock[stream] -1 == block) pf_seqblocks[stream]--;
 503     }
 504     else if (pf_seqblocks[stream] >0) {
 505       if (pf_lastblock[stream] +1 == block) {
 506         pf_seqblocks[stream]++;
 507         if (pf_seqblocks[stream] >= 2) {
 508           prefetch_up++;
 509           cachesim_ref(&LL, a + 5 * LL.line_size,1);
 510         }
 511       }
 512       else pf_seqblocks[stream] = 0;
 513     }
 514     else if (pf_seqblocks[stream] <0) {
 515       if (pf_lastblock[stream] -1 == block) {
 516         pf_seqblocks[stream]--;
 517         if (pf_seqblocks[stream] <= -2) {
 518           prefetch_down++;
 519           cachesim_ref(&LL, a - 5 * LL.line_size,1);
 520         }
 521       }
 522       else pf_seqblocks[stream] = 0;
 523     }
 524     pf_lastblock[stream] = block;
 525   }
 526 }
 527
 528 /* simple model with hardware prefetch */
 529
 530 static
 531 CacheModelResult prefetch_I1_ref(Addr a, UChar size)
 532 {
 533     if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
 534     prefetch_LL_doref(a);
 535     if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
 536     return MemAccess;
 537 }
 538
 539 static
 540 CacheModelResult prefetch_D1_ref(Addr a, UChar size)
 541 {
 542     if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
 543     prefetch_LL_doref(a);
 544     if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
 545     return MemAccess;
 546 }
 547
 548
 549 /* complex model with hardware prefetch */
 550
 551 static
 552 CacheModelResult prefetch_I1_Read(Addr a, UChar size)
 553 {
 554     if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
 555     prefetch_LL_doref(a);
 556     switch( cachesim_ref_wb( &LL, Read, a, size) ) {
 557         case Hit: return LL_Hit;
 558         case Miss: return MemAccess;
 559         default: break;
 560     }
 561     return WriteBackMemAccess;
 562 }
 563
 564 static
 565 CacheModelResult prefetch_D1_Read(Addr a, UChar size)
 566 {
 567     if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
 568     prefetch_LL_doref(a);
 569     switch( cachesim_ref_wb( &LL, Read, a, size) ) {
 570         case Hit: return LL_Hit;
 571         case Miss: return MemAccess;
 572         default: break;
 573     }
 574     return WriteBackMemAccess;
 575 }
 576
 577 static
 578 CacheModelResult prefetch_D1_Write(Addr a, UChar size)
 579 {
 580     prefetch_LL_doref(a);
 581     if ( cachesim_ref( &D1, a, size) == Hit ) {
 582         /* Even for a L1 hit, the write-trough L1 passes
 583          * the write to the LL to make the LL line dirty.
 584          * But this causes no latency, so return the hit.
 585          */
 586         cachesim_ref_wb( &LL, Write, a, size);
 587         return L1_Hit;
 588     }
 589     switch( cachesim_ref_wb( &LL, Write, a, size) ) {
 590         case Hit: return LL_Hit;
 591         case Miss: return MemAccess;
 592         default: break;
 593     }
 594     return WriteBackMemAccess;
 595 }
 596
 597
 598 /*------------------------------------------------------------*/
 599 /*--- Cache Simulation with use metric collection          ---*/
 600 /*------------------------------------------------------------*/
 601
 602 /* can not be combined with write-back or prefetch */
 603
 604 static
 605 void cacheuse_initcache(cache_t2* c)
 606 {
 607     int i;
 608     unsigned int start_mask, start_val;
 609     unsigned int end_mask, end_val;
 610
 611     c->use    = CLG_MALLOC("cl.sim.cu_ic.1",
 612                            sizeof(line_use) * c->sets * c->assoc);
 613     c->loaded = CLG_MALLOC("cl.sim.cu_ic.2",
 614                            sizeof(line_loaded) * c->sets * c->assoc);
 615     c->line_start_mask = CLG_MALLOC("cl.sim.cu_ic.3",
 616                                     sizeof(int) * c->line_size);
 617     c->line_end_mask = CLG_MALLOC("cl.sim.cu_ic.4",
 618                                   sizeof(int) * c->line_size);
 619
 620     c->line_size_mask = c->line_size-1;
 621
 622     /* Meaning of line_start_mask/line_end_mask
 623      * Example: for a given cache line, you get an access starting at
 624      * byte offset 5, length 4, byte 5 - 8 was touched. For a cache
 625      * line size of 32, you have 1 bit per byte in the mask:
 626      *
 627      *   bit31   bit8 bit5  bit 0
 628      *       |      |  |    |
 629      *       11..111111100000   line_start_mask[5]
 630      *       00..000111111111   line_end_mask[(5+4)-1]
 631      *
 632      *  use_mask |= line_start_mask[5] && line_end_mask[8]
 633      *
 634      */
 635     start_val = end_val = ~0;
 636     if (c->line_size < 32) {
 637         int bits_per_byte = 32/c->line_size;
 638         start_mask = (1<<bits_per_byte)-1;
 639         end_mask   = start_mask << (32-bits_per_byte);
 640         for(i=0;i<c->line_size;i++) {
 641             c->line_start_mask[i] = start_val;
 642             start_val  = start_val & ~start_mask;
 643             start_mask = start_mask << bits_per_byte;
 644
 645             c->line_end_mask[c->line_size-i-1] = end_val;
 646             end_val  = end_val & ~end_mask;
 647             end_mask = end_mask >> bits_per_byte;
 648         }
 649     }
 650     else {
 651         int bytes_per_bit = c->line_size/32;
 652         start_mask = 1;
 653         end_mask   = 1u << 31;
 654         for(i=0;i<c->line_size;i++) {
 655             c->line_start_mask[i] = start_val;
 656             c->line_end_mask[c->line_size-i-1] = end_val;
 657             if ( ((i+1)%bytes_per_bit) == 0) {
 658                 start_val   &= ~start_mask;
 659                 end_val     &= ~end_mask;
 660                 start_mask <<= 1;
 661                 end_mask   >>= 1;
 662             }
 663         }
 664     }
 665
 666     CLG_DEBUG(6, "Config %s:\n", c->desc_line);
 667     for(i=0;i<c->line_size;i++) {
 668         CLG_DEBUG(6, " [%2d]: start mask %8x, end mask %8x\n",
 669                   i, c->line_start_mask[i], c->line_end_mask[i]);
 670     }
 671
 672     /* We use lower tag bits as offset pointers to cache use info.
 673      * I.e. some cache parameters don't work.
 674      */
 675     if ( (1<<c->tag_shift) < c->assoc) {
 676         VG_(message)(Vg_DebugMsg,
 677                      "error: Use associativity < %d for cache use statistics!\n",
 678                      (1<<c->tag_shift) );
 679         VG_(tool_panic)("Unsupported cache configuration");
 680     }
 681 }
 682
 683
 684 /* for I1/D1 caches */
 685 #define CACHEUSE(L)                                                         \
 686                                                                             \
 687 static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size)         \
 688 {                                                                           \
 689    UInt set1 = ( a         >> L.line_size_bits) & (L.sets_min_1);           \
 690    UInt set2 = ((a+size-1) >> L.line_size_bits) & (L.sets_min_1);           \
 691    UWord tag  = a & L.tag_mask;                                             \
 692    UWord tag2;                                                              \
 693    int i, j, idx;                                                           \
 694    UWord *set, tmp_tag;                                                     \
 695    UInt use_mask;                                                           \
 696                                                                             \
 697    CLG_DEBUG(6,"%s.Acc(Addr %#lx, size %d): Sets [%d/%d]\n",                  \
 698             L.name, a, size, set1, set2);                                   \
 699                                                                             \
 700    /* First case: word entirely within line. */                             \
 701    if (set1 == set2) {                                                      \
 702                                                                             \
 703       set = &(L.tags[set1 * L.assoc]);                                      \
 704       use_mask = L.line_start_mask[a & L.line_size_mask] &                  \
 705                  L.line_end_mask[(a+size-1) & L.line_size_mask];            \
 706                                                                             \
 707       /* This loop is unrolled for just the first case, which is the most */\
 708       /* common.  We can't unroll any further because it would screw up   */\
 709       /* if we have a direct-mapped (1-way) cache.                        */\
 710       if (tag == (set[0] & L.tag_mask)) {                                   \
 711         idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask);                    \
 712         L.use[idx].count ++;                                                \
 713         L.use[idx].mask |= use_mask;                                        \
 714         CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
 715                  idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,          \
 716                  use_mask, L.use[idx].mask, L.use[idx].count);              \
 717         return L1_Hit;                                                      \
 718       }                                                                     \
 719       /* If the tag is one other than the MRU, move it into the MRU spot  */\
 720       /* and shuffle the rest down.                                       */\
 721       for (i = 1; i < L.assoc; i++) {                                       \
 722          if (tag == (set[i] & L.tag_mask)) {                                \
 723             tmp_tag = set[i];                                               \
 724             for (j = i; j > 0; j--) {                                       \
 725                set[j] = set[j - 1];                                         \
 726             }                                                               \
 727             set[0] = tmp_tag;                                               \
 728             idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
 729             L.use[idx].count ++;                                            \
 730             L.use[idx].mask |= use_mask;                                    \
 731         CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
 732                  i, idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,       \
 733                  use_mask, L.use[idx].mask, L.use[idx].count);              \
 734             return L1_Hit;                                                  \
 735          }                                                                  \
 736       }                                                                     \
 737                                                                             \
 738       /* A miss;  install this tag as MRU, shuffle rest down. */            \
 739       tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                             \
 740       for (j = L.assoc - 1; j > 0; j--) {                                   \
 741          set[j] = set[j - 1];                                               \
 742       }                                                                     \
 743       set[0] = tag | tmp_tag;                                               \
 744       idx = (set1 * L.assoc) + tmp_tag;                                     \
 745       return update_##L##_use(&L, idx,                                      \
 746                        use_mask, a &~ L.line_size_mask);                    \
 747                                                                             \
 748    /* Second case: word straddles two lines. */                             \
 749    /* Nb: this is a fast way of doing ((set1+1) % L.sets) */                \
 750    } else if (((set1 + 1) & (L.sets_min_1)) == set2) {                      \
 751       Int miss1=0, miss2=0; /* 0: L1 hit, 1:L1 miss, 2:LL miss */           \
 752       set = &(L.tags[set1 * L.assoc]);                                      \
 753       use_mask = L.line_start_mask[a & L.line_size_mask];                   \
 754       if (tag == (set[0] & L.tag_mask)) {                                   \
 755          idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask);                   \
 756          L.use[idx].count ++;                                               \
 757          L.use[idx].mask |= use_mask;                                       \
 758         CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
 759                  idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,          \
 760                  use_mask, L.use[idx].mask, L.use[idx].count);              \
 761          goto block2;                                                       \
 762       }                                                                     \
 763       for (i = 1; i < L.assoc; i++) {                                       \
 764          if (tag == (set[i] & L.tag_mask)) {                                \
 765             tmp_tag = set[i];                                               \
 766             for (j = i; j > 0; j--) {                                       \
 767                set[j] = set[j - 1];                                         \
 768             }                                                               \
 769             set[0] = tmp_tag;                                               \
 770             idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
 771             L.use[idx].count ++;                                            \
 772             L.use[idx].mask |= use_mask;                                    \
 773         CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
 774                  i, idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,       \
 775                  use_mask, L.use[idx].mask, L.use[idx].count);              \
 776             goto block2;                                                    \
 777          }                                                                  \
 778       }                                                                     \
 779       tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                             \
 780       for (j = L.assoc - 1; j > 0; j--) {                                   \
 781          set[j] = set[j - 1];                                               \
 782       }                                                                     \
 783       set[0] = tag | tmp_tag;                                               \
 784       idx = (set1 * L.assoc) + tmp_tag;                                     \
 785       miss1 = update_##L##_use(&L, idx,                                     \
 786                        use_mask, a &~ L.line_size_mask);                    \
 787 block2:                                                                     \
 788       set = &(L.tags[set2 * L.assoc]);                                      \
 789       use_mask = L.line_end_mask[(a+size-1) & L.line_size_mask];            \
 790       tag2  = (a+size-1) & L.tag_mask;                                      \
 791       if (tag2 == (set[0] & L.tag_mask)) {                                  \
 792          idx = (set2 * L.assoc) + (set[0] & ~L.tag_mask);                   \
 793          L.use[idx].count ++;                                               \
 794          L.use[idx].mask |= use_mask;                                       \
 795         CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
 796                  idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,          \
 797                  use_mask, L.use[idx].mask, L.use[idx].count);              \
 798          return miss1;                                                      \
 799       }                                                                     \
 800       for (i = 1; i < L.assoc; i++) {                                       \
 801          if (tag2 == (set[i] & L.tag_mask)) {                               \
 802             tmp_tag = set[i];                                               \
 803             for (j = i; j > 0; j--) {                                       \
 804                set[j] = set[j - 1];                                         \
 805             }                                                               \
 806             set[0] = tmp_tag;                                               \
 807             idx = (set2 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
 808             L.use[idx].count ++;                                            \
 809             L.use[idx].mask |= use_mask;                                    \
 810         CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
 811                  i, idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,       \
 812                  use_mask, L.use[idx].mask, L.use[idx].count);              \
 813             return miss1;                                                   \
 814          }                                                                  \
 815       }                                                                     \
 816       tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                             \
 817       for (j = L.assoc - 1; j > 0; j--) {                                   \
 818          set[j] = set[j - 1];                                               \
 819       }                                                                     \
 820       set[0] = tag2 | tmp_tag;                                              \
 821       idx = (set2 * L.assoc) + tmp_tag;                                     \
 822       miss2 = update_##L##_use(&L, idx,                                     \
 823                        use_mask, (a+size-1) &~ L.line_size_mask);           \
 824       return (miss1==MemAccess || miss2==MemAccess) ? MemAccess:LL_Hit;     \
 825                                                                             \
 826    } else {                                                                 \
 827        VG_(printf)("addr: %#lx  size: %u  sets: %d %d", a, size, set1, set2); \
 828        VG_(tool_panic)("item straddles more than two cache sets");          \
 829    }                                                                        \
 830    return 0;                                                                \
 831 }
 832
 833
 834 /* logarithmic bitcounting algorithm, see
 835  * http://graphics.stanford.edu/~seander/bithacks.html
 836  */
 837 static __inline__ unsigned int countBits(unsigned int bits)
 838 {
 839   unsigned int c; // store the total here
 840   const int S[] = {1, 2, 4, 8, 16}; // Magic Binary Numbers
 841   const int B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF};
 842
 843   c = bits;
 844   c = ((c >> S[0]) & B[0]) + (c & B[0]);
 845   c = ((c >> S[1]) & B[1]) + (c & B[1]);
 846   c = ((c >> S[2]) & B[2]) + (c & B[2]);
 847   c = ((c >> S[3]) & B[3]) + (c & B[3]);
 848   c = ((c >> S[4]) & B[4]) + (c & B[4]);
 849   return c;
 850 }
 851
 852 static void update_LL_use(int idx, Addr memline)
 853 {
 854   line_loaded* loaded = &(LL.loaded[idx]);
 855   line_use* use = &(LL.use[idx]);
 856   int i = ((32 - countBits(use->mask)) * LL.line_size)>>5;
 857
 858   CLG_DEBUG(2, " LL.miss [%d]: at %#lx accessing memline %#lx\n",
 859            idx, CLG_(bb_base) + current_ii->instr_offset, memline);
 860   if (use->count>0) {
 861     CLG_DEBUG(2, "   old: used %d, loss bits %d (%08x) [line %#lx from %#lx]\n",
 862              use->count, i, use->mask, loaded->memline, loaded->iaddr);
 863     CLG_DEBUG(2, "   collect: %d, use_base %p\n",
 864              CLG_(current_state).collect, loaded->use_base);
 865
 866     if (CLG_(current_state).collect && loaded->use_base) {
 867       (loaded->use_base)[off_LL_AcCost] += 1000 / use->count;
 868       (loaded->use_base)[off_LL_SpLoss] += i;
 869     }
 870    }
 871
 872    use->count = 0;
 873    use->mask  = 0;
 874
 875   loaded->memline = memline;
 876   loaded->iaddr   = CLG_(bb_base) + current_ii->instr_offset;
 877   loaded->use_base = (CLG_(current_state).nonskipped) ?
 878     CLG_(current_state).nonskipped->skipped :
 879     CLG_(cost_base) + current_ii->cost_offset;
 880 }
 881
 882 static
 883 CacheModelResult cacheuse_LL_access(Addr memline, line_loaded* l1_loaded)
 884 {
 885    UInt setNo = (memline >> LL.line_size_bits) & (LL.sets_min_1);
 886    UWord* set = &(LL.tags[setNo * LL.assoc]);
 887    UWord tag  = memline & LL.tag_mask;
 888
 889    int i, j, idx;
 890    UWord tmp_tag;
 891
 892    CLG_DEBUG(6,"LL.Acc(Memline %#lx): Set %d\n", memline, setNo);
 893
 894    if (tag == (set[0] & LL.tag_mask)) {
 895      idx = (setNo * LL.assoc) + (set[0] & ~LL.tag_mask);
 896      l1_loaded->dep_use = &(LL.use[idx]);
 897
 898      CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
 899                  idx, LL.loaded[idx].memline,  LL.loaded[idx].iaddr,
 900                  LL.use[idx].mask, LL.use[idx].count);
 901      return LL_Hit;
 902    }
 903    for (i = 1; i < LL.assoc; i++) {
 904      if (tag == (set[i] & LL.tag_mask)) {
 905        tmp_tag = set[i];
 906        for (j = i; j > 0; j--) {
 907          set[j] = set[j - 1];
 908        }
 909        set[0] = tmp_tag;
 910        idx = (setNo * LL.assoc) + (tmp_tag & ~LL.tag_mask);
 911        l1_loaded->dep_use = &(LL.use[idx]);
 912
 913         CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
 914                  i, idx, LL.loaded[idx].memline,  LL.loaded[idx].iaddr,
 915                  LL.use[idx].mask, LL.use[idx].count);
 916         return LL_Hit;
 917      }
 918    }
 919
 920    /* A miss;  install this tag as MRU, shuffle rest down. */
 921    tmp_tag = set[LL.assoc - 1] & ~LL.tag_mask;
 922    for (j = LL.assoc - 1; j > 0; j--) {
 923      set[j] = set[j - 1];
 924    }
 925    set[0] = tag | tmp_tag;
 926    idx = (setNo * LL.assoc) + tmp_tag;
 927    l1_loaded->dep_use = &(LL.use[idx]);
 928
 929    update_LL_use(idx, memline);
 930
 931    return MemAccess;
 932 }
 933
 934
 935
 936
 937 #define UPDATE_USE(L)                                                \
 938                                                                      \
 939 static CacheModelResult update##_##L##_use(cache_t2* cache, int idx, \
 940                                UInt mask, Addr memline)              \
 941 {                                                                    \
 942   line_loaded* loaded = &(cache->loaded[idx]);                       \
 943   line_use* use = &(cache->use[idx]);                                \
 944   int c = ((32 - countBits(use->mask)) * cache->line_size)>>5;       \
 945                                                                      \
 946   CLG_DEBUG(2, " %s.miss [%d]: at %#lx accessing memline %#lx (mask %08x)\n", \
 947            cache->name, idx, CLG_(bb_base) + current_ii->instr_offset, memline, mask); \
 948   if (use->count>0) {                                                \
 949     CLG_DEBUG(2, "   old: used %d, loss bits %d (%08x) [line %#lx from %#lx]\n",\
 950              use->count, c, use->mask, loaded->memline, loaded->iaddr); \
 951     CLG_DEBUG(2, "   collect: %d, use_base %p\n", \
 952              CLG_(current_state).collect, loaded->use_base);         \
 953                                                                      \
 954     if (CLG_(current_state).collect && loaded->use_base) {           \
 955       (loaded->use_base)[off_##L##_AcCost] += 1000 / use->count;     \
 956       (loaded->use_base)[off_##L##_SpLoss] += c;                     \
 957                                                                      \
 958       /* FIXME (?): L1/LL line sizes must be equal ! */              \
 959       loaded->dep_use->mask |= use->mask;                            \
 960       loaded->dep_use->count += use->count;                          \
 961     }                                                                \
 962   }                                                                  \
 963                                                                      \
 964   use->count = 1;                                                    \
 965   use->mask  = mask;                                                 \
 966   loaded->memline = memline;                                         \
 967   loaded->iaddr   = CLG_(bb_base) + current_ii->instr_offset;        \
 968   loaded->use_base = (CLG_(current_state).nonskipped) ?              \
 969     CLG_(current_state).nonskipped->skipped :                        \
 970     CLG_(cost_base) + current_ii->cost_offset;                       \
 971                                                                      \
 972   if (memline == 0) return LL_Hit;                                   \
 973   return cacheuse_LL_access(memline, loaded);                        \
 974 }
 975
 976 UPDATE_USE(I1);
 977 UPDATE_USE(D1);
 978
 979 CACHEUSE(I1);
 980 CACHEUSE(D1);
 981
 982
 983 static
 984 void cacheuse_finish(void)
 985 {
 986   int i;
 987   InstrInfo ii = { 0,0,0,0 };
 988
 989   if (!CLG_(current_state).collect) return;
 990
 991   CLG_(bb_base) = 0;
 992   current_ii = &ii; /* needs to be set for update_XX_use */
 993   CLG_(cost_base) = 0;
 994
 995   /* update usage counters */
 996   if (I1.use)
 997     for (i = 0; i < I1.sets * I1.assoc; i++)
 998       if (I1.loaded[i].use_base)
 999         update_I1_use( &I1, i, 0,0);
1000
1001   if (D1.use)
1002     for (i = 0; i < D1.sets * D1.assoc; i++)
1003       if (D1.loaded[i].use_base)
1004         update_D1_use( &D1, i, 0,0);
1005
1006   if (LL.use)
1007     for (i = 0; i < LL.sets * LL.assoc; i++)
1008       if (LL.loaded[i].use_base)
1009         update_LL_use(i, 0);
1010
1011   current_ii = 0;
1012 }
1013
1014
1015
1016 /*------------------------------------------------------------*/
1017 /*--- Helper functions called by instrumented code         ---*/
1018 /*------------------------------------------------------------*/
1019
1020
1021 static __inline__
1022 void inc_costs(CacheModelResult r, ULong* c1, ULong* c2)
1023 {
1024     switch(r) {
1025         case WriteBackMemAccess:
1026             if (clo_simulate_writeback) {
1027                 c1[3]++;
1028                 c2[3]++;
1029             }
1030             // fall through
1031
1032         case MemAccess:
1033             c1[2]++;
1034             c2[2]++;
1035             // fall through
1036
1037         case LL_Hit:
1038             c1[1]++;
1039             c2[1]++;
1040             // fall through
1041
1042         default:
1043             c1[0]++;
1044             c2[0]++;
1045     }
1046 }
1047
1048 static
1049 const HChar* cacheRes(CacheModelResult r)
1050 {
1051     switch(r) {
1052     case L1_Hit:    return "L1 Hit ";
1053     case LL_Hit:    return "LL Hit ";
1054     case MemAccess: return "LL Miss";
1055     case WriteBackMemAccess: return "LL Miss (dirty)";
1056     default:
1057         tl_assert(0);
1058     }
1059     return "??";
1060 }
1061
1062 VG_REGPARM(1)
1063 static void log_1I0D(InstrInfo* ii)
1064 {
1065     CacheModelResult IrRes;
1066
1067     current_ii = ii;
1068     IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);
1069
1070     CLG_DEBUG(6, "log_1I0D:  Ir  %#lx/%u => %s\n",
1071               CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes));
1072
1073     if (CLG_(current_state).collect) {
1074         ULong* cost_Ir;
1075
1076         if (CLG_(current_state).nonskipped)
1077             cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
1078         else
1079             cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];
1080
1081         inc_costs(IrRes, cost_Ir,
1082                   CLG_(current_state).cost + fullOffset(EG_IR) );
1083     }
1084 }
1085
1086 VG_REGPARM(2)
1087 static void log_2I0D(InstrInfo* ii1, InstrInfo* ii2)
1088 {
1089     CacheModelResult Ir1Res, Ir2Res;
1090     ULong *global_cost_Ir;
1091
1092     current_ii = ii1;
1093     Ir1Res = (*simulator.I1_Read)(CLG_(bb_base) + ii1->instr_offset, ii1->instr_size);
1094     current_ii = ii2;
1095     Ir2Res = (*simulator.I1_Read)(CLG_(bb_base) + ii2->instr_offset, ii2->instr_size);
1096
1097     CLG_DEBUG(6, "log_2I0D:  Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s\n",
1098               CLG_(bb_base) + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res),
1099               CLG_(bb_base) + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res) );
1100
1101     if (!CLG_(current_state).collect) return;
1102
1103     global_cost_Ir = CLG_(current_state).cost + fullOffset(EG_IR);
1104     if (CLG_(current_state).nonskipped) {
1105         ULong* skipped_cost_Ir =
1106             CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
1107
1108         inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir);
1109         inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir);
1110         return;
1111     }
1112
1113     inc_costs(Ir1Res, global_cost_Ir,
1114               CLG_(cost_base) + ii1->cost_offset + ii1->eventset->offset[EG_IR]);
1115     inc_costs(Ir2Res, global_cost_Ir,
1116               CLG_(cost_base) + ii2->cost_offset + ii2->eventset->offset[EG_IR]);
1117 }
1118
1119 VG_REGPARM(3)
1120 static void log_3I0D(InstrInfo* ii1, InstrInfo* ii2, InstrInfo* ii3)
1121 {
1122     CacheModelResult Ir1Res, Ir2Res, Ir3Res;
1123     ULong *global_cost_Ir;
1124
1125     current_ii = ii1;
1126     Ir1Res = (*simulator.I1_Read)(CLG_(bb_base) + ii1->instr_offset, ii1->instr_size);
1127     current_ii = ii2;
1128     Ir2Res = (*simulator.I1_Read)(CLG_(bb_base) + ii2->instr_offset, ii2->instr_size);
1129     current_ii = ii3;
1130     Ir3Res = (*simulator.I1_Read)(CLG_(bb_base) + ii3->instr_offset, ii3->instr_size);
1131
1132     CLG_DEBUG(6, "log_3I0D:  Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s, Ir3 %#lx/%u => %s\n",
1133               CLG_(bb_base) + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res),
1134               CLG_(bb_base) + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res),
1135               CLG_(bb_base) + ii3->instr_offset, ii3->instr_size, cacheRes(Ir3Res) );
1136
1137     if (!CLG_(current_state).collect) return;
1138
1139     global_cost_Ir = CLG_(current_state).cost + fullOffset(EG_IR);
1140     if (CLG_(current_state).nonskipped) {
1141         ULong* skipped_cost_Ir =
1142             CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
1143         inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir);
1144         inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir);
1145         inc_costs(Ir3Res, global_cost_Ir, skipped_cost_Ir);
1146         return;
1147     }
1148
1149     inc_costs(Ir1Res, global_cost_Ir,
1150               CLG_(cost_base) + ii1->cost_offset + ii1->eventset->offset[EG_IR]);
1151     inc_costs(Ir2Res, global_cost_Ir,
1152               CLG_(cost_base) + ii2->cost_offset + ii2->eventset->offset[EG_IR]);
1153     inc_costs(Ir3Res, global_cost_Ir,
1154               CLG_(cost_base) + ii3->cost_offset + ii3->eventset->offset[EG_IR]);
1155 }
1156
1157 /* Instruction doing a read access */
1158
1159 VG_REGPARM(3)
1160 static void log_1I1Dr(InstrInfo* ii, Addr data_addr, Word data_size)
1161 {
1162     CacheModelResult IrRes, DrRes;
1163
1164     current_ii = ii;
1165     IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);
1166     DrRes = (*simulator.D1_Read)(data_addr, data_size);
1167
1168     CLG_DEBUG(6, "log_1I1Dr: Ir  %#lx/%u => %s, Dr  %#lx/%lu => %s\n",
1169               CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes),
1170               data_addr, data_size, cacheRes(DrRes));
1171
1172     if (CLG_(current_state).collect) {
1173         ULong *cost_Ir, *cost_Dr;
1174
1175         if (CLG_(current_state).nonskipped) {
1176             cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
1177             cost_Dr = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DR);
1178         }
1179         else {
1180             cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];
1181             cost_Dr = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DR];
1182         }
1183
1184         inc_costs(IrRes, cost_Ir,
1185                   CLG_(current_state).cost + fullOffset(EG_IR) );
1186         inc_costs(DrRes, cost_Dr,
1187                   CLG_(current_state).cost + fullOffset(EG_DR) );
1188     }
1189 }
1190
1191
1192 /* Note that addEvent_D_guarded assumes that log_0I1Dr and log_0I1Dw
1193    have exactly the same prototype.  If you change them, you must
1194    change addEvent_D_guarded too. */
1195 VG_REGPARM(3)
1196 static void log_0I1Dr(InstrInfo* ii, Addr data_addr, Word data_size)
1197 {
1198     CacheModelResult DrRes;
1199
1200     current_ii = ii;
1201     DrRes = (*simulator.D1_Read)(data_addr, data_size);
1202
1203     CLG_DEBUG(6, "log_0I1Dr: Dr  %#lx/%lu => %s\n",
1204               data_addr, data_size, cacheRes(DrRes));
1205
1206     if (CLG_(current_state).collect) {
1207         ULong *cost_Dr;
1208
1209         if (CLG_(current_state).nonskipped)
1210             cost_Dr = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DR);
1211         else
1212             cost_Dr = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DR];
1213
1214         inc_costs(DrRes, cost_Dr,
1215                   CLG_(current_state).cost + fullOffset(EG_DR) );
1216     }
1217 }
1218
1219
1220 /* Instruction doing a write access */
1221
1222 VG_REGPARM(3)
1223 static void log_1I1Dw(InstrInfo* ii, Addr data_addr, Word data_size)
1224 {
1225     CacheModelResult IrRes, DwRes;
1226
1227     current_ii = ii;
1228     IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);
1229     DwRes = (*simulator.D1_Write)(data_addr, data_size);
1230
1231     CLG_DEBUG(6, "log_1I1Dw: Ir  %#lx/%u => %s, Dw  %#lx/%lu => %s\n",
1232               CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes),
1233               data_addr, data_size, cacheRes(DwRes));
1234
1235     if (CLG_(current_state).collect) {
1236         ULong *cost_Ir, *cost_Dw;
1237
1238         if (CLG_(current_state).nonskipped) {
1239             cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
1240             cost_Dw = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DW);
1241         }
1242         else {
1243             cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];
1244             cost_Dw = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DW];
1245         }
1246
1247         inc_costs(IrRes, cost_Ir,
1248                   CLG_(current_state).cost + fullOffset(EG_IR) );
1249         inc_costs(DwRes, cost_Dw,
1250                   CLG_(current_state).cost + fullOffset(EG_DW) );
1251     }
1252 }
1253
1254 /* See comment on log_0I1Dr. */
1255 VG_REGPARM(3)
1256 static void log_0I1Dw(InstrInfo* ii, Addr data_addr, Word data_size)
1257 {
1258     CacheModelResult DwRes;
1259
1260     current_ii = ii;
1261     DwRes = (*simulator.D1_Write)(data_addr, data_size);
1262
1263     CLG_DEBUG(6, "log_0I1Dw: Dw  %#lx/%lu => %s\n",
1264               data_addr, data_size, cacheRes(DwRes));
1265
1266     if (CLG_(current_state).collect) {
1267         ULong *cost_Dw;
1268
1269         if (CLG_(current_state).nonskipped)
1270             cost_Dw = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DW);
1271         else
1272             cost_Dw = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DW];
1273
1274         inc_costs(DwRes, cost_Dw,
1275                   CLG_(current_state).cost + fullOffset(EG_DW) );
1276     }
1277 }
1278
1279
1280
1281 /*------------------------------------------------------------*/
1282 /*--- Cache configuration                                  ---*/
1283 /*------------------------------------------------------------*/
1284
1285 static cache_t clo_I1_cache = UNDEFINED_CACHE;
1286 static cache_t clo_D1_cache = UNDEFINED_CACHE;
1287 static cache_t clo_LL_cache = UNDEFINED_CACHE;
1288
1289 /* Initialize and clear simulator state */
1290 static void cachesim_post_clo_init(void)
1291 {
1292   /* Cache configurations. */
1293   cache_t  I1c, D1c, LLc;
1294
1295   /* Initialize access handlers */
1296   if (!CLG_(clo).simulate_cache) {
1297     CLG_(cachesim).log_1I0D  = 0;
1298     CLG_(cachesim).log_1I0D_name = "(no function)";
1299     CLG_(cachesim).log_2I0D  = 0;
1300     CLG_(cachesim).log_2I0D_name = "(no function)";
1301     CLG_(cachesim).log_3I0D  = 0;
1302     CLG_(cachesim).log_3I0D_name = "(no function)";
1303
1304     CLG_(cachesim).log_1I1Dr = 0;
1305     CLG_(cachesim).log_1I1Dr_name = "(no function)";
1306     CLG_(cachesim).log_1I1Dw = 0;
1307     CLG_(cachesim).log_1I1Dw_name = "(no function)";
1308
1309     CLG_(cachesim).log_0I1Dr = 0;
1310     CLG_(cachesim).log_0I1Dr_name = "(no function)";
1311     CLG_(cachesim).log_0I1Dw = 0;
1312     CLG_(cachesim).log_0I1Dw_name = "(no function)";
1313     return;
1314   }
1315
1316   /* Configuration of caches only needed with real cache simulation */
1317   VG_(post_clo_init_configure_caches)(&I1c, &D1c, &LLc,
1318                                       &clo_I1_cache,
1319                                       &clo_D1_cache,
1320                                       &clo_LL_cache);
1321
1322   I1.name = "I1";
1323   D1.name = "D1";
1324   LL.name = "LL";
1325
1326   // min_line_size is used to make sure that we never feed
1327   // accesses to the simulator straddling more than two
1328   // cache lines at any cache level
1329   CLG_(min_line_size) = (I1c.line_size < D1c.line_size)
1330                            ? I1c.line_size : D1c.line_size;
1331   CLG_(min_line_size) = (LLc.line_size < CLG_(min_line_size))
1332                            ? LLc.line_size : CLG_(min_line_size);
1333
1334   Int largest_load_or_store_size
1335      = VG_(machine_get_size_of_largest_guest_register)();
1336   if (CLG_(min_line_size) < largest_load_or_store_size) {
1337      /* We can't continue, because the cache simulation might
1338         straddle more than 2 lines, and it will assert.  So let's
1339         just stop before we start. */
1340      VG_(umsg)("Callgrind: cannot continue: the minimum line size (%d)\n",
1341                (Int)CLG_(min_line_size));
1342      VG_(umsg)("  must be equal to or larger than the maximum register size (%d)\n",
1343                largest_load_or_store_size );
1344      VG_(umsg)("  but it is not.  Exiting now.\n");
1345      VG_(exit)(1);
1346   }
1347
1348   cachesim_initcache(I1c, &I1);
1349   cachesim_initcache(D1c, &D1);
1350   cachesim_initcache(LLc, &LL);
1351
1352   /* the other cache simulators use the standard helpers
1353    * with dispatching via simulator struct */
1354
1355   CLG_(cachesim).log_1I0D  = log_1I0D;
1356   CLG_(cachesim).log_1I0D_name  = "log_1I0D";
1357   CLG_(cachesim).log_2I0D  = log_2I0D;
1358   CLG_(cachesim).log_2I0D_name  = "log_2I0D";
1359   CLG_(cachesim).log_3I0D  = log_3I0D;
1360   CLG_(cachesim).log_3I0D_name  = "log_3I0D";
1361
1362   CLG_(cachesim).log_1I1Dr = log_1I1Dr;
1363   CLG_(cachesim).log_1I1Dw = log_1I1Dw;
1364   CLG_(cachesim).log_1I1Dr_name = "log_1I1Dr";
1365   CLG_(cachesim).log_1I1Dw_name = "log_1I1Dw";
1366
1367   CLG_(cachesim).log_0I1Dr = log_0I1Dr;
1368   CLG_(cachesim).log_0I1Dw = log_0I1Dw;
1369   CLG_(cachesim).log_0I1Dr_name = "log_0I1Dr";
1370   CLG_(cachesim).log_0I1Dw_name = "log_0I1Dw";
1371
1372   if (clo_collect_cacheuse) {
1373
1374       /* Output warning for not supported option combinations */
1375       if (clo_simulate_hwpref) {
1376           VG_(message)(Vg_DebugMsg,
1377                        "warning: prefetch simulation can not be "
1378                        "used with cache usage\n");
1379           clo_simulate_hwpref = False;
1380       }
1381
1382       if (clo_simulate_writeback) {
1383           VG_(message)(Vg_DebugMsg,
1384                        "warning: write-back simulation can not be "
1385                        "used with cache usage\n");
1386           clo_simulate_writeback = False;
1387       }
1388
1389       simulator.I1_Read  = cacheuse_I1_doRead;
1390       simulator.D1_Read  = cacheuse_D1_doRead;
1391       simulator.D1_Write = cacheuse_D1_doRead;
1392       return;
1393   }
1394
1395   if (clo_simulate_hwpref) {
1396     prefetch_clear();
1397
1398     if (clo_simulate_writeback) {
1399       simulator.I1_Read  = prefetch_I1_Read;
1400       simulator.D1_Read  = prefetch_D1_Read;
1401       simulator.D1_Write = prefetch_D1_Write;
1402     }
1403     else {
1404       simulator.I1_Read  = prefetch_I1_ref;
1405       simulator.D1_Read  = prefetch_D1_ref;
1406       simulator.D1_Write = prefetch_D1_ref;
1407     }
1408
1409     return;
1410   }
1411
1412   if (clo_simulate_writeback) {
1413       simulator.I1_Read  = cachesim_I1_Read;
1414       simulator.D1_Read  = cachesim_D1_Read;
1415       simulator.D1_Write = cachesim_D1_Write;
1416   }
1417   else {
1418       simulator.I1_Read  = cachesim_I1_ref;
1419       simulator.D1_Read  = cachesim_D1_ref;
1420       simulator.D1_Write = cachesim_D1_ref;
1421   }
1422 }
1423
1424
1425 /* Clear simulator state. Has to be initialized before */
1426 static
1427 void cachesim_clear(void)
1428 {
1429   cachesim_clearcache(&I1);
1430   cachesim_clearcache(&D1);
1431   cachesim_clearcache(&LL);
1432
1433   prefetch_clear();
1434 }
1435
1436
1437 static void cachesim_dump_desc(VgFile *fp)
1438 {
1439   VG_(fprintf)(fp, "\ndesc: I1 cache: %s\n", I1.desc_line);
1440   VG_(fprintf)(fp, "desc: D1 cache: %s\n", D1.desc_line);
1441   VG_(fprintf)(fp, "desc: LL cache: %s\n", LL.desc_line);
1442 }
1443
1444 static
1445 void cachesim_print_opts(void)
1446 {
1447   VG_(printf)(
1448 "\n   cache simulator options (does cache simulation if used):\n"
1449 "    --simulate-wb=no|yes      Count write-back events [no]\n"
1450 "    --simulate-hwpref=no|yes  Simulate hardware prefetch [no]\n"
1451 #if CLG_EXPERIMENTAL
1452 "    --simulate-sectors=no|yes Simulate sectored behaviour [no]\n"
1453 #endif
1454 "    --cacheuse=no|yes         Collect cache block use [no]\n");
1455   VG_(print_cache_clo_opts)();
1456 }
1457
1458 /* Check for command line option for cache configuration.
1459  * Return False if unknown and not handled.
1460  *
1461  * Called from CLG_(process_cmd_line_option)() in clo.c
1462  */
1463 static Bool cachesim_parse_opt(const HChar* arg)
1464 {
1465    if      VG_BOOL_CLO(arg, "--simulate-wb",      clo_simulate_writeback) {}
1466    else if VG_BOOL_CLO(arg, "--simulate-hwpref",  clo_simulate_hwpref)    {}
1467    else if VG_BOOL_CLO(arg, "--simulate-sectors", clo_simulate_sectors)   {}
1468
1469    else if VG_BOOL_CLO(arg, "--cacheuse", clo_collect_cacheuse) {
1470       if (clo_collect_cacheuse) {
1471          /* Use counters only make sense with fine dumping */
1472          CLG_(clo).dump_instr = True;
1473       }
1474    }
1475
1476    else if (VG_(str_clo_cache_opt)(arg,
1477                                    &clo_I1_cache,
1478                                    &clo_D1_cache,
1479                                    &clo_LL_cache)) {}
1480
1481    else
1482      return False;
1483
1484   return True;
1485 }
1486
1487 static
1488 void cachesim_printstat(Int l1, Int l2, Int l3)
1489 {
1490   FullCost total = CLG_(total_cost), D_total = 0;
1491   ULong LL_total_m, LL_total_mr, LL_total_mw,
1492     LL_total, LL_total_r, LL_total_w;
1493
1494   if ((VG_(clo_verbosity) >1) && clo_simulate_hwpref) {
1495     VG_(message)(Vg_DebugMsg, "Prefetch Up:       %llu\n",
1496                  prefetch_up);
1497     VG_(message)(Vg_DebugMsg, "Prefetch Down:     %llu\n",
1498                  prefetch_down);
1499     VG_(message)(Vg_DebugMsg, "\n");
1500   }
1501
1502   VG_(message)(Vg_UserMsg, "I1  misses:    %'*llu\n", l1,
1503                total[fullOffset(EG_IR) +1]);
1504
1505   VG_(message)(Vg_UserMsg, "LLi misses:    %'*llu\n", l1,
1506                total[fullOffset(EG_IR) +2]);
1507
1508   if (0 == total[fullOffset(EG_IR)])
1509     total[fullOffset(EG_IR)] = 1;
1510
1511   VG_(message)(Vg_UserMsg, "I1  miss rate: %*.2f%%\n", l1,
1512                total[fullOffset(EG_IR)+1] * 100.0 / total[fullOffset(EG_IR)]);
1513
1514   VG_(message)(Vg_UserMsg, "LLi miss rate: %*.2f%%\n", l1,
1515                total[fullOffset(EG_IR)+2] * 100.0 / total[fullOffset(EG_IR)]);
1516
1517   VG_(message)(Vg_UserMsg, "\n");
1518
1519   /* D cache results.
1520      Use the D_refs.rd and D_refs.wr values to determine the
1521    * width of columns 2 & 3. */
1522
1523   D_total = CLG_(get_eventset_cost)( CLG_(sets).full );
1524   CLG_(init_cost)( CLG_(sets).full, D_total);
1525   // we only use the first 3 values of D_total, adding up Dr and Dw costs
1526   CLG_(copy_cost)( CLG_(get_event_set)(EG_DR), D_total, total + fullOffset(EG_DR) );
1527   CLG_(add_cost) ( CLG_(get_event_set)(EG_DW), D_total, total + fullOffset(EG_DW) );
1528
1529   VG_(message)(Vg_UserMsg, "D   refs:      %'*llu  (%'*llu rd + %'*llu wr)\n",
1530                l1, D_total[0],
1531                l2, total[fullOffset(EG_DR)],
1532                l3, total[fullOffset(EG_DW)]);
1533
1534   VG_(message)(Vg_UserMsg, "D1  misses:    %'*llu  (%'*llu rd + %'*llu wr)\n",
1535                l1, D_total[1],
1536                l2, total[fullOffset(EG_DR)+1],
1537                l3, total[fullOffset(EG_DW)+1]);
1538
1539   VG_(message)(Vg_UserMsg, "LLd misses:    %'*llu  (%'*llu rd + %'*llu wr)\n",
1540                l1, D_total[2],
1541                l2, total[fullOffset(EG_DR)+2],
1542                l3, total[fullOffset(EG_DW)+2]);
1543
1544   if (0 == D_total[0])   D_total[0] = 1;
1545   if (0 == total[fullOffset(EG_DR)]) total[fullOffset(EG_DR)] = 1;
1546   if (0 == total[fullOffset(EG_DW)]) total[fullOffset(EG_DW)] = 1;
1547
1548   VG_(message)(Vg_UserMsg, "D1  miss rate: %*.1f%% (%*.1f%%   + %*.1f%%  )\n",
1549            l1, D_total[1] * 100.0 / D_total[0],
1550            l2, total[fullOffset(EG_DR)+1] * 100.0 / total[fullOffset(EG_DR)],
1551            l3, total[fullOffset(EG_DW)+1] * 100.0 / total[fullOffset(EG_DW)]);
1552
1553   VG_(message)(Vg_UserMsg, "LLd miss rate: %*.1f%% (%*.1f%%   + %*.1f%%  )\n",
1554            l1, D_total[2] * 100.0 / D_total[0],
1555            l2, total[fullOffset(EG_DR)+2] * 100.0 / total[fullOffset(EG_DR)],
1556            l3, total[fullOffset(EG_DW)+2] * 100.0 / total[fullOffset(EG_DW)]);
1557   VG_(message)(Vg_UserMsg, "\n");
1558
1559
1560
1561   /* LL overall results */
1562
1563   LL_total   =
1564     total[fullOffset(EG_DR) +1] +
1565     total[fullOffset(EG_DW) +1] +
1566     total[fullOffset(EG_IR) +1];
1567   LL_total_r =
1568     total[fullOffset(EG_DR) +1] +
1569     total[fullOffset(EG_IR) +1];
1570   LL_total_w = total[fullOffset(EG_DW) +1];
1571   VG_(message)(Vg_UserMsg, "LL refs:       %'*llu  (%'*llu rd + %'*llu wr)\n",
1572                l1, LL_total, l2, LL_total_r, l3, LL_total_w);
1573
1574   LL_total_m  =
1575     total[fullOffset(EG_DR) +2] +
1576     total[fullOffset(EG_DW) +2] +
1577     total[fullOffset(EG_IR) +2];
1578   LL_total_mr =
1579     total[fullOffset(EG_DR) +2] +
1580     total[fullOffset(EG_IR) +2];
1581   LL_total_mw = total[fullOffset(EG_DW) +2];
1582   VG_(message)(Vg_UserMsg, "LL misses:     %'*llu  (%'*llu rd + %'*llu wr)\n",
1583                l1, LL_total_m, l2, LL_total_mr, l3, LL_total_mw);
1584
1585   VG_(message)(Vg_UserMsg, "LL miss rate:  %*.1f%% (%*.1f%%   + %*.1f%%  )\n",
1586           l1, LL_total_m  * 100.0 / (total[fullOffset(EG_IR)] + D_total[0]),
1587           l2, LL_total_mr * 100.0 / (total[fullOffset(EG_IR)] + total[fullOffset(EG_DR)]),
1588           l3, LL_total_mw * 100.0 / total[fullOffset(EG_DW)]);
1589 }
1590
1591
1592 /*------------------------------------------------------------*/
1593 /*--- Setup for Event set.                                 ---*/
1594 /*------------------------------------------------------------*/
1595
1596 struct event_sets CLG_(sets);
1597
1598 void CLG_(init_eventsets)()
1599 {
1600     // Event groups from which the event sets are composed
1601     // the "Use" group only is used with "cacheuse" simulation
1602     if (clo_collect_cacheuse)
1603         CLG_(register_event_group4)(EG_USE,
1604                                     "AcCost1", "SpLoss1", "AcCost2", "SpLoss2");
1605
1606     if (!CLG_(clo).simulate_cache)
1607         CLG_(register_event_group)(EG_IR, "Ir");
1608     else if (!clo_simulate_writeback) {
1609         CLG_(register_event_group3)(EG_IR, "Ir", "I1mr", "ILmr");
1610         CLG_(register_event_group3)(EG_DR, "Dr", "D1mr", "DLmr");
1611         CLG_(register_event_group3)(EG_DW, "Dw", "D1mw", "DLmw");
1612     }
1613     else { // clo_simulate_writeback
1614         CLG_(register_event_group4)(EG_IR, "Ir", "I1mr", "ILmr", "ILdmr");
1615         CLG_(register_event_group4)(EG_DR, "Dr", "D1mr", "DLmr", "DLdmr");
1616         CLG_(register_event_group4)(EG_DW, "Dw", "D1mw", "DLmw", "DLdmw");
1617     }
1618
1619     if (CLG_(clo).simulate_branch) {
1620         CLG_(register_event_group2)(EG_BC, "Bc", "Bcm");
1621         CLG_(register_event_group2)(EG_BI, "Bi", "Bim");
1622     }
1623
1624     if (CLG_(clo).collect_bus)
1625         CLG_(register_event_group)(EG_BUS, "Ge");
1626
1627     if (CLG_(clo).collect_alloc)
1628         CLG_(register_event_group2)(EG_ALLOC, "allocCount", "allocSize");
1629
1630     if (CLG_(clo).collect_systime)
1631         CLG_(register_event_group2)(EG_SYS, "sysCount", "sysTime");
1632
1633     // event set used as base for instruction self cost
1634     CLG_(sets).base = CLG_(get_event_set2)(EG_USE, EG_IR);
1635
1636     // event set comprising all event groups, used for inclusive cost
1637     CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).base, EG_DR, EG_DW);
1638     CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).full, EG_BC, EG_BI);
1639     CLG_(sets).full = CLG_(add_event_group) (CLG_(sets).full, EG_BUS);
1640     CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).full, EG_ALLOC, EG_SYS);
1641
1642     CLG_DEBUGIF(1) {
1643         CLG_DEBUG(1, "EventSets:\n");
1644         CLG_(print_eventset)(-2, CLG_(sets).base);
1645         CLG_(print_eventset)(-2, CLG_(sets).full);
1646     }
1647
1648     /* Not-existing events are silently ignored */
1649     CLG_(dumpmap) = CLG_(get_eventmapping)(CLG_(sets).full);
1650     CLG_(append_event)(CLG_(dumpmap), "Ir");
1651     CLG_(append_event)(CLG_(dumpmap), "Dr");
1652     CLG_(append_event)(CLG_(dumpmap), "Dw");
1653     CLG_(append_event)(CLG_(dumpmap), "I1mr");
1654     CLG_(append_event)(CLG_(dumpmap), "D1mr");
1655     CLG_(append_event)(CLG_(dumpmap), "D1mw");
1656     CLG_(append_event)(CLG_(dumpmap), "ILmr");
1657     CLG_(append_event)(CLG_(dumpmap), "DLmr");
1658     CLG_(append_event)(CLG_(dumpmap), "DLmw");
1659     CLG_(append_event)(CLG_(dumpmap), "ILdmr");
1660     CLG_(append_event)(CLG_(dumpmap), "DLdmr");
1661     CLG_(append_event)(CLG_(dumpmap), "DLdmw");
1662     CLG_(append_event)(CLG_(dumpmap), "Bc");
1663     CLG_(append_event)(CLG_(dumpmap), "Bcm");
1664     CLG_(append_event)(CLG_(dumpmap), "Bi");
1665     CLG_(append_event)(CLG_(dumpmap), "Bim");
1666     CLG_(append_event)(CLG_(dumpmap), "AcCost1");
1667     CLG_(append_event)(CLG_(dumpmap), "SpLoss1");
1668     CLG_(append_event)(CLG_(dumpmap), "AcCost2");
1669     CLG_(append_event)(CLG_(dumpmap), "SpLoss2");
1670     CLG_(append_event)(CLG_(dumpmap), "Ge");
1671     CLG_(append_event)(CLG_(dumpmap), "allocCount");
1672     CLG_(append_event)(CLG_(dumpmap), "allocSize");
1673     CLG_(append_event)(CLG_(dumpmap), "sysCount");
1674     CLG_(append_event)(CLG_(dumpmap), "sysTime");
1675 }
1676
1677
1678 /* this is called at dump time for every instruction executed */
1679 static void cachesim_add_icost(SimCost cost, BBCC* bbcc,
1680                                InstrInfo* ii, ULong exe_count)
1681 {
1682     if (!CLG_(clo).simulate_cache)
1683         cost[ fullOffset(EG_IR) ] += exe_count;
1684
1685     if (ii->eventset)
1686         CLG_(add_and_zero_cost2)( CLG_(sets).full, cost,
1687                                   ii->eventset, bbcc->cost + ii->cost_offset);
1688 }
1689
1690 static
1691 void cachesim_finish(void)
1692 {
1693   if (clo_collect_cacheuse)
1694     cacheuse_finish();
1695 }
1696
1697 /*------------------------------------------------------------*/
1698 /*--- The simulator defined in this file                   ---*/
1699 /*------------------------------------------------------------*/
1700
1701 struct cachesim_if CLG_(cachesim) = {
1702   .print_opts    = cachesim_print_opts,
1703   .parse_opt     = cachesim_parse_opt,
1704   .post_clo_init = cachesim_post_clo_init,
1705   .clear         = cachesim_clear,
1706   .dump_desc     = cachesim_dump_desc,
1707   .printstat     = cachesim_printstat,
1708   .add_icost     = cachesim_add_icost,
1709   .finish        = cachesim_finish,
1710
1711   /* these will be set by cachesim_post_clo_init */
1712   .log_1I0D        = 0,
1713   .log_2I0D        = 0,
1714   .log_3I0D        = 0,
1715
1716   .log_1I1Dr       = 0,
1717   .log_1I1Dw       = 0,
1718
1719   .log_0I1Dr       = 0,
1720   .log_0I1Dw       = 0,
1721
1722   .log_1I0D_name = "(no function)",
1723   .log_2I0D_name = "(no function)",
1724   .log_3I0D_name = "(no function)",
1725
1726   .log_1I1Dr_name = "(no function)",
1727   .log_1I1Dw_name = "(no function)",
1728
1729   .log_0I1Dr_name = "(no function)",
1730   .log_0I1Dw_name = "(no function)",
1731 };
1732
1733
1734 /*--------------------------------------------------------------------*/
/*--- end                                                    sim.c ---*/
1736 /*--------------------------------------------------------------------*/