2 /*--------------------------------------------------------------------*/
3 /*--- LibHB: a library for implementing and checking ---*/
4 /*--- the happens-before relationship in concurrent programs. ---*/
5 /*--- libhb_core.c ---*/
6 /*--------------------------------------------------------------------*/
8 /*
9 This file is part of LibHB, a library for implementing and checking
10 the happens-before relationship in concurrent programs.
12 Copyright (C) 2008-2013 OpenWorks Ltd
13 info@open-works.co.uk
15 This program is free software; you can redistribute it and/or
16 modify it under the terms of the GNU General Public License as
17 published by the Free Software Foundation; either version 2 of the
18 License, or (at your option) any later version.
20 This program is distributed in the hope that it will be useful, but
21 WITHOUT ANY WARRANTY; without even the implied warranty of
22 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23 General Public License for more details.
25 You should have received a copy of the GNU General Public License
26 along with this program; if not, write to the Free Software
27 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
28 02111-1307, USA.
30 The GNU General Public License is contained in the file COPYING.
31 */
33 #include "pub_tool_basics.h"
34 #include "pub_tool_poolalloc.h"
35 #include "pub_tool_libcassert.h"
36 #include "pub_tool_libcbase.h"
37 #include "pub_tool_libcprint.h"
38 #include "pub_tool_mallocfree.h"
39 #include "pub_tool_wordfm.h"
40 #include "pub_tool_sparsewa.h"
41 #include "pub_tool_xarray.h"
42 #include "pub_tool_oset.h"
43 #include "pub_tool_threadstate.h"
44 #include "pub_tool_aspacemgr.h"
45 #include "pub_tool_execontext.h"
46 #include "pub_tool_errormgr.h"
47 #include "pub_tool_options.h" // VG_(clo_stats)
48 #include "hg_basics.h"
49 #include "hg_wordset.h"
50 #include "hg_lock_n_thread.h"
51 #include "hg_errors.h"
53 #include "libhb.h"
56 /////////////////////////////////////////////////////////////////
57 /////////////////////////////////////////////////////////////////
58 // //
59 // Debugging #defines //
60 // //
61 /////////////////////////////////////////////////////////////////
62 /////////////////////////////////////////////////////////////////
64 /* Check the sanity of shadow values in the core memory state
65 machine. Change #if 0 to #if 1 to enable this. */
66 #if 0
67 # define CHECK_MSM 1
68 #else
69 # define CHECK_MSM 0
70 #endif
73 /* Check sanity (reference counts, etc) in the conflicting access
74 machinery. Change #if 0 to #if 1 to enable this. */
75 #if 0
76 # define CHECK_CEM 1
77 #else
78 # define CHECK_CEM 0
79 #endif
82 /* Check sanity in the compressed shadow memory machinery,
83 particularly in its caching innards. Unfortunately there's no
84 almost-zero-cost way to make them selectable at run time. Hence
85 set the #if 0 to #if 1 and rebuild if you want them. */
86 #if 0
87 # define CHECK_ZSM 1 /* do sanity-check CacheLine stuff */
88 # define inline __attribute__((noinline))
89 /* probably want to ditch -fomit-frame-pointer too */
90 #else
91 # define CHECK_ZSM 0 /* don't sanity-check CacheLine stuff */
92 #endif
95 /////////////////////////////////////////////////////////////////
96 /////////////////////////////////////////////////////////////////
97 // //
98 // data decls: VtsID //
99 // //
100 /////////////////////////////////////////////////////////////////
101 /////////////////////////////////////////////////////////////////
103 /* VtsIDs: Unique small-integer IDs for VTSs. VtsIDs can't exceed 30
104 bits, since they have to be packed into the lowest 30 bits of an
105 SVal. */
106 typedef UInt VtsID;
107 #define VtsID_INVALID 0xFFFFFFFF
111 /////////////////////////////////////////////////////////////////
112 /////////////////////////////////////////////////////////////////
113 // //
114 // data decls: SVal //
115 // //
116 /////////////////////////////////////////////////////////////////
117 /////////////////////////////////////////////////////////////////
119 typedef ULong SVal;
121 /* This value has special significance to the implementation, and callers
122 may not store it in the shadow memory. */
123 #define SVal_INVALID (3ULL << 62)
125 /* This is the default value for shadow memory. Initially the shadow
126 memory contains no accessible areas and so all reads produce this
127 value. TODO: make this caller-definable. */
128 #define SVal_NOACCESS (2ULL << 62)
132 /////////////////////////////////////////////////////////////////
133 /////////////////////////////////////////////////////////////////
134 // //
135 // data decls: ScalarTS //
136 // //
137 /////////////////////////////////////////////////////////////////
138 /////////////////////////////////////////////////////////////////
140 /* Scalar Timestamp. We have to store a lot of these, so there is
141 some effort to make them as small as possible. Logically they are
142 a pair, (Thr*, ULong), but that takes 16 bytes on a 64-bit target.
143 We pack it into 64 bits by representing the Thr* using a ThrID, a
144 small integer (18 bits), and a 46 bit integer for the timestamp
145 number. The 46/18 split is arbitrary, but has the effect that
146 Helgrind can only handle programs that create 2^18 or fewer threads
147 over their entire lifetime, and have no more than 2^46 timestamp
148 ticks (synchronisation operations on the same thread).
150 This doesn't seem like much of a limitation. 2^46 ticks is
151 7.04e+13, and if each tick (optimistically) takes the machine 1000
152 cycles to process, then the minimum time to process that many ticks
153 at a clock rate of 5 GHz is 162.9 days. And that's doing nothing
154 but VTS ticks, which isn't realistic.
156 NB1: SCALARTS_N_THRBITS must be 29 or lower. The obvious limit is
157 32 since a ThrID is a UInt. 29 comes from the fact that
158 'Thr_n_RCEC', which records information about old accesses, packs
159 not only a ThrID but also 2+1 other bits (access size and
160 writeness) in a UInt, hence limiting size to 32-(2+1) == 29.
162 NB2: thrid values are issued upwards from 1024, and values less
163 than that aren't valid. This isn't per se necessary (any order
164 will do, so long as they are unique), but it does help ensure they
165 are less likely to get confused with the various other kinds of
166 small-integer thread ids drifting around (eg, TId). See also NB5.
168 NB3: this probably also relies on the fact that Thr's are never
169 deallocated -- they exist forever. Hence the 1-1 mapping from
170 Thr's to thrid values (set up in Thr__new) persists forever.
172 NB4: temp_max_sized_VTS is allocated at startup and never freed.
173 It is a maximum sized VTS, so has (1 << SCALARTS_N_THRBITS)
174 ScalarTSs. So we can't make SCALARTS_N_THRBITS too large without
175 making the memory use for this go sky-high. With
176 SCALARTS_N_THRBITS at 18, it occupies 2MB of memory, which seems
177 like an OK tradeoff. If more than 256k threads need to be
178 supported, we could change SCALARTS_N_THRBITS to 20, which would
179 facilitate supporting 1 million threads at the cost of 8MB storage
180 for temp_max_sized_VTS.
182 NB5: the conflicting-map mechanism (Thr_n_RCEC, specifically) uses
183 ThrID == 0 to denote an empty Thr_n_RCEC record. So ThrID == 0
184 must never be a valid ThrID. Given NB2 that's OK.
185 */
186 #define SCALARTS_N_THRBITS 18 /* valid range: 11 to 29 inclusive */
188 #define SCALARTS_N_TYMBITS (64 - SCALARTS_N_THRBITS)
189 typedef
190 struct {
191 ThrID thrid : SCALARTS_N_THRBITS;
192 ULong tym : SCALARTS_N_TYMBITS;
193 }
194 ScalarTS;
196 #define ThrID_MAX_VALID ((1 << SCALARTS_N_THRBITS) - 1)
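/* Illustration only (not part of the library): the 18/46 split described
   above, done with explicit shifts and masks to make the bitfield layout
   concrete.  The helper names are hypothetical, and the exact placement
   of a C bitfield is ABI-dependent; this just shows one plausible layout
   with the thread id in the low bits. */
static inline ULong example_pack_ScalarTS ( ThrID thrid, ULong tym ) {
   return ((ULong)thrid & ThrID_MAX_VALID)
          | (tym << SCALARTS_N_THRBITS);
}
static inline ThrID example_unpack_thrid ( ULong w ) {
   return (ThrID)(w & ThrID_MAX_VALID);
}
static inline ULong example_unpack_tym ( ULong w ) {
   return w >> SCALARTS_N_THRBITS;
}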
200 /////////////////////////////////////////////////////////////////
201 /////////////////////////////////////////////////////////////////
202 // //
203 // data decls: Filter //
204 // //
205 /////////////////////////////////////////////////////////////////
206 /////////////////////////////////////////////////////////////////
208 // baseline: 5, 9
209 #define FI_LINE_SZB_LOG2 5
210 #define FI_NUM_LINES_LOG2 10
212 #define FI_LINE_SZB (1 << FI_LINE_SZB_LOG2)
213 #define FI_NUM_LINES (1 << FI_NUM_LINES_LOG2)
215 #define FI_TAG_MASK (~(Addr)(FI_LINE_SZB - 1))
216 #define FI_GET_TAG(_a) ((_a) & FI_TAG_MASK)
218 #define FI_GET_LINENO(_a) ( ((_a) >> FI_LINE_SZB_LOG2) \
219 & (Addr)(FI_NUM_LINES-1) )
222 /* In the lines, each 8 bytes are treated individually, and are mapped
223 to a UShort. Regardless of endianness of the underlying machine,
224 bits 1 and 0 pertain to the lowest address and bits 15 and 14 to
225 the highest address.
227 In each bit pair, the higher-numbered bit is set if an R has been
228 seen, so the actual layout is:
230 15 14 ... 01 00
232 R W for addr+7 ... R W for addr+0
234 So a mask for the R-bits is 0xAAAA and for the W bits is 0x5555.
235 */
237 /* tags are separated from lines. tags are Addrs and are
238 the base address of the line. */
239 typedef
240 struct {
241 UShort u16s[FI_LINE_SZB / 8]; /* each UShort covers 8 bytes */
242 }
243 FiLine;
245 typedef
246 struct {
247 Addr tags[FI_NUM_LINES];
248 FiLine lines[FI_NUM_LINES];
249 }
250 Filter;
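/* Sketch, illustration only: given a Filter and a byte address, find the
   R/W bit pair recorded for that byte, following the macros and the bit
   layout described above.  The helper name and return convention are
   hypothetical; the real filter read/write paths live elsewhere. */
static inline UWord example_filter_bits_for_byte ( Filter* fi, Addr a )
{
   UWord   lineno = FI_GET_LINENO(a);       /* which of the FI_NUM_LINES lines */
   Addr    tag    = FI_GET_TAG(a);          /* base address of that line       */
   FiLine* line   = &fi->lines[lineno];
   UWord   off    = a & (FI_LINE_SZB - 1);  /* byte offset within the line     */
   UShort  u16    = line->u16s[off >> 3];   /* one UShort covers 8 bytes       */
   UWord   shft   = 2 * (off & 7);          /* bit position of the W bit       */
   if (fi->tags[lineno] != tag)
      return 0;                             /* line caches a different range   */
   return (u16 >> shft) & 3;                /* bit 1 = R seen, bit 0 = W seen  */
}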
254 /////////////////////////////////////////////////////////////////
255 /////////////////////////////////////////////////////////////////
256 // //
257 // data decls: Thr, ULong_n_EC //
258 // //
259 /////////////////////////////////////////////////////////////////
260 /////////////////////////////////////////////////////////////////
262 // Records stacks for H1 history mechanism (DRD-style)
263 typedef
264 struct { ULong ull; ExeContext* ec; }
265 ULong_n_EC;
268 /* How many of the above records to collect for each thread? Older
269 ones are dumped when we run out of space. 62.5k requires 1MB per
270 thread, since each ULong_n_EC record is 16 bytes long. When more
271 than N_KWs_N_STACKs_PER_THREAD are present, the older half are
272 deleted to make space. Hence in the worst case we will be able to
273 produce a stack at least for the last N_KWs_N_STACKs_PER_THREAD / 2
274 Kw transitions (segments in this thread). For the current setting
275 that gives a guaranteed stack for at least the last 31.25k
276 segments. */
277 #define N_KWs_N_STACKs_PER_THREAD 62500
280 struct _Thr {
281 /* Current VTSs for this thread. They change as we go along. viR
282 is the VTS to be used for reads, viW for writes. Usually they
283 are the same, but can differ when we deal with reader-writer
284 locks. It is always the case that
285 VtsID__cmpLEQ(viW,viR) == True
286 that is, viW must be the same, or lagging behind, viR. */
287 VtsID viR;
288 VtsID viW;
290 /* Is initially False, and is set to True after the thread really
291 has done a low-level exit. When True, we expect to never see
292 any more memory references done by this thread. */
293 Bool llexit_done;
295 /* Is initially False, and is set to True after the thread has been
296 joined with (reaped by some other thread). After this point, we
297 do not expect to see any uses of .viR or .viW, so it is safe to
298 set them to VtsID_INVALID. */
299 Bool joinedwith_done;
301 /* A small integer giving a unique identity to this Thr. See
302 comments on the definition of ScalarTS for details. */
303 ThrID thrid : SCALARTS_N_THRBITS;
305 /* A filter that removes references for which we believe that
306 msmcread/msmcwrite will not change the state, nor report a
307 race. */
308 Filter* filter;
310 /* A pointer back to the top level Thread structure. There is a
311 1-1 mapping between Thread and Thr structures -- each Thr points
312 at its corresponding Thread, and vice versa. Really, Thr and
313 Thread should be merged into a single structure. */
314 Thread* hgthread;
316 /* The ULongs (scalar Kws) in this accumulate in strictly
317 increasing order, without duplicates. This is important because
318 we need to be able to find a given scalar Kw in this array
319 later, by binary search. */
320 XArray* /* ULong_n_EC */ local_Kws_n_stacks;
325 /////////////////////////////////////////////////////////////////
326 /////////////////////////////////////////////////////////////////
327 // //
328 // data decls: SO //
329 // //
330 /////////////////////////////////////////////////////////////////
331 /////////////////////////////////////////////////////////////////
333 // (UInt) `echo "Synchronisation object" | md5sum`
334 #define SO_MAGIC 0x56b3c5b0U
336 struct _SO {
337 struct _SO* admin_prev;
338 struct _SO* admin_next;
339 VtsID viR; /* r-clock of sender */
340 VtsID viW; /* w-clock of sender */
341 UInt magic;
346 /////////////////////////////////////////////////////////////////
347 /////////////////////////////////////////////////////////////////
348 // //
349 // Forward declarations //
350 // //
351 /////////////////////////////////////////////////////////////////
352 /////////////////////////////////////////////////////////////////
354 /* fwds for
355 Globals needed by other parts of the library. These are set
356 once at startup and then never changed. */
357 static void (*main_get_stacktrace)( Thr*, Addr*, UWord ) = NULL;
358 static ExeContext* (*main_get_EC)( Thr* ) = NULL;
360 /* misc fn and data fwdses */
361 static void VtsID__rcinc ( VtsID ii );
362 static void VtsID__rcdec ( VtsID ii );
364 static inline Bool SVal__isC ( SVal s );
365 static inline VtsID SVal__unC_Rmin ( SVal s );
366 static inline VtsID SVal__unC_Wmin ( SVal s );
367 static inline SVal SVal__mkC ( VtsID rmini, VtsID wmini );
369 /* A double linked list of all the SO's. */
370 SO* admin_SO;
374 /////////////////////////////////////////////////////////////////
375 /////////////////////////////////////////////////////////////////
376 // //
377 // SECTION BEGIN compressed shadow memory //
378 // //
379 /////////////////////////////////////////////////////////////////
380 /////////////////////////////////////////////////////////////////
382 #ifndef __HB_ZSM_H
383 #define __HB_ZSM_H
385 /* Initialise the library. Once initialised, it will (or may) call
386 rcinc and rcdec in response to all the calls below, in order to
387 allow the user to do reference counting on the SVals stored herein.
388 It is important to understand, however, that due to internal
389 caching, the reference counts are in general inaccurate, and can be
390 both above or below the true reference count for an item. In
391 particular, the library may indicate that the reference count for
392 an item is zero, when in fact it is not.
394 To make the reference counting exact and therefore non-pointless,
395 call zsm_flush_cache. Immediately after it returns, the reference
396 counts for all items, as deduced by the caller by observing calls
397 to rcinc and rcdec, will be correct, and so any items with a zero
398 reference count may be freed (or at least considered to be
399 unreferenced by this library).
400 */
401 static void zsm_init ( void(*rcinc)(SVal), void(*rcdec)(SVal) );
403 static void zsm_sset_range ( Addr, SizeT, SVal );
404 static void zsm_scopy_range ( Addr, Addr, SizeT );
405 static void zsm_flush_cache ( void );
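/* Usage sketch of the reference-counting contract described above
   (caller side, pseudo-code; not part of the library):

      zsm_flush_cache();   // force all cached lines back to the Z/F store
      // At this point every rcinc/rcdec the caller has observed balances
      // out, so any SVal whose observed count is zero really is
      // unreferenced by this library and may be reclaimed.               */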
407 #endif /* ! __HB_ZSM_H */
410 /* Round a up to the next multiple of N. N must be a power of 2 */
411 #define ROUNDUP(a, N) (((a) + (N) - 1) & ~((N)-1))
412 /* Round a down to the next multiple of N. N must be a power of 2 */
413 #define ROUNDDN(a, N) ((a) & ~((N)-1))
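/* A few worked examples of the two macros above (illustration only;
   N must be a power of 2):
     ROUNDUP(13, 16) == 16     ROUNDDN(13, 16) == 0
     ROUNDUP(32, 16) == 32     ROUNDDN(37, 8)  == 32
   Values that are already multiples of N are left unchanged by both. */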
417 /* ------ User-supplied RC functions ------ */
418 static void(*rcinc)(SVal) = NULL;
419 static void(*rcdec)(SVal) = NULL;
422 /* ------ CacheLine ------ */
424 #define N_LINE_BITS 6 /* must be >= 3 */
425 #define N_LINE_ARANGE (1 << N_LINE_BITS)
426 #define N_LINE_TREES (N_LINE_ARANGE >> 3)
428 typedef
429 struct {
430 UShort descrs[N_LINE_TREES];
431 SVal svals[N_LINE_ARANGE]; // == N_LINE_TREES * 8
432 }
433 CacheLine;
435 #define TREE_DESCR_16_0 (1<<0)
436 #define TREE_DESCR_32_0 (1<<1)
437 #define TREE_DESCR_16_1 (1<<2)
438 #define TREE_DESCR_64 (1<<3)
439 #define TREE_DESCR_16_2 (1<<4)
440 #define TREE_DESCR_32_1 (1<<5)
441 #define TREE_DESCR_16_3 (1<<6)
442 #define TREE_DESCR_8_0 (1<<7)
443 #define TREE_DESCR_8_1 (1<<8)
444 #define TREE_DESCR_8_2 (1<<9)
445 #define TREE_DESCR_8_3 (1<<10)
446 #define TREE_DESCR_8_4 (1<<11)
447 #define TREE_DESCR_8_5 (1<<12)
448 #define TREE_DESCR_8_6 (1<<13)
449 #define TREE_DESCR_8_7 (1<<14)
450 #define TREE_DESCR_DTY (1<<15)
452 typedef
453 struct {
454 SVal dict[4]; /* can represent up to 4 diff values in the line */
455 UChar ix2s[N_LINE_ARANGE/4]; /* array of N_LINE_ARANGE 2-bit
456 dict indexes */
457 /* if dict[0] == SVal_INVALID then dict[1] is the index of the
458 LineF to use, and dict[2..] are also SVal_INVALID. */
459 }
460 LineZ; /* compressed rep for a cache line */
462 typedef
463 struct {
464 Bool inUse;
465 SVal w64s[N_LINE_ARANGE];
466 }
467 LineF; /* full rep for a cache line */
469 /* Shadow memory.
470 Primary map is a WordFM Addr SecMap*.
471 SecMaps cover some page-size-ish section of address space and hold
472 a compressed representation.
473 CacheLine-sized chunks of SecMaps are copied into a Cache, being
474 decompressed when moved into the cache and recompressed on the
475 way out. Because of this, the cache must operate as a writeback
476 cache, not a writethrough one.
478 Each SecMap must hold a power-of-2 number of CacheLines. Hence
479 N_SECMAP_BITS must be >= N_LINE_BITS.
480 */
481 #define N_SECMAP_BITS 13
482 #define N_SECMAP_ARANGE (1 << N_SECMAP_BITS)
484 // # CacheLines held by a SecMap
485 #define N_SECMAP_ZLINES (N_SECMAP_ARANGE / N_LINE_ARANGE)
487 /* The data in the SecMap is held in the array of LineZs. Each LineZ
488 either carries the required data directly, in a compressed
489 representation, or it holds (in .dict[0]) an index to the LineF in
490 .linesF that holds the full representation.
492 Currently-unused LineF's have their .inUse bit set to zero.
493 Since each in-use LineF is referred to by exactly one LineZ,
494 the number of .linesZ[] that refer to .linesF should equal
495 the number of .linesF[] that have .inUse == True.
497 RC obligations: the RCs presented to the user include exactly
498 the values in:
499 * direct Z reps, that is, ones for which .dict[0] != SVal_INVALID
500 * F reps that are in use (.inUse == True)
502 Hence the following actions at the following transitions are required:
504 F rep: .inUse==True -> .inUse==False -- rcdec_LineF
505 F rep: .inUse==False -> .inUse==True -- rcinc_LineF
506 Z rep: .dict[0] from other to SVal_INVALID -- rcdec_LineZ
507 Z rep: .dict[0] from SVal_INVALID to other -- rcinc_LineZ
508 */
509 typedef
510 struct {
511 UInt magic;
512 LineZ linesZ[N_SECMAP_ZLINES];
513 LineF* linesF;
514 UInt linesF_size;
515 }
516 SecMap;
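/* Sketch, illustration only: recover the shadow value for byte offset
   'off' (0 .. N_LINE_ARANGE-1) of LineZ number 'zix' in a SecMap,
   following the Z/F scheme described above.  The helper name is
   hypothetical; cacheline_fetch below does essentially this when it
   decompresses a line into the cache. */
static inline SVal example_secmap_read_sval ( SecMap* sm, UWord zix, UWord off )
{
   LineZ* lineZ = &sm->linesZ[zix];
   if (lineZ->dict[0] != SVal_INVALID) {
      /* compressed rep: each byte has a 2-bit index into dict[0..3] */
      UWord ix = (lineZ->ix2s[off >> 2] >> (2 * (off & 3))) & 3;
      return lineZ->dict[ix];
   } else {
      /* full rep: dict[1] holds the index of the in-use LineF */
      LineF* lineF = &sm->linesF[ (UWord)lineZ->dict[1] ];
      return lineF->w64s[off];
   }
}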
518 #define SecMap_MAGIC 0x571e58cbU
520 __attribute__((unused))
521 static inline Bool is_sane_SecMap ( SecMap* sm ) {
522 return sm != NULL && sm->magic == SecMap_MAGIC;
525 /* ------ Cache ------ */
527 #define N_WAY_BITS 16
528 #define N_WAY_NENT (1 << N_WAY_BITS)
530 /* Each tag is the address of the associated CacheLine, rounded down
531 to a CacheLine address boundary. A CacheLine size must be a power
532 of 2 and must be 8 or more. Hence an easy way to initialise the
533 cache so it is empty is to set all the tag values to any value % 8
534 != 0, eg 1. This means all queries in the cache initially miss.
535 It does however require us to detect and not writeback, any line
536 with a bogus tag. */
537 typedef
538 struct {
539 CacheLine lyns0[N_WAY_NENT];
540 Addr tags0[N_WAY_NENT];
541 }
542 Cache;
544 static inline Bool is_valid_scache_tag ( Addr tag ) {
545 /* a valid tag should be naturally aligned to the start of
546 a CacheLine. */
547 return 0 == (tag & (N_LINE_ARANGE - 1));
551 /* --------- Primary data structures --------- */
553 /* Shadow memory primary map */
554 static WordFM* map_shmem = NULL; /* WordFM Addr SecMap* */
555 static Cache cache_shmem;
558 static UWord stats__secmaps_search = 0; // # SM finds
559 static UWord stats__secmaps_search_slow = 0; // # SM lookupFMs
560 static UWord stats__secmaps_allocd = 0; // # SecMaps issued
561 static UWord stats__secmap_ga_space_covered = 0; // # ga bytes covered
562 static UWord stats__secmap_linesZ_allocd = 0; // # LineZ's issued
563 static UWord stats__secmap_linesZ_bytes = 0; // .. using this much storage
564 static UWord stats__secmap_linesF_allocd = 0; // # LineF's issued
565 static UWord stats__secmap_linesF_bytes = 0; // .. using this much storage
566 static UWord stats__secmap_iterator_steppings = 0; // # calls to stepSMIter
567 static UWord stats__cache_Z_fetches = 0; // # Z lines fetched
568 static UWord stats__cache_Z_wbacks = 0; // # Z lines written back
569 static UWord stats__cache_F_fetches = 0; // # F lines fetched
570 static UWord stats__cache_F_wbacks = 0; // # F lines written back
571 static UWord stats__cache_invals = 0; // # cache invals
572 static UWord stats__cache_flushes = 0; // # cache flushes
573 static UWord stats__cache_totrefs = 0; // # total accesses
574 static UWord stats__cache_totmisses = 0; // # misses
575 static ULong stats__cache_make_New_arange = 0; // total arange made New
576 static ULong stats__cache_make_New_inZrep = 0; // arange New'd on Z reps
577 static UWord stats__cline_normalises = 0; // # calls to cacheline_normalise
578 static UWord stats__cline_cread64s = 0; // # calls to s_m_read64
579 static UWord stats__cline_cread32s = 0; // # calls to s_m_read32
580 static UWord stats__cline_cread16s = 0; // # calls to s_m_read16
581 static UWord stats__cline_cread08s = 0; // # calls to s_m_read8
582 static UWord stats__cline_cwrite64s = 0; // # calls to s_m_write64
583 static UWord stats__cline_cwrite32s = 0; // # calls to s_m_write32
584 static UWord stats__cline_cwrite16s = 0; // # calls to s_m_write16
585 static UWord stats__cline_cwrite08s = 0; // # calls to s_m_write8
586 static UWord stats__cline_sread08s = 0; // # 8-bit shadow reads (sread08)
587 static UWord stats__cline_swrite08s = 0; // # 8-bit shadow writes (swrite08)
588 static UWord stats__cline_swrite16s = 0; // # 16-bit shadow writes (swrite16)
589 static UWord stats__cline_swrite32s = 0; // # 32-bit shadow writes (swrite32)
590 static UWord stats__cline_swrite64s = 0; // # 64-bit shadow writes (swrite64)
591 static UWord stats__cline_scopy08s = 0; // # calls to s_m_copy8
592 static UWord stats__cline_64to32splits = 0; // # 64-bit accesses split
593 static UWord stats__cline_32to16splits = 0; // # 32-bit accesses split
594 static UWord stats__cline_16to8splits = 0; // # 16-bit accesses split
595 static UWord stats__cline_64to32pulldown = 0; // # calls to pulldown_to_32
596 static UWord stats__cline_32to16pulldown = 0; // # calls to pulldown_to_16
597 static UWord stats__cline_16to8pulldown = 0; // # calls to pulldown_to_8
598 static UWord stats__vts__tick = 0; // # calls to VTS__tick
599 static UWord stats__vts__join = 0; // # calls to VTS__join
600 static UWord stats__vts__cmpLEQ = 0; // # calls to VTS__cmpLEQ
601 static UWord stats__vts__cmp_structural = 0; // # calls to VTS__cmp_structural
603 // # calls to VTS__cmp_structural w/ slow case
604 static UWord stats__vts__cmp_structural_slow = 0;
606 // # calls to VTS__indexAt_SLOW
607 static UWord stats__vts__indexat_slow = 0;
609 // # calls to vts_set__find__or__clone_and_add
610 static UWord stats__vts_set__focaa = 0;
612 // # calls to vts_set__find__or__clone_and_add that lead to an
613 // allocation
614 static UWord stats__vts_set__focaa_a = 0;
617 static inline Addr shmem__round_to_SecMap_base ( Addr a ) {
618 return a & ~(N_SECMAP_ARANGE - 1);
620 static inline UWord shmem__get_SecMap_offset ( Addr a ) {
621 return a & (N_SECMAP_ARANGE - 1);
625 /*----------------------------------------------------------------*/
626 /*--- map_shmem :: WordFM Addr SecMap ---*/
627 /*--- shadow memory (low level handlers) (shmem__* fns) ---*/
628 /*----------------------------------------------------------------*/
630 /*--------------- SecMap allocation --------------- */
632 static HChar* shmem__bigchunk_next = NULL;
633 static HChar* shmem__bigchunk_end1 = NULL;
635 static void* shmem__bigchunk_alloc ( SizeT n )
637 const SizeT sHMEM__BIGCHUNK_SIZE = 4096 * 256 * 4;
638 tl_assert(n > 0);
639 n = VG_ROUNDUP(n, 16);
640 tl_assert(shmem__bigchunk_next <= shmem__bigchunk_end1);
641 tl_assert(shmem__bigchunk_end1 - shmem__bigchunk_next
642 <= (SSizeT)sHMEM__BIGCHUNK_SIZE);
643 if (shmem__bigchunk_next + n > shmem__bigchunk_end1) {
644 if (0)
645 VG_(printf)("XXXXX bigchunk: abandoning %d bytes\n",
646 (Int)(shmem__bigchunk_end1 - shmem__bigchunk_next));
647 shmem__bigchunk_next = VG_(am_shadow_alloc)( sHMEM__BIGCHUNK_SIZE );
648 if (shmem__bigchunk_next == NULL)
649 VG_(out_of_memory_NORETURN)(
650 "helgrind:shmem__bigchunk_alloc", sHMEM__BIGCHUNK_SIZE );
651 shmem__bigchunk_end1 = shmem__bigchunk_next + sHMEM__BIGCHUNK_SIZE;
653 tl_assert(shmem__bigchunk_next);
654 tl_assert( 0 == (((Addr)shmem__bigchunk_next) & (16-1)) );
655 tl_assert(shmem__bigchunk_next + n <= shmem__bigchunk_end1);
656 shmem__bigchunk_next += n;
657 return shmem__bigchunk_next - n;
660 static SecMap* shmem__alloc_SecMap ( void )
662 Word i, j;
663 SecMap* sm = shmem__bigchunk_alloc( sizeof(SecMap) );
664 if (0) VG_(printf)("alloc_SecMap %p\n",sm);
665 tl_assert(sm);
666 sm->magic = SecMap_MAGIC;
667 for (i = 0; i < N_SECMAP_ZLINES; i++) {
668 sm->linesZ[i].dict[0] = SVal_NOACCESS;
669 sm->linesZ[i].dict[1] = SVal_INVALID;
670 sm->linesZ[i].dict[2] = SVal_INVALID;
671 sm->linesZ[i].dict[3] = SVal_INVALID;
672 for (j = 0; j < N_LINE_ARANGE/4; j++)
673 sm->linesZ[i].ix2s[j] = 0; /* all reference dict[0] */
675 sm->linesF = NULL;
676 sm->linesF_size = 0;
677 stats__secmaps_allocd++;
678 stats__secmap_ga_space_covered += N_SECMAP_ARANGE;
679 stats__secmap_linesZ_allocd += N_SECMAP_ZLINES;
680 stats__secmap_linesZ_bytes += N_SECMAP_ZLINES * sizeof(LineZ);
681 return sm;
684 typedef struct { Addr gaKey; SecMap* sm; } SMCacheEnt;
685 static SMCacheEnt smCache[3] = { {1,NULL}, {1,NULL}, {1,NULL} };
687 static SecMap* shmem__find_SecMap ( Addr ga )
689 SecMap* sm = NULL;
690 Addr gaKey = shmem__round_to_SecMap_base(ga);
691 // Cache
692 stats__secmaps_search++;
693 if (LIKELY(gaKey == smCache[0].gaKey))
694 return smCache[0].sm;
695 if (LIKELY(gaKey == smCache[1].gaKey)) {
696 SMCacheEnt tmp = smCache[0];
697 smCache[0] = smCache[1];
698 smCache[1] = tmp;
699 return smCache[0].sm;
701 if (gaKey == smCache[2].gaKey) {
702 SMCacheEnt tmp = smCache[1];
703 smCache[1] = smCache[2];
704 smCache[2] = tmp;
705 return smCache[1].sm;
707 // end Cache
708 stats__secmaps_search_slow++;
709 if (VG_(lookupFM)( map_shmem,
710 NULL/*keyP*/, (UWord*)&sm, (UWord)gaKey )) {
711 tl_assert(sm != NULL);
712 smCache[2] = smCache[1];
713 smCache[1] = smCache[0];
714 smCache[0].gaKey = gaKey;
715 smCache[0].sm = sm;
716 } else {
717 tl_assert(sm == NULL);
719 return sm;
722 static SecMap* shmem__find_or_alloc_SecMap ( Addr ga )
724 SecMap* sm = shmem__find_SecMap ( ga );
725 if (LIKELY(sm)) {
726 return sm;
727 } else {
728 /* create a new one */
729 Addr gaKey = shmem__round_to_SecMap_base(ga);
730 sm = shmem__alloc_SecMap();
731 tl_assert(sm);
732 VG_(addToFM)( map_shmem, (UWord)gaKey, (UWord)sm );
733 return sm;
738 /* ------------ LineF and LineZ related ------------ */
740 static void rcinc_LineF ( LineF* lineF ) {
741 UWord i;
742 tl_assert(lineF->inUse);
743 for (i = 0; i < N_LINE_ARANGE; i++)
744 rcinc(lineF->w64s[i]);
747 static void rcdec_LineF ( LineF* lineF ) {
748 UWord i;
749 tl_assert(lineF->inUse);
750 for (i = 0; i < N_LINE_ARANGE; i++)
751 rcdec(lineF->w64s[i]);
754 static void rcinc_LineZ ( LineZ* lineZ ) {
755 tl_assert(lineZ->dict[0] != SVal_INVALID);
756 rcinc(lineZ->dict[0]);
757 if (lineZ->dict[1] != SVal_INVALID) rcinc(lineZ->dict[1]);
758 if (lineZ->dict[2] != SVal_INVALID) rcinc(lineZ->dict[2]);
759 if (lineZ->dict[3] != SVal_INVALID) rcinc(lineZ->dict[3]);
762 static void rcdec_LineZ ( LineZ* lineZ ) {
763 tl_assert(lineZ->dict[0] != SVal_INVALID);
764 rcdec(lineZ->dict[0]);
765 if (lineZ->dict[1] != SVal_INVALID) rcdec(lineZ->dict[1]);
766 if (lineZ->dict[2] != SVal_INVALID) rcdec(lineZ->dict[2]);
767 if (lineZ->dict[3] != SVal_INVALID) rcdec(lineZ->dict[3]);
770 inline
771 static void write_twobit_array ( UChar* arr, UWord ix, UWord b2 ) {
772 Word bix, shft, mask, prep;
773 tl_assert(ix >= 0);
774 bix = ix >> 2;
775 shft = 2 * (ix & 3); /* 0, 2, 4 or 6 */
776 mask = 3 << shft;
777 prep = b2 << shft;
778 arr[bix] = (arr[bix] & ~mask) | prep;
781 inline
782 static UWord read_twobit_array ( UChar* arr, UWord ix ) {
783 Word bix, shft;
784 tl_assert(ix >= 0);
785 bix = ix >> 2;
786 shft = 2 * (ix & 3); /* 0, 2, 4 or 6 */
787 return (arr[bix] >> shft) & 3;
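/* Usage sketch (illustration only, hypothetical helper): four 2-bit
   indices packed into one byte and read back -- the encoding used for
   LineZ.ix2s above. */
__attribute__((unused))
static void example_twobit_roundtrip ( void )
{
   UChar arr[1] = { 0 };
   UWord k;
   for (k = 0; k < 4; k++)
      write_twobit_array( arr, k, k );          /* stores 0,1,2,3      */
   for (k = 0; k < 4; k++)
      tl_assert( read_twobit_array( arr, k ) == k );
   tl_assert( arr[0] == 0xE4 );                 /* 11 10 01 00 in bits */
}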
790 /* Given address 'tag', find either the Z or F line containing relevant
791 data, so it can be read into the cache.
792 */
793 static void find_ZF_for_reading ( /*OUT*/LineZ** zp,
794 /*OUT*/LineF** fp, Addr tag ) {
795 LineZ* lineZ;
796 LineF* lineF;
797 UWord zix;
798 SecMap* sm = shmem__find_or_alloc_SecMap(tag);
799 UWord smoff = shmem__get_SecMap_offset(tag);
800 /* since smoff is derived from a valid tag, it should be
801 cacheline-aligned. */
802 tl_assert(0 == (smoff & (N_LINE_ARANGE - 1)));
803 zix = smoff >> N_LINE_BITS;
804 tl_assert(zix < N_SECMAP_ZLINES);
805 lineZ = &sm->linesZ[zix];
806 lineF = NULL;
807 if (lineZ->dict[0] == SVal_INVALID) {
808 UInt fix = (UInt)lineZ->dict[1];
809 tl_assert(sm->linesF);
810 tl_assert(sm->linesF_size > 0);
811 tl_assert(fix >= 0 && fix < sm->linesF_size);
812 lineF = &sm->linesF[fix];
813 tl_assert(lineF->inUse);
814 lineZ = NULL;
816 *zp = lineZ;
817 *fp = lineF;
820 /* Given address 'tag', return the relevant SecMap and the index of
821 the LineZ within it, in the expectation that the line is to be
822 overwritten. Regardless of whether 'tag' is currently associated
823 with a Z or F representation, to rcdec on the current
824 representation, in recognition of the fact that the contents are
825 just about to be overwritten. */
826 static __attribute__((noinline))
827 void find_Z_for_writing ( /*OUT*/SecMap** smp,
828 /*OUT*/Word* zixp,
829 Addr tag ) {
830 LineZ* lineZ;
831 LineF* lineF;
832 UWord zix;
833 SecMap* sm = shmem__find_or_alloc_SecMap(tag);
834 UWord smoff = shmem__get_SecMap_offset(tag);
835 /* since smoff is derived from a valid tag, it should be
836 cacheline-aligned. */
837 tl_assert(0 == (smoff & (N_LINE_ARANGE - 1)));
838 zix = smoff >> N_LINE_BITS;
839 tl_assert(zix < N_SECMAP_ZLINES);
840 lineZ = &sm->linesZ[zix];
841 lineF = NULL;
842 /* re RCs, we are freeing up this LineZ/LineF so that new data can
843 be parked in it. Hence have to rcdec it accordingly. */
844 /* If lineZ has an associated lineF, free it up. */
845 if (lineZ->dict[0] == SVal_INVALID) {
846 UInt fix = (UInt)lineZ->dict[1];
847 tl_assert(sm->linesF);
848 tl_assert(sm->linesF_size > 0);
849 tl_assert(fix >= 0 && fix < sm->linesF_size);
850 lineF = &sm->linesF[fix];
851 tl_assert(lineF->inUse);
852 rcdec_LineF(lineF);
853 lineF->inUse = False;
854 } else {
855 rcdec_LineZ(lineZ);
857 *smp = sm;
858 *zixp = zix;
861 static __attribute__((noinline))
862 void alloc_F_for_writing ( /*MOD*/SecMap* sm, /*OUT*/Word* fixp ) {
863 UInt i, new_size;
864 LineF* nyu;
866 if (sm->linesF) {
867 tl_assert(sm->linesF_size > 0);
868 } else {
869 tl_assert(sm->linesF_size == 0);
872 if (sm->linesF) {
873 for (i = 0; i < sm->linesF_size; i++) {
874 if (!sm->linesF[i].inUse) {
875 *fixp = (Word)i;
876 return;
881 /* No free F line found. Expand existing array and try again. */
882 new_size = sm->linesF_size==0 ? 1 : 2 * sm->linesF_size;
883 nyu = HG_(zalloc)( "libhb.aFfw.1 (LineF storage)",
884 new_size * sizeof(LineF) );
886 stats__secmap_linesF_allocd += (new_size - sm->linesF_size);
887 stats__secmap_linesF_bytes += (new_size - sm->linesF_size)
888 * sizeof(LineF);
890 if (0)
891 VG_(printf)("SM %p: expand F array from %d to %d\n",
892 sm, (Int)sm->linesF_size, new_size);
894 for (i = 0; i < new_size; i++)
895 nyu[i].inUse = False;
897 if (sm->linesF) {
898 for (i = 0; i < sm->linesF_size; i++) {
899 tl_assert(sm->linesF[i].inUse);
900 nyu[i] = sm->linesF[i];
902 VG_(memset)(sm->linesF, 0, sm->linesF_size * sizeof(LineF) );
903 HG_(free)(sm->linesF);
906 sm->linesF = nyu;
907 sm->linesF_size = new_size;
909 for (i = 0; i < sm->linesF_size; i++) {
910 if (!sm->linesF[i].inUse) {
911 *fixp = (Word)i;
912 return;
916 /*NOTREACHED*/
917 tl_assert(0);
921 /* ------------ CacheLine and implicit-tree related ------------ */
923 __attribute__((unused))
924 static void pp_CacheLine ( CacheLine* cl ) {
925 Word i;
926 if (!cl) {
927 VG_(printf)("%s","pp_CacheLine(NULL)\n");
928 return;
930 for (i = 0; i < N_LINE_TREES; i++)
931 VG_(printf)(" descr: %04lx\n", (UWord)cl->descrs[i]);
932 for (i = 0; i < N_LINE_ARANGE; i++)
933 VG_(printf)(" sval: %08lx\n", (UWord)cl->svals[i]);
936 static UChar descr_to_validbits ( UShort descr )
938 /* a.k.a Party Time for gcc's constant folder */
939 # define DESCR(b8_7, b8_6, b8_5, b8_4, b8_3, b8_2, b8_1, b8_0, \
940 b16_3, b32_1, b16_2, b64, b16_1, b32_0, b16_0) \
941 ( (UShort) ( ( (b8_7) << 14) | ( (b8_6) << 13) | \
942 ( (b8_5) << 12) | ( (b8_4) << 11) | \
943 ( (b8_3) << 10) | ( (b8_2) << 9) | \
944 ( (b8_1) << 8) | ( (b8_0) << 7) | \
945 ( (b16_3) << 6) | ( (b32_1) << 5) | \
946 ( (b16_2) << 4) | ( (b64) << 3) | \
947 ( (b16_1) << 2) | ( (b32_0) << 1) | \
948 ( (b16_0) << 0) ) )
950 # define BYTE(bit7, bit6, bit5, bit4, bit3, bit2, bit1, bit0) \
951 ( (UChar) ( ( (bit7) << 7) | ( (bit6) << 6) | \
952 ( (bit5) << 5) | ( (bit4) << 4) | \
953 ( (bit3) << 3) | ( (bit2) << 2) | \
954 ( (bit1) << 1) | ( (bit0) << 0) ) )
956 /* these should all get folded out at compile time */
957 tl_assert(DESCR(1,0,0,0,0,0,0,0, 0,0,0, 0, 0,0,0) == TREE_DESCR_8_7);
958 tl_assert(DESCR(0,0,0,0,0,0,0,1, 0,0,0, 0, 0,0,0) == TREE_DESCR_8_0);
959 tl_assert(DESCR(0,0,0,0,0,0,0,0, 1,0,0, 0, 0,0,0) == TREE_DESCR_16_3);
960 tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 0,0,0) == TREE_DESCR_32_1);
961 tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,1, 0, 0,0,0) == TREE_DESCR_16_2);
962 tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 1, 0,0,0) == TREE_DESCR_64);
963 tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 1,0,0) == TREE_DESCR_16_1);
964 tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 0,1,0) == TREE_DESCR_32_0);
965 tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 0,0,1) == TREE_DESCR_16_0);
967 switch (descr) {
968 /*
969 +--------------------------------- TREE_DESCR_8_7
970 | +------------------- TREE_DESCR_8_0
971 | | +---------------- TREE_DESCR_16_3
972 | | | +-------------- TREE_DESCR_32_1
973 | | | | +------------ TREE_DESCR_16_2
974 | | | | | +--------- TREE_DESCR_64
975 | | | | | | +------ TREE_DESCR_16_1
976 | | | | | | | +---- TREE_DESCR_32_0
977 | | | | | | | | +-- TREE_DESCR_16_0
978 | | | | | | | | |
979 | | | | | | | | | GRANULARITY, 7 -> 0 */
980 case DESCR(1,1,1,1,1,1,1,1, 0,0,0, 0, 0,0,0): /* 8 8 8 8 8 8 8 8 */
981 return BYTE(1,1,1,1,1,1,1,1);
982 case DESCR(1,1,0,0,1,1,1,1, 0,0,1, 0, 0,0,0): /* 8 8 16 8 8 8 8 */
983 return BYTE(1,1,0,1,1,1,1,1);
984 case DESCR(0,0,1,1,1,1,1,1, 1,0,0, 0, 0,0,0): /* 16 8 8 8 8 8 8 */
985 return BYTE(0,1,1,1,1,1,1,1);
986 case DESCR(0,0,0,0,1,1,1,1, 1,0,1, 0, 0,0,0): /* 16 16 8 8 8 8 */
987 return BYTE(0,1,0,1,1,1,1,1);
989 case DESCR(1,1,1,1,1,1,0,0, 0,0,0, 0, 0,0,1): /* 8 8 8 8 8 8 16 */
990 return BYTE(1,1,1,1,1,1,0,1);
991 case DESCR(1,1,0,0,1,1,0,0, 0,0,1, 0, 0,0,1): /* 8 8 16 8 8 16 */
992 return BYTE(1,1,0,1,1,1,0,1);
993 case DESCR(0,0,1,1,1,1,0,0, 1,0,0, 0, 0,0,1): /* 16 8 8 8 8 16 */
994 return BYTE(0,1,1,1,1,1,0,1);
995 case DESCR(0,0,0,0,1,1,0,0, 1,0,1, 0, 0,0,1): /* 16 16 8 8 16 */
996 return BYTE(0,1,0,1,1,1,0,1);
998 case DESCR(1,1,1,1,0,0,1,1, 0,0,0, 0, 1,0,0): /* 8 8 8 8 16 8 8 */
999 return BYTE(1,1,1,1,0,1,1,1);
1000 case DESCR(1,1,0,0,0,0,1,1, 0,0,1, 0, 1,0,0): /* 8 8 16 16 8 8 */
1001 return BYTE(1,1,0,1,0,1,1,1);
1002 case DESCR(0,0,1,1,0,0,1,1, 1,0,0, 0, 1,0,0): /* 16 8 8 16 8 8 */
1003 return BYTE(0,1,1,1,0,1,1,1);
1004 case DESCR(0,0,0,0,0,0,1,1, 1,0,1, 0, 1,0,0): /* 16 16 16 8 8 */
1005 return BYTE(0,1,0,1,0,1,1,1);
1007 case DESCR(1,1,1,1,0,0,0,0, 0,0,0, 0, 1,0,1): /* 8 8 8 8 16 16 */
1008 return BYTE(1,1,1,1,0,1,0,1);
1009 case DESCR(1,1,0,0,0,0,0,0, 0,0,1, 0, 1,0,1): /* 8 8 16 16 16 */
1010 return BYTE(1,1,0,1,0,1,0,1);
1011 case DESCR(0,0,1,1,0,0,0,0, 1,0,0, 0, 1,0,1): /* 16 8 8 16 16 */
1012 return BYTE(0,1,1,1,0,1,0,1);
1013 case DESCR(0,0,0,0,0,0,0,0, 1,0,1, 0, 1,0,1): /* 16 16 16 16 */
1014 return BYTE(0,1,0,1,0,1,0,1);
1016 case DESCR(0,0,0,0,1,1,1,1, 0,1,0, 0, 0,0,0): /* 32 8 8 8 8 */
1017 return BYTE(0,0,0,1,1,1,1,1);
1018 case DESCR(0,0,0,0,1,1,0,0, 0,1,0, 0, 0,0,1): /* 32 8 8 16 */
1019 return BYTE(0,0,0,1,1,1,0,1);
1020 case DESCR(0,0,0,0,0,0,1,1, 0,1,0, 0, 1,0,0): /* 32 16 8 8 */
1021 return BYTE(0,0,0,1,0,1,1,1);
1022 case DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 1,0,1): /* 32 16 16 */
1023 return BYTE(0,0,0,1,0,1,0,1);
1025 case DESCR(1,1,1,1,0,0,0,0, 0,0,0, 0, 0,1,0): /* 8 8 8 8 32 */
1026 return BYTE(1,1,1,1,0,0,0,1);
1027 case DESCR(1,1,0,0,0,0,0,0, 0,0,1, 0, 0,1,0): /* 8 8 16 32 */
1028 return BYTE(1,1,0,1,0,0,0,1);
1029 case DESCR(0,0,1,1,0,0,0,0, 1,0,0, 0, 0,1,0): /* 16 8 8 32 */
1030 return BYTE(0,1,1,1,0,0,0,1);
1031 case DESCR(0,0,0,0,0,0,0,0, 1,0,1, 0, 0,1,0): /* 16 16 32 */
1032 return BYTE(0,1,0,1,0,0,0,1);
1034 case DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 0,1,0): /* 32 32 */
1035 return BYTE(0,0,0,1,0,0,0,1);
1037 case DESCR(0,0,0,0,0,0,0,0, 0,0,0, 1, 0,0,0): /* 64 */
1038 return BYTE(0,0,0,0,0,0,0,1);
1040 default: return BYTE(0,0,0,0,0,0,0,0);
1041 /* INVALID - any valid descr produces at least one
1042 valid bit in tree[0..7]*/
1044 /* NOTREACHED*/
1045 tl_assert(0);
1047 # undef DESCR
1048 # undef BYTE
1051 __attribute__((unused))
1052 static Bool is_sane_Descr ( UShort descr ) {
1053 return descr_to_validbits(descr) != 0;
1056 static void sprintf_Descr ( /*OUT*/HChar* dst, UShort descr ) {
1057 VG_(sprintf)(dst,
1058 "%d%d%d%d%d%d%d%d %d%d%d %d %d%d%d",
1059 (Int)((descr & TREE_DESCR_8_7) ? 1 : 0),
1060 (Int)((descr & TREE_DESCR_8_6) ? 1 : 0),
1061 (Int)((descr & TREE_DESCR_8_5) ? 1 : 0),
1062 (Int)((descr & TREE_DESCR_8_4) ? 1 : 0),
1063 (Int)((descr & TREE_DESCR_8_3) ? 1 : 0),
1064 (Int)((descr & TREE_DESCR_8_2) ? 1 : 0),
1065 (Int)((descr & TREE_DESCR_8_1) ? 1 : 0),
1066 (Int)((descr & TREE_DESCR_8_0) ? 1 : 0),
1067 (Int)((descr & TREE_DESCR_16_3) ? 1 : 0),
1068 (Int)((descr & TREE_DESCR_32_1) ? 1 : 0),
1069 (Int)((descr & TREE_DESCR_16_2) ? 1 : 0),
1070 (Int)((descr & TREE_DESCR_64) ? 1 : 0),
1071 (Int)((descr & TREE_DESCR_16_1) ? 1 : 0),
1072 (Int)((descr & TREE_DESCR_32_0) ? 1 : 0),
1073 (Int)((descr & TREE_DESCR_16_0) ? 1 : 0)
1076 static void sprintf_Byte ( /*OUT*/HChar* dst, UChar byte ) {
1077 VG_(sprintf)(dst, "%d%d%d%d%d%d%d%d",
1078 (Int)((byte & 128) ? 1 : 0),
1079 (Int)((byte & 64) ? 1 : 0),
1080 (Int)((byte & 32) ? 1 : 0),
1081 (Int)((byte & 16) ? 1 : 0),
1082 (Int)((byte & 8) ? 1 : 0),
1083 (Int)((byte & 4) ? 1 : 0),
1084 (Int)((byte & 2) ? 1 : 0),
1085 (Int)((byte & 1) ? 1 : 0)
1089 static Bool is_sane_Descr_and_Tree ( UShort descr, SVal* tree ) {
1090 Word i;
1091 UChar validbits = descr_to_validbits(descr);
1092 HChar buf[128], buf2[128];
1093 if (validbits == 0)
1094 goto bad;
1095 for (i = 0; i < 8; i++) {
1096 if (validbits & (1<<i)) {
1097 if (tree[i] == SVal_INVALID)
1098 goto bad;
1099 } else {
1100 if (tree[i] != SVal_INVALID)
1101 goto bad;
1104 return True;
1105 bad:
1106 sprintf_Descr( buf, descr );
1107 sprintf_Byte( buf2, validbits );
1108 VG_(printf)("%s","is_sane_Descr_and_Tree: bad tree {\n");
1109 VG_(printf)(" validbits 0x%02lx %s\n", (UWord)validbits, buf2);
1110 VG_(printf)(" descr 0x%04lx %s\n", (UWord)descr, buf);
1111 for (i = 0; i < 8; i++)
1112 VG_(printf)(" [%ld] 0x%016llx\n", i, tree[i]);
1113 VG_(printf)("%s","}\n");
1114 return 0;
1117 static Bool is_sane_CacheLine ( CacheLine* cl )
1119 Word tno, cloff;
1121 if (!cl) goto bad;
1123 for (tno = 0, cloff = 0; tno < N_LINE_TREES; tno++, cloff += 8) {
1124 UShort descr = cl->descrs[tno];
1125 SVal* tree = &cl->svals[cloff];
1126 if (!is_sane_Descr_and_Tree(descr, tree))
1127 goto bad;
1129 tl_assert(cloff == N_LINE_ARANGE);
1130 return True;
1131 bad:
1132 pp_CacheLine(cl);
1133 return False;
1136 static UShort normalise_tree ( /*MOD*/SVal* tree )
1138 UShort descr;
1139 /* pre: incoming tree[0..7] does not have any invalid shvals, in
1140 particular no zeroes. */
1141 if (UNLIKELY(tree[7] == SVal_INVALID || tree[6] == SVal_INVALID
1142 || tree[5] == SVal_INVALID || tree[4] == SVal_INVALID
1143 || tree[3] == SVal_INVALID || tree[2] == SVal_INVALID
1144 || tree[1] == SVal_INVALID || tree[0] == SVal_INVALID))
1145 tl_assert(0);
1147 descr = TREE_DESCR_8_7 | TREE_DESCR_8_6 | TREE_DESCR_8_5
1148 | TREE_DESCR_8_4 | TREE_DESCR_8_3 | TREE_DESCR_8_2
1149 | TREE_DESCR_8_1 | TREE_DESCR_8_0;
1150 /* build 16-bit layer */
1151 if (tree[1] == tree[0]) {
1152 tree[1] = SVal_INVALID;
1153 descr &= ~(TREE_DESCR_8_1 | TREE_DESCR_8_0);
1154 descr |= TREE_DESCR_16_0;
1156 if (tree[3] == tree[2]) {
1157 tree[3] = SVal_INVALID;
1158 descr &= ~(TREE_DESCR_8_3 | TREE_DESCR_8_2);
1159 descr |= TREE_DESCR_16_1;
1161 if (tree[5] == tree[4]) {
1162 tree[5] = SVal_INVALID;
1163 descr &= ~(TREE_DESCR_8_5 | TREE_DESCR_8_4);
1164 descr |= TREE_DESCR_16_2;
1166 if (tree[7] == tree[6]) {
1167 tree[7] = SVal_INVALID;
1168 descr &= ~(TREE_DESCR_8_7 | TREE_DESCR_8_6);
1169 descr |= TREE_DESCR_16_3;
1171 /* build 32-bit layer */
1172 if (tree[2] == tree[0]
1173 && (descr & TREE_DESCR_16_1) && (descr & TREE_DESCR_16_0)) {
1174 tree[2] = SVal_INVALID; /* [3,1] must already be SVal_INVALID */
1175 descr &= ~(TREE_DESCR_16_1 | TREE_DESCR_16_0);
1176 descr |= TREE_DESCR_32_0;
1178 if (tree[6] == tree[4]
1179 && (descr & TREE_DESCR_16_3) && (descr & TREE_DESCR_16_2)) {
1180 tree[6] = SVal_INVALID; /* [7,5] must already be SVal_INVALID */
1181 descr &= ~(TREE_DESCR_16_3 | TREE_DESCR_16_2);
1182 descr |= TREE_DESCR_32_1;
1184 /* build 64-bit layer */
1185 if (tree[4] == tree[0]
1186 && (descr & TREE_DESCR_32_1) && (descr & TREE_DESCR_32_0)) {
1187 tree[4] = SVal_INVALID; /* [7,6,5,3,2,1] must already be SVal_INVALID */
1188 descr &= ~(TREE_DESCR_32_1 | TREE_DESCR_32_0);
1189 descr |= TREE_DESCR_64;
1191 return descr;
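/* Worked example for normalise_tree (illustration only): if all eight
   leaves tree[0..7] hold the same SVal, the pairwise merging collapses
   everything and the result is just TREE_DESCR_64, with only tree[0]
   still valid.  If bytes 0..3 share one value but bytes 4..7 hold four
   distinct values, the result is
   TREE_DESCR_32_0 | TREE_DESCR_8_4 | TREE_DESCR_8_5
                   | TREE_DESCR_8_6 | TREE_DESCR_8_7. */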
1194 /* This takes a cacheline where all the data is at the leaves
1195 (w8[..]) and builds a correctly normalised tree. */
1196 static void normalise_CacheLine ( /*MOD*/CacheLine* cl )
1198 Word tno, cloff;
1199 for (tno = 0, cloff = 0; tno < N_LINE_TREES; tno++, cloff += 8) {
1200 SVal* tree = &cl->svals[cloff];
1201 cl->descrs[tno] = normalise_tree( tree );
1203 tl_assert(cloff == N_LINE_ARANGE);
1204 if (CHECK_ZSM)
1205 tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
1206 stats__cline_normalises++;
1210 typedef struct { UChar count; SVal sval; } CountedSVal;
1212 static
1213 void sequentialise_CacheLine ( /*OUT*/CountedSVal* dst,
1214 /*OUT*/Word* dstUsedP,
1215 Word nDst, CacheLine* src )
1217 Word tno, cloff, dstUsed;
1219 tl_assert(nDst == N_LINE_ARANGE);
1220 dstUsed = 0;
1222 for (tno = 0, cloff = 0; tno < N_LINE_TREES; tno++, cloff += 8) {
1223 UShort descr = src->descrs[tno];
1224 SVal* tree = &src->svals[cloff];
1226 /* sequentialise the tree described by (descr,tree). */
1227 # define PUT(_n,_v) \
1228 do { dst[dstUsed ].count = (_n); \
1229 dst[dstUsed++].sval = (_v); \
1230 } while (0)
1232 /* byte 0 */
1233 if (descr & TREE_DESCR_64) PUT(8, tree[0]); else
1234 if (descr & TREE_DESCR_32_0) PUT(4, tree[0]); else
1235 if (descr & TREE_DESCR_16_0) PUT(2, tree[0]); else
1236 if (descr & TREE_DESCR_8_0) PUT(1, tree[0]);
1237 /* byte 1 */
1238 if (descr & TREE_DESCR_8_1) PUT(1, tree[1]);
1239 /* byte 2 */
1240 if (descr & TREE_DESCR_16_1) PUT(2, tree[2]); else
1241 if (descr & TREE_DESCR_8_2) PUT(1, tree[2]);
1242 /* byte 3 */
1243 if (descr & TREE_DESCR_8_3) PUT(1, tree[3]);
1244 /* byte 4 */
1245 if (descr & TREE_DESCR_32_1) PUT(4, tree[4]); else
1246 if (descr & TREE_DESCR_16_2) PUT(2, tree[4]); else
1247 if (descr & TREE_DESCR_8_4) PUT(1, tree[4]);
1248 /* byte 5 */
1249 if (descr & TREE_DESCR_8_5) PUT(1, tree[5]);
1250 /* byte 6 */
1251 if (descr & TREE_DESCR_16_3) PUT(2, tree[6]); else
1252 if (descr & TREE_DESCR_8_6) PUT(1, tree[6]);
1253 /* byte 7 */
1254 if (descr & TREE_DESCR_8_7) PUT(1, tree[7]);
1256 # undef PUT
1257 /* END sequentialise the tree described by (descr,tree). */
1260 tl_assert(cloff == N_LINE_ARANGE);
1261 tl_assert(dstUsed <= nDst);
1263 *dstUsedP = dstUsed;
1266 /* Write the cacheline 'wix' to backing store. Where it ends up
1267 is determined by its tag field. */
1268 static __attribute__((noinline)) void cacheline_wback ( UWord wix )
1270 Word i, j, k, m;
1271 Addr tag;
1272 SecMap* sm;
1273 CacheLine* cl;
1274 LineZ* lineZ;
1275 LineF* lineF;
1276 Word zix, fix, csvalsUsed;
1277 CountedSVal csvals[N_LINE_ARANGE];
1278 SVal sv;
1280 if (0)
1281 VG_(printf)("scache wback line %d\n", (Int)wix);
1283 tl_assert(wix >= 0 && wix < N_WAY_NENT);
1285 tag = cache_shmem.tags0[wix];
1286 cl = &cache_shmem.lyns0[wix];
1288 /* The cache line may have been invalidated; if so, ignore it. */
1289 if (!is_valid_scache_tag(tag))
1290 return;
1292 /* Where are we going to put it? */
1293 sm = NULL;
1294 lineZ = NULL;
1295 lineF = NULL;
1296 zix = fix = -1;
1298 /* find the Z line to write in and rcdec it or the associated F
1299 line. */
1300 find_Z_for_writing( &sm, &zix, tag );
1302 tl_assert(sm);
1303 tl_assert(zix >= 0 && zix < N_SECMAP_ZLINES);
1304 lineZ = &sm->linesZ[zix];
1306 /* Generate the data to be stored */
1307 if (CHECK_ZSM)
1308 tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
1310 csvalsUsed = -1;
1311 sequentialise_CacheLine( csvals, &csvalsUsed,
1312 N_LINE_ARANGE, cl );
1313 tl_assert(csvalsUsed >= 1 && csvalsUsed <= N_LINE_ARANGE);
1314 if (0) VG_(printf)("%lu ", csvalsUsed);
1316 lineZ->dict[0] = lineZ->dict[1]
1317 = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
1319 /* i indexes actual shadow values, k is cursor in csvals */
1320 i = 0;
1321 for (k = 0; k < csvalsUsed; k++) {
1323 sv = csvals[k].sval;
1324 if (CHECK_ZSM)
1325 tl_assert(csvals[k].count >= 1 && csvals[k].count <= 8);
1326 /* do we already have it? */
1327 if (sv == lineZ->dict[0]) { j = 0; goto dict_ok; }
1328 if (sv == lineZ->dict[1]) { j = 1; goto dict_ok; }
1329 if (sv == lineZ->dict[2]) { j = 2; goto dict_ok; }
1330 if (sv == lineZ->dict[3]) { j = 3; goto dict_ok; }
1331 /* no. look for a free slot. */
1332 if (CHECK_ZSM)
1333 tl_assert(sv != SVal_INVALID);
1334 if (lineZ->dict[0]
1335 == SVal_INVALID) { lineZ->dict[0] = sv; j = 0; goto dict_ok; }
1336 if (lineZ->dict[1]
1337 == SVal_INVALID) { lineZ->dict[1] = sv; j = 1; goto dict_ok; }
1338 if (lineZ->dict[2]
1339 == SVal_INVALID) { lineZ->dict[2] = sv; j = 2; goto dict_ok; }
1340 if (lineZ->dict[3]
1341 == SVal_INVALID) { lineZ->dict[3] = sv; j = 3; goto dict_ok; }
1342 break; /* we'll have to use the f rep */
1343 dict_ok:
1344 m = csvals[k].count;
1345 if (m == 8) {
1346 write_twobit_array( lineZ->ix2s, i+0, j );
1347 write_twobit_array( lineZ->ix2s, i+1, j );
1348 write_twobit_array( lineZ->ix2s, i+2, j );
1349 write_twobit_array( lineZ->ix2s, i+3, j );
1350 write_twobit_array( lineZ->ix2s, i+4, j );
1351 write_twobit_array( lineZ->ix2s, i+5, j );
1352 write_twobit_array( lineZ->ix2s, i+6, j );
1353 write_twobit_array( lineZ->ix2s, i+7, j );
1354 i += 8;
1356 else if (m == 4) {
1357 write_twobit_array( lineZ->ix2s, i+0, j );
1358 write_twobit_array( lineZ->ix2s, i+1, j );
1359 write_twobit_array( lineZ->ix2s, i+2, j );
1360 write_twobit_array( lineZ->ix2s, i+3, j );
1361 i += 4;
1363 else if (m == 1) {
1364 write_twobit_array( lineZ->ix2s, i+0, j );
1365 i += 1;
1367 else if (m == 2) {
1368 write_twobit_array( lineZ->ix2s, i+0, j );
1369 write_twobit_array( lineZ->ix2s, i+1, j );
1370 i += 2;
1372 else {
1373 tl_assert(0); /* 8 4 2 or 1 are the only legitimate values for m */
1378 if (LIKELY(i == N_LINE_ARANGE)) {
1379 /* Construction of the compressed representation was
1380 successful. */
1381 rcinc_LineZ(lineZ);
1382 stats__cache_Z_wbacks++;
1383 } else {
1384 /* Cannot use the compressed(z) representation. Use the full(f)
1385 rep instead. */
1386 tl_assert(i >= 0 && i < N_LINE_ARANGE);
1387 alloc_F_for_writing( sm, &fix );
1388 tl_assert(sm->linesF);
1389 tl_assert(sm->linesF_size > 0);
1390 tl_assert(fix >= 0 && fix < (Word)sm->linesF_size);
1391 lineF = &sm->linesF[fix];
1392 tl_assert(!lineF->inUse);
1393 lineZ->dict[0] = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
1394 lineZ->dict[1] = (SVal)fix;
1395 lineF->inUse = True;
1396 i = 0;
1397 for (k = 0; k < csvalsUsed; k++) {
1398 if (CHECK_ZSM)
1399 tl_assert(csvals[k].count >= 1 && csvals[k].count <= 8);
1400 sv = csvals[k].sval;
1401 if (CHECK_ZSM)
1402 tl_assert(sv != SVal_INVALID);
1403 for (m = csvals[k].count; m > 0; m--) {
1404 lineF->w64s[i] = sv;
1405 i++;
1408 tl_assert(i == N_LINE_ARANGE);
1409 rcinc_LineF(lineF);
1410 stats__cache_F_wbacks++;
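/* Note on the writeback path above: the compressed (Z) representation
   fails exactly when the N_LINE_ARANGE bytes of the line need more than
   four distinct shadow values; only then is the full (F) representation,
   costing 8 shadow bytes per byte of address range, used instead. */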
1414 /* Fetch the cacheline 'wix' from the backing store. The tag
1415 associated with 'wix' is assumed to have already been filled in;
1416 hence that is used to determine where in the backing store to read
1417 from. */
1418 static __attribute__((noinline)) void cacheline_fetch ( UWord wix )
1420 Word i;
1421 Addr tag;
1422 CacheLine* cl;
1423 LineZ* lineZ;
1424 LineF* lineF;
1426 if (0)
1427 VG_(printf)("scache fetch line %d\n", (Int)wix);
1429 tl_assert(wix >= 0 && wix < N_WAY_NENT);
1431 tag = cache_shmem.tags0[wix];
1432 cl = &cache_shmem.lyns0[wix];
1434 /* reject nonsense requests */
1435 tl_assert(is_valid_scache_tag(tag));
1437 lineZ = NULL;
1438 lineF = NULL;
1439 find_ZF_for_reading( &lineZ, &lineF, tag );
1440 tl_assert( (lineZ && !lineF) || (!lineZ && lineF) );
1442 /* expand the data into the bottom layer of the tree, then get
1443 cacheline_normalise to build the descriptor array. */
1444 if (lineF) {
1445 tl_assert(lineF->inUse);
1446 for (i = 0; i < N_LINE_ARANGE; i++) {
1447 cl->svals[i] = lineF->w64s[i];
1449 stats__cache_F_fetches++;
1450 } else {
1451 for (i = 0; i < N_LINE_ARANGE; i++) {
1452 SVal sv;
1453 UWord ix = read_twobit_array( lineZ->ix2s, i );
1454 /* correct, but expensive: tl_assert(ix >= 0 && ix <= 3); */
1455 sv = lineZ->dict[ix];
1456 tl_assert(sv != SVal_INVALID);
1457 cl->svals[i] = sv;
1459 stats__cache_Z_fetches++;
1461 normalise_CacheLine( cl );
1464 static void shmem__invalidate_scache ( void ) {
1465 Word wix;
1466 if (0) VG_(printf)("%s","scache inval\n");
1467 tl_assert(!is_valid_scache_tag(1));
1468 for (wix = 0; wix < N_WAY_NENT; wix++) {
1469 cache_shmem.tags0[wix] = 1/*INVALID*/;
1471 stats__cache_invals++;
1474 static void shmem__flush_and_invalidate_scache ( void ) {
1475 Word wix;
1476 Addr tag;
1477 if (0) VG_(printf)("%s","scache flush and invalidate\n");
1478 tl_assert(!is_valid_scache_tag(1));
1479 for (wix = 0; wix < N_WAY_NENT; wix++) {
1480 tag = cache_shmem.tags0[wix];
1481 if (tag == 1/*INVALID*/) {
1482 /* already invalid; nothing to do */
1483 } else {
1484 tl_assert(is_valid_scache_tag(tag));
1485 cacheline_wback( wix );
1487 cache_shmem.tags0[wix] = 1/*INVALID*/;
1489 stats__cache_flushes++;
1490 stats__cache_invals++;
1494 static inline Bool aligned16 ( Addr a ) {
1495 return 0 == (a & 1);
1497 static inline Bool aligned32 ( Addr a ) {
1498 return 0 == (a & 3);
1500 static inline Bool aligned64 ( Addr a ) {
1501 return 0 == (a & 7);
1503 static inline UWord get_cacheline_offset ( Addr a ) {
1504 return (UWord)(a & (N_LINE_ARANGE - 1));
1506 static inline Addr cacheline_ROUNDUP ( Addr a ) {
1507 return ROUNDUP(a, N_LINE_ARANGE);
1509 static inline Addr cacheline_ROUNDDN ( Addr a ) {
1510 return ROUNDDN(a, N_LINE_ARANGE);
1512 static inline UWord get_treeno ( Addr a ) {
1513 return get_cacheline_offset(a) >> 3;
1515 static inline UWord get_tree_offset ( Addr a ) {
1516 return a & 7;
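/* Worked example for the helpers above and get_cacheline below
   (illustration only).  With N_LINE_BITS == 6 and N_WAY_BITS == 16,
   the address a == 0x1234567 decomposes as:
     tag       = a & ~(N_LINE_ARANGE-1)      == 0x1234540
     way index = (a >> N_LINE_BITS) & 0xFFFF == 0x8D15
     tree no   = (a & 63) >> 3               == 4
     tree off  = a & 7                       == 7
   ie, byte 7 of tree 4 within the line based at 0x1234540, held in
   direct-mapped cache slot 0x8D15. */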
1519 static __attribute__((noinline))
1520 CacheLine* get_cacheline_MISS ( Addr a ); /* fwds */
1521 static inline CacheLine* get_cacheline ( Addr a )
1523 /* tag is 'a' with the in-line offset masked out,
1524 eg a[31]..a[4] 0000 */
1525 Addr tag = a & ~(N_LINE_ARANGE - 1);
1526 UWord wix = (a >> N_LINE_BITS) & (N_WAY_NENT - 1);
1527 stats__cache_totrefs++;
1528 if (LIKELY(tag == cache_shmem.tags0[wix])) {
1529 return &cache_shmem.lyns0[wix];
1530 } else {
1531 return get_cacheline_MISS( a );
1535 static __attribute__((noinline))
1536 CacheLine* get_cacheline_MISS ( Addr a )
1538 /* tag is 'a' with the in-line offset masked out,
1539 eg a[31]..a[4] 0000 */
1541 CacheLine* cl;
1542 Addr* tag_old_p;
1543 Addr tag = a & ~(N_LINE_ARANGE - 1);
1544 UWord wix = (a >> N_LINE_BITS) & (N_WAY_NENT - 1);
1546 tl_assert(tag != cache_shmem.tags0[wix]);
1548 /* Dump the old line into the backing store. */
1549 stats__cache_totmisses++;
1551 cl = &cache_shmem.lyns0[wix];
1552 tag_old_p = &cache_shmem.tags0[wix];
1554 if (is_valid_scache_tag( *tag_old_p )) {
1555 /* EXPENSIVE and REDUNDANT: callee does it */
1556 if (CHECK_ZSM)
1557 tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
1558 cacheline_wback( wix );
1560 /* and reload the new one */
1561 *tag_old_p = tag;
1562 cacheline_fetch( wix );
1563 if (CHECK_ZSM)
1564 tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
1565 return cl;
1568 static UShort pulldown_to_32 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) {
1569 stats__cline_64to32pulldown++;
1570 switch (toff) {
1571 case 0: case 4:
1572 tl_assert(descr & TREE_DESCR_64);
1573 tree[4] = tree[0];
1574 descr &= ~TREE_DESCR_64;
1575 descr |= (TREE_DESCR_32_1 | TREE_DESCR_32_0);
1576 break;
1577 default:
1578 tl_assert(0);
1580 return descr;
1583 static UShort pulldown_to_16 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) {
1584 stats__cline_32to16pulldown++;
1585 switch (toff) {
1586 case 0: case 2:
1587 if (!(descr & TREE_DESCR_32_0)) {
1588 descr = pulldown_to_32(tree, 0, descr);
1590 tl_assert(descr & TREE_DESCR_32_0);
1591 tree[2] = tree[0];
1592 descr &= ~TREE_DESCR_32_0;
1593 descr |= (TREE_DESCR_16_1 | TREE_DESCR_16_0);
1594 break;
1595 case 4: case 6:
1596 if (!(descr & TREE_DESCR_32_1)) {
1597 descr = pulldown_to_32(tree, 4, descr);
1599 tl_assert(descr & TREE_DESCR_32_1);
1600 tree[6] = tree[4];
1601 descr &= ~TREE_DESCR_32_1;
1602 descr |= (TREE_DESCR_16_3 | TREE_DESCR_16_2);
1603 break;
1604 default:
1605 tl_assert(0);
1607 return descr;
1610 static UShort pulldown_to_8 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) {
1611 stats__cline_16to8pulldown++;
1612 switch (toff) {
1613 case 0: case 1:
1614 if (!(descr & TREE_DESCR_16_0)) {
1615 descr = pulldown_to_16(tree, 0, descr);
1617 tl_assert(descr & TREE_DESCR_16_0);
1618 tree[1] = tree[0];
1619 descr &= ~TREE_DESCR_16_0;
1620 descr |= (TREE_DESCR_8_1 | TREE_DESCR_8_0);
1621 break;
1622 case 2: case 3:
1623 if (!(descr & TREE_DESCR_16_1)) {
1624 descr = pulldown_to_16(tree, 2, descr);
1626 tl_assert(descr & TREE_DESCR_16_1);
1627 tree[3] = tree[2];
1628 descr &= ~TREE_DESCR_16_1;
1629 descr |= (TREE_DESCR_8_3 | TREE_DESCR_8_2);
1630 break;
1631 case 4: case 5:
1632 if (!(descr & TREE_DESCR_16_2)) {
1633 descr = pulldown_to_16(tree, 4, descr);
1635 tl_assert(descr & TREE_DESCR_16_2);
1636 tree[5] = tree[4];
1637 descr &= ~TREE_DESCR_16_2;
1638 descr |= (TREE_DESCR_8_5 | TREE_DESCR_8_4);
1639 break;
1640 case 6: case 7:
1641 if (!(descr & TREE_DESCR_16_3)) {
1642 descr = pulldown_to_16(tree, 6, descr);
1644 tl_assert(descr & TREE_DESCR_16_3);
1645 tree[7] = tree[6];
1646 descr &= ~TREE_DESCR_16_3;
1647 descr |= (TREE_DESCR_8_7 | TREE_DESCR_8_6);
1648 break;
1649 default:
1650 tl_assert(0);
1652 return descr;
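/* Worked example for the pulldown_* functions above (illustration only).
   Suppose the tree holds one 64-bit value (descr == TREE_DESCR_64) and an
   8-bit access touches byte 5.  pulldown_to_8(tree, 5, descr) first calls
   pulldown_to_16(tree, 4, ..), which in turn calls pulldown_to_32(tree, 4, ..):
     64    -> 32_1 | 32_0      (tree[4] = tree[0])
     32_1  -> 16_3 | 16_2      (tree[6] = tree[4])
     16_2  -> 8_5  | 8_4       (tree[5] = tree[4])
   leaving descr == TREE_DESCR_32_0 | TREE_DESCR_16_3
                    | TREE_DESCR_8_5 | TREE_DESCR_8_4. */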
1656 static UShort pullup_descr_to_16 ( UShort descr, UWord toff ) {
1657 UShort mask;
1658 switch (toff) {
1659 case 0:
1660 mask = TREE_DESCR_8_1 | TREE_DESCR_8_0;
1661 tl_assert( (descr & mask) == mask );
1662 descr &= ~mask;
1663 descr |= TREE_DESCR_16_0;
1664 break;
1665 case 2:
1666 mask = TREE_DESCR_8_3 | TREE_DESCR_8_2;
1667 tl_assert( (descr & mask) == mask );
1668 descr &= ~mask;
1669 descr |= TREE_DESCR_16_1;
1670 break;
1671 case 4:
1672 mask = TREE_DESCR_8_5 | TREE_DESCR_8_4;
1673 tl_assert( (descr & mask) == mask );
1674 descr &= ~mask;
1675 descr |= TREE_DESCR_16_2;
1676 break;
1677 case 6:
1678 mask = TREE_DESCR_8_7 | TREE_DESCR_8_6;
1679 tl_assert( (descr & mask) == mask );
1680 descr &= ~mask;
1681 descr |= TREE_DESCR_16_3;
1682 break;
1683 default:
1684 tl_assert(0);
1686 return descr;
1689 static UShort pullup_descr_to_32 ( UShort descr, UWord toff ) {
1690 UShort mask;
1691 switch (toff) {
1692 case 0:
1693 if (!(descr & TREE_DESCR_16_0))
1694 descr = pullup_descr_to_16(descr, 0);
1695 if (!(descr & TREE_DESCR_16_1))
1696 descr = pullup_descr_to_16(descr, 2);
1697 mask = TREE_DESCR_16_1 | TREE_DESCR_16_0;
1698 tl_assert( (descr & mask) == mask );
1699 descr &= ~mask;
1700 descr |= TREE_DESCR_32_0;
1701 break;
1702 case 4:
1703 if (!(descr & TREE_DESCR_16_2))
1704 descr = pullup_descr_to_16(descr, 4);
1705 if (!(descr & TREE_DESCR_16_3))
1706 descr = pullup_descr_to_16(descr, 6);
1707 mask = TREE_DESCR_16_3 | TREE_DESCR_16_2;
1708 tl_assert( (descr & mask) == mask );
1709 descr &= ~mask;
1710 descr |= TREE_DESCR_32_1;
1711 break;
1712 default:
1713 tl_assert(0);
1715 return descr;
1718 static Bool valid_value_is_above_me_32 ( UShort descr, UWord toff ) {
1719 switch (toff) {
1720 case 0: case 4:
1721 return 0 != (descr & TREE_DESCR_64);
1722 default:
1723 tl_assert(0);
1727 static Bool valid_value_is_below_me_16 ( UShort descr, UWord toff ) {
1728 switch (toff) {
1729 case 0:
1730 return 0 != (descr & (TREE_DESCR_8_1 | TREE_DESCR_8_0));
1731 case 2:
1732 return 0 != (descr & (TREE_DESCR_8_3 | TREE_DESCR_8_2));
1733 case 4:
1734 return 0 != (descr & (TREE_DESCR_8_5 | TREE_DESCR_8_4));
1735 case 6:
1736 return 0 != (descr & (TREE_DESCR_8_7 | TREE_DESCR_8_6));
1737 default:
1738 tl_assert(0);
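/* Worked example (illustrative only, using the TREE_DESCR_* names that
   appear above): suppose a tree currently holds a single valid 64-bit
   value, so descr is just TREE_DESCR_64.  A 1-byte access at tree
   offset 3 calls pulldown_to_8(tree, 3, descr), which recursively
   calls pulldown_to_16(tree, 2, ..) and pulldown_to_32(tree, 0, ..).
   The descriptor evolves as

      TREE_DESCR_64
      -> TREE_DESCR_32_1 | TREE_DESCR_32_0                     (tree[4] = tree[0])
      -> TREE_DESCR_32_1 | TREE_DESCR_16_1 | TREE_DESCR_16_0   (tree[2] = tree[0])
      -> TREE_DESCR_32_1 | TREE_DESCR_16_0
                         | TREE_DESCR_8_3 | TREE_DESCR_8_2     (tree[3] = tree[2])

   The pullup_descr_to_16/32 functions perform the inverse merge on the
   descriptor bits only; recombining the SVals themselves is left to
   their callers. */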
1742 /* ------------ Cache management ------------ */
1744 static void zsm_flush_cache ( void )
1746 shmem__flush_and_invalidate_scache();
1750 static void zsm_init ( void(*p_rcinc)(SVal), void(*p_rcdec)(SVal) )
1752 tl_assert( sizeof(UWord) == sizeof(Addr) );
1754 rcinc = p_rcinc;
1755 rcdec = p_rcdec;
1757 tl_assert(map_shmem == NULL);
1758 map_shmem = VG_(newFM)( HG_(zalloc), "libhb.zsm_init.1 (map_shmem)",
1759 HG_(free),
1760 NULL/*unboxed UWord cmp*/);
1761 shmem__invalidate_scache();
1763 /* a SecMap must contain an integral number of CacheLines */
1764 tl_assert(0 == (N_SECMAP_ARANGE % N_LINE_ARANGE));
1765 /* also ... a CacheLine holds an integral number of trees */
1766 tl_assert(0 == (N_LINE_ARANGE % 8));
1769 /////////////////////////////////////////////////////////////////
1770 /////////////////////////////////////////////////////////////////
1771 // //
1772 // SECTION END compressed shadow memory //
1773 // //
1774 /////////////////////////////////////////////////////////////////
1775 /////////////////////////////////////////////////////////////////
1779 /////////////////////////////////////////////////////////////////
1780 /////////////////////////////////////////////////////////////////
1781 // //
1782 // SECTION BEGIN vts primitives //
1783 // //
1784 /////////////////////////////////////////////////////////////////
1785 /////////////////////////////////////////////////////////////////
1788 /* There's a 1-1 mapping between Thr and ThrIDs -- the latter merely
1789 being compact stand-ins for Thr*'s. Use these functions to map
1790 between them. */
1791 static ThrID Thr__to_ThrID ( Thr* thr ); /* fwds */
1792 static Thr* Thr__from_ThrID ( ThrID thrid ); /* fwds */
1794 __attribute__((noreturn))
1795 static void scalarts_limitations_fail_NORETURN ( Bool due_to_nThrs )
1797 if (due_to_nThrs) {
1798 const HChar* s =
1799 "\n"
1800 "Helgrind: cannot continue, run aborted: too many threads.\n"
1801 "Sorry. Helgrind can only handle programs that create\n"
1802 "%'llu or fewer threads over their entire lifetime.\n"
1803 "\n";
1804 VG_(umsg)(s, (ULong)(ThrID_MAX_VALID - 1024));
1805 } else {
1806 const HChar* s =
1807 "\n"
1808 "Helgrind: cannot continue, run aborted: too many\n"
1809 "synchronisation events. Sorry. Helgrind can only handle\n"
1810 "programs which perform %'llu or fewer\n"
1811 "inter-thread synchronisation events (locks, unlocks, etc).\n"
1812 "\n";
1813 VG_(umsg)(s, (1ULL << SCALARTS_N_TYMBITS) - 1);
1815 VG_(exit)(1);
1816 /*NOTREACHED*/
1817 tl_assert(0); /*wtf?!*/
1821 /* The dead thread (ThrID, actually) table. A thread may only be
1822 listed here if we have been notified thereof by libhb_async_exit.
1823 New entries are added at the end. The order isn't important, but
1824 the ThrID values must be unique. This table lists the identity of
1825 all threads that have ever died -- none are ever removed. We keep
1826 this table so as to be able to prune entries from VTSs. We don't
1827 actually need to keep the set of threads that have ever died --
1828 only the threads that have died since the previous round of
1829 pruning. But it's useful for sanity check purposes to keep the
1830 entire set, so we do. */
1831 static XArray* /* of ThrID */ verydead_thread_table = NULL;
1833 /* Arbitrary total ordering on ThrIDs. */
1834 static Int cmp__ThrID ( const void* v1, const void* v2 ) {
1835 ThrID id1 = *(const ThrID*)v1;
1836 ThrID id2 = *(const ThrID*)v2;
1837 if (id1 < id2) return -1;
1838 if (id1 > id2) return 1;
1839 return 0;
1842 static void verydead_thread_table_init ( void )
1844 tl_assert(!verydead_thread_table);
1845 verydead_thread_table
1846 = VG_(newXA)( HG_(zalloc),
1847 "libhb.verydead_thread_table_init.1",
1848 HG_(free), sizeof(ThrID) );
1849 VG_(setCmpFnXA)(verydead_thread_table, cmp__ThrID);
1853 /* A VTS contains .ts, its vector clock, and also .id, a field to hold
1854 a backlink for the caller's convenience. Since we have no idea
1855 what to set that to in the library, it always gets set to
1856 VtsID_INVALID. */
1857 typedef
1858 struct {
1859 VtsID id;
1860 UInt usedTS;
1861 UInt sizeTS;
1862 ScalarTS ts[0];
1864 VTS;
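/* Illustrative reading of this representation: .ts[] is a sparse
   vector clock, sorted by .thrid.  For example a VTS whose entries are
   { (thrid 1024, tym 3), (thrid 1027, tym 1) } says "thread 1024 is at
   local time 3 and thread 1027 at local time 1"; any ThrID not listed
   is implicitly at time zero.  The sortedness (checked by is_sane_VTS
   below) is what lets VTS__tick, VTS__join and VTS__cmpLEQ process two
   clocks in a single merge-style pass. */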
1866 /* Allocate a VTS capable of storing 'sizeTS' entries. */
1867 static VTS* VTS__new ( const HChar* who, UInt sizeTS );
1869 /* Make a clone of 'vts', sizing the new array to exactly match the
1870 number of ScalarTSs present. */
1871 static VTS* VTS__clone ( const HChar* who, VTS* vts );
1873 /* Make a clone of 'vts' with the thrids in 'thrids' removed. The new
1874 array is sized exactly to hold the number of required elements.
1875 'thridsToDel' is an array of ThrIDs to be omitted in the clone, and
1876 must be in strictly increasing order. */
1877 static VTS* VTS__subtract ( const HChar* who, VTS* vts, XArray* thridsToDel );
1879 /* Delete this VTS in its entirety. */
1880 static void VTS__delete ( VTS* vts );
1882 /* Create a new singleton VTS in 'out'. Caller must have
1883 pre-allocated 'out' sufficiently big to hold the result in all
1884 possible cases. */
1885 static void VTS__singleton ( /*OUT*/VTS* out, Thr* thr, ULong tym );
1887 /* Create in 'out' a VTS which is the same as 'vts' except with
1888 vts[me]++, so to speak. Caller must have pre-allocated 'out'
1889 sufficiently big to hold the result in all possible cases. */
1890 static void VTS__tick ( /*OUT*/VTS* out, Thr* me, VTS* vts );
1892 /* Create in 'out' a VTS which is the join (max) of 'a' and
1893 'b'. Caller must have pre-allocated 'out' sufficiently big to hold
1894 the result in all possible cases. */
1895 static void VTS__join ( /*OUT*/VTS* out, VTS* a, VTS* b );
1897 /* Compute the partial ordering relation of the two args. Although we
1898 could be completely general and return an enumeration value (EQ,
1899 LT, GT, UN), in fact we only need LEQ, and so we may as well
1900 hardwire that fact.
1902 Returns zero iff LEQ(A,B), or a valid ThrID if not (zero is an
1903 invalid ThrID). In the latter case, the returned ThrID indicates
1904 a point at which LEQ fails to hold. There may be more
1905 than one such point, but we only care about seeing one of them, not
1906 all of them. This rather strange convention is used because
1907 sometimes we want to know the actual index at which they first
1908 differ. */
1909 static UInt VTS__cmpLEQ ( VTS* a, VTS* b );
1911 /* Compute an arbitrary structural (total) ordering on the two args,
1912 based on their VCs, so they can be looked up in a table, tree, etc.
1913 Returns -1, 0 or 1. */
1914 static Word VTS__cmp_structural ( VTS* a, VTS* b );
1916 /* Debugging only. Display the given VTS in the buffer. */
1917 static void VTS__show ( HChar* buf, Int nBuf, VTS* vts );
1919 /* Debugging only. Return vts[index], so to speak. */
1920 static ULong VTS__indexAt_SLOW ( VTS* vts, Thr* idx );
1922 /* Notify the VTS machinery that a thread has been declared
1923 comprehensively dead: that is, it has done an async exit AND it has
1924 been joined with. This should ensure that its local clocks (.viR
1925 and .viW) will never again change, and so all mentions of this
1926 thread from all VTSs in the system may be removed. */
1927 static void VTS__declare_thread_very_dead ( Thr* idx );
1929 /*--------------- to do with Vector Timestamps ---------------*/
1931 static Bool is_sane_VTS ( VTS* vts )
1933 UWord i, n;
1934 ScalarTS *st1, *st2;
1935 if (!vts) return False;
1936 if (vts->usedTS > vts->sizeTS) return False;
1937 n = vts->usedTS;
1938 if (n == 1) {
1939 st1 = &vts->ts[0];
1940 if (st1->tym == 0)
1941 return False;
1943 else
1944 if (n >= 2) {
1945 for (i = 0; i < n-1; i++) {
1946 st1 = &vts->ts[i];
1947 st2 = &vts->ts[i+1];
1948 if (st1->thrid >= st2->thrid)
1949 return False;
1950 if (st1->tym == 0 || st2->tym == 0)
1951 return False;
1954 return True;
1958 /* Create a new, empty VTS.
1960 static VTS* VTS__new ( const HChar* who, UInt sizeTS )
1962 VTS* vts = HG_(zalloc)(who, sizeof(VTS) + (sizeTS+1) * sizeof(ScalarTS));
1963 tl_assert(vts->usedTS == 0);
1964 vts->sizeTS = sizeTS;
1965 *(ULong*)(&vts->ts[sizeTS]) = 0x0ddC0ffeeBadF00dULL;
1966 return vts;
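/* Note: one ScalarTS slot beyond 'sizeTS' is allocated, and the value
   0x0ddC0ffeeBadF00dULL written into it serves as a canary.
   VTS__clone, VTS__subtract, VTS__delete and the pruning code all
   re-check that this marker is still intact, which catches accidental
   writes past the declared capacity of .ts[]. */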
1969 /* Clone this VTS.
1971 static VTS* VTS__clone ( const HChar* who, VTS* vts )
1973 tl_assert(vts);
1974 tl_assert( *(ULong*)(&vts->ts[vts->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
1975 UInt nTS = vts->usedTS;
1976 VTS* clone = VTS__new(who, nTS);
1977 clone->id = vts->id;
1978 clone->sizeTS = nTS;
1979 clone->usedTS = nTS;
1980 UInt i;
1981 for (i = 0; i < nTS; i++) {
1982 clone->ts[i] = vts->ts[i];
1984 tl_assert( *(ULong*)(&clone->ts[clone->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
1985 return clone;
1989 /* Make a clone of a VTS with specified ThrIDs removed. 'thridsToDel'
1990 must be in strictly increasing order. We could obviously do this
1991 much more efficiently (in linear time) if necessary.
1993 static VTS* VTS__subtract ( const HChar* who, VTS* vts, XArray* thridsToDel )
1995 UInt i, j;
1996 tl_assert(vts);
1997 tl_assert(thridsToDel);
1998 tl_assert( *(ULong*)(&vts->ts[vts->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
1999 UInt nTS = vts->usedTS;
2000 /* Figure out how many ScalarTSs will remain in the output. */
2001 UInt nReq = nTS;
2002 for (i = 0; i < nTS; i++) {
2003 ThrID thrid = vts->ts[i].thrid;
2004 if (VG_(lookupXA)(thridsToDel, &thrid, NULL, NULL))
2005 nReq--;
2007 tl_assert(nReq <= nTS);
2008 /* Copy the ones that will remain. */
2009 VTS* res = VTS__new(who, nReq);
2010 j = 0;
2011 for (i = 0; i < nTS; i++) {
2012 ThrID thrid = vts->ts[i].thrid;
2013 if (VG_(lookupXA)(thridsToDel, &thrid, NULL, NULL))
2014 continue;
2015 res->ts[j++] = vts->ts[i];
2017 tl_assert(j == nReq);
2018 tl_assert(j == res->sizeTS);
2019 res->usedTS = j;
2020 tl_assert( *(ULong*)(&res->ts[j]) == 0x0ddC0ffeeBadF00dULL);
2021 return res;
2025 /* Delete this VTS in its entirety.
2027 static void VTS__delete ( VTS* vts )
2029 tl_assert(vts);
2030 tl_assert(vts->usedTS <= vts->sizeTS);
2031 tl_assert( *(ULong*)(&vts->ts[vts->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
2032 HG_(free)(vts);
2036 /* Create a new singleton VTS.
2038 static void VTS__singleton ( /*OUT*/VTS* out, Thr* thr, ULong tym )
2040 tl_assert(thr);
2041 tl_assert(tym >= 1);
2042 tl_assert(out);
2043 tl_assert(out->usedTS == 0);
2044 tl_assert(out->sizeTS >= 1);
2045 UInt hi = out->usedTS++;
2046 out->ts[hi].thrid = Thr__to_ThrID(thr);
2047 out->ts[hi].tym = tym;
2051 /* Return a new VTS in which vts[me]++, so to speak. 'vts' itself is
2052 not modified.
2054 static void VTS__tick ( /*OUT*/VTS* out, Thr* me, VTS* vts )
2056 UInt i, n;
2057 ThrID me_thrid;
2058 Bool found = False;
2060 stats__vts__tick++;
2062 tl_assert(out);
2063 tl_assert(out->usedTS == 0);
2064 if (vts->usedTS >= ThrID_MAX_VALID)
2065 scalarts_limitations_fail_NORETURN( True/*due_to_nThrs*/ );
2066 tl_assert(out->sizeTS >= 1 + vts->usedTS);
2068 tl_assert(me);
2069 me_thrid = Thr__to_ThrID(me);
2070 tl_assert(is_sane_VTS(vts));
2071 n = vts->usedTS;
2073 /* Copy all entries which precede 'me'. */
2074 for (i = 0; i < n; i++) {
2075 ScalarTS* here = &vts->ts[i];
2076 if (UNLIKELY(here->thrid >= me_thrid))
2077 break;
2078 UInt hi = out->usedTS++;
2079 out->ts[hi] = *here;
2082 /* 'i' now indicates the next entry to copy, if any.
2083 There are 3 possibilities:
2084 (a) there is no next entry (we used them all up already):
2085 add (me_thrid,1) to the output, and quit
2086 (b) there is a next entry, and its thrid > me_thrid:
2087 add (me_thrid,1) to the output, then copy the remaining entries
2088 (c) there is a next entry, and its thrid == me_thrid:
2089 copy it to the output but increment its timestamp value.
2090 Then copy the remaining entries. (c) is the common case.
2092 tl_assert(i >= 0 && i <= n);
2093 if (i == n) { /* case (a) */
2094 UInt hi = out->usedTS++;
2095 out->ts[hi].thrid = me_thrid;
2096 out->ts[hi].tym = 1;
2097 } else {
2098 /* cases (b) and (c) */
2099 ScalarTS* here = &vts->ts[i];
2100 if (me_thrid == here->thrid) { /* case (c) */
2101 if (UNLIKELY(here->tym >= (1ULL << SCALARTS_N_TYMBITS) - 2ULL)) {
2102 /* We're hosed. We have to stop. */
2103 scalarts_limitations_fail_NORETURN( False/*!due_to_nThrs*/ );
2105 UInt hi = out->usedTS++;
2106 out->ts[hi].thrid = here->thrid;
2107 out->ts[hi].tym = here->tym + 1;
2108 i++;
2109 found = True;
2110 } else { /* case (b) */
2111 UInt hi = out->usedTS++;
2112 out->ts[hi].thrid = me_thrid;
2113 out->ts[hi].tym = 1;
2115 /* And copy any remaining entries. */
2116 for (/*keepgoing*/; i < n; i++) {
2117 ScalarTS* here2 = &vts->ts[i];
2118 UInt hi = out->usedTS++;
2119 out->ts[hi] = *here2;
2123 tl_assert(is_sane_VTS(out));
2124 tl_assert(out->usedTS == vts->usedTS + (found ? 0 : 1));
2125 tl_assert(out->usedTS <= out->sizeTS);
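/* Worked example (illustrative): with vts = [ (1024,3), (1030,7) ],
   ticking thread 1030 hits case (c) and gives
      out = [ (1024,3), (1030,8) ]
   whereas ticking a thread with ThrID 1027, absent from 'vts', hits
   case (b) and gives
      out = [ (1024,3), (1027,1), (1030,7) ]
   i.e. an absent thread is treated as being at time zero, so its first
   tick introduces the entry (thrid,1). */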
2129 /* Return a new VTS constructed as the join (max) of the 2 args.
2130 Neither arg is modified.
2132 static void VTS__join ( /*OUT*/VTS* out, VTS* a, VTS* b )
2134 UInt ia, ib, useda, usedb;
2135 ULong tyma, tymb, tymMax;
2136 ThrID thrid;
2137 UInt ncommon = 0;
2139 stats__vts__join++;
2141 tl_assert(a);
2142 tl_assert(b);
2143 useda = a->usedTS;
2144 usedb = b->usedTS;
2146 tl_assert(out);
2147 tl_assert(out->usedTS == 0);
2148 /* overly conservative test, but doing better involves comparing
2149 the two VTSs, which we don't want to do at this point. */
2150 if (useda + usedb >= ThrID_MAX_VALID)
2151 scalarts_limitations_fail_NORETURN( True/*due_to_nThrs*/ );
2152 tl_assert(out->sizeTS >= useda + usedb);
2154 ia = ib = 0;
2156 while (1) {
2158 /* This logic is to enumerate triples (thrid, tyma, tymb) drawn
2159 from a and b in order, where thrid is the next ThrID
2160 occurring in either a or b, and tyma/b are the relevant
2161 scalar timestamps, taking into account implicit zeroes. */
2162 tl_assert(ia >= 0 && ia <= useda);
2163 tl_assert(ib >= 0 && ib <= usedb);
2165 if (ia == useda && ib == usedb) {
2166 /* both empty - done */
2167 break;
2169 } else if (ia == useda && ib != usedb) {
2170 /* a empty, use up b */
2171 ScalarTS* tmpb = &b->ts[ib];
2172 thrid = tmpb->thrid;
2173 tyma = 0;
2174 tymb = tmpb->tym;
2175 ib++;
2177 } else if (ia != useda && ib == usedb) {
2178 /* b empty, use up a */
2179 ScalarTS* tmpa = &a->ts[ia];
2180 thrid = tmpa->thrid;
2181 tyma = tmpa->tym;
2182 tymb = 0;
2183 ia++;
2185 } else {
2186 /* both not empty; extract lowest-ThrID'd triple */
2187 ScalarTS* tmpa = &a->ts[ia];
2188 ScalarTS* tmpb = &b->ts[ib];
2189 if (tmpa->thrid < tmpb->thrid) {
2190 /* a has the lowest unconsidered ThrID */
2191 thrid = tmpa->thrid;
2192 tyma = tmpa->tym;
2193 tymb = 0;
2194 ia++;
2195 } else if (tmpa->thrid > tmpb->thrid) {
2196 /* b has the lowest unconsidered ThrID */
2197 thrid = tmpb->thrid;
2198 tyma = 0;
2199 tymb = tmpb->tym;
2200 ib++;
2201 } else {
2202 /* they both next mention the same ThrID */
2203 tl_assert(tmpa->thrid == tmpb->thrid);
2204 thrid = tmpa->thrid; /* == tmpb->thrid */
2205 tyma = tmpa->tym;
2206 tymb = tmpb->tym;
2207 ia++;
2208 ib++;
2209 ncommon++;
2213 /* having laboriously determined (thr, tyma, tymb), do something
2214 useful with it. */
2215 tymMax = tyma > tymb ? tyma : tymb;
2216 if (tymMax > 0) {
2217 UInt hi = out->usedTS++;
2218 out->ts[hi].thrid = thrid;
2219 out->ts[hi].tym = tymMax;
2224 tl_assert(is_sane_VTS(out));
2225 tl_assert(out->usedTS <= out->sizeTS);
2226 tl_assert(out->usedTS == useda + usedb - ncommon);
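/* Worked example (illustrative): joining
      a = [ (1024,3), (1030,7) ]
      b = [ (1024,5), (1027,2) ]
   enumerates ThrIDs 1024, 1027, 1030 in order, takes the pointwise
   maximum (missing entries count as zero) and yields
      out = [ (1024,5), (1027,2), (1030,7) ].
   Only ThrID 1024 occurs in both inputs, so ncommon == 1 and the final
   size check reads 3 == 2 + 2 - 1. */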
2230 /* Determine if 'a' <= 'b', in the partial ordering. Returns zero if
2231 they are, or the first ThrID for which they are not (no valid ThrID
2232 has the value zero). This rather strange convention is used
2233 because sometimes we want to know the actual index at which they
2234 first differ. */
2235 static UInt/*ThrID*/ VTS__cmpLEQ ( VTS* a, VTS* b )
2237 Word ia, ib, useda, usedb;
2238 ULong tyma, tymb;
2240 stats__vts__cmpLEQ++;
2242 tl_assert(a);
2243 tl_assert(b);
2244 useda = a->usedTS;
2245 usedb = b->usedTS;
2247 ia = ib = 0;
2249 while (1) {
2251 /* This logic is to enumerate doubles (tyma, tymb) drawn
2252 from a and b in order, and tyma/b are the relevant
2253 scalar timestamps, taking into account implicit zeroes. */
2254 ThrID thrid;
2256 tl_assert(ia >= 0 && ia <= useda);
2257 tl_assert(ib >= 0 && ib <= usedb);
2259 if (ia == useda && ib == usedb) {
2260 /* both empty - done */
2261 break;
2263 } else if (ia == useda && ib != usedb) {
2264 /* a empty, use up b */
2265 ScalarTS* tmpb = &b->ts[ib];
2266 tyma = 0;
2267 tymb = tmpb->tym;
2268 thrid = tmpb->thrid;
2269 ib++;
2271 } else if (ia != useda && ib == usedb) {
2272 /* b empty, use up a */
2273 ScalarTS* tmpa = &a->ts[ia];
2274 tyma = tmpa->tym;
2275 thrid = tmpa->thrid;
2276 tymb = 0;
2277 ia++;
2279 } else {
2280 /* both not empty; extract lowest-ThrID'd triple */
2281 ScalarTS* tmpa = &a->ts[ia];
2282 ScalarTS* tmpb = &b->ts[ib];
2283 if (tmpa->thrid < tmpb->thrid) {
2284 /* a has the lowest unconsidered ThrID */
2285 tyma = tmpa->tym;
2286 thrid = tmpa->thrid;
2287 tymb = 0;
2288 ia++;
2290 else
2291 if (tmpa->thrid > tmpb->thrid) {
2292 /* b has the lowest unconsidered ThrID */
2293 tyma = 0;
2294 tymb = tmpb->tym;
2295 thrid = tmpb->thrid;
2296 ib++;
2297 } else {
2298 /* they both next mention the same ThrID */
2299 tl_assert(tmpa->thrid == tmpb->thrid);
2300 tyma = tmpa->tym;
2301 thrid = tmpa->thrid;
2302 tymb = tmpb->tym;
2303 ia++;
2304 ib++;
2308 /* having laboriously determined (tyma, tymb), do something
2309 useful with it. */
2310 if (tyma > tymb) {
2311 /* not LEQ at this index. Quit, since the answer is
2312 determined already. */
2313 tl_assert(thrid >= 1024);
2314 return thrid;
2318 return 0; /* all points are LEQ => return an invalid ThrID */
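/* Worked example (illustrative): with
      a = [ (1024,3), (1030,7) ]
      b = [ (1024,5), (1027,2) ]
   the scan sees tyma 3 <= tymb 5 at ThrID 1024, but at ThrID 1030 it
   finds tyma 7 > tymb 0 (1030 is absent from b), so it returns 1030 --
   meaning "not LEQ, and 1030 is a witness".  Zero is returned only
   when every per-thread timestamp in 'a' is <= the corresponding one
   in 'b'. */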
2322 /* Compute an arbitrary structural (total) ordering on the two args,
2323 based on their VCs, so they can be looked up in a table, tree, etc.
2324 Returns -1, 0 or 1. (really just 'deriving Ord' :-) This can be
2325 performance critical so there is some effort expended to make it as
2326 fast as possible.
2328 Word VTS__cmp_structural ( VTS* a, VTS* b )
2330 /* We just need to generate an arbitrary total ordering based on
2331 a->ts and b->ts. Preferably do it in a way which exposes likely
2332 differences relatively quickly. */
2333 Word i;
2334 Word useda = 0, usedb = 0;
2335 ScalarTS *ctsa = NULL, *ctsb = NULL;
2337 stats__vts__cmp_structural++;
2339 tl_assert(a);
2340 tl_assert(b);
2342 ctsa = &a->ts[0]; useda = a->usedTS;
2343 ctsb = &b->ts[0]; usedb = b->usedTS;
2345 if (LIKELY(useda == usedb)) {
2346 ScalarTS *tmpa = NULL, *tmpb = NULL;
2347 stats__vts__cmp_structural_slow++;
2348 /* Same length vectors. Find the first difference, if any, as
2349 fast as possible. */
2350 for (i = 0; i < useda; i++) {
2351 tmpa = &ctsa[i];
2352 tmpb = &ctsb[i];
2353 if (LIKELY(tmpa->tym == tmpb->tym
2354 && tmpa->thrid == tmpb->thrid))
2355 continue;
2356 else
2357 break;
2359 if (UNLIKELY(i == useda)) {
2360 /* They're identical. */
2361 return 0;
2362 } else {
2363 tl_assert(i >= 0 && i < useda);
2364 if (tmpa->tym < tmpb->tym) return -1;
2365 if (tmpa->tym > tmpb->tym) return 1;
2366 if (tmpa->thrid < tmpb->thrid) return -1;
2367 if (tmpa->thrid > tmpb->thrid) return 1;
2368 /* we just established them as non-identical, hence: */
2370 /*NOTREACHED*/
2371 tl_assert(0);
2374 if (useda < usedb) return -1;
2375 if (useda > usedb) return 1;
2376 /*NOTREACHED*/
2377 tl_assert(0);
2381 /* Debugging only. Display the given VTS in the buffer.
2383 void VTS__show ( HChar* buf, Int nBuf, VTS* vts )
2385 ScalarTS* st;
2386 HChar unit[64];
2387 Word i, n;
2388 Int avail = nBuf;
2389 tl_assert(vts && vts->ts);
2390 tl_assert(nBuf > 16);
2391 buf[0] = '[';
2392 buf[1] = 0;
2393 n = vts->usedTS;
2394 for (i = 0; i < n; i++) {
2395 tl_assert(avail >= 40);
2396 st = &vts->ts[i];
2397 VG_(memset)(unit, 0, sizeof(unit));
2398 VG_(sprintf)(unit, i < n-1 ? "%u:%llu " : "%u:%llu",
2399 st->thrid, (ULong)st->tym);
2400 if (avail < VG_(strlen)(unit) + 40/*let's say*/) {
2401 VG_(strcat)(buf, " ...]");
2402 buf[nBuf-1] = 0;
2403 return;
2405 VG_(strcat)(buf, unit);
2406 avail -= VG_(strlen)(unit);
2408 VG_(strcat)(buf, "]");
2409 buf[nBuf-1] = 0;
2413 /* Debugging only. Return vts[index], so to speak.
2415 ULong VTS__indexAt_SLOW ( VTS* vts, Thr* idx )
2417 UWord i, n;
2418 ThrID idx_thrid = Thr__to_ThrID(idx);
2419 stats__vts__indexat_slow++;
2420 tl_assert(vts && vts->ts);
2421 n = vts->usedTS;
2422 for (i = 0; i < n; i++) {
2423 ScalarTS* st = &vts->ts[i];
2424 if (st->thrid == idx_thrid)
2425 return st->tym;
2427 return 0;
2431 /* See comment on prototype above.
2433 static void VTS__declare_thread_very_dead ( Thr* thr )
2435 if (0) VG_(printf)("VTQ: tae %p\n", thr);
2437 tl_assert(thr->llexit_done);
2438 tl_assert(thr->joinedwith_done);
2440 ThrID nyu;
2441 nyu = Thr__to_ThrID(thr);
2442 VG_(addToXA)( verydead_thread_table, &nyu );
2444 /* We can only get here if we're assured that we'll never again
2445 need to look at this thread's ::viR or ::viW. Set them to
2446 VtsID_INVALID, partly so as to avoid holding on to the VTSs, but
2447 mostly so that we don't wind up pruning them (as that would be
2448 nonsensical: the only interesting ScalarTS entry for a dead
2449 thread is its own index, and the pruning will remove that). */
2450 VtsID__rcdec(thr->viR);
2451 VtsID__rcdec(thr->viW);
2452 thr->viR = VtsID_INVALID;
2453 thr->viW = VtsID_INVALID;
2457 /////////////////////////////////////////////////////////////////
2458 /////////////////////////////////////////////////////////////////
2459 // //
2460 // SECTION END vts primitives //
2461 // //
2462 /////////////////////////////////////////////////////////////////
2463 /////////////////////////////////////////////////////////////////
2467 /////////////////////////////////////////////////////////////////
2468 /////////////////////////////////////////////////////////////////
2469 // //
2470 // SECTION BEGIN main library //
2471 // //
2472 /////////////////////////////////////////////////////////////////
2473 /////////////////////////////////////////////////////////////////
2476 /////////////////////////////////////////////////////////
2477 // //
2478 // VTS set //
2479 // //
2480 /////////////////////////////////////////////////////////
2482 static WordFM* /* WordFM VTS* void */ vts_set = NULL;
2484 static void vts_set_init ( void )
2486 tl_assert(!vts_set);
2487 vts_set = VG_(newFM)( HG_(zalloc), "libhb.vts_set_init.1",
2488 HG_(free),
2489 (Word(*)(UWord,UWord))VTS__cmp_structural );
2492 /* Given a VTS, look in vts_set to see if we already have a
2493 structurally identical one. If yes, return the pair (True, pointer
2494 to the existing one). If no, clone this one, add the clone to the
2495 set, and return (False, pointer to the clone). */
2496 static Bool vts_set__find__or__clone_and_add ( /*OUT*/VTS** res, VTS* cand )
2498 UWord keyW, valW;
2499 stats__vts_set__focaa++;
2500 tl_assert(cand->id == VtsID_INVALID);
2501 /* lookup cand (by value) */
2502 if (VG_(lookupFM)( vts_set, &keyW, &valW, (UWord)cand )) {
2503 /* found it */
2504 tl_assert(valW == 0);
2505 /* if this fails, cand (by ref) was already present (!) */
2506 tl_assert(keyW != (UWord)cand);
2507 *res = (VTS*)keyW;
2508 return True;
2509 } else {
2510 /* not present. Clone, add and return address of clone. */
2511 stats__vts_set__focaa_a++;
2512 VTS* clone = VTS__clone( "libhb.vts_set_focaa.1", cand );
2513 tl_assert(clone != cand);
2514 VG_(addToFM)( vts_set, (UWord)clone, 0/*val is unused*/ );
2515 *res = clone;
2516 return False;
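/* The function above is a simple interning (hash-consing) scheme:
   because vts_set compares VTSs by value (VTS__cmp_structural), at
   most one copy of any given vector clock is ever stored.  A minimal
   usage sketch (illustrative only; real callers go through
   vts_tab__find__or__clone_and_add below):

      VTS* canonical = NULL;
      Bool already = vts_set__find__or__clone_and_add(&canonical, cand);
      // if 'already' is False, 'canonical' is a fresh clone now owned
      // by vts_set; 'cand' itself is untouched and may be a reusable
      // temporary (its .id must be VtsID_INVALID on entry).
*/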
2521 /////////////////////////////////////////////////////////
2522 // //
2523 // VTS table //
2524 // //
2525 /////////////////////////////////////////////////////////
2527 static void VtsID__invalidate_caches ( void ); /* fwds */
2529 /* A type to hold VTS table entries. Invariants:
2530 If .vts == NULL, then this entry is not in use, so:
2531 - .rc == 0
2532 - this entry is on the freelist (unfortunately, does not imply
2533 any constraints on value for .freelink)
2534 If .vts != NULL, then this entry is in use:
2535 - .vts is findable in vts_set
2536 - .vts->id == this entry number
2537 - no specific value for .rc (even 0 is OK)
2538 - this entry is not on freelist, so .freelink == VtsID_INVALID
2540 typedef
2541 struct {
2542 VTS* vts; /* vts, in vts_set */
2543 UWord rc; /* reference count - enough for entire aspace */
2544 VtsID freelink; /* chain for free entries, VtsID_INVALID at end */
2545 VtsID remap; /* used only during pruning */
2547 VtsTE;
2549 /* The VTS table. */
2550 static XArray* /* of VtsTE */ vts_tab = NULL;
2552 /* An index into the VTS table, indicating the start of the list of
2553 free (available for use) entries. If the list is empty, this is
2554 VtsID_INVALID. */
2555 static VtsID vts_tab_freelist = VtsID_INVALID;
2557 /* Do a GC of vts_tab when the freelist becomes empty AND the size of
2558 vts_tab equals or exceeds this size. After GC, the value here is
2559 set appropriately so as to check for the next GC point. */
2560 static Word vts_next_GC_at = 1000;
2562 static void vts_tab_init ( void )
2564 vts_tab = VG_(newXA)( HG_(zalloc), "libhb.vts_tab_init.1",
2565 HG_(free), sizeof(VtsTE) );
2566 vts_tab_freelist = VtsID_INVALID;
2569 /* Add ii to the free list, checking that it looks out-of-use. */
2570 static void add_to_free_list ( VtsID ii )
2572 VtsTE* ie = VG_(indexXA)( vts_tab, ii );
2573 tl_assert(ie->vts == NULL);
2574 tl_assert(ie->rc == 0);
2575 tl_assert(ie->freelink == VtsID_INVALID);
2576 ie->freelink = vts_tab_freelist;
2577 vts_tab_freelist = ii;
2580 /* Get an entry from the free list. This will return VtsID_INVALID if
2581 the free list is empty. */
2582 static VtsID get_from_free_list ( void )
2584 VtsID ii;
2585 VtsTE* ie;
2586 if (vts_tab_freelist == VtsID_INVALID)
2587 return VtsID_INVALID;
2588 ii = vts_tab_freelist;
2589 ie = VG_(indexXA)( vts_tab, ii );
2590 tl_assert(ie->vts == NULL);
2591 tl_assert(ie->rc == 0);
2592 vts_tab_freelist = ie->freelink;
2593 return ii;
2596 /* Produce a new VtsID that can be used, either by getting it from
2597 the freelist, or, if that is empty, by expanding vts_tab. */
2598 static VtsID get_new_VtsID ( void )
2600 VtsID ii;
2601 VtsTE te;
2602 ii = get_from_free_list();
2603 if (ii != VtsID_INVALID)
2604 return ii;
2605 te.vts = NULL;
2606 te.rc = 0;
2607 te.freelink = VtsID_INVALID;
2608 te.remap = VtsID_INVALID;
2609 ii = (VtsID)VG_(addToXA)( vts_tab, &te );
2610 return ii;
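/* Worked example of the free list (illustrative): the list threads
   unused vts_tab entries together through their .freelink fields,
   using table indices rather than pointers.  If entries 5 and then 9
   are freed, we end up with vts_tab_freelist == 9, entry 9's .freelink
   == 5 and entry 5's .freelink == VtsID_INVALID; get_new_VtsID() then
   hands back 9, then 5, and only after that grows vts_tab again. */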
2614 /* Indirect callback from lib_zsm. */
2615 static void VtsID__rcinc ( VtsID ii )
2617 VtsTE* ie;
2618 /* VG_(indexXA) does a range check for us */
2619 ie = VG_(indexXA)( vts_tab, ii );
2620 tl_assert(ie->vts); /* else it's not in use */
2621 tl_assert(ie->rc < ~0UL); /* else we can't continue */
2622 tl_assert(ie->vts->id == ii);
2623 ie->rc++;
2626 /* Indirect callback from lib_zsm. */
2627 static void VtsID__rcdec ( VtsID ii )
2629 VtsTE* ie;
2630 /* VG_(indexXA) does a range check for us */
2631 ie = VG_(indexXA)( vts_tab, ii );
2632 tl_assert(ie->vts); /* else it's not in use */
2633 tl_assert(ie->rc > 0); /* else RC snafu */
2634 tl_assert(ie->vts->id == ii);
2635 ie->rc--;
2639 /* Look up 'cand' in our collection of VTSs. If present, return the
2640 VtsID for the pre-existing version. If not present, clone it, add
2641 the clone to both vts_tab and vts_set, allocate a fresh VtsID for
2642 it, and return that. */
2643 static VtsID vts_tab__find__or__clone_and_add ( VTS* cand )
2645 VTS* in_tab = NULL;
2646 tl_assert(cand->id == VtsID_INVALID);
2647 Bool already_have = vts_set__find__or__clone_and_add( &in_tab, cand );
2648 tl_assert(in_tab);
2649 if (already_have) {
2650 /* We already have a copy of 'cand'. Use that. */
2651 VtsTE* ie;
2652 tl_assert(in_tab->id != VtsID_INVALID);
2653 ie = VG_(indexXA)( vts_tab, in_tab->id );
2654 tl_assert(ie->vts == in_tab);
2655 return in_tab->id;
2656 } else {
2657 VtsID ii = get_new_VtsID();
2658 VtsTE* ie = VG_(indexXA)( vts_tab, ii );
2659 ie->vts = in_tab;
2660 ie->rc = 0;
2661 ie->freelink = VtsID_INVALID;
2662 in_tab->id = ii;
2663 return ii;
2668 static void show_vts_stats ( const HChar* caller )
2670 UWord nSet, nTab, nLive;
2671 ULong totrc;
2672 UWord n, i;
2673 nSet = VG_(sizeFM)( vts_set );
2674 nTab = VG_(sizeXA)( vts_tab );
2675 totrc = 0;
2676 nLive = 0;
2677 n = VG_(sizeXA)( vts_tab );
2678 for (i = 0; i < n; i++) {
2679 VtsTE* ie = VG_(indexXA)( vts_tab, i );
2680 if (ie->vts) {
2681 nLive++;
2682 totrc += (ULong)ie->rc;
2683 } else {
2684 tl_assert(ie->rc == 0);
2687 VG_(printf)(" show_vts_stats %s\n", caller);
2688 VG_(printf)(" vts_tab size %4lu\n", nTab);
2689 VG_(printf)(" vts_tab live %4lu\n", nLive);
2690 VG_(printf)(" vts_set size %4lu\n", nSet);
2691 VG_(printf)(" total rc %4llu\n", totrc);
2695 /* --- Helpers for VtsID pruning --- */
2697 static
2698 void remap_VtsID ( /*MOD*/XArray* /* of VtsTE */ old_tab,
2699 /*MOD*/XArray* /* of VtsTE */ new_tab,
2700 VtsID* ii )
2702 VtsTE *old_te, *new_te;
2703 VtsID old_id, new_id;
2704 /* We're relying here on VG_(indexXA)'s range checking to assert on
2705 any stupid values, in particular *ii == VtsID_INVALID. */
2706 old_id = *ii;
2707 old_te = VG_(indexXA)( old_tab, old_id );
2708 old_te->rc--;
2709 new_id = old_te->remap;
2710 new_te = VG_(indexXA)( new_tab, new_id );
2711 new_te->rc++;
2712 *ii = new_id;
2715 static
2716 void remap_VtsIDs_in_SVal ( /*MOD*/XArray* /* of VtsTE */ old_tab,
2717 /*MOD*/XArray* /* of VtsTE */ new_tab,
2718 SVal* s )
2720 SVal old_sv, new_sv;
2721 old_sv = *s;
2722 if (SVal__isC(old_sv)) {
2723 VtsID rMin, wMin;
2724 rMin = SVal__unC_Rmin(old_sv);
2725 wMin = SVal__unC_Wmin(old_sv);
2726 remap_VtsID( old_tab, new_tab, &rMin );
2727 remap_VtsID( old_tab, new_tab, &wMin );
2728 new_sv = SVal__mkC( rMin, wMin );
2729 *s = new_sv;
2734 /* NOT TO BE CALLED FROM WITHIN libzsm. */
2735 __attribute__((noinline))
2736 static void vts_tab__do_GC ( Bool show_stats )
2738 UWord i, nTab, nLive, nFreed;
2740 /* ---------- BEGIN VTS GC ---------- */
2741 /* check this is actually necessary. */
2742 tl_assert(vts_tab_freelist == VtsID_INVALID);
2744 /* empty the caches for partial order checks and binary joins. We
2745 could do better and prune out the entries to be deleted, but it
2746 ain't worth the hassle. */
2747 VtsID__invalidate_caches();
2749 /* First, make the reference counts up to date. */
2750 zsm_flush_cache();
2752 nTab = VG_(sizeXA)( vts_tab );
2754 if (show_stats) {
2755 VG_(printf)("<<GC begins at vts_tab size %lu>>\n", nTab);
2756 show_vts_stats("before GC");
2759 /* Now we can inspect the entire vts_tab. Any entries with zero
2760 .rc fields are now no longer in use and can be put back on the
2761 free list, removed from vts_set, and deleted. */
2762 nFreed = 0;
2763 for (i = 0; i < nTab; i++) {
2764 Bool present;
2765 UWord oldK = 0, oldV = 12345;
2766 VtsTE* te = VG_(indexXA)( vts_tab, i );
2767 if (te->vts == NULL) {
2768 tl_assert(te->rc == 0);
2769 continue; /* already on the free list (presumably) */
2771 if (te->rc > 0)
2772 continue; /* in use */
2773 /* Ok, we got one we can free. */
2774 tl_assert(te->vts->id == i);
2775 /* first, remove it from vts_set. */
2776 present = VG_(delFromFM)( vts_set,
2777 &oldK, &oldV, (UWord)te->vts );
2778 tl_assert(present); /* else it isn't in vts_set ?! */
2779 tl_assert(oldV == 0); /* no info stored in vts_set val fields */
2780 tl_assert(oldK == (UWord)te->vts); /* else what did delFromFM find?! */
2781 /* now free the VTS itself */
2782 VTS__delete(te->vts);
2783 te->vts = NULL;
2784 /* and finally put this entry on the free list */
2785 tl_assert(te->freelink == VtsID_INVALID); /* can't already be on it */
2786 add_to_free_list( i );
2787 nFreed++;
2790 /* Now figure out when the next GC should be. We'll allow the
2791 number of VTSs to double before GCing again. Except of course
2792 that since we can't (or, at least, don't) shrink vts_tab, we
2793 can't set the threshold value smaller than it.
2794 tl_assert(nFreed <= nTab);
2795 nLive = nTab - nFreed;
2796 tl_assert(nLive >= 0 && nLive <= nTab);
2797 vts_next_GC_at = 2 * nLive;
2798 if (vts_next_GC_at < nTab)
2799 vts_next_GC_at = nTab;
2801 if (show_stats) {
2802 show_vts_stats("after GC");
2803 VG_(printf)("<<GC ends, next gc at %ld>>\n", vts_next_GC_at);
2806 if (VG_(clo_stats)) {
2807 static UInt ctr = 1;
2808 tl_assert(nTab > 0);
2809 VG_(message)(Vg_DebugMsg,
2810 "libhb: VTS GC: #%u old size %lu live %lu (%2llu%%)\n",
2811 ctr++, nTab, nLive, (100ULL * (ULong)nLive) / (ULong)nTab);
2813 /* ---------- END VTS GC ---------- */
2815 /* Decide whether to do VTS pruning. We have one of three
2816 settings. */
2817 static UInt pruning_auto_ctr = 0; /* do not make non-static */
2819 Bool do_pruning = False;
2820 switch (HG_(clo_vts_pruning)) {
2821 case 0: /* never */
2822 break;
2823 case 1: /* auto */
2824 do_pruning = (++pruning_auto_ctr % 5) == 0;
2825 break;
2826 case 2: /* always */
2827 do_pruning = True;
2828 break;
2829 default:
2830 tl_assert(0);
2833 /* The rest of this routine only handles pruning, so we can
2834 quit at this point if it is not to be done. */
2835 if (!do_pruning)
2836 return;
2838 /* ---------- BEGIN VTS PRUNING ---------- */
2839 /* We begin by sorting the backing table on its .thr values, so as
2840 to (1) check they are unique [else something has gone wrong,
2841 since it means we must have seen some Thr* exiting more than
2842 once, which can't happen], and (2) so that we can quickly look
2843 up the dead-thread entries as we work through the VTSs. */
2844 VG_(sortXA)( verydead_thread_table );
2845 /* Sanity check: the ThrID values must be unique and in increasing order. */
2846 UWord nBT = VG_(sizeXA)( verydead_thread_table );
2847 if (nBT > 0) {
2848 ThrID thrid1, thrid2;
2849 thrid2 = *(ThrID*)VG_(indexXA)( verydead_thread_table, 0 );
2850 for (i = 1; i < nBT; i++) {
2851 thrid1 = thrid2;
2852 thrid2 = *(ThrID*)VG_(indexXA)( verydead_thread_table, i );
2853 tl_assert(thrid1 < thrid2);
2856 /* Ok, so the dead thread table has unique and in-order keys. */
2858 /* We will run through the old table, and create a new table and
2859 set, at the same time setting the .remap entries in the old
2860 table to point to the new entries. Then, visit every VtsID in
2861 the system, and replace all of them with new ones, using the
2862 .remap entries in the old table. Finally, we can delete the old
2863 table and set. */
2865 XArray* /* of VtsTE */ new_tab
2866 = VG_(newXA)( HG_(zalloc), "libhb.vts_tab__do_GC.new_tab",
2867 HG_(free), sizeof(VtsTE) );
2869 /* WordFM VTS* void */
2870 WordFM* new_set
2871 = VG_(newFM)( HG_(zalloc), "libhb.vts_tab__do_GC.new_set",
2872 HG_(free),
2873 (Word(*)(UWord,UWord))VTS__cmp_structural );
2875 /* Visit each old VTS. For each one:
2877 * make a pruned version
2879 * search new_set for the pruned version, yielding either
2880 Nothing (not present) or the new VtsID for it.
2882 * if not present, allocate a new VtsID for it, insert (pruned
2883 VTS, new VtsID) in the tree, and set
2884 remap_table[old VtsID] = new VtsID.
2886 * if present, set remap_table[old VtsID] = new VtsID, where
2887 new VtsID was determined by the tree lookup. Then free up
2888 the clone.
2891 UWord nBeforePruning = 0, nAfterPruning = 0;
2892 UWord nSTSsBefore = 0, nSTSsAfter = 0;
2893 VtsID new_VtsID_ctr = 0;
2895 for (i = 0; i < nTab; i++) {
2897 /* For each old VTS .. */
2898 VtsTE* old_te = VG_(indexXA)( vts_tab, i );
2899 VTS* old_vts = old_te->vts;
2900 tl_assert(old_te->remap == VtsID_INVALID);
2902 /* Skip it if not in use */
2903 if (old_te->rc == 0) {
2904 tl_assert(old_vts == NULL);
2905 continue;
2907 tl_assert(old_vts != NULL);
2908 tl_assert(old_vts->id == i);
2909 tl_assert(old_vts->ts != NULL);
2911 /* It is in use. Make a pruned version. */
2912 nBeforePruning++;
2913 nSTSsBefore += old_vts->usedTS;
2914 VTS* new_vts = VTS__subtract("libhb.vts_tab__do_GC.new_vts",
2915 old_vts, verydead_thread_table);
2916 tl_assert(new_vts->sizeTS == new_vts->usedTS);
2917 tl_assert(*(ULong*)(&new_vts->ts[new_vts->usedTS])
2918 == 0x0ddC0ffeeBadF00dULL);
2920 /* Get rid of the old VTS and the tree entry. It's a bit more
2921 complex to incrementally delete the VTSs now than to nuke
2922 them all after we're done, but the upside is that we don't
2923 wind up temporarily storing potentially two complete copies
2924 of each VTS and hence spiking memory use. */
2925 UWord oldK = 0, oldV = 12345;
2926 Bool present = VG_(delFromFM)( vts_set,
2927 &oldK, &oldV, (UWord)old_vts );
2928 tl_assert(present); /* else it isn't in vts_set ?! */
2929 tl_assert(oldV == 0); /* no info stored in vts_set val fields */
2930 tl_assert(oldK == (UWord)old_vts); /* else what did delFromFM find?! */
2931 /* now free the VTS itself */
2932 VTS__delete(old_vts);
2933 old_te->vts = NULL;
2934 old_vts = NULL;
2936 /* NO MENTIONS of old_vts allowed beyond this point. */
2938 /* Ok, we have the pruned copy in new_vts. See if a
2939 structurally identical version is already present in new_set.
2940 If so, delete the one we just made and move on; if not, add
2941 it. */
2942 VTS* identical_version = NULL;
2943 UWord valW = 12345;
2944 if (VG_(lookupFM)(new_set, (UWord*)&identical_version, &valW,
2945 (UWord)new_vts)) {
2946 // already have it
2947 tl_assert(valW == 0);
2948 tl_assert(identical_version != NULL);
2949 tl_assert(identical_version != new_vts);
2950 VTS__delete(new_vts);
2951 new_vts = identical_version;
2952 tl_assert(new_vts->id != VtsID_INVALID);
2953 } else {
2954 tl_assert(valW == 12345);
2955 tl_assert(identical_version == NULL);
2956 new_vts->id = new_VtsID_ctr++;
2957 Bool b = VG_(addToFM)(new_set, (UWord)new_vts, 0);
2958 tl_assert(!b);
2959 VtsTE new_te;
2960 new_te.vts = new_vts;
2961 new_te.rc = 0;
2962 new_te.freelink = VtsID_INVALID;
2963 new_te.remap = VtsID_INVALID;
2964 Word j = VG_(addToXA)( new_tab, &new_te );
2965 tl_assert(j <= i);
2966 tl_assert(j == new_VtsID_ctr - 1);
2967 // stats
2968 nAfterPruning++;
2969 nSTSsAfter += new_vts->usedTS;
2971 old_te->remap = new_vts->id;
2973 } /* for (i = 0; i < nTab; i++) */
2975 /* At this point, we have:
2976 * the old VTS table, with its .remap entries set,
2977 and with all .vts == NULL.
2978 * the old VTS tree should be empty, since it and the old VTSs
2979 it contained have been incrementally deleted as we worked
2980 through the old table.
2981 * the new VTS table, with all .rc == 0, all .freelink and .remap
2982 == VtsID_INVALID.
2983 * the new VTS tree.
2985 tl_assert( VG_(sizeFM)(vts_set) == 0 );
2987 /* Now actually apply the mapping. */
2988 /* Visit all the VtsIDs in the entire system. Where do we expect
2989 to find them?
2990 (a) in shadow memory -- the LineZs and LineFs
2991 (b) in our collection of struct _Thrs.
2992 (c) in our collection of struct _SOs.
2993 Nowhere else, AFAICS. Not in the zsm cache, because that just
2994 got invalidated.
2996 Using the .remap fields in vts_tab, map each old VtsID to a new
2997 VtsID. For each old VtsID, dec its rc; and for each new one,
2998 inc it. This sets up the new refcounts, and it also gives a
2999 cheap sanity check of the old ones: all old refcounts should be
3000 zero after this operation.
3003 /* Do the mappings for (a) above: iterate over the Primary shadow
3004 mem map (WordFM Addr SecMap*). */
3005 UWord secmapW = 0;
3006 VG_(initIterFM)( map_shmem );
3007 while (VG_(nextIterFM)( map_shmem, NULL, &secmapW )) {
3008 UWord j;
3009 SecMap* sm = (SecMap*)secmapW;
3010 tl_assert(sm->magic == SecMap_MAGIC);
3011 /* Deal with the LineZs */
3012 for (i = 0; i < N_SECMAP_ZLINES; i++) {
3013 LineZ* lineZ = &sm->linesZ[i];
3014 if (lineZ->dict[0] == SVal_INVALID)
3015 continue; /* not in use -- data is in F rep instead */
3016 for (j = 0; j < 4; j++)
3017 remap_VtsIDs_in_SVal(vts_tab, new_tab, &lineZ->dict[j]);
3019 /* Deal with the LineFs */
3020 for (i = 0; i < sm->linesF_size; i++) {
3021 LineF* lineF = &sm->linesF[i];
3022 if (!lineF->inUse)
3023 continue;
3024 for (j = 0; j < N_LINE_ARANGE; j++)
3025 remap_VtsIDs_in_SVal(vts_tab, new_tab, &lineF->w64s[j]);
3028 VG_(doneIterFM)( map_shmem );
3030 /* Do the mappings for (b) above: visit our collection of struct
3031 _Thrs. */
3032 Thread* hgthread = get_admin_threads();
3033 tl_assert(hgthread);
3034 while (hgthread) {
3035 Thr* hbthr = hgthread->hbthr;
3036 tl_assert(hbthr);
3037 /* Threads that are listed in the prunable set have their viR
3038 and viW set to VtsID_INVALID, so we can't mess with them. */
3039 if (hbthr->llexit_done && hbthr->joinedwith_done) {
3040 tl_assert(hbthr->viR == VtsID_INVALID);
3041 tl_assert(hbthr->viW == VtsID_INVALID);
3042 hgthread = hgthread->admin;
3043 continue;
3045 remap_VtsID( vts_tab, new_tab, &hbthr->viR );
3046 remap_VtsID( vts_tab, new_tab, &hbthr->viW );
3047 hgthread = hgthread->admin;
3050 /* Do the mappings for (c) above: visit the struct _SOs. */
3051 SO* so = admin_SO;
3052 while (so) {
3053 if (so->viR != VtsID_INVALID)
3054 remap_VtsID( vts_tab, new_tab, &so->viR );
3055 if (so->viW != VtsID_INVALID)
3056 remap_VtsID( vts_tab, new_tab, &so->viW );
3057 so = so->admin_next;
3060 /* So, we're nearly done (with this incredibly complex operation).
3061 Check the refcounts for the old VtsIDs all fell to zero, as
3062 expected. Any failure is serious. */
3063 for (i = 0; i < nTab; i++) {
3064 VtsTE* te = VG_(indexXA)( vts_tab, i );
3065 tl_assert(te->vts == NULL);
3066 /* This is the assert proper. Note we're also asserting
3067 zeroness for old entries which are unmapped (hence have
3068 .remap == VtsID_INVALID). That's OK. */
3069 tl_assert(te->rc == 0);
3072 /* Install the new table and set. */
3073 VG_(deleteFM)(vts_set, NULL/*kFin*/, NULL/*vFin*/);
3074 vts_set = new_set;
3075 VG_(deleteXA)( vts_tab );
3076 vts_tab = new_tab;
3078 /* The freelist of vts_tab entries is empty now, because we've
3079 compacted all of the live entries at the low end of the
3080 table. */
3081 vts_tab_freelist = VtsID_INVALID;
3083 /* Sanity check vts_set and vts_tab. */
3085 /* Because all the live entries got slid down to the bottom of vts_tab: */
3086 tl_assert( VG_(sizeXA)( vts_tab ) == VG_(sizeFM)( vts_set ));
3088 /* Assert that the vts_tab and vts_set entries point at each other
3089 in the required way */
3090 UWord wordK = 0, wordV = 0;
3091 VG_(initIterFM)( vts_set );
3092 while (VG_(nextIterFM)( vts_set, &wordK, &wordV )) {
3093 tl_assert(wordK != 0);
3094 tl_assert(wordV == 0);
3095 VTS* vts = (VTS*)wordK;
3096 tl_assert(vts->id != VtsID_INVALID);
3097 VtsTE* te = VG_(indexXA)( vts_tab, vts->id );
3098 tl_assert(te->vts == vts);
3100 VG_(doneIterFM)( vts_set );
3102 /* Also iterate over the table, and check each entry is
3103 plausible. */
3104 nTab = VG_(sizeXA)( vts_tab );
3105 for (i = 0; i < nTab; i++) {
3106 VtsTE* te = VG_(indexXA)( vts_tab, i );
3107 tl_assert(te->vts);
3108 tl_assert(te->vts->id == i);
3109 tl_assert(te->rc > 0); /* 'cos we just GC'd */
3110 tl_assert(te->freelink == VtsID_INVALID); /* in use */
3111 tl_assert(te->remap == VtsID_INVALID); /* not relevant */
3114 /* And we're done. Bwahahaha. Ha. Ha. Ha. */
3115 if (VG_(clo_stats)) {
3116 static UInt ctr = 1;
3117 tl_assert(nTab > 0);
3118 VG_(message)(
3119 Vg_DebugMsg,
3120 "libhb: VTS PR: #%u before %lu (avg sz %lu) "
3121 "after %lu (avg sz %lu)\n",
3122 ctr++,
3123 nBeforePruning, nSTSsBefore / (nBeforePruning ? nBeforePruning : 1),
3124 nAfterPruning, nSTSsAfter / (nAfterPruning ? nAfterPruning : 1)
3127 if (0)
3128 VG_(printf)("VTQ: before pruning %lu (avg sz %lu), "
3129 "after pruning %lu (avg sz %lu)\n",
3130 nBeforePruning, nSTSsBefore / nBeforePruning,
3131 nAfterPruning, nSTSsAfter / nAfterPruning);
3132 /* ---------- END VTS PRUNING ---------- */
3136 /////////////////////////////////////////////////////////
3137 // //
3138 // Vts IDs //
3139 // //
3140 /////////////////////////////////////////////////////////
3142 //////////////////////////
3143 /* A temporary, max-sized VTS which is used as a temporary (the first
3144 argument) in VTS__singleton, VTS__tick and VTS__join operations. */
3145 static VTS* temp_max_sized_VTS = NULL;
3147 //////////////////////////
3148 static ULong stats__cmpLEQ_queries = 0;
3149 static ULong stats__cmpLEQ_misses = 0;
3150 static ULong stats__join2_queries = 0;
3151 static ULong stats__join2_misses = 0;
3153 static inline UInt ROL32 ( UInt w, Int n ) {
3154 w = (w << n) | (w >> (32-n));
3155 return w;
3157 static inline UInt hash_VtsIDs ( VtsID vi1, VtsID vi2, UInt nTab ) {
3158 UInt hash = ROL32(vi1,19) ^ ROL32(vi2,13);
3159 return hash % nTab;
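/* hash_VtsIDs mixes the two VtsIDs with different rotations, so that
   (vi1,vi2) and (vi2,vi1) normally land in different slots, and then
   reduces modulo the table size.  The two caches below are
   direct-mapped: a miss simply overwrites whatever occupied the slot,
   and a hit requires both vi1 and vi2 to match exactly.  For example,
   with N_CMPLEQ_CACHE == 1023 the pair (vi1 7, vi2 3) maps to slot
   (ROL32(7,19) ^ ROL32(3,13)) % 1023; the particular slot is
   irrelevant, what matters is that equal pairs always hash alike. */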
3162 #define N_CMPLEQ_CACHE 1023
3163 static
3164 struct { VtsID vi1; VtsID vi2; Bool leq; }
3165 cmpLEQ_cache[N_CMPLEQ_CACHE];
3167 #define N_JOIN2_CACHE 1023
3168 static
3169 struct { VtsID vi1; VtsID vi2; VtsID res; }
3170 join2_cache[N_JOIN2_CACHE];
3172 static void VtsID__invalidate_caches ( void ) {
3173 Int i;
3174 for (i = 0; i < N_CMPLEQ_CACHE; i++) {
3175 cmpLEQ_cache[i].vi1 = VtsID_INVALID;
3176 cmpLEQ_cache[i].vi2 = VtsID_INVALID;
3177 cmpLEQ_cache[i].leq = False;
3179 for (i = 0; i < N_JOIN2_CACHE; i++) {
3180 join2_cache[i].vi1 = VtsID_INVALID;
3181 join2_cache[i].vi2 = VtsID_INVALID;
3182 join2_cache[i].res = VtsID_INVALID;
3185 //////////////////////////
3187 //static Bool VtsID__is_valid ( VtsID vi ) {
3188 // VtsTE* ve;
3189 // if (vi >= (VtsID)VG_(sizeXA)( vts_tab ))
3190 // return False;
3191 // ve = VG_(indexXA)( vts_tab, vi );
3192 // if (!ve->vts)
3193 // return False;
3194 // tl_assert(ve->vts->id == vi);
3195 // return True;
3198 static VTS* VtsID__to_VTS ( VtsID vi ) {
3199 VtsTE* te = VG_(indexXA)( vts_tab, vi );
3200 tl_assert(te->vts);
3201 return te->vts;
3204 static void VtsID__pp ( VtsID vi ) {
3205 HChar buf[100];
3206 VTS* vts = VtsID__to_VTS(vi);
3207 VTS__show( buf, sizeof(buf)-1, vts );
3208 buf[sizeof(buf)-1] = 0;
3209 VG_(printf)("%s", buf);
3212 /* compute partial ordering relation of vi1 and vi2. */
3213 __attribute__((noinline))
3214 static Bool VtsID__cmpLEQ_WRK ( VtsID vi1, VtsID vi2 ) {
3215 UInt hash;
3216 Bool leq;
3217 VTS *v1, *v2;
3218 //if (vi1 == vi2) return True;
3219 tl_assert(vi1 != vi2);
3220 ////++
3221 stats__cmpLEQ_queries++;
3222 hash = hash_VtsIDs(vi1, vi2, N_CMPLEQ_CACHE);
3223 if (cmpLEQ_cache[hash].vi1 == vi1
3224 && cmpLEQ_cache[hash].vi2 == vi2)
3225 return cmpLEQ_cache[hash].leq;
3226 stats__cmpLEQ_misses++;
3227 ////--
3228 v1 = VtsID__to_VTS(vi1);
3229 v2 = VtsID__to_VTS(vi2);
3230 leq = VTS__cmpLEQ( v1, v2 ) == 0;
3231 ////++
3232 cmpLEQ_cache[hash].vi1 = vi1;
3233 cmpLEQ_cache[hash].vi2 = vi2;
3234 cmpLEQ_cache[hash].leq = leq;
3235 ////--
3236 return leq;
3238 static inline Bool VtsID__cmpLEQ ( VtsID vi1, VtsID vi2 ) {
3239 return LIKELY(vi1 == vi2) ? True : VtsID__cmpLEQ_WRK(vi1, vi2);
3242 /* compute binary join */
3243 __attribute__((noinline))
3244 static VtsID VtsID__join2_WRK ( VtsID vi1, VtsID vi2 ) {
3245 UInt hash;
3246 VtsID res;
3247 VTS *vts1, *vts2;
3248 //if (vi1 == vi2) return vi1;
3249 tl_assert(vi1 != vi2);
3250 ////++
3251 stats__join2_queries++;
3252 hash = hash_VtsIDs(vi1, vi2, N_JOIN2_CACHE);
3253 if (join2_cache[hash].vi1 == vi1
3254 && join2_cache[hash].vi2 == vi2)
3255 return join2_cache[hash].res;
3256 stats__join2_misses++;
3257 ////--
3258 vts1 = VtsID__to_VTS(vi1);
3259 vts2 = VtsID__to_VTS(vi2);
3260 temp_max_sized_VTS->usedTS = 0;
3261 VTS__join(temp_max_sized_VTS, vts1,vts2);
3262 res = vts_tab__find__or__clone_and_add(temp_max_sized_VTS);
3263 ////++
3264 join2_cache[hash].vi1 = vi1;
3265 join2_cache[hash].vi2 = vi2;
3266 join2_cache[hash].res = res;
3267 ////--
3268 return res;
3270 static inline VtsID VtsID__join2 ( VtsID vi1, VtsID vi2 ) {
3271 return LIKELY(vi1 == vi2) ? vi1 : VtsID__join2_WRK(vi1, vi2);
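/* Minimal sketch of how these wrappers compose (illustrative only --
   the real call sites are in the state-machine code further below): a
   hypothetical "receive from a sync object, then advance my own clock"
   step might be written as

      thr->viW = VtsID__tick( VtsID__join2( thr->viW, so_viW ), thr );

   where 'so_viW' stands for the sync object's write clock.  Both
   wrappers short-circuit the vi1 == vi2 case, so the caches are only
   consulted for genuinely distinct VtsID pairs. */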
3274 /* create a singleton VTS, namely [thr:1] */
3275 static VtsID VtsID__mk_Singleton ( Thr* thr, ULong tym ) {
3276 temp_max_sized_VTS->usedTS = 0;
3277 VTS__singleton(temp_max_sized_VTS, thr,tym);
3278 return vts_tab__find__or__clone_and_add(temp_max_sized_VTS);
3281 /* tick operation, creates value 1 if specified index is absent */
3282 static VtsID VtsID__tick ( VtsID vi, Thr* idx ) {
3283 VTS* vts = VtsID__to_VTS(vi);
3284 temp_max_sized_VTS->usedTS = 0;
3285 VTS__tick(temp_max_sized_VTS, idx,vts);
3286 return vts_tab__find__or__clone_and_add(temp_max_sized_VTS);
3289 /* index into a VTS (only for assertions) */
3290 static ULong VtsID__indexAt ( VtsID vi, Thr* idx ) {
3291 VTS* vts = VtsID__to_VTS(vi);
3292 return VTS__indexAt_SLOW( vts, idx );
3295 /* Assuming that !cmpLEQ(vi1, vi2), find the index of the first (or
3296 any, really) element in vi1 which is pointwise greater-than the
3297 corresponding element in vi2. If no such element exists, return
3298 NULL. This needs to be fairly quick since it is called every time
3299 a race is detected. */
3300 static Thr* VtsID__findFirst_notLEQ ( VtsID vi1, VtsID vi2 )
3302 VTS *vts1, *vts2;
3303 Thr* diffthr;
3304 ThrID diffthrid;
3305 tl_assert(vi1 != vi2);
3306 vts1 = VtsID__to_VTS(vi1);
3307 vts2 = VtsID__to_VTS(vi2);
3308 tl_assert(vts1 != vts2);
3309 diffthrid = VTS__cmpLEQ(vts1, vts2);
3310 diffthr = Thr__from_ThrID(diffthrid);
3311 tl_assert(diffthr); /* else they are LEQ ! */
3312 return diffthr;
3316 /////////////////////////////////////////////////////////
3317 // //
3318 // Filters //
3319 // //
3320 /////////////////////////////////////////////////////////
3322 /* Forget everything we know -- clear the filter and let everything
3323 through. This needs to be as fast as possible, since it is called
3324 every time the running thread changes, and every time a thread's
3325 vector clocks change, which can be quite frequent. The obvious
3326 fast way to do this is simply to stuff in tags which we know are
3327 not going to match anything, since they're not aligned to the start
3328 of a line. */
3329 static void Filter__clear ( Filter* fi, const HChar* who )
3331 UWord i;
3332 if (0) VG_(printf)(" Filter__clear(%p, %s)\n", fi, who);
3333 for (i = 0; i < FI_NUM_LINES; i += 8) {
3334 fi->tags[i+0] = 1; /* impossible value -- cannot match */
3335 fi->tags[i+1] = 1;
3336 fi->tags[i+2] = 1;
3337 fi->tags[i+3] = 1;
3338 fi->tags[i+4] = 1;
3339 fi->tags[i+5] = 1;
3340 fi->tags[i+6] = 1;
3341 fi->tags[i+7] = 1;
3343 tl_assert(i == FI_NUM_LINES);
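/* Layout of the filter's accounting bits, as implied by the masks used
   in the handlers below: each FiLine is divided into 8-byte granules
   and each granule gets one 16-bit word in .u16s[].  Within that word,
   byte k of the granule (k = 0..7) owns two bits: bit 2*k+1 means "a
   read of this byte has already been let through" and bit 2*k means
   the same for a write.  The write handlers set and test both bits
   (0x3 << 2*k per byte, 0xFFFF per aligned granule), since once a
   write has been processed, later reads and writes of that byte are
   both skippable; the read handlers touch only the read bits
   (0x2 << 2*k per byte, 0xAAAA per aligned granule).  Filter__clear
   above stuffs the impossible tag value 1 into every line, so every
   subsequent lookup misses and nothing is filtered until lines are
   re-established by the handlers. */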
3346 /* Clearing an arbitrary range in the filter. Unfortunately
3347 we have to do this due to core-supplied new/die-mem events. */
3349 static void Filter__clear_1byte ( Filter* fi, Addr a )
3351 Addr atag = FI_GET_TAG(a); /* tag of 'a' */
3352 UWord lineno = FI_GET_LINENO(a); /* lineno for 'a' */
3353 FiLine* line = &fi->lines[lineno];
3354 UWord loff = (a - atag) / 8;
3355 UShort mask = 0x3 << (2 * (a & 7));
3356 /* mask is C000, 3000, 0C00, 0300, 00C0, 0030, 000C or 0003 */
3357 if (LIKELY( fi->tags[lineno] == atag )) {
3358 /* hit. clear the bits. */
3359 UShort u16 = line->u16s[loff];
3360 line->u16s[loff] = u16 & ~mask; /* clear them */
3361 } else {
3362 /* miss. The filter doesn't hold this address, so ignore. */
3366 static void Filter__clear_8bytes_aligned ( Filter* fi, Addr a )
3368 Addr atag = FI_GET_TAG(a); /* tag of 'a' */
3369 UWord lineno = FI_GET_LINENO(a); /* lineno for 'a' */
3370 FiLine* line = &fi->lines[lineno];
3371 UWord loff = (a - atag) / 8;
3372 if (LIKELY( fi->tags[lineno] == atag )) {
3373 line->u16s[loff] = 0;
3374 } else {
3375 /* miss. The filter doesn't hold this address, so ignore. */
3379 static void Filter__clear_range ( Filter* fi, Addr a, UWord len )
3381 //VG_(printf)("%lu ", len);
3382 /* slowly do part preceding 8-alignment */
3383 while (UNLIKELY(!VG_IS_8_ALIGNED(a)) && LIKELY(len > 0)) {
3384 Filter__clear_1byte( fi, a );
3385 a++;
3386 len--;
3388 /* vector loop */
3389 while (len >= 8) {
3390 Filter__clear_8bytes_aligned( fi, a );
3391 a += 8;
3392 len -= 8;
3394 /* slowly do tail */
3395 while (UNLIKELY(len > 0)) {
3396 Filter__clear_1byte( fi, a );
3397 a++;
3398 len--;
3403 /* ------ Read handlers for the filter. ------ */
3405 static inline Bool Filter__ok_to_skip_crd64 ( Filter* fi, Addr a )
3407 if (UNLIKELY( !VG_IS_8_ALIGNED(a) ))
3408 return False;
3410 Addr atag = FI_GET_TAG(a); /* tag of 'a' */
3411 UWord lineno = FI_GET_LINENO(a); /* lineno for 'a' */
3412 FiLine* line = &fi->lines[lineno];
3413 UWord loff = (a - atag) / 8;
3414 UShort mask = 0xAAAA;
3415 if (LIKELY( fi->tags[lineno] == atag )) {
3416 /* hit. check line and update. */
3417 UShort u16 = line->u16s[loff];
3418 Bool ok = (u16 & mask) == mask; /* all R bits set? */
3419 line->u16s[loff] = u16 | mask; /* set them */
3420 return ok;
3421 } else {
3422 /* miss. nuke existing line and re-use it. */
3423 UWord i;
3424 fi->tags[lineno] = atag;
3425 for (i = 0; i < FI_LINE_SZB / 8; i++)
3426 line->u16s[i] = 0;
3427 line->u16s[loff] = mask;
3428 return False;
3433 static inline Bool Filter__ok_to_skip_crd32 ( Filter* fi, Addr a )
3435 if (UNLIKELY( !VG_IS_4_ALIGNED(a) ))
3436 return False;
3438 Addr atag = FI_GET_TAG(a); /* tag of 'a' */
3439 UWord lineno = FI_GET_LINENO(a); /* lineno for 'a' */
3440 FiLine* line = &fi->lines[lineno];
3441 UWord loff = (a - atag) / 8;
3442 UShort mask = 0xAA << (2 * (a & 4)); /* 0xAA00 or 0x00AA */
3443 if (LIKELY( fi->tags[lineno] == atag )) {
3444 /* hit. check line and update. */
3445 UShort u16 = line->u16s[loff];
3446 Bool ok = (u16 & mask) == mask; /* 4 x R bits set? */
3447 line->u16s[loff] = u16 | mask; /* set them */
3448 return ok;
3449 } else {
3450 /* miss. nuke existing line and re-use it. */
3451 UWord i;
3452 fi->tags[lineno] = atag;
3453 for (i = 0; i < FI_LINE_SZB / 8; i++)
3454 line->u16s[i] = 0;
3455 line->u16s[loff] = mask;
3456 return False;
3461 static inline Bool Filter__ok_to_skip_crd16 ( Filter* fi, Addr a )
3463 if (UNLIKELY( !VG_IS_2_ALIGNED(a) ))
3464 return False;
3466 Addr atag = FI_GET_TAG(a); /* tag of 'a' */
3467 UWord lineno = FI_GET_LINENO(a); /* lineno for 'a' */
3468 FiLine* line = &fi->lines[lineno];
3469 UWord loff = (a - atag) / 8;
3470 UShort mask = 0xA << (2 * (a & 6));
3471 /* mask is A000, 0A00, 00A0 or 000A */
3472 if (LIKELY( fi->tags[lineno] == atag )) {
3473 /* hit. check line and update. */
3474 UShort u16 = line->u16s[loff];
3475 Bool ok = (u16 & mask) == mask; /* 2 x R bits set? */
3476 line->u16s[loff] = u16 | mask; /* set them */
3477 return ok;
3478 } else {
3479 /* miss. nuke existing line and re-use it. */
3480 UWord i;
3481 fi->tags[lineno] = atag;
3482 for (i = 0; i < FI_LINE_SZB / 8; i++)
3483 line->u16s[i] = 0;
3484 line->u16s[loff] = mask;
3485 return False;
3490 static inline Bool Filter__ok_to_skip_crd08 ( Filter* fi, Addr a )
3493 Addr atag = FI_GET_TAG(a); /* tag of 'a' */
3494 UWord lineno = FI_GET_LINENO(a); /* lineno for 'a' */
3495 FiLine* line = &fi->lines[lineno];
3496 UWord loff = (a - atag) / 8;
3497 UShort mask = 0x2 << (2 * (a & 7));
3498 /* mask is 8000, 2000, 0800, 0200, 0080, 0020, 0008 or 0002 */
3499 if (LIKELY( fi->tags[lineno] == atag )) {
3500 /* hit. check line and update. */
3501 UShort u16 = line->u16s[loff];
3502 Bool ok = (u16 & mask) == mask; /* 1 x R bits set? */
3503 line->u16s[loff] = u16 | mask; /* set them */
3504 return ok;
3505 } else {
3506 /* miss. nuke existing line and re-use it. */
3507 UWord i;
3508 fi->tags[lineno] = atag;
3509 for (i = 0; i < FI_LINE_SZB / 8; i++)
3510 line->u16s[i] = 0;
3511 line->u16s[loff] = mask;
3512 return False;
3518 /* ------ Write handlers for the filter. ------ */
3520 static inline Bool Filter__ok_to_skip_cwr64 ( Filter* fi, Addr a )
3522 if (UNLIKELY( !VG_IS_8_ALIGNED(a) ))
3523 return False;
3525 Addr atag = FI_GET_TAG(a); /* tag of 'a' */
3526 UWord lineno = FI_GET_LINENO(a); /* lineno for 'a' */
3527 FiLine* line = &fi->lines[lineno];
3528 UWord loff = (a - atag) / 8;
3529 UShort mask = 0xFFFF;
3530 if (LIKELY( fi->tags[lineno] == atag )) {
3531 /* hit. check line and update. */
3532 UShort u16 = line->u16s[loff];
3533 Bool ok = (u16 & mask) == mask; /* all R & W bits set? */
3534 line->u16s[loff] = u16 | mask; /* set them */
3535 return ok;
3536 } else {
3537 /* miss. nuke existing line and re-use it. */
3538 UWord i;
3539 fi->tags[lineno] = atag;
3540 for (i = 0; i < FI_LINE_SZB / 8; i++)
3541 line->u16s[i] = 0;
3542 line->u16s[loff] = mask;
3543 return False;
3548 static inline Bool Filter__ok_to_skip_cwr32 ( Filter* fi, Addr a )
3550 if (UNLIKELY( !VG_IS_4_ALIGNED(a) ))
3551 return False;
3553 Addr atag = FI_GET_TAG(a); /* tag of 'a' */
3554 UWord lineno = FI_GET_LINENO(a); /* lineno for 'a' */
3555 FiLine* line = &fi->lines[lineno];
3556 UWord loff = (a - atag) / 8;
3557 UShort mask = 0xFF << (2 * (a & 4)); /* 0xFF00 or 0x00FF */
3558 if (LIKELY( fi->tags[lineno] == atag )) {
3559 /* hit. check line and update. */
3560 UShort u16 = line->u16s[loff];
3561 Bool ok = (u16 & mask) == mask; /* 4 x R & W bits set? */
3562 line->u16s[loff] = u16 | mask; /* set them */
3563 return ok;
3564 } else {
3565 /* miss. nuke existing line and re-use it. */
3566 UWord i;
3567 fi->tags[lineno] = atag;
3568 for (i = 0; i < FI_LINE_SZB / 8; i++)
3569 line->u16s[i] = 0;
3570 line->u16s[loff] = mask;
3571 return False;
3576 static inline Bool Filter__ok_to_skip_cwr16 ( Filter* fi, Addr a )
3578 if (UNLIKELY( !VG_IS_2_ALIGNED(a) ))
3579 return False;
3581 Addr atag = FI_GET_TAG(a); /* tag of 'a' */
3582 UWord lineno = FI_GET_LINENO(a); /* lineno for 'a' */
3583 FiLine* line = &fi->lines[lineno];
3584 UWord loff = (a - atag) / 8;
3585 UShort mask = 0xF << (2 * (a & 6));
3586 /* mask is F000, 0F00, 00F0 or 000F */
3587 if (LIKELY( fi->tags[lineno] == atag )) {
3588 /* hit. check line and update. */
3589 UShort u16 = line->u16s[loff];
3590 Bool ok = (u16 & mask) == mask; /* 2 x R & W bits set? */
3591 line->u16s[loff] = u16 | mask; /* set them */
3592 return ok;
3593 } else {
3594 /* miss. nuke existing line and re-use it. */
3595 UWord i;
3596 fi->tags[lineno] = atag;
3597 for (i = 0; i < FI_LINE_SZB / 8; i++)
3598 line->u16s[i] = 0;
3599 line->u16s[loff] = mask;
3600 return False;
3605 static inline Bool Filter__ok_to_skip_cwr08 ( Filter* fi, Addr a )
3608 Addr atag = FI_GET_TAG(a); /* tag of 'a' */
3609 UWord lineno = FI_GET_LINENO(a); /* lineno for 'a' */
3610 FiLine* line = &fi->lines[lineno];
3611 UWord loff = (a - atag) / 8;
3612 UShort mask = 0x3 << (2 * (a & 7));
3613 /* mask is C000, 3000, 0C00, 0300, 00C0, 0030, 000C or 0003 */
3614 if (LIKELY( fi->tags[lineno] == atag )) {
3615 /* hit. check line and update. */
3616 UShort u16 = line->u16s[loff];
3617 Bool ok = (u16 & mask) == mask; /* 1 x R & W bits set? */
3618 line->u16s[loff] = u16 | mask; /* set them */
3619 return ok;
3620 } else {
3621 /* miss. nuke existing line and re-use it. */
3622 UWord i;
3623 fi->tags[lineno] = atag;
3624 for (i = 0; i < FI_LINE_SZB / 8; i++)
3625 line->u16s[i] = 0;
3626 line->u16s[loff] = mask;
3627 return False;
3633 /////////////////////////////////////////////////////////
3634 // //
3635 // Threads //
3636 // //
3637 /////////////////////////////////////////////////////////
3639 /* Maps ThrID values to their Thr*s (which contain ThrID values that
3640 should point back to the relevant slot in the array). Lowest
3641 numbered slot (0) is for thrid = 1024, (1) is for 1025, etc. */
3642 static XArray* /* of Thr* */ thrid_to_thr_map = NULL;
3644 /* And a counter to dole out ThrID values. For rationale/background,
3645 see comments on definition of ScalarTS (far) above. */
3646 static ThrID thrid_counter = 1024; /* runs up to ThrID_MAX_VALID */
3648 static ThrID Thr__to_ThrID ( Thr* thr ) {
3649 return thr->thrid;
3651 static Thr* Thr__from_ThrID ( UInt thrid ) {
3652 Thr* thr = *(Thr**)VG_(indexXA)( thrid_to_thr_map, thrid - 1024 );
3653 tl_assert(thr->thrid == thrid);
3654 return thr;
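/* Example of the numbering: the first Thr* allocated by Thr__new gets
   thrid 1024 and occupies slot 0 of thrid_to_thr_map, the second gets
   1025 in slot 1, and so on; Thr__from_ThrID(1026) therefore reads
   slot 1026 - 1024 == 2 and asserts that the stored Thr* points back
   to thrid 1026. */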
3657 static Thr* Thr__new ( void )
3659 Thr* thr = HG_(zalloc)( "libhb.Thr__new.1", sizeof(Thr) );
3660 thr->viR = VtsID_INVALID;
3661 thr->viW = VtsID_INVALID;
3662 thr->llexit_done = False;
3663 thr->joinedwith_done = False;
3664 thr->filter = HG_(zalloc)( "libhb.Thr__new.2", sizeof(Filter) );
3665 if (HG_(clo_history_level) == 1)
3666 thr->local_Kws_n_stacks
3667 = VG_(newXA)( HG_(zalloc),
3668 "libhb.Thr__new.3 (local_Kws_and_stacks)",
3669 HG_(free), sizeof(ULong_n_EC) );
3671 /* Add this Thr* <-> ThrID binding to the mapping, and
3672 cross-check */
3673 if (!thrid_to_thr_map) {
3674 thrid_to_thr_map = VG_(newXA)( HG_(zalloc), "libhb.Thr__new.4",
3675 HG_(free), sizeof(Thr*) );
3678 if (thrid_counter >= ThrID_MAX_VALID) {
3679 /* We're hosed. We have to stop. */
3680 scalarts_limitations_fail_NORETURN( True/*due_to_nThrs*/ );
3683 thr->thrid = thrid_counter++;
3684 Word ix = VG_(addToXA)( thrid_to_thr_map, &thr );
3685 tl_assert(ix + 1024 == thr->thrid);
3687 return thr;
3690 static void note_local_Kw_n_stack_for ( Thr* thr )
3692 Word nPresent;
3693 ULong_n_EC pair;
3694 tl_assert(thr);
3696 // We only collect this info at history level 1 (approx)
3697 if (HG_(clo_history_level) != 1)
3698 return;
3700 /* This is the scalar Kw for thr. */
3701 pair.ull = VtsID__indexAt( thr->viW, thr );
3702 pair.ec = main_get_EC( thr );
3703 tl_assert(pair.ec);
3704 tl_assert(thr->local_Kws_n_stacks);
3706 /* check that we're not adding duplicates */
3707 nPresent = VG_(sizeXA)( thr->local_Kws_n_stacks );
3709 /* Throw away old stacks, if necessary. We can't accumulate stuff
3710 indefinitely. */
3711 if (nPresent >= N_KWs_N_STACKs_PER_THREAD) {
3712 VG_(dropHeadXA)( thr->local_Kws_n_stacks, nPresent / 2 );
3713 nPresent = VG_(sizeXA)( thr->local_Kws_n_stacks );
3714 if (0)
3715 VG_(printf)("LOCAL Kw: thr %p, Kw %llu, ec %p (!!! gc !!!)\n",
3716 thr, pair.ull, pair.ec );
3719 if (nPresent > 0) {
3720 ULong_n_EC* prevPair
3721 = (ULong_n_EC*)VG_(indexXA)( thr->local_Kws_n_stacks, nPresent-1 );
3722 tl_assert( prevPair->ull <= pair.ull );
3725 if (nPresent == 0)
3726 pair.ec = NULL;
3728 VG_(addToXA)( thr->local_Kws_n_stacks, &pair );
3730 if (0)
3731 VG_(printf)("LOCAL Kw: thr %p, Kw %llu, ec %p\n",
3732 thr, pair.ull, pair.ec );
3733 if (0)
3734 VG_(pp_ExeContext)(pair.ec);
3737 static Int cmp__ULong_n_EC__by_ULong ( const ULong_n_EC* pair1,
3738 const ULong_n_EC* pair2 )
3740 if (pair1->ull < pair2->ull) return -1;
3741 if (pair1->ull > pair2->ull) return 1;
3742 return 0;
3746 /////////////////////////////////////////////////////////
3747 // //
3748 // Shadow Values //
3749 // //
3750 /////////////////////////////////////////////////////////
3752 // type SVal, SVal_INVALID and SVal_NOACCESS are defined by
3753 // hb_zsm.h. We have to do everything else here.
3755 /* SVal is 64 bit unsigned int.
3757 <---------30---------> <---------30--------->
3758 00 X-----Rmin-VtsID-----X 00 X-----Wmin-VtsID-----X C(Rmin,Wmin)
3759 10 X--------------------X XX X--------------------X A: SVal_NOACCESS
3760 11 0--------------------0 00 0--------------------0 A: SVal_INVALID
3763 #define SVAL_TAGMASK (3ULL << 62)
3765 static inline Bool SVal__isC ( SVal s ) {
3766 return (0ULL << 62) == (s & SVAL_TAGMASK);
3768 static inline SVal SVal__mkC ( VtsID rmini, VtsID wmini ) {
3769 //tl_assert(VtsID__is_valid(rmini));
3770 //tl_assert(VtsID__is_valid(wmini));
3771 return (((ULong)rmini) << 32) | ((ULong)wmini);
3773 static inline VtsID SVal__unC_Rmin ( SVal s ) {
3774 tl_assert(SVal__isC(s));
3775 return (VtsID)(s >> 32);
3777 static inline VtsID SVal__unC_Wmin ( SVal s ) {
3778 tl_assert(SVal__isC(s));
3779 return (VtsID)(s & 0xFFFFFFFFULL);
3782 static inline Bool SVal__isA ( SVal s ) {
3783 return (2ULL << 62) == (s & SVAL_TAGMASK);
3785 __attribute__((unused))
3786 static inline SVal SVal__mkA ( void ) {
3787 return 2ULL << 62;
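/* Worked example of the encoding above: SVal__mkC(0x123, 0x456)
   == 0x0000012300000456ULL.  Its top two bits are 00, so SVal__isC
   holds, and SVal__unC_Rmin/SVal__unC_Wmin recover 0x123 and 0x456;
   this relies on VtsIDs fitting in 30 bits, per the layout diagram.
   SVal__mkA() == (2ULL << 62) == 0x8000000000000000ULL, which
   SVAL_TAGMASK classifies as tag 10 (SVal__isA). */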
3790 /* Direct callback from lib_zsm. */
3791 static void SVal__rcinc ( SVal s ) {
3792 if (SVal__isC(s)) {
3793 VtsID__rcinc( SVal__unC_Rmin(s) );
3794 VtsID__rcinc( SVal__unC_Wmin(s) );
3798 /* Direct callback from lib_zsm. */
3799 static void SVal__rcdec ( SVal s ) {
3800 if (SVal__isC(s)) {
3801 VtsID__rcdec( SVal__unC_Rmin(s) );
3802 VtsID__rcdec( SVal__unC_Wmin(s) );
3807 /////////////////////////////////////////////////////////
3808 // //
3809 // Change-event map2 //
3810 // //
3811 /////////////////////////////////////////////////////////
3813 #define EVENT_MAP_GC_DISCARD_FRACTION 0.5
3815 /* This is in two parts:
3817 1. A hash table of RCECs. This is a set of reference-counted stack
3818 traces. When the reference count of a stack trace becomes zero,
3819 it is removed from the set and freed up. The intent is to have
3820 a set of stack traces which can be referred to from (2), but to
3821 only represent each one once. The set is indexed/searched by
3822 ordering on the stack trace vectors.
3824 2. A SparseWA of OldRefs. These store information about each old
3825 ref that we need to record. It is indexed by address of the
3826 location for which the information is recorded. For LRU
3827 purposes, each OldRef also contains a generation number,
3828 indicating when it was most recently accessed.
3830 The important part of an OldRef is, however, its accs[] array.
3831 This is an array of N_OLDREF_ACCS which binds (thread, R/W,
3832 size) triples to RCECs. This allows us to collect the last
3833 access-traceback by up to N_OLDREF_ACCS different triples for
3834 this location. The accs[] array is a MTF-array. If a binding
3835 falls off the end, that's too bad -- we will lose info about
3836 that triple's access to this location.
3838 When the SparseWA becomes too big, we can throw away the OldRefs
3839 whose generation numbers are below some threshold; hence doing
3840 approximate LRU discarding. For each discarded OldRef we must
3841 of course decrement the reference count on all the RCECs it
3842 refers to, in order that entries from (1) eventually get
3843 discarded too.
3845 A major improvement in reliability of this mechanism would be to
3846 have a dynamically sized OldRef.accs[] array, so no entries ever
3847 fall off the end. In investigations (Dec 08) it appears that a
3848 major cause of the non-availability of conflicting-access traces
3849 in race reports is the fixed size of this array. I
3850 suspect for most OldRefs, only a few entries are used, but for a
3851 minority of cases there is an overflow, leading to info lossage.
3852 Investigations also suggest this is very workload and scheduling
3853 sensitive. Therefore a dynamic sizing would be better.
3855 However, dynamic sizing would defeat the use of a PoolAllocator
3856 for OldRef structures. And that's important for performance. So
3857 it's not straightforward to do.
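/* Concretely (see event_map_bind below): on a lookup hit, the matching
   accs[] entry is swapped one place towards index 0 rather than moved
   all the way to the front; on a miss, every entry shifts down one
   slot, the RCEC of the last slot (if occupied) has its reference
   count decremented, and the new (thread, R/W, size) binding is
   written into slot 0.  With N_OLDREF_ACCS == 5 this remembers the
   most recent stack for up to five distinct triples per address. */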
3861 static UWord stats__ctxt_rcdec1 = 0;
3862 static UWord stats__ctxt_rcdec2 = 0;
3863 static UWord stats__ctxt_rcdec3 = 0;
3864 static UWord stats__ctxt_rcdec_calls = 0;
3865 static UWord stats__ctxt_rcdec_discards = 0;
3866 static UWord stats__ctxt_rcdec1_eq = 0;
3868 static UWord stats__ctxt_tab_curr = 0;
3869 static UWord stats__ctxt_tab_max = 0;
3871 static UWord stats__ctxt_tab_qs = 0;
3872 static UWord stats__ctxt_tab_cmps = 0;
3875 ///////////////////////////////////////////////////////
3876 //// Part (1): A hash table of RCECs
3879 #define N_FRAMES 8
3881 // (UInt) `echo "Reference Counted Execution Context" | md5sum`
3882 #define RCEC_MAGIC 0xab88abb2UL
3884 //#define N_RCEC_TAB 98317 /* prime */
3885 #define N_RCEC_TAB 196613 /* prime */
3887 typedef
3888 struct _RCEC {
3889 UWord magic; /* sanity check only */
3890 struct _RCEC* next;
3891 UWord rc;
3892 UWord rcX; /* used for crosschecking */
3893 UWord frames_hash; /* hash of all the frames */
3894 UWord frames[N_FRAMES];
3896 RCEC;
3898 static RCEC** contextTab = NULL; /* hash table of RCEC*s */
3901 /* Gives an arbitrary total order on RCEC .frames fields */
3902 static Word RCEC__cmp_by_frames ( RCEC* ec1, RCEC* ec2 ) {
3903 Word i;
3904 tl_assert(ec1 && ec1->magic == RCEC_MAGIC);
3905 tl_assert(ec2 && ec2->magic == RCEC_MAGIC);
3906 if (ec1->frames_hash < ec2->frames_hash) return -1;
3907 if (ec1->frames_hash > ec2->frames_hash) return 1;
3908 for (i = 0; i < N_FRAMES; i++) {
3909 if (ec1->frames[i] < ec2->frames[i]) return -1;
3910 if (ec1->frames[i] > ec2->frames[i]) return 1;
3912 return 0;
3916 /* Dec the ref of this RCEC. */
3917 static void ctxt__rcdec ( RCEC* ec )
3919 stats__ctxt_rcdec_calls++;
3920 tl_assert(ec && ec->magic == RCEC_MAGIC);
3921 tl_assert(ec->rc > 0);
3922 ec->rc--;
3925 static void ctxt__rcinc ( RCEC* ec )
3927 tl_assert(ec && ec->magic == RCEC_MAGIC);
3928 ec->rc++;
3932 //////////// BEGIN RCEC pool allocator
3933 static PoolAlloc* rcec_pool_allocator;
3935 static RCEC* alloc_RCEC ( void ) {
3936 return VG_(allocEltPA) ( rcec_pool_allocator );
3939 static void free_RCEC ( RCEC* rcec ) {
3940 tl_assert(rcec->magic == RCEC_MAGIC);
3941 VG_(freeEltPA)( rcec_pool_allocator, rcec );
3943 //////////// END RCEC pool allocator
3946 /* Find 'ec' in the RCEC list whose head pointer lives at 'headp' and
3947 move it one step closer to the front of the list, so as to make
3948 subsequent searches for it cheaper. */
3949 static void move_RCEC_one_step_forward ( RCEC** headp, RCEC* ec )
3951 RCEC *ec0, *ec1, *ec2;
3952 if (ec == *headp)
3953 tl_assert(0); /* already at head of list */
3954 tl_assert(ec != NULL);
3955 ec0 = *headp;
3956 ec1 = NULL;
3957 ec2 = NULL;
3958 while (True) {
3959 if (ec0 == NULL || ec0 == ec) break;
3960 ec2 = ec1;
3961 ec1 = ec0;
3962 ec0 = ec0->next;
3964 tl_assert(ec0 == ec);
3965 if (ec0 != NULL && ec1 != NULL && ec2 != NULL) {
3966 RCEC* tmp;
3967 /* ec0 points to ec, ec1 to its predecessor, and ec2 to ec1's
3968 predecessor. Swap ec0 and ec1, that is, move ec0 one step
3969 closer to the start of the list. */
3970 tl_assert(ec2->next == ec1);
3971 tl_assert(ec1->next == ec0);
3972 tmp = ec0->next;
3973 ec2->next = ec0;
3974 ec0->next = ec1;
3975 ec1->next = tmp;
3977 else
3978 if (ec0 != NULL && ec1 != NULL && ec2 == NULL) {
3979 /* it's second in the list. */
3980 tl_assert(*headp == ec1);
3981 tl_assert(ec1->next == ec0);
3982 ec1->next = ec0->next;
3983 ec0->next = ec1;
3984 *headp = ec0;
3989 /* Find the given RCEC in the tree, and return a pointer to it. Or,
3990 if not present, add the given one to the tree (by making a copy of
3991 it, so the caller can immediately deallocate the original) and
3992 return a pointer to the copy. The caller can safely have 'example'
3993 on its stack, since we will always return a pointer to a copy of
3994 it, not to the original. Note that the inserted node will have .rc
3995 of zero and so the caller must immediately increment it. */
3996 __attribute__((noinline))
3997 static RCEC* ctxt__find_or_add ( RCEC* example )
3999 UWord hent;
4000 RCEC* copy;
4001 tl_assert(example && example->magic == RCEC_MAGIC);
4002 tl_assert(example->rc == 0);
4004 /* Search the hash table to see if we already have it. */
4005 stats__ctxt_tab_qs++;
4006 hent = example->frames_hash % N_RCEC_TAB;
4007 copy = contextTab[hent];
4008 while (1) {
4009 if (!copy) break;
4010 tl_assert(copy->magic == RCEC_MAGIC);
4011 stats__ctxt_tab_cmps++;
4012 if (0 == RCEC__cmp_by_frames(copy, example)) break;
4013 copy = copy->next;
4016 if (copy) {
4017 tl_assert(copy != example);
4018 /* optimisation: if it's not at the head of its list, move 1
4019 step fwds, to make future searches cheaper */
4020 if (copy != contextTab[hent]) {
4021 move_RCEC_one_step_forward( &contextTab[hent], copy );
4023 } else {
4024 copy = alloc_RCEC();
4025 tl_assert(copy != example);
4026 *copy = *example;
4027 copy->next = contextTab[hent];
4028 contextTab[hent] = copy;
4029 stats__ctxt_tab_curr++;
4030 if (stats__ctxt_tab_curr > stats__ctxt_tab_max)
4031 stats__ctxt_tab_max = stats__ctxt_tab_curr;
4033 return copy;
4036 static inline UWord ROLW ( UWord w, Int n )
4038 Int bpw = 8 * sizeof(UWord);
4039 w = (w << n) | (w >> (bpw-n));
4040 return w;
4043 __attribute__((noinline))
4044 static RCEC* get_RCEC ( Thr* thr )
4046 UWord hash, i;
4047 RCEC example;
4048 example.magic = RCEC_MAGIC;
4049 example.rc = 0;
4050 example.rcX = 0;
4051 example.next = NULL;
4052 main_get_stacktrace( thr, &example.frames[0], N_FRAMES );
4053 hash = 0;
4054 for (i = 0; i < N_FRAMES; i++) {
4055 hash ^= example.frames[i];
4056 hash = ROLW(hash, 19);
4058 example.frames_hash = hash;
4059 return ctxt__find_or_add( &example );
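/* The hash folds the N_FRAMES captured return addresses with an XOR
   followed by a 19-bit left rotation per frame, so it is sensitive to
   frame order as well as content; ctxt__find_or_add then uses
   frames_hash % N_RCEC_TAB to pick the bucket whose chain is searched
   with RCEC__cmp_by_frames. */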
4062 ///////////////////////////////////////////////////////
4063 //// Part (2):
4064 /// A SparseWA guest-addr -> OldRef, that refers to (1)
4067 // (UInt) `echo "Old Reference Information" | md5sum`
4068 #define OldRef_MAGIC 0x30b1f075UL
4070 /* Records an access: a thread, a context (stack trace), the size &
4071 writeness of the access, and the set of locks held in write mode.
4072 The size (1,2,4,8) is encoded as 00 = 1, 01 = 2, 10 = 4, 11 = 8.
4074 typedef
4075 struct {
4076 RCEC* rcec;
4077 WordSetID locksHeldW;
4078 UInt thrid : SCALARTS_N_THRBITS;
4079 UInt szLg2B : 2;
4080 UInt isW : 1;
4082 Thr_n_RCEC;
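/* For instance, a 4-byte write is stored with szLg2B == 2 and
   isW == 1, and the byte size is recovered later as 1 << szLg2B
   (see libhb_event_map_lookup).  A thrid of 0 never occurs for a
   real access; it is reserved to mark unused accs[] slots in the
   OldRef structure below. */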
4084 #define N_OLDREF_ACCS 5
4086 typedef
4087 struct {
4088 UWord magic; /* sanity check only */
4089 UWord gen; /* when most recently accessed */
4090 /* or free list when not in use */
4091 /* unused slots in this array have .thrid == 0, which is invalid */
4092 Thr_n_RCEC accs[N_OLDREF_ACCS];
4094 OldRef;
4097 //////////// BEGIN OldRef pool allocator
4098 static PoolAlloc* oldref_pool_allocator;
4100 static OldRef* alloc_OldRef ( void ) {
4101 return VG_(allocEltPA) ( oldref_pool_allocator );
4104 static void free_OldRef ( OldRef* r ) {
4105 tl_assert(r->magic == OldRef_MAGIC);
4106 VG_(freeEltPA)( oldref_pool_allocator, r );
4108 //////////// END OldRef pool allocator
4111 static SparseWA* oldrefTree = NULL; /* SparseWA* OldRef* */
4112 static UWord oldrefGen = 0; /* current LRU generation # */
4113 static UWord oldrefTreeN = 0; /* # elems in oldrefTree */
4114 static UWord oldrefGenIncAt = 0; /* inc gen # when size hits this */
4116 inline static UInt min_UInt ( UInt a, UInt b ) {
4117 return a < b ? a : b;
4120 /* Compare the intervals [a1,a1+n1) and [a2,a2+n2). Return -1 if the
4121 first interval is lower, 1 if the first interval is higher, and 0
4122 if there is any overlap. The redundant casts are paranoia,
4123 following what looked distinctly like a bug in gcc-4.1.2, in which
4124 some of the comparisons were done signedly instead of
4125 unsignedly. */
4126 /* Copied from exp-ptrcheck/sg_main.c */
4127 static Word cmp_nonempty_intervals ( Addr a1, SizeT n1,
4128 Addr a2, SizeT n2 ) {
4129 UWord a1w = (UWord)a1;
4130 UWord n1w = (UWord)n1;
4131 UWord a2w = (UWord)a2;
4132 UWord n2w = (UWord)n2;
4133 tl_assert(n1w > 0 && n2w > 0);
4134 if (a1w + n1w <= a2w) return -1L;
4135 if (a2w + n2w <= a1w) return 1L;
4136 return 0;
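/* For example, cmp_nonempty_intervals(0x1000,4, 0x1002,2) == 0 since
   [0x1000,0x1004) and [0x1002,0x1004) overlap, whereas
   cmp_nonempty_intervals(0x1000,4, 0x1004,2) == -1 because the first
   interval ends where the second begins. */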
4139 static void event_map_bind ( Addr a, SizeT szB, Bool isW, Thr* thr )
4141 OldRef* ref;
4142 RCEC* rcec;
4143 Word i, j;
4144 UWord keyW, valW;
4145 Bool b;
4147 tl_assert(thr);
4148 ThrID thrid = thr->thrid;
4149 tl_assert(thrid != 0); /* zero is used to denote an empty slot. */
4151 WordSetID locksHeldW = thr->hgthread->locksetW;
4153 rcec = get_RCEC( thr );
4154 ctxt__rcinc(rcec);
4156 UInt szLg2B = 0;
4157 switch (szB) {
4158 /* This doesn't look particularly branch-predictor friendly. */
4159 case 1: szLg2B = 0; break;
4160 case 2: szLg2B = 1; break;
4161 case 4: szLg2B = 2; break;
4162 case 8: szLg2B = 3; break;
4163 default: tl_assert(0);
4166 /* Look in the map to see if we already have a record for this
4167 address. */
4168 b = VG_(lookupSWA)( oldrefTree, &keyW, &valW, a );
4170 if (b) {
4172 /* We already have a record for this address. We now need to
4173 see if we have a stack trace pertaining to this (thrid, R/W,
4174 size) triple. */
4175 tl_assert(keyW == a);
4176 ref = (OldRef*)valW;
4177 tl_assert(ref->magic == OldRef_MAGIC);
4179 for (i = 0; i < N_OLDREF_ACCS; i++) {
4180 if (ref->accs[i].thrid != thrid)
4181 continue;
4182 if (ref->accs[i].szLg2B != szLg2B)
4183 continue;
4184 if (ref->accs[i].isW != (UInt)(isW & 1))
4185 continue;
4186 /* else we have a match, so stop looking. */
4187 break;
4190 if (i < N_OLDREF_ACCS) {
4191 /* thread 'thr' has an entry at index 'i'. Update its RCEC. */
4192 if (i > 0) {
4193 Thr_n_RCEC tmp = ref->accs[i-1];
4194 ref->accs[i-1] = ref->accs[i];
4195 ref->accs[i] = tmp;
4196 i--;
4198 if (rcec == ref->accs[i].rcec) stats__ctxt_rcdec1_eq++;
4199 stats__ctxt_rcdec1++;
4200 ctxt__rcdec( ref->accs[i].rcec );
4201 tl_assert(ref->accs[i].thrid == thrid);
4202 /* Update the RCEC and the W-held lockset. */
4203 ref->accs[i].rcec = rcec;
4204 ref->accs[i].locksHeldW = locksHeldW;
4205 } else {
4206 /* No entry for this (thread, R/W, size, nWHeld) quad.
4207 Shuffle all of them down one slot, and put the new entry
4208 at the start of the array. */
4209 if (ref->accs[N_OLDREF_ACCS-1].thrid != 0) {
4210 /* the last slot is in use. We must dec the rc on the
4211 associated rcec. */
4212 tl_assert(ref->accs[N_OLDREF_ACCS-1].rcec);
4213 stats__ctxt_rcdec2++;
4214 if (0 && 0 == (stats__ctxt_rcdec2 & 0xFFF))
4215 VG_(printf)("QQQQ %lu overflows\n",stats__ctxt_rcdec2);
4216 ctxt__rcdec( ref->accs[N_OLDREF_ACCS-1].rcec );
4217 } else {
4218 tl_assert(!ref->accs[N_OLDREF_ACCS-1].rcec);
4220 for (j = N_OLDREF_ACCS-1; j >= 1; j--)
4221 ref->accs[j] = ref->accs[j-1];
4222 ref->accs[0].thrid = thrid;
4223 ref->accs[0].szLg2B = szLg2B;
4224 ref->accs[0].isW = (UInt)(isW & 1);
4225 ref->accs[0].locksHeldW = locksHeldW;
4226 ref->accs[0].rcec = rcec;
4227 /* thrid==0 is used to signify an empty slot, so we can't
4228 add zero thrid (such a ThrID is invalid anyway). */
4229 /* tl_assert(thrid != 0); */ /* There's a dominating assert above. */
4232 ref->gen = oldrefGen;
4234 } else {
4236 /* We don't have a record for this address. Create a new one. */
4237 if (oldrefTreeN >= oldrefGenIncAt) {
4238 oldrefGen++;
4239 oldrefGenIncAt = oldrefTreeN + 50000;
4240 if (0) VG_(printf)("oldrefTree: new gen %lu at size %lu\n",
4241 oldrefGen, oldrefTreeN );
4244 ref = alloc_OldRef();
4245 ref->magic = OldRef_MAGIC;
4246 ref->gen = oldrefGen;
4247 ref->accs[0].thrid = thrid;
4248 ref->accs[0].szLg2B = szLg2B;
4249 ref->accs[0].isW = (UInt)(isW & 1);
4250 ref->accs[0].locksHeldW = locksHeldW;
4251 ref->accs[0].rcec = rcec;
4253 /* thrid==0 is used to signify an empty slot, so we can't
4254 add zero thrid (such a ThrID is invalid anyway). */
4255 /* tl_assert(thrid != 0); */ /* There's a dominating assert above. */
4257 /* Clear out the rest of the entries */
4258 for (j = 1; j < N_OLDREF_ACCS; j++) {
4259 ref->accs[j].rcec = NULL;
4260 ref->accs[j].thrid = 0;
4261 ref->accs[j].szLg2B = 0;
4262 ref->accs[j].isW = 0;
4263 ref->accs[j].locksHeldW = 0;
4265 VG_(addToSWA)( oldrefTree, a, (UWord)ref );
4266 oldrefTreeN++;
4272 /* Extract info from the conflicting-access machinery. */
4273 Bool libhb_event_map_lookup ( /*OUT*/ExeContext** resEC,
4274 /*OUT*/Thr** resThr,
4275 /*OUT*/SizeT* resSzB,
4276 /*OUT*/Bool* resIsW,
4277 /*OUT*/WordSetID* locksHeldW,
4278 Thr* thr, Addr a, SizeT szB, Bool isW )
4280 Word i, j;
4281 OldRef* ref;
4282 UWord keyW, valW;
4283 Bool b;
4285 ThrID cand_thrid;
4286 RCEC* cand_rcec;
4287 Bool cand_isW;
4288 SizeT cand_szB;
4289 WordSetID cand_locksHeldW;
4290 Addr cand_a;
4292 Addr toCheck[15];
4293 Int nToCheck = 0;
4295 tl_assert(thr);
4296 tl_assert(szB == 8 || szB == 4 || szB == 2 || szB == 1);
4298 ThrID thrid = thr->thrid;
4300 toCheck[nToCheck++] = a;
4301 for (i = -7; i < (Word)szB; i++) {
4302 if (i != 0)
4303 toCheck[nToCheck++] = a + i;
4305 tl_assert(nToCheck <= 15);
4307 /* Now see if we can find a suitable matching event for
4308 any of the addresses in toCheck[0 .. nToCheck-1]. */
4309 for (j = 0; j < nToCheck; j++) {
4311 cand_a = toCheck[j];
4312 // VG_(printf)("test %ld %p\n", j, cand_a);
4314 b = VG_(lookupSWA)( oldrefTree, &keyW, &valW, cand_a );
4315 if (!b)
4316 continue;
4318 ref = (OldRef*)valW;
4319 tl_assert(keyW == cand_a);
4320 tl_assert(ref->magic == OldRef_MAGIC);
4321 tl_assert(ref->accs[0].thrid != 0); /* first slot must always be used */
4323 cand_thrid = 0; /* invalid; see comments in event_map_bind */
4324 cand_rcec = NULL;
4325 cand_isW = False;
4326 cand_szB = 0;
4327 cand_locksHeldW = 0; /* always valid; see initialise_data_structures() */
4329 for (i = 0; i < N_OLDREF_ACCS; i++) {
4330 Thr_n_RCEC* cand = &ref->accs[i];
4331 cand_rcec = cand->rcec;
4332 cand_thrid = cand->thrid;
4333 cand_isW = (Bool)cand->isW;
4334 cand_szB = 1 << cand->szLg2B;
4335 cand_locksHeldW = cand->locksHeldW;
4337 if (cand_thrid == 0)
4338 /* This slot isn't in use. Ignore it. */
4339 continue;
4341 if (cand_thrid == thrid)
4342 /* This is an access by the same thread, but we're only
4343 interested in accesses from other threads. Ignore. */
4344 continue;
4346 if ((!cand_isW) && (!isW))
4347 /* We don't want to report a read racing against another
4348 read; that's stupid. So in this case move on. */
4349 continue;
4351 if (cmp_nonempty_intervals(a, szB, cand_a, cand_szB) != 0)
4352 /* No overlap with the access we're asking about. Ignore. */
4353 continue;
4355 /* We have a match. Stop searching. */
4356 break;
4359 tl_assert(i >= 0 && i <= N_OLDREF_ACCS);
4361 if (i < N_OLDREF_ACCS) {
4362 Int n, maxNFrames;
4363 /* return with success */
4364 tl_assert(cand_thrid);
4365 tl_assert(cand_rcec);
4366 tl_assert(cand_rcec->magic == RCEC_MAGIC);
4367 tl_assert(cand_szB >= 1);
4368 /* Count how many non-zero frames we have. */
4369 maxNFrames = min_UInt(N_FRAMES, VG_(clo_backtrace_size));
4370 for (n = 0; n < maxNFrames; n++) {
4371 if (0 == cand_rcec->frames[n]) break;
4373 *resEC = VG_(make_ExeContext_from_StackTrace)
4374 (cand_rcec->frames, n);
4375 *resThr = Thr__from_ThrID(cand_thrid);
4376 *resSzB = cand_szB;
4377 *resIsW = cand_isW;
4378 *locksHeldW = cand_locksHeldW;
4379 return True;
4382 /* consider next address in toCheck[] */
4383 } /* for (j = 0; j < nToCheck; j++) */
4385 /* really didn't find anything. */
4386 return False;
4389 static void event_map_init ( void )
4391 Word i;
4393 /* Context (RCEC) pool allocator */
4394 rcec_pool_allocator = VG_(newPA) (
4395 sizeof(RCEC),
4396 1000 /* RCECs per pool */,
4397 HG_(zalloc),
4398 "libhb.event_map_init.1 (RCEC pools)",
4399 HG_(free)
4402 /* Context table */
4403 tl_assert(!contextTab);
4404 contextTab = HG_(zalloc)( "libhb.event_map_init.2 (context table)",
4405 N_RCEC_TAB * sizeof(RCEC*) );
4406 for (i = 0; i < N_RCEC_TAB; i++)
4407 contextTab[i] = NULL;
4409 /* Oldref pool allocator */
4410 oldref_pool_allocator = VG_(newPA)(
4411 sizeof(OldRef),
4412 1000 /* OldRefs per pool */,
4413 HG_(zalloc),
4414 "libhb.event_map_init.3 (OldRef pools)",
4415 HG_(free)
4418 /* Oldref tree */
4419 tl_assert(!oldrefTree);
4420 oldrefTree = VG_(newSWA)(
4421 HG_(zalloc),
4422 "libhb.event_map_init.4 (oldref tree)",
4423 HG_(free)
4426 oldrefGen = 0;
4427 oldrefGenIncAt = 0;
4428 oldrefTreeN = 0;
4431 static void event_map__check_reference_counts ( Bool before )
4433 RCEC* rcec;
4434 OldRef* oldref;
4435 Word i;
4436 UWord nEnts = 0;
4437 UWord keyW, valW;
4439 /* Set the 'check' reference counts to zero. Also, optionally
4440 check that the real reference counts are non-zero. We allow
4441 these to fall to zero before a GC, but the GC must get rid of
4442 all those that are zero, hence none should be zero after a
4443 GC. */
4444 for (i = 0; i < N_RCEC_TAB; i++) {
4445 for (rcec = contextTab[i]; rcec; rcec = rcec->next) {
4446 nEnts++;
4447 tl_assert(rcec);
4448 tl_assert(rcec->magic == RCEC_MAGIC);
4449 if (!before)
4450 tl_assert(rcec->rc > 0);
4451 rcec->rcX = 0;
4455 /* check that the stats are sane */
4456 tl_assert(nEnts == stats__ctxt_tab_curr);
4457 tl_assert(stats__ctxt_tab_curr <= stats__ctxt_tab_max);
4459 /* visit all the referencing points, inc check ref counts */
4460 VG_(initIterSWA)( oldrefTree );
4461 while (VG_(nextIterSWA)( oldrefTree, &keyW, &valW )) {
4462 oldref = (OldRef*)valW;
4463 tl_assert(oldref->magic == OldRef_MAGIC);
4464 for (i = 0; i < N_OLDREF_ACCS; i++) {
4465 ThrID aThrID = oldref->accs[i].thrid;
4466 RCEC* aRef = oldref->accs[i].rcec;
4467 if (aThrID != 0) {
4468 tl_assert(aRef);
4469 tl_assert(aRef->magic == RCEC_MAGIC);
4470 aRef->rcX++;
4471 } else {
4472 tl_assert(!aRef);
4477 /* compare check ref counts with actual */
4478 for (i = 0; i < N_RCEC_TAB; i++) {
4479 for (rcec = contextTab[i]; rcec; rcec = rcec->next) {
4480 tl_assert(rcec->rc == rcec->rcX);
4485 __attribute__((noinline))
4486 static void event_map_maybe_GC ( void )
4488 OldRef* oldref;
4489 UWord keyW, valW, retained, maxGen;
4490 XArray* refs2del;
4491 Word i, j, n2del;
4493 UWord* genMap = NULL;
4494 UWord genMap_min = 0;
4495 UWord genMap_size = 0;
4497 if (LIKELY(oldrefTreeN < HG_(clo_conflict_cache_size)))
4498 return;
4500 if (0)
4501 VG_(printf)("libhb: event_map GC at size %lu\n", oldrefTreeN);
4503 /* Check for sane command line params. Limit values must match
4504 those in hg_process_cmd_line_option. */
4505 tl_assert( HG_(clo_conflict_cache_size) >= 10*1000 );
4506 tl_assert( HG_(clo_conflict_cache_size) <= 30*1000*1000 );
4508 /* Check our counting is sane (expensive) */
4509 if (CHECK_CEM)
4510 tl_assert(oldrefTreeN == VG_(sizeSWA)( oldrefTree ));
4512 /* Check the reference counts (expensive) */
4513 if (CHECK_CEM)
4514 event_map__check_reference_counts( True/*before*/ );
4516 /* Compute the distribution of generation values in the ref tree.
4517 There are likely only to be a few different generation numbers
4518 in the whole tree, but we don't know what they are. Hence use a
4519 dynamically resized array of counters. The array is genMap[0
4520 .. genMap_size-1], where genMap[0] is the count for the
4521 generation number genMap_min, genMap[1] is the count for
4522 genMap_min+1, etc. If a new number is seen outside the range
4523 [genMap_min .. genMap_min + genMap_size - 1] then the array is
4524 copied into a larger array, and genMap_min and genMap_size are
4525 adjusted accordingly. */
4527 /* genMap :: generation-number -> count-of-nodes-with-that-number */
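/* Worked example of the resizing below: if the first generation seen
   is 12, then genMap_min == 12 and genMap_size == 1.  A later key of
   9 triggers the "key < genMap_min" case: more == 3, the old counts
   are copied to the upper end of a 4-entry array, genMap_min becomes
   9 and ea == 0.  A subsequent key of 14 triggers the final case:
   more == 14 - (9+4) + 1 == 2, the counts are copied to the lower
   end of a 6-entry array and ea == 5, so the array now covers
   generations 9..14. */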
4529 VG_(initIterSWA)( oldrefTree );
4530 while ( VG_(nextIterSWA)( oldrefTree, &keyW, &valW )) {
4532 UWord ea, key;
4533 oldref = (OldRef*)valW;
4534 key = oldref->gen;
4536 /* BEGIN find 'ea', which is the index in genMap holding the
4537 count for generation number 'key'. */
4538 if (UNLIKELY(genMap == NULL)) {
4539 /* deal with the first key to be seen, so that the following
4540 cases don't need to handle the complexity of a NULL count
4541 array. */
4542 genMap_min = key;
4543 genMap_size = 1;
4544 genMap = HG_(zalloc)( "libhb.emmG.1a",
4545 genMap_size * sizeof(UWord) );
4546 ea = 0;
4547 if (0) VG_(printf)("(%lu) case 1 [%lu .. %lu]\n",
4548 key, genMap_min, genMap_min+genMap_size-1 );
4550 else
4551 if (LIKELY(key >= genMap_min && key < genMap_min + genMap_size)) {
4552 /* this is the expected (almost-always-happens) case: 'key'
4553 is already mapped in the array. */
4554 ea = key - genMap_min;
4556 else
4557 if (key < genMap_min) {
4558 /* 'key' appears before the start of the current array.
4559 Extend the current array by allocating a larger one and
4560 copying the current one to the upper end of it. */
4561 Word more;
4562 UWord* map2;
4563 more = genMap_min - key;
4564 tl_assert(more > 0);
4565 map2 = HG_(zalloc)( "libhb.emmG.1b",
4566 (genMap_size + more) * sizeof(UWord) );
4567 VG_(memcpy)( &map2[more], genMap, genMap_size * sizeof(UWord) );
4568 HG_(free)( genMap );
4569 genMap = map2;
4570 genMap_size += more;
4571 genMap_min -= more;
4572 ea = 0;
4573 tl_assert(genMap_min == key);
4574 if (0) VG_(printf)("(%lu) case 2 [%lu .. %lu]\n",
4575 key, genMap_min, genMap_min+genMap_size-1 );
4577 else {
4578 /* 'key' appears after the end of the current array. Extend
4579 the current array by allocating a larger one and copying
4580 the current one to the lower end of it. */
4581 Word more;
4582 UWord* map2;
4583 tl_assert(key >= genMap_min + genMap_size);
4584 more = key - (genMap_min + genMap_size) + 1;
4585 tl_assert(more > 0);
4586 map2 = HG_(zalloc)( "libhb.emmG.1c",
4587 (genMap_size + more) * sizeof(UWord) );
4588 VG_(memcpy)( &map2[0], genMap, genMap_size * sizeof(UWord) );
4589 HG_(free)( genMap );
4590 genMap = map2;
4591 genMap_size += more;
4592 ea = genMap_size - 1;
4593 tl_assert(genMap_min + genMap_size - 1 == key);
4594 if (0) VG_(printf)("(%lu) case 3 [%lu .. %lu]\n",
4595 key, genMap_min, genMap_min+genMap_size-1 );
4597 /* END find 'ea' from 'key' */
4599 tl_assert(ea >= 0 && ea < genMap_size);
4600 /* and the whole point of this elaborate computation of 'ea' is .. */
4601 genMap[ea]++;
4604 tl_assert(genMap);
4605 tl_assert(genMap_size > 0);
4607 /* Sanity check what we just computed */
4608 { UWord sum = 0;
4609 for (i = 0; i < genMap_size; i++) {
4610 if (0) VG_(printf)(" xxx: gen %ld has %lu\n",
4611 i + genMap_min, genMap[i] );
4612 sum += genMap[i];
4614 tl_assert(sum == oldrefTreeN);
4617 /* Figure out how many generations to throw away */
4618 retained = oldrefTreeN;
4619 maxGen = 0;
4621 for (i = 0; i < genMap_size; i++) {
4622 keyW = i + genMap_min;
4623 valW = genMap[i];
4624 tl_assert(keyW > 0); /* can't allow a generation # 0 */
4625 if (0) VG_(printf)(" XXX: gen %lu has %lu\n", keyW, valW );
4626 tl_assert(keyW >= maxGen);
4627 tl_assert(retained >= valW);
4628 if (retained - valW
4629 > (UWord)(HG_(clo_conflict_cache_size)
4630 * EVENT_MAP_GC_DISCARD_FRACTION)) {
4631 retained -= valW;
4632 maxGen = keyW;
4633 } else {
4634 break;
4638 HG_(free)(genMap);
4640 tl_assert(retained >= 0 && retained <= oldrefTreeN);
4642 /* Now make up a big list of the oldrefTree entries we want to
4643 delete. We can't simultaneously traverse the tree and delete
4644 stuff from it, so first we need to copy them off somewhere
4645 else. (sigh) */
4646 refs2del = VG_(newXA)( HG_(zalloc), "libhb.emmG.2",
4647 HG_(free), sizeof(Addr) );
4649 if (retained < oldrefTreeN) {
4651 /* This is the normal (expected) case. We discard any ref whose
4652 generation number <= maxGen. */
4653 VG_(initIterSWA)( oldrefTree );
4654 while (VG_(nextIterSWA)( oldrefTree, &keyW, &valW )) {
4655 oldref = (OldRef*)valW;
4656 tl_assert(oldref->magic == OldRef_MAGIC);
4657 if (oldref->gen <= maxGen) {
4658 VG_(addToXA)( refs2del, &keyW );
4661 if (VG_(clo_stats)) {
4662 VG_(message)(Vg_DebugMsg,
4663 "libhb: EvM GC: delete generations %lu and below, "
4664 "retaining %lu entries\n",
4665 maxGen, retained );
4668 } else {
4670 static UInt rand_seed = 0; /* leave as static */
4672 /* Degenerate case: there's only one generation in the entire
4673 tree, so we need to have some other way of deciding which
4674 refs to throw away. Just throw out half of them randomly. */
4675 tl_assert(retained == oldrefTreeN);
4676 VG_(initIterSWA)( oldrefTree );
4677 while (VG_(nextIterSWA)( oldrefTree, &keyW, &valW )) {
4678 UInt n;
4679 oldref = (OldRef*)valW;
4680 tl_assert(oldref->magic == OldRef_MAGIC);
4681 n = VG_(random)( &rand_seed );
4682 if ((n & 0xFFF) < 0x800) {
4683 VG_(addToXA)( refs2del, &keyW );
4684 retained--;
4687 if (VG_(clo_stats)) {
4688 VG_(message)(Vg_DebugMsg,
4689 "libhb: EvM GC: randomly delete half the entries, "
4690 "retaining %lu entries\n",
4691 retained );
4696 n2del = VG_(sizeXA)( refs2del );
4697 tl_assert(n2del == (Word)(oldrefTreeN - retained));
4699 if (0) VG_(printf)("%s","deleting entries\n");
4700 for (i = 0; i < n2del; i++) {
4701 Bool b;
4702 Addr ga2del = *(Addr*)VG_(indexXA)( refs2del, i );
4703 b = VG_(delFromSWA)( oldrefTree, &keyW, &valW, ga2del );
4704 tl_assert(b);
4705 tl_assert(keyW == ga2del);
4706 oldref = (OldRef*)valW;
4707 for (j = 0; j < N_OLDREF_ACCS; j++) {
4708 ThrID aThrID = oldref->accs[j].thrid;
4709 RCEC* aRef = oldref->accs[j].rcec;
4710 if (aRef) {
4711 tl_assert(aThrID != 0);
4712 stats__ctxt_rcdec3++;
4713 ctxt__rcdec( aRef );
4714 } else {
4715 tl_assert(aThrID == 0);
4719 free_OldRef( oldref );
4722 VG_(deleteXA)( refs2del );
4724 tl_assert( VG_(sizeSWA)( oldrefTree ) == retained );
4726 oldrefTreeN = retained;
4727 oldrefGenIncAt = oldrefTreeN; /* start new gen right away */
4729 /* Throw away all RCECs with zero reference counts */
4730 for (i = 0; i < N_RCEC_TAB; i++) {
4731 RCEC** pp = &contextTab[i];
4732 RCEC* p = *pp;
4733 while (p) {
4734 if (p->rc == 0) {
4735 *pp = p->next;
4736 free_RCEC(p);
4737 p = *pp;
4738 tl_assert(stats__ctxt_tab_curr > 0);
4739 stats__ctxt_tab_curr--;
4740 } else {
4741 pp = &p->next;
4742 p = p->next;
4747 /* Check the reference counts (expensive) */
4748 if (CHECK_CEM)
4749 event_map__check_reference_counts( False/*after*/ );
4751 //if (0)
4752 //VG_(printf)("XXXX final sizes: oldrefTree %ld, contextTree %ld\n\n",
4753 // VG_(OSetGen_Size)(oldrefTree), VG_(OSetGen_Size)(contextTree));
4758 /////////////////////////////////////////////////////////
4759 // //
4760 // Core MSM //
4761 // //
4762 /////////////////////////////////////////////////////////
4764 /* Logic in msmcread/msmcwrite updated/verified after re-analysis, 19
4765 Nov 08, and again after [...],
4766 June 09. */
4768 static ULong stats__msmcread = 0;
4769 static ULong stats__msmcread_change = 0;
4770 static ULong stats__msmcwrite = 0;
4771 static ULong stats__msmcwrite_change = 0;
4773 /* Some notes on the H1 history mechanism:
4775 Transition rules are:
4777 read_{Kr,Kw}(Cr,Cw) = (Cr, Cr `join` Kw)
4778 write_{Kr,Kw}(Cr,Cw) = (Cr `join` Kw, Cr `join` Kw)
4780 After any access by a thread T to a location L, L's constraint pair
4781 (Cr,Cw) has Cw[T] == T's Kw[T], that is, == T's scalar W-clock.
4783 After a race by thread T conflicting with some previous access by
4784 some other thread U, for a location with constraint (before
4785 processing the later access) (Cr,Cw), Cw[U] is the segment in
4786 which the previous access lies.
4788 Hence in record_race_info, we pass in Cfailed and Kfailed, which
4789 are compared so as to find out which thread(s) this access
4790 conflicts with. Once that is established, we also require the
4791 pre-update Cw for the location, so we can index into it for those
4792 threads, to get the scalar clock values for the point at which the
4793 former accesses were made. (In fact we only bother to do any of
4794 this for an arbitrarily chosen one of the conflicting threads, as
4795 that's simpler, avoids flooding the user with vast amounts of
4796 mostly useless information, and because the program is wrong if it
4797 contains any races at all -- so we don't really need to show all
4798 conflicting access pairs initially, so long as we show none only if
4799 none exist).
4803 That requires the auxiliary proof that
4805 (Cr `join` Kw)[T] == Kw[T]
4807 Why should that be true? Because for any thread T, Kw[T] >= the
4808 scalar clock value for T known by any other thread. In other
4809 words, because T's value for its own scalar clock is at least as up
4810 to date as the value for it known by any other thread (that is true
4811 for both the R- and W- scalar clocks). Hence no other thread will
4812 be able to feed in a value for that element (indirectly via a
4813 constraint) which will exceed Kw[T], and hence the join cannot
4814 cause that particular element to advance.
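/* A small concrete instance of that argument, for two threads T and U:
   suppose Kw == [T:7, U:2] and Cr == [T:5, U:6].  Since no constraint
   can carry a value for T newer than T's own clock, Cr[T] <= Kw[T]
   (5 <= 7 here), so the per-element max gives
   Cr `join` Kw == [T:7, U:6], whose T component is exactly Kw[T]. */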
4817 __attribute__((noinline))
4818 static void record_race_info ( Thr* acc_thr,
4819 Addr acc_addr, SizeT szB, Bool isWrite,
4820 VtsID Cfailed,
4821 VtsID Kfailed,
4822 VtsID Cw )
4824 /* Call here to report a race. We just hand it onwards to
4825 HG_(record_error_Race). If that in turn discovers that the
4826 error is going to be collected, then, at history_level 2, that
4827 queries the conflicting-event map. The alternative would be to
4828 query it right here. But that causes a lot of pointless queries
4829 for errors which will shortly be discarded as duplicates, and
4830 can become a performance overhead; so we defer the query until
4831 we know the error is not a duplicate. */
4833 /* Stacks for the bounds of the (or one of the) conflicting
4834 segment(s). These are only set at history_level 1. */
4835 ExeContext* hist1_seg_start = NULL;
4836 ExeContext* hist1_seg_end = NULL;
4837 Thread* hist1_conf_thr = NULL;
4839 tl_assert(acc_thr);
4840 tl_assert(acc_thr->hgthread);
4841 tl_assert(acc_thr->hgthread->hbthr == acc_thr);
4842 tl_assert(HG_(clo_history_level) >= 0 && HG_(clo_history_level) <= 2);
4844 if (HG_(clo_history_level) == 1) {
4845 Bool found;
4846 Word firstIx, lastIx;
4847 ULong_n_EC key;
4849 /* At history_level 1, we must round up the relevant stack-pair
4850 for the conflicting segment right now. This is because
4851 deferring it is complex; we can't (easily) put Kfailed and
4852 Cfailed into the XError and wait for later without
4853 getting tied up in difficulties with VtsID reference
4854 counting. So just do it now. */
4855 Thr* confThr;
4856 ULong confTym = 0;
4857 /* Which thread are we in conflict with? There may be more than
4858 one, in which case VtsID__findFirst_notLEQ selects one arbitrarily
4859 (in fact it's the one with the lowest Thr* value). */
4860 confThr = VtsID__findFirst_notLEQ( Cfailed, Kfailed );
4861 /* This must exist! since if it was NULL then there's no
4862 conflict (semantics of return value of
4863 VtsID__findFirst_notLEQ), and msmc{read,write}, which has
4864 called us, just checked exactly this -- that there was in
4865 fact a race. */
4866 tl_assert(confThr);
4868 /* Get the scalar clock value that the conflicting thread
4869 introduced into the constraint. A careful examination of the
4870 base machine rules shows that this must be the same as the
4871 conflicting thread's scalar clock when it created this
4872 constraint. Hence we know the scalar clock of the
4873 conflicting thread when the conflicting access was made. */
4874 confTym = VtsID__indexAt( Cfailed, confThr );
4876 /* Using this scalar clock, index into the conflicting thread's
4877 collection of stack traces made each time its vector clock
4878 (hence its scalar clock) changed. This gives the stack
4879 traces at the start and end of the conflicting segment (well,
4880 as per comment just above, of one of the conflicting
4881 segments, if there are more than one). */
4882 key.ull = confTym;
4883 key.ec = NULL;
4884 /* tl_assert(confThr); -- asserted just above */
4885 tl_assert(confThr->local_Kws_n_stacks);
4886 firstIx = lastIx = 0;
4887 found = VG_(lookupXA_UNSAFE)(
4888 confThr->local_Kws_n_stacks,
4889 &key, &firstIx, &lastIx,
4890 (XACmpFn_t)cmp__ULong_n_EC__by_ULong
4892 if (0) VG_(printf)("record_race_info %u %u %u confThr %p "
4893 "confTym %llu found %d (%lu,%lu)\n",
4894 Cfailed, Kfailed, Cw,
4895 confThr, confTym, found, firstIx, lastIx);
4896 /* We can't indefinitely collect stack traces at VTS
4897 transitions, since we'd eventually run out of memory. Hence
4898 note_local_Kw_n_stack_for will eventually throw away old
4899 ones, which in turn means we might fail to find index value
4900 confTym in the array. */
4901 if (found) {
4902 ULong_n_EC *pair_start, *pair_end;
4903 pair_start
4904 = (ULong_n_EC*)VG_(indexXA)( confThr->local_Kws_n_stacks, lastIx );
4905 hist1_seg_start = pair_start->ec;
4906 if (lastIx+1 < VG_(sizeXA)( confThr->local_Kws_n_stacks )) {
4907 pair_end
4908 = (ULong_n_EC*)VG_(indexXA)( confThr->local_Kws_n_stacks,
4909 lastIx+1 );
4910 /* from properties of VG_(lookupXA) and the comparison fn used: */
4911 tl_assert(pair_start->ull < pair_end->ull);
4912 hist1_seg_end = pair_end->ec;
4913 /* Could do a bit better here. It may be that pair_end
4914 doesn't have a stack, but the following entries in the
4915 array have the same scalar Kw and do have a stack. So
4916 we should search a bit further along the array than
4917 lastIx+1 if hist1_seg_end is NULL. */
4918 } else {
4919 if (!confThr->llexit_done)
4920 hist1_seg_end = main_get_EC( confThr );
4922 // seg_start could be NULL iff this is the first stack in the thread
4923 //if (seg_start) VG_(pp_ExeContext)(seg_start);
4924 //if (seg_end) VG_(pp_ExeContext)(seg_end);
4925 hist1_conf_thr = confThr->hgthread;
4929 HG_(record_error_Race)( acc_thr->hgthread, acc_addr,
4930 szB, isWrite,
4931 hist1_conf_thr, hist1_seg_start, hist1_seg_end );
4934 static Bool is_sane_SVal_C ( SVal sv ) {
4935 Bool leq;
4936 if (!SVal__isC(sv)) return True;
4937 leq = VtsID__cmpLEQ( SVal__unC_Rmin(sv), SVal__unC_Wmin(sv) );
4938 return leq;
4942 /* Compute new state following a read */
4943 static inline SVal msmcread ( SVal svOld,
4944 /* The following are only needed for
4945 creating error reports. */
4946 Thr* acc_thr,
4947 Addr acc_addr, SizeT szB )
4949 SVal svNew = SVal_INVALID;
4950 stats__msmcread++;
4952 /* Redundant sanity check on the constraints */
4953 if (CHECK_MSM) {
4954 tl_assert(is_sane_SVal_C(svOld));
4957 if (LIKELY(SVal__isC(svOld))) {
4958 VtsID tviR = acc_thr->viR;
4959 VtsID tviW = acc_thr->viW;
4960 VtsID rmini = SVal__unC_Rmin(svOld);
4961 VtsID wmini = SVal__unC_Wmin(svOld);
4962 Bool leq = VtsID__cmpLEQ(rmini,tviR);
4963 if (LIKELY(leq)) {
4964 /* no race */
4965 /* Note: RWLOCK subtlety: use tviW, not tviR */
4966 svNew = SVal__mkC( rmini, VtsID__join2(wmini, tviW) );
4967 goto out;
4968 } else {
4969 /* assert on sanity of constraints. */
4970 Bool leqxx = VtsID__cmpLEQ(rmini,wmini);
4971 tl_assert(leqxx);
4972 // same as in non-race case
4973 svNew = SVal__mkC( rmini, VtsID__join2(wmini, tviW) );
4974 record_race_info( acc_thr, acc_addr, szB, False/*!isWrite*/,
4975 rmini, /* Cfailed */
4976 tviR, /* Kfailed */
4977 wmini /* Cw */ );
4978 goto out;
4981 if (SVal__isA(svOld)) {
4982 /* reading no-access memory (sigh); leave unchanged */
4983 /* check for no pollution */
4984 tl_assert(svOld == SVal_NOACCESS);
4985 svNew = SVal_NOACCESS;
4986 goto out;
4988 if (0) VG_(printf)("msmcread: bad svOld: 0x%016llx\n", svOld);
4989 tl_assert(0);
4991 out:
4992 if (CHECK_MSM) {
4993 tl_assert(is_sane_SVal_C(svNew));
4995 if (UNLIKELY(svNew != svOld)) {
4996 tl_assert(svNew != SVal_INVALID);
4997 if (HG_(clo_history_level) >= 2
4998 && SVal__isC(svOld) && SVal__isC(svNew)) {
4999 event_map_bind( acc_addr, szB, False/*!isWrite*/, acc_thr );
5000 stats__msmcread_change++;
5003 return svNew;
5007 /* Compute new state following a write */
5008 static inline SVal msmcwrite ( SVal svOld,
5009 /* The following are only needed for
5010 creating error reports. */
5011 Thr* acc_thr,
5012 Addr acc_addr, SizeT szB )
5014 SVal svNew = SVal_INVALID;
5015 stats__msmcwrite++;
5017 /* Redundant sanity check on the constraints */
5018 if (CHECK_MSM) {
5019 tl_assert(is_sane_SVal_C(svOld));
5022 if (LIKELY(SVal__isC(svOld))) {
5023 VtsID tviW = acc_thr->viW;
5024 VtsID wmini = SVal__unC_Wmin(svOld);
5025 Bool leq = VtsID__cmpLEQ(wmini,tviW);
5026 if (LIKELY(leq)) {
5027 /* no race */
5028 svNew = SVal__mkC( tviW, tviW );
5029 goto out;
5030 } else {
5031 VtsID rmini = SVal__unC_Rmin(svOld);
5032 /* assert on sanity of constraints. */
5033 Bool leqxx = VtsID__cmpLEQ(rmini,wmini);
5034 tl_assert(leqxx);
5035 // same as in non-race case
5036 // proof: in the non-race case, we have
5037 // rmini <= wmini (invar on constraints)
5038 // tviW <= tviR (invar on thread clocks)
5039 // wmini <= tviW (from run-time check)
5040 // hence from transitivity of <= we have
5041 // rmini <= wmini <= tviW
5042 // and so join(rmini,tviW) == tviW
5043 // and join(wmini,tviW) == tviW
5044 // qed.
5045 svNew = SVal__mkC( VtsID__join2(rmini, tviW),
5046 VtsID__join2(wmini, tviW) );
5047 record_race_info( acc_thr, acc_addr, szB, True/*isWrite*/,
5048 wmini, /* Cfailed */
5049 tviW, /* Kfailed */
5050 wmini /* Cw */ );
5051 goto out;
5054 if (SVal__isA(svOld)) {
5055 /* writing no-access memory (sigh); leave unchanged */
5056 /* check for no pollution */
5057 tl_assert(svOld == SVal_NOACCESS);
5058 svNew = SVal_NOACCESS;
5059 goto out;
5061 if (0) VG_(printf)("msmcwrite: bad svOld: 0x%016llx\n", svOld);
5062 tl_assert(0);
5064 out:
5065 if (CHECK_MSM) {
5066 tl_assert(is_sane_SVal_C(svNew));
5068 if (UNLIKELY(svNew != svOld)) {
5069 tl_assert(svNew != SVal_INVALID);
5070 if (HG_(clo_history_level) >= 2
5071 && SVal__isC(svOld) && SVal__isC(svNew)) {
5072 event_map_bind( acc_addr, szB, True/*isWrite*/, acc_thr );
5073 stats__msmcwrite_change++;
5076 return svNew;
5080 /////////////////////////////////////////////////////////
5081 // //
5082 // Apply core MSM to specific memory locations //
5083 // //
5084 /////////////////////////////////////////////////////////
5086 /*------------- ZSM accesses: 8 bit sapply ------------- */
5088 static void zsm_sapply08__msmcread ( Thr* thr, Addr a ) {
5089 CacheLine* cl;
5090 UWord cloff, tno, toff;
5091 SVal svOld, svNew;
5092 UShort descr;
5093 stats__cline_cread08s++;
5094 cl = get_cacheline(a);
5095 cloff = get_cacheline_offset(a);
5096 tno = get_treeno(a);
5097 toff = get_tree_offset(a); /* == 0 .. 7 */
5098 descr = cl->descrs[tno];
5099 if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
5100 SVal* tree = &cl->svals[tno << 3];
5101 cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
5102 if (CHECK_ZSM)
5103 tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5105 svOld = cl->svals[cloff];
5106 svNew = msmcread( svOld, thr,a,1 );
5107 if (CHECK_ZSM)
5108 tl_assert(svNew != SVal_INVALID);
5109 cl->svals[cloff] = svNew;
5112 static void zsm_sapply08__msmcwrite ( Thr* thr, Addr a ) {
5113 CacheLine* cl;
5114 UWord cloff, tno, toff;
5115 SVal svOld, svNew;
5116 UShort descr;
5117 stats__cline_cwrite08s++;
5118 cl = get_cacheline(a);
5119 cloff = get_cacheline_offset(a);
5120 tno = get_treeno(a);
5121 toff = get_tree_offset(a); /* == 0 .. 7 */
5122 descr = cl->descrs[tno];
5123 if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
5124 SVal* tree = &cl->svals[tno << 3];
5125 cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
5126 if (CHECK_ZSM)
5127 tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5129 svOld = cl->svals[cloff];
5130 svNew = msmcwrite( svOld, thr,a,1 );
5131 if (CHECK_ZSM)
5132 tl_assert(svNew != SVal_INVALID);
5133 cl->svals[cloff] = svNew;
5136 /*------------- ZSM accesses: 16 bit sapply ------------- */
5138 static void zsm_sapply16__msmcread ( Thr* thr, Addr a ) {
5139 CacheLine* cl;
5140 UWord cloff, tno, toff;
5141 SVal svOld, svNew;
5142 UShort descr;
5143 stats__cline_cread16s++;
5144 if (UNLIKELY(!aligned16(a))) goto slowcase;
5145 cl = get_cacheline(a);
5146 cloff = get_cacheline_offset(a);
5147 tno = get_treeno(a);
5148 toff = get_tree_offset(a); /* == 0, 2, 4 or 6 */
5149 descr = cl->descrs[tno];
5150 if (UNLIKELY( !(descr & (TREE_DESCR_16_0 << toff)) )) {
5151 if (valid_value_is_below_me_16(descr, toff)) {
5152 goto slowcase;
5153 } else {
5154 SVal* tree = &cl->svals[tno << 3];
5155 cl->descrs[tno] = pulldown_to_16(tree, toff, descr);
5157 if (CHECK_ZSM)
5158 tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5160 svOld = cl->svals[cloff];
5161 svNew = msmcread( svOld, thr,a,2 );
5162 if (CHECK_ZSM)
5163 tl_assert(svNew != SVal_INVALID);
5164 cl->svals[cloff] = svNew;
5165 return;
5166 slowcase: /* misaligned, or must go further down the tree */
5167 stats__cline_16to8splits++;
5168 zsm_sapply08__msmcread( thr, a + 0 );
5169 zsm_sapply08__msmcread( thr, a + 1 );
5172 static void zsm_sapply16__msmcwrite ( Thr* thr, Addr a ) {
5173 CacheLine* cl;
5174 UWord cloff, tno, toff;
5175 SVal svOld, svNew;
5176 UShort descr;
5177 stats__cline_cwrite16s++;
5178 if (UNLIKELY(!aligned16(a))) goto slowcase;
5179 cl = get_cacheline(a);
5180 cloff = get_cacheline_offset(a);
5181 tno = get_treeno(a);
5182 toff = get_tree_offset(a); /* == 0, 2, 4 or 6 */
5183 descr = cl->descrs[tno];
5184 if (UNLIKELY( !(descr & (TREE_DESCR_16_0 << toff)) )) {
5185 if (valid_value_is_below_me_16(descr, toff)) {
5186 goto slowcase;
5187 } else {
5188 SVal* tree = &cl->svals[tno << 3];
5189 cl->descrs[tno] = pulldown_to_16(tree, toff, descr);
5191 if (CHECK_ZSM)
5192 tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5194 svOld = cl->svals[cloff];
5195 svNew = msmcwrite( svOld, thr,a,2 );
5196 if (CHECK_ZSM)
5197 tl_assert(svNew != SVal_INVALID);
5198 cl->svals[cloff] = svNew;
5199 return;
5200 slowcase: /* misaligned, or must go further down the tree */
5201 stats__cline_16to8splits++;
5202 zsm_sapply08__msmcwrite( thr, a + 0 );
5203 zsm_sapply08__msmcwrite( thr, a + 1 );
5206 /*------------- ZSM accesses: 32 bit sapply ------------- */
5208 static void zsm_sapply32__msmcread ( Thr* thr, Addr a ) {
5209 CacheLine* cl;
5210 UWord cloff, tno, toff;
5211 SVal svOld, svNew;
5212 UShort descr;
5213 stats__cline_cread32s++;
5214 if (UNLIKELY(!aligned32(a))) goto slowcase;
5215 cl = get_cacheline(a);
5216 cloff = get_cacheline_offset(a);
5217 tno = get_treeno(a);
5218 toff = get_tree_offset(a); /* == 0 or 4 */
5219 descr = cl->descrs[tno];
5220 if (UNLIKELY( !(descr & (TREE_DESCR_32_0 << toff)) )) {
5221 if (valid_value_is_above_me_32(descr, toff)) {
5222 SVal* tree = &cl->svals[tno << 3];
5223 cl->descrs[tno] = pulldown_to_32(tree, toff, descr);
5224 } else {
5225 goto slowcase;
5227 if (CHECK_ZSM)
5228 tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5230 svOld = cl->svals[cloff];
5231 svNew = msmcread( svOld, thr,a,4 );
5232 if (CHECK_ZSM)
5233 tl_assert(svNew != SVal_INVALID);
5234 cl->svals[cloff] = svNew;
5235 return;
5236 slowcase: /* misaligned, or must go further down the tree */
5237 stats__cline_32to16splits++;
5238 zsm_sapply16__msmcread( thr, a + 0 );
5239 zsm_sapply16__msmcread( thr, a + 2 );
5242 static void zsm_sapply32__msmcwrite ( Thr* thr, Addr a ) {
5243 CacheLine* cl;
5244 UWord cloff, tno, toff;
5245 SVal svOld, svNew;
5246 UShort descr;
5247 stats__cline_cwrite32s++;
5248 if (UNLIKELY(!aligned32(a))) goto slowcase;
5249 cl = get_cacheline(a);
5250 cloff = get_cacheline_offset(a);
5251 tno = get_treeno(a);
5252 toff = get_tree_offset(a); /* == 0 or 4 */
5253 descr = cl->descrs[tno];
5254 if (UNLIKELY( !(descr & (TREE_DESCR_32_0 << toff)) )) {
5255 if (valid_value_is_above_me_32(descr, toff)) {
5256 SVal* tree = &cl->svals[tno << 3];
5257 cl->descrs[tno] = pulldown_to_32(tree, toff, descr);
5258 } else {
5259 goto slowcase;
5261 if (CHECK_ZSM)
5262 tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5264 svOld = cl->svals[cloff];
5265 svNew = msmcwrite( svOld, thr,a,4 );
5266 if (CHECK_ZSM)
5267 tl_assert(svNew != SVal_INVALID);
5268 cl->svals[cloff] = svNew;
5269 return;
5270 slowcase: /* misaligned, or must go further down the tree */
5271 stats__cline_32to16splits++;
5272 zsm_sapply16__msmcwrite( thr, a + 0 );
5273 zsm_sapply16__msmcwrite( thr, a + 2 );
5276 /*------------- ZSM accesses: 64 bit sapply ------------- */
5278 static void zsm_sapply64__msmcread ( Thr* thr, Addr a ) {
5279 CacheLine* cl;
5280 UWord cloff, tno;
5281 //UWord toff;
5282 SVal svOld, svNew;
5283 UShort descr;
5284 stats__cline_cread64s++;
5285 if (UNLIKELY(!aligned64(a))) goto slowcase;
5286 cl = get_cacheline(a);
5287 cloff = get_cacheline_offset(a);
5288 tno = get_treeno(a);
5289 //toff = get_tree_offset(a); /* == 0, unused */
5290 descr = cl->descrs[tno];
5291 if (UNLIKELY( !(descr & TREE_DESCR_64) )) {
5292 goto slowcase;
5294 svOld = cl->svals[cloff];
5295 svNew = msmcread( svOld, thr,a,8 );
5296 if (CHECK_ZSM)
5297 tl_assert(svNew != SVal_INVALID);
5298 cl->svals[cloff] = svNew;
5299 return;
5300 slowcase: /* misaligned, or must go further down the tree */
5301 stats__cline_64to32splits++;
5302 zsm_sapply32__msmcread( thr, a + 0 );
5303 zsm_sapply32__msmcread( thr, a + 4 );
5306 static void zsm_sapply64__msmcwrite ( Thr* thr, Addr a ) {
5307 CacheLine* cl;
5308 UWord cloff, tno;
5309 //UWord toff;
5310 SVal svOld, svNew;
5311 UShort descr;
5312 stats__cline_cwrite64s++;
5313 if (UNLIKELY(!aligned64(a))) goto slowcase;
5314 cl = get_cacheline(a);
5315 cloff = get_cacheline_offset(a);
5316 tno = get_treeno(a);
5317 //toff = get_tree_offset(a); /* == 0, unused */
5318 descr = cl->descrs[tno];
5319 if (UNLIKELY( !(descr & TREE_DESCR_64) )) {
5320 goto slowcase;
5322 svOld = cl->svals[cloff];
5323 svNew = msmcwrite( svOld, thr,a,8 );
5324 if (CHECK_ZSM)
5325 tl_assert(svNew != SVal_INVALID);
5326 cl->svals[cloff] = svNew;
5327 return;
5328 slowcase: /* misaligned, or must go further down the tree */
5329 stats__cline_64to32splits++;
5330 zsm_sapply32__msmcwrite( thr, a + 0 );
5331 zsm_sapply32__msmcwrite( thr, a + 4 );
5334 /*--------------- ZSM accesses: 8 bit swrite --------------- */
5336 static
5337 void zsm_swrite08 ( Addr a, SVal svNew ) {
5338 CacheLine* cl;
5339 UWord cloff, tno, toff;
5340 UShort descr;
5341 stats__cline_swrite08s++;
5342 cl = get_cacheline(a);
5343 cloff = get_cacheline_offset(a);
5344 tno = get_treeno(a);
5345 toff = get_tree_offset(a); /* == 0 .. 7 */
5346 descr = cl->descrs[tno];
5347 if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
5348 SVal* tree = &cl->svals[tno << 3];
5349 cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
5350 if (CHECK_ZSM)
5351 tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5353 tl_assert(svNew != SVal_INVALID);
5354 cl->svals[cloff] = svNew;
5357 /*--------------- ZSM accesses: 16 bit swrite --------------- */
5359 static
5360 void zsm_swrite16 ( Addr a, SVal svNew ) {
5361 CacheLine* cl;
5362 UWord cloff, tno, toff;
5363 UShort descr;
5364 stats__cline_swrite16s++;
5365 if (UNLIKELY(!aligned16(a))) goto slowcase;
5366 cl = get_cacheline(a);
5367 cloff = get_cacheline_offset(a);
5368 tno = get_treeno(a);
5369 toff = get_tree_offset(a); /* == 0, 2, 4 or 6 */
5370 descr = cl->descrs[tno];
5371 if (UNLIKELY( !(descr & (TREE_DESCR_16_0 << toff)) )) {
5372 if (valid_value_is_below_me_16(descr, toff)) {
5373 /* Writing at this level. Need to fix up 'descr'. */
5374 cl->descrs[tno] = pullup_descr_to_16(descr, toff);
5375          /* At this point, the tree does not match cl->descrs[tno] any
5376 more. The assignments below will fix it up. */
5377 } else {
5378 /* We can't indiscriminately write on the w16 node as in the
5379 w64 case, as that might make the node inconsistent with
5380 its parent. So first, pull down to this level. */
5381 SVal* tree = &cl->svals[tno << 3];
5382 cl->descrs[tno] = pulldown_to_16(tree, toff, descr);
5383 if (CHECK_ZSM)
5384 tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5387 tl_assert(svNew != SVal_INVALID);
5388 cl->svals[cloff + 0] = svNew;
5389 cl->svals[cloff + 1] = SVal_INVALID;
5390 return;
5391 slowcase: /* misaligned */
5392 stats__cline_16to8splits++;
5393 zsm_swrite08( a + 0, svNew );
5394 zsm_swrite08( a + 1, svNew );
5397 /*--------------- ZSM accesses: 32 bit swrite --------------- */
5399 static
5400 void zsm_swrite32 ( Addr a, SVal svNew ) {
5401 CacheLine* cl;
5402 UWord cloff, tno, toff;
5403 UShort descr;
5404 stats__cline_swrite32s++;
5405 if (UNLIKELY(!aligned32(a))) goto slowcase;
5406 cl = get_cacheline(a);
5407 cloff = get_cacheline_offset(a);
5408 tno = get_treeno(a);
5409 toff = get_tree_offset(a); /* == 0 or 4 */
5410 descr = cl->descrs[tno];
5411 if (UNLIKELY( !(descr & (TREE_DESCR_32_0 << toff)) )) {
5412 if (valid_value_is_above_me_32(descr, toff)) {
5413 /* We can't indiscriminately write on the w32 node as in the
5414 w64 case, as that might make the node inconsistent with
5415 its parent. So first, pull down to this level. */
5416 SVal* tree = &cl->svals[tno << 3];
5417 cl->descrs[tno] = pulldown_to_32(tree, toff, descr);
5418 if (CHECK_ZSM)
5419 tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5420 } else {
5421 /* Writing at this level. Need to fix up 'descr'. */
5422 cl->descrs[tno] = pullup_descr_to_32(descr, toff);
5423          /* At this point, the tree does not match cl->descrs[tno] any
5424 more. The assignments below will fix it up. */
5427 tl_assert(svNew != SVal_INVALID);
5428 cl->svals[cloff + 0] = svNew;
5429 cl->svals[cloff + 1] = SVal_INVALID;
5430 cl->svals[cloff + 2] = SVal_INVALID;
5431 cl->svals[cloff + 3] = SVal_INVALID;
5432 return;
5433 slowcase: /* misaligned */
5434 stats__cline_32to16splits++;
5435 zsm_swrite16( a + 0, svNew );
5436 zsm_swrite16( a + 2, svNew );
5439 /*--------------- ZSM accesses: 64 bit swrite --------------- */
5441 static
5442 void zsm_swrite64 ( Addr a, SVal svNew ) {
5443 CacheLine* cl;
5444 UWord cloff, tno;
5445 //UWord toff;
5446 stats__cline_swrite64s++;
5447 if (UNLIKELY(!aligned64(a))) goto slowcase;
5448 cl = get_cacheline(a);
5449 cloff = get_cacheline_offset(a);
5450 tno = get_treeno(a);
5451 //toff = get_tree_offset(a); /* == 0, unused */
5452 cl->descrs[tno] = TREE_DESCR_64;
5453 tl_assert(svNew != SVal_INVALID);
5454 cl->svals[cloff + 0] = svNew;
5455 cl->svals[cloff + 1] = SVal_INVALID;
5456 cl->svals[cloff + 2] = SVal_INVALID;
5457 cl->svals[cloff + 3] = SVal_INVALID;
5458 cl->svals[cloff + 4] = SVal_INVALID;
5459 cl->svals[cloff + 5] = SVal_INVALID;
5460 cl->svals[cloff + 6] = SVal_INVALID;
5461 cl->svals[cloff + 7] = SVal_INVALID;
5462 return;
5463 slowcase: /* misaligned */
5464 stats__cline_64to32splits++;
5465 zsm_swrite32( a + 0, svNew );
5466 zsm_swrite32( a + 4, svNew );
5469 /*------------- ZSM accesses: 8 bit sread/scopy ------------- */
5471 static
5472 SVal zsm_sread08 ( Addr a ) {
5473 CacheLine* cl;
5474 UWord cloff, tno, toff;
5475 UShort descr;
5476 stats__cline_sread08s++;
5477 cl = get_cacheline(a);
5478 cloff = get_cacheline_offset(a);
5479 tno = get_treeno(a);
5480 toff = get_tree_offset(a); /* == 0 .. 7 */
5481 descr = cl->descrs[tno];
5482 if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
5483 SVal* tree = &cl->svals[tno << 3];
5484 cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
5486 return cl->svals[cloff];
5489 static void zsm_scopy08 ( Addr src, Addr dst, Bool uu_normalise ) {
5490 SVal sv;
5491 stats__cline_scopy08s++;
5492 sv = zsm_sread08( src );
5493 zsm_swrite08( dst, sv );
5497 /* Block-copy states (needed for implementing realloc()). Note this
5498 doesn't change the filtering arrangements. The caller of
5499 zsm_scopy_range needs to attend to that. */
5501 static void zsm_scopy_range ( Addr src, Addr dst, SizeT len )
5503 SizeT i;
5504 if (len == 0)
5505 return;
5507 /* assert for non-overlappingness */
5508 tl_assert(src+len <= dst || dst+len <= src);
5510 /* To be simple, just copy byte by byte. But so as not to wreck
5511 performance for later accesses to dst[0 .. len-1], normalise
5512       performance for later accesses to dst[0 .. len-1], normalise
5513       destination lines as we finish with them, and also normalise
      the lines containing the first and last addresses. */
5514 for (i = 0; i < len; i++) {
5515 Bool normalise
5516 = get_cacheline_offset( dst+i+1 ) == 0 /* last in line */
5517 || i == 0 /* first in range */
5518 || i == len-1; /* last in range */
5519 zsm_scopy08( src+i, dst+i, normalise );
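   /* A worked example of the normalisation policy above (figures
      assume a 64-byte line, i.e. N_LINE_ARANGE == 64): when copying
      100 bytes to a line-aligned dst, 'normalise' is True at i == 0
      (first in range), i == 63 (dst+i is the last byte of its line)
      and i == 99 (last in range).  As the uu_ prefix suggests,
      zsm_scopy08 does not currently act on the flag. */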
5524 /* For setting address ranges to a given value. Has considerable
5525 sophistication so as to avoid generating large numbers of pointless
5526 cache loads/writebacks for large ranges. */
5528 /* Do small ranges in-cache, in the obvious way. */
5529 static
5530 void zsm_sset_range_SMALL ( Addr a, SizeT len, SVal svNew )
5532 /* fast track a couple of common cases */
5533 if (len == 4 && aligned32(a)) {
5534 zsm_swrite32( a, svNew );
5535 return;
5537 if (len == 8 && aligned64(a)) {
5538 zsm_swrite64( a, svNew );
5539 return;
5542 /* be completely general (but as efficient as possible) */
5543 if (len == 0) return;
5545 if (!aligned16(a) && len >= 1) {
5546 zsm_swrite08( a, svNew );
5547 a += 1;
5548 len -= 1;
5549 tl_assert(aligned16(a));
5551 if (len == 0) return;
5553 if (!aligned32(a) && len >= 2) {
5554 zsm_swrite16( a, svNew );
5555 a += 2;
5556 len -= 2;
5557 tl_assert(aligned32(a));
5559 if (len == 0) return;
5561 if (!aligned64(a) && len >= 4) {
5562 zsm_swrite32( a, svNew );
5563 a += 4;
5564 len -= 4;
5565 tl_assert(aligned64(a));
5567 if (len == 0) return;
5569 if (len >= 8) {
5570 tl_assert(aligned64(a));
5571 while (len >= 8) {
5572 zsm_swrite64( a, svNew );
5573 a += 8;
5574 len -= 8;
5576 tl_assert(aligned64(a));
5578 if (len == 0) return;
5580 if (len >= 4)
5581 tl_assert(aligned32(a));
5582 if (len >= 4) {
5583 zsm_swrite32( a, svNew );
5584 a += 4;
5585 len -= 4;
5587 if (len == 0) return;
5589 if (len >= 2)
5590 tl_assert(aligned16(a));
5591 if (len >= 2) {
5592 zsm_swrite16( a, svNew );
5593 a += 2;
5594 len -= 2;
5596 if (len == 0) return;
5598 if (len >= 1) {
5599 zsm_swrite08( a, svNew );
5600 //a += 1;
5601 len -= 1;
5603 tl_assert(len == 0);
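   /* As an illustration of the alignment ladder above: a call with
      a == 0x1003 and len == 13 does one 8-bit write at 0x1003 (after
      which 'a' is 2- and, here, also 4-aligned), skips the 16-bit
      step, does one 32-bit write at 0x1004 and one 64-bit write at
      0x1008, leaving len == 0. */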
5607 /* If we're doing a small range, hand off to zsm_sset_range_SMALL. But
5608 for larger ranges, try to operate directly on the out-of-cache
5609 representation, rather than dragging lines into the cache,
5610 overwriting them, and forcing them out. This turns out to be an
5611 important performance optimisation.
5613 Note that this doesn't change the filtering arrangements. The
5614 caller of zsm_sset_range needs to attend to that. */
5616 static void zsm_sset_range ( Addr a, SizeT len, SVal svNew )
5618 tl_assert(svNew != SVal_INVALID);
5619 stats__cache_make_New_arange += (ULong)len;
5621 if (0 && len > 500)
5622 VG_(printf)("make New ( %#lx, %ld )\n", a, len );
5624 if (0) {
5625 static UWord n_New_in_cache = 0;
5626 static UWord n_New_not_in_cache = 0;
5627 /* tag is 'a' with the in-line offset masked out,
5628 eg a[31]..a[4] 0000 */
5629 Addr tag = a & ~(N_LINE_ARANGE - 1);
5630 UWord wix = (a >> N_LINE_BITS) & (N_WAY_NENT - 1);
5631 if (LIKELY(tag == cache_shmem.tags0[wix])) {
5632 n_New_in_cache++;
5633 } else {
5634 n_New_not_in_cache++;
5636 if (0 == ((n_New_in_cache + n_New_not_in_cache) % 100000))
5637 VG_(printf)("shadow_mem_make_New: IN %lu OUT %lu\n",
5638 n_New_in_cache, n_New_not_in_cache );
5641 if (LIKELY(len < 2 * N_LINE_ARANGE)) {
5642 zsm_sset_range_SMALL( a, len, svNew );
5643 } else {
5644 Addr before_start = a;
5645 Addr aligned_start = cacheline_ROUNDUP(a);
5646 Addr after_start = cacheline_ROUNDDN(a + len);
5647 UWord before_len = aligned_start - before_start;
5648 UWord aligned_len = after_start - aligned_start;
5649 UWord after_len = a + len - after_start;
5650 tl_assert(before_start <= aligned_start);
5651 tl_assert(aligned_start <= after_start);
5652 tl_assert(before_len < N_LINE_ARANGE);
5653 tl_assert(after_len < N_LINE_ARANGE);
5654 tl_assert(get_cacheline_offset(aligned_start) == 0);
5655 if (get_cacheline_offset(a) == 0) {
5656 tl_assert(before_len == 0);
5657 tl_assert(a == aligned_start);
5659 if (get_cacheline_offset(a+len) == 0) {
5660 tl_assert(after_len == 0);
5661 tl_assert(after_start == a+len);
5663 if (before_len > 0) {
5664 zsm_sset_range_SMALL( before_start, before_len, svNew );
5666 if (after_len > 0) {
5667 zsm_sset_range_SMALL( after_start, after_len, svNew );
5669 stats__cache_make_New_inZrep += (ULong)aligned_len;
5671 while (1) {
5672 Addr tag;
5673 UWord wix;
5674 if (aligned_start >= after_start)
5675 break;
5676 tl_assert(get_cacheline_offset(aligned_start) == 0);
5677 tag = aligned_start & ~(N_LINE_ARANGE - 1);
5678 wix = (aligned_start >> N_LINE_BITS) & (N_WAY_NENT - 1);
5679 if (tag == cache_shmem.tags0[wix]) {
5680 UWord i;
5681 for (i = 0; i < N_LINE_ARANGE / 8; i++)
5682 zsm_swrite64( aligned_start + i * 8, svNew );
5683 } else {
5684 UWord i;
5685 Word zix;
5686 SecMap* sm;
5687 LineZ* lineZ;
5688 /* This line is not in the cache. Do not force it in; instead
5689 modify it in-place. */
5690 /* find the Z line to write in and rcdec it or the
5691 associated F line. */
5692 find_Z_for_writing( &sm, &zix, tag );
5693 tl_assert(sm);
5694 tl_assert(zix >= 0 && zix < N_SECMAP_ZLINES);
5695 lineZ = &sm->linesZ[zix];
5696 lineZ->dict[0] = svNew;
5697 lineZ->dict[1] = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
5698 for (i = 0; i < N_LINE_ARANGE/4; i++)
5699 lineZ->ix2s[i] = 0; /* all refer to dict[0] */
5700 rcinc_LineZ(lineZ);
5702 aligned_start += N_LINE_ARANGE;
5703 aligned_len -= N_LINE_ARANGE;
5705 tl_assert(aligned_start == after_start);
5706 tl_assert(aligned_len == 0);
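   /* Worked example of the splitting above (figures assume
      N_LINE_ARANGE == 64): for a == 0x10030 and len == 0x150 we get
      aligned_start == 0x10040 and after_start == 0x10180, hence
      before_len == 0x10, aligned_len == 0x140 (five whole lines,
      handled by the loop) and after_len == 0.  The 16-byte 'before'
      piece goes through zsm_sset_range_SMALL (there is no 'after'
      piece in this case); the whole lines are written either in-cache
      or directly into their LineZ representation. */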
5711 /////////////////////////////////////////////////////////
5712 // //
5713 // Front-filtering accesses //
5714 // //
5715 /////////////////////////////////////////////////////////
5717 static UWord stats__f_ac = 0;
5718 static UWord stats__f_sk = 0;
5720 #if 0
5721 # define STATS__F_SHOW \
5722 do { \
5723 if (UNLIKELY(0 == (stats__f_ac & 0xFFFFFF))) \
5724 VG_(printf)("filters: ac %lu sk %lu\n", \
5725 stats__f_ac, stats__f_sk); \
5726 } while (0)
5727 #else
5728 # define STATS__F_SHOW /* */
5729 #endif
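/* Each of the _f_ ("filtered") wrappers below first consults the
   calling thread's Filter.  If the filter says the access is
   redundant (already covered since the filter was last cleared), the
   access is skipped and counted in stats__f_sk; otherwise it falls
   through to the corresponding unfiltered zsm_sapply handler above. */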
5731 void zsm_sapply08_f__msmcwrite ( Thr* thr, Addr a ) {
5732 stats__f_ac++;
5733 STATS__F_SHOW;
5734 if (LIKELY(Filter__ok_to_skip_cwr08(thr->filter, a))) {
5735 stats__f_sk++;
5736 return;
5738 zsm_sapply08__msmcwrite(thr, a);
5741 void zsm_sapply16_f__msmcwrite ( Thr* thr, Addr a ) {
5742 stats__f_ac++;
5743 STATS__F_SHOW;
5744 if (LIKELY(Filter__ok_to_skip_cwr16(thr->filter, a))) {
5745 stats__f_sk++;
5746 return;
5748 zsm_sapply16__msmcwrite(thr, a);
5751 void zsm_sapply32_f__msmcwrite ( Thr* thr, Addr a ) {
5752 stats__f_ac++;
5753 STATS__F_SHOW;
5754 if (LIKELY(Filter__ok_to_skip_cwr32(thr->filter, a))) {
5755 stats__f_sk++;
5756 return;
5758 zsm_sapply32__msmcwrite(thr, a);
5761 void zsm_sapply64_f__msmcwrite ( Thr* thr, Addr a ) {
5762 stats__f_ac++;
5763 STATS__F_SHOW;
5764 if (LIKELY(Filter__ok_to_skip_cwr64(thr->filter, a))) {
5765 stats__f_sk++;
5766 return;
5768 zsm_sapply64__msmcwrite(thr, a);
5771 void zsm_sapplyNN_f__msmcwrite ( Thr* thr, Addr a, SizeT len )
5773 /* fast track a couple of common cases */
5774 if (len == 4 && aligned32(a)) {
5775 zsm_sapply32_f__msmcwrite( thr, a );
5776 return;
5778 if (len == 8 && aligned64(a)) {
5779 zsm_sapply64_f__msmcwrite( thr, a );
5780 return;
5783 /* be completely general (but as efficient as possible) */
5784 if (len == 0) return;
5786 if (!aligned16(a) && len >= 1) {
5787 zsm_sapply08_f__msmcwrite( thr, a );
5788 a += 1;
5789 len -= 1;
5790 tl_assert(aligned16(a));
5792 if (len == 0) return;
5794 if (!aligned32(a) && len >= 2) {
5795 zsm_sapply16_f__msmcwrite( thr, a );
5796 a += 2;
5797 len -= 2;
5798 tl_assert(aligned32(a));
5800 if (len == 0) return;
5802 if (!aligned64(a) && len >= 4) {
5803 zsm_sapply32_f__msmcwrite( thr, a );
5804 a += 4;
5805 len -= 4;
5806 tl_assert(aligned64(a));
5808 if (len == 0) return;
5810 if (len >= 8) {
5811 tl_assert(aligned64(a));
5812 while (len >= 8) {
5813 zsm_sapply64_f__msmcwrite( thr, a );
5814 a += 8;
5815 len -= 8;
5817 tl_assert(aligned64(a));
5819 if (len == 0) return;
5821 if (len >= 4)
5822 tl_assert(aligned32(a));
5823 if (len >= 4) {
5824 zsm_sapply32_f__msmcwrite( thr, a );
5825 a += 4;
5826 len -= 4;
5828 if (len == 0) return;
5830 if (len >= 2)
5831 tl_assert(aligned16(a));
5832 if (len >= 2) {
5833 zsm_sapply16_f__msmcwrite( thr, a );
5834 a += 2;
5835 len -= 2;
5837 if (len == 0) return;
5839 if (len >= 1) {
5840 zsm_sapply08_f__msmcwrite( thr, a );
5841 //a += 1;
5842 len -= 1;
5844 tl_assert(len == 0);
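/* The decomposition above mirrors zsm_sset_range_SMALL: peel off 1-,
   2- and 4-byte pieces until the address is 8-aligned, stride through
   the bulk in 64-bit chunks, then mop up any 4-, 2- and 1-byte tail.
   zsm_sapplyNN_f__msmcread below follows the same pattern for reads. */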
5847 void zsm_sapply08_f__msmcread ( Thr* thr, Addr a ) {
5848 stats__f_ac++;
5849 STATS__F_SHOW;
5850 if (LIKELY(Filter__ok_to_skip_crd08(thr->filter, a))) {
5851 stats__f_sk++;
5852 return;
5854 zsm_sapply08__msmcread(thr, a);
5857 void zsm_sapply16_f__msmcread ( Thr* thr, Addr a ) {
5858 stats__f_ac++;
5859 STATS__F_SHOW;
5860 if (LIKELY(Filter__ok_to_skip_crd16(thr->filter, a))) {
5861 stats__f_sk++;
5862 return;
5864 zsm_sapply16__msmcread(thr, a);
5867 void zsm_sapply32_f__msmcread ( Thr* thr, Addr a ) {
5868 stats__f_ac++;
5869 STATS__F_SHOW;
5870 if (LIKELY(Filter__ok_to_skip_crd32(thr->filter, a))) {
5871 stats__f_sk++;
5872 return;
5874 zsm_sapply32__msmcread(thr, a);
5877 void zsm_sapply64_f__msmcread ( Thr* thr, Addr a ) {
5878 stats__f_ac++;
5879 STATS__F_SHOW;
5880 if (LIKELY(Filter__ok_to_skip_crd64(thr->filter, a))) {
5881 stats__f_sk++;
5882 return;
5884 zsm_sapply64__msmcread(thr, a);
5887 void zsm_sapplyNN_f__msmcread ( Thr* thr, Addr a, SizeT len )
5889 /* fast track a couple of common cases */
5890 if (len == 4 && aligned32(a)) {
5891 zsm_sapply32_f__msmcread( thr, a );
5892 return;
5894 if (len == 8 && aligned64(a)) {
5895 zsm_sapply64_f__msmcread( thr, a );
5896 return;
5899 /* be completely general (but as efficient as possible) */
5900 if (len == 0) return;
5902 if (!aligned16(a) && len >= 1) {
5903 zsm_sapply08_f__msmcread( thr, a );
5904 a += 1;
5905 len -= 1;
5906 tl_assert(aligned16(a));
5908 if (len == 0) return;
5910 if (!aligned32(a) && len >= 2) {
5911 zsm_sapply16_f__msmcread( thr, a );
5912 a += 2;
5913 len -= 2;
5914 tl_assert(aligned32(a));
5916 if (len == 0) return;
5918 if (!aligned64(a) && len >= 4) {
5919 zsm_sapply32_f__msmcread( thr, a );
5920 a += 4;
5921 len -= 4;
5922 tl_assert(aligned64(a));
5924 if (len == 0) return;
5926 if (len >= 8) {
5927 tl_assert(aligned64(a));
5928 while (len >= 8) {
5929 zsm_sapply64_f__msmcread( thr, a );
5930 a += 8;
5931 len -= 8;
5933 tl_assert(aligned64(a));
5935 if (len == 0) return;
5937 if (len >= 4)
5938 tl_assert(aligned32(a));
5939 if (len >= 4) {
5940 zsm_sapply32_f__msmcread( thr, a );
5941 a += 4;
5942 len -= 4;
5944 if (len == 0) return;
5946 if (len >= 2)
5947 tl_assert(aligned16(a));
5948 if (len >= 2) {
5949 zsm_sapply16_f__msmcread( thr, a );
5950 a += 2;
5951 len -= 2;
5953 if (len == 0) return;
5955 if (len >= 1) {
5956 zsm_sapply08_f__msmcread( thr, a );
5957 //a += 1;
5958 len -= 1;
5960 tl_assert(len == 0);
5963 void libhb_Thr_resumes ( Thr* thr )
5965 if (0) VG_(printf)("resume %p\n", thr);
5966 tl_assert(thr);
5967 tl_assert(!thr->llexit_done);
5968 Filter__clear(thr->filter, "libhb_Thr_resumes");
5969 /* A kludge, but .. if this thread doesn't have any marker stacks
5970 at all, get one right now. This is easier than figuring out
5971 exactly when at thread startup we can and can't take a stack
5972 snapshot. */
5973 if (HG_(clo_history_level) == 1) {
5974 tl_assert(thr->local_Kws_n_stacks);
5975 if (VG_(sizeXA)( thr->local_Kws_n_stacks ) == 0)
5976 note_local_Kw_n_stack_for(thr);
5981 /////////////////////////////////////////////////////////
5982 // //
5983 // Synchronisation objects //
5984 // //
5985 /////////////////////////////////////////////////////////
5987 /* A doubly-linked list of all the SOs. */
5988 SO* admin_SO = NULL;
5990 static SO* SO__Alloc ( void )
5992 SO* so = HG_(zalloc)( "libhb.SO__Alloc.1", sizeof(SO) );
5993 so->viR = VtsID_INVALID;
5994 so->viW = VtsID_INVALID;
5995 so->magic = SO_MAGIC;
5996 /* Add to double linked list */
5997 if (admin_SO) {
5998 tl_assert(admin_SO->admin_prev == NULL);
5999 admin_SO->admin_prev = so;
6000 so->admin_next = admin_SO;
6001 } else {
6002 so->admin_next = NULL;
6004 so->admin_prev = NULL;
6005 admin_SO = so;
6006 /* */
6007 return so;
6010 static void SO__Dealloc ( SO* so )
6012 tl_assert(so);
6013 tl_assert(so->magic == SO_MAGIC);
6014 if (so->viR == VtsID_INVALID) {
6015 tl_assert(so->viW == VtsID_INVALID);
6016 } else {
6017 tl_assert(so->viW != VtsID_INVALID);
6018 VtsID__rcdec(so->viR);
6019 VtsID__rcdec(so->viW);
6021 so->magic = 0;
6022 /* Del from double linked list */
6023 if (so->admin_prev)
6024 so->admin_prev->admin_next = so->admin_next;
6025 if (so->admin_next)
6026 so->admin_next->admin_prev = so->admin_prev;
6027 if (so == admin_SO)
6028 admin_SO = so->admin_next;
6029 /* */
6030 HG_(free)( so );
6034 /////////////////////////////////////////////////////////
6035 // //
6036 // Top Level API //
6037 // //
6038 /////////////////////////////////////////////////////////
6040 static void show_thread_state ( const HChar* str, Thr* t )
6042 if (1) return;
6043 if (t->viR == t->viW) {
6044 VG_(printf)("thr \"%s\" %p has vi* %u==", str, t, t->viR );
6045 VtsID__pp( t->viR );
6046 VG_(printf)("%s","\n");
6047 } else {
6048 VG_(printf)("thr \"%s\" %p has viR %u==", str, t, t->viR );
6049 VtsID__pp( t->viR );
6050 VG_(printf)(" viW %u==", t->viW);
6051 VtsID__pp( t->viW );
6052 VG_(printf)("%s","\n");
6057 Thr* libhb_init (
6058 void (*get_stacktrace)( Thr*, Addr*, UWord ),
6059 ExeContext* (*get_EC)( Thr* )
6062 Thr* thr;
6063 VtsID vi;
6065    // We will have to store a large number of these,
6066 // so make sure they're the size we expect them to be.
6067 tl_assert(sizeof(ScalarTS) == 8);
6069    /* because the first 1024 ThrIDs are unusable */
6070 tl_assert(SCALARTS_N_THRBITS >= 11);
6071 /* so as to fit in a UInt w/ 3 bits to spare (see defn of
6072 Thr_n_RCEC). */
6073 tl_assert(SCALARTS_N_THRBITS <= 29);
6075 /* Need to be sure that Thr_n_RCEC is 2 words (64-bit) or 3 words
6076 (32-bit). It's not correctness-critical, but there are a lot of
6077 them, so it's important from a space viewpoint. Unfortunately
6078 we simply can't pack it into 2 words on a 32-bit target. */
6079 if (sizeof(UWord) == 8) {
6080 tl_assert(sizeof(Thr_n_RCEC) == 16);
6081 } else {
6082 tl_assert(sizeof(Thr_n_RCEC) == 12);
6085 /* Word sets really are 32 bits. Even on a 64 bit target. */
6086 tl_assert(sizeof(WordSetID) == 4);
6087 tl_assert(sizeof(WordSet) == sizeof(WordSetID));
6089 tl_assert(get_stacktrace);
6090 tl_assert(get_EC);
6091 main_get_stacktrace = get_stacktrace;
6092 main_get_EC = get_EC;
6094 // No need to initialise hg_wordfm.
6095 // No need to initialise hg_wordset.
6097 /* Allocated once and never deallocated. Used as a temporary in
6098 VTS singleton, tick and join operations. */
6099 temp_max_sized_VTS = VTS__new( "libhb.libhb_init.1", ThrID_MAX_VALID );
6100 temp_max_sized_VTS->id = VtsID_INVALID;
6101 verydead_thread_table_init();
6102 vts_set_init();
6103 vts_tab_init();
6104 event_map_init();
6105 VtsID__invalidate_caches();
6107 // initialise shadow memory
6108 zsm_init( SVal__rcinc, SVal__rcdec );
6110 thr = Thr__new();
6111 vi = VtsID__mk_Singleton( thr, 1 );
6112 thr->viR = vi;
6113 thr->viW = vi;
6114 VtsID__rcinc(thr->viR);
6115 VtsID__rcinc(thr->viW);
6117 show_thread_state(" root", thr);
6118 return thr;
6122 Thr* libhb_create ( Thr* parent )
6124 /* The child's VTSs are copies of the parent's VTSs, but ticked at
6125 the child's index. Since the child's index is guaranteed
6126 unique, it has never been seen before, so the implicit value
6127 before the tick is zero and after that is one. */
6128 Thr* child = Thr__new();
6130 child->viR = VtsID__tick( parent->viR, child );
6131 child->viW = VtsID__tick( parent->viW, child );
6132 Filter__clear(child->filter, "libhb_create(child)");
6133 VtsID__rcinc(child->viR);
6134 VtsID__rcinc(child->viW);
6135 /* We need to do note_local_Kw_n_stack_for( child ), but it's too
6136 early for that - it may not have a valid TId yet. So, let
6137 libhb_Thr_resumes pick it up the first time the thread runs. */
6139 tl_assert(VtsID__indexAt( child->viR, child ) == 1);
6140 tl_assert(VtsID__indexAt( child->viW, child ) == 1);
6142 /* and the parent has to move along too */
6143 VtsID__rcdec(parent->viR);
6144 VtsID__rcdec(parent->viW);
6145 parent->viR = VtsID__tick( parent->viR, parent );
6146 parent->viW = VtsID__tick( parent->viW, parent );
6147 Filter__clear(parent->filter, "libhb_create(parent)");
6148 VtsID__rcinc(parent->viR);
6149 VtsID__rcinc(parent->viW);
6150 note_local_Kw_n_stack_for( parent );
6152 show_thread_state(" child", child);
6153 show_thread_state("parent", parent);
6155 return child;
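/* Illustrative example of the above: if the parent P enters with
   viR == viW == { P:3, Q:5 }, the child C leaves libhb_create with
   viR == viW == { P:3, Q:5, C:1 } (ticked at C's index), and P itself
   moves on to { P:4, Q:5 } (ticked at its own index). */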
6158 /* Shut down the library, and print stats (in fact that's _all_
6159    this is for). */
6160 void libhb_shutdown ( Bool show_stats )
6162 if (show_stats) {
6163 VG_(printf)("%s","<<< BEGIN libhb stats >>>\n");
6164 VG_(printf)(" secmaps: %'10lu allocd (%'12lu g-a-range)\n",
6165 stats__secmaps_allocd,
6166 stats__secmap_ga_space_covered);
6167 VG_(printf)(" linesZ: %'10lu allocd (%'12lu bytes occupied)\n",
6168 stats__secmap_linesZ_allocd,
6169 stats__secmap_linesZ_bytes);
6170 VG_(printf)(" linesF: %'10lu allocd (%'12lu bytes occupied)\n",
6171 stats__secmap_linesF_allocd,
6172 stats__secmap_linesF_bytes);
6173 VG_(printf)(" secmaps: %'10lu iterator steppings\n",
6174 stats__secmap_iterator_steppings);
6175 VG_(printf)(" secmaps: %'10lu searches (%'12lu slow)\n",
6176 stats__secmaps_search, stats__secmaps_search_slow);
6178 VG_(printf)("%s","\n");
6179 VG_(printf)(" cache: %'lu totrefs (%'lu misses)\n",
6180 stats__cache_totrefs, stats__cache_totmisses );
6181 VG_(printf)(" cache: %'14lu Z-fetch, %'14lu F-fetch\n",
6182 stats__cache_Z_fetches, stats__cache_F_fetches );
6183 VG_(printf)(" cache: %'14lu Z-wback, %'14lu F-wback\n",
6184 stats__cache_Z_wbacks, stats__cache_F_wbacks );
6185 VG_(printf)(" cache: %'14lu invals, %'14lu flushes\n",
6186 stats__cache_invals, stats__cache_flushes );
6187 VG_(printf)(" cache: %'14llu arange_New %'14llu direct-to-Zreps\n",
6188 stats__cache_make_New_arange,
6189 stats__cache_make_New_inZrep);
6191 VG_(printf)("%s","\n");
6192 VG_(printf)(" cline: %'10lu normalises\n",
6193 stats__cline_normalises );
6194 VG_(printf)(" cline: c rds 8/4/2/1: %'13lu %'13lu %'13lu %'13lu\n",
6195 stats__cline_cread64s,
6196 stats__cline_cread32s,
6197 stats__cline_cread16s,
6198 stats__cline_cread08s );
6199 VG_(printf)(" cline: c wrs 8/4/2/1: %'13lu %'13lu %'13lu %'13lu\n",
6200 stats__cline_cwrite64s,
6201 stats__cline_cwrite32s,
6202 stats__cline_cwrite16s,
6203 stats__cline_cwrite08s );
6204 VG_(printf)(" cline: s wrs 8/4/2/1: %'13lu %'13lu %'13lu %'13lu\n",
6205 stats__cline_swrite64s,
6206 stats__cline_swrite32s,
6207 stats__cline_swrite16s,
6208 stats__cline_swrite08s );
6209 VG_(printf)(" cline: s rd1s %'lu, s copy1s %'lu\n",
6210 stats__cline_sread08s, stats__cline_scopy08s );
6211 VG_(printf)(" cline: splits: 8to4 %'12lu 4to2 %'12lu 2to1 %'12lu\n",
6212 stats__cline_64to32splits,
6213 stats__cline_32to16splits,
6214 stats__cline_16to8splits );
6215 VG_(printf)(" cline: pulldowns: 8to4 %'12lu 4to2 %'12lu 2to1 %'12lu\n",
6216 stats__cline_64to32pulldown,
6217 stats__cline_32to16pulldown,
6218 stats__cline_16to8pulldown );
6219 if (0)
6220          VG_(printf)(" cline: sizeof(LineZ) %ld, covers %ld bytes of arange\n",
6221 (Word)sizeof(LineZ), (Word)N_LINE_ARANGE);
6223 VG_(printf)("%s","\n");
6225 VG_(printf)(" libhb: %'13llu msmcread (%'llu dragovers)\n",
6226 stats__msmcread, stats__msmcread_change);
6227 VG_(printf)(" libhb: %'13llu msmcwrite (%'llu dragovers)\n",
6228 stats__msmcwrite, stats__msmcwrite_change);
6229 VG_(printf)(" libhb: %'13llu cmpLEQ queries (%'llu misses)\n",
6230 stats__cmpLEQ_queries, stats__cmpLEQ_misses);
6231 VG_(printf)(" libhb: %'13llu join2 queries (%'llu misses)\n",
6232 stats__join2_queries, stats__join2_misses);
6234 VG_(printf)("%s","\n");
6235 VG_(printf)( " libhb: VTSops: tick %'lu, join %'lu, cmpLEQ %'lu\n",
6236 stats__vts__tick, stats__vts__join, stats__vts__cmpLEQ );
6237 VG_(printf)( " libhb: VTSops: cmp_structural %'lu (%'lu slow)\n",
6238 stats__vts__cmp_structural, stats__vts__cmp_structural_slow );
6239 VG_(printf)( " libhb: VTSset: find__or__clone_and_add %'lu (%'lu allocd)\n",
6240 stats__vts_set__focaa, stats__vts_set__focaa_a );
6241 VG_(printf)( " libhb: VTSops: indexAt_SLOW %'lu\n",
6242 stats__vts__indexat_slow );
6244 VG_(printf)("%s","\n");
6245 VG_(printf)(
6246 " libhb: %ld entries in vts_table (approximately %lu bytes)\n",
6247 VG_(sizeXA)( vts_tab ), VG_(sizeXA)( vts_tab ) * sizeof(VtsTE)
6249 VG_(printf)( " libhb: %lu entries in vts_set\n",
6250 VG_(sizeFM)( vts_set ) );
6252 VG_(printf)("%s","\n");
6253 VG_(printf)( " libhb: ctxt__rcdec: 1=%lu(%lu eq), 2=%lu, 3=%lu\n",
6254 stats__ctxt_rcdec1, stats__ctxt_rcdec1_eq,
6255 stats__ctxt_rcdec2,
6256 stats__ctxt_rcdec3 );
6257 VG_(printf)( " libhb: ctxt__rcdec: calls %lu, discards %lu\n",
6258 stats__ctxt_rcdec_calls, stats__ctxt_rcdec_discards);
6259 VG_(printf)( " libhb: contextTab: %lu slots, %lu max ents\n",
6260 (UWord)N_RCEC_TAB,
6261 stats__ctxt_tab_curr );
6262 VG_(printf)( " libhb: contextTab: %lu queries, %lu cmps\n",
6263 stats__ctxt_tab_qs,
6264 stats__ctxt_tab_cmps );
6265 #if 0
6266 VG_(printf)("sizeof(AvlNode) = %lu\n", sizeof(AvlNode));
6267 VG_(printf)("sizeof(WordBag) = %lu\n", sizeof(WordBag));
6268 VG_(printf)("sizeof(MaybeWord) = %lu\n", sizeof(MaybeWord));
6269 VG_(printf)("sizeof(CacheLine) = %lu\n", sizeof(CacheLine));
6270 VG_(printf)("sizeof(LineZ) = %lu\n", sizeof(LineZ));
6271 VG_(printf)("sizeof(LineF) = %lu\n", sizeof(LineF));
6272 VG_(printf)("sizeof(SecMap) = %lu\n", sizeof(SecMap));
6273 VG_(printf)("sizeof(Cache) = %lu\n", sizeof(Cache));
6274 VG_(printf)("sizeof(SMCacheEnt) = %lu\n", sizeof(SMCacheEnt));
6275 VG_(printf)("sizeof(CountedSVal) = %lu\n", sizeof(CountedSVal));
6276 VG_(printf)("sizeof(VTS) = %lu\n", sizeof(VTS));
6277 VG_(printf)("sizeof(ScalarTS) = %lu\n", sizeof(ScalarTS));
6278 VG_(printf)("sizeof(VtsTE) = %lu\n", sizeof(VtsTE));
6279 VG_(printf)("sizeof(MSMInfo) = %lu\n", sizeof(MSMInfo));
6281 VG_(printf)("sizeof(struct _XArray) = %lu\n", sizeof(struct _XArray));
6282 VG_(printf)("sizeof(struct _WordFM) = %lu\n", sizeof(struct _WordFM));
6283 VG_(printf)("sizeof(struct _Thr) = %lu\n", sizeof(struct _Thr));
6284 VG_(printf)("sizeof(struct _SO) = %lu\n", sizeof(struct _SO));
6285 #endif
6287 VG_(printf)("%s","<<< END libhb stats >>>\n");
6288 VG_(printf)("%s","\n");
6293 /* Receive notification that a thread has low-level exited.  The
6294 significance here is that we do not expect to see any more memory
6295 references from it. */
6296 void libhb_async_exit ( Thr* thr )
6298 tl_assert(thr);
6299 tl_assert(!thr->llexit_done);
6300 thr->llexit_done = True;
6302 /* free up Filter and local_Kws_n_stacks (well, actually not the
6303 latter ..) */
6304 tl_assert(thr->filter);
6305 HG_(free)(thr->filter);
6306 thr->filter = NULL;
6308 /* Tell the VTS mechanism this thread has exited, so it can
6309 participate in VTS pruning. Note this can only happen if the
6310 thread has both ll_exited and has been joined with. */
6311 if (thr->joinedwith_done)
6312 VTS__declare_thread_very_dead(thr);
6314 /* Another space-accuracy tradeoff. Do we want to be able to show
6315 H1 history for conflicts in threads which have since exited? If
6316 yes, then we better not free up thr->local_Kws_n_stacks. The
6317 downside is a potential per-thread leak of up to
6318 N_KWs_N_STACKs_PER_THREAD * sizeof(ULong_n_EC) * whatever the
6319 XArray average overcommit factor is (1.5 I'd guess). */
6320 // hence:
6321 // VG_(deleteXA)(thr->local_Kws_n_stacks);
6322 // thr->local_Kws_n_stacks = NULL;
6325 /* Receive notification that a thread has been joined with. The
6326 significance here is that we do not expect to see any further
6327 references to its vector clocks (Thr::viR and Thr::viW). */
6328 void libhb_joinedwith_done ( Thr* thr )
6330 tl_assert(thr);
6331 /* Caller must ensure that this is only ever called once per Thr. */
6332 tl_assert(!thr->joinedwith_done);
6333 thr->joinedwith_done = True;
6334 if (thr->llexit_done)
6335 VTS__declare_thread_very_dead(thr);
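/* Taken together with libhb_async_exit above, this means
   VTS__declare_thread_very_dead runs at most once per thread, from
   whichever of the two notifications (low-level exit, joined-with)
   arrives second, since it requires both flags to be set. */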
6339 /* Both Segs and SOs point to VTSs. However, there is no sharing, so
6340 a Seg that points at a VTS is its one-and-only owner, and ditto for
6341 a SO that points at a VTS. */
6343 SO* libhb_so_alloc ( void )
6345 return SO__Alloc();
6348 void libhb_so_dealloc ( SO* so )
6350 tl_assert(so);
6351 tl_assert(so->magic == SO_MAGIC);
6352 SO__Dealloc(so);
6355 /* See comments in libhb.h for details on the meaning of
6356 strong vs weak sends and strong vs weak receives. */
6357 void libhb_so_send ( Thr* thr, SO* so, Bool strong_send )
6359 /* Copy the VTSs from 'thr' into the sync object, and then move
6360 the thread along one step. */
6362 tl_assert(so);
6363 tl_assert(so->magic == SO_MAGIC);
6365 /* stay sane .. a thread's read-clock must always lead or be the
6366 same as its write-clock */
6367 { Bool leq = VtsID__cmpLEQ(thr->viW, thr->viR);
6368 tl_assert(leq);
6371 /* since we're overwriting the VtsIDs in the SO, we need to drop
6372 any references made by the previous contents thereof */
6373 if (so->viR == VtsID_INVALID) {
6374 tl_assert(so->viW == VtsID_INVALID);
6375 so->viR = thr->viR;
6376 so->viW = thr->viW;
6377 VtsID__rcinc(so->viR);
6378 VtsID__rcinc(so->viW);
6379 } else {
6380 /* In a strong send, we dump any previous VC in the SO and
6381 install the sending thread's VC instead. For a weak send we
6382 must join2 with what's already there. */
6383 tl_assert(so->viW != VtsID_INVALID);
6384 VtsID__rcdec(so->viR);
6385 VtsID__rcdec(so->viW);
6386 so->viR = strong_send ? thr->viR : VtsID__join2( so->viR, thr->viR );
6387 so->viW = strong_send ? thr->viW : VtsID__join2( so->viW, thr->viW );
6388 VtsID__rcinc(so->viR);
6389 VtsID__rcinc(so->viW);
6392 /* move both parent clocks along */
6393 VtsID__rcdec(thr->viR);
6394 VtsID__rcdec(thr->viW);
6395 thr->viR = VtsID__tick( thr->viR, thr );
6396 thr->viW = VtsID__tick( thr->viW, thr );
6397 if (!thr->llexit_done) {
6398 Filter__clear(thr->filter, "libhb_so_send");
6399 note_local_Kw_n_stack_for(thr);
6401 VtsID__rcinc(thr->viR);
6402 VtsID__rcinc(thr->viW);
6404 if (strong_send)
6405 show_thread_state("s-send", thr);
6406 else
6407 show_thread_state("w-send", thr);
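/* Worked example, assuming VtsID__join2 computes the pointwise
   maximum of two clocks: if so->viR == { T1:4, T2:2 } and the
   sender's viR == { T1:1, T2:7 }, a weak send leaves
   so->viR == { T1:4, T2:7 }, whereas a strong send simply overwrites
   it with { T1:1, T2:7 }.  Either way the sender then ticks its own
   clocks. */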
6410 void libhb_so_recv ( Thr* thr, SO* so, Bool strong_recv )
6412 tl_assert(so);
6413 tl_assert(so->magic == SO_MAGIC);
6415 if (so->viR != VtsID_INVALID) {
6416 tl_assert(so->viW != VtsID_INVALID);
6418 /* Weak receive (basically, an R-acquisition of a R-W lock).
6419 This advances the read-clock of the receiver, but not the
6420 write-clock. */
6421 VtsID__rcdec(thr->viR);
6422 thr->viR = VtsID__join2( thr->viR, so->viR );
6423 VtsID__rcinc(thr->viR);
6425 /* At one point (r10589) it seemed safest to tick the clocks for
6426 the receiving thread after the join. But on reflection, I
6427 wonder if that might cause it to 'overtake' constraints,
6428 which could lead to missing races. So, back out that part of
6429 r10589. */
6430 //VtsID__rcdec(thr->viR);
6431 //thr->viR = VtsID__tick( thr->viR, thr );
6432 //VtsID__rcinc(thr->viR);
6434 /* For a strong receive, we also advance the receiver's write
6435 clock, which means the receive as a whole is essentially
6436 equivalent to a W-acquisition of a R-W lock. */
6437 if (strong_recv) {
6438 VtsID__rcdec(thr->viW);
6439 thr->viW = VtsID__join2( thr->viW, so->viW );
6440 VtsID__rcinc(thr->viW);
6442 /* See comment just above, re r10589. */
6443 //VtsID__rcdec(thr->viW);
6444 //thr->viW = VtsID__tick( thr->viW, thr );
6445 //VtsID__rcinc(thr->viW);
6448 if (thr->filter)
6449 Filter__clear(thr->filter, "libhb_so_recv");
6450 note_local_Kw_n_stack_for(thr);
6452 if (strong_recv)
6453 show_thread_state("s-recv", thr);
6454 else
6455 show_thread_state("w-recv", thr);
6457 } else {
6458 tl_assert(so->viW == VtsID_INVALID);
6459 /* Deal with degenerate case: 'so' has no vts, so there has been
6460 no message posted to it. Just ignore this case. */
6461 show_thread_state("d-recv", thr);
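/* Continuing the example from libhb_so_send: a weak receive joins
   only the receiver's viR with so->viR, ordering the receiver after
   the send for reads only (an R-acquisition), while a strong receive
   additionally joins viW with so->viW, making it behave like a
   W-acquisition of a reader-writer lock. */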
6465 Bool libhb_so_everSent ( SO* so )
6467 if (so->viR == VtsID_INVALID) {
6468 tl_assert(so->viW == VtsID_INVALID);
6469 return False;
6470 } else {
6471 tl_assert(so->viW != VtsID_INVALID);
6472 return True;
6476 #define XXX1 0 // 0x67a106c
6477 #define XXX2 0
6479 static inline Bool TRACEME(Addr a, SizeT szB) {
6480 if (XXX1 && a <= XXX1 && XXX1 <= a+szB) return True;
6481 if (XXX2 && a <= XXX2 && XXX2 <= a+szB) return True;
6482 return False;
6484 static void trace ( Thr* thr, Addr a, SizeT szB, const HChar* s )
6486 SVal sv = zsm_sread08(a);
6487 VG_(printf)("thr %p (%#lx,%lu) %s: 0x%016llx ", thr,a,szB,s,sv);
6488 show_thread_state("", thr);
6489 VG_(printf)("%s","\n");
6492 void libhb_srange_new ( Thr* thr, Addr a, SizeT szB )
6494 SVal sv = SVal__mkC(thr->viW, thr->viW);
6495 tl_assert(is_sane_SVal_C(sv));
6496 if (0 && TRACEME(a,szB)) trace(thr,a,szB,"nw-before");
6497 zsm_sset_range( a, szB, sv );
6498 Filter__clear_range( thr->filter, a, szB );
6499 if (0 && TRACEME(a,szB)) trace(thr,a,szB,"nw-after ");
6502 void libhb_srange_noaccess_NoFX ( Thr* thr, Addr a, SizeT szB )
6504 /* do nothing */
6507 void libhb_srange_noaccess_AHAE ( Thr* thr, Addr a, SizeT szB )
6509 /* This really does put the requested range in NoAccess. It's
6510 expensive though. */
6511 SVal sv = SVal_NOACCESS;
6512 tl_assert(is_sane_SVal_C(sv));
6513 zsm_sset_range( a, szB, sv );
6514 Filter__clear_range( thr->filter, a, szB );
6517 void libhb_srange_untrack ( Thr* thr, Addr a, SizeT szB )
6519 SVal sv = SVal_NOACCESS;
6520 tl_assert(is_sane_SVal_C(sv));
6521 if (0 && TRACEME(a,szB)) trace(thr,a,szB,"untrack-before");
6522 zsm_sset_range( a, szB, sv );
6523 Filter__clear_range( thr->filter, a, szB );
6524 if (0 && TRACEME(a,szB)) trace(thr,a,szB,"untrack-after ");
6527 Thread* libhb_get_Thr_hgthread ( Thr* thr ) {
6528 tl_assert(thr);
6529 return thr->hgthread;
6532 void libhb_set_Thr_hgthread ( Thr* thr, Thread* hgthread ) {
6533 tl_assert(thr);
6534 thr->hgthread = hgthread;
6537 void libhb_copy_shadow_state ( Thr* thr, Addr src, Addr dst, SizeT len )
6539 zsm_scopy_range(src, dst, len);
6540 Filter__clear_range( thr->filter, dst, len );
6543 void libhb_maybe_GC ( void )
6545 event_map_maybe_GC();
6546 /* If there are still freelist entries available, no need for a
6547 GC. */
6548 if (vts_tab_freelist != VtsID_INVALID)
6549 return;
6550 /* So all the table entries are full, and we're having to expand
6551       the table.  But did we hit the threshold point yet? */
6552 if (VG_(sizeXA)( vts_tab ) < vts_next_GC_at)
6553 return;
6554 vts_tab__do_GC( False/*don't show stats*/ );
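/* In other words, a vts_tab GC is attempted only once the VtsID
   freelist is exhausted and the table has grown to at least
   vts_next_GC_at entries; event_map_maybe_GC is called
   unconditionally and makes its own decision. */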
6558 /////////////////////////////////////////////////////////////////
6559 /////////////////////////////////////////////////////////////////
6560 // //
6561 // SECTION END main library //
6562 // //
6563 /////////////////////////////////////////////////////////////////
6564 /////////////////////////////////////////////////////////////////
6566 /*--------------------------------------------------------------------*/
6567 /*--- end libhb_main.c ---*/
6568 /*--------------------------------------------------------------------*/