helgrind/libhb_core.c

   1
   2 /*--------------------------------------------------------------------*/
   3 /*--- LibHB: a library for implementing and checking               ---*/
   4 /*--- the happens-before relationship in concurrent programs.      ---*/
    5 /*---                                                 libhb_core.c ---*/
   6 /*--------------------------------------------------------------------*/
   7
   8 /*
   9    This file is part of LibHB, a library for implementing and checking
  10    the happens-before relationship in concurrent programs.
  11
  12    Copyright (C) 2008-2017 OpenWorks Ltd
  13       info@open-works.co.uk
  14
  15    This program is free software; you can redistribute it and/or
  16    modify it under the terms of the GNU General Public License as
  17    published by the Free Software Foundation; either version 2 of the
  18    License, or (at your option) any later version.
  19
  20    This program is distributed in the hope that it will be useful, but
  21    WITHOUT ANY WARRANTY; without even the implied warranty of
  22    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  23    General Public License for more details.
  24
  25    You should have received a copy of the GNU General Public License
  26    along with this program; if not, see <http://www.gnu.org/licenses/>.
  27
  28    The GNU General Public License is contained in the file COPYING.
  29 */
  30
  31 #include "pub_tool_basics.h"
  32 #include "pub_tool_poolalloc.h"
  33 #include "pub_tool_libcassert.h"
  34 #include "pub_tool_libcbase.h"
  35 #include "pub_tool_libcprint.h"
  36 #include "pub_tool_machine.h"
  37 #include "pub_tool_mallocfree.h"
  38 #include "pub_tool_wordfm.h"
  39 #include "pub_tool_hashtable.h"
  40 #include "pub_tool_xarray.h"
  41 #include "pub_tool_oset.h"
  42 #include "pub_tool_threadstate.h"
  43 #include "pub_tool_aspacemgr.h"
  44 #include "pub_tool_stacktrace.h"
  45 #include "pub_tool_execontext.h"
  46 #include "pub_tool_errormgr.h"
  47 #include "pub_tool_debuginfo.h"
  48 #include "pub_tool_gdbserver.h"
  49 #include "pub_tool_options.h"        // VG_(clo_stats)
  50 #include "hg_basics.h"
  51 #include "hg_wordset.h"
  52 #include "hg_lock_n_thread.h"
  53 #include "hg_errors.h"
  54
  55 #include "libhb.h"
  56
  57
  58 /////////////////////////////////////////////////////////////////
  59 /////////////////////////////////////////////////////////////////
  60 //                                                             //
  61 // Debugging #defines                                          //
  62 //                                                             //
  63 /////////////////////////////////////////////////////////////////
  64 /////////////////////////////////////////////////////////////////
  65
  66 /* Check the sanity of shadow values in the core memory state
  67    machine.  Change #if 0 to #if 1 to enable this. */
  68 #if 0
  69 #  define CHECK_MSM 1
  70 #else
  71 #  define CHECK_MSM 0
  72 #endif
  73
  74
  75 /* Check sanity (reference counts, etc) in the conflicting access
  76    machinery.  Change #if 0 to #if 1 to enable this. */
  77 #if 0
  78 #  define CHECK_CEM 1
  79 #else
  80 #  define CHECK_CEM 0
  81 #endif
  82
  83
  84 /* Check sanity in the compressed shadow memory machinery,
  85    particularly in its caching innards.  Unfortunately there's no
  86    almost-zero-cost way to make them selectable at run time.  Hence
  87    set the #if 0 to #if 1 and rebuild if you want them. */
  88 #if 0
  89 #  define CHECK_ZSM 1  /* do sanity-check CacheLine stuff */
  90 #  define inline __attribute__((noinline))
  91    /* probably want to ditch -fomit-frame-pointer too */
  92 #else
  93 #  define CHECK_ZSM 0   /* don't sanity-check CacheLine stuff */
  94 #endif
  95
  96 /* Define to 1 to activate tracing cached rcec. */
  97 #define DEBUG_CACHED_RCEC 0
  98
  99 /////////////////////////////////////////////////////////////////
 100 /////////////////////////////////////////////////////////////////
 101 //                                                             //
 102 // data decls: VtsID                                           //
 103 //                                                             //
 104 /////////////////////////////////////////////////////////////////
 105 /////////////////////////////////////////////////////////////////
 106
 107 /* VtsIDs: Unique small-integer IDs for VTSs.  VtsIDs can't exceed 30
 108    bits, since they have to be packed into the lowest 30 bits of an
 109    SVal. */
 110 typedef  UInt  VtsID;
 111 #define VtsID_INVALID 0xFFFFFFFF
 112
 113
 114
 115 /////////////////////////////////////////////////////////////////
 116 /////////////////////////////////////////////////////////////////
 117 //                                                             //
 118 // data decls: SVal                                            //
 119 //                                                             //
 120 /////////////////////////////////////////////////////////////////
 121 /////////////////////////////////////////////////////////////////
 122
 123 typedef  ULong  SVal;
 124
 125 /* This value has special significance to the implementation, and callers
 126    may not store it in the shadow memory. */
 127 #define SVal_INVALID (3ULL << 62)
 128
 129 /* This is the default value for shadow memory.  Initially the shadow
 130    memory contains no accessible areas and so all reads produce this
 131    value.  TODO: make this caller-defineable. */
 132 #define SVal_NOACCESS (2ULL << 62)
 133
 134
 135
 136 /////////////////////////////////////////////////////////////////
 137 /////////////////////////////////////////////////////////////////
 138 //                                                             //
 139 // data decls: ScalarTS                                        //
 140 //                                                             //
 141 /////////////////////////////////////////////////////////////////
 142 /////////////////////////////////////////////////////////////////
 143
 144 /* Scalar Timestamp.  We have to store a lot of these, so there is
 145    some effort to make them as small as possible.  Logically they are
 146    a pair, (Thr*, ULong), but that takes 16 bytes on a 64-bit target.
 147    We pack it into 64 bits by representing the Thr* using a ThrID, a
 148    small integer (18 bits), and a 46 bit integer for the timestamp
 149    number.  The 46/18 split is arbitrary, but has the effect that
 150    Helgrind can only handle programs that create 2^18 or fewer threads
 151    over their entire lifetime, and have no more than 2^46 timestamp
 152    ticks (synchronisation operations on the same thread).
 153
 154    This doesn't seem like much of a limitation.  2^46 ticks is
  155    7.04e+13, and if each tick (optimistically) takes the machine 1000
 156    cycles to process, then the minimum time to process that many ticks
 157    at a clock rate of 5 GHz is 162.9 days.  And that's doing nothing
 158    but VTS ticks, which isn't realistic.
 159
 160    NB1: SCALARTS_N_THRBITS must be 27 or lower.  The obvious limit is
 161    32 since a ThrID is a UInt.  27 comes from the fact that
 162    'Thr_n_RCEC', which records information about old accesses, packs
 163    in tsw not only a ThrID but also minimum 4+1 other bits (access size
 164    and writeness) in a UInt, hence limiting size to 32-(4+1) == 27.
 165
 166    NB2: thrid values are issued upwards from 1024, and values less
 167    than that aren't valid.  This isn't per se necessary (any order
 168    will do, so long as they are unique), but it does help ensure they
 169    are less likely to get confused with the various other kinds of
 170    small-integer thread ids drifting around (eg, TId).
 171    So, SCALARTS_N_THRBITS must be 11 or more.
 172    See also NB5.
 173
 174    NB3: this probably also relies on the fact that Thr's are never
 175    deallocated -- they exist forever.  Hence the 1-1 mapping from
 176    Thr's to thrid values (set up in Thr__new) persists forever.
 177
 178    NB4: temp_max_sized_VTS is allocated at startup and never freed.
  179    It is a maximum sized VTS, so has (1 << SCALARTS_N_THRBITS)
 180    ScalarTSs.  So we can't make SCALARTS_N_THRBITS too large without
 181    making the memory use for this go sky-high.  With
 182    SCALARTS_N_THRBITS at 18, it occupies 2MB of memory, which seems
 183    like an OK tradeoff.  If more than 256k threads need to be
 184    supported, we could change SCALARTS_N_THRBITS to 20, which would
 185    facilitate supporting 1 million threads at the cost of 8MB storage
 186    for temp_max_sized_VTS.
 187
 188    NB5: the conflicting-map mechanism (Thr_n_RCEC, specifically) uses
 189    ThrID == 0 to denote an empty Thr_n_RCEC record.  So ThrID == 0
 190    must never be a valid ThrID.  Given NB2 that's OK.
 191 */
 192 #define SCALARTS_N_THRBITS 18  /* valid range: 11 to 27 inclusive,
 193                                   See NB1 and NB2 above. */
 194
 195 #define SCALARTS_N_TYMBITS (64 - SCALARTS_N_THRBITS)
 196 typedef
 197    struct {
 198       ThrID thrid : SCALARTS_N_THRBITS;
 199       ULong tym   : SCALARTS_N_TYMBITS;
 200    }
 201    ScalarTS;
 202
 203 #define ThrID_MAX_VALID ((1 << SCALARTS_N_THRBITS) - 1)
 204
 205
 206
 207 /////////////////////////////////////////////////////////////////
 208 /////////////////////////////////////////////////////////////////
 209 //                                                             //
 210 // data decls: Filter                                          //
 211 //                                                             //
 212 /////////////////////////////////////////////////////////////////
 213 /////////////////////////////////////////////////////////////////
 214
 215 // baseline: 5, 9
 216 #define FI_LINE_SZB_LOG2  5
 217 #define FI_NUM_LINES_LOG2 10
 218
 219 #define FI_LINE_SZB       (1 << FI_LINE_SZB_LOG2)
 220 #define FI_NUM_LINES      (1 << FI_NUM_LINES_LOG2)
 221
 222 #define FI_TAG_MASK        (~(Addr)(FI_LINE_SZB - 1))
 223 #define FI_GET_TAG(_a)     ((_a) & FI_TAG_MASK)
 224
 225 #define FI_GET_LINENO(_a)  ( ((_a) >> FI_LINE_SZB_LOG2) \
 226                              & (Addr)(FI_NUM_LINES-1) )
 227
 228
 229 /* In the lines, each 8 bytes are treated individually, and are mapped
 230    to a UShort.  Regardless of endianness of the underlying machine,
 231    bits 1 and 0 pertain to the lowest address and bits 15 and 14 to
 232    the highest address.
 233
 234    Of each bit pair, the higher numbered bit is set if a R has been
 235    seen, so the actual layout is:
 236
 237    15 14             ...  01 00
 238
 239    R  W  for addr+7  ...  R  W  for addr+0
 240
 241    So a mask for the R-bits is 0xAAAA and for the W bits is 0x5555.
 242 */
 243
 244 /* tags are separated from lines.  tags are Addrs and are
 245    the base address of the line. */
 246 typedef
 247    struct {
 248       UShort u16s[FI_LINE_SZB / 8]; /* each UShort covers 8 bytes */
 249    }
 250    FiLine;
 251
 252 typedef
 253    struct {
 254       Addr   tags[FI_NUM_LINES];
 255       FiLine lines[FI_NUM_LINES];
 256    }
 257    Filter;
 258
 259
 260
 261 /////////////////////////////////////////////////////////////////
 262 /////////////////////////////////////////////////////////////////
 263 //                                                             //
 264 // data decls: Thr, ULong_n_EC                                 //
 265 //                                                             //
 266 /////////////////////////////////////////////////////////////////
 267 /////////////////////////////////////////////////////////////////
 268
 269 // Records stacks for H1 history mechanism (DRD-style)
 270 typedef
 271    struct { ULong ull; ExeContext* ec; }
 272    ULong_n_EC;
 273
 274
 275 /* How many of the above records to collect for each thread?  Older
 276    ones are dumped when we run out of space.  62.5k requires 1MB per
 277    thread, since each ULong_n_EC record is 16 bytes long.  When more
 278    than N_KWs_N_STACKs_PER_THREAD are present, the older half are
 279    deleted to make space.  Hence in the worst case we will be able to
 280    produce a stack at least for the last N_KWs_N_STACKs_PER_THREAD / 2
 281    Kw transitions (segments in this thread).  For the current setting
 282    that gives a guaranteed stack for at least the last 31.25k
 283    segments. */
 284 #define N_KWs_N_STACKs_PER_THREAD 62500
 285
 286
 287 #define N_FRAMES 8
 288 // (UInt) `echo "Reference Counted Execution Context" | md5sum`
 289 #define RCEC_MAGIC 0xab88abb2UL
 290
  291 /* RCEC usage is described in more detail in the section 'Change-event map2'
  292    later in this file. */
 293 typedef
 294    struct _RCEC {
 295       UWord magic;  /* sanity check only */
 296       struct _RCEC* next;
 297       UWord rc;
 298       UWord rcX; /* used for crosschecking */
 299       UWord frames_hash;          /* hash of all the frames */
 300       UWord frames[N_FRAMES];
 301    }
 302    RCEC;
 303
 304 struct _Thr {
 305    /* Current VTSs for this thread.  They change as we go along.  viR
 306       is the VTS to be used for reads, viW for writes.  Usually they
 307       are the same, but can differ when we deal with reader-writer
 308       locks.  It is always the case that
 309          VtsID__cmpLEQ(viW,viR) == True
 310       that is, viW must be the same, or lagging behind, viR. */
 311    VtsID viR;
 312    VtsID viW;
 313
 314    /* Is initially False, and is set to True after the thread really
 315       has done a low-level exit.  When True, we expect to never see
 316       any more memory references done by this thread. */
 317    Bool llexit_done;
 318
 319    /* Is initially False, and is set to True after the thread has been
 320       joined with (reaped by some other thread).  After this point, we
 321       do not expect to see any uses of .viR or .viW, so it is safe to
 322       set them to VtsID_INVALID. */
 323    Bool joinedwith_done;
 324
 325    /* A small integer giving a unique identity to this Thr.  See
 326       comments on the definition of ScalarTS for details. */
 327    ThrID thrid : SCALARTS_N_THRBITS;
 328
 329    /* A filter that removes references for which we believe that
 330       msmcread/msmcwrite will not change the state, nor report a
 331       race. */
 332    Filter* filter;
 333
 334    /* A pointer back to the top level Thread structure.  There is a
 335       1-1 mapping between Thread and Thr structures -- each Thr points
 336       at its corresponding Thread, and vice versa.  Really, Thr and
 337       Thread should be merged into a single structure. */
 338    Thread* hgthread;
 339
 340    /* cached_rcec maintains the last RCEC that was retrieved for this thread. */
 341    RCEC cached_rcec; // cached_rcec value, not ref-counted.
 342    /* The shadow register vex_shadow1 SP register (SP_s1) is used to maintain
 343       the validity of the cached rcec.
 344       If SP_s1 is 0, then the cached rcec is invalid (cannot be used).
  345       If SP_s1 is != 0, then the cached rcec is valid. The valid cached rcec
 346       can be used to generate a new RCEC by changing just the last frame. */
 347
 348    /* The ULongs (scalar Kws) in this accumulate in strictly
 349       increasing order, without duplicates.  This is important because
 350       we need to be able to find a given scalar Kw in this array
 351       later, by binary search. */
 352    XArray* /* ULong_n_EC */ local_Kws_n_stacks;
 353 };
 354
 355
 356
 357 /////////////////////////////////////////////////////////////////
 358 /////////////////////////////////////////////////////////////////
 359 //                                                             //
 360 // data decls: SO                                              //
 361 //                                                             //
 362 /////////////////////////////////////////////////////////////////
 363 /////////////////////////////////////////////////////////////////
 364
 365 // (UInt) `echo "Synchronisation object" | md5sum`
 366 #define SO_MAGIC 0x56b3c5b0U
 367
 368 struct _SO {
 369    struct _SO* admin_prev;
 370    struct _SO* admin_next;
 371    VtsID viR; /* r-clock of sender */
 372    VtsID viW; /* w-clock of sender */
 373    UInt  magic;
 374 };
 375
 376
 377
 378 /////////////////////////////////////////////////////////////////
 379 /////////////////////////////////////////////////////////////////
 380 //                                                             //
 381 // Forward declarations                                        //
 382 //                                                             //
 383 /////////////////////////////////////////////////////////////////
 384 /////////////////////////////////////////////////////////////////
 385
 386 /* fwds for
 387    Globals needed by other parts of the library.  These are set
 388    once at startup and then never changed. */
 389 static void        (*main_get_stacktrace)( Thr*, Addr*, UWord ) = NULL;
 390 static ExeContext* (*main_get_EC)( Thr* ) = NULL;
 391
 392 /* misc fn and data fwdses */
 393 static void VtsID__rcinc ( VtsID ii );
 394 static void VtsID__rcdec ( VtsID ii );
 395
 396 static inline Bool SVal__isC ( SVal s );
 397 static inline VtsID SVal__unC_Rmin ( SVal s );
 398 static inline VtsID SVal__unC_Wmin ( SVal s );
 399 static inline SVal SVal__mkC ( VtsID rmini, VtsID wmini );
 400 static inline void SVal__rcinc ( SVal s );
 401 static inline void SVal__rcdec ( SVal s );
 402 /* SVal in LineZ are used to store various pointers. */
 403 static inline void *SVal2Ptr (SVal s);
 404 static inline SVal Ptr2SVal (void* ptr);
 405
  406 /* A doubly linked list of all the SO's. */
 407 SO* admin_SO;
 408
 409
 410
 411 /////////////////////////////////////////////////////////////////
 412 /////////////////////////////////////////////////////////////////
 413 //                                                             //
 414 // SECTION BEGIN compressed shadow memory                      //
 415 //                                                             //
 416 /////////////////////////////////////////////////////////////////
 417 /////////////////////////////////////////////////////////////////
 418
 419 #ifndef __HB_ZSM_H
 420 #define __HB_ZSM_H
 421
 422 /* Initialise the library.  Once initialised, it will (or may) call
 423    SVal__rcinc and SVal__rcdec in response to all the calls below, in order to
 424    allow the user to do reference counting on the SVals stored herein.
 425    It is important to understand, however, that due to internal
 426    caching, the reference counts are in general inaccurate, and can be
 427    both above or below the true reference count for an item.  In
 428    particular, the library may indicate that the reference count for
 429    an item is zero, when in fact it is not.
 430
 431    To make the reference counting exact and therefore non-pointless,
 432    call zsm_flush_cache.  Immediately after it returns, the reference
 433    counts for all items, as deduced by the caller by observing calls
 434    to SVal__rcinc and SVal__rcdec, will be correct, and so any items with a
 435    zero reference count may be freed (or at least considered to be
 436    unreferenced by this library).
 437 */
 438 static void zsm_init ( void );
 439
 440 static void zsm_sset_range  ( Addr, SizeT, SVal );
 441 static void zsm_sset_range_SMALL ( Addr a, SizeT len, SVal svNew );
 442 static void zsm_scopy_range ( Addr, Addr, SizeT );
 443 static void zsm_flush_cache ( void );
 444
 445 #endif /* ! __HB_ZSM_H */
 446
 447
 448 /* Round a up to the next multiple of N.  N must be a power of 2 */
 449 #define ROUNDUP(a, N)   ((a + N - 1) & ~(N-1))
 450 /* Round a down to the next multiple of N.  N must be a power of 2 */
 451 #define ROUNDDN(a, N)   ((a) & ~(N-1))
 452
 453 /* True if a belongs in range [start, start + szB[
 454    (i.e. start + szB is excluded). */
 455 static inline Bool address_in_range (Addr a, Addr start,  SizeT szB)
 456 {
 457    /* Checking start <= a && a < start + szB.
 458       As start and a are unsigned addresses, the condition can
 459       be simplified. */
 460    if (CHECK_ZSM)
 461       tl_assert ((a - start < szB)
 462                  == (start <= a
 463                      &&       a < start + szB));
 464    return a - start < szB;
 465 }
 466
 467 /* ------ CacheLine ------ */
 468
 469 #define N_LINE_BITS      6 /* must be >= 3 */
 470 #define N_LINE_ARANGE    (1 << N_LINE_BITS)
 471 #define N_LINE_TREES     (N_LINE_ARANGE >> 3)
 472
 473 typedef
 474    struct {
 475       UShort descrs[N_LINE_TREES];
 476       SVal   svals[N_LINE_ARANGE]; // == N_LINE_TREES * 8
 477    }
 478    CacheLine;
 479
 480 #define TREE_DESCR_16_0 (1<<0)
 481 #define TREE_DESCR_32_0 (1<<1)
 482 #define TREE_DESCR_16_1 (1<<2)
 483 #define TREE_DESCR_64   (1<<3)
 484 #define TREE_DESCR_16_2 (1<<4)
 485 #define TREE_DESCR_32_1 (1<<5)
 486 #define TREE_DESCR_16_3 (1<<6)
 487 #define TREE_DESCR_8_0  (1<<7)
 488 #define TREE_DESCR_8_1  (1<<8)
 489 #define TREE_DESCR_8_2  (1<<9)
 490 #define TREE_DESCR_8_3  (1<<10)
 491 #define TREE_DESCR_8_4  (1<<11)
 492 #define TREE_DESCR_8_5  (1<<12)
 493 #define TREE_DESCR_8_6  (1<<13)
 494 #define TREE_DESCR_8_7  (1<<14)
 495 #define TREE_DESCR_DTY  (1<<15)
 496
 497 typedef
 498    struct {
 499       SVal  dict[4]; /* can represent up to 4 diff values in the line */
 500       UChar ix2s[N_LINE_ARANGE/4]; /* array of N_LINE_ARANGE 2-bit
 501                                       dict indexes */
 502       /* if dict[0] == SVal_INVALID then dict[1] is a pointer to the
 503          LineF to use, and dict[2..] are also SVal_INVALID. */
 504    }
 505    LineZ; /* compressed rep for a cache line */
 506
 507 /* LineZ.dict[1] is used to store various pointers:
 508    * In the first lineZ of a free SecMap, it points to the next free SecMap.
 509    * In a lineZ for which we need to use a lineF, it points to the lineF. */
 510
 511
 512 typedef
 513    struct {
 514       SVal w64s[N_LINE_ARANGE];
 515    }
 516    LineF; /* full rep for a cache line */
 517
 518 /* We use a pool allocator for LineF, as LineF is relatively small,
 519    and we will often alloc/release such lines. */
 520 static PoolAlloc* LineF_pool_allocator;
 521
 522 /* SVal in a lineZ are used to store various pointers.
 523    Below are conversion functions to support that. */
 524 static inline LineF *LineF_Ptr (LineZ *lineZ)
 525 {
 526    tl_assert(lineZ->dict[0] == SVal_INVALID);
 527    return SVal2Ptr (lineZ->dict[1]);
 528 }
 529
 530 /* Shadow memory.
 531    Primary map is a WordFM Addr SecMap*.
 532    SecMaps cover some page-size-ish section of address space and hold
 533      a compressed representation.
 534    CacheLine-sized chunks of SecMaps are copied into a Cache, being
 535    decompressed when moved into the cache and recompressed on the
 536    way out.  Because of this, the cache must operate as a writeback
 537    cache, not a writethrough one.
 538
 539    Each SecMap must hold a power-of-2 number of CacheLines.  Hence
  540    N_SECMAP_BITS must be >= N_LINE_BITS.
 541 */
 542 #define N_SECMAP_BITS   13
 543 #define N_SECMAP_ARANGE (1 << N_SECMAP_BITS)
 544
 545 // # CacheLines held by a SecMap
 546 #define N_SECMAP_ZLINES (N_SECMAP_ARANGE / N_LINE_ARANGE)
 547
 548 /* The data in the SecMap is held in the array of LineZs.  Each LineZ
 549    either carries the required data directly, in a compressed
 550    representation, or it holds (in .dict[1]) a pointer to a LineF
 551    that holds the full representation.
 552
 553    As each in-use LineF is referred to by exactly one LineZ,
 554    the number of .linesZ[] that refer to a lineF should equal
 555    the number of used lineF.
 556
 557    RC obligations: the RCs presented to the user include exactly
 558    the values in:
 559    * direct Z reps, that is, ones for which .dict[0] != SVal_INVALID
 560    * F reps that are in use
 561
 562    Hence the following actions at the following transitions are required:
 563
 564    F rep: alloc'd       -> freed                -- rcdec_LineF
 565    F rep:               -> alloc'd              -- rcinc_LineF
 566    Z rep: .dict[0] from other to SVal_INVALID   -- rcdec_LineZ
 567    Z rep: .dict[0] from SVal_INVALID to other   -- rcinc_LineZ
 568 */
 569
  570 typedef
  571    struct {
  572       UInt   magic; /* SecMap_MAGIC (see is_sane_SecMap), or SecMap_free_MAGIC while on the free list */
  573       LineZ  linesZ[N_SECMAP_ZLINES]; /* one LineZ per CacheLine; linesZ[0].dict[1] is the free-list link when free */
  574    }
  575    SecMap;
 576
 577 #define SecMap_MAGIC   0x571e58cbU
 578
 579 // (UInt) `echo "Free SecMap" | md5sum`
 580 #define SecMap_free_MAGIC 0x5a977f30U
 581
 582 __attribute__((unused))
 583 static inline Bool is_sane_SecMap ( SecMap* sm ) {
 584    return sm != NULL && sm->magic == SecMap_MAGIC;
 585 }
 586
 587 /* ------ Cache ------ */
 588
 589 #define N_WAY_BITS 16
 590 #define N_WAY_NENT (1 << N_WAY_BITS)
 591
 592 /* Each tag is the address of the associated CacheLine, rounded down
 593    to a CacheLine address boundary.  A CacheLine size must be a power
 594    of 2 and must be 8 or more.  Hence an easy way to initialise the
 595    cache so it is empty is to set all the tag values to any value % 8
 596    != 0, eg 1.  This means all queries in the cache initially miss.
 597    It does however require us to detect and not writeback, any line
 598    with a bogus tag. */
 599 typedef
 600    struct {
 601       CacheLine lyns0[N_WAY_NENT];
 602       Addr      tags0[N_WAY_NENT];
 603    }
 604    Cache;
 605
 606 static inline Bool is_valid_scache_tag ( Addr tag ) {
 607    /* a valid tag should be naturally aligned to the start of
 608       a CacheLine. */
 609    return 0 == (tag & (N_LINE_ARANGE - 1));
 610 }
 611
 612
 613 /* --------- Primary data structures --------- */
 614
 615 /* Shadow memory primary map */
 616 static WordFM* map_shmem = NULL; /* WordFM Addr SecMap* */
 617 static Cache   cache_shmem;
 618
 619
 620 static UWord stats__secmaps_search       = 0; // # SM finds
 621 static UWord stats__secmaps_search_slow  = 0; // # SM lookupFMs
 622 static UWord stats__secmaps_allocd       = 0; // # SecMaps issued
 623 static UWord stats__secmaps_in_map_shmem = 0; // # SecMaps 'live'
 624 static UWord stats__secmaps_scanGC       = 0; // # nr of scan GC done.
 625 static UWord stats__secmaps_scanGCed     = 0; // # SecMaps GC-ed via scan
 626 static UWord stats__secmaps_ssetGCed     = 0; // # SecMaps GC-ed via setnoaccess
 627 static UWord stats__secmap_ga_space_covered = 0; // # ga bytes covered
 628 static UWord stats__secmap_linesZ_allocd = 0; // # LineZ's issued
 629 static UWord stats__secmap_linesZ_bytes  = 0; // .. using this much storage
 630 static UWord stats__cache_Z_fetches      = 0; // # Z lines fetched
 631 static UWord stats__cache_Z_wbacks       = 0; // # Z lines written back
 632 static UWord stats__cache_F_fetches      = 0; // # F lines fetched
 633 static UWord stats__cache_F_wbacks       = 0; // # F lines written back
 634 static UWord stats__cache_flushes_invals = 0; // # cache flushes and invals
 635 static UWord stats__cache_totrefs        = 0; // # total accesses
 636 static UWord stats__cache_totmisses      = 0; // # misses
 637 static ULong stats__cache_make_New_arange = 0; // total arange made New
 638 static ULong stats__cache_make_New_inZrep = 0; // arange New'd on Z reps
 639 static UWord stats__cline_normalises     = 0; // # calls to cacheline_normalise
 640 static UWord stats__cline_cread64s       = 0; // # calls to s_m_read64
 641 static UWord stats__cline_cread32s       = 0; // # calls to s_m_read32
 642 static UWord stats__cline_cread16s       = 0; // # calls to s_m_read16
 643 static UWord stats__cline_cread08s       = 0; // # calls to s_m_read8
 644 static UWord stats__cline_cwrite64s      = 0; // # calls to s_m_write64
 645 static UWord stats__cline_cwrite32s      = 0; // # calls to s_m_write32
 646 static UWord stats__cline_cwrite16s      = 0; // # calls to s_m_write16
 647 static UWord stats__cline_cwrite08s      = 0; // # calls to s_m_write8
  648 static UWord stats__cline_sread08s       = 0; // presumably # 8-bit shadow-value reads (verify at increment site)
  649 static UWord stats__cline_swrite08s      = 0; // presumably # 8-bit shadow-value writes (verify at increment site)
  650 static UWord stats__cline_swrite16s      = 0; // presumably # 16-bit shadow-value writes (verify at increment site)
  651 static UWord stats__cline_swrite32s      = 0; // presumably # 32-bit shadow-value writes (verify at increment site)
  652 static UWord stats__cline_swrite64s      = 0; // presumably # 64-bit shadow-value writes (verify at increment site)
 653 static UWord stats__cline_scopy08s       = 0; // # calls to s_m_copy8
 654 static UWord stats__cline_64to32splits   = 0; // # 64-bit accesses split
 655 static UWord stats__cline_32to16splits   = 0; // # 32-bit accesses split
 656 static UWord stats__cline_16to8splits    = 0; // # 16-bit accesses split
 657 static UWord stats__cline_64to32pulldown = 0; // # calls to pulldown_to_32
 658 static UWord stats__cline_32to16pulldown = 0; // # calls to pulldown_to_16
 659 static UWord stats__cline_16to8pulldown  = 0; // # calls to pulldown_to_8
 660 static UWord stats__vts__tick            = 0; // # calls to VTS__tick
 661 static UWord stats__vts__join            = 0; // # calls to VTS__join
 662 static UWord stats__vts__cmpLEQ          = 0; // # calls to VTS__cmpLEQ
 663 static UWord stats__vts__cmp_structural  = 0; // # calls to VTS__cmp_structural
 664 static UWord stats__vts_tab_GC           = 0; // # nr of vts_tab GC
 665 static UWord stats__vts_pruning          = 0; // # nr of vts pruning
 666
 667 // # calls to VTS__cmp_structural w/ slow case
 668 static UWord stats__vts__cmp_structural_slow = 0;
 669
 670 // # calls to VTS__indexAt_SLOW
 671 static UWord stats__vts__indexat_slow = 0;
 672
 673 // # calls to vts_set__find__or__clone_and_add
 674 static UWord stats__vts_set__focaa    = 0;
 675
 676 // # calls to vts_set__find__or__clone_and_add that lead to an
 677 // allocation
 678 static UWord stats__vts_set__focaa_a  = 0;
 679
 680
 681 static inline Addr shmem__round_to_SecMap_base ( Addr a ) {
 682    return a & ~(N_SECMAP_ARANGE - 1);
 683 }
 684 static inline UWord shmem__get_SecMap_offset ( Addr a ) {
 685    return a & (N_SECMAP_ARANGE - 1);
 686 }
 687
 688
 689 /*----------------------------------------------------------------*/
 690 /*--- map_shmem :: WordFM Addr SecMap                          ---*/
 691 /*--- shadow memory (low level handlers) (shmem__* fns)        ---*/
 692 /*----------------------------------------------------------------*/
 693
 694 /*--------------- SecMap allocation --------------- */
 695
 696 static HChar* shmem__bigchunk_next = NULL;
 697 static HChar* shmem__bigchunk_end1 = NULL;
 698
 699 static void* shmem__bigchunk_alloc ( SizeT n )
 700 {
 701    const SizeT sHMEM__BIGCHUNK_SIZE = 4096 * 256 * 4;
 702    tl_assert(n > 0);
 703    n = VG_ROUNDUP(n, 16);
 704    tl_assert(shmem__bigchunk_next <= shmem__bigchunk_end1);
 705    tl_assert(shmem__bigchunk_end1 - shmem__bigchunk_next
 706              <= (SSizeT)sHMEM__BIGCHUNK_SIZE);
 707    if (shmem__bigchunk_next + n > shmem__bigchunk_end1) {
 708       if (0)
 709       VG_(printf)("XXXXX bigchunk: abandoning %d bytes\n",
 710                   (Int)(shmem__bigchunk_end1 - shmem__bigchunk_next));
 711       shmem__bigchunk_next = VG_(am_shadow_alloc)( sHMEM__BIGCHUNK_SIZE );
 712       if (shmem__bigchunk_next == NULL)
 713          VG_(out_of_memory_NORETURN)(
 714             "helgrind:shmem__bigchunk_alloc", sHMEM__BIGCHUNK_SIZE );
 715       shmem__bigchunk_end1 = shmem__bigchunk_next + sHMEM__BIGCHUNK_SIZE;
 716    }
 717    tl_assert(shmem__bigchunk_next);
 718    tl_assert( 0 == (((Addr)shmem__bigchunk_next) & (16-1)) );
 719    tl_assert(shmem__bigchunk_next + n <= shmem__bigchunk_end1);
 720    shmem__bigchunk_next += n;
 721    return shmem__bigchunk_next - n;
 722 }
 723
  724 /* SecMaps that become entirely SVal_NOACCESS are put on a list of
  725    recycled SecMaps.  When a new SecMap is needed, a recycled SecMap
  726    is used in preference to allocating a new one. */
 727 /* We make a linked list of SecMap. The first LineZ is re-used to
 728    implement the linked list. */
 729 /* Returns the SecMap following sm in the free list.
 730    NULL if sm is the last SecMap. sm must be on the free list. */
 731 static inline SecMap *SecMap_freelist_next ( SecMap* sm )
 732 {
 733    tl_assert (sm);
 734    tl_assert (sm->magic == SecMap_free_MAGIC);
 735    return SVal2Ptr (sm->linesZ[0].dict[1]);
 736 }
 737 static inline void set_SecMap_freelist_next ( SecMap* sm, SecMap* next )
 738 {
 739    tl_assert (sm);
 740    tl_assert (sm->magic == SecMap_free_MAGIC);
 741    tl_assert (next == NULL || next->magic == SecMap_free_MAGIC);
 742    sm->linesZ[0].dict[1] = Ptr2SVal (next);
 743 }
 744
 745 static SecMap *SecMap_freelist = NULL;
 746 static UWord SecMap_freelist_length(void)
 747 {
 748    SecMap *sm;
 749    UWord n = 0;
 750
 751    sm = SecMap_freelist;
 752    while (sm) {
 753      n++;
 754      sm = SecMap_freelist_next (sm);
 755    }
 756    return n;
 757 }
 758
 759 static void push_SecMap_on_freelist(SecMap* sm)
 760 {
 761    if (0) VG_(message)(Vg_DebugMsg, "%p push\n", sm);
 762    sm->magic = SecMap_free_MAGIC;
 763    set_SecMap_freelist_next(sm, SecMap_freelist);
 764    SecMap_freelist = sm;
 765 }
 766 /* Returns a free SecMap if there is one.
 767    Otherwise, returns NULL. */
 768 static SecMap *pop_SecMap_from_freelist(void)
 769 {
 770    SecMap *sm;
 771
 772    sm = SecMap_freelist;
 773    if (sm) {
 774       tl_assert (sm->magic == SecMap_free_MAGIC);
 775       SecMap_freelist = SecMap_freelist_next (sm);
 776       if (0) VG_(message)(Vg_DebugMsg, "%p pop\n", sm);
 777    }
 778    return sm;
 779 }
 780
 781 static SecMap* shmem__alloc_or_recycle_SecMap ( void )
 782 {
 783    Word    i, j;
 784    SecMap* sm = pop_SecMap_from_freelist();
 785
 786    if (!sm) {
 787       sm = shmem__bigchunk_alloc( sizeof(SecMap) );
 788       stats__secmaps_allocd++;
 789       stats__secmap_ga_space_covered += N_SECMAP_ARANGE;
 790       stats__secmap_linesZ_allocd += N_SECMAP_ZLINES;
 791       stats__secmap_linesZ_bytes += N_SECMAP_ZLINES * sizeof(LineZ);
 792    }
 793    if (0) VG_(printf)("alloc_SecMap %p\n",sm);
 794    tl_assert(sm);
 795    sm->magic = SecMap_MAGIC;
 796    for (i = 0; i < N_SECMAP_ZLINES; i++) {
 797       sm->linesZ[i].dict[0] = SVal_NOACCESS;
 798       sm->linesZ[i].dict[1] = SVal_INVALID;
 799       sm->linesZ[i].dict[2] = SVal_INVALID;
 800       sm->linesZ[i].dict[3] = SVal_INVALID;
 801       for (j = 0; j < N_LINE_ARANGE/4; j++)
 802          sm->linesZ[i].ix2s[j] = 0; /* all reference dict[0] */
 803    }
 804    return sm;
 805 }
 806
 807 typedef struct { Addr gaKey; SecMap* sm; } SMCacheEnt;
 808 static SMCacheEnt smCache[3] = { {1,NULL}, {1,NULL}, {1,NULL} };
 809
 810 static SecMap* shmem__find_SecMap ( Addr ga )
 811 {
 812    SecMap* sm    = NULL;
 813    Addr    gaKey = shmem__round_to_SecMap_base(ga);
 814    // Cache
 815    stats__secmaps_search++;
 816    if (LIKELY(gaKey == smCache[0].gaKey))
 817       return smCache[0].sm;
 818    if (LIKELY(gaKey == smCache[1].gaKey)) {
 819       SMCacheEnt tmp = smCache[0];
 820       smCache[0] = smCache[1];
 821       smCache[1] = tmp;
 822       return smCache[0].sm;
 823    }
 824    if (gaKey == smCache[2].gaKey) {
 825       SMCacheEnt tmp = smCache[1];
 826       smCache[1] = smCache[2];
 827       smCache[2] = tmp;
 828       return smCache[1].sm;
 829    }
 830    // end Cache
 831    stats__secmaps_search_slow++;
 832    if (VG_(lookupFM)( map_shmem,
 833                       NULL/*keyP*/, (UWord*)&sm, (UWord)gaKey )) {
 834       tl_assert(sm != NULL);
 835       smCache[2] = smCache[1];
 836       smCache[1] = smCache[0];
 837       smCache[0].gaKey = gaKey;
 838       smCache[0].sm    = sm;
 839    } else {
 840       tl_assert(sm == NULL);
 841    }
 842    return sm;
 843 }
 844
/* Scan all SecMaps in map_shmem and count those that can be GC-ed,
   i.e. those whose lines hold nothing but SVal_NOACCESS (see the
   per-line test below).
   If 'really' is True, each such SecMap is also removed from
   map_shmem, its LineFs are freed and it is pushed on the SecMap free
   list; next_SecMap_GC_at is then recomputed.  If 'really' is False
   the scan only counts.
   Returns the number of SecMaps found to be GC-able. */
/* NOT TO BE CALLED FROM WITHIN libzsm. */
/* Threshold used to decide when the next GC should run; it is
   recomputed at the end of every real GC. */
static UWord next_SecMap_GC_at = 1000;
__attribute__((noinline))
static UWord shmem__SecMap_do_GC(Bool really)
{
   UWord secmapW = 0;
   Addr  gaKey;
   UWord examined = 0;
   UWord ok_GCed = 0;

   /* First invalidate the smCache (gaKey 1 marks an unused entry).
      The STATIC_ASSERT makes sure all entries are covered. */
   smCache[0].gaKey = 1;
   smCache[1].gaKey = 1;
   smCache[2].gaKey = 1;
   STATIC_ASSERT (3 == sizeof(smCache)/sizeof(smCache[0]));

   VG_(initIterFM)( map_shmem );
   while (VG_(nextIterFM)( map_shmem, &gaKey, &secmapW )) {
      UWord   i;
      UWord   j;
      UWord   n_linesF = 0;
      SecMap* sm = (SecMap*)secmapW;
      tl_assert(sm->magic == SecMap_MAGIC);
      Bool ok_to_GC = True;

      examined++;

      /* Deal with the LineZs and the possible LineF of a LineZ.
         The scan stops at the first line that prevents the GC. */
      for (i = 0; i < N_SECMAP_ZLINES && ok_to_GC; i++) {
         LineZ* lineZ = &sm->linesZ[i];
         if (lineZ->dict[0] != SVal_INVALID) {
            /* Z representation: dict[0] must be NOACCESS and none of
               the other dict entries may satisfy SVal__isC. */
            ok_to_GC = lineZ->dict[0] == SVal_NOACCESS
               && !SVal__isC (lineZ->dict[1])
               && !SVal__isC (lineZ->dict[2])
               && !SVal__isC (lineZ->dict[3]);
         } else {
            /* F representation: every value must be NOACCESS. */
            LineF *lineF = LineF_Ptr(lineZ);
            n_linesF++;
            for (j = 0; j < N_LINE_ARANGE && ok_to_GC; j++)
               ok_to_GC = lineF->w64s[j] == SVal_NOACCESS;
         }
      }
      if (ok_to_GC)
         ok_GCed++;
      if (ok_to_GC && really) {
        SecMap *fm_sm;
        Addr fm_gaKey;
        /* We cannot remove a SecMap from map_shmem while iterating.
           So, stop iteration, remove from map_shmem, recreate the iteration
           on the next SecMap. */
        VG_(doneIterFM) ( map_shmem );
        /* No need to rcdec linesZ or linesF, these are all SVal_NOACCESS.
           We just need to free the lineF referenced by the linesZ.
           n_linesF is complete here: the scan above visited every line,
           since ok_to_GC never became False. */
        if (n_linesF > 0) {
           for (i = 0; i < N_SECMAP_ZLINES && n_linesF > 0; i++) {
              LineZ* lineZ = &sm->linesZ[i];
              if (lineZ->dict[0] == SVal_INVALID) {
                 VG_(freeEltPA)( LineF_pool_allocator, LineF_Ptr(lineZ) );
                 n_linesF--;
              }
           }
        }
        if (!VG_(delFromFM)(map_shmem, &fm_gaKey, (UWord*)&fm_sm, gaKey))
          tl_assert (0);
        stats__secmaps_in_map_shmem--;
        tl_assert (gaKey == fm_gaKey);
        tl_assert (sm == fm_sm);
        stats__secmaps_scanGCed++;
        push_SecMap_on_freelist (sm);
        /* Resume the iteration at the first key at or after the end of
           the address range that was just removed. */
        VG_(initIterAtFM) (map_shmem, gaKey + N_SECMAP_ARANGE);
      }
   }
   VG_(doneIterFM)( map_shmem );

   if (really) {
      stats__secmaps_scanGC++;
      /* Next GC when we approach the max allocated */
      /* NOTE(review): if stats__secmaps_allocd is unsigned and could be
         below 1000 here, this subtraction would wrap to a huge value --
         confirm that callers only request a real GC once at least 1000
         SecMaps have been allocated. */
      next_SecMap_GC_at = stats__secmaps_allocd - 1000;
      /* Unless we GCed less than 10%. We then allow to alloc 10%
         more before GCing. This avoids doing a lot of costly GC
         for the worst case : the 'growing phase' of an application
         that allocates a lot of memory.
         Worst case can be reproduced e.g. by
             perf/memrw -t 30000000 -b 1000 -r 1 -l 1
         that allocates around 30Gb of memory. */
      if (ok_GCed < stats__secmaps_allocd/10)
         next_SecMap_GC_at = stats__secmaps_allocd + stats__secmaps_allocd/10;

   }

   if (VG_(clo_stats) && really) {
      VG_(message)(Vg_DebugMsg,
                  "libhb: SecMap GC: #%lu scanned %lu, GCed %lu,"
                   " next GC at %lu\n",
                   stats__secmaps_scanGC, examined, ok_GCed,
                   next_SecMap_GC_at);
   }

   return ok_GCed;
}
 947
 948 static SecMap* shmem__find_or_alloc_SecMap ( Addr ga )
 949 {
 950    SecMap* sm = shmem__find_SecMap ( ga );
 951    if (LIKELY(sm)) {
 952       if (CHECK_ZSM) tl_assert(is_sane_SecMap(sm));
 953       return sm;
 954    } else {
 955       /* create a new one */
 956       Addr gaKey = shmem__round_to_SecMap_base(ga);
 957       sm = shmem__alloc_or_recycle_SecMap();
 958       tl_assert(sm);
 959       VG_(addToFM)( map_shmem, (UWord)gaKey, (UWord)sm );
 960       stats__secmaps_in_map_shmem++;
 961       if (CHECK_ZSM) tl_assert(is_sane_SecMap(sm));
 962       return sm;
 963    }
 964 }
 965
 966 /* Returns the nr of linesF which are in use. Note: this is scanning
 967    the secmap wordFM. So, this is to be used for statistics only. */
 968 __attribute__((noinline))
 969 static UWord shmem__SecMap_used_linesF(void)
 970 {
 971    UWord secmapW = 0;
 972    Addr  gaKey;
 973    UWord inUse = 0;
 974
 975    VG_(initIterFM)( map_shmem );
 976    while (VG_(nextIterFM)( map_shmem, &gaKey, &secmapW )) {
 977       UWord   i;
 978       SecMap* sm = (SecMap*)secmapW;
 979       tl_assert(sm->magic == SecMap_MAGIC);
 980
 981       for (i = 0; i < N_SECMAP_ZLINES; i++) {
 982          LineZ* lineZ = &sm->linesZ[i];
 983          if (lineZ->dict[0] == SVal_INVALID)
 984             inUse++;
 985       }
 986    }
 987    VG_(doneIterFM)( map_shmem );
 988
 989    return inUse;
 990 }
 991
 992 /* ------------ LineF and LineZ related ------------ */
 993
 994 static void rcinc_LineF ( LineF* lineF ) {
 995    UWord i;
 996    for (i = 0; i < N_LINE_ARANGE; i++)
 997       SVal__rcinc(lineF->w64s[i]);
 998 }
 999
1000 static void rcdec_LineF ( LineF* lineF ) {
1001    UWord i;
1002    for (i = 0; i < N_LINE_ARANGE; i++)
1003       SVal__rcdec(lineF->w64s[i]);
1004 }
1005
1006 static void rcinc_LineZ ( LineZ* lineZ ) {
1007    tl_assert(lineZ->dict[0] != SVal_INVALID);
1008    SVal__rcinc(lineZ->dict[0]);
1009    if (lineZ->dict[1] != SVal_INVALID) SVal__rcinc(lineZ->dict[1]);
1010    if (lineZ->dict[2] != SVal_INVALID) SVal__rcinc(lineZ->dict[2]);
1011    if (lineZ->dict[3] != SVal_INVALID) SVal__rcinc(lineZ->dict[3]);
1012 }
1013
1014 static void rcdec_LineZ ( LineZ* lineZ ) {
1015    tl_assert(lineZ->dict[0] != SVal_INVALID);
1016    SVal__rcdec(lineZ->dict[0]);
1017    if (lineZ->dict[1] != SVal_INVALID) SVal__rcdec(lineZ->dict[1]);
1018    if (lineZ->dict[2] != SVal_INVALID) SVal__rcdec(lineZ->dict[2]);
1019    if (lineZ->dict[3] != SVal_INVALID) SVal__rcdec(lineZ->dict[3]);
1020 }
1021
1022 inline
1023 static void write_twobit_array ( UChar* arr, UWord ix, UWord b2 ) {
1024    Word bix, shft, mask, prep;
1025    tl_assert(ix >= 0);
1026    bix  = ix >> 2;
1027    shft = 2 * (ix & 3); /* 0, 2, 4 or 6 */
1028    mask = 3 << shft;
1029    prep = b2 << shft;
1030    arr[bix] = (arr[bix] & ~mask) | prep;
1031 }
1032
1033 inline
1034 static UWord read_twobit_array ( UChar* arr, UWord ix ) {
1035    Word bix, shft;
1036    tl_assert(ix >= 0);
1037    bix  = ix >> 2;
1038    shft = 2 * (ix & 3); /* 0, 2, 4 or 6 */
1039    return (arr[bix] >> shft) & 3;
1040 }
1041
1042 /* We cache one free lineF, to avoid pool allocator calls.
1043    Measurement on firefox has shown that this avoids more than 90%
1044    of the PA calls. */
1045 static LineF *free_lineF = NULL;
1046
1047 /* Allocates a lineF for LineZ. Sets lineZ in a state indicating
1048    lineF has to be used. */
1049 static inline LineF *alloc_LineF_for_Z (LineZ *lineZ)
1050 {
1051    LineF *lineF;
1052
1053    tl_assert(lineZ->dict[0] == SVal_INVALID);
1054
1055    if (LIKELY(free_lineF)) {
1056       lineF = free_lineF;
1057       free_lineF = NULL;
1058    } else {
1059       lineF = VG_(allocEltPA) ( LineF_pool_allocator );
1060    }
1061    lineZ->dict[0] = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
1062    lineZ->dict[1] = Ptr2SVal (lineF);
1063
1064    return lineF;
1065 }
1066
1067 /* rcdec the LineF of lineZ, frees the lineF, and sets lineZ
1068    back to its initial state SVal_NOACCESS (i.e. ready to be
1069    read or written just after SecMap allocation). */
1070 static inline void clear_LineF_of_Z (LineZ *lineZ)
1071 {
1072    LineF *lineF = LineF_Ptr(lineZ);
1073
1074    rcdec_LineF(lineF);
1075    if (UNLIKELY(free_lineF)) {
1076       VG_(freeEltPA)( LineF_pool_allocator, lineF );
1077    } else {
1078       free_lineF = lineF;
1079    }
1080    lineZ->dict[0] = SVal_NOACCESS;
1081    lineZ->dict[1] = SVal_INVALID;
1082 }
1083
1084 /* Given address 'tag', find either the Z or F line containing relevant
1085    data, so it can be read into the cache.
1086 */
1087 static void find_ZF_for_reading ( /*OUT*/LineZ** zp,
1088                                   /*OUT*/LineF** fp, Addr tag ) {
1089    LineZ* lineZ;
1090    LineF* lineF;
1091    UWord   zix;
1092    SecMap* sm    = shmem__find_or_alloc_SecMap(tag);
1093    UWord   smoff = shmem__get_SecMap_offset(tag);
1094    /* since smoff is derived from a valid tag, it should be
1095       cacheline-aligned. */
1096    tl_assert(0 == (smoff & (N_LINE_ARANGE - 1)));
1097    zix = smoff >> N_LINE_BITS;
1098    tl_assert(zix < N_SECMAP_ZLINES);
1099    lineZ = &sm->linesZ[zix];
1100    lineF = NULL;
1101    if (lineZ->dict[0] == SVal_INVALID) {
1102       lineF = LineF_Ptr (lineZ);
1103       lineZ = NULL;
1104    }
1105    *zp = lineZ;
1106    *fp = lineF;
1107 }
1108
1109 /* Given address 'tag', return the relevant SecMap and the index of
1110    the LineZ within it, in the expectation that the line is to be
1111    overwritten.  Regardless of whether 'tag' is currently associated
1112    with a Z or F representation, to rcdec on the current
1113    representation, in recognition of the fact that the contents are
1114    just about to be overwritten. */
1115 static __attribute__((noinline))
1116 void find_Z_for_writing ( /*OUT*/SecMap** smp,
1117                           /*OUT*/Word* zixp,
1118                           Addr tag ) {
1119    LineZ* lineZ;
1120    UWord   zix;
1121    SecMap* sm    = shmem__find_or_alloc_SecMap(tag);
1122    UWord   smoff = shmem__get_SecMap_offset(tag);
1123    /* since smoff is derived from a valid tag, it should be
1124       cacheline-aligned. */
1125    tl_assert(0 == (smoff & (N_LINE_ARANGE - 1)));
1126    zix = smoff >> N_LINE_BITS;
1127    tl_assert(zix < N_SECMAP_ZLINES);
1128    lineZ = &sm->linesZ[zix];
1129    /* re RCs, we are rcdec_LineZ/clear_LineF_of_Z this LineZ so that new data
1130       can be parked in it.  Hence have to rcdec it accordingly. */
1131    /* If lineZ has an associated lineF, free it up. */
1132    if (lineZ->dict[0] == SVal_INVALID)
1133       clear_LineF_of_Z(lineZ);
1134    else
1135       rcdec_LineZ(lineZ);
1136    *smp  = sm;
1137    *zixp = zix;
1138 }
1139
1140 /* ------------ CacheLine and implicit-tree related ------------ */
1141
1142 __attribute__((unused))
1143 static void pp_CacheLine ( CacheLine* cl ) {
1144    Word i;
1145    if (!cl) {
1146       VG_(printf)("%s","pp_CacheLine(NULL)\n");
1147       return;
1148    }
1149    for (i = 0; i < N_LINE_TREES; i++)
1150       VG_(printf)("   descr: %04lx\n", (UWord)cl->descrs[i]);
1151    for (i = 0; i < N_LINE_ARANGE; i++)
1152       VG_(printf)("    sval: %08lx\n", (UWord)cl->svals[i]);
1153 }
1154
/* Map a tree descriptor 'descr' to an 8-bit mask.  Bit i of the
   result is set when slot i of the corresponding 8-entry tree is
   expected to hold a value (this is how is_sane_Descr_and_Tree uses
   the result).  Only the descriptor combinations listed in the switch
   are recognised; any other value yields 0, which callers treat as
   "invalid descriptor". */
static UChar descr_to_validbits ( UShort descr )
{
   /* a.k.a Party Time for gcc's constant folder */
   /* DESCR builds a descriptor value from its 15 individual flag
      bits, listed from the most significant (b8_7, bit 14) to the
      least significant (b16_0, bit 0). */
#  define DESCR(b8_7, b8_6, b8_5, b8_4, b8_3, b8_2, b8_1, b8_0, \
                b16_3, b32_1, b16_2, b64, b16_1, b32_0, b16_0)  \
             ( (UShort) ( ( (b8_7)  << 14) | ( (b8_6)  << 13) | \
                          ( (b8_5)  << 12) | ( (b8_4)  << 11) | \
                          ( (b8_3)  << 10) | ( (b8_2)  << 9)  | \
                          ( (b8_1)  << 8)  | ( (b8_0)  << 7)  | \
                          ( (b16_3) << 6)  | ( (b32_1) << 5)  | \
                          ( (b16_2) << 4)  | ( (b64)   << 3)  | \
                          ( (b16_1) << 2)  | ( (b32_0) << 1)  | \
                          ( (b16_0) << 0) ) )

   /* BYTE builds the result mask from its 8 bits, bit 7 first. */
#  define BYTE(bit7, bit6, bit5, bit4, bit3, bit2, bit1, bit0) \
             ( (UChar) ( ( (bit7) << 7) | ( (bit6) << 6) | \
                         ( (bit5) << 5) | ( (bit4) << 4) | \
                         ( (bit3) << 3) | ( (bit2) << 2) | \
                         ( (bit1) << 1) | ( (bit0) << 0) ) )

   /* these should all get folded out at compile time */
   /* They check that the bit positions used by DESCR agree with the
      TREE_DESCR_* constants. */
   tl_assert(DESCR(1,0,0,0,0,0,0,0, 0,0,0, 0, 0,0,0) == TREE_DESCR_8_7);
   tl_assert(DESCR(0,0,0,0,0,0,0,1, 0,0,0, 0, 0,0,0) == TREE_DESCR_8_0);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 1,0,0, 0, 0,0,0) == TREE_DESCR_16_3);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 0,0,0) == TREE_DESCR_32_1);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,1, 0, 0,0,0) == TREE_DESCR_16_2);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 1, 0,0,0) == TREE_DESCR_64);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 1,0,0) == TREE_DESCR_16_1);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 0,1,0) == TREE_DESCR_32_0);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 0,0,1) == TREE_DESCR_16_0);

   switch (descr) {
   /*
              +--------------------------------- TREE_DESCR_8_7
              |             +------------------- TREE_DESCR_8_0
              |             |  +---------------- TREE_DESCR_16_3
              |             |  | +-------------- TREE_DESCR_32_1
              |             |  | | +------------ TREE_DESCR_16_2
              |             |  | | |  +--------- TREE_DESCR_64
              |             |  | | |  |  +------ TREE_DESCR_16_1
              |             |  | | |  |  | +---- TREE_DESCR_32_0
              |             |  | | |  |  | | +-- TREE_DESCR_16_0
              |             |  | | |  |  | | |
              |             |  | | |  |  | | |   GRANULARITY, 7 -> 0 */
   case DESCR(1,1,1,1,1,1,1,1, 0,0,0, 0, 0,0,0): /* 8 8 8 8  8 8 8 8 */
                                                 return BYTE(1,1,1,1,1,1,1,1);
   case DESCR(1,1,0,0,1,1,1,1, 0,0,1, 0, 0,0,0): /* 8 8 16   8 8 8 8 */
                                                 return BYTE(1,1,0,1,1,1,1,1);
   case DESCR(0,0,1,1,1,1,1,1, 1,0,0, 0, 0,0,0): /* 16  8 8  8 8 8 8 */
                                                 return BYTE(0,1,1,1,1,1,1,1);
   case DESCR(0,0,0,0,1,1,1,1, 1,0,1, 0, 0,0,0): /* 16  16   8 8 8 8 */
                                                 return BYTE(0,1,0,1,1,1,1,1);

   case DESCR(1,1,1,1,1,1,0,0, 0,0,0, 0, 0,0,1): /* 8 8 8 8  8 8 16 */
                                                 return BYTE(1,1,1,1,1,1,0,1);
   case DESCR(1,1,0,0,1,1,0,0, 0,0,1, 0, 0,0,1): /* 8 8 16   8 8 16 */
                                                 return BYTE(1,1,0,1,1,1,0,1);
   case DESCR(0,0,1,1,1,1,0,0, 1,0,0, 0, 0,0,1): /* 16  8 8  8 8 16 */
                                                 return BYTE(0,1,1,1,1,1,0,1);
   case DESCR(0,0,0,0,1,1,0,0, 1,0,1, 0, 0,0,1): /* 16  16   8 8 16 */
                                                 return BYTE(0,1,0,1,1,1,0,1);

   case DESCR(1,1,1,1,0,0,1,1, 0,0,0, 0, 1,0,0): /* 8 8 8 8  16 8 8 */
                                                 return BYTE(1,1,1,1,0,1,1,1);
   case DESCR(1,1,0,0,0,0,1,1, 0,0,1, 0, 1,0,0): /* 8 8 16   16 8 8 */
                                                 return BYTE(1,1,0,1,0,1,1,1);
   case DESCR(0,0,1,1,0,0,1,1, 1,0,0, 0, 1,0,0): /* 16  8 8  16 8 8 */
                                                 return BYTE(0,1,1,1,0,1,1,1);
   case DESCR(0,0,0,0,0,0,1,1, 1,0,1, 0, 1,0,0): /* 16  16   16 8 8 */
                                                 return BYTE(0,1,0,1,0,1,1,1);

   case DESCR(1,1,1,1,0,0,0,0, 0,0,0, 0, 1,0,1): /* 8 8 8 8  16 16 */
                                                 return BYTE(1,1,1,1,0,1,0,1);
   case DESCR(1,1,0,0,0,0,0,0, 0,0,1, 0, 1,0,1): /* 8 8 16   16 16 */
                                                 return BYTE(1,1,0,1,0,1,0,1);
   case DESCR(0,0,1,1,0,0,0,0, 1,0,0, 0, 1,0,1): /* 16  8 8  16 16 */
                                                 return BYTE(0,1,1,1,0,1,0,1);
   case DESCR(0,0,0,0,0,0,0,0, 1,0,1, 0, 1,0,1): /* 16  16   16 16 */
                                                 return BYTE(0,1,0,1,0,1,0,1);

   case DESCR(0,0,0,0,1,1,1,1, 0,1,0, 0, 0,0,0): /* 32  8 8 8 8 */
                                                 return BYTE(0,0,0,1,1,1,1,1);
   case DESCR(0,0,0,0,1,1,0,0, 0,1,0, 0, 0,0,1): /* 32  8 8 16  */
                                                 return BYTE(0,0,0,1,1,1,0,1);
   case DESCR(0,0,0,0,0,0,1,1, 0,1,0, 0, 1,0,0): /* 32  16  8 8 */
                                                 return BYTE(0,0,0,1,0,1,1,1);
   case DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 1,0,1): /* 32  16  16  */
                                                 return BYTE(0,0,0,1,0,1,0,1);

   case DESCR(1,1,1,1,0,0,0,0, 0,0,0, 0, 0,1,0): /* 8 8 8 8  32 */
                                                 return BYTE(1,1,1,1,0,0,0,1);
   case DESCR(1,1,0,0,0,0,0,0, 0,0,1, 0, 0,1,0): /* 8 8 16   32 */
                                                 return BYTE(1,1,0,1,0,0,0,1);
   case DESCR(0,0,1,1,0,0,0,0, 1,0,0, 0, 0,1,0): /* 16  8 8  32 */
                                                 return BYTE(0,1,1,1,0,0,0,1);
   case DESCR(0,0,0,0,0,0,0,0, 1,0,1, 0, 0,1,0): /* 16  16   32 */
                                                 return BYTE(0,1,0,1,0,0,0,1);

   case DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 0,1,0): /* 32 32 */
                                                 return BYTE(0,0,0,1,0,0,0,1);

   case DESCR(0,0,0,0,0,0,0,0, 0,0,0, 1, 0,0,0): /* 64 */
                                                 return BYTE(0,0,0,0,0,0,0,1);

   default: return BYTE(0,0,0,0,0,0,0,0);
                   /* INVALID - any valid descr produces at least one
                      valid bit in tree[0..7]*/
   }
   /* NOTREACHED*/
   /* Every path through the switch returns, so this is only a guard. */
   tl_assert(0);

#  undef DESCR
#  undef BYTE
}
1269
1270 __attribute__((unused))
1271 static Bool is_sane_Descr ( UShort descr ) {
1272    return descr_to_validbits(descr) != 0;
1273 }
1274
1275 static void sprintf_Descr ( /*OUT*/HChar* dst, UShort descr ) {
1276    VG_(sprintf)(dst,
1277                 "%d%d%d%d%d%d%d%d %d%d%d %d %d%d%d",
1278                 (Int)((descr & TREE_DESCR_8_7) ? 1 : 0),
1279                 (Int)((descr & TREE_DESCR_8_6) ? 1 : 0),
1280                 (Int)((descr & TREE_DESCR_8_5) ? 1 : 0),
1281                 (Int)((descr & TREE_DESCR_8_4) ? 1 : 0),
1282                 (Int)((descr & TREE_DESCR_8_3) ? 1 : 0),
1283                 (Int)((descr & TREE_DESCR_8_2) ? 1 : 0),
1284                 (Int)((descr & TREE_DESCR_8_1) ? 1 : 0),
1285                 (Int)((descr & TREE_DESCR_8_0) ? 1 : 0),
1286                 (Int)((descr & TREE_DESCR_16_3) ? 1 : 0),
1287                 (Int)((descr & TREE_DESCR_32_1) ? 1 : 0),
1288                 (Int)((descr & TREE_DESCR_16_2) ? 1 : 0),
1289                 (Int)((descr & TREE_DESCR_64)   ? 1 : 0),
1290                 (Int)((descr & TREE_DESCR_16_1) ? 1 : 0),
1291                 (Int)((descr & TREE_DESCR_32_0) ? 1 : 0),
1292                 (Int)((descr & TREE_DESCR_16_0) ? 1 : 0)
1293    );
1294 }
1295 static void sprintf_Byte ( /*OUT*/HChar* dst, UChar byte ) {
1296    VG_(sprintf)(dst, "%d%d%d%d%d%d%d%d",
1297                      (Int)((byte & 128) ? 1 : 0),
1298                      (Int)((byte &  64) ? 1 : 0),
1299                      (Int)((byte &  32) ? 1 : 0),
1300                      (Int)((byte &  16) ? 1 : 0),
1301                      (Int)((byte &   8) ? 1 : 0),
1302                      (Int)((byte &   4) ? 1 : 0),
1303                      (Int)((byte &   2) ? 1 : 0),
1304                      (Int)((byte &   1) ? 1 : 0)
1305    );
1306 }
1307
1308 static Bool is_sane_Descr_and_Tree ( UShort descr, SVal* tree ) {
1309    Word  i;
1310    UChar validbits = descr_to_validbits(descr);
1311    HChar buf[128], buf2[128];    // large enough
1312    if (validbits == 0)
1313       goto bad;
1314    for (i = 0; i < 8; i++) {
1315       if (validbits & (1<<i)) {
1316          if (tree[i] == SVal_INVALID)
1317             goto bad;
1318       } else {
1319          if (tree[i] != SVal_INVALID)
1320             goto bad;
1321       }
1322    }
1323    return True;
1324   bad:
1325    sprintf_Descr( buf, descr );
1326    sprintf_Byte( buf2, validbits );
1327    VG_(printf)("%s","is_sane_Descr_and_Tree: bad tree {\n");
1328    VG_(printf)("   validbits 0x%02lx    %s\n", (UWord)validbits, buf2);
1329    VG_(printf)("       descr 0x%04lx  %s\n", (UWord)descr, buf);
1330    for (i = 0; i < 8; i++)
1331       VG_(printf)("   [%ld] 0x%016llx\n", i, tree[i]);
1332    VG_(printf)("%s","}\n");
1333    return 0;
1334 }
1335
1336 static Bool is_sane_CacheLine ( CacheLine* cl )
1337 {
1338    Word tno, cloff;
1339
1340    if (!cl) goto bad;
1341
1342    for (tno = 0, cloff = 0;  tno < N_LINE_TREES;  tno++, cloff += 8) {
1343       UShort descr = cl->descrs[tno];
1344       SVal*  tree  = &cl->svals[cloff];
1345       if (!is_sane_Descr_and_Tree(descr, tree))
1346          goto bad;
1347    }
1348    tl_assert(cloff == N_LINE_ARANGE);
1349    return True;
1350   bad:
1351    pp_CacheLine(cl);
1352    return False;
1353 }
1354
1355 static UShort normalise_tree ( /*MOD*/SVal* tree )
1356 {
1357    UShort descr;
1358    /* pre: incoming tree[0..7] does not have any invalid shvals, in
1359       particular no zeroes. */
1360    if (CHECK_ZSM
1361        && UNLIKELY(tree[7] == SVal_INVALID || tree[6] == SVal_INVALID
1362                    || tree[5] == SVal_INVALID || tree[4] == SVal_INVALID
1363                    || tree[3] == SVal_INVALID || tree[2] == SVal_INVALID
1364                    || tree[1] == SVal_INVALID || tree[0] == SVal_INVALID))
1365       tl_assert(0);
1366
1367    descr = TREE_DESCR_8_7 | TREE_DESCR_8_6 | TREE_DESCR_8_5
1368            | TREE_DESCR_8_4 | TREE_DESCR_8_3 | TREE_DESCR_8_2
1369            | TREE_DESCR_8_1 | TREE_DESCR_8_0;
1370    /* build 16-bit layer */
1371    if (tree[1] == tree[0]) {
1372       tree[1] = SVal_INVALID;
1373       descr &= ~(TREE_DESCR_8_1 | TREE_DESCR_8_0);
1374       descr |= TREE_DESCR_16_0;
1375    }
1376    if (tree[3] == tree[2]) {
1377       tree[3] = SVal_INVALID;
1378       descr &= ~(TREE_DESCR_8_3 | TREE_DESCR_8_2);
1379       descr |= TREE_DESCR_16_1;
1380    }
1381    if (tree[5] == tree[4]) {
1382       tree[5] = SVal_INVALID;
1383       descr &= ~(TREE_DESCR_8_5 | TREE_DESCR_8_4);
1384       descr |= TREE_DESCR_16_2;
1385    }
1386    if (tree[7] == tree[6]) {
1387       tree[7] = SVal_INVALID;
1388       descr &= ~(TREE_DESCR_8_7 | TREE_DESCR_8_6);
1389       descr |= TREE_DESCR_16_3;
1390    }
1391    /* build 32-bit layer */
1392    if (tree[2] == tree[0]
1393        && (descr & TREE_DESCR_16_1) && (descr & TREE_DESCR_16_0)) {
1394       tree[2] = SVal_INVALID; /* [3,1] must already be SVal_INVALID */
1395       descr &= ~(TREE_DESCR_16_1 | TREE_DESCR_16_0);
1396       descr |= TREE_DESCR_32_0;
1397    }
1398    if (tree[6] == tree[4]
1399        && (descr & TREE_DESCR_16_3) && (descr & TREE_DESCR_16_2)) {
1400       tree[6] = SVal_INVALID; /* [7,5] must already be SVal_INVALID */
1401       descr &= ~(TREE_DESCR_16_3 | TREE_DESCR_16_2);
1402       descr |= TREE_DESCR_32_1;
1403    }
1404    /* build 64-bit layer */
1405    if (tree[4] == tree[0]
1406        && (descr & TREE_DESCR_32_1) && (descr & TREE_DESCR_32_0)) {
1407       tree[4] = SVal_INVALID; /* [7,6,5,3,2,1] must already be SVal_INVALID */
1408       descr &= ~(TREE_DESCR_32_1 | TREE_DESCR_32_0);
1409       descr |= TREE_DESCR_64;
1410    }
1411    return descr;
1412 }
1413
1414 /* This takes a cacheline where all the data is at the leaves
1415    (w8[..]) and builds a correctly normalised tree. */
1416 static void normalise_CacheLine ( /*MOD*/CacheLine* cl )
1417 {
1418    Word tno, cloff;
1419    for (tno = 0, cloff = 0;  tno < N_LINE_TREES;  tno++, cloff += 8) {
1420       SVal* tree = &cl->svals[cloff];
1421       cl->descrs[tno] = normalise_tree( tree );
1422    }
1423    tl_assert(cloff == N_LINE_ARANGE);
1424    if (CHECK_ZSM)
1425       tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
1426    stats__cline_normalises++;
1427 }
1428
1429
1430 typedef struct { UChar count; SVal sval; } CountedSVal;
1431
1432 static
1433 void sequentialise_CacheLine ( /*OUT*/CountedSVal* dst,
1434                                /*OUT*/Word* dstUsedP,
1435                                Word nDst, CacheLine* src )
1436 {
1437    Word  tno, cloff, dstUsed;
1438
1439    tl_assert(nDst == N_LINE_ARANGE);
1440    dstUsed = 0;
1441
1442    for (tno = 0, cloff = 0;  tno < N_LINE_TREES;  tno++, cloff += 8) {
1443       UShort descr = src->descrs[tno];
1444       SVal*  tree  = &src->svals[cloff];
1445
1446       /* sequentialise the tree described by (descr,tree). */
1447 #     define PUT(_n,_v)                                \
1448          do { dst[dstUsed  ].count = (_n);             \
1449               dst[dstUsed++].sval  = (_v);             \
1450          } while (0)
1451
1452       /* byte 0 */
1453       if (descr & TREE_DESCR_64)   PUT(8, tree[0]); else
1454       if (descr & TREE_DESCR_32_0) PUT(4, tree[0]); else
1455       if (descr & TREE_DESCR_16_0) PUT(2, tree[0]); else
1456       if (descr & TREE_DESCR_8_0)  PUT(1, tree[0]);
1457       /* byte 1 */
1458       if (descr & TREE_DESCR_8_1)  PUT(1, tree[1]);
1459       /* byte 2 */
1460       if (descr & TREE_DESCR_16_1) PUT(2, tree[2]); else
1461       if (descr & TREE_DESCR_8_2)  PUT(1, tree[2]);
1462       /* byte 3 */
1463       if (descr & TREE_DESCR_8_3)  PUT(1, tree[3]);
1464       /* byte 4 */
1465       if (descr & TREE_DESCR_32_1) PUT(4, tree[4]); else
1466       if (descr & TREE_DESCR_16_2) PUT(2, tree[4]); else
1467       if (descr & TREE_DESCR_8_4)  PUT(1, tree[4]);
1468       /* byte 5 */
1469       if (descr & TREE_DESCR_8_5)  PUT(1, tree[5]);
1470       /* byte 6 */
1471       if (descr & TREE_DESCR_16_3) PUT(2, tree[6]); else
1472       if (descr & TREE_DESCR_8_6)  PUT(1, tree[6]);
1473       /* byte 7 */
1474       if (descr & TREE_DESCR_8_7)  PUT(1, tree[7]);
1475
1476 #     undef PUT
1477       /* END sequentialise the tree described by (descr,tree). */
1478
1479    }
1480    tl_assert(cloff == N_LINE_ARANGE);
1481    tl_assert(dstUsed <= nDst);
1482
1483    *dstUsedP = dstUsed;
1484 }
1485
1486 /* Write the cacheline 'wix' to backing store.  Where it ends up
1487    is determined by its tag field. */
1488 static __attribute__((noinline)) void cacheline_wback ( UWord wix )
1489 {
1490    Word        i, j, k, m;
1491    Addr        tag;
1492    SecMap*     sm;
1493    CacheLine*  cl;
1494    LineZ* lineZ;
1495    LineF* lineF;
1496    Word        zix, fix, csvalsUsed;
1497    CountedSVal csvals[N_LINE_ARANGE];
1498    SVal        sv;
1499
1500    if (0)
1501    VG_(printf)("scache wback line %d\n", (Int)wix);
1502
1503    tl_assert(wix >= 0 && wix < N_WAY_NENT);
1504
1505    tag =  cache_shmem.tags0[wix];
1506    cl  = &cache_shmem.lyns0[wix];
1507
1508    /* The cache line may have been invalidated; if so, ignore it. */
1509    if (!is_valid_scache_tag(tag))
1510       return;
1511
1512    /* Where are we going to put it? */
1513    sm         = NULL;
1514    lineZ      = NULL;
1515    lineF      = NULL;
1516    zix = fix = -1;
1517
1518    /* find the Z line to write in and rcdec it or the associated F
1519       line. */
1520    find_Z_for_writing( &sm, &zix, tag );
1521
1522    tl_assert(sm);
1523    tl_assert(zix >= 0 && zix < N_SECMAP_ZLINES);
1524    lineZ = &sm->linesZ[zix];
1525
1526    /* Generate the data to be stored */
1527    if (CHECK_ZSM)
1528       tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
1529
1530    csvalsUsed = -1;
1531    sequentialise_CacheLine( csvals, &csvalsUsed,
1532                             N_LINE_ARANGE, cl );
1533    tl_assert(csvalsUsed >= 1 && csvalsUsed <= N_LINE_ARANGE);
1534    if (0) VG_(printf)("%ld ", csvalsUsed);
1535
1536    lineZ->dict[0] = lineZ->dict[1]
1537                   = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
1538
1539    /* i indexes actual shadow values, k is cursor in csvals */
1540    i = 0;
1541    for (k = 0; k < csvalsUsed; k++) {
1542
1543       sv = csvals[k].sval;
1544       if (CHECK_ZSM)
1545          tl_assert(csvals[k].count >= 1 && csvals[k].count <= 8);
1546       /* do we already have it? */
1547       if (sv == lineZ->dict[0]) { j = 0; goto dict_ok; }
1548       if (sv == lineZ->dict[1]) { j = 1; goto dict_ok; }
1549       if (sv == lineZ->dict[2]) { j = 2; goto dict_ok; }
1550       if (sv == lineZ->dict[3]) { j = 3; goto dict_ok; }
1551       /* no.  look for a free slot. */
1552       if (CHECK_ZSM)
1553          tl_assert(sv != SVal_INVALID);
1554       if (lineZ->dict[0]
1555           == SVal_INVALID) { lineZ->dict[0] = sv; j = 0; goto dict_ok; }
1556       if (lineZ->dict[1]
1557           == SVal_INVALID) { lineZ->dict[1] = sv; j = 1; goto dict_ok; }
1558       if (lineZ->dict[2]
1559           == SVal_INVALID) { lineZ->dict[2] = sv; j = 2; goto dict_ok; }
1560       if (lineZ->dict[3]
1561           == SVal_INVALID) { lineZ->dict[3] = sv; j = 3; goto dict_ok; }
1562       break; /* we'll have to use the f rep */
1563      dict_ok:
1564       m = csvals[k].count;
1565       if (m == 8) {
1566          write_twobit_array( lineZ->ix2s, i+0, j );
1567          write_twobit_array( lineZ->ix2s, i+1, j );
1568          write_twobit_array( lineZ->ix2s, i+2, j );
1569          write_twobit_array( lineZ->ix2s, i+3, j );
1570          write_twobit_array( lineZ->ix2s, i+4, j );
1571          write_twobit_array( lineZ->ix2s, i+5, j );
1572          write_twobit_array( lineZ->ix2s, i+6, j );
1573          write_twobit_array( lineZ->ix2s, i+7, j );
1574          i += 8;
1575       }
1576       else if (m == 4) {
1577          write_twobit_array( lineZ->ix2s, i+0, j );
1578          write_twobit_array( lineZ->ix2s, i+1, j );
1579          write_twobit_array( lineZ->ix2s, i+2, j );
1580          write_twobit_array( lineZ->ix2s, i+3, j );
1581          i += 4;
1582       }
1583       else if (m == 1) {
1584          write_twobit_array( lineZ->ix2s, i+0, j );
1585          i += 1;
1586       }
1587       else if (m == 2) {
1588          write_twobit_array( lineZ->ix2s, i+0, j );
1589          write_twobit_array( lineZ->ix2s, i+1, j );
1590          i += 2;
1591       }
1592       else {
1593          tl_assert(0); /* 8 4 2 or 1 are the only legitimate values for m */
1594       }
1595
1596    }
1597
1598    if (LIKELY(i == N_LINE_ARANGE)) {
1599       /* Construction of the compressed representation was
1600          successful. */
1601       rcinc_LineZ(lineZ);
1602       stats__cache_Z_wbacks++;
1603    } else {
1604       /* Cannot use the compressed(z) representation.  Use the full(f)
1605          rep instead. */
1606       tl_assert(i >= 0 && i < N_LINE_ARANGE);
1607       lineZ->dict[0] = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
1608       lineF = alloc_LineF_for_Z (lineZ);
1609       i = 0;
1610       for (k = 0; k < csvalsUsed; k++) {
1611          if (CHECK_ZSM)
1612             tl_assert(csvals[k].count >= 1 && csvals[k].count <= 8);
1613          sv = csvals[k].sval;
1614          if (CHECK_ZSM)
1615             tl_assert(sv != SVal_INVALID);
1616          for (m = csvals[k].count; m > 0; m--) {
1617             lineF->w64s[i] = sv;
1618             i++;
1619          }
1620       }
1621       tl_assert(i == N_LINE_ARANGE);
1622       rcinc_LineF(lineF);
1623       stats__cache_F_wbacks++;
1624    }
1625 }
1626
1627 /* Fetch the cacheline 'wix' from the backing store.  The tag
1628    associated with 'wix' is assumed to have already been filled in;
1629    hence that is used to determine where in the backing store to read
1630    from. */
1631 static __attribute__((noinline)) void cacheline_fetch ( UWord wix )
1632 {
1633    Word       i;
1634    Addr       tag;
1635    CacheLine* cl;
1636    LineZ*     lineZ;
1637    LineF*     lineF;
1638
1639    if (0)
1640    VG_(printf)("scache fetch line %d\n", (Int)wix);
1641
1642    tl_assert(wix >= 0 && wix < N_WAY_NENT);
1643
1644    tag =  cache_shmem.tags0[wix];
1645    cl  = &cache_shmem.lyns0[wix];
1646
1647    /* reject nonsense requests */
1648    tl_assert(is_valid_scache_tag(tag));
1649
1650    lineZ = NULL;
1651    lineF = NULL;
1652    find_ZF_for_reading( &lineZ, &lineF, tag );
1653    tl_assert( (lineZ && !lineF) || (!lineZ && lineF) );
1654
1655    /* expand the data into the bottom layer of the tree, then get
1656       cacheline_normalise to build the descriptor array. */
1657    if (lineF) {
1658       for (i = 0; i < N_LINE_ARANGE; i++) {
1659          cl->svals[i] = lineF->w64s[i];
1660       }
1661       stats__cache_F_fetches++;
1662    } else {
1663       for (i = 0; i < N_LINE_ARANGE; i++) {
1664          UWord ix = read_twobit_array( lineZ->ix2s, i );
1665          if (CHECK_ZSM) tl_assert(ix >= 0 && ix <= 3);
1666          cl->svals[i] = lineZ->dict[ix];
1667          if (CHECK_ZSM) tl_assert(cl->svals[i] != SVal_INVALID);
1668       }
1669       stats__cache_Z_fetches++;
1670    }
1671    normalise_CacheLine( cl );
1672 }
1673
1674 /* Invalid the cachelines corresponding to the given range, which
1675    must start and end on a cacheline boundary. */
1676 static void shmem__invalidate_scache_range (Addr ga, SizeT szB)
1677 {
1678    Word wix;
1679
1680    /* ga must be on a cacheline boundary. */
1681    tl_assert (is_valid_scache_tag (ga));
1682    /* szB must be a multiple of cacheline size. */
1683    tl_assert (0 == (szB & (N_LINE_ARANGE - 1)));
1684
1685
1686    Word ga_ix = (ga >> N_LINE_BITS) & (N_WAY_NENT - 1);
1687    Word nwix = szB / N_LINE_ARANGE;
1688
1689    if (nwix > N_WAY_NENT)
1690       nwix = N_WAY_NENT; // no need to check several times the same entry.
1691
1692    for (wix = 0; wix < nwix; wix++) {
1693       if (address_in_range(cache_shmem.tags0[ga_ix], ga, szB))
1694          cache_shmem.tags0[ga_ix] = 1/*INVALID*/;
1695       ga_ix++;
1696       if (UNLIKELY(ga_ix == N_WAY_NENT))
1697          ga_ix = 0;
1698    }
1699 }
1700
1701
1702 static void shmem__flush_and_invalidate_scache ( void ) {
1703    Word wix;
1704    Addr tag;
1705    if (0) VG_(printf)("%s","scache flush and invalidate\n");
1706    tl_assert(!is_valid_scache_tag(1));
1707    for (wix = 0; wix < N_WAY_NENT; wix++) {
1708       tag = cache_shmem.tags0[wix];
1709       if (tag == 1/*INVALID*/) {
1710          /* already invalid; nothing to do */
1711       } else {
1712          tl_assert(is_valid_scache_tag(tag));
1713          cacheline_wback( wix );
1714       }
1715       cache_shmem.tags0[wix] = 1/*INVALID*/;
1716    }
1717    stats__cache_flushes_invals++;
1718 }
1719
1720
1721 static inline Bool aligned16 ( Addr a ) {
1722    return 0 == (a & 1);
1723 }
1724 static inline Bool aligned32 ( Addr a ) {
1725    return 0 == (a & 3);
1726 }
1727 static inline Bool aligned64 ( Addr a ) {
1728    return 0 == (a & 7);
1729 }
1730 static inline UWord get_cacheline_offset ( Addr a ) {
1731    return (UWord)(a & (N_LINE_ARANGE - 1));
1732 }
1733 static inline Addr cacheline_ROUNDUP ( Addr a ) {
1734    return ROUNDUP(a, N_LINE_ARANGE);
1735 }
1736 static inline Addr cacheline_ROUNDDN ( Addr a ) {
1737    return ROUNDDN(a, N_LINE_ARANGE);
1738 }
1739 static inline UWord get_treeno ( Addr a ) {
1740    return get_cacheline_offset(a) >> 3;
1741 }
1742 static inline UWord get_tree_offset ( Addr a ) {
1743    return a & 7;
1744 }
1745
1746 static __attribute__((noinline))
1747        CacheLine* get_cacheline_MISS ( Addr a ); /* fwds */
1748 static inline CacheLine* get_cacheline ( Addr a )
1749 {
1750    /* tag is 'a' with the in-line offset masked out,
1751       eg a[31]..a[4] 0000 */
1752    Addr       tag = a & ~(N_LINE_ARANGE - 1);
1753    UWord      wix = (a >> N_LINE_BITS) & (N_WAY_NENT - 1);
1754    stats__cache_totrefs++;
1755    if (LIKELY(tag == cache_shmem.tags0[wix])) {
1756       return &cache_shmem.lyns0[wix];
1757    } else {
1758       return get_cacheline_MISS( a );
1759    }
1760 }
1761
1762 static __attribute__((noinline))
1763        CacheLine* get_cacheline_MISS ( Addr a )
1764 {
1765    /* tag is 'a' with the in-line offset masked out,
1766       eg a[31]..a[4] 0000 */
1767
1768    CacheLine* cl;
1769    Addr*      tag_old_p;
1770    Addr       tag = a & ~(N_LINE_ARANGE - 1);
1771    UWord      wix = (a >> N_LINE_BITS) & (N_WAY_NENT - 1);
1772
1773    tl_assert(tag != cache_shmem.tags0[wix]);
1774
1775    /* Dump the old line into the backing store. */
1776    stats__cache_totmisses++;
1777
1778    cl        = &cache_shmem.lyns0[wix];
1779    tag_old_p = &cache_shmem.tags0[wix];
1780
1781    if (is_valid_scache_tag( *tag_old_p )) {
1782       /* EXPENSIVE and REDUNDANT: callee does it */
1783       if (CHECK_ZSM)
1784          tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
1785       cacheline_wback( wix );
1786    }
1787    /* and reload the new one */
1788    *tag_old_p = tag;
1789    cacheline_fetch( wix );
1790    if (CHECK_ZSM)
1791       tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
1792    return cl;
1793 }
1794
1795 static UShort pulldown_to_32 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) {
1796    stats__cline_64to32pulldown++;
1797    switch (toff) {
1798       case 0: case 4:
1799          tl_assert(descr & TREE_DESCR_64);
1800          tree[4] = tree[0];
1801          descr &= ~TREE_DESCR_64;
1802          descr |= (TREE_DESCR_32_1 | TREE_DESCR_32_0);
1803          break;
1804       default:
1805          tl_assert(0);
1806    }
1807    return descr;
1808 }
1809
1810 static UShort pulldown_to_16 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) {
1811    stats__cline_32to16pulldown++;
1812    switch (toff) {
1813       case 0: case 2:
1814          if (!(descr & TREE_DESCR_32_0)) {
1815             descr = pulldown_to_32(tree, 0, descr);
1816          }
1817          tl_assert(descr & TREE_DESCR_32_0);
1818          tree[2] = tree[0];
1819          descr &= ~TREE_DESCR_32_0;
1820          descr |= (TREE_DESCR_16_1 | TREE_DESCR_16_0);
1821          break;
1822       case 4: case 6:
1823          if (!(descr & TREE_DESCR_32_1)) {
1824             descr = pulldown_to_32(tree, 4, descr);
1825          }
1826          tl_assert(descr & TREE_DESCR_32_1);
1827          tree[6] = tree[4];
1828          descr &= ~TREE_DESCR_32_1;
1829          descr |= (TREE_DESCR_16_3 | TREE_DESCR_16_2);
1830          break;
1831       default:
1832          tl_assert(0);
1833    }
1834    return descr;
1835 }
1836
1837 static UShort pulldown_to_8 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) {
1838    stats__cline_16to8pulldown++;
1839    switch (toff) {
1840       case 0: case 1:
1841          if (!(descr & TREE_DESCR_16_0)) {
1842             descr = pulldown_to_16(tree, 0, descr);
1843          }
1844          tl_assert(descr & TREE_DESCR_16_0);
1845          tree[1] = tree[0];
1846          descr &= ~TREE_DESCR_16_0;
1847          descr |= (TREE_DESCR_8_1 | TREE_DESCR_8_0);
1848          break;
1849       case 2: case 3:
1850          if (!(descr & TREE_DESCR_16_1)) {
1851             descr = pulldown_to_16(tree, 2, descr);
1852          }
1853          tl_assert(descr & TREE_DESCR_16_1);
1854          tree[3] = tree[2];
1855          descr &= ~TREE_DESCR_16_1;
1856          descr |= (TREE_DESCR_8_3 | TREE_DESCR_8_2);
1857          break;
1858       case 4: case 5:
1859          if (!(descr & TREE_DESCR_16_2)) {
1860             descr = pulldown_to_16(tree, 4, descr);
1861          }
1862          tl_assert(descr & TREE_DESCR_16_2);
1863          tree[5] = tree[4];
1864          descr &= ~TREE_DESCR_16_2;
1865          descr |= (TREE_DESCR_8_5 | TREE_DESCR_8_4);
1866          break;
1867       case 6: case 7:
1868          if (!(descr & TREE_DESCR_16_3)) {
1869             descr = pulldown_to_16(tree, 6, descr);
1870          }
1871          tl_assert(descr & TREE_DESCR_16_3);
1872          tree[7] = tree[6];
1873          descr &= ~TREE_DESCR_16_3;
1874          descr |= (TREE_DESCR_8_7 | TREE_DESCR_8_6);
1875          break;
1876       default:
1877          tl_assert(0);
1878    }
1879    return descr;
1880 }
1881
1882
1883 static UShort pullup_descr_to_16 ( UShort descr, UWord toff ) {
1884    UShort mask;
1885    switch (toff) {
1886       case 0:
1887          mask = TREE_DESCR_8_1 | TREE_DESCR_8_0;
1888          tl_assert( (descr & mask) == mask );
1889          descr &= ~mask;
1890          descr |= TREE_DESCR_16_0;
1891          break;
1892       case 2:
1893          mask = TREE_DESCR_8_3 | TREE_DESCR_8_2;
1894          tl_assert( (descr & mask) == mask );
1895          descr &= ~mask;
1896          descr |= TREE_DESCR_16_1;
1897          break;
1898       case 4:
1899          mask = TREE_DESCR_8_5 | TREE_DESCR_8_4;
1900          tl_assert( (descr & mask) == mask );
1901          descr &= ~mask;
1902          descr |= TREE_DESCR_16_2;
1903          break;
1904       case 6:
1905          mask = TREE_DESCR_8_7 | TREE_DESCR_8_6;
1906          tl_assert( (descr & mask) == mask );
1907          descr &= ~mask;
1908          descr |= TREE_DESCR_16_3;
1909          break;
1910       default:
1911          tl_assert(0);
1912    }
1913    return descr;
1914 }
1915
1916 static UShort pullup_descr_to_32 ( UShort descr, UWord toff ) {
1917    UShort mask;
1918    switch (toff) {
1919       case 0:
1920          if (!(descr & TREE_DESCR_16_0))
1921             descr = pullup_descr_to_16(descr, 0);
1922          if (!(descr & TREE_DESCR_16_1))
1923             descr = pullup_descr_to_16(descr, 2);
1924          mask = TREE_DESCR_16_1 | TREE_DESCR_16_0;
1925          tl_assert( (descr & mask) == mask );
1926          descr &= ~mask;
1927          descr |= TREE_DESCR_32_0;
1928          break;
1929       case 4:
1930          if (!(descr & TREE_DESCR_16_2))
1931             descr = pullup_descr_to_16(descr, 4);
1932          if (!(descr & TREE_DESCR_16_3))
1933             descr = pullup_descr_to_16(descr, 6);
1934          mask = TREE_DESCR_16_3 | TREE_DESCR_16_2;
1935          tl_assert( (descr & mask) == mask );
1936          descr &= ~mask;
1937          descr |= TREE_DESCR_32_1;
1938          break;
1939       default:
1940          tl_assert(0);
1941    }
1942    return descr;
1943 }
1944
1945 static Bool valid_value_is_above_me_32 ( UShort descr, UWord toff ) {
1946    switch (toff) {
1947       case 0: case 4:
1948          return 0 != (descr & TREE_DESCR_64);
1949       default:
1950          tl_assert(0);
1951    }
1952 }
1953
1954 static Bool valid_value_is_below_me_16 ( UShort descr, UWord toff ) {
1955    switch (toff) {
1956       case 0:
1957          return 0 != (descr & (TREE_DESCR_8_1 | TREE_DESCR_8_0));
1958       case 2:
1959          return 0 != (descr & (TREE_DESCR_8_3 | TREE_DESCR_8_2));
1960       case 4:
1961          return 0 != (descr & (TREE_DESCR_8_5 | TREE_DESCR_8_4));
1962       case 6:
1963          return 0 != (descr & (TREE_DESCR_8_7 | TREE_DESCR_8_6));
1964       default:
1965          tl_assert(0);
1966    }
1967 }
1968
1969 /* ------------ Cache management ------------ */
1970
/* Flush the shadow-memory cache.  This is a plain wrapper: all the
   work (write-back of valid lines, then invalidation of every entry)
   is done by shmem__flush_and_invalidate_scache. */
static void zsm_flush_cache ( void )
{
   shmem__flush_and_invalidate_scache();
}
1975
1976
1977 static void zsm_init ( void )
1978 {
1979    tl_assert( sizeof(UWord) == sizeof(Addr) );
1980
1981    tl_assert(map_shmem == NULL);
1982    map_shmem = VG_(newFM)( HG_(zalloc), "libhb.zsm_init.1 (map_shmem)",
1983                            HG_(free),
1984                            NULL/*unboxed UWord cmp*/);
1985    /* Invalidate all cache entries. */
1986    tl_assert(!is_valid_scache_tag(1));
1987    for (UWord wix = 0; wix < N_WAY_NENT; wix++) {
1988       cache_shmem.tags0[wix] = 1/*INVALID*/;
1989    }
1990
1991    LineF_pool_allocator = VG_(newPA) (
1992                              sizeof(LineF),
1993                              /* Nr elements/pool to fill a core arena block
1994                                 taking some arena overhead into account. */
1995                              (4 * 1024 * 1024 - 200)/sizeof(LineF),
1996                              HG_(zalloc),
1997                              "libhb.LineF_storage.pool",
1998                              HG_(free)
1999                           );
2000
2001    /* a SecMap must contain an integral number of CacheLines */
2002    tl_assert(0 == (N_SECMAP_ARANGE % N_LINE_ARANGE));
2003    /* also ... a CacheLine holds an integral number of trees */
2004    tl_assert(0 == (N_LINE_ARANGE % 8));
2005 }
2006
2007 /////////////////////////////////////////////////////////////////
2008 /////////////////////////////////////////////////////////////////
2009 //                                                             //
2010 // SECTION END compressed shadow memory                        //
2011 //                                                             //
2012 /////////////////////////////////////////////////////////////////
2013 /////////////////////////////////////////////////////////////////
2014
2015
2016
2017 /////////////////////////////////////////////////////////////////
2018 /////////////////////////////////////////////////////////////////
2019 //                                                             //
2020 // SECTION BEGIN vts primitives                                //
2021 //                                                             //
2022 /////////////////////////////////////////////////////////////////
2023 /////////////////////////////////////////////////////////////////
2024
2025
2026 /* There's a 1-1 mapping between Thr and ThrIDs -- the latter merely
2027    being compact stand-ins for Thr*'s.  Use these functions to map
2028    between them. */
2029 static ThrID Thr__to_ThrID   ( Thr*  thr   ); /* fwds */
2030 static Thr*  Thr__from_ThrID ( ThrID thrid ); /* fwds */
2031
2032 __attribute__((noreturn))
2033 static void scalarts_limitations_fail_NORETURN ( Bool due_to_nThrs )
2034 {
2035    if (due_to_nThrs) {
2036       const HChar* s =
2037          "\n"
2038          "Helgrind: cannot continue, run aborted: too many threads.\n"
2039          "Sorry.  Helgrind can only handle programs that create\n"
2040          "%'llu or fewer threads over their entire lifetime.\n"
2041          "\n";
2042       VG_(umsg)(s, (ULong)(ThrID_MAX_VALID - 1024));
2043    } else {
2044       const HChar* s =
2045          "\n"
2046          "Helgrind: cannot continue, run aborted: too many\n"
2047          "synchronisation events.  Sorry. Helgrind can only handle\n"
2048          "programs which perform %'llu or fewer\n"
2049          "inter-thread synchronisation events (locks, unlocks, etc).\n"
2050          "\n";
2051       VG_(umsg)(s, (1ULL << SCALARTS_N_TYMBITS) - 1);
2052    }
2053    VG_(exit)(1);
2054    /*NOTREACHED*/
2055    tl_assert(0); /*wtf?!*/
2056 }
2057
2058
/* The dead thread (ThrID, actually) tables.  A thread may only be
   listed here if we have been notified thereof by libhb_async_exit.
   New entries are added at the end.  The order isn't important, but
   the ThrID values must be unique.
   verydead_thread_table_not_pruned lists the identity of the threads
   that died since the previous round of pruning.
   Once pruning is done, these ThrIDs are added to verydead_thread_table.
   We don't actually need to keep the set of threads that have ever died --
   only the threads that have died since the previous round of
   pruning.  But it's useful for sanity check purposes to keep the
   entire set, so we do.
   Both pointers start out NULL; verydead_thread_tables_init creates
   the two arrays. */
/* Threads that died since the previous round of pruning. */
static XArray* /* of ThrID */ verydead_thread_table_not_pruned = NULL;
/* The entire set of dead threads already handled by pruning. */
static XArray* /* of ThrID */ verydead_thread_table = NULL;
2072
2073 /* Arbitrary total ordering on ThrIDs. */
2074 static Int cmp__ThrID ( const void* v1, const void* v2 ) {
2075    ThrID id1 = *(const ThrID*)v1;
2076    ThrID id2 = *(const ThrID*)v2;
2077    if (id1 < id2) return -1;
2078    if (id1 > id2) return 1;
2079    return 0;
2080 }
2081
2082 static void verydead_thread_tables_init ( void )
2083 {
2084    tl_assert(!verydead_thread_table);
2085    tl_assert(!verydead_thread_table_not_pruned);
2086    verydead_thread_table
2087      = VG_(newXA)( HG_(zalloc),
2088                    "libhb.verydead_thread_table_init.1",
2089                    HG_(free), sizeof(ThrID) );
2090    VG_(setCmpFnXA)(verydead_thread_table, cmp__ThrID);
2091    verydead_thread_table_not_pruned
2092      = VG_(newXA)( HG_(zalloc),
2093                    "libhb.verydead_thread_table_init.2",
2094                    HG_(free), sizeof(ThrID) );
2095    VG_(setCmpFnXA)(verydead_thread_table_not_pruned, cmp__ThrID);
2096 }
2097
2098 static void verydead_thread_table_sort_and_check (XArray* thrids)
2099 {
2100    UWord i;
2101
2102    VG_(sortXA)( thrids );
2103    /* Sanity check: check for unique .sts.thr values. */
2104    UWord nBT = VG_(sizeXA)( thrids );
2105    if (nBT > 0) {
2106       ThrID thrid1, thrid2;
2107       thrid2 = *(ThrID*)VG_(indexXA)( thrids, 0 );
2108       for (i = 1; i < nBT; i++) {
2109          thrid1 = thrid2;
2110          thrid2 = *(ThrID*)VG_(indexXA)( thrids, i );
2111          tl_assert(thrid1 < thrid2);
2112       }
2113    }
2114    /* Ok, so the dead thread table thrids has unique and in-order keys. */
2115 }
2116
/* A VTS contains .ts, its vector clock, and also .id, a field to hold
   a backlink for the caller's convenience.  Since we have no idea
   what to set that to in the library, it always gets set to
   VtsID_INVALID. */
typedef
   struct {
      VtsID    id;      /* caller's backlink; see above */
      UInt     usedTS;  /* number of ts[] entries currently in use;
                           never more than sizeTS */
      UInt     sizeTS;  /* capacity of ts[], as passed to VTS__new */
      ScalarTS ts[0];   /* the entries.  VTS__new allocates sizeTS+1
                           slots and stamps a magic value into the
                           extra one, ts[sizeTS] */
   }
   VTS;
2129
2130 /* Allocate a VTS capable of storing 'sizeTS' entries. */
2131 static VTS* VTS__new ( const HChar* who, UInt sizeTS );
2132
2133 /* Make a clone of 'vts', sizing the new array to exactly match the
2134    number of ScalarTSs present. */
2135 static VTS* VTS__clone ( const HChar* who, VTS* vts );
2136
/* Make a clone of 'vts' with the thrids in 'thridsToDel' removed.  The
   new array is sized exactly to hold the number of required elements.
   'thridsToDel' is an array of ThrIDs to be omitted in the clone, and
   must be in strictly increasing order. */
2141 static VTS* VTS__subtract ( const HChar* who, VTS* vts, XArray* thridsToDel );
2142
2143 /* Delete this VTS in its entirety. */
2144 static void VTS__delete ( VTS* vts );
2145
2146 /* Create a new singleton VTS in 'out'.  Caller must have
2147    pre-allocated 'out' sufficiently big to hold the result in all
2148    possible cases. */
2149 static void VTS__singleton ( /*OUT*/VTS* out, Thr* thr, ULong tym );
2150
2151 /* Create in 'out' a VTS which is the same as 'vts' except with
2152    vts[me]++, so to speak.  Caller must have pre-allocated 'out'
2153    sufficiently big to hold the result in all possible cases. */
2154 static void VTS__tick ( /*OUT*/VTS* out, Thr* me, VTS* vts );
2155
2156 /* Create in 'out' a VTS which is the join (max) of 'a' and
2157    'b'. Caller must have pre-allocated 'out' sufficiently big to hold
2158    the result in all possible cases. */
2159 static void VTS__join ( /*OUT*/VTS* out, VTS* a, VTS* b );
2160
2161 /* Compute the partial ordering relation of the two args.  Although we
2162    could be completely general and return an enumeration value (EQ,
2163    LT, GT, UN), in fact we only need LEQ, and so we may as well
2164    hardwire that fact.
2165
   Returns zero iff LEQ(A,B), or a valid ThrID if not (zero is an
   invalid ThrID).  In the latter case, the returned ThrID indicates
2168    the discovered point for which they are not.  There may be more
2169    than one such point, but we only care about seeing one of them, not
2170    all of them.  This rather strange convention is used because
2171    sometimes we want to know the actual index at which they first
2172    differ. */
2173 static UInt VTS__cmpLEQ ( VTS* a, VTS* b );
2174
2175 /* Compute an arbitrary structural (total) ordering on the two args,
2176    based on their VCs, so they can be looked up in a table, tree, etc.
2177    Returns -1, 0 or 1. */
2178 static Word VTS__cmp_structural ( VTS* a, VTS* b );
2179
2180 /* Debugging only.  Display the given VTS. */
2181 static void VTS__show ( const VTS* vts );
2182
2183 /* Debugging only.  Return vts[index], so to speak. */
2184 static ULong VTS__indexAt_SLOW ( VTS* vts, Thr* idx );
2185
2186 /* Notify the VTS machinery that a thread has been declared
2187    comprehensively dead: that is, it has done an async exit AND it has
2188    been joined with.  This should ensure that its local clocks (.viR
2189    and .viW) will never again change, and so all mentions of this
2190    thread from all VTSs in the system may be removed. */
2191 static void VTS__declare_thread_very_dead ( Thr* idx );
2192
2193 /*--------------- to do with Vector Timestamps ---------------*/
2194
2195 static Bool is_sane_VTS ( VTS* vts )
2196 {
2197    UWord     i, n;
2198    ScalarTS  *st1, *st2;
2199    if (!vts) return False;
2200    if (vts->usedTS > vts->sizeTS) return False;
2201    n = vts->usedTS;
2202    if (n == 1) {
2203       st1 = &vts->ts[0];
2204       if (st1->tym == 0)
2205          return False;
2206    }
2207    else
2208    if (n >= 2) {
2209       for (i = 0; i < n-1; i++) {
2210          st1 = &vts->ts[i];
2211          st2 = &vts->ts[i+1];
2212          if (st1->thrid >= st2->thrid)
2213             return False;
2214          if (st1->tym == 0 || st2->tym == 0)
2215             return False;
2216       }
2217    }
2218    return True;
2219 }
2220
2221
2222 /* Create a new, empty VTS.
2223 */
2224 static VTS* VTS__new ( const HChar* who, UInt sizeTS )
2225 {
2226    VTS* vts = HG_(zalloc)(who, sizeof(VTS) + (sizeTS+1) * sizeof(ScalarTS));
2227    tl_assert(vts->usedTS == 0);
2228    vts->sizeTS = sizeTS;
2229    *(ULong*)(&vts->ts[sizeTS]) = 0x0ddC0ffeeBadF00dULL;
2230    return vts;
2231 }
2232
2233 /* Clone this VTS.
2234 */
2235 static VTS* VTS__clone ( const HChar* who, VTS* vts )
2236 {
2237    tl_assert(vts);
2238    tl_assert( *(ULong*)(&vts->ts[vts->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
2239    UInt nTS = vts->usedTS;
2240    VTS* clone = VTS__new(who, nTS);
2241    clone->id = vts->id;
2242    clone->sizeTS = nTS;
2243    clone->usedTS = nTS;
2244    UInt i;
2245    for (i = 0; i < nTS; i++) {
2246       clone->ts[i] = vts->ts[i];
2247    }
2248    tl_assert( *(ULong*)(&clone->ts[clone->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
2249    return clone;
2250 }
2251
2252
2253 /* Make a clone of a VTS with specified ThrIDs removed.  'thridsToDel'
2254    must be in strictly increasing order.  We could obviously do this
2255    much more efficiently (in linear time) if necessary.
2256 */
2257 static VTS* VTS__subtract ( const HChar* who, VTS* vts, XArray* thridsToDel )
2258 {
2259    UInt i, j;
2260    tl_assert(vts);
2261    tl_assert(thridsToDel);
2262    tl_assert( *(ULong*)(&vts->ts[vts->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
2263    UInt nTS = vts->usedTS;
2264    /* Figure out how many ScalarTSs will remain in the output. */
2265    UInt nReq = nTS;
2266    for (i = 0; i < nTS; i++) {
2267       ThrID thrid = vts->ts[i].thrid;
2268       if (VG_(lookupXA)(thridsToDel, &thrid, NULL, NULL))
2269          nReq--;
2270    }
2271    tl_assert(nReq <= nTS);
2272    /* Copy the ones that will remain. */
2273    VTS* res = VTS__new(who, nReq);
2274    j = 0;
2275    for (i = 0; i < nTS; i++) {
2276       ThrID thrid = vts->ts[i].thrid;
2277       if (VG_(lookupXA)(thridsToDel, &thrid, NULL, NULL))
2278          continue;
2279       res->ts[j++] = vts->ts[i];
2280    }
2281    tl_assert(j == nReq);
2282    tl_assert(j == res->sizeTS);
2283    res->usedTS = j;
2284    tl_assert( *(ULong*)(&res->ts[j]) == 0x0ddC0ffeeBadF00dULL);
2285    return res;
2286 }
2287
2288
/* Free 'vts', which must be non-NULL.  Before freeing, check that
   usedTS does not exceed sizeTS and that the guard value VTS__new
   wrote into ts[sizeTS] is still intact.
*/
static void VTS__delete ( VTS* vts )
{
   tl_assert(vts);
   tl_assert(vts->usedTS <= vts->sizeTS);
   /* guard slot just past the last usable entry */
   tl_assert( *(ULong*)(&vts->ts[vts->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
   HG_(free)(vts);
}
2298
2299
2300 /* Create a new singleton VTS.
2301 */
2302 static void VTS__singleton ( /*OUT*/VTS* out, Thr* thr, ULong tym )
2303 {
2304    tl_assert(thr);
2305    tl_assert(tym >= 1);
2306    tl_assert(out);
2307    tl_assert(out->usedTS == 0);
2308    tl_assert(out->sizeTS >= 1);
2309    UInt hi = out->usedTS++;
2310    out->ts[hi].thrid = Thr__to_ThrID(thr);
2311    out->ts[hi].tym   = tym;
2312 }
2313
2314
2315 /* Return a new VTS in which vts[me]++, so to speak.  'vts' itself is
2316    not modified.
2317 */
2318 static void VTS__tick ( /*OUT*/VTS* out, Thr* me, VTS* vts )
2319 {
2320    UInt      i, n;
2321    ThrID     me_thrid;
2322    Bool      found = False;
2323
2324    stats__vts__tick++;
2325
2326    tl_assert(out);
2327    tl_assert(out->usedTS == 0);
2328    if (vts->usedTS >= ThrID_MAX_VALID)
2329       scalarts_limitations_fail_NORETURN( True/*due_to_nThrs*/ );
2330    tl_assert(out->sizeTS >= 1 + vts->usedTS);
2331
2332    tl_assert(me);
2333    me_thrid = Thr__to_ThrID(me);
2334    tl_assert(is_sane_VTS(vts));
2335    n = vts->usedTS;
2336
2337    /* Copy all entries which precede 'me'. */
2338    for (i = 0; i < n; i++) {
2339       ScalarTS* here = &vts->ts[i];
2340       if (UNLIKELY(here->thrid >= me_thrid))
2341          break;
2342       UInt hi = out->usedTS++;
2343       out->ts[hi] = *here;
2344    }
2345
2346    /* 'i' now indicates the next entry to copy, if any.
2347        There are 3 possibilities:
2348        (a) there is no next entry (we used them all up already):
2349            add (me_thrid,1) to the output, and quit
2350        (b) there is a next entry, and its thrid > me_thrid:
2351            add (me_thrid,1) to the output, then copy the remaining entries
2352        (c) there is a next entry, and its thrid == me_thrid:
2353            copy it to the output but increment its timestamp value.
2354            Then copy the remaining entries.  (c) is the common case.
2355    */
2356    tl_assert(i >= 0 && i <= n);
2357    if (i == n) { /* case (a) */
2358       UInt hi = out->usedTS++;
2359       out->ts[hi].thrid = me_thrid;
2360       out->ts[hi].tym   = 1;
2361    } else {
2362       /* cases (b) and (c) */
2363       ScalarTS* here = &vts->ts[i];
2364       if (me_thrid == here->thrid) { /* case (c) */
2365          if (UNLIKELY(here->tym >= (1ULL << SCALARTS_N_TYMBITS) - 2ULL)) {
2366             /* We're hosed.  We have to stop. */
2367             scalarts_limitations_fail_NORETURN( False/*!due_to_nThrs*/ );
2368          }
2369          UInt hi = out->usedTS++;
2370          out->ts[hi].thrid = here->thrid;
2371          out->ts[hi].tym   = here->tym + 1;
2372          i++;
2373          found = True;
2374       } else { /* case (b) */
2375          UInt hi = out->usedTS++;
2376          out->ts[hi].thrid = me_thrid;
2377          out->ts[hi].tym   = 1;
2378       }
2379       /* And copy any remaining entries. */
2380       for (/*keepgoing*/; i < n; i++) {
2381          ScalarTS* here2 = &vts->ts[i];
2382          UInt hi = out->usedTS++;
2383          out->ts[hi] = *here2;
2384       }
2385    }
2386
2387    tl_assert(is_sane_VTS(out));
2388    tl_assert(out->usedTS == vts->usedTS + (found ? 0 : 1));
2389    tl_assert(out->usedTS <= out->sizeTS);
2390 }
2391
2392
2393 /* Return a new VTS constructed as the join (max) of the 2 args.
2394    Neither arg is modified.
2395 */
2396 static void VTS__join ( /*OUT*/VTS* out, VTS* a, VTS* b )
2397 {
2398    UInt     ia, ib, useda, usedb;
2399    ULong    tyma, tymb, tymMax;
2400    ThrID    thrid;
2401    UInt     ncommon = 0;
2402
2403    stats__vts__join++;
2404
2405    tl_assert(a);
2406    tl_assert(b);
2407    useda = a->usedTS;
2408    usedb = b->usedTS;
2409
2410    tl_assert(out);
2411    tl_assert(out->usedTS == 0);
2412    /* overly conservative test, but doing better involves comparing
2413       the two VTSs, which we don't want to do at this point. */
2414    if (useda + usedb >= ThrID_MAX_VALID)
2415       scalarts_limitations_fail_NORETURN( True/*due_to_nThrs*/ );
2416    tl_assert(out->sizeTS >= useda + usedb);
2417
2418    ia = ib = 0;
2419
2420    while (1) {
2421
2422       /* This logic is to enumerate triples (thrid, tyma, tymb) drawn
2423          from a and b in order, where thrid is the next ThrID
2424          occurring in either a or b, and tyma/b are the relevant
2425          scalar timestamps, taking into account implicit zeroes. */
2426       tl_assert(ia >= 0 && ia <= useda);
2427       tl_assert(ib >= 0 && ib <= usedb);
2428
2429       if        (ia == useda && ib == usedb) {
2430          /* both empty - done */
2431          break;
2432
2433       } else if (ia == useda && ib != usedb) {
2434          /* a empty, use up b */
2435          ScalarTS* tmpb = &b->ts[ib];
2436          thrid = tmpb->thrid;
2437          tyma  = 0;
2438          tymb  = tmpb->tym;
2439          ib++;
2440
2441       } else if (ia != useda && ib == usedb) {
2442          /* b empty, use up a */
2443          ScalarTS* tmpa = &a->ts[ia];
2444          thrid = tmpa->thrid;
2445          tyma  = tmpa->tym;
2446          tymb  = 0;
2447          ia++;
2448
2449       } else {
2450          /* both not empty; extract lowest-ThrID'd triple */
2451          ScalarTS* tmpa = &a->ts[ia];
2452          ScalarTS* tmpb = &b->ts[ib];
2453          if (tmpa->thrid < tmpb->thrid) {
2454             /* a has the lowest unconsidered ThrID */
2455             thrid = tmpa->thrid;
2456             tyma  = tmpa->tym;
2457             tymb  = 0;
2458             ia++;
2459          } else if (tmpa->thrid > tmpb->thrid) {
2460             /* b has the lowest unconsidered ThrID */
2461             thrid = tmpb->thrid;
2462             tyma  = 0;
2463             tymb  = tmpb->tym;
2464             ib++;
2465          } else {
2466             /* they both next mention the same ThrID */
2467             tl_assert(tmpa->thrid == tmpb->thrid);
2468             thrid = tmpa->thrid; /* == tmpb->thrid */
2469             tyma  = tmpa->tym;
2470             tymb  = tmpb->tym;
2471             ia++;
2472             ib++;
2473             ncommon++;
2474          }
2475       }
2476
2477       /* having laboriously determined (thr, tyma, tymb), do something
2478          useful with it. */
2479       tymMax = tyma > tymb ? tyma : tymb;
2480       if (tymMax > 0) {
2481          UInt hi = out->usedTS++;
2482          out->ts[hi].thrid = thrid;
2483          out->ts[hi].tym   = tymMax;
2484       }
2485
2486    }
2487
2488    tl_assert(is_sane_VTS(out));
2489    tl_assert(out->usedTS <= out->sizeTS);
2490    tl_assert(out->usedTS == useda + usedb - ncommon);
2491 }
2492
2493
2494 /* Determine if 'a' <= 'b', in the partial ordering.  Returns zero if
2495    they are, or the first ThrID for which they are not (no valid ThrID
2496    has the value zero).  This rather strange convention is used
2497    because sometimes we want to know the actual index at which they
2498    first differ. */
2499 static UInt/*ThrID*/ VTS__cmpLEQ ( VTS* a, VTS* b )
2500 {
2501    Word  ia, ib, useda, usedb;
2502    ULong tyma, tymb;
2503
2504    stats__vts__cmpLEQ++;
2505
2506    tl_assert(a);
2507    tl_assert(b);
2508    useda = a->usedTS;
2509    usedb = b->usedTS;
2510
2511    ia = ib = 0;
2512
2513    while (1) {
2514
2515       /* This logic is to enumerate doubles (tyma, tymb) drawn
2516          from a and b in order, and tyma/b are the relevant
2517          scalar timestamps, taking into account implicit zeroes. */
2518       ThrID thrid;
2519
2520       tl_assert(ia >= 0 && ia <= useda);
2521       tl_assert(ib >= 0 && ib <= usedb);
2522
2523       if        (ia == useda && ib == usedb) {
2524          /* both empty - done */
2525          break;
2526
2527       } else if (ia == useda && ib != usedb) {
2528          /* a empty, use up b */
2529          ScalarTS* tmpb = &b->ts[ib];
2530          tyma  = 0;
2531          tymb  = tmpb->tym;
2532          thrid = tmpb->thrid;
2533          ib++;
2534
2535       } else if (ia != useda && ib == usedb) {
2536          /* b empty, use up a */
2537          ScalarTS* tmpa = &a->ts[ia];
2538          tyma  = tmpa->tym;
2539          thrid = tmpa->thrid;
2540          tymb  = 0;
2541          ia++;
2542
2543       } else {
2544          /* both not empty; extract lowest-ThrID'd triple */
2545          ScalarTS* tmpa = &a->ts[ia];
2546          ScalarTS* tmpb = &b->ts[ib];
2547          if (tmpa->thrid < tmpb->thrid) {
2548             /* a has the lowest unconsidered ThrID */
2549             tyma  = tmpa->tym;
2550             thrid = tmpa->thrid;
2551             tymb  = 0;
2552             ia++;
2553          }
2554          else
2555          if (tmpa->thrid > tmpb->thrid) {
2556             /* b has the lowest unconsidered ThrID */
2557             tyma  = 0;
2558             tymb  = tmpb->tym;
2559             thrid = tmpb->thrid;
2560             ib++;
2561          } else {
2562             /* they both next mention the same ThrID */
2563             tl_assert(tmpa->thrid == tmpb->thrid);
2564             tyma  = tmpa->tym;
2565             thrid = tmpa->thrid;
2566             tymb  = tmpb->tym;
2567             ia++;
2568             ib++;
2569          }
2570       }
2571
2572       /* having laboriously determined (tyma, tymb), do something
2573          useful with it. */
2574       if (tyma > tymb) {
2575          /* not LEQ at this index.  Quit, since the answer is
2576             determined already. */
2577          tl_assert(thrid >= 1024);
2578          return thrid;
2579       }
2580    }
2581
2582    return 0; /* all points are LEQ => return an invalid ThrID */
2583 }
2584
2585
2586 /* Compute an arbitrary structural (total) ordering on the two args,
2587    based on their VCs, so they can be looked up in a table, tree, etc.
2588    Returns -1, 0 or 1.  (really just 'deriving Ord' :-) This can be
2589    performance critical so there is some effort expended to make it sa
2590    fast as possible.
2591 */
2592 Word VTS__cmp_structural ( VTS* a, VTS* b )
2593 {
2594    /* We just need to generate an arbitrary total ordering based on
2595       a->ts and b->ts.  Preferably do it in a way which comes across likely
2596       differences relatively quickly. */
2597    Word     i;
2598    Word     useda = 0,    usedb = 0;
2599    ScalarTS *ctsa = NULL, *ctsb = NULL;
2600
2601    stats__vts__cmp_structural++;
2602
2603    tl_assert(a);
2604    tl_assert(b);
2605
2606    ctsa = &a->ts[0]; useda = a->usedTS;
2607    ctsb = &b->ts[0]; usedb = b->usedTS;
2608
2609    if (LIKELY(useda == usedb)) {
2610       ScalarTS *tmpa = NULL, *tmpb = NULL;
2611       stats__vts__cmp_structural_slow++;
2612       /* Same length vectors.  Find the first difference, if any, as
2613          fast as possible. */
2614       for (i = 0; i < useda; i++) {
2615          tmpa = &ctsa[i];
2616          tmpb = &ctsb[i];
2617          if (LIKELY(tmpa->tym == tmpb->tym
2618                     && tmpa->thrid == tmpb->thrid))
2619             continue;
2620          else
2621             break;
2622       }
2623       if (UNLIKELY(i == useda)) {
2624          /* They're identical. */
2625          return 0;
2626       } else {
2627          tl_assert(i >= 0 && i < useda);
2628          if (tmpa->tym < tmpb->tym) return -1;
2629          if (tmpa->tym > tmpb->tym) return 1;
2630          if (tmpa->thrid < tmpb->thrid) return -1;
2631          if (tmpa->thrid > tmpb->thrid) return 1;
2632          /* we just established them as non-identical, hence: */
2633       }
2634       /*NOTREACHED*/
2635       tl_assert(0);
2636    }
2637
2638    if (useda < usedb) return -1;
2639    if (useda > usedb) return 1;
2640    /*NOTREACHED*/
2641    tl_assert(0);
2642 }
2643
2644
2645 /* Debugging only.  Display the given VTS.
2646 */
2647 static void VTS__show ( const VTS* vts )
2648 {
2649    Word      i, n;
2650    tl_assert(vts);
2651
2652    VG_(printf)("[");
2653    n =  vts->usedTS;
2654    for (i = 0; i < n; i++) {
2655       const ScalarTS *st = &vts->ts[i];
2656       VG_(printf)(i < n-1 ? "%d:%llu " : "%d:%llu", st->thrid, (ULong)st->tym);
2657    }
2658    VG_(printf)("]");
2659 }
2660
2661
2662 /* Debugging only.  Return vts[index], so to speak.
2663 */
2664 ULong VTS__indexAt_SLOW ( VTS* vts, Thr* idx )
2665 {
2666    UWord i, n;
2667    ThrID idx_thrid = Thr__to_ThrID(idx);
2668    stats__vts__indexat_slow++;
2669    tl_assert(vts);
2670    n = vts->usedTS;
2671    for (i = 0; i < n; i++) {
2672       ScalarTS* st = &vts->ts[i];
2673       if (st->thrid == idx_thrid)
2674          return st->tym;
2675    }
2676    return 0;
2677 }
2678
2679
2680 /* See comment on prototype above.
2681 */
2682 static void VTS__declare_thread_very_dead ( Thr* thr )
2683 {
2684    if (0) VG_(printf)("VTQ:  tae %p\n", thr);
2685
2686    tl_assert(thr->llexit_done);
2687    tl_assert(thr->joinedwith_done);
2688
2689    ThrID nyu;
2690    nyu = Thr__to_ThrID(thr);
2691    VG_(addToXA)( verydead_thread_table_not_pruned, &nyu );
2692
2693    /* We can only get here if we're assured that we'll never again
2694       need to look at this thread's ::viR or ::viW.  Set them to
2695       VtsID_INVALID, partly so as to avoid holding on to the VTSs, but
2696       mostly so that we don't wind up pruning them (as that would be
2697       nonsensical: the only interesting ScalarTS entry for a dead
2698       thread is its own index, and the pruning will remove that.). */
2699    VtsID__rcdec(thr->viR);
2700    VtsID__rcdec(thr->viW);
2701    thr->viR = VtsID_INVALID;
2702    thr->viW = VtsID_INVALID;
2703 }
2704
2705
2706 /////////////////////////////////////////////////////////////////
2707 /////////////////////////////////////////////////////////////////
2708 //                                                             //
2709 // SECTION END vts primitives                                  //
2710 //                                                             //
2711 /////////////////////////////////////////////////////////////////
2712 /////////////////////////////////////////////////////////////////
2713
2714
2715
2716 /////////////////////////////////////////////////////////////////
2717 /////////////////////////////////////////////////////////////////
2718 //                                                             //
2719 // SECTION BEGIN main library                                  //
2720 //                                                             //
2721 /////////////////////////////////////////////////////////////////
2722 /////////////////////////////////////////////////////////////////
2723
2724
2725 /////////////////////////////////////////////////////////
2726 //                                                     //
2727 // VTS set                                             //
2728 //                                                     //
2729 /////////////////////////////////////////////////////////
2730
2731 static WordFM* /* WordFM VTS* void */ vts_set = NULL;
2732
2733 static void vts_set_init ( void )
2734 {
2735    tl_assert(!vts_set);
2736    vts_set = VG_(newFM)( HG_(zalloc), "libhb.vts_set_init.1",
2737                          HG_(free),
2738                          (Word(*)(UWord,UWord))VTS__cmp_structural );
2739 }
2740
2741 /* Given a VTS, look in vts_set to see if we already have a
2742    structurally identical one.  If yes, return the pair (True, pointer
2743    to the existing one).  If no, clone this one, add the clone to the
2744    set, and return (False, pointer to the clone). */
2745 static Bool vts_set__find__or__clone_and_add ( /*OUT*/VTS** res, VTS* cand )
2746 {
2747    UWord keyW, valW;
2748    stats__vts_set__focaa++;
2749    tl_assert(cand->id == VtsID_INVALID);
2750    /* lookup cand (by value) */
2751    if (VG_(lookupFM)( vts_set, &keyW, &valW, (UWord)cand )) {
2752       /* found it */
2753       tl_assert(valW == 0);
2754       /* if this fails, cand (by ref) was already present (!) */
2755       tl_assert(keyW != (UWord)cand);
2756       *res = (VTS*)keyW;
2757       return True;
2758    } else {
2759       /* not present.  Clone, add and return address of clone. */
2760       stats__vts_set__focaa_a++;
2761       VTS* clone = VTS__clone( "libhb.vts_set_focaa.1", cand );
2762       tl_assert(clone != cand);
2763       VG_(addToFM)( vts_set, (UWord)clone, 0/*val is unused*/ );
2764       *res = clone;
2765       return False;
2766    }
2767 }
2768
2769
2770 /////////////////////////////////////////////////////////
2771 //                                                     //
2772 // VTS table                                           //
2773 //                                                     //
2774 /////////////////////////////////////////////////////////
2775
2776 static void VtsID__invalidate_caches ( void ); /* fwds */
2777
2778 /* A type to hold VTS table entries.  Invariants:
2779    If .vts == NULL, then this entry is not in use, so:
2780    - .rc == 0
2781    - this entry is on the freelist (unfortunately, does not imply
2782      any constraints on value for u.freelink)
2783    If .vts != NULL, then this entry is in use:
2784    - .vts is findable in vts_set
2785    - .vts->id == this entry number
2786    - no specific value for .rc (even 0 is OK)
2787    - this entry is not on freelist, so u.freelink == VtsID_INVALID
2788 */
2789 typedef
2790    struct {
2791       VTS*  vts;      /* vts, in vts_set */
2792       UWord rc;       /* reference count - enough for entire aspace */
2793       union {
2794          VtsID freelink; /* chain for free entries, VtsID_INVALID at end */
2795          VtsID remap;    /* used only during pruning, for used entries */
2796       } u;
2797       /* u.freelink only used when vts == NULL,
2798          u.remap only used when vts != NULL, during pruning. */
2799    }
2800    VtsTE;
2801
2802 /* The VTS table. */
2803 static XArray* /* of VtsTE */ vts_tab = NULL;
2804
2805 /* An index into the VTS table, indicating the start of the list of
2806    free (available for use) entries.  If the list is empty, this is
2807    VtsID_INVALID. */
2808 static VtsID vts_tab_freelist = VtsID_INVALID;
2809
2810 /* Do a GC of vts_tab when the freelist becomes empty AND the size of
2811    vts_tab equals or exceeds this size.  After GC, the value here is
2812    set appropriately so as to check for the next GC point. */
2813 static Word vts_next_GC_at = 1000;
2814
2815 static void vts_tab_init ( void )
2816 {
2817    vts_tab = VG_(newXA)( HG_(zalloc), "libhb.vts_tab_init.1",
2818                          HG_(free), sizeof(VtsTE) );
2819    vts_tab_freelist = VtsID_INVALID;
2820 }
2821
2822 /* Add ii to the free list, checking that it looks out-of-use. */
2823 static void add_to_free_list ( VtsID ii )
2824 {
2825    VtsTE* ie = VG_(indexXA)( vts_tab, ii );
2826    tl_assert(ie->vts == NULL);
2827    tl_assert(ie->rc == 0);
2828    tl_assert(ie->u.freelink == VtsID_INVALID);
2829    ie->u.freelink = vts_tab_freelist;
2830    vts_tab_freelist = ii;
2831 }
2832
2833 /* Get an entry from the free list.  This will return VtsID_INVALID if
2834    the free list is empty. */
2835 static VtsID get_from_free_list ( void )
2836 {
2837    VtsID  ii;
2838    VtsTE* ie;
2839    if (vts_tab_freelist == VtsID_INVALID)
2840       return VtsID_INVALID;
2841    ii = vts_tab_freelist;
2842    ie = VG_(indexXA)( vts_tab, ii );
2843    tl_assert(ie->vts == NULL);
2844    tl_assert(ie->rc == 0);
2845    vts_tab_freelist = ie->u.freelink;
2846    return ii;
2847 }
2848
2849 /* Produce a new VtsID that can be used, either by getting it from
2850    the freelist, or, if that is empty, by expanding vts_tab. */
2851 static VtsID get_new_VtsID ( void )
2852 {
2853    VtsID ii;
2854    VtsTE te;
2855    ii = get_from_free_list();
2856    if (ii != VtsID_INVALID)
2857       return ii;
2858    te.vts = NULL;
2859    te.rc = 0;
2860    te.u.freelink = VtsID_INVALID;
2861    ii = (VtsID)VG_(addToXA)( vts_tab, &te );
2862    return ii;
2863 }
2864
2865
2866 /* Indirect callback from lib_zsm. */
2867 static void VtsID__rcinc ( VtsID ii )
2868 {
2869    VtsTE* ie;
2870    /* VG_(indexXA) does a range check for us */
2871    ie = VG_(indexXA)( vts_tab, ii );
2872    tl_assert(ie->vts); /* else it's not in use */
2873    tl_assert(ie->rc < ~0UL); /* else we can't continue */
2874    tl_assert(ie->vts->id == ii);
2875    ie->rc++;
2876 }
2877
2878 /* Indirect callback from lib_zsm. */
2879 static void VtsID__rcdec ( VtsID ii )
2880 {
2881    VtsTE* ie;
2882    /* VG_(indexXA) does a range check for us */
2883    ie = VG_(indexXA)( vts_tab, ii );
2884    tl_assert(ie->vts); /* else it's not in use */
2885    tl_assert(ie->rc > 0); /* else RC snafu */
2886    tl_assert(ie->vts->id == ii);
2887    ie->rc--;
2888 }
2889
2890
2891 /* Look up 'cand' in our collection of VTSs.  If present, return the
2892    VtsID for the pre-existing version.  If not present, clone it, add
2893    the clone to both vts_tab and vts_set, allocate a fresh VtsID for
2894    it, and return that. */
2895 static VtsID vts_tab__find__or__clone_and_add ( VTS* cand )
2896 {
2897    VTS* in_tab = NULL;
2898    tl_assert(cand->id == VtsID_INVALID);
2899    Bool already_have = vts_set__find__or__clone_and_add( &in_tab, cand );
2900    tl_assert(in_tab);
2901    if (already_have) {
2902       /* We already have a copy of 'cand'.  Use that. */
2903       VtsTE* ie;
2904       tl_assert(in_tab->id != VtsID_INVALID);
2905       ie = VG_(indexXA)( vts_tab, in_tab->id );
2906       tl_assert(ie->vts == in_tab);
2907       return in_tab->id;
2908    } else {
2909       VtsID  ii = get_new_VtsID();
2910       VtsTE* ie = VG_(indexXA)( vts_tab, ii );
2911       ie->vts = in_tab;
2912       ie->rc = 0;
2913       ie->u.freelink = VtsID_INVALID;
2914       in_tab->id = ii;
2915       return ii;
2916    }
2917 }
2918
2919
2920 static void show_vts_stats ( const HChar* caller )
2921 {
2922    UWord nSet, nTab, nLive;
2923    ULong totrc;
2924    UWord n, i;
2925    nSet = VG_(sizeFM)( vts_set );
2926    nTab = VG_(sizeXA)( vts_tab );
2927    totrc = 0;
2928    nLive = 0;
2929    n = VG_(sizeXA)( vts_tab );
2930    for (i = 0; i < n; i++) {
2931       VtsTE* ie = VG_(indexXA)( vts_tab, i );
2932       if (ie->vts) {
2933          nLive++;
2934          totrc += (ULong)ie->rc;
2935       } else {
2936          tl_assert(ie->rc == 0);
2937       }
2938    }
2939    VG_(printf)("  show_vts_stats %s\n", caller);
2940    VG_(printf)("    vts_tab size %4lu\n", nTab);
2941    VG_(printf)("    vts_tab live %4lu\n", nLive);
2942    VG_(printf)("    vts_set size %4lu\n", nSet);
2943    VG_(printf)("        total rc %4llu\n", totrc);
2944 }
2945
2946
2947 /* --- Helpers for VtsID pruning --- */
2948
2949 static
2950 void remap_VtsID ( /*MOD*/XArray* /* of VtsTE */ old_tab,
2951                    /*MOD*/XArray* /* of VtsTE */ new_tab,
2952                    VtsID* ii )
2953 {
2954    VtsTE *old_te, *new_te;
2955    VtsID old_id, new_id;
2956    /* We're relying here on VG_(indexXA)'s range checking to assert on
2957       any stupid values, in particular *ii == VtsID_INVALID. */
2958    old_id = *ii;
2959    old_te = VG_(indexXA)( old_tab, old_id );
2960    old_te->rc--;
2961    new_id = old_te->u.remap;
2962    new_te = VG_(indexXA)( new_tab, new_id );
2963    new_te->rc++;
2964    *ii = new_id;
2965 }
2966
2967 static
2968 void remap_VtsIDs_in_SVal ( /*MOD*/XArray* /* of VtsTE */ old_tab,
2969                             /*MOD*/XArray* /* of VtsTE */ new_tab,
2970                             SVal* s )
2971 {
2972    SVal old_sv, new_sv;
2973    old_sv = *s;
2974    if (SVal__isC(old_sv)) {
2975       VtsID rMin, wMin;
2976       rMin = SVal__unC_Rmin(old_sv);
2977       wMin = SVal__unC_Wmin(old_sv);
2978       remap_VtsID( old_tab, new_tab, &rMin );
2979       remap_VtsID( old_tab, new_tab, &wMin );
2980       new_sv = SVal__mkC( rMin, wMin );
2981       *s = new_sv;
2982   }
2983 }
2984
2985
2986 /* NOT TO BE CALLED FROM WITHIN libzsm. */
2987 __attribute__((noinline))
2988 static void vts_tab__do_GC ( Bool show_stats )
2989 {
2990    UWord i, nTab, nLive, nFreed;
2991
2992    /* ---------- BEGIN VTS GC ---------- */
2993    /* check this is actually necessary. */
2994    tl_assert(vts_tab_freelist == VtsID_INVALID);
2995
2996    /* empty the caches for partial order checks and binary joins.  We
2997       could do better and prune out the entries to be deleted, but it
2998       ain't worth the hassle. */
2999    VtsID__invalidate_caches();
3000
3001    /* First, make the reference counts up to date. */
3002    zsm_flush_cache();
3003
3004    nTab = VG_(sizeXA)( vts_tab );
3005
3006    if (show_stats) {
3007       VG_(printf)("<<GC begins at vts_tab size %lu>>\n", nTab);
3008       show_vts_stats("before GC");
3009    }
3010
3011    /* Now we can inspect the entire vts_tab.  Any entries with zero
3012       .rc fields are now no longer in use and can be put back on the
3013       free list, removed from vts_set, and deleted. */
3014    nFreed = 0;
3015    for (i = 0; i < nTab; i++) {
3016       Bool present;
3017       UWord oldK = 0, oldV = 12345;
3018       VtsTE* te = VG_(indexXA)( vts_tab, i );
3019       if (te->vts == NULL) {
3020          tl_assert(te->rc == 0);
3021          continue; /* already on the free list (presumably) */
3022       }
3023       if (te->rc > 0)
3024          continue; /* in use */
3025       /* Ok, we got one we can free. */
3026       tl_assert(te->vts->id == i);
3027       /* first, remove it from vts_set. */
3028       present = VG_(delFromFM)( vts_set,
3029                                 &oldK, &oldV, (UWord)te->vts );
3030       tl_assert(present); /* else it isn't in vts_set ?! */
3031       tl_assert(oldV == 0); /* no info stored in vts_set val fields */
3032       tl_assert(oldK == (UWord)te->vts); /* else what did delFromFM find?! */
3033       /* now free the VTS itself */
3034       VTS__delete(te->vts);
3035       te->vts = NULL;
3036       /* and finally put this entry on the free list */
3037       tl_assert(te->u.freelink == VtsID_INVALID); /* can't already be on it */
3038       add_to_free_list( i );
3039       nFreed++;
3040    }
3041
3042    /* Now figure out when the next GC should be.  We'll allow the
3043       number of VTSs to double before GCing again.  Except of course
3044       that since we can't (or, at least, don't) shrink vts_tab, we
3045       can't set the threshold value smaller than it. */
3046    tl_assert(nFreed <= nTab);
3047    nLive = nTab - nFreed;
3048    tl_assert(nLive >= 0 && nLive <= nTab);
3049    vts_next_GC_at = 2 * nLive;
3050    if (vts_next_GC_at < nTab)
3051       vts_next_GC_at = nTab;
3052
3053    if (show_stats) {
3054       show_vts_stats("after GC");
3055       VG_(printf)("<<GC ends, next gc at %ld>>\n", vts_next_GC_at);
3056    }
3057
3058    stats__vts_tab_GC++;
3059    if (VG_(clo_stats)) {
3060       tl_assert(nTab > 0);
3061       VG_(message)(Vg_DebugMsg,
3062                    "libhb: VTS GC: #%lu  old size %lu  live %lu  (%2llu%%)\n",
3063                    stats__vts_tab_GC,
3064                    nTab, nLive, (100ULL * (ULong)nLive) / (ULong)nTab);
3065    }
3066    /* ---------- END VTS GC ---------- */
3067
3068    /* Decide whether to do VTS pruning.  We have one of three
3069       settings. */
3070    static UInt pruning_auto_ctr = 0; /* do not make non-static */
3071
3072    Bool do_pruning = False;
3073    switch (HG_(clo_vts_pruning)) {
3074       case 0: /* never */
3075          break;
3076       case 1: /* auto */
3077          do_pruning = (++pruning_auto_ctr % 5) == 0;
3078          break;
3079       case 2: /* always */
3080          do_pruning = True;
3081          break;
3082       default:
3083          tl_assert(0);
3084    }
3085
3086    /* The rest of this routine only handles pruning, so we can
3087       quit at this point if it is not to be done. */
3088    if (!do_pruning)
3089       return;
3090    /* No need to do pruning if no thread died since the last pruning as
3091       no VtsTE can be pruned. */
3092    if (VG_(sizeXA)( verydead_thread_table_not_pruned) == 0)
3093       return;
3094
3095    /* ---------- BEGIN VTS PRUNING ---------- */
3096    /* Sort and check the very dead threads that died since the last pruning.
3097       Sorting is used for the check and so that we can quickly look
3098       up the dead-thread entries as we work through the VTSs. */
3099    verydead_thread_table_sort_and_check (verydead_thread_table_not_pruned);
3100
3101    /* We will run through the old table, and create a new table and
3102       set, at the same time setting the u.remap entries in the old
3103       table to point to the new entries.  Then, visit every VtsID in
3104       the system, and replace all of them with new ones, using the
3105       u.remap entries in the old table.  Finally, we can delete the old
3106       table and set. */
3107
3108    XArray* /* of VtsTE */ new_tab
3109       = VG_(newXA)( HG_(zalloc), "libhb.vts_tab__do_GC.new_tab",
3110                     HG_(free), sizeof(VtsTE) );
3111
3112    /* WordFM VTS* void */
3113    WordFM* new_set
3114       = VG_(newFM)( HG_(zalloc), "libhb.vts_tab__do_GC.new_set",
3115                     HG_(free),
3116                     (Word(*)(UWord,UWord))VTS__cmp_structural );
3117
3118    /* Visit each old VTS.  For each one:
3119
3120       * make a pruned version
3121
3122       * search new_set for the pruned version, yielding either
3123         Nothing (not present) or the new VtsID for it.
3124
3125       * if not present, allocate a new VtsID for it, insert (pruned
3126         VTS, new VtsID) in the tree, and set
3127         remap_table[old VtsID] = new VtsID.
3128
3129       * if present, set remap_table[old VtsID] = new VtsID, where
3130         new VtsID was determined by the tree lookup.  Then free up
3131         the clone.
3132    */
3133
3134    UWord nBeforePruning = 0, nAfterPruning = 0;
3135    UWord nSTSsBefore = 0, nSTSsAfter = 0;
3136    VtsID new_VtsID_ctr = 0;
3137
3138    for (i = 0; i < nTab; i++) {
3139
3140       /* For each old VTS .. */
3141       VtsTE* old_te  = VG_(indexXA)( vts_tab, i );
3142       VTS*   old_vts = old_te->vts;
3143
3144       /* Skip it if not in use */
3145       if (old_te->rc == 0) {
3146          tl_assert(old_vts == NULL);
3147          continue;
3148       }
3149       tl_assert(old_te->u.remap == VtsID_INVALID);
3150       tl_assert(old_vts != NULL);
3151       tl_assert(old_vts->id == i);
3152       tl_assert(old_vts->ts != NULL);
3153
3154       /* It is in use. Make a pruned version. */
3155       nBeforePruning++;
3156       nSTSsBefore += old_vts->usedTS;
3157       VTS* new_vts = VTS__subtract("libhb.vts_tab__do_GC.new_vts",
3158                                    old_vts, verydead_thread_table_not_pruned);
3159       tl_assert(new_vts->sizeTS == new_vts->usedTS);
3160       tl_assert(*(ULong*)(&new_vts->ts[new_vts->usedTS])
3161                 == 0x0ddC0ffeeBadF00dULL);
3162
3163       /* Get rid of the old VTS and the tree entry.  It's a bit more
3164          complex to incrementally delete the VTSs now than to nuke
3165          them all after we're done, but the upside is that we don't
3166          wind up temporarily storing potentially two complete copies
3167          of each VTS and hence spiking memory use. */
3168       UWord oldK = 0, oldV = 12345;
3169       Bool  present = VG_(delFromFM)( vts_set,
3170                                       &oldK, &oldV, (UWord)old_vts );
3171       tl_assert(present); /* else it isn't in vts_set ?! */
3172       tl_assert(oldV == 0); /* no info stored in vts_set val fields */
3173       tl_assert(oldK == (UWord)old_vts); /* else what did delFromFM find?! */
3174       /* now free the VTS itself */
3175       VTS__delete(old_vts);
3176       old_te->vts = NULL;
3177       old_vts = NULL;
3178
3179       /* NO MENTIONS of old_vts allowed beyond this point. */
3180
3181       /* Ok, we have the pruned copy in new_vts.  See if a
3182          structurally identical version is already present in new_set.
3183          If so, delete the one we just made and move on; if not, add
3184          it. */
3185       VTS*  identical_version = NULL;
3186       UWord valW = 12345;
3187       if (VG_(lookupFM)(new_set, (UWord*)&identical_version, &valW,
3188                         (UWord)new_vts)) {
3189          // already have it
3190          tl_assert(valW == 0);
3191          tl_assert(identical_version != NULL);
3192          tl_assert(identical_version != new_vts);
3193          VTS__delete(new_vts);
3194          new_vts = identical_version;
3195          tl_assert(new_vts->id != VtsID_INVALID);
3196       } else {
3197          tl_assert(valW == 12345);
3198          tl_assert(identical_version == NULL);
3199          new_vts->id = new_VtsID_ctr++;
3200          Bool b = VG_(addToFM)(new_set, (UWord)new_vts, 0);
3201          tl_assert(!b);
3202          VtsTE new_te;
3203          new_te.vts      = new_vts;
3204          new_te.rc       = 0;
3205          new_te.u.freelink = VtsID_INVALID;
3206          Word j = VG_(addToXA)( new_tab, &new_te );
3207          tl_assert(j <= i);
3208          tl_assert(j == new_VtsID_ctr - 1);
3209          // stats
3210          nAfterPruning++;
3211          nSTSsAfter += new_vts->usedTS;
3212       }
3213       old_te->u.remap = new_vts->id;
3214
3215    } /* for (i = 0; i < nTab; i++) */
3216
3217    /* Move very dead thread from verydead_thread_table_not_pruned to
3218       verydead_thread_table. Sort and check verydead_thread_table
3219       to verify a thread was reported very dead only once. */
3220    {
3221       UWord nBT = VG_(sizeXA)( verydead_thread_table_not_pruned);
3222
3223       for (i = 0; i < nBT; i++) {
3224          ThrID thrid =
3225             *(ThrID*)VG_(indexXA)( verydead_thread_table_not_pruned, i );
3226          VG_(addToXA)( verydead_thread_table, &thrid );
3227       }
3228       verydead_thread_table_sort_and_check (verydead_thread_table);
3229       VG_(dropHeadXA) (verydead_thread_table_not_pruned, nBT);
3230    }
3231
3232    /* At this point, we have:
3233       * the old VTS table, with its u.remap entries set,
3234         and with all .vts == NULL.
      * the old VTS tree should be empty, since it and the old VTSs
        it contained have been incrementally deleted as we worked
        through the old table.
3238       * the new VTS table, with all .rc == 0, all u.freelink and u.remap
3239         == VtsID_INVALID.
3240       * the new VTS tree.
3241    */
3242    tl_assert( VG_(sizeFM)(vts_set) == 0 );
3243
3244    /* Now actually apply the mapping. */
3245    /* Visit all the VtsIDs in the entire system.  Where do we expect
3246       to find them?
3247       (a) in shadow memory -- the LineZs and LineFs
3248       (b) in our collection of struct _Thrs.
3249       (c) in our collection of struct _SOs.
3250       Nowhere else, AFAICS.  Not in the zsm cache, because that just
3251       got invalidated.
3252
3253       Using the u.remap fields in vts_tab, map each old VtsID to a new
3254       VtsID.  For each old VtsID, dec its rc; and for each new one,
3255       inc it.  This sets up the new refcounts, and it also gives a
3256       cheap sanity check of the old ones: all old refcounts should be
3257       zero after this operation.
3258    */
3259
3260    /* Do the mappings for (a) above: iterate over the Primary shadow
3261       mem map (WordFM Addr SecMap*). */
3262    UWord secmapW = 0;
3263    VG_(initIterFM)( map_shmem );
3264    while (VG_(nextIterFM)( map_shmem, NULL, &secmapW )) {
3265       UWord   j;
3266       SecMap* sm = (SecMap*)secmapW;
3267       tl_assert(sm->magic == SecMap_MAGIC);
3268       /* Deal with the LineZs */
3269       for (i = 0; i < N_SECMAP_ZLINES; i++) {
3270          LineZ* lineZ = &sm->linesZ[i];
3271          if (lineZ->dict[0] != SVal_INVALID) {
3272             for (j = 0; j < 4; j++)
3273                remap_VtsIDs_in_SVal(vts_tab, new_tab, &lineZ->dict[j]);
3274          } else {
3275             LineF* lineF = SVal2Ptr (lineZ->dict[1]);
3276             for (j = 0; j < N_LINE_ARANGE; j++)
3277                remap_VtsIDs_in_SVal(vts_tab, new_tab, &lineF->w64s[j]);
3278          }
3279       }
3280    }
3281    VG_(doneIterFM)( map_shmem );
3282
3283    /* Do the mappings for (b) above: visit our collection of struct
3284       _Thrs. */
3285    Thread* hgthread = get_admin_threads();
3286    tl_assert(hgthread);
3287    while (hgthread) {
3288       Thr* hbthr = hgthread->hbthr;
3289       tl_assert(hbthr);
3290       /* Threads that are listed in the prunable set have their viR
3291          and viW set to VtsID_INVALID, so we can't mess with them. */
3292       if (hbthr->llexit_done && hbthr->joinedwith_done) {
3293          tl_assert(hbthr->viR == VtsID_INVALID);
3294          tl_assert(hbthr->viW == VtsID_INVALID);
3295          hgthread = hgthread->admin;
3296          continue;
3297       }
3298       remap_VtsID( vts_tab, new_tab, &hbthr->viR );
3299       remap_VtsID( vts_tab, new_tab, &hbthr->viW );
3300       hgthread = hgthread->admin;
3301    }
3302
3303    /* Do the mappings for (c) above: visit the struct _SOs. */
3304    SO* so = admin_SO;
3305    while (so) {
3306       if (so->viR != VtsID_INVALID)
3307          remap_VtsID( vts_tab, new_tab, &so->viR );
3308       if (so->viW != VtsID_INVALID)
3309          remap_VtsID( vts_tab, new_tab, &so->viW );
3310       so = so->admin_next;
3311    }
3312
3313    /* So, we're nearly done (with this incredibly complex operation).
3314       Check the refcounts for the old VtsIDs all fell to zero, as
3315       expected.  Any failure is serious. */
3316    for (i = 0; i < nTab; i++) {
3317       VtsTE* te = VG_(indexXA)( vts_tab, i );
3318       tl_assert(te->vts == NULL);
3319       /* This is the assert proper.  Note we're also asserting
3320          zeroness for old entries which are unmapped.  That's OK. */
3321       tl_assert(te->rc == 0);
3322    }
3323
3324    /* Install the new table and set. */
3325    VG_(deleteFM)(vts_set, NULL/*kFin*/, NULL/*vFin*/);
3326    vts_set = new_set;
3327    VG_(deleteXA)( vts_tab );
3328    vts_tab = new_tab;
3329
3330    /* The freelist of vts_tab entries is empty now, because we've
3331       compacted all of the live entries at the low end of the
3332       table. */
3333    vts_tab_freelist = VtsID_INVALID;
3334
3335    /* Sanity check vts_set and vts_tab. */
3336
3337    /* Because all the live entries got slid down to the bottom of vts_tab: */
3338    tl_assert( VG_(sizeXA)( vts_tab ) == VG_(sizeFM)( vts_set ));
3339
3340    /* Assert that the vts_tab and vts_set entries point at each other
3341       in the required way */
3342    UWord wordK = 0, wordV = 0;
3343    VG_(initIterFM)( vts_set );
3344    while (VG_(nextIterFM)( vts_set, &wordK, &wordV )) {
3345       tl_assert(wordK != 0);
3346       tl_assert(wordV == 0);
3347       VTS* vts = (VTS*)wordK;
3348       tl_assert(vts->id != VtsID_INVALID);
3349       VtsTE* te = VG_(indexXA)( vts_tab, vts->id );
3350       tl_assert(te->vts == vts);
3351    }
3352    VG_(doneIterFM)( vts_set );
3353
3354    /* Also iterate over the table, and check each entry is
3355       plausible. */
3356    nTab = VG_(sizeXA)( vts_tab );
3357    for (i = 0; i < nTab; i++) {
3358       VtsTE* te = VG_(indexXA)( vts_tab, i );
3359       tl_assert(te->vts);
3360       tl_assert(te->vts->id == i);
3361       tl_assert(te->rc > 0); /* 'cos we just GC'd */
3362       tl_assert(te->u.freelink == VtsID_INVALID); /* in use */
3363       /* value of te->u.remap  not relevant */
3364    }
3365
3366    /* And we're done.  Bwahahaha. Ha. Ha. Ha. */
3367    stats__vts_pruning++;
3368    if (VG_(clo_stats)) {
3369       tl_assert(nTab > 0);
3370       VG_(message)(
3371          Vg_DebugMsg,
3372          "libhb: VTS PR: #%lu  before %lu (avg sz %lu)  "
3373             "after %lu (avg sz %lu)\n",
3374          stats__vts_pruning,
3375          nBeforePruning, nSTSsBefore / (nBeforePruning ? nBeforePruning : 1),
3376          nAfterPruning, nSTSsAfter / (nAfterPruning ? nAfterPruning : 1)
3377       );
3378    }
3379    /* ---------- END VTS PRUNING ---------- */
3380 }
3381
3382
3383 /////////////////////////////////////////////////////////
3384 //                                                     //
3385 // Vts IDs                                             //
3386 //                                                     //
3387 /////////////////////////////////////////////////////////
3388
3389 //////////////////////////
/* A temporary, max-sized VTS which is used as a temporary (the first
   argument) in VTS__singleton, VTS__tick and VTS__join operations.
   It starts out NULL here, so it must be set up elsewhere before any
   of the VtsID__* operations below are used.  Each user resets
   ->usedTS to 0, builds the result into it, and then hands it to
   vts_tab__find__or__clone_and_add to obtain a VtsID. */
static VTS* temp_max_sized_VTS = NULL;
3393
3394 //////////////////////////
/* Counters for the two small VtsID caches below.  "queries" counts
   every call of the corresponding _WRK function; "misses" counts the
   calls for which the cache slot did not hold the requested pair and
   the result had to be recomputed. */
static ULong stats__cmpLEQ_queries = 0;
static ULong stats__cmpLEQ_misses  = 0;
static ULong stats__join2_queries  = 0;
static ULong stats__join2_misses   = 0;
3399
3400 static inline UInt ROL32 ( UInt w, Int n ) {
3401    w = (w << n) | (w >> (32-n));
3402    return w;
3403 }
3404 static inline UInt hash_VtsIDs ( VtsID vi1, VtsID vi2, UInt nTab ) {
3405    UInt hash = ROL32(vi1,19) ^ ROL32(vi2,13);
3406    return hash % nTab;
3407 }
3408
/* Direct-mapped cache of VtsID__cmpLEQ_WRK results.  A slot is
   selected with hash_VtsIDs(vi1, vi2, N_CMPLEQ_CACHE) and holds the
   pair it was computed for together with the answer.  Slots are
   reset to VtsID_INVALID by VtsID__invalidate_caches. */
#define N_CMPLEQ_CACHE 1023
static
   struct { VtsID vi1; VtsID vi2; Bool leq; }
   cmpLEQ_cache[N_CMPLEQ_CACHE];

/* Direct-mapped cache of VtsID__join2_WRK results, organised in the
   same way: the (vi1, vi2) pair is the key and 'res' is the VtsID
   of the join. */
#define N_JOIN2_CACHE 1023
static
   struct { VtsID vi1; VtsID vi2; VtsID res; }
   join2_cache[N_JOIN2_CACHE];
3418
3419 static void VtsID__invalidate_caches ( void ) {
3420    Int i;
3421    for (i = 0; i < N_CMPLEQ_CACHE; i++) {
3422       cmpLEQ_cache[i].vi1 = VtsID_INVALID;
3423       cmpLEQ_cache[i].vi2 = VtsID_INVALID;
3424       cmpLEQ_cache[i].leq = False;
3425    }
3426    for (i = 0; i < N_JOIN2_CACHE; i++) {
3427      join2_cache[i].vi1 = VtsID_INVALID;
3428      join2_cache[i].vi2 = VtsID_INVALID;
3429      join2_cache[i].res = VtsID_INVALID;
3430    }
3431 }
3432 //////////////////////////
3433
3434 //static Bool VtsID__is_valid ( VtsID vi ) {
3435 //   VtsTE* ve;
3436 //   if (vi >= (VtsID)VG_(sizeXA)( vts_tab ))
3437 //      return False;
3438 //   ve = VG_(indexXA)( vts_tab, vi );
3439 //   if (!ve->vts)
3440 //      return False;
3441 //   tl_assert(ve->vts->id == vi);
3442 //   return True;
3443 //}
3444
3445 static VTS* VtsID__to_VTS ( VtsID vi ) {
3446    VtsTE* te = VG_(indexXA)( vts_tab, vi );
3447    tl_assert(te->vts);
3448    return te->vts;
3449 }
3450
3451 static void VtsID__pp ( VtsID vi ) {
3452    VTS* vts = VtsID__to_VTS(vi);
3453    VTS__show( vts );
3454 }
3455
3456 /* compute partial ordering relation of vi1 and vi2. */
3457 __attribute__((noinline))
3458 static Bool VtsID__cmpLEQ_WRK ( VtsID vi1, VtsID vi2 ) {
3459    UInt hash;
3460    Bool leq;
3461    VTS  *v1, *v2;
3462    //if (vi1 == vi2) return True;
3463    tl_assert(vi1 != vi2);
3464    ////++
3465    stats__cmpLEQ_queries++;
3466    hash = hash_VtsIDs(vi1, vi2, N_CMPLEQ_CACHE);
3467    if (cmpLEQ_cache[hash].vi1 == vi1
3468        && cmpLEQ_cache[hash].vi2 == vi2)
3469       return cmpLEQ_cache[hash].leq;
3470    stats__cmpLEQ_misses++;
3471    ////--
3472    v1  = VtsID__to_VTS(vi1);
3473    v2  = VtsID__to_VTS(vi2);
3474    leq = VTS__cmpLEQ( v1, v2 ) == 0;
3475    ////++
3476    cmpLEQ_cache[hash].vi1 = vi1;
3477    cmpLEQ_cache[hash].vi2 = vi2;
3478    cmpLEQ_cache[hash].leq = leq;
3479    ////--
3480    return leq;
3481 }
3482 static inline Bool VtsID__cmpLEQ ( VtsID vi1, VtsID vi2 ) {
3483    return LIKELY(vi1 == vi2)  ? True  : VtsID__cmpLEQ_WRK(vi1, vi2);
3484 }
3485
3486 /* compute binary join */
3487 __attribute__((noinline))
3488 static VtsID VtsID__join2_WRK ( VtsID vi1, VtsID vi2 ) {
3489    UInt  hash;
3490    VtsID res;
3491    VTS   *vts1, *vts2;
3492    //if (vi1 == vi2) return vi1;
3493    tl_assert(vi1 != vi2);
3494    ////++
3495    stats__join2_queries++;
3496    hash = hash_VtsIDs(vi1, vi2, N_JOIN2_CACHE);
3497    if (join2_cache[hash].vi1 == vi1
3498        && join2_cache[hash].vi2 == vi2)
3499       return join2_cache[hash].res;
3500    stats__join2_misses++;
3501    ////--
3502    vts1 = VtsID__to_VTS(vi1);
3503    vts2 = VtsID__to_VTS(vi2);
3504    temp_max_sized_VTS->usedTS = 0;
3505    VTS__join(temp_max_sized_VTS, vts1,vts2);
3506    res = vts_tab__find__or__clone_and_add(temp_max_sized_VTS);
3507    ////++
3508    join2_cache[hash].vi1 = vi1;
3509    join2_cache[hash].vi2 = vi2;
3510    join2_cache[hash].res = res;
3511    ////--
3512    return res;
3513 }
3514 static inline VtsID VtsID__join2 ( VtsID vi1, VtsID vi2 ) {
3515    return LIKELY(vi1 == vi2)  ? vi1  : VtsID__join2_WRK(vi1, vi2);
3516 }
3517
3518 /* create a singleton VTS, namely [thr:1] */
3519 static VtsID VtsID__mk_Singleton ( Thr* thr, ULong tym ) {
3520    temp_max_sized_VTS->usedTS = 0;
3521    VTS__singleton(temp_max_sized_VTS, thr,tym);
3522    return vts_tab__find__or__clone_and_add(temp_max_sized_VTS);
3523 }
3524
3525 /* tick operation, creates value 1 if specified index is absent */
3526 static VtsID VtsID__tick ( VtsID vi, Thr* idx ) {
3527    VTS* vts = VtsID__to_VTS(vi);
3528    temp_max_sized_VTS->usedTS = 0;
3529    VTS__tick(temp_max_sized_VTS, idx,vts);
3530    return vts_tab__find__or__clone_and_add(temp_max_sized_VTS);
3531 }
3532
3533 /* index into a VTS (only for assertions) */
3534 static ULong VtsID__indexAt ( VtsID vi, Thr* idx ) {
3535    VTS* vts = VtsID__to_VTS(vi);
3536    return VTS__indexAt_SLOW( vts, idx );
3537 }
3538
3539 /* Assuming that !cmpLEQ(vi1, vi2), find the index of the first (or
3540    any, really) element in vi1 which is pointwise greater-than the
3541    corresponding element in vi2.  If no such element exists, return
3542    NULL.  This needs to be fairly quick since it is called every time
3543    a race is detected. */
3544 static Thr* VtsID__findFirst_notLEQ ( VtsID vi1, VtsID vi2 )
3545 {
3546    VTS  *vts1, *vts2;
3547    Thr*  diffthr;
3548    ThrID diffthrid;
3549    tl_assert(vi1 != vi2);
3550    vts1 = VtsID__to_VTS(vi1);
3551    vts2 = VtsID__to_VTS(vi2);
3552    tl_assert(vts1 != vts2);
3553    diffthrid = VTS__cmpLEQ(vts1, vts2);
3554    diffthr = Thr__from_ThrID(diffthrid);
3555    tl_assert(diffthr); /* else they are LEQ ! */
3556    return diffthr;
3557 }
3558
3559
3560 /////////////////////////////////////////////////////////
3561 //                                                     //
3562 // Filters                                             //
3563 //                                                     //
3564 /////////////////////////////////////////////////////////
3565
3566 /* Forget everything we know -- clear the filter and let everything
3567    through.  This needs to be as fast as possible, since it is called
3568    every time the running thread changes, and every time a thread's
3569    vector clocks change, which can be quite frequent.  The obvious
3570    fast way to do this is simply to stuff in tags which we know are
3571    not going to match anything, since they're not aligned to the start
3572    of a line. */
3573 static void Filter__clear ( Filter* fi, const HChar* who )
3574 {
3575    UWord i;
3576    if (0) VG_(printf)("  Filter__clear(%p, %s)\n", fi, who);
3577    for (i = 0; i < FI_NUM_LINES; i += 8) {
3578       fi->tags[i+0] = 1; /* impossible value -- cannot match */
3579       fi->tags[i+1] = 1;
3580       fi->tags[i+2] = 1;
3581       fi->tags[i+3] = 1;
3582       fi->tags[i+4] = 1;
3583       fi->tags[i+5] = 1;
3584       fi->tags[i+6] = 1;
3585       fi->tags[i+7] = 1;
3586    }
3587    tl_assert(i == FI_NUM_LINES);
3588 }
3589
3590 /* Clearing an arbitrary range in the filter.  Unfortunately
3591    we have to do this due to core-supplied new/die-mem events. */
3592
3593 static void Filter__clear_1byte ( Filter* fi, Addr a )
3594 {
3595    Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3596    UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3597    FiLine* line   = &fi->lines[lineno];
3598    UWord   loff   = (a - atag) / 8;
3599    UShort  mask   = 0x3 << (2 * (a & 7));
3600    /* mask is C000, 3000, 0C00, 0300, 00C0, 0030, 000C or 0003 */
3601    if (LIKELY( fi->tags[lineno] == atag )) {
3602       /* hit.  clear the bits. */
3603       UShort  u16  = line->u16s[loff];
3604       line->u16s[loff] = u16 & ~mask; /* clear them */
3605    } else {
3606       /* miss.  The filter doesn't hold this address, so ignore. */
3607    }
3608 }
3609
3610 static void Filter__clear_8bytes_aligned ( Filter* fi, Addr a )
3611 {
3612    Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3613    UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3614    FiLine* line   = &fi->lines[lineno];
3615    UWord   loff   = (a - atag) / 8;
3616    if (LIKELY( fi->tags[lineno] == atag )) {
3617       line->u16s[loff] = 0;
3618    } else {
3619     /* miss.  The filter doesn't hold this address, so ignore. */
3620    }
3621 }
3622
3623 /* Only used to verify the fast Filter__clear_range */
3624 __attribute__((unused))
3625 static void Filter__clear_range_SLOW ( Filter* fi, Addr a, UWord len )
3626 {
3627    tl_assert (CHECK_ZSM);
3628
3629    /* slowly do part preceding 8-alignment */
3630    while (UNLIKELY(!VG_IS_8_ALIGNED(a)) && LIKELY(len > 0)) {
3631       Filter__clear_1byte( fi, a );
3632       a++;
3633       len--;
3634    }
3635    /* vector loop */
3636    while (len >= 8) {
3637       Filter__clear_8bytes_aligned( fi, a );
3638       a += 8;
3639       len -= 8;
3640    }
3641    /* slowly do tail */
3642    while (UNLIKELY(len > 0)) {
3643       Filter__clear_1byte( fi, a );
3644       a++;
3645       len--;
3646    }
3647 }
3648
/* Fast version of range clearing: forget whatever the filter 'fi'
   holds for the 'len' bytes starting at address 'a'.

   The filter keeps 2 bits per byte, packed into one 16-bit word per
   8 bytes, with 4 such words per 32-byte filter line.  Only lines
   whose tag matches the address being cleared are modified; lines
   currently holding some other address are left alone.  The work is
   done in three stages:
     1. the (possibly partial) first filter line, from 'a' up to the
        next line boundary;
     2. all lines that are completely covered by the range;
     3. the (possibly partial) last filter line.

   When CHECK_ZSM > 0, the same range is first cleared on a copy of
   the filter with Filter__clear_range_SLOW, and every exit path goes
   through a memcmp-based assertion that both filters are equal.  To
   make that possible, early exits use the local RETURN macro instead
   of a plain 'return'. */
static void Filter__clear_range ( Filter* fi, Addr a, UWord len )
{
#  if CHECK_ZSM > 0
   /* We check the below more complex algorithm with the simple one.
      This check is very expensive : we do first the slow way on a
      copy of the data, then do it the fast way. On RETURN, we check
      the two values are equal. */
   Filter fi_check = *fi;
   Filter__clear_range_SLOW(&fi_check, a, len);
#  define RETURN goto check_and_return
#  else
#  define RETURN return
#  endif

   Addr    begtag = FI_GET_TAG(a);       /* tag of range begin */

   Addr    end = a + len - 1;        /* last byte of the range */
   Addr    endtag = FI_GET_TAG(end); /* tag of range end. */

   UWord rlen = len; /* remaining length to clear */

   Addr    c = a; /* Current position we are clearing. */
   UWord   clineno = FI_GET_LINENO(c); /* Current lineno we are clearing */
   FiLine* cline; /* Current line we are clearing */
   UWord   cloff; /* Current offset in line we are clearing, when clearing
                     partial lines. */

   UShort u16;

   STATIC_ASSERT (FI_LINE_SZB == 32);
   // Below assumes filter lines are 32 bytes

   /* ---- Stage 1: the first filter line ---- */
   if (LIKELY(fi->tags[clineno] == begtag)) {
      /* LIKELY for the heavy caller VG_(unknown_SP_update). */
      /* First filter line matches begtag.
         If c is not at the filter line begin, the below will clear
         the filter line bytes starting from c. */
      cline = &fi->lines[clineno];
      cloff = (c - begtag) / 8;

      /* First the byte(s) needed to reach 8-alignment */
      if (UNLIKELY(!VG_IS_8_ALIGNED(c))) {
         /* hiB is the nr of bytes (higher addresses) from c to reach
            8-alignment. */
         UWord hiB = 8 - (c & 7);
         /* Compute 2-bit/byte mask representing hiB bytes [c..c+hiB[
            mask is  C000 , F000, FC00, FF00, FFC0, FFF0 or FFFC for the byte
            range    7..7   6..7  5..7  4..7  3..7  2..7    1..7 */
         UShort mask = 0xFFFF << (16 - 2*hiB);

         u16  = cline->u16s[cloff];
         if (LIKELY(rlen >= hiB)) {
            cline->u16s[cloff] = u16 & ~mask; /* clear all hiB from c */
            rlen -= hiB;
            c += hiB;
            cloff += 1;
         } else {
            /* The range ends inside this 8-byte group: only clear the
               bits for the rlen bytes starting at c, by removing from
               the mask the bits of the (hiB-rlen) bytes above them. */
            mask = mask & ~(0xFFFF << (16 - 2*(hiB-rlen)));
            cline->u16s[cloff] = u16 & ~mask; /* clear rlen bytes from c. */
            RETURN;  // We have cleared all what we can.
         }
      }
      /* c is now 8 aligned. Clear by 8 aligned bytes,
         till c is filter-line aligned */
      while (!VG_IS_32_ALIGNED(c) && rlen >= 8) {
         cline->u16s[cloff] = 0;
         c += 8;
         rlen -= 8;
         cloff += 1;
      }
   } else {
      /* The first line does not hold this address: skip straight to
         the start of the next filter line, if the range gets that
         far. */
      c = begtag + FI_LINE_SZB;
      if (c > end)
         RETURN;   // We have cleared all what we can.
      rlen -= c - a;
   }
   // We have changed c, so re-establish clineno.
   clineno = FI_GET_LINENO(c);

   /* ---- Stage 2: lines fully covered by the range ---- */
   if (rlen >= FI_LINE_SZB) {
      /* Here, c is filter line-aligned. Clear all full lines that
         overlap with the range starting at c, made of full lines */
      UWord nfull = rlen / FI_LINE_SZB;
      UWord full_len = nfull * FI_LINE_SZB;
      rlen -= full_len;
      if (nfull > FI_NUM_LINES)
         nfull = FI_NUM_LINES; // no need to check several times the same entry.

      for (UWord n = 0; n < nfull; n++) {
         /* Zero the line only if the tag it currently holds passes the
            address_in_range test against the full_len bytes starting
            at c. */
         if (UNLIKELY(address_in_range(fi->tags[clineno], c, full_len))) {
            cline = &fi->lines[clineno];
            cline->u16s[0] = 0;
            cline->u16s[1] = 0;
            cline->u16s[2] = 0;
            cline->u16s[3] = 0;
            STATIC_ASSERT (4 == sizeof(cline->u16s)/sizeof(cline->u16s[0]));
         }
         clineno++;
         if (UNLIKELY(clineno == FI_NUM_LINES))
            clineno = 0; /* wrap around to the first line slot */
      }

      c += full_len;
      clineno = FI_GET_LINENO(c);
   }

   if (CHECK_ZSM) {
      tl_assert(VG_IS_8_ALIGNED(c));
      tl_assert(clineno == FI_GET_LINENO(c));
   }

   /* ---- Stage 3: the last filter line ---- */
   /* Do the last filter line, if it was not cleared as a full filter line */
   if (UNLIKELY(rlen > 0) && fi->tags[clineno] == endtag) {
      cline = &fi->lines[clineno];
      cloff = (c - endtag) / 8;
      if (CHECK_ZSM) tl_assert(FI_GET_TAG(c) == endtag);

      /* c is 8 aligned. Clear by 8 aligned bytes, till we have less than
         8 bytes. */
      while (rlen >= 8) {
         cline->u16s[cloff] = 0;
         c += 8;
         rlen -= 8;
         cloff += 1;
      }
      /* Then the remaining byte(s) */
      if (rlen > 0) {
         /* nr of bytes from c to reach end. */
         UWord loB = rlen;
         /* Compute mask representing loB bytes [c..c+loB[ :
            mask is 0003, 000F, 003F, 00FF, 03FF, 0FFF or 3FFF */
         UShort mask = 0xFFFF >> (16 - 2*loB);

         u16  = cline->u16s[cloff];
         cline->u16s[cloff] = u16 & ~mask; /* clear all loB from c */
      }
   }

#  if CHECK_ZSM > 0
   check_and_return:
   /* The fast result must match the reference result bit for bit. */
   tl_assert (VG_(memcmp)(&fi_check, fi, sizeof(fi_check)) == 0);
#  endif
#  undef RETURN
}
3794
3795 /* ------ Read handlers for the filter. ------ */
3796
3797 static inline Bool Filter__ok_to_skip_crd64 ( Filter* fi, Addr a )
3798 {
3799    if (UNLIKELY( !VG_IS_8_ALIGNED(a) ))
3800       return False;
3801    {
3802      Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3803      UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3804      FiLine* line   = &fi->lines[lineno];
3805      UWord   loff   = (a - atag) / 8;
3806      UShort  mask   = 0xAAAA;
3807      if (LIKELY( fi->tags[lineno] == atag )) {
3808         /* hit.  check line and update. */
3809         UShort u16  = line->u16s[loff];
3810         Bool   ok   = (u16 & mask) == mask; /* all R bits set? */
3811         line->u16s[loff] = u16 | mask; /* set them */
3812         return ok;
3813      } else {
3814         /* miss.  nuke existing line and re-use it. */
3815         UWord i;
3816         fi->tags[lineno] = atag;
3817         for (i = 0; i < FI_LINE_SZB / 8; i++)
3818            line->u16s[i] = 0;
3819         line->u16s[loff] = mask;
3820         return False;
3821      }
3822    }
3823 }
3824
3825 static inline Bool Filter__ok_to_skip_crd32 ( Filter* fi, Addr a )
3826 {
3827    if (UNLIKELY( !VG_IS_4_ALIGNED(a) ))
3828       return False;
3829    {
3830      Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3831      UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3832      FiLine* line   = &fi->lines[lineno];
3833      UWord   loff   = (a - atag) / 8;
3834      UShort  mask   = 0xAA << (2 * (a & 4)); /* 0xAA00 or 0x00AA */
3835      if (LIKELY( fi->tags[lineno] == atag )) {
3836         /* hit.  check line and update. */
3837         UShort  u16  = line->u16s[loff];
3838         Bool    ok   = (u16 & mask) == mask; /* 4 x R bits set? */
3839         line->u16s[loff] = u16 | mask; /* set them */
3840         return ok;
3841      } else {
3842         /* miss.  nuke existing line and re-use it. */
3843         UWord   i;
3844         fi->tags[lineno] = atag;
3845         for (i = 0; i < FI_LINE_SZB / 8; i++)
3846            line->u16s[i] = 0;
3847         line->u16s[loff] = mask;
3848         return False;
3849      }
3850    }
3851 }
3852
3853 static inline Bool Filter__ok_to_skip_crd16 ( Filter* fi, Addr a )
3854 {
3855    if (UNLIKELY( !VG_IS_2_ALIGNED(a) ))
3856       return False;
3857    {
3858      Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3859      UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3860      FiLine* line   = &fi->lines[lineno];
3861      UWord   loff   = (a - atag) / 8;
3862      UShort  mask   = 0xA << (2 * (a & 6));
3863      /* mask is A000, 0A00, 00A0 or 000A */
3864      if (LIKELY( fi->tags[lineno] == atag )) {
3865         /* hit.  check line and update. */
3866         UShort  u16  = line->u16s[loff];
3867         Bool    ok   = (u16 & mask) == mask; /* 2 x R bits set? */
3868         line->u16s[loff] = u16 | mask; /* set them */
3869         return ok;
3870      } else {
3871         /* miss.  nuke existing line and re-use it. */
3872         UWord   i;
3873         fi->tags[lineno] = atag;
3874         for (i = 0; i < FI_LINE_SZB / 8; i++)
3875            line->u16s[i] = 0;
3876         line->u16s[loff] = mask;
3877         return False;
3878      }
3879    }
3880 }
3881
3882 static inline Bool Filter__ok_to_skip_crd08 ( Filter* fi, Addr a )
3883 {
3884    {
3885      Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3886      UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3887      FiLine* line   = &fi->lines[lineno];
3888      UWord   loff   = (a - atag) / 8;
3889      UShort  mask   = 0x2 << (2 * (a & 7));
3890      /* mask is 8000, 2000, 0800, 0200, 0080, 0020, 0008 or 0002 */
3891      if (LIKELY( fi->tags[lineno] == atag )) {
3892         /* hit.  check line and update. */
3893         UShort  u16  = line->u16s[loff];
3894         Bool    ok   = (u16 & mask) == mask; /* 1 x R bits set? */
3895         line->u16s[loff] = u16 | mask; /* set them */
3896         return ok;
3897      } else {
3898         /* miss.  nuke existing line and re-use it. */
3899         UWord   i;
3900         fi->tags[lineno] = atag;
3901         for (i = 0; i < FI_LINE_SZB / 8; i++)
3902            line->u16s[i] = 0;
3903         line->u16s[loff] = mask;
3904         return False;
3905      }
3906    }
3907 }
3908
3909
3910 /* ------ Write handlers for the filter. ------ */
3911
3912 static inline Bool Filter__ok_to_skip_cwr64 ( Filter* fi, Addr a )
3913 {
3914    if (UNLIKELY( !VG_IS_8_ALIGNED(a) ))
3915       return False;
3916    {
3917      Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3918      UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3919      FiLine* line   = &fi->lines[lineno];
3920      UWord   loff   = (a - atag) / 8;
3921      UShort  mask   = 0xFFFF;
3922      if (LIKELY( fi->tags[lineno] == atag )) {
3923         /* hit.  check line and update. */
3924         UShort u16  = line->u16s[loff];
3925         Bool   ok   = (u16 & mask) == mask; /* all R & W bits set? */
3926         line->u16s[loff] = u16 | mask; /* set them */
3927         return ok;
3928      } else {
3929         /* miss.  nuke existing line and re-use it. */
3930         UWord i;
3931         fi->tags[lineno] = atag;
3932         for (i = 0; i < FI_LINE_SZB / 8; i++)
3933            line->u16s[i] = 0;
3934         line->u16s[loff] = mask;
3935         return False;
3936      }
3937    }
3938 }
3939
3940 static inline Bool Filter__ok_to_skip_cwr32 ( Filter* fi, Addr a )
3941 {
3942    if (UNLIKELY( !VG_IS_4_ALIGNED(a) ))
3943       return False;
3944    {
3945      Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3946      UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3947      FiLine* line   = &fi->lines[lineno];
3948      UWord   loff   = (a - atag) / 8;
3949      UShort  mask   = 0xFF << (2 * (a & 4)); /* 0xFF00 or 0x00FF */
3950      if (LIKELY( fi->tags[lineno] == atag )) {
3951         /* hit.  check line and update. */
3952         UShort  u16  = line->u16s[loff];
3953         Bool    ok   = (u16 & mask) == mask; /* 4 x R & W bits set? */
3954         line->u16s[loff] = u16 | mask; /* set them */
3955         return ok;
3956      } else {
3957         /* miss.  nuke existing line and re-use it. */
3958         UWord   i;
3959         fi->tags[lineno] = atag;
3960         for (i = 0; i < FI_LINE_SZB / 8; i++)
3961            line->u16s[i] = 0;
3962         line->u16s[loff] = mask;
3963         return False;
3964      }
3965    }
3966 }
3967
3968 static inline Bool Filter__ok_to_skip_cwr16 ( Filter* fi, Addr a )
3969 {
3970    if (UNLIKELY( !VG_IS_2_ALIGNED(a) ))
3971       return False;
3972    {
3973      Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3974      UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3975      FiLine* line   = &fi->lines[lineno];
3976      UWord   loff   = (a - atag) / 8;
3977      UShort  mask   = 0xF << (2 * (a & 6));
3978      /* mask is F000, 0F00, 00F0 or 000F */
3979      if (LIKELY( fi->tags[lineno] == atag )) {
3980         /* hit.  check line and update. */
3981         UShort  u16  = line->u16s[loff];
3982         Bool    ok   = (u16 & mask) == mask; /* 2 x R & W bits set? */
3983         line->u16s[loff] = u16 | mask; /* set them */
3984         return ok;
3985      } else {
3986         /* miss.  nuke existing line and re-use it. */
3987         UWord   i;
3988         fi->tags[lineno] = atag;
3989         for (i = 0; i < FI_LINE_SZB / 8; i++)
3990            line->u16s[i] = 0;
3991         line->u16s[loff] = mask;
3992         return False;
3993      }
3994    }
3995 }
3996
3997 static inline Bool Filter__ok_to_skip_cwr08 ( Filter* fi, Addr a )
3998 {
3999    {
4000      Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
4001      UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
4002      FiLine* line   = &fi->lines[lineno];
4003      UWord   loff   = (a - atag) / 8;
4004      UShort  mask   = 0x3 << (2 * (a & 7));
4005      /* mask is C000, 3000, 0C00, 0300, 00C0, 0030, 000C or 0003 */
4006      if (LIKELY( fi->tags[lineno] == atag )) {
4007         /* hit.  check line and update. */
4008         UShort  u16  = line->u16s[loff];
4009         Bool    ok   = (u16 & mask) == mask; /* 1 x R bits set? */
4010         line->u16s[loff] = u16 | mask; /* set them */
4011         return ok;
4012      } else {
4013         /* miss.  nuke existing line and re-use it. */
4014         UWord   i;
4015         fi->tags[lineno] = atag;
4016         for (i = 0; i < FI_LINE_SZB / 8; i++)
4017            line->u16s[i] = 0;
4018         line->u16s[loff] = mask;
4019         return False;
4020      }
4021    }
4022 }
4023
4024
4025 /////////////////////////////////////////////////////////
4026 //                                                     //
4027 // Threads                                             //
4028 //                                                     //
4029 /////////////////////////////////////////////////////////
4030
/* Maps ThrID values to their Thr*s (which contain ThrID values that
   should point back to the relevant slot in the array).  Lowest
   numbered slot (0) is for thrid = 1024, (1) is for 1025, etc.
   The array is created lazily, on the first call to Thr__new. */
static XArray* /* of Thr* */ thrid_to_thr_map = NULL;

/* And a counter to dole out ThrID values.  For rationale/background,
   see comments on definition of ScalarTS (far) above.  Thr__new
   hands out the current value and then increments it, so slot N of
   thrid_to_thr_map always belongs to ThrID N + 1024. */
static ThrID thrid_counter = 1024; /* runs up to ThrID_MAX_VALID */
4039
4040 static ThrID Thr__to_ThrID ( Thr* thr ) {
4041    return thr->thrid;
4042 }
4043 static Thr* Thr__from_ThrID ( UInt thrid ) {
4044    Thr* thr = *(Thr**)VG_(indexXA)( thrid_to_thr_map, thrid - 1024 );
4045    tl_assert(thr->thrid == thrid);
4046    return thr;
4047 }
4048
4049 /* True if the cached rcec for thr is valid and can be used to build the
4050    current stack trace just by changing the last frame to the current IP. */
4051 static inline Bool cached_rcec_valid(Thr *thr)
4052 {
4053    UWord cached_stackvalid = VG_(get_SP_s1) (thr->hgthread->coretid);
4054    return cached_stackvalid != 0;
4055 }
4056 /* Set the validity of the cached rcec of thr. */
4057 static inline void set_cached_rcec_validity(Thr *thr, Bool valid)
4058 {
4059    VG_(set_SP_s1) (thr->hgthread->coretid, valid);
4060 }
4061
4062 static Thr* Thr__new ( void )
4063 {
4064    Thr* thr = HG_(zalloc)( "libhb.Thr__new.1", sizeof(Thr) );
4065    thr->viR = VtsID_INVALID;
4066    thr->viW = VtsID_INVALID;
4067    thr->llexit_done = False;
4068    thr->joinedwith_done = False;
4069    thr->filter = HG_(zalloc)( "libhb.Thr__new.2", sizeof(Filter) );
4070    if (HG_(clo_history_level) == 1)
4071       thr->local_Kws_n_stacks
4072          = VG_(newXA)( HG_(zalloc),
4073                        "libhb.Thr__new.3 (local_Kws_and_stacks)",
4074                        HG_(free), sizeof(ULong_n_EC) );
4075    /* Make an 'empty' cached rcec in thr. */
4076    thr->cached_rcec.magic = RCEC_MAGIC;
4077    thr->cached_rcec.rc = 0;
4078    thr->cached_rcec.rcX = 0;
4079    thr->cached_rcec.next = NULL;
4080
4081    /* Add this Thr* <-> ThrID binding to the mapping, and
4082       cross-check */
4083    if (!thrid_to_thr_map) {
4084       thrid_to_thr_map = VG_(newXA)( HG_(zalloc), "libhb.Thr__new.4",
4085                                      HG_(free), sizeof(Thr*) );
4086    }
4087
4088    if (thrid_counter >= ThrID_MAX_VALID) {
4089       /* We're hosed.  We have to stop. */
4090       scalarts_limitations_fail_NORETURN( True/*due_to_nThrs*/ );
4091    }
4092
4093    thr->thrid = thrid_counter++;
4094    Word ix = VG_(addToXA)( thrid_to_thr_map, &thr );
4095    tl_assert(ix + 1024 == thr->thrid);
4096
4097    return thr;
4098 }
4099
4100 static void note_local_Kw_n_stack_for ( Thr* thr )
4101 {
4102    Word       nPresent;
4103    ULong_n_EC pair;
4104    tl_assert(thr);
4105
4106    // We only collect this info at history level 1 (approx)
4107    if (HG_(clo_history_level) != 1)
4108       return;
4109
4110    /* This is the scalar Kw for thr. */
4111    pair.ull = VtsID__indexAt( thr->viW, thr );
4112    pair.ec  = main_get_EC( thr );
4113    tl_assert(pair.ec);
4114    tl_assert(thr->local_Kws_n_stacks);
4115
4116    /* check that we're not adding duplicates */
4117    nPresent = VG_(sizeXA)( thr->local_Kws_n_stacks );
4118
4119    /* Throw away old stacks, if necessary.  We can't accumulate stuff
4120       indefinitely. */
4121    if (nPresent >= N_KWs_N_STACKs_PER_THREAD) {
4122       VG_(dropHeadXA)( thr->local_Kws_n_stacks, nPresent / 2 );
4123       nPresent = VG_(sizeXA)( thr->local_Kws_n_stacks );
4124       if (0)
4125          VG_(printf)("LOCAL Kw: thr %p,  Kw %llu,  ec %p (!!! gc !!!)\n",
4126                      thr, pair.ull, pair.ec );
4127    }
4128
4129    if (nPresent > 0) {
4130       ULong_n_EC* prevPair
4131          = (ULong_n_EC*)VG_(indexXA)( thr->local_Kws_n_stacks, nPresent-1 );
4132       tl_assert( prevPair->ull <= pair.ull );
4133    }
4134
4135    if (nPresent == 0)
4136       pair.ec = NULL;
4137
4138    VG_(addToXA)( thr->local_Kws_n_stacks, &pair );
4139
4140    if (0)
4141       VG_(printf)("LOCAL Kw: thr %p,  Kw %llu,  ec %p\n",
4142                   thr, pair.ull, pair.ec );
4143    if (0)
4144       VG_(pp_ExeContext)(pair.ec);
4145 }
4146
4147 static Int cmp__ULong_n_EC__by_ULong ( const ULong_n_EC* pair1,
4148                                        const ULong_n_EC* pair2 )
4149 {
4150    if (pair1->ull < pair2->ull) return -1;
4151    if (pair1->ull > pair2->ull) return 1;
4152    return 0;
4153 }
4154
4155
4156 /////////////////////////////////////////////////////////
4157 //                                                     //
4158 // Shadow Values                                       //
4159 //                                                     //
4160 /////////////////////////////////////////////////////////
4161
4162 // type SVal, SVal_INVALID and SVal_NOACCESS are defined by
4163 // hb_zsm.h.  We have to do everything else here.
4164
4165 /* SVal is 64 bit unsigned int.
4166
4167       <---------30--------->    <---------30--------->
4168    00 X-----Rmin-VtsID-----X 00 X-----Wmin-VtsID-----X   C(Rmin,Wmin)
4169    10 X--------------------X XX X--------------------X   A: SVal_NOACCESS
4170    11 0--------------------0 00 0--------------------0   A: SVal_INVALID
4171
4172 */
4173 #define SVAL_TAGMASK (3ULL << 62)
4174
4175 static inline Bool SVal__isC ( SVal s ) {
4176    return (0ULL << 62) == (s & SVAL_TAGMASK);
4177 }
4178 static inline SVal SVal__mkC ( VtsID rmini, VtsID wmini ) {
4179    //tl_assert(VtsID__is_valid(rmini));
4180    //tl_assert(VtsID__is_valid(wmini));
4181    return (((ULong)rmini) << 32) | ((ULong)wmini);
4182 }
4183 static inline VtsID SVal__unC_Rmin ( SVal s ) {
4184    tl_assert(SVal__isC(s));
4185    return (VtsID)(s >> 32);
4186 }
4187 static inline VtsID SVal__unC_Wmin ( SVal s ) {
4188    tl_assert(SVal__isC(s));
4189    return (VtsID)(s & 0xFFFFFFFFULL);
4190 }
4191
4192 static inline Bool SVal__isA ( SVal s ) {
4193    return (2ULL << 62) == (s & SVAL_TAGMASK);
4194 }
4195 __attribute__((unused))
4196 static inline SVal SVal__mkA ( void ) {
4197    return 2ULL << 62;
4198 }
4199
4200 /* Direct callback from lib_zsm. */
4201 static inline void SVal__rcinc ( SVal s ) {
4202    if (SVal__isC(s)) {
4203       VtsID__rcinc( SVal__unC_Rmin(s) );
4204       VtsID__rcinc( SVal__unC_Wmin(s) );
4205    }
4206 }
4207
4208 /* Direct callback from lib_zsm. */
4209 static inline void SVal__rcdec ( SVal s ) {
4210    if (SVal__isC(s)) {
4211       VtsID__rcdec( SVal__unC_Rmin(s) );
4212       VtsID__rcdec( SVal__unC_Wmin(s) );
4213    }
4214 }
4215
4216 static inline void *SVal2Ptr (SVal s)
4217 {
4218    return (void*)(UWord)s;
4219 }
4220
4221 static inline SVal Ptr2SVal (void* ptr)
4222 {
4223    return (SVal)(UWord)ptr;
4224 }
4225
4226
4227
4228 /////////////////////////////////////////////////////////
4229 //                                                     //
4230 // Change-event map2                                   //
4231 //                                                     //
4232 /////////////////////////////////////////////////////////
4233
4234 /* This is in two parts:
4235
4236    1. A hash table of RCECs.  This is a set of reference-counted stack
4237       traces.  When the reference count of a stack trace becomes zero,
4238       it is removed from the set and freed up.  The intent is to have
4239       a set of stack traces which can be referred to from (2), but to
4240       only represent each one once.  The set is indexed/searched by
4241       ordering on the stack trace vectors.
4242
4243    2. A Hash table of OldRefs.  These store information about each old
4244       ref that we need to record.  Hash table key is the address of the
4245       location for which the information is recorded.  For LRU
4246       purposes, each OldRef in the hash table is also on a doubly
4247       linked list maintaining the order in which the OldRef were most
4248       recently accessed.
4249       Each OldRef also maintains the stamp at which it was last accessed.
4250       With these stamps, we can quickly check which of 2 OldRef is the
4251       'newest', without having to scan the full list of LRU OldRef.
4252
4253       The important part of an OldRef is, however, its acc component.
4254       This binds a TSW triple (thread, size, R/W) to an RCEC.
4255
      We allocate a maximum of HG_(clo_conflict_cache_size) OldRef.
      Then we do exact LRU discarding.  For each discarded OldRef we must
      of course decrement the reference count on the RCEC it
      refers to, in order that entries from (1) eventually get
      discarded too.
4261 */
4262
4263 static UWord stats__evm__lookup_found = 0;
4264 static UWord stats__evm__lookup_notfound = 0;
4265
4266 static UWord stats__ctxt_eq_tsw_eq_rcec = 0;
4267 static UWord stats__ctxt_eq_tsw_neq_rcec = 0;
4268 static UWord stats__ctxt_neq_tsw_neq_rcec = 0;
4269 static UWord stats__ctxt_rcdec_calls = 0;
4270 static UWord stats__ctxt_rcec_gc_discards = 0;
4271
4272 static UWord stats__ctxt_tab_curr = 0;
4273 static UWord stats__ctxt_tab_max  = 0;
4274
4275 static UWord stats__ctxt_tab_qs   = 0;
4276 static UWord stats__ctxt_tab_cmps = 0;
4277
4278
4279 ///////////////////////////////////////////////////////
4280 //// Part (1): A hash table of RCECs
4281 ///
4282
4283 //#define N_RCEC_TAB 98317 /* prime */
4284 #define N_RCEC_TAB 196613 /* prime */
4285
4286 //////////// BEGIN RCEC pool allocator
4287 static PoolAlloc* rcec_pool_allocator;
4288 static RCEC* alloc_RCEC ( void ) {
4289    return VG_(allocEltPA) ( rcec_pool_allocator );
4290 }
4291
4292 static void free_RCEC ( RCEC* rcec ) {
4293    tl_assert(rcec->magic == RCEC_MAGIC);
4294    VG_(freeEltPA)( rcec_pool_allocator, rcec );
4295 }
4296 //////////// END RCEC pool allocator
4297
4298 static RCEC** contextTab = NULL; /* hash table of RCEC*s */
4299
4300 /* Count of allocated RCEC having ref count > 0 */
4301 static UWord RCEC_referenced = 0;
4302
4303 /* True if the frames of ec1 and ec2 are different. */
4304 static Bool RCEC__differs_by_frames ( RCEC* ec1, RCEC* ec2 ) {
4305    Word i;
4306    if (CHECK_CEM) {
4307       tl_assert(ec1 && ec1->magic == RCEC_MAGIC);
4308       tl_assert(ec2 && ec2->magic == RCEC_MAGIC);
4309    }
4310    if (ec1->frames_hash != ec2->frames_hash) return True;
4311    for (i = 0; i < N_FRAMES; i++) {
4312       if (ec1->frames[i] != ec2->frames[i]) return True;
4313    }
4314    return False;
4315 }
4316
4317 /* Dec the ref of this RCEC. */
4318 static void ctxt__rcdec ( RCEC* ec )
4319 {
4320    stats__ctxt_rcdec_calls++;
4321    if (CHECK_CEM)
4322       tl_assert(ec && ec->magic == RCEC_MAGIC);
4323    tl_assert(ec->rc > 0);
4324    ec->rc--;
4325    if (ec->rc == 0)
4326       RCEC_referenced--;
4327 }
4328
4329 static void ctxt__rcinc ( RCEC* ec )
4330 {
4331    if (CHECK_CEM)
4332       tl_assert(ec && ec->magic == RCEC_MAGIC);
4333    if (ec->rc == 0)
4334       RCEC_referenced++;
4335    ec->rc++;
4336 }
4337
4338
4339 /* Find 'ec' in the RCEC list whose head pointer lives at 'headp' and
4340    move it one step closer to the front of the list, so as to make
4341    subsequent searches for it cheaper. */
4342 static void move_RCEC_one_step_forward ( RCEC** headp, RCEC* ec )
4343 {
4344    RCEC *ec0, *ec1, *ec2;
4345    if (ec == *headp)
4346       tl_assert(0); /* already at head of list */
4347    tl_assert(ec != NULL);
4348    ec0 = *headp;
4349    ec1 = NULL;
4350    ec2 = NULL;
4351    while (True) {
4352       if (ec0 == NULL || ec0 == ec) break;
4353       ec2 = ec1;
4354       ec1 = ec0;
4355       ec0 = ec0->next;
4356    }
4357    tl_assert(ec0 == ec);
4358    if (ec0 != NULL && ec1 != NULL && ec2 != NULL) {
4359       RCEC* tmp;
4360       /* ec0 points to ec, ec1 to its predecessor, and ec2 to ec1's
4361          predecessor.  Swap ec0 and ec1, that is, move ec0 one step
4362          closer to the start of the list. */
4363       tl_assert(ec2->next == ec1);
4364       tl_assert(ec1->next == ec0);
4365       tmp = ec0->next;
4366       ec2->next = ec0;
4367       ec0->next = ec1;
4368       ec1->next = tmp;
4369    }
4370    else
4371    if (ec0 != NULL && ec1 != NULL && ec2 == NULL) {
4372       /* it's second in the list. */
4373       tl_assert(*headp == ec1);
4374       tl_assert(ec1->next == ec0);
4375       ec1->next = ec0->next;
4376       ec0->next = ec1;
4377       *headp = ec0;
4378    }
4379 }
4380
4381
4382 /* Find the given RCEC in the tree, and return a pointer to it.  Or,
4383    if not present, add the given one to the tree (by making a copy of
4384    it, so the caller can immediately deallocate the original) and
4385    return a pointer to the copy.  The caller can safely have 'example'
4386    on its stack, since we will always return a pointer to a copy of
4387    it, not to the original.  Note that the inserted node will have .rc
4388    of zero and so the caller must immediately increment it. */
4389 __attribute__((noinline))
4390 static RCEC* ctxt__find_or_add ( RCEC* example )
4391 {
4392    UWord hent;
4393    RCEC* copy;
4394
4395    if (CHECK_CEM) {
4396       /* Note that the single caller of ctxt__find_or_add always provides
4397          &thr->cached_rcec as argument. The sanity of thr->cached_rcec is always
4398          checked with a thread terminates. */
4399       tl_assert(example && example->magic == RCEC_MAGIC);
4400       tl_assert(example->rc == 0);
4401    }
4402
4403    /* Search the hash table to see if we already have it. */
4404    stats__ctxt_tab_qs++;
4405    hent = example->frames_hash % N_RCEC_TAB;
4406    copy = contextTab[hent];
4407    while (1) {
4408       if (!copy) break;
4409       if (CHECK_CEM)
4410          tl_assert(copy->magic == RCEC_MAGIC);
4411       stats__ctxt_tab_cmps++;
4412       if (!RCEC__differs_by_frames(copy, example)) break;
4413       copy = copy->next;
4414    }
4415
4416    if (copy) {
4417       tl_assert(copy != example);
4418       /* optimisation: if it's not at the head of its list, move 1
4419          step fwds, to make future searches cheaper */
4420       if (copy != contextTab[hent]) {
4421          move_RCEC_one_step_forward( &contextTab[hent], copy );
4422       }
4423    } else {
4424       copy = alloc_RCEC();
4425       tl_assert(copy != example);
4426       *copy = *example;
4427       copy->next = contextTab[hent];
4428       contextTab[hent] = copy;
4429       stats__ctxt_tab_curr++;
4430       if (stats__ctxt_tab_curr > stats__ctxt_tab_max)
4431          stats__ctxt_tab_max = stats__ctxt_tab_curr;
4432    }
4433    return copy;
4434 }
4435
/* Rotate the word w left by n bits and return the result.
   The rotation count is reduced modulo the number of bits in a UWord,
   so every value of n is safe: n == 0 (or any multiple of the word
   size) returns w unchanged, and a negative n rotates by n modulo the
   word size.  For 0 < n < word size the result is the same as the
   plain "(w << n) | (w >> (bpw-n))" formula; that formula alone would
   shift by the full word width when n == 0, which is undefined
   behaviour in C. */
static inline UWord ROLW ( UWord w, Int n )
{
   /* Bits per word: a power of two, so (bpw - 1) is a valid mask. */
   const unsigned int bpw = 8 * sizeof(UWord);
   const unsigned int k   = (unsigned int)n & (bpw - 1);

   if (k == 0)
      return w;
   return (w << k) | (w >> (bpw - k));
}
4442
4443 static UWord stats__cached_rcec_identical = 0;
4444 static UWord stats__cached_rcec_updated = 0;
4445 static UWord stats__cached_rcec_fresh = 0;
4446 static UWord stats__cached_rcec_diff = 0;
4447 static UWord stats__cached_rcec_diff_known_reason = 0;
4448
4449 /* Check if the cached rcec in thr corresponds to the current
4450    stacktrace of the thread. Returns True if ok, False otherwise.
4451    This is just used for debugging the cached rcec logic, activated
4452    using --hg-sanity-flags=xx1xxx i.e. SCE_ACCESS flag.
4453    When this flag is activated, a call to this function will happen each time
4454    a stack trace is needed for a memory access. */
4455 __attribute__((noinline))
4456 static Bool check_cached_rcec_ok (Thr* thr, Addr previous_frame0)
4457 {
4458    Bool  ok = True;
4459    UInt  i;
4460    UWord frames[N_FRAMES];
4461    UWord sps[N_FRAMES];
4462    UWord fps[N_FRAMES];
4463    const DiEpoch cur_ep = VG_(current_DiEpoch)();
4464
4465    for (i = 0; i < N_FRAMES; i++)
4466       frames[i] = sps[i] = fps[i] = 0;
4467    VG_(get_StackTrace)( thr->hgthread->coretid, &frames[0], N_FRAMES,
4468                         &sps[0], &fps[0], 0);
4469    for (i = 0; i < N_FRAMES; i++) {
4470       if ( thr->cached_rcec.frames[i] != frames[i] ) {
4471          /* There are a bunch of "normal" reasons for which a stack
4472             derived from the cached rcec differs from frames. */
4473          const HChar *reason = NULL;
4474
4475          /* Old linkers (e.g. RHEL5) gave no cfi unwind information in the PLT
4476             section (fix was added in binutils around June 2011).
4477             Without PLT unwind info, stacktrace in the PLT section are
4478             missing an entry. E.g. the cached stacktrace is:
4479               ==4463==    at 0x2035C0: ___tls_get_addr (dl-tls.c:753)
4480               ==4463==    by 0x33B7F9: __libc_thread_freeres
4481                                                 (in /lib/libc-2.11.2.so)
4482               ==4463==    by 0x39BA4F: start_thread (pthread_create.c:307)
4483               ==4463==    by 0x2F107D: clone (clone.S:130)
4484            while the 'check stacktrace' is
4485               ==4463==    at 0x2035C0: ___tls_get_addr (dl-tls.c:753)
4486               ==4463==    by 0x33B82D: strerror_thread_freeres
4487                                                 (in /lib/libc-2.11.2.so)
4488               ==4463==    by 0x33B7F9: __libc_thread_freeres
4489                                                 (in /lib/libc-2.11.2.so)
4490               ==4463==    by 0x39BA4F: start_thread (pthread_create.c:307)
4491               ==4463==    by 0x2F107D: clone (clone.S:130)
4492            No cheap/easy way to detect or fix that. */
4493
4494          /* It seems that sometimes, the CFI unwind info looks wrong
4495             for a 'ret' instruction. E.g. here is the unwind info
4496             for a 'retq' on gcc20 (amd64, Debian 7)
4497                 [0x4e3ddfe .. 0x4e3ddfe]: let cfa=oldSP+48 in RA=*(cfa+-8)
4498                                                       SP=cfa+0 BP=*(cfa+-24)
4499             This unwind info looks doubtful, as the RA should be at oldSP.
4500             No easy way to detect this problem.
4501             This gives a difference between cached rcec and
4502             current stack trace: the cached rcec is correct. */
4503
4504          /* When returning from main, unwind info becomes erratic.
4505             So, by default, only report errors for main and above,
4506             unless asked to show below main. */
4507          if (reason == NULL) {
4508             UInt fr_main;
4509             Vg_FnNameKind fr_kind;
4510             for (fr_main = 0; fr_main < N_FRAMES; fr_main++) {
4511                fr_kind = VG_(get_fnname_kind_from_IP)
4512                                 (cur_ep, frames[fr_main]);
4513                if (fr_kind == Vg_FnNameMain || fr_kind == Vg_FnNameBelowMain)
4514                   break;
4515             }
4516             UInt kh_main;
4517             Vg_FnNameKind kh_kind;
4518             for (kh_main = 0; kh_main < N_FRAMES; kh_main++) {
4519                kh_kind = VG_(get_fnname_kind_from_IP)
4520                                 (cur_ep, thr->cached_rcec.frames[kh_main]);
4521                if (kh_kind == Vg_FnNameMain || kh_kind == Vg_FnNameBelowMain)
4522                   break;
4523             }
4524             if (kh_main == fr_main
4525                 && kh_kind == fr_kind
4526                 && (kh_main < i || (kh_main == i
4527                                     && kh_kind == Vg_FnNameBelowMain))) {
4528                // found main or below main before the difference
4529                reason = "Below main";
4530             }
4531          }
4532
4533          /* We have places where the stack is missing some internal
4534             pthread functions. For such stacktraces, GDB reports only
4535             one function, telling:
4536                #0  0xf7fa81fe in _L_unlock_669 ()
4537                               from /lib/i386-linux-gnu/libpthread.so.0
4538                Backtrace stopped: previous frame identical to
4539                                             this frame (corrupt stack?)
4540
4541             This is when sps and fps are identical.
4542             The cached stack trace is then
4543                ==3336==    at 0x40641FE: _L_unlock_669
4544                                               (pthread_mutex_unlock.c:310)
4545                ==3336==    by 0x40302BE: pthread_mutex_unlock
4546                                               (hg_intercepts.c:710)
4547                ==3336==    by 0x80486AF: main (cond_timedwait_test.c:14)
4548            while the 'check stacktrace' is
4549                ==3336==    at 0x40641FE: _L_unlock_669
4550                                               (pthread_mutex_unlock.c:310)
4551                ==3336==    by 0x4064206: _L_unlock_669
4552                                               (pthread_mutex_unlock.c:310)
4553                ==3336==    by 0x4064132: __pthread_mutex_unlock_usercnt
4554                                               (pthread_mutex_unlock.c:57)
4555                ==3336==    by 0x40302BE: pthread_mutex_unlock
4556                                                (hg_intercepts.c:710)
4557                ==3336==    by 0x80486AF: main (cond_timedwait_test.c:14) */
4558          if (reason == NULL) {
4559             if ((i > 0
4560                       && sps[i] == sps[i-1] && fps[i] == fps[i-1])
4561                 || (i < N_FRAMES-1
4562                       && sps[i] == sps[i+1] && fps[i] == fps[i+1])) {
4563                reason = "previous||next frame: identical sp and fp";
4564             }
4565          }
4566          if (reason == NULL) {
4567             if ((i > 0
4568                       && fps[i] == fps[i-1])
4569                 || (i < N_FRAMES-1
4570                       && fps[i] == fps[i+1])) {
4571                reason = "previous||next frame: identical fp";
4572             }
4573          }
4574
4575          /* When we have a read or write 'in the middle of a push instruction',
4576             then the normal backtrace is not very good, while the helgrind
4577             stacktrace is better, as it undoes the not yet fully finished
4578             push instruction before getting the stacktrace. */
4579          if (reason == NULL && thr->hgthread->first_sp_delta != 0) {
4580             reason = "fixupSP probably needed for check stacktrace";
4581          }
4582
4583          /* Unwinding becomes hectic when running the exit handlers.
4584             None of GDB, cached stacktrace and check stacktrace corresponds.
4585             So, if we find __run_exit_handlers, ignore the difference. */
4586          if (reason == NULL) {
4587             const HChar *fnname;
4588             for (UInt f = 0; f < N_FRAMES; f++) {
4589                if (VG_(get_fnname)( cur_ep, frames[f], &fnname)
4590                    && VG_(strcmp) ("__run_exit_handlers", fnname) == 0) {
4591                   reason = "exit handlers";
4592                   break;
4593                }
4594             }
4595          }
4596
4597          // Show what we have found for this difference
4598          if (reason == NULL) {
4599             ok = False;
4600             stats__cached_rcec_diff++;
4601          } else {
4602             ok = True;
4603             stats__cached_rcec_diff_known_reason++;
4604          }
4605          if (!ok || VG_(clo_verbosity) > 2) {
4606             Bool save_show_below_main = VG_(clo_show_below_main);
4607             VG_(clo_show_below_main) = True;
4608             /* The below error msg reports an unexpected diff in 'frame %d'.
4609                The (maybe wrong) pc found in the cached stacktrace is
4610                'cached_pc %p' while an unwind gives the (maybe wrong)
4611                'check_pc %p'.
4612                After, 'previous_frame0 %p' tells where the cached stacktrace
4613                was taken.
4614                This is then followed by the full resulting cache stack trace
4615                and the full stack trace found doing unwind.
4616                Such a diff can have various origins:
4617                  * a bug in the unwinder, when the cached stack trace was taken
4618                    at 'previous_frame0'
4619                  * a bug in the unwinder, when the check stack trace was taken
4620                    (i.e. at current pc).
4621                  * a missing 'invalidate cache stack trace' somewhere in the
4622                    instructions between 'previous_frame0' and current_pc.
4623                To investigate the last case, typically, disass the range of
4624                instructions where an invalidate cached stack might miss. */
4625             VG_(printf)("%s diff tid %u frame %u "
4626                         "cached_pc %p check_pc %p\n",
4627                         reason ? reason : "unexpected",
4628                         thr->hgthread->coretid,
4629                         i,
4630                         (void*)thr->cached_rcec.frames[i],
4631                         (void*)frames[i]);
4632             VG_(printf)("cached stack trace previous_frame0 %p\n",
4633                         (void*)previous_frame0);
4634             VG_(pp_StackTrace)(cur_ep, &previous_frame0, 1);
4635             VG_(printf)("resulting cached stack trace:\n");
4636             VG_(pp_StackTrace)(cur_ep, thr->cached_rcec.frames, N_FRAMES);
4637             VG_(printf)("check stack trace:\n");
4638             VG_(pp_StackTrace)(cur_ep, frames, N_FRAMES);
4639
4640             VG_(show_sched_status) (False,  // host_stacktrace
4641                                     False,  // stack_usage
4642                                     False); // exited_threads
4643             if (VG_(clo_vgdb_error) == 1234567890) // HACK TO ALLOW TO DEBUG
4644                VG_(gdbserver) ( thr->hgthread->coretid );
4645             VG_(clo_show_below_main) = save_show_below_main;
4646          }
4647          break; // Stop giving more errors for this stacktrace.
4648       }
4649    }
4650    return ok;
4651 }
4652
4653 __attribute__((noinline))
4654 static RCEC* get_RCEC ( Thr* thr )
4655 {
4656    UInt  i;
4657    UWord hash;
4658    Addr  previous_frame0 = 0; // Assignment needed to silence gcc
4659    RCEC  *res;
4660    const Bool thr_cached_rcec_valid = cached_rcec_valid(thr);
4661    const Addr cur_ip = VG_(get_IP)(thr->hgthread->coretid);
4662
4663    if (DEBUG_CACHED_RCEC)
4664       VG_(printf)("get rcec tid %u at IP %p SP %p"
4665                   " first_sp_delta %ld cached valid %d\n",
4666                   thr->hgthread->coretid,
4667                   (void*)cur_ip,
4668                   (void*)VG_(get_SP)(thr->hgthread->coretid),
4669                   thr->hgthread->first_sp_delta, thr_cached_rcec_valid);
4670
4671    /* If we have a valid cached rcec, derive the new rcec from the cached one
4672       and update the cached one.
4673       Otherwise, compute a fresh rcec. */
4674
4675    if (thr_cached_rcec_valid) {
4676       /* Update the stacktrace of the cached rcec with the current IP */
4677       previous_frame0 = thr->cached_rcec.frames[0];
4678       thr->cached_rcec.frames[0] = cur_ip;
4679
4680 #     if defined(VGP_x86_linux)
4681       // See m_stacktrace.c kludge
4682       extern Addr VG_(client__dl_sysinfo_int80);
4683       /// #include pub_core_clientstate needed for the above ????
4684       /// or move the above into a pub_tool_??? tool_stacktrace.h maybe ????
4685       if (VG_(client__dl_sysinfo_int80) != 0 /* we know its address */
4686           && cur_ip >= VG_(client__dl_sysinfo_int80)
4687           && cur_ip < VG_(client__dl_sysinfo_int80)+3
4688           ) {
4689          thr->cached_rcec.frames[0]
4690             = (ULong) *(Addr*)(UWord)VG_(get_SP)(thr->hgthread->coretid);
4691       }
4692 #     endif
4693
4694       if (previous_frame0 == thr->cached_rcec.frames[0])
4695          stats__cached_rcec_identical++;
4696       else
4697          stats__cached_rcec_updated++;
4698    } else {
4699       /* Compute a fresh stacktrace. */
4700       main_get_stacktrace( thr, &thr->cached_rcec.frames[0], N_FRAMES );
4701       if (DEBUG_CACHED_RCEC) {
4702          Bool save_show_below_main = VG_(clo_show_below_main);
4703          VG_(clo_show_below_main) = True;
4704          VG_(printf)("caching stack trace:\n");
4705          VG_(pp_StackTrace)(VG_(current_DiEpoch)(),
4706                             &thr->cached_rcec.frames[0], N_FRAMES);
4707          VG_(clo_show_below_main) = save_show_below_main;
4708       }
4709       stats__cached_rcec_fresh++;
4710    }
4711
4712    hash = 0;
4713    for (i = 0; i < N_FRAMES; i++) {
4714       hash ^= thr->cached_rcec.frames[i];
4715       hash = ROLW(hash, 19);
4716    }
4717    thr->cached_rcec.frames_hash = hash;
4718    res = ctxt__find_or_add( &thr->cached_rcec );
4719
4720    if (UNLIKELY(HG_(clo_sanity_flags) & SCE_ACCESS)
4721        && thr_cached_rcec_valid) {
4722       /* In case the cached and check differ, invalidate the cached rcec.
4723          We have less duplicated diffs reported afterwards. */
4724       if (!check_cached_rcec_ok (thr, previous_frame0))
4725          set_cached_rcec_validity(thr, False);
4726    } else {
4727       if (HG_(clo_delta_stacktrace) && !thr_cached_rcec_valid)
4728             set_cached_rcec_validity(thr, True);
4729    }
4730
4731    return res;
4732 }
4733
4734 ///////////////////////////////////////////////////////
4735 //// Part (2):
4736 ///  A hashtable guest-addr -> OldRef, that refers to (1)
///  Note: we use the guest address as key. This means that the entries
///  for multiple threads accessing the same address will land in the same
///  bucket. It might be nice to have a better distribution of the
///  OldRef in the hashtable by using the guest address ^ tsw as key.
///  The problem is that when a race is reported on a ga, we need to
///  efficiently retrieve the accesses to ga by other threads, using only
///  the ga.
///  Measurements on firefox have shown that the chain length is reasonable.
4744
4745 /* Records an access: a thread, a context (size & writeness) and the
4746    number of held locks. The size (1,2,4,8) is stored as is in szB.
4747    Note that szB uses more bits than needed to store a size up to 8.
4748    This allows to use a TSW as a fully initialised UInt e.g. in
4749    cmp_oldref_tsw. If needed, a more compact representation of szB
4750    can be done (e.g. use only 4 bits, or use only 2 bits and encode the
4751    size (1,2,4,8) as 00 = 1, 01 = 2, 10 = 4, 11 = 8. */
4752 typedef
4753    struct {
4754       UInt      thrid  : SCALARTS_N_THRBITS;
4755       UInt      szB    : 32 - SCALARTS_N_THRBITS - 1;
4756       UInt      isW    : 1;
4757    } TSW; // Thread+Size+Writeness
4758 typedef
4759    struct {
4760       TSW       tsw;
4761       WordSetID locksHeldW;
4762       RCEC*     rcec;
4763    }
4764    Thr_n_RCEC;
4765
4766 typedef
4767    struct OldRef {
4768       struct OldRef *ht_next; // to link hash table nodes together.
4769       UWord  ga; // hash_table key, == address for which we record an access.
4770       struct OldRef *prev; // to refs older than this one
4771       struct OldRef *next; // to refs newer that this one
4772       UWord stamp; // allows to order (by time of access) 2 OldRef
4773       Thr_n_RCEC acc;
4774    }
4775    OldRef;
4776
4777 /* Returns the or->tsw as an UInt */
4778 static inline UInt oldref_tsw (const OldRef* or)
4779 {
4780    return *(const UInt*)(&or->acc.tsw);
4781 }
4782
4783 /* Compare the tsw component for 2 OldRef.
4784    Used for OldRef hashtable (which already verifies equality of the
4785    'key' part. */
4786 static Word cmp_oldref_tsw (const void* node1, const void* node2 )
4787 {
4788    const UInt tsw1 = oldref_tsw(node1);
4789    const UInt tsw2 = oldref_tsw(node2);
4790
4791    if (tsw1 < tsw2) return -1;
4792    if (tsw1 > tsw2) return  1;
4793    return 0;
4794 }
4795
4796
4797 //////////// BEGIN OldRef pool allocator
4798 static PoolAlloc* oldref_pool_allocator;
// Note: We only allocate elements in this pool allocator, we never free them.
// We stop allocating elements at HG_(clo_conflict_cache_size).
4801 //////////// END OldRef pool allocator
4802
4803 static OldRef mru;
4804 static OldRef lru;
// A doubly linked list, chaining all OldRef in mru/lru order.
// mru/lru are sentinel nodes.
4807 // Whenever an oldref is re-used, its position is changed as the most recently
4808 // used (i.e. pointed to by mru.prev).
4809 // When a new oldref is needed, it is allocated from the pool
4810 //  if we have not yet reached --conflict-cache-size.
4811 // Otherwise, if all oldref have already been allocated,
4812 // the least recently used (i.e. pointed to by lru.next) is re-used.
4813 // When an OldRef is used, it is moved as the most recently used entry
4814 // (i.e. pointed to by mru.prev).
4815
4816 // Removes r from the double linked list
4817 // Note: we do not need to test for special cases such as
4818 // NULL next or prev pointers, because we have sentinel nodes
4819 // at both sides of the list. So, a node is always forward and
4820 // backward linked.
4821 static inline void OldRef_unchain(OldRef *r)
4822 {
4823    r->next->prev = r->prev;
4824    r->prev->next = r->next;
4825 }
4826
4827 // Insert new as the newest OldRef
4828 // Similarly to OldRef_unchain, no need to test for NULL
4829 // pointers, as e.g. mru.prev is always guaranteed to point
4830 // to a non NULL node (lru when the list is empty).
4831 static inline void OldRef_newest(OldRef *new)
4832 {
4833    new->next = &mru;
4834    new->prev = mru.prev;
4835    mru.prev = new;
4836    new->prev->next = new;
4837 }
4838
4839
4840 static VgHashTable* oldrefHT    = NULL; /* Hash table* OldRef* */
4841 static UWord     oldrefHTN    = 0;    /* # elems in oldrefHT */
4842 /* Note: the nr of ref in the oldrefHT will always be equal to
4843    the nr of elements that were allocated from the OldRef pool allocator
4844    as we never free an OldRef : we just re-use them. */
4845
4846
4847 /* allocates a new OldRef or re-use the lru one if all allowed OldRef
4848    have already been allocated. */
4849 static OldRef* alloc_or_reuse_OldRef ( void )
4850 {
4851    if (oldrefHTN < HG_(clo_conflict_cache_size)) {
4852       oldrefHTN++;
4853       return VG_(allocEltPA) ( oldref_pool_allocator );
4854    } else {
4855       OldRef *oldref_ht;
4856       OldRef *oldref = lru.next;
4857
4858       OldRef_unchain(oldref);
4859       oldref_ht = VG_(HT_gen_remove) (oldrefHT, oldref, cmp_oldref_tsw);
4860       tl_assert (oldref == oldref_ht);
4861       ctxt__rcdec( oldref->acc.rcec );
4862       return oldref;
4863    }
4864 }
4865
4866
4867 inline static UInt min_UInt ( UInt a, UInt b ) {
4868    return a < b ? a : b;
4869 }
4870
4871 /* Compare the intervals [a1,a1+n1) and [a2,a2+n2).  Return -1 if the
4872    first interval is lower, 1 if the first interval is higher, and 0
4873    if there is any overlap.  Redundant paranoia with casting is there
4874    following what looked distinctly like a bug in gcc-4.1.2, in which
4875    some of the comparisons were done signedly instead of
4876    unsignedly. */
4877 /* Copied from exp-ptrcheck/sg_main.c */
4878 static inline Word cmp_nonempty_intervals ( Addr a1, SizeT n1,
4879                                             Addr a2, SizeT n2 ) {
4880    UWord a1w = (UWord)a1;
4881    UWord n1w = (UWord)n1;
4882    UWord a2w = (UWord)a2;
4883    UWord n2w = (UWord)n2;
4884    tl_assert(n1w > 0 && n2w > 0);
4885    if (a1w + n1w <= a2w) return -1L;
4886    if (a2w + n2w <= a1w) return 1L;
4887    return 0;
4888 }
4889
/* Counter used to stamp each OldRef when touched.  event_map_bind
   copies the current value into the OldRef it creates or refreshes
   and then increments the counter.  Readers compare stamps after
   subtracting event_map_stamp, so that the ordering remains usable
   if the counter has wrapped around. */
static UWord event_map_stamp = 0; // Used to stamp each OldRef when touched.
4891
4892 static void event_map_bind ( Addr a, SizeT szB, Bool isW, Thr* thr )
4893 {
4894    OldRef  example;
4895    OldRef* ref;
4896    RCEC*   rcec;
4897
4898    tl_assert(thr);
4899    ThrID thrid = thr->thrid;
4900    tl_assert(thrid != 0); /* zero is used to denote an empty slot. */
4901
4902    WordSetID locksHeldW = thr->hgthread->locksetW;
4903
4904    rcec = get_RCEC( thr );
4905
4906    /* Look in the oldrefHT to see if we already have a record for this
4907       address/thr/sz/isW. */
4908    example.ga = a;
4909    example.acc.tsw = (TSW) {.thrid = thrid,
4910                             .szB = szB,
4911                             .isW = (UInt)(isW & 1)};
4912    ref = VG_(HT_gen_lookup) (oldrefHT, &example, cmp_oldref_tsw);
4913
4914    if (ref) {
4915       /* We already have a record for this address and this (thrid, R/W,
4916          size) triple. */
4917       tl_assert (ref->ga == a);
4918
4919       /* thread 'thr' has an entry.  Update its RCEC, if it differs. */
4920       if (rcec == ref->acc.rcec)
4921          stats__ctxt_eq_tsw_eq_rcec++;
4922       else {
4923          stats__ctxt_eq_tsw_neq_rcec++;
4924          ctxt__rcdec( ref->acc.rcec );
4925          ctxt__rcinc(rcec);
4926          ref->acc.rcec       = rcec;
4927       }
4928       tl_assert(ref->acc.tsw.thrid == thrid);
4929       /* Update the stamp, RCEC and the W-held lockset. */
4930       ref->stamp = event_map_stamp;
4931       ref->acc.locksHeldW = locksHeldW;
4932
4933       OldRef_unchain(ref);
4934       OldRef_newest(ref);
4935
4936    } else {
4937       tl_assert (szB == 4 || szB == 8 ||szB == 1 || szB == 2);
4938       // We only need to check the size the first time we insert a ref.
4939       // Check for most frequent cases first
4940       // Note: we could support a szB up to 1 << (32 - SCALARTS_N_THRBITS - 1)
4941
4942       /* We don't have a record for this address+triple.  Create a new one. */
4943       stats__ctxt_neq_tsw_neq_rcec++;
4944       ref = alloc_or_reuse_OldRef();
4945       ref->ga = a;
4946       ref->acc.tsw = (TSW) {.thrid  = thrid,
4947                             .szB    = szB,
4948                             .isW    = (UInt)(isW & 1)};
4949       ref->stamp = event_map_stamp;
4950       ref->acc.locksHeldW = locksHeldW;
4951       ref->acc.rcec       = rcec;
4952       ctxt__rcinc(rcec);
4953
4954       VG_(HT_add_node) ( oldrefHT, ref );
4955       OldRef_newest (ref);
4956    }
4957    event_map_stamp++;
4958 }
4959
4960
/* Extract info from the conflicting-access machinery.
   Looks for a recorded access that conflicts with an access by thr to
   [a, a+szB[ (a write if isW, else a read).

   A recorded access qualifies if it was made by a different thread,
   overlaps [a, a+szB[, and is not a read when the queried access is
   itself a read.

   On success, returns True and fills in:
     *resEC       an ExeContext built from the recorded stack frames
     *resThr      the thread that made the recorded access
     *resSzB      the size of the recorded access
     *resIsW      whether the recorded access was a write
     *locksHeldW  the lockset stored with the recorded access
   If nothing suitable is found, returns False and leaves the OUT
   parameters untouched.

   szB must be 1, 2, 4 or 8 (asserted).

   NOTE(review): candidate start addresses are examined one at a time
   (a first, then a-7 .. a+szB-1) and the function returns as soon as
   one address yields a match.  The stamp comparison therefore selects
   the newest record among those sharing that start address, not
   necessarily the newest across all overlapping start addresses. */
Bool libhb_event_map_lookup ( /*OUT*/ExeContext** resEC,
                              /*OUT*/Thr**        resThr,
                              /*OUT*/SizeT*       resSzB,
                              /*OUT*/Bool*        resIsW,
                              /*OUT*/WordSetID*   locksHeldW,
                              Thr* thr, Addr a, SizeT szB, Bool isW )
{
   Word    i, j;
   OldRef *ref = NULL;      /* best match found so far */
   SizeT  ref_szB = 0;      /* size of the access recorded in 'ref' */

   OldRef *cand_ref;
   SizeT  cand_ref_szB;
   Addr   cand_a;

   /* Candidate start addresses: a itself plus a-7 .. a+szB-1 without
      a, i.e. at most 1 + 7 + 7 = 15 entries when szB == 8. */
   Addr toCheck[15];
   Int  nToCheck = 0;

   tl_assert(thr);
   tl_assert(szB == 8 || szB == 4 || szB == 2 || szB == 1);

   ThrID thrid = thr->thrid;

   /* Try the exact address first.  Recorded accesses are at most 8
      bytes long (event_map_bind asserts this on insertion), so one
      starting up to 7 bytes below a can still overlap a. */
   toCheck[nToCheck++] = a;
   for (i = -7; i < (Word)szB; i++) {
      if (i != 0)
         toCheck[nToCheck++] = a + i;
   }
   tl_assert(nToCheck <= 15);

   /* Now see if we can find a suitable matching event for
      any of the addresses in toCheck[0 .. nToCheck-1]. */
   for (j = 0; j < nToCheck; j++) {

      cand_a = toCheck[j];
      //      VG_(printf)("test %ld %p\n", j, cand_a);

      /* Find the first HT element for this address.
         We might have several of these. They will be linked via ht_next.
         We however need to check various elements as the list contains
         all elements that map to the same bucket. */
      for (cand_ref = VG_(HT_lookup)( oldrefHT, cand_a );
           cand_ref; cand_ref = cand_ref->ht_next) {
         if (cand_ref->ga != cand_a)
            /* OldRef for another address in this HT bucket. Ignore. */
            continue;

         if (cand_ref->acc.tsw.thrid == thrid)
            /* This is an access by the same thread, but we're only
               interested in accesses from other threads.  Ignore. */
            continue;

         if ((!cand_ref->acc.tsw.isW) && (!isW))
            /* We don't want to report a read racing against another
               read; that's stupid.  So in this case move on. */
            continue;

         cand_ref_szB        = cand_ref->acc.tsw.szB;
         if (cmp_nonempty_intervals(a, szB, cand_a, cand_ref_szB) != 0)
            /* No overlap with the access we're asking about.  Ignore. */
            continue;

         /* We have a match. Keep this match if it is newer than
            the previous match. Note that stamps are unsigned words, and
            for long running applications, event_map_stamp might have
            wrapped around.  So, 'roll' each stamp using event_map_stamp
            to get the stamps in the right order, in case
            event_map_stamp has wrapped. */
         if (!ref
             || (ref->stamp - event_map_stamp)
                   < (cand_ref->stamp - event_map_stamp)) {
            ref = cand_ref;
            ref_szB = cand_ref_szB;
         }
      }

      if (ref) {
         /* return with success */
         Int n, maxNFrames;
         RCEC*     ref_rcec = ref->acc.rcec;
         tl_assert(ref->acc.tsw.thrid);
         tl_assert(ref_rcec);
         tl_assert(ref_rcec->magic == RCEC_MAGIC);
         tl_assert(ref_szB >= 1);
         /* Count how many non-zero frames we have, looking at no more
            than min(N_FRAMES, VG_(clo_backtrace_size)) of them. */
         maxNFrames = min_UInt(N_FRAMES, VG_(clo_backtrace_size));
         for (n = 0; n < maxNFrames; n++) {
            if (0 == ref_rcec->frames[n]) break;
         }
         *resEC      = VG_(make_ExeContext_from_StackTrace)(ref_rcec->frames,
                                                            n);
         *resThr     = Thr__from_ThrID(ref->acc.tsw.thrid);
         *resSzB     = ref_szB;
         *resIsW     = ref->acc.tsw.isW;
         *locksHeldW = ref->acc.locksHeldW;
         stats__evm__lookup_found++;
         return True;
      }

      /* consider next address in toCheck[] */
   } /* for (j = 0; j < nToCheck; j++) */

   /* really didn't find anything. */
   stats__evm__lookup_notfound++;
   return False;
}
5068
5069
5070 void libhb_event_map_access_history ( Addr a, SizeT szB, Access_t fn )
5071 {
5072    OldRef *ref = lru.next;
5073    SizeT ref_szB;
5074    Int n;
5075
5076    while (ref != &mru) {
5077       ref_szB = ref->acc.tsw.szB;
5078       if (cmp_nonempty_intervals(a, szB, ref->ga, ref_szB) == 0) {
5079          RCEC* ref_rcec = ref->acc.rcec;
5080          for (n = 0; n < N_FRAMES; n++) {
5081             if (0 == ref_rcec->frames[n]) {
5082                break;
5083             }
5084          }
5085          (*fn)(ref_rcec->frames, n,
5086                Thr__from_ThrID(ref->acc.tsw.thrid),
5087                ref->ga,
5088                ref_szB,
5089                ref->acc.tsw.isW,
5090                ref->acc.locksHeldW);
5091       }
5092       tl_assert (ref->next == &mru
5093                  || ((ref->stamp - event_map_stamp)
5094                         < ref->next->stamp - event_map_stamp));
5095       ref = ref->next;
5096    }
5097 }
5098
5099 static void event_map_init ( void )
5100 {
5101    Word i;
5102
5103    /* Context (RCEC) pool allocator */
5104    rcec_pool_allocator = VG_(newPA) (
5105                              sizeof(RCEC),
5106                              1000 /* RCECs per pool */,
5107                              HG_(zalloc),
5108                              "libhb.event_map_init.1 (RCEC pools)",
5109                              HG_(free)
5110                           );
5111
5112    /* Context table */
5113    tl_assert(!contextTab);
5114    contextTab = HG_(zalloc)( "libhb.event_map_init.2 (context table)",
5115                              N_RCEC_TAB * sizeof(RCEC*) );
5116    for (i = 0; i < N_RCEC_TAB; i++)
5117       contextTab[i] = NULL;
5118
5119    /* Oldref pool allocator */
5120    oldref_pool_allocator = VG_(newPA)(
5121                                sizeof(OldRef),
5122                                1000 /* OldRefs per pool */,
5123                                HG_(zalloc),
5124                                "libhb.event_map_init.3 (OldRef pools)",
5125                                HG_(free)
5126                             );
5127
5128    /* Oldref hashtable */
5129    tl_assert(!oldrefHT);
5130    oldrefHT = VG_(HT_construct) ("libhb.event_map_init.4 (oldref hashtable)");
5131
5132    oldrefHTN = 0;
5133    mru.prev = &lru;
5134    mru.next = NULL;
5135    lru.prev = NULL;
5136    lru.next = &mru;
5137    mru.acc = (Thr_n_RCEC) {.tsw = {.thrid = 0,
5138                                    .szB = 0,
5139                                    .isW = 0},
5140                            .locksHeldW = 0,
5141                            .rcec = NULL};
5142    lru.acc = mru.acc;
5143 }
5144
5145 static void event_map__check_reference_counts ( void )
5146 {
5147    RCEC*   rcec;
5148    OldRef* oldref;
5149    Word    i;
5150    UWord   nEnts = 0;
5151
5152    /* Set the 'check' reference counts to zero.  Also, optionally
5153       check that the real reference counts are non-zero.  We allow
5154       these to fall to zero before a GC, but the GC must get rid of
5155       all those that are zero, hence none should be zero after a
5156       GC. */
5157    for (i = 0; i < N_RCEC_TAB; i++) {
5158       for (rcec = contextTab[i]; rcec; rcec = rcec->next) {
5159          nEnts++;
5160          tl_assert(rcec);
5161          tl_assert(rcec->magic == RCEC_MAGIC);
5162          rcec->rcX = 0;
5163       }
5164    }
5165
5166    /* check that the stats are sane */
5167    tl_assert(nEnts == stats__ctxt_tab_curr);
5168    tl_assert(stats__ctxt_tab_curr <= stats__ctxt_tab_max);
5169
5170    /* visit all the referencing points, inc check ref counts */
5171    VG_(HT_ResetIter)( oldrefHT );
5172    oldref = VG_(HT_Next)( oldrefHT );
5173    while (oldref) {
5174       tl_assert (oldref->acc.tsw.thrid);
5175       tl_assert (oldref->acc.rcec);
5176       tl_assert (oldref->acc.rcec->magic == RCEC_MAGIC);
5177       oldref->acc.rcec->rcX++;
5178       oldref = VG_(HT_Next)( oldrefHT );
5179    }
5180
5181    /* compare check ref counts with actual */
5182    for (i = 0; i < N_RCEC_TAB; i++) {
5183       for (rcec = contextTab[i]; rcec; rcec = rcec->next) {
5184          tl_assert(rcec->rc == rcec->rcX);
5185       }
5186    }
5187 }
5188
5189 __attribute__((noinline))
5190 static void do_RCEC_GC ( void )
5191 {
5192    UInt i;
5193
5194    if (VG_(clo_stats)) {
5195       static UInt ctr = 1;
5196       VG_(message)(Vg_DebugMsg,
5197                   "libhb: RCEC GC: #%u  %lu slots,"
5198                    " %lu cur ents(ref'd %lu),"
5199                    " %lu max ents\n",
5200                    ctr++,
5201                    (UWord)N_RCEC_TAB,
5202                    stats__ctxt_tab_curr, RCEC_referenced,
5203                    stats__ctxt_tab_max );
5204    }
5205    tl_assert (stats__ctxt_tab_curr > RCEC_referenced);
5206
5207    /* Throw away all RCECs with zero reference counts */
5208    for (i = 0; i < N_RCEC_TAB; i++) {
5209       RCEC** pp = &contextTab[i];
5210       RCEC*  p  = *pp;
5211       while (p) {
5212          if (p->rc == 0) {
5213             *pp = p->next;
5214             free_RCEC(p);
5215             p = *pp;
5216             tl_assert(stats__ctxt_tab_curr > 0);
5217             stats__ctxt_rcec_gc_discards++;
5218             stats__ctxt_tab_curr--;
5219          } else {
5220             pp = &p->next;
5221             p = p->next;
5222          }
5223       }
5224    }
5225
5226    tl_assert (stats__ctxt_tab_curr == RCEC_referenced);
5227 }
5228
5229 /////////////////////////////////////////////////////////
5230 //                                                     //
5231 // Core MSM                                            //
5232 //                                                     //
5233 /////////////////////////////////////////////////////////
5234
5235 /* Logic in msmcread/msmcwrite updated/verified after re-analysis, 19
5236    Nov 08, and again after [...],
5237    June 09. */
5238
/* Statistics counters for the core MSM.
   stats__msmcread / stats__msmcwrite are incremented on every call of
   msmcread / msmcwrite.  The *_change counters are incremented when
   the resulting shadow value differs from the old one and the access
   is recorded via event_map_bind (history level >= 2, old and new
   values both C values). */
static ULong stats__msmcread         = 0;
static ULong stats__msmcread_change  = 0;
static ULong stats__msmcwrite        = 0;
static ULong stats__msmcwrite_change = 0;
5243
5244 /* Some notes on the H1 history mechanism:
5245
5246    Transition rules are:
5247
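   read_{Kr,Kw}(Cr,Cw)  = (Cr,           Cr `join` Kw)
   write_{Kr,Kw}(Cr,Cw) = (Cr `join` Kw, Cr `join` Kw)

   (Note: msmcread, and the race branch of msmcwrite, compute the
   second component as VtsID__join2(wmini, tviW), that is, as
   Cw `join` Kw rather than Cr `join` Kw as written above.)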
5250
5251    After any access by a thread T to a location L, L's constraint pair
5252    (Cr,Cw) has Cw[T] == T's Kw[T], that is, == T's scalar W-clock.
5253
5254    After a race by thread T conflicting with some previous access by
5255    some other thread U, for a location with constraint (before
   processing the later access) (Cr,Cw), then Cw[U] is the segment in
   which the previous access lies.
5258
5259    Hence in record_race_info, we pass in Cfailed and Kfailed, which
5260    are compared so as to find out which thread(s) this access
5261    conflicts with.  Once that is established, we also require the
5262    pre-update Cw for the location, so we can index into it for those
5263    threads, to get the scalar clock values for the point at which the
5264    former accesses were made.  (In fact we only bother to do any of
5265    this for an arbitrarily chosen one of the conflicting threads, as
5266    that's simpler, it avoids flooding the user with vast amounts of
5267    mostly useless information, and because the program is wrong if it
5268    contains any races at all -- so we don't really need to show all
5269    conflicting access pairs initially, so long as we only show none if
5270    none exist).
5271
5272    ---
5273
5274    That requires the auxiliary proof that
5275
5276       (Cr `join` Kw)[T] == Kw[T]
5277
5278    Why should that be true?  Because for any thread T, Kw[T] >= the
5279    scalar clock value for T known by any other thread.  In other
5280    words, because T's value for its own scalar clock is at least as up
5281    to date as the value for it known by any other thread (that is true
5282    for both the R- and W- scalar clocks).  Hence no other thread will
5283    be able to feed in a value for that element (indirectly via a
5284    constraint) which will exceed Kw[T], and hence the join cannot
5285    cause that particular element to advance.
5286 */
5287
/* Reports a race detected by msmcread/msmcwrite.

   acc_thr   thread making the racing access
   acc_addr  address of the access
   szB       size of the access in bytes
   isWrite   True for a write, False for a read
   Cfailed   the constraint (from the location's shadow value) that
             the access failed to satisfy
   Kfailed   the thread's clock that was compared against Cfailed
   Cw        the location's pre-update write constraint; in this
             function it is only referenced by the disabled debug
             printf

   At history level 1 the stacks bounding the conflicting segment of
   one conflicting thread are looked up here and passed along; at
   other levels those three arguments are NULL.  The error itself is
   recorded by HG_(record_error_Race). */
__attribute__((noinline))
static void record_race_info ( Thr* acc_thr,
                               Addr acc_addr, SizeT szB, Bool isWrite,
                               VtsID Cfailed,
                               VtsID Kfailed,
                               VtsID Cw )
{
   /* Call here to report a race.  We just hand it onwards to
      HG_(record_error_Race).  If that in turn discovers that the
      error is going to be collected, then, at history_level 2, that
      queries the conflicting-event map.  The alternative would be to
      query it right here.  But that causes a lot of pointless queries
      for errors which will shortly be discarded as duplicates, and
      can become a performance overhead; so we defer the query until
      we know the error is not a duplicate. */

   /* Stacks for the bounds of the (or one of the) conflicting
      segment(s).  These are only set at history_level 1. */
   ExeContext* hist1_seg_start = NULL;
   ExeContext* hist1_seg_end   = NULL;
   Thread*     hist1_conf_thr  = NULL;

   tl_assert(acc_thr);
   tl_assert(acc_thr->hgthread);
   tl_assert(acc_thr->hgthread->hbthr == acc_thr);
   tl_assert(HG_(clo_history_level) >= 0 && HG_(clo_history_level) <= 2);

   if (HG_(clo_history_level) == 1) {
      Bool found;
      Word firstIx, lastIx;
      ULong_n_EC key;

      /* At history_level 1, we must round up the relevant stack-pair
         for the conflicting segment right now.  This is because
         deferring it is complex; we can't (easily) put Kfailed and
         Cfailed into the XError and wait for later without
         getting tied up in difficulties with VtsID reference
         counting.  So just do it now. */
      Thr*  confThr;
      ULong confTym = 0;
      /* Which thread are we in conflict with?  There may be more than
         one, in which case VtsID__findFirst_notLEQ selects one arbitrarily
         (in fact it's the one with the lowest Thr* value). */
      confThr = VtsID__findFirst_notLEQ( Cfailed, Kfailed );
      /* This must exist!  since if it was NULL then there's no
         conflict (semantics of return value of
         VtsID__findFirst_notLEQ), and msmc{read,write}, which has
         called us, just checked exactly this -- that there was in
         fact a race. */
      tl_assert(confThr);

      /* Get the scalar clock value that the conflicting thread
         introduced into the constraint.  A careful examination of the
         base machine rules shows that this must be the same as the
         conflicting thread's scalar clock when it created this
         constraint.  Hence we know the scalar clock of the
         conflicting thread when the conflicting access was made. */
      confTym = VtsID__indexAt( Cfailed, confThr );

      /* Using this scalar clock, index into the conflicting thread's
         collection of stack traces made each time its vector clock
         (hence its scalar clock) changed.  This gives the stack
         traces at the start and end of the conflicting segment (well,
         as per comment just above, of one of the conflicting
         segments, if there are more than one). */
      key.ull = confTym;
      key.ec  = NULL;
      /* tl_assert(confThr); -- asserted just above */
      tl_assert(confThr->local_Kws_n_stacks);
      firstIx = lastIx = 0;
      found = VG_(lookupXA_UNSAFE)(
                 confThr->local_Kws_n_stacks,
                 &key, &firstIx, &lastIx,
                 (XACmpFn_t)cmp__ULong_n_EC__by_ULong
              );
      if (0) VG_(printf)("record_race_info %u %u %u  confThr %p "
                         "confTym %llu found %d (%ld,%ld)\n",
                         Cfailed, Kfailed, Cw,
                         confThr, confTym, found, firstIx, lastIx);
      /* We can't indefinitely collect stack traces at VTS
         transitions, since we'd eventually run out of memory.  Hence
         note_local_Kw_n_stack_for will eventually throw away old
         ones, which in turn means we might fail to find index value
         confTym in the array. */
      if (found) {
         ULong_n_EC *pair_start, *pair_end;
         /* The segment starts at the last entry matching confTym. */
         pair_start
            = (ULong_n_EC*)VG_(indexXA)( confThr->local_Kws_n_stacks, lastIx );
         hist1_seg_start = pair_start->ec;
         if (lastIx+1 < VG_(sizeXA)( confThr->local_Kws_n_stacks )) {
            /* There is a following entry: it marks the segment end. */
            pair_end
               = (ULong_n_EC*)VG_(indexXA)( confThr->local_Kws_n_stacks,
                                            lastIx+1 );
            /* from properties of VG_(lookupXA) and the comparison fn used: */
            tl_assert(pair_start->ull < pair_end->ull);
            hist1_seg_end = pair_end->ec;
            /* Could do a bit better here.  It may be that pair_end
               doesn't have a stack, but the following entries in the
               array have the same scalar Kw and do have a stack.  So
               we should search a bit further along the array than
               lastIx+1 if hist1_seg_end is NULL. */
         } else {
            /* No later entry exists.  Unless the thread is already
               marked llexit_done, use the stack main_get_EC returns
               for it as the segment end. */
            if (!confThr->llexit_done)
               hist1_seg_end = main_get_EC( confThr );
         }
         // seg_start could be NULL iff this is the first stack in the thread
         //if (seg_start) VG_(pp_ExeContext)(seg_start);
         //if (seg_end)   VG_(pp_ExeContext)(seg_end);
         hist1_conf_thr = confThr->hgthread;
      }
   }

   HG_(record_error_Race)( acc_thr->hgthread, acc_addr,
                           szB, isWrite,
                           hist1_conf_thr, hist1_seg_start, hist1_seg_end );
}
5404
5405 static Bool is_sane_SVal_C ( SVal sv ) {
5406    Bool leq;
5407    if (!SVal__isC(sv)) return True;
5408    leq = VtsID__cmpLEQ( SVal__unC_Rmin(sv), SVal__unC_Wmin(sv) );
5409    return leq;
5410 }
5411
5412
/* Compute new state following a read.

   svOld     current shadow value of the location
   acc_thr   the reading thread; supplies the clocks viR and viW
   acc_addr  address being read (used for reporting and history)
   szB       size of the read (used for reporting and history)

   Returns the new shadow value.  For a C value the result is
   (Rmin, join(Wmin, thread's viW)), whether or not a race is
   detected; a race is reported via record_race_info when Rmin is not
   <= the thread's viR.  SVal_NOACCESS is returned unchanged.  Any
   other kind of old value triggers an assertion failure.

   If the value changed, the history level is >= 2, and both old and
   new values are C values, the access is also recorded with
   event_map_bind. */
static inline SVal msmcread ( SVal svOld,
                              /* The following are only needed for
                                 creating error reports. */
                              Thr* acc_thr,
                              Addr acc_addr, SizeT szB )
{
   SVal svNew = SVal_INVALID;
   stats__msmcread++;

   /* Redundant sanity check on the constraints */
   if (CHECK_MSM) {
      tl_assert(is_sane_SVal_C(svOld));
   }

   if (LIKELY(SVal__isC(svOld))) {
      VtsID tviR  = acc_thr->viR;
      VtsID tviW  = acc_thr->viW;
      VtsID rmini = SVal__unC_Rmin(svOld);
      VtsID wmini = SVal__unC_Wmin(svOld);
      /* The read is race-free iff the read constraint is <= the
         thread's R clock. */
      Bool  leq   = VtsID__cmpLEQ(rmini,tviR);
      if (LIKELY(leq)) {
         /* no race */
         /* Note: RWLOCK subtlety: use tviW, not tviR */
         svNew = SVal__mkC( rmini, VtsID__join2(wmini, tviW) );
         goto out;
      } else {
         /* assert on sanity of constraints. */
         Bool leqxx = VtsID__cmpLEQ(rmini,wmini);
         tl_assert(leqxx);
         // same as in non-race case
         svNew = SVal__mkC( rmini, VtsID__join2(wmini, tviW) );
         record_race_info( acc_thr, acc_addr, szB, False/*!isWrite*/,
                           rmini, /* Cfailed */
                           tviR,  /* Kfailed */
                           wmini  /* Cw */ );
         goto out;
      }
   }
   if (SVal__isA(svOld)) {
      /* reading no-access memory (sigh); leave unchanged */
      /* check for no pollution */
      tl_assert(svOld == SVal_NOACCESS);
      svNew = SVal_NOACCESS;
      goto out;
   }
   /* Neither a C nor an A value: this should not happen. */
   if (0) VG_(printf)("msmcread: bad svOld: 0x%016llx\n", svOld);
   tl_assert(0);

  out:
   if (CHECK_MSM) {
      tl_assert(is_sane_SVal_C(svNew));
   }
   if (UNLIKELY(svNew != svOld)) {
      tl_assert(svNew != SVal_INVALID);
      /* Record the access in the conflicting-access cache, but only
         at history level 2 and only for C -> C transitions. */
      if (HG_(clo_history_level) >= 2
          && SVal__isC(svOld) && SVal__isC(svNew)) {
         event_map_bind( acc_addr, szB, False/*!isWrite*/, acc_thr );
         stats__msmcread_change++;
      }
   }
   return svNew;
}
5476
5477
/* Compute new state following a write.

   svOld     current shadow value of the location
   acc_thr   the writing thread; supplies the clock viW
   acc_addr  address being written (used for reporting and history)
   szB       size of the write (used for reporting and history)

   Returns the new shadow value.  For a C value whose Wmin is <= the
   thread's viW (no race) the result is (viW, viW).  Otherwise a race
   is reported via record_race_info and the result is
   (join(Rmin, viW), join(Wmin, viW)).  SVal_NOACCESS is returned
   unchanged.  Any other kind of old value triggers an assertion
   failure.

   If the value changed, the history level is >= 2, and both old and
   new values are C values, the access is also recorded with
   event_map_bind. */
static inline SVal msmcwrite ( SVal svOld,
                              /* The following are only needed for
                                 creating error reports. */
                              Thr* acc_thr,
                              Addr acc_addr, SizeT szB )
{
   SVal svNew = SVal_INVALID;
   stats__msmcwrite++;

   /* Redundant sanity check on the constraints */
   if (CHECK_MSM) {
      tl_assert(is_sane_SVal_C(svOld));
   }

   if (LIKELY(SVal__isC(svOld))) {
      VtsID tviW  = acc_thr->viW;
      VtsID wmini = SVal__unC_Wmin(svOld);
      /* The write is race-free iff the write constraint is <= the
         thread's W clock. */
      Bool  leq   = VtsID__cmpLEQ(wmini,tviW);
      if (LIKELY(leq)) {
         /* no race */
         svNew = SVal__mkC( tviW, tviW );
         goto out;
      } else {
         VtsID rmini = SVal__unC_Rmin(svOld);
         /* assert on sanity of constraints. */
         Bool leqxx = VtsID__cmpLEQ(rmini,wmini);
         tl_assert(leqxx);
         // same as in non-race case
         // proof: in the non-race case, we have
         //    rmini <= wmini (invar on constraints)
         //    tviW <= tviR (invar on thread clocks)
         //    wmini <= tviW (from run-time check)
         // hence from transitivity of <= we have
         //    rmini <= wmini <= tviW
         // and so join(rmini,tviW) == tviW
         // and    join(wmini,tviW) == tviW
         // qed.
         svNew = SVal__mkC( VtsID__join2(rmini, tviW),
                            VtsID__join2(wmini, tviW) );
         record_race_info( acc_thr, acc_addr, szB, True/*isWrite*/,
                           wmini, /* Cfailed */
                           tviW,  /* Kfailed */
                           wmini  /* Cw */ );
         goto out;
      }
   }
   if (SVal__isA(svOld)) {
      /* writing no-access memory (sigh); leave unchanged */
      /* check for no pollution */
      tl_assert(svOld == SVal_NOACCESS);
      svNew = SVal_NOACCESS;
      goto out;
   }
   /* Neither a C nor an A value: this should not happen. */
   if (0) VG_(printf)("msmcwrite: bad svOld: 0x%016llx\n", svOld);
   tl_assert(0);

  out:
   if (CHECK_MSM) {
      tl_assert(is_sane_SVal_C(svNew));
   }
   if (UNLIKELY(svNew != svOld)) {
      tl_assert(svNew != SVal_INVALID);
      /* Record the access in the conflicting-access cache, but only
         at history level 2 and only for C -> C transitions. */
      if (HG_(clo_history_level) >= 2
          && SVal__isC(svOld) && SVal__isC(svNew)) {
         event_map_bind( acc_addr, szB, True/*isWrite*/, acc_thr );
         stats__msmcwrite_change++;
      }
   }
   return svNew;
}
5549
5550
5551 /////////////////////////////////////////////////////////
5552 //                                                     //
5553 // Apply core MSM to specific memory locations         //
5554 //                                                     //
5555 /////////////////////////////////////////////////////////
5556
5557 /*------------- ZSM accesses: 8 bit sapply ------------- */
5558
5559 static void zsm_sapply08__msmcread ( Thr* thr, Addr a ) {
5560    CacheLine* cl;
5561    UWord      cloff, tno, toff;
5562    SVal       svOld, svNew;
5563    UShort     descr;
5564    stats__cline_cread08s++;
5565    cl    = get_cacheline(a);
5566    cloff = get_cacheline_offset(a);
5567    tno   = get_treeno(a);
5568    toff  = get_tree_offset(a); /* == 0 .. 7 */
5569    descr = cl->descrs[tno];
5570    if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
5571       SVal* tree = &cl->svals[tno << 3];
5572       cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
5573       if (CHECK_ZSM)
5574          tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5575    }
5576    svOld = cl->svals[cloff];
5577    svNew = msmcread( svOld, thr,a,1 );
5578    if (CHECK_ZSM)
5579       tl_assert(svNew != SVal_INVALID);
5580    cl->svals[cloff] = svNew;
5581 }
5582
5583 static void zsm_sapply08__msmcwrite ( Thr* thr, Addr a ) {
5584    CacheLine* cl;
5585    UWord      cloff, tno, toff;
5586    SVal       svOld, svNew;
5587    UShort     descr;
5588    stats__cline_cwrite08s++;
5589    cl    = get_cacheline(a);
5590    cloff = get_cacheline_offset(a);
5591    tno   = get_treeno(a);
5592    toff  = get_tree_offset(a); /* == 0 .. 7 */
5593    descr = cl->descrs[tno];
5594    if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
5595       SVal* tree = &cl->svals[tno << 3];
5596       cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
5597       if (CHECK_ZSM)
5598          tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5599    }
5600    svOld = cl->svals[cloff];
5601    svNew = msmcwrite( svOld, thr,a,1 );
5602    if (CHECK_ZSM)
5603       tl_assert(svNew != SVal_INVALID);
5604    cl->svals[cloff] = svNew;
5605 }
5606
5607 /*------------- ZSM accesses: 16 bit sapply ------------- */
5608
5609 static void zsm_sapply16__msmcread ( Thr* thr, Addr a ) {
5610    CacheLine* cl;
5611    UWord      cloff, tno, toff;
5612    SVal       svOld, svNew;
5613    UShort     descr;
5614    stats__cline_cread16s++;
5615    if (UNLIKELY(!aligned16(a))) goto slowcase;
5616    cl    = get_cacheline(a);
5617    cloff = get_cacheline_offset(a);
5618    tno   = get_treeno(a);
5619    toff  = get_tree_offset(a); /* == 0, 2, 4 or 6 */
5620    descr = cl->descrs[tno];
5621    if (UNLIKELY( !(descr & (TREE_DESCR_16_0 << toff)) )) {
5622       if (valid_value_is_below_me_16(descr, toff)) {
5623          goto slowcase;
5624       } else {
5625          SVal* tree = &cl->svals[tno << 3];
5626          cl->descrs[tno] = pulldown_to_16(tree, toff, descr);
5627       }
5628       if (CHECK_ZSM)
5629          tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5630    }
5631    svOld = cl->svals[cloff];
5632    svNew = msmcread( svOld, thr,a,2 );
5633    if (CHECK_ZSM)
5634       tl_assert(svNew != SVal_INVALID);
5635    cl->svals[cloff] = svNew;
5636    return;
5637   slowcase: /* misaligned, or must go further down the tree */
5638    stats__cline_16to8splits++;
5639    zsm_sapply08__msmcread( thr, a + 0 );
5640    zsm_sapply08__msmcread( thr, a + 1 );
5641 }
5642
5643 static void zsm_sapply16__msmcwrite ( Thr* thr, Addr a ) {
5644    CacheLine* cl;
5645    UWord      cloff, tno, toff;
5646    SVal       svOld, svNew;
5647    UShort     descr;
5648    stats__cline_cwrite16s++;
5649    if (UNLIKELY(!aligned16(a))) goto slowcase;
5650    cl    = get_cacheline(a);
5651    cloff = get_cacheline_offset(a);
5652    tno   = get_treeno(a);
5653    toff  = get_tree_offset(a); /* == 0, 2, 4 or 6 */
5654    descr = cl->descrs[tno];
5655    if (UNLIKELY( !(descr & (TREE_DESCR_16_0 << toff)) )) {
5656       if (valid_value_is_below_me_16(descr, toff)) {
5657          goto slowcase;
5658       } else {
5659          SVal* tree = &cl->svals[tno << 3];
5660          cl->descrs[tno] = pulldown_to_16(tree, toff, descr);
5661       }
5662       if (CHECK_ZSM)
5663          tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5664    }
5665    svOld = cl->svals[cloff];
5666    svNew = msmcwrite( svOld, thr,a,2 );
5667    if (CHECK_ZSM)
5668       tl_assert(svNew != SVal_INVALID);
5669    cl->svals[cloff] = svNew;
5670    return;
5671   slowcase: /* misaligned, or must go further down the tree */
5672    stats__cline_16to8splits++;
5673    zsm_sapply08__msmcwrite( thr, a + 0 );
5674    zsm_sapply08__msmcwrite( thr, a + 1 );
5675 }
5676
5677 /*------------- ZSM accesses: 32 bit sapply ------------- */
5678
5679 static void zsm_sapply32__msmcread ( Thr* thr, Addr a ) {
5680    CacheLine* cl;
5681    UWord      cloff, tno, toff;
5682    SVal       svOld, svNew;
5683    UShort     descr;
5684    stats__cline_cread32s++;
5685    if (UNLIKELY(!aligned32(a))) goto slowcase;
5686    cl    = get_cacheline(a);
5687    cloff = get_cacheline_offset(a);
5688    tno   = get_treeno(a);
5689    toff  = get_tree_offset(a); /* == 0 or 4 */
5690    descr = cl->descrs[tno];
5691    if (UNLIKELY( !(descr & (TREE_DESCR_32_0 << toff)) )) {
5692       if (valid_value_is_above_me_32(descr, toff)) {
5693          SVal* tree = &cl->svals[tno << 3];
5694          cl->descrs[tno] = pulldown_to_32(tree, toff, descr);
5695       } else {
5696          goto slowcase;
5697       }
5698       if (CHECK_ZSM)
5699          tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5700    }
5701    svOld = cl->svals[cloff];
5702    svNew = msmcread( svOld, thr,a,4 );
5703    if (CHECK_ZSM)
5704       tl_assert(svNew != SVal_INVALID);
5705    cl->svals[cloff] = svNew;
5706    return;
5707   slowcase: /* misaligned, or must go further down the tree */
5708    stats__cline_32to16splits++;
5709    zsm_sapply16__msmcread( thr, a + 0 );
5710    zsm_sapply16__msmcread( thr, a + 2 );
5711 }
5712
5713 static void zsm_sapply32__msmcwrite ( Thr* thr, Addr a ) {
5714    CacheLine* cl;
5715    UWord      cloff, tno, toff;
5716    SVal       svOld, svNew;
5717    UShort     descr;
5718    stats__cline_cwrite32s++;
5719    if (UNLIKELY(!aligned32(a))) goto slowcase;
5720    cl    = get_cacheline(a);
5721    cloff = get_cacheline_offset(a);
5722    tno   = get_treeno(a);
5723    toff  = get_tree_offset(a); /* == 0 or 4 */
5724    descr = cl->descrs[tno];
5725    if (UNLIKELY( !(descr & (TREE_DESCR_32_0 << toff)) )) {
5726       if (valid_value_is_above_me_32(descr, toff)) {
5727          SVal* tree = &cl->svals[tno << 3];
5728          cl->descrs[tno] = pulldown_to_32(tree, toff, descr);
5729       } else {
5730          goto slowcase;
5731       }
5732       if (CHECK_ZSM)
5733          tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5734    }
5735    svOld = cl->svals[cloff];
5736    svNew = msmcwrite( svOld, thr,a,4 );
5737    if (CHECK_ZSM)
5738       tl_assert(svNew != SVal_INVALID);
5739    cl->svals[cloff] = svNew;
5740    return;
5741   slowcase: /* misaligned, or must go further down the tree */
5742    stats__cline_32to16splits++;
5743    zsm_sapply16__msmcwrite( thr, a + 0 );
5744    zsm_sapply16__msmcwrite( thr, a + 2 );
5745 }
5746
5747 /*------------- ZSM accesses: 64 bit sapply ------------- */
5748
5749 static void zsm_sapply64__msmcread ( Thr* thr, Addr a ) {
5750    CacheLine* cl;
5751    UWord      cloff, tno;
5752    //UWord      toff;
5753    SVal       svOld, svNew;
5754    UShort     descr;
5755    stats__cline_cread64s++;
5756    if (UNLIKELY(!aligned64(a))) goto slowcase;
5757    cl    = get_cacheline(a);
5758    cloff = get_cacheline_offset(a);
5759    tno   = get_treeno(a);
5760    //toff  = get_tree_offset(a); /* == 0, unused */
5761    descr = cl->descrs[tno];
5762    if (UNLIKELY( !(descr & TREE_DESCR_64) )) {
5763       goto slowcase;
5764    }
5765    svOld = cl->svals[cloff];
5766    svNew = msmcread( svOld, thr,a,8 );
5767    if (CHECK_ZSM)
5768       tl_assert(svNew != SVal_INVALID);
5769    cl->svals[cloff] = svNew;
5770    return;
5771   slowcase: /* misaligned, or must go further down the tree */
5772    stats__cline_64to32splits++;
5773    zsm_sapply32__msmcread( thr, a + 0 );
5774    zsm_sapply32__msmcread( thr, a + 4 );
5775 }
5776
5777 static void zsm_sapply64__msmcwrite ( Thr* thr, Addr a ) {
5778    CacheLine* cl;
5779    UWord      cloff, tno;
5780    //UWord      toff;
5781    SVal       svOld, svNew;
5782    UShort     descr;
5783    stats__cline_cwrite64s++;
5784    if (UNLIKELY(!aligned64(a))) goto slowcase;
5785    cl    = get_cacheline(a);
5786    cloff = get_cacheline_offset(a);
5787    tno   = get_treeno(a);
5788    //toff  = get_tree_offset(a); /* == 0, unused */
5789    descr = cl->descrs[tno];
5790    if (UNLIKELY( !(descr & TREE_DESCR_64) )) {
5791       goto slowcase;
5792    }
5793    svOld = cl->svals[cloff];
5794    svNew = msmcwrite( svOld, thr,a,8 );
5795    if (CHECK_ZSM)
5796       tl_assert(svNew != SVal_INVALID);
5797    cl->svals[cloff] = svNew;
5798    return;
5799   slowcase: /* misaligned, or must go further down the tree */
5800    stats__cline_64to32splits++;
5801    zsm_sapply32__msmcwrite( thr, a + 0 );
5802    zsm_sapply32__msmcwrite( thr, a + 4 );
5803 }
5804
5805 /*--------------- ZSM accesses: 8 bit swrite --------------- */
5806
5807 static
5808 void zsm_swrite08 ( Addr a, SVal svNew ) {
5809    CacheLine* cl;
5810    UWord      cloff, tno, toff;
5811    UShort     descr;
5812    stats__cline_swrite08s++;
5813    cl    = get_cacheline(a);
5814    cloff = get_cacheline_offset(a);
5815    tno   = get_treeno(a);
5816    toff  = get_tree_offset(a); /* == 0 .. 7 */
5817    descr = cl->descrs[tno];
5818    if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
5819       SVal* tree = &cl->svals[tno << 3];
5820       cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
5821       if (CHECK_ZSM)
5822          tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5823    }
5824    tl_assert(svNew != SVal_INVALID);
5825    cl->svals[cloff] = svNew;
5826 }
5827
5828 /*--------------- ZSM accesses: 16 bit swrite --------------- */
5829
5830 static
5831 void zsm_swrite16 ( Addr a, SVal svNew ) {
5832    CacheLine* cl;
5833    UWord      cloff, tno, toff;
5834    UShort     descr;
5835    stats__cline_swrite16s++;
5836    if (UNLIKELY(!aligned16(a))) goto slowcase;
5837    cl    = get_cacheline(a);
5838    cloff = get_cacheline_offset(a);
5839    tno   = get_treeno(a);
5840    toff  = get_tree_offset(a); /* == 0, 2, 4 or 6 */
5841    descr = cl->descrs[tno];
5842    if (UNLIKELY( !(descr & (TREE_DESCR_16_0 << toff)) )) {
5843       if (valid_value_is_below_me_16(descr, toff)) {
5844          /* Writing at this level.  Need to fix up 'descr'. */
5845          cl->descrs[tno] = pullup_descr_to_16(descr, toff);
5846          /* At this point, the tree does not match cl->descr[tno] any
5847             more.  The assignments below will fix it up. */
5848       } else {
5849          /* We can't indiscriminately write on the w16 node as in the
5850             w64 case, as that might make the node inconsistent with
5851             its parent.  So first, pull down to this level. */
5852          SVal* tree = &cl->svals[tno << 3];
5853          cl->descrs[tno] = pulldown_to_16(tree, toff, descr);
5854       if (CHECK_ZSM)
5855          tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5856       }
5857    }
5858    tl_assert(svNew != SVal_INVALID);
5859    cl->svals[cloff + 0] = svNew;
5860    cl->svals[cloff + 1] = SVal_INVALID;
5861    return;
5862   slowcase: /* misaligned */
5863    stats__cline_16to8splits++;
5864    zsm_swrite08( a + 0, svNew );
5865    zsm_swrite08( a + 1, svNew );
5866 }
5867
5868 /*--------------- ZSM accesses: 32 bit swrite --------------- */
5869
5870 static
5871 void zsm_swrite32 ( Addr a, SVal svNew ) {
5872    CacheLine* cl;
5873    UWord      cloff, tno, toff;
5874    UShort     descr;
5875    stats__cline_swrite32s++;
5876    if (UNLIKELY(!aligned32(a))) goto slowcase;
5877    cl    = get_cacheline(a);
5878    cloff = get_cacheline_offset(a);
5879    tno   = get_treeno(a);
5880    toff  = get_tree_offset(a); /* == 0 or 4 */
5881    descr = cl->descrs[tno];
5882    if (UNLIKELY( !(descr & (TREE_DESCR_32_0 << toff)) )) {
5883       if (valid_value_is_above_me_32(descr, toff)) {
5884          /* We can't indiscriminately write on the w32 node as in the
5885             w64 case, as that might make the node inconsistent with
5886             its parent.  So first, pull down to this level. */
5887          SVal* tree = &cl->svals[tno << 3];
5888          cl->descrs[tno] = pulldown_to_32(tree, toff, descr);
5889          if (CHECK_ZSM)
5890             tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5891       } else {
5892          /* Writing at this level.  Need to fix up 'descr'. */
5893          cl->descrs[tno] = pullup_descr_to_32(descr, toff);
5894          /* At this point, the tree does not match cl->descr[tno] any
5895             more.  The assignments below will fix it up. */
5896       }
5897    }
5898    tl_assert(svNew != SVal_INVALID);
5899    cl->svals[cloff + 0] = svNew;
5900    cl->svals[cloff + 1] = SVal_INVALID;
5901    cl->svals[cloff + 2] = SVal_INVALID;
5902    cl->svals[cloff + 3] = SVal_INVALID;
5903    return;
5904   slowcase: /* misaligned */
5905    stats__cline_32to16splits++;
5906    zsm_swrite16( a + 0, svNew );
5907    zsm_swrite16( a + 2, svNew );
5908 }
5909
5910 /*--------------- ZSM accesses: 64 bit swrite --------------- */
5911
5912 static
5913 void zsm_swrite64 ( Addr a, SVal svNew ) {
5914    CacheLine* cl;
5915    UWord      cloff, tno;
5916    //UWord    toff;
5917    stats__cline_swrite64s++;
5918    if (UNLIKELY(!aligned64(a))) goto slowcase;
5919    cl    = get_cacheline(a);
5920    cloff = get_cacheline_offset(a);
5921    tno   = get_treeno(a);
5922    //toff  = get_tree_offset(a); /* == 0, unused */
5923    cl->descrs[tno] = TREE_DESCR_64;
5924    if (CHECK_ZSM)
5925       tl_assert(svNew != SVal_INVALID); /* EXPENSIVE */
5926    cl->svals[cloff + 0] = svNew;
5927    cl->svals[cloff + 1] = SVal_INVALID;
5928    cl->svals[cloff + 2] = SVal_INVALID;
5929    cl->svals[cloff + 3] = SVal_INVALID;
5930    cl->svals[cloff + 4] = SVal_INVALID;
5931    cl->svals[cloff + 5] = SVal_INVALID;
5932    cl->svals[cloff + 6] = SVal_INVALID;
5933    cl->svals[cloff + 7] = SVal_INVALID;
5934    return;
5935   slowcase: /* misaligned */
5936    stats__cline_64to32splits++;
5937    zsm_swrite32( a + 0, svNew );
5938    zsm_swrite32( a + 4, svNew );
5939 }
5940
5941 /*------------- ZSM accesses: 8 bit sread/scopy ------------- */
5942
5943 static
5944 SVal zsm_sread08 ( Addr a ) {
5945    CacheLine* cl;
5946    UWord      cloff, tno, toff;
5947    UShort     descr;
5948    stats__cline_sread08s++;
5949    cl    = get_cacheline(a);
5950    cloff = get_cacheline_offset(a);
5951    tno   = get_treeno(a);
5952    toff  = get_tree_offset(a); /* == 0 .. 7 */
5953    descr = cl->descrs[tno];
5954    if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
5955       SVal* tree = &cl->svals[tno << 3];
5956       cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
5957    }
5958    return cl->svals[cloff];
5959 }
5960
5961 static void zsm_scopy08 ( Addr src, Addr dst, Bool uu_normalise ) {
5962    SVal       sv;
5963    stats__cline_scopy08s++;
5964    sv = zsm_sread08( src );
5965    zsm_swrite08( dst, sv );
5966 }
5967
5968
5969 /* Block-copy states (needed for implementing realloc()).  Note this
5970    doesn't change the filtering arrangements.  The caller of
5971    zsm_scopy_range needs to attend to that. */
5972
5973 static void zsm_scopy_range ( Addr src, Addr dst, SizeT len )
5974 {
5975    SizeT i;
5976    if (len == 0)
5977       return;
5978
5979    /* assert for non-overlappingness */
5980    tl_assert(src+len <= dst || dst+len <= src);
5981
5982    /* To be simple, just copy byte by byte.  But so as not to wreck
5983       performance for later accesses to dst[0 .. len-1], normalise
5984       destination lines as we finish with them, and also normalise the
5985       line containing the first and last address. */
5986    for (i = 0; i < len; i++) {
5987       Bool normalise
5988          = get_cacheline_offset( dst+i+1 ) == 0 /* last in line */
5989            || i == 0       /* first in range */
5990            || i == len-1;  /* last in range */
5991       zsm_scopy08( src+i, dst+i, normalise );
5992    }
5993 }
5994
5995
5996 /* For setting address ranges to a given value.  Has considerable
5997    sophistication so as to avoid generating large numbers of pointless
5998    cache loads/writebacks for large ranges. */
5999
6000 /* Do small ranges in-cache, in the obvious way. */
6001 static
6002 void zsm_sset_range_SMALL ( Addr a, SizeT len, SVal svNew )
6003 {
6004    /* fast track a couple of common cases */
6005    if (len == 4 && aligned32(a)) {
6006       zsm_swrite32( a, svNew );
6007       return;
6008    }
6009    if (len == 8 && aligned64(a)) {
6010       zsm_swrite64( a, svNew );
6011       return;
6012    }
6013
6014    /* be completely general (but as efficient as possible) */
6015    if (len == 0) return;
6016
6017    if (!aligned16(a) && len >= 1) {
6018       zsm_swrite08( a, svNew );
6019       a += 1;
6020       len -= 1;
6021       tl_assert(aligned16(a));
6022    }
6023    if (len == 0) return;
6024
6025    if (!aligned32(a) && len >= 2) {
6026       zsm_swrite16( a, svNew );
6027       a += 2;
6028       len -= 2;
6029       tl_assert(aligned32(a));
6030    }
6031    if (len == 0) return;
6032
6033    if (!aligned64(a) && len >= 4) {
6034       zsm_swrite32( a, svNew );
6035       a += 4;
6036       len -= 4;
6037       tl_assert(aligned64(a));
6038    }
6039    if (len == 0) return;
6040
6041    if (len >= 8) {
6042       tl_assert(aligned64(a));
6043       while (len >= 8) {
6044          zsm_swrite64( a, svNew );
6045          a += 8;
6046          len -= 8;
6047       }
6048       tl_assert(aligned64(a));
6049    }
6050    if (len == 0) return;
6051
6052    if (len >= 4)
6053       tl_assert(aligned32(a));
6054    if (len >= 4) {
6055       zsm_swrite32( a, svNew );
6056       a += 4;
6057       len -= 4;
6058    }
6059    if (len == 0) return;
6060
6061    if (len >= 2)
6062       tl_assert(aligned16(a));
6063    if (len >= 2) {
6064       zsm_swrite16( a, svNew );
6065       a += 2;
6066       len -= 2;
6067    }
6068    if (len == 0) return;
6069
6070    if (len >= 1) {
6071       zsm_swrite08( a, svNew );
6072       //a += 1;
6073       len -= 1;
6074    }
6075    tl_assert(len == 0);
6076 }
6077
6078
/* If we're doing a small range, hand off to zsm_sset_range_SMALL.  But
   for larger ranges, try to operate directly on the out-of-cache
   representation, rather than dragging lines into the cache,
   overwriting them, and forcing them out.  This turns out to be an
   important performance optimisation.

   Note that this doesn't change the filtering arrangements.  The
   caller of zsm_sset_range needs to attend to that.

   Sets the shadow value of every byte in [a, a+len) to 'svNew', which
   must not be SVal_INVALID.  "Small" here means shorter than
   2 * N_LINE_ARANGE bytes. */

static void zsm_sset_range ( Addr a, SizeT len, SVal svNew )
{
   tl_assert(svNew != SVal_INVALID);
   stats__cache_make_New_arange += (ULong)len;

   /* Disabled debug tracing. */
   if (0 && len > 500)
      VG_(printf)("make New      ( %#lx, %lu )\n", a, len );

   /* Disabled instrumentation: counts how often the first line of the
      range is / is not currently in the cache. */
   if (0) {
      static UWord n_New_in_cache = 0;
      static UWord n_New_not_in_cache = 0;
      /* tag is 'a' with the in-line offset masked out,
         eg a[31]..a[4] 0000 */
      Addr       tag = a & ~(N_LINE_ARANGE - 1);
      UWord      wix = (a >> N_LINE_BITS) & (N_WAY_NENT - 1);
      if (LIKELY(tag == cache_shmem.tags0[wix])) {
         n_New_in_cache++;
      } else {
         n_New_not_in_cache++;
      }
      if (0 == ((n_New_in_cache + n_New_not_in_cache) % 100000))
         VG_(printf)("shadow_mem_make_New: IN %lu OUT %lu\n",
                     n_New_in_cache, n_New_not_in_cache );
   }

   if (LIKELY(len < 2 * N_LINE_ARANGE)) {
      zsm_sset_range_SMALL( a, len, svNew );
   } else {
      /* Split the range into three pieces:
           [before_start,  aligned_start)  partial line at the front
           [aligned_start, after_start)    whole lines
           [after_start,   a+len)          partial line at the back */
      Addr  before_start  = a;
      Addr  aligned_start = cacheline_ROUNDUP(a);
      Addr  after_start   = cacheline_ROUNDDN(a + len);
      UWord before_len    = aligned_start - before_start;
      UWord aligned_len   = after_start - aligned_start;
      UWord after_len     = a + len - after_start;
      tl_assert(before_start <= aligned_start);
      tl_assert(aligned_start <= after_start);
      tl_assert(before_len < N_LINE_ARANGE);
      tl_assert(after_len < N_LINE_ARANGE);
      tl_assert(get_cacheline_offset(aligned_start) == 0);
      if (get_cacheline_offset(a) == 0) {
         tl_assert(before_len == 0);
         tl_assert(a == aligned_start);
      }
      if (get_cacheline_offset(a+len) == 0) {
         tl_assert(after_len == 0);
         tl_assert(after_start == a+len);
      }
      /* The two partial pieces go through the in-cache path. */
      if (before_len > 0) {
         zsm_sset_range_SMALL( before_start, before_len, svNew );
      }
      if (after_len > 0) {
         zsm_sset_range_SMALL( after_start, after_len, svNew );
      }
      stats__cache_make_New_inZrep += (ULong)aligned_len;

      /* Now the whole lines, one line per iteration.  aligned_start
         advances and aligned_len shrinks in step, until they reach
         after_start and zero respectively. */
      while (1) {
         Addr tag;
         UWord wix;
         if (aligned_start >= after_start)
            break;
         tl_assert(get_cacheline_offset(aligned_start) == 0);
         tag = aligned_start & ~(N_LINE_ARANGE - 1);
         wix = (aligned_start >> N_LINE_BITS) & (N_WAY_NENT - 1);
         if (tag == cache_shmem.tags0[wix]) {
            /* The tag matches the cache entry for this line, so write
               it through the normal in-cache 64-bit path. */
            UWord i;
            for (i = 0; i < N_LINE_ARANGE / 8; i++)
               zsm_swrite64( aligned_start + i * 8, svNew );
         } else {
            UWord i;
            Word zix;
            SecMap* sm;
            LineZ* lineZ;
            /* This line is not in the cache.  Do not force it in; instead
               modify it in-place. */
            /* find the Z line to write in and rcdec it or the
               associated F line. */
            find_Z_for_writing( &sm, &zix, tag );
            tl_assert(sm);
            tl_assert(zix >= 0 && zix < N_SECMAP_ZLINES);
            lineZ = &sm->linesZ[zix];
            /* Rewrite the Z line so that dict[0] holds svNew and every
               index refers to it. */
            lineZ->dict[0] = svNew;
            lineZ->dict[1] = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
            for (i = 0; i < N_LINE_ARANGE/4; i++)
               lineZ->ix2s[i] = 0; /* all refer to dict[0] */
            rcinc_LineZ(lineZ);
         }
         aligned_start += N_LINE_ARANGE;
         aligned_len -= N_LINE_ARANGE;
      }
      tl_assert(aligned_start == after_start);
      tl_assert(aligned_len == 0);
   }
}
6181
6182
6183 /////////////////////////////////////////////////////////
6184 //                                                     //
6185 // Front-filtering accesses                            //
6186 //                                                     //
6187 /////////////////////////////////////////////////////////
6188
/* Counters for the front-filtered accessors (the zsm_sapplyNN_f__*
   functions): stats__f_ac is bumped on every call, stats__f_sk on
   every call that the thread's filter allowed to be skipped. */
static UWord stats__f_ac = 0;
static UWord stats__f_sk = 0;

/* STATS__F_SHOW expands to nothing in the enabled branch.  The
   disabled variant prints both counters each time the low 24 bits of
   stats__f_ac are zero. */
#if 0
#  define STATS__F_SHOW \
     do { \
        if (UNLIKELY(0 == (stats__f_ac & 0xFFFFFF))) \
           VG_(printf)("filters: ac %lu sk %lu\n",   \
           stats__f_ac, stats__f_sk); \
     } while (0)
#else
#  define STATS__F_SHOW /* */
#endif
6202
6203 void zsm_sapply08_f__msmcwrite ( Thr* thr, Addr a ) {
6204    stats__f_ac++;
6205    STATS__F_SHOW;
6206    if (LIKELY(Filter__ok_to_skip_cwr08(thr->filter, a))) {
6207       stats__f_sk++;
6208       return;
6209    }
6210    zsm_sapply08__msmcwrite(thr, a);
6211 }
6212
6213 void zsm_sapply16_f__msmcwrite ( Thr* thr, Addr a ) {
6214    stats__f_ac++;
6215    STATS__F_SHOW;
6216    if (LIKELY(Filter__ok_to_skip_cwr16(thr->filter, a))) {
6217       stats__f_sk++;
6218       return;
6219    }
6220    zsm_sapply16__msmcwrite(thr, a);
6221 }
6222
6223 void zsm_sapply32_f__msmcwrite ( Thr* thr, Addr a ) {
6224    stats__f_ac++;
6225    STATS__F_SHOW;
6226    if (LIKELY(Filter__ok_to_skip_cwr32(thr->filter, a))) {
6227       stats__f_sk++;
6228       return;
6229    }
6230    zsm_sapply32__msmcwrite(thr, a);
6231 }
6232
6233 void zsm_sapply64_f__msmcwrite ( Thr* thr, Addr a ) {
6234    stats__f_ac++;
6235    STATS__F_SHOW;
6236    if (LIKELY(Filter__ok_to_skip_cwr64(thr->filter, a))) {
6237       stats__f_sk++;
6238       return;
6239    }
6240    zsm_sapply64__msmcwrite(thr, a);
6241 }
6242
6243 void zsm_sapplyNN_f__msmcwrite ( Thr* thr, Addr a, SizeT len )
6244 {
6245    /* fast track a couple of common cases */
6246    if (len == 4 && aligned32(a)) {
6247       zsm_sapply32_f__msmcwrite( thr, a );
6248       return;
6249    }
6250    if (len == 8 && aligned64(a)) {
6251       zsm_sapply64_f__msmcwrite( thr, a );
6252       return;
6253    }
6254
6255    /* be completely general (but as efficient as possible) */
6256    if (len == 0) return;
6257
6258    if (!aligned16(a) && len >= 1) {
6259       zsm_sapply08_f__msmcwrite( thr, a );
6260       a += 1;
6261       len -= 1;
6262       tl_assert(aligned16(a));
6263    }
6264    if (len == 0) return;
6265
6266    if (!aligned32(a) && len >= 2) {
6267       zsm_sapply16_f__msmcwrite( thr, a );
6268       a += 2;
6269       len -= 2;
6270       tl_assert(aligned32(a));
6271    }
6272    if (len == 0) return;
6273
6274    if (!aligned64(a) && len >= 4) {
6275       zsm_sapply32_f__msmcwrite( thr, a );
6276       a += 4;
6277       len -= 4;
6278       tl_assert(aligned64(a));
6279    }
6280    if (len == 0) return;
6281
6282    if (len >= 8) {
6283       tl_assert(aligned64(a));
6284       while (len >= 8) {
6285          zsm_sapply64_f__msmcwrite( thr, a );
6286          a += 8;
6287          len -= 8;
6288       }
6289       tl_assert(aligned64(a));
6290    }
6291    if (len == 0) return;
6292
6293    if (len >= 4)
6294       tl_assert(aligned32(a));
6295    if (len >= 4) {
6296       zsm_sapply32_f__msmcwrite( thr, a );
6297       a += 4;
6298       len -= 4;
6299    }
6300    if (len == 0) return;
6301
6302    if (len >= 2)
6303       tl_assert(aligned16(a));
6304    if (len >= 2) {
6305       zsm_sapply16_f__msmcwrite( thr, a );
6306       a += 2;
6307       len -= 2;
6308    }
6309    if (len == 0) return;
6310
6311    if (len >= 1) {
6312       zsm_sapply08_f__msmcwrite( thr, a );
6313       //a += 1;
6314       len -= 1;
6315    }
6316    tl_assert(len == 0);
6317 }
6318
6319 void zsm_sapply08_f__msmcread ( Thr* thr, Addr a ) {
6320    stats__f_ac++;
6321    STATS__F_SHOW;
6322    if (LIKELY(Filter__ok_to_skip_crd08(thr->filter, a))) {
6323       stats__f_sk++;
6324       return;
6325    }
6326    zsm_sapply08__msmcread(thr, a);
6327 }
6328
6329 void zsm_sapply16_f__msmcread ( Thr* thr, Addr a ) {
6330    stats__f_ac++;
6331    STATS__F_SHOW;
6332    if (LIKELY(Filter__ok_to_skip_crd16(thr->filter, a))) {
6333       stats__f_sk++;
6334       return;
6335    }
6336    zsm_sapply16__msmcread(thr, a);
6337 }
6338
6339 void zsm_sapply32_f__msmcread ( Thr* thr, Addr a ) {
6340    stats__f_ac++;
6341    STATS__F_SHOW;
6342    if (LIKELY(Filter__ok_to_skip_crd32(thr->filter, a))) {
6343       stats__f_sk++;
6344       return;
6345    }
6346    zsm_sapply32__msmcread(thr, a);
6347 }
6348
6349 void zsm_sapply64_f__msmcread ( Thr* thr, Addr a ) {
6350    stats__f_ac++;
6351    STATS__F_SHOW;
6352    if (LIKELY(Filter__ok_to_skip_crd64(thr->filter, a))) {
6353       stats__f_sk++;
6354       return;
6355    }
6356    zsm_sapply64__msmcread(thr, a);
6357 }
6358
6359 void zsm_sapplyNN_f__msmcread ( Thr* thr, Addr a, SizeT len )
6360 {
6361    /* fast track a couple of common cases */
6362    if (len == 4 && aligned32(a)) {
6363       zsm_sapply32_f__msmcread( thr, a );
6364       return;
6365    }
6366    if (len == 8 && aligned64(a)) {
6367       zsm_sapply64_f__msmcread( thr, a );
6368       return;
6369    }
6370
6371    /* be completely general (but as efficient as possible) */
6372    if (len == 0) return;
6373
6374    if (!aligned16(a) && len >= 1) {
6375       zsm_sapply08_f__msmcread( thr, a );
6376       a += 1;
6377       len -= 1;
6378       tl_assert(aligned16(a));
6379    }
6380    if (len == 0) return;
6381
6382    if (!aligned32(a) && len >= 2) {
6383       zsm_sapply16_f__msmcread( thr, a );
6384       a += 2;
6385       len -= 2;
6386       tl_assert(aligned32(a));
6387    }
6388    if (len == 0) return;
6389
6390    if (!aligned64(a) && len >= 4) {
6391       zsm_sapply32_f__msmcread( thr, a );
6392       a += 4;
6393       len -= 4;
6394       tl_assert(aligned64(a));
6395    }
6396    if (len == 0) return;
6397
6398    if (len >= 8) {
6399       tl_assert(aligned64(a));
6400       while (len >= 8) {
6401          zsm_sapply64_f__msmcread( thr, a );
6402          a += 8;
6403          len -= 8;
6404       }
6405       tl_assert(aligned64(a));
6406    }
6407    if (len == 0) return;
6408
6409    if (len >= 4)
6410       tl_assert(aligned32(a));
6411    if (len >= 4) {
6412       zsm_sapply32_f__msmcread( thr, a );
6413       a += 4;
6414       len -= 4;
6415    }
6416    if (len == 0) return;
6417
6418    if (len >= 2)
6419       tl_assert(aligned16(a));
6420    if (len >= 2) {
6421       zsm_sapply16_f__msmcread( thr, a );
6422       a += 2;
6423       len -= 2;
6424    }
6425    if (len == 0) return;
6426
6427    if (len >= 1) {
6428       zsm_sapply08_f__msmcread( thr, a );
6429       //a += 1;
6430       len -= 1;
6431    }
6432    tl_assert(len == 0);
6433 }
6434
6435 void libhb_Thr_resumes ( Thr* thr )
6436 {
6437    if (0) VG_(printf)("resume %p\n", thr);
6438    tl_assert(thr);
6439    tl_assert(!thr->llexit_done);
6440    Filter__clear(thr->filter, "libhb_Thr_resumes");
6441    /* A kludge, but .. if this thread doesn't have any marker stacks
6442       at all, get one right now.  This is easier than figuring out
6443       exactly when at thread startup we can and can't take a stack
6444       snapshot. */
6445    if (HG_(clo_history_level) == 1) {
6446       tl_assert(thr->local_Kws_n_stacks);
6447       if (VG_(sizeXA)( thr->local_Kws_n_stacks ) == 0)
6448          note_local_Kw_n_stack_for(thr);
6449    }
6450 }
6451
6452
6453 /////////////////////////////////////////////////////////
6454 //                                                     //
6455 // Synchronisation objects                             //
6456 //                                                     //
6457 /////////////////////////////////////////////////////////
6458
/* Head of the doubly linked list of all the SOs, chained through
   their admin_next / admin_prev fields.  NULL when no SO exists. */
SO* admin_SO = NULL;
6461
6462 static SO* SO__Alloc ( void )
6463 {
6464    SO* so = HG_(zalloc)( "libhb.SO__Alloc.1", sizeof(SO) );
6465    so->viR   = VtsID_INVALID;
6466    so->viW   = VtsID_INVALID;
6467    so->magic = SO_MAGIC;
6468    /* Add to double linked list */
6469    if (admin_SO) {
6470       tl_assert(admin_SO->admin_prev == NULL);
6471       admin_SO->admin_prev = so;
6472       so->admin_next = admin_SO;
6473    } else {
6474       so->admin_next = NULL;
6475    }
6476    so->admin_prev = NULL;
6477    admin_SO = so;
6478    /* */
6479    return so;
6480 }
6481
6482 static void SO__Dealloc ( SO* so )
6483 {
6484    tl_assert(so);
6485    tl_assert(so->magic == SO_MAGIC);
6486    if (so->viR == VtsID_INVALID) {
6487       tl_assert(so->viW == VtsID_INVALID);
6488    } else {
6489       tl_assert(so->viW != VtsID_INVALID);
6490       VtsID__rcdec(so->viR);
6491       VtsID__rcdec(so->viW);
6492    }
6493    so->magic = 0;
6494    /* Del from double linked list */
6495    if (so->admin_prev)
6496       so->admin_prev->admin_next = so->admin_next;
6497    if (so->admin_next)
6498       so->admin_next->admin_prev = so->admin_prev;
6499    if (so == admin_SO)
6500       admin_SO = so->admin_next;
6501    /* */
6502    HG_(free)( so );
6503 }
6504
6505
6506 /////////////////////////////////////////////////////////
6507 //                                                     //
6508 // Top Level API                                       //
6509 //                                                     //
6510 /////////////////////////////////////////////////////////
6511
6512 static void show_thread_state ( const HChar* str, Thr* t )
6513 {
6514    if (1) return;
6515    if (t->viR == t->viW) {
6516       VG_(printf)("thr \"%s\" %p has vi* %u==", str, t, t->viR );
6517       VtsID__pp( t->viR );
6518       VG_(printf)("%s","\n");
6519    } else {
6520       VG_(printf)("thr \"%s\" %p has viR %u==", str, t, t->viR );
6521       VtsID__pp( t->viR );
6522       VG_(printf)(" viW %u==", t->viW);
6523       VtsID__pp( t->viW );
6524       VG_(printf)("%s","\n");
6525    }
6526 }
6527
6528
/* Initialise the library.  Records the two callbacks (both must be
   non-NULL), sets up the VTS machinery, the event map and the shadow
   memory, then creates a first Thr whose viR and viW are the same
   singleton VTS.  Returns that Thr (labelled "root" in the debug
   output). */
Thr* libhb_init (
        void        (*get_stacktrace)( Thr*, Addr*, UWord ),
        ExeContext* (*get_EC)( Thr* )
     )
{
   Thr*  thr;
   VtsID vi;

   // We will have to store a large number of these,
   // so make sure they're the size we expect them to be.
   STATIC_ASSERT(sizeof(ScalarTS) == 8);

   /* because first 1024 unusable */
   STATIC_ASSERT(SCALARTS_N_THRBITS >= 11);
   /* so as to fit in a UInt w/ 5 bits to spare (see defn of
      Thr_n_RCEC and TSW). */
   STATIC_ASSERT(SCALARTS_N_THRBITS <= 27);

   /* Need to be sure that Thr_n_RCEC is 2 words (64-bit) or 3 words
      (32-bit).  It's not correctness-critical, but there are a lot of
      them, so it's important from a space viewpoint.  Unfortunately
      we simply can't pack it into 2 words on a 32-bit target. */
   STATIC_ASSERT(   (sizeof(UWord) == 8 && sizeof(Thr_n_RCEC) == 16)
                 || (sizeof(UWord) == 4 && sizeof(Thr_n_RCEC) == 12));
   STATIC_ASSERT(sizeof(TSW) == sizeof(UInt));

   /* Word sets really are 32 bits.  Even on a 64 bit target. */
   STATIC_ASSERT(sizeof(WordSetID) == 4);
   STATIC_ASSERT(sizeof(WordSet) == sizeof(WordSetID));

   /* Remember the caller-supplied callbacks. */
   tl_assert(get_stacktrace);
   tl_assert(get_EC);
   main_get_stacktrace   = get_stacktrace;
   main_get_EC           = get_EC;

   // No need to initialise hg_wordfm.
   // No need to initialise hg_wordset.

   /* Allocated once and never deallocated.  Used as a temporary in
      VTS singleton, tick and join operations. */
   temp_max_sized_VTS = VTS__new( "libhb.libhb_init.1", ThrID_MAX_VALID );
   temp_max_sized_VTS->id = VtsID_INVALID;
   verydead_thread_tables_init();
   vts_set_init();
   vts_tab_init();
   event_map_init();
   VtsID__invalidate_caches();

   // initialise shadow memory
   zsm_init( );

   /* Create the first thread.  viR and viW share one singleton VTS,
      and a reference is taken for each of the two fields. */
   thr = Thr__new();
   vi  = VtsID__mk_Singleton( thr, 1 );
   thr->viR = vi;
   thr->viW = vi;
   VtsID__rcinc(thr->viR);
   VtsID__rcinc(thr->viW);

   show_thread_state("  root", thr);
   return thr;
}
6590
6591
/* Create and return the Thr for a new thread whose parent is
   'parent'.  The child's viR/viW are the parent's, ticked at the
   child's index; the parent's own viR/viW are then ticked at the
   parent's index.  Both threads' filters are cleared. */
Thr* libhb_create ( Thr* parent )
{
   /* The child's VTSs are copies of the parent's VTSs, but ticked at
      the child's index.  Since the child's index is guaranteed
      unique, it has never been seen before, so the implicit value
      before the tick is zero and after that is one. */
   Thr* child = Thr__new();

   child->viR = VtsID__tick( parent->viR, child );
   child->viW = VtsID__tick( parent->viW, child );
   Filter__clear(child->filter, "libhb_create(child)");
   VtsID__rcinc(child->viR);
   VtsID__rcinc(child->viW);
   /* We need to do note_local_Kw_n_stack_for( child ), but it's too
      early for that - it may not have a valid TId yet.  So, let
      libhb_Thr_resumes pick it up the first time the thread runs. */

   /* Check the claim above: the child's own entry is 1 in both. */
   tl_assert(VtsID__indexAt( child->viR, child ) == 1);
   tl_assert(VtsID__indexAt( child->viW, child ) == 1);

   /* and the parent has to move along too */
   /* The references on the parent's old VTSs are dropped before the
      ticked replacements are computed from them; references on the
      new values are taken afterwards. */
   VtsID__rcdec(parent->viR);
   VtsID__rcdec(parent->viW);
   parent->viR = VtsID__tick( parent->viR, parent );
   parent->viW = VtsID__tick( parent->viW, parent );
   Filter__clear(parent->filter, "libhb_create(parent)");
   VtsID__rcinc(parent->viR);
   VtsID__rcinc(parent->viW);
   note_local_Kw_n_stack_for( parent );

   show_thread_state(" child", child);
   show_thread_state("parent", parent);

   return child;
}
6627
6628 /* Shut down the library, and print stats (in fact that's _all_
6629    this is for. */
6630 void libhb_shutdown ( Bool show_stats )
6631 {
6632    if (show_stats) {
6633       VG_(printf)("%s","<<< BEGIN libhb stats >>>\n");
6634       VG_(printf)(" secmaps: %'10lu allocd (%'12lu g-a-range)\n",
6635                   stats__secmaps_allocd,
6636                   stats__secmap_ga_space_covered);
6637       VG_(printf)("  linesZ: %'10lu allocd (%'12lu bytes occupied)\n",
6638                   stats__secmap_linesZ_allocd,
6639                   stats__secmap_linesZ_bytes);
6640       VG_(printf)("  linesF: %'10lu allocd (%'12lu bytes occupied)"
6641                   " (%'10lu used)\n",
6642                   VG_(sizePA) (LineF_pool_allocator),
6643                   VG_(sizePA) (LineF_pool_allocator) * sizeof(LineF),
6644                   shmem__SecMap_used_linesF());
6645       VG_(printf)(" secmaps: %'10lu in map (can be scanGCed %'5lu)"
6646                   " #%lu scanGC \n",
6647                   stats__secmaps_in_map_shmem,
6648                   shmem__SecMap_do_GC(False /* really do GC */),
6649                   stats__secmaps_scanGC);
6650       tl_assert (VG_(sizeFM) (map_shmem) == stats__secmaps_in_map_shmem);
6651       VG_(printf)(" secmaps: %'10lu in freelist,"
6652                   " total (scanGCed %'lu, ssetGCed %'lu)\n",
6653                   SecMap_freelist_length(),
6654                   stats__secmaps_scanGCed,
6655                   stats__secmaps_ssetGCed);
6656       VG_(printf)(" secmaps: %'10lu searches (%'12lu slow)\n",
6657                   stats__secmaps_search, stats__secmaps_search_slow);
6658
6659       VG_(printf)("%s","\n");
6660       VG_(printf)("   cache: %'lu totrefs (%'lu misses)\n",
6661                   stats__cache_totrefs, stats__cache_totmisses );
6662       VG_(printf)("   cache: %'14lu Z-fetch,    %'14lu F-fetch\n",
6663                   stats__cache_Z_fetches, stats__cache_F_fetches );
6664       VG_(printf)("   cache: %'14lu Z-wback,    %'14lu F-wback\n",
6665                   stats__cache_Z_wbacks, stats__cache_F_wbacks );
6666       VG_(printf)("   cache: %'14lu flushes_invals\n",
6667                   stats__cache_flushes_invals );
6668       VG_(printf)("   cache: %'14llu arange_New  %'14llu direct-to-Zreps\n",
6669                   stats__cache_make_New_arange,
6670                   stats__cache_make_New_inZrep);
6671
6672       VG_(printf)("%s","\n");
6673       VG_(printf)("   cline: %'10lu normalises\n",
6674                   stats__cline_normalises );
6675       VG_(printf)("   cline: c rds 8/4/2/1: %'13lu %'13lu %'13lu %'13lu\n",
6676                   stats__cline_cread64s,
6677                   stats__cline_cread32s,
6678                   stats__cline_cread16s,
6679                   stats__cline_cread08s );
6680       VG_(printf)("   cline: c wrs 8/4/2/1: %'13lu %'13lu %'13lu %'13lu\n",
6681                   stats__cline_cwrite64s,
6682                   stats__cline_cwrite32s,
6683                   stats__cline_cwrite16s,
6684                   stats__cline_cwrite08s );
6685       VG_(printf)("   cline: s wrs 8/4/2/1: %'13lu %'13lu %'13lu %'13lu\n",
6686                   stats__cline_swrite64s,
6687                   stats__cline_swrite32s,
6688                   stats__cline_swrite16s,
6689                   stats__cline_swrite08s );
6690       VG_(printf)("   cline: s rd1s %'lu, s copy1s %'lu\n",
6691                   stats__cline_sread08s, stats__cline_scopy08s );
6692       VG_(printf)("   cline:    splits: 8to4 %'12lu    4to2 %'12lu"
6693                   "    2to1 %'12lu\n",
6694                   stats__cline_64to32splits, stats__cline_32to16splits,
6695                   stats__cline_16to8splits );
6696       VG_(printf)("   cline: pulldowns: 8to4 %'12lu    4to2 %'12lu"
6697                   "    2to1 %'12lu\n",
6698                   stats__cline_64to32pulldown, stats__cline_32to16pulldown,
6699                   stats__cline_16to8pulldown );
6700       if (0)
6701       VG_(printf)("   cline: sizeof(CacheLineZ) %ld,"
6702                   " covers %ld bytes of arange\n",
6703                   (Word)sizeof(LineZ),
6704                   (Word)N_LINE_ARANGE);
6705
6706       VG_(printf)("%s","\n");
6707
6708       VG_(printf)("   libhb: %'13llu msmcread  (%'llu dragovers)\n",
6709                   stats__msmcread, stats__msmcread_change);
6710       VG_(printf)("   libhb: %'13llu msmcwrite (%'llu dragovers)\n",
6711                   stats__msmcwrite, stats__msmcwrite_change);
6712       VG_(printf)("   libhb: %'13llu cmpLEQ queries (%'llu misses)\n",
6713                   stats__cmpLEQ_queries, stats__cmpLEQ_misses);
6714       VG_(printf)("   libhb: %'13llu join2  queries (%'llu misses)\n",
6715                   stats__join2_queries, stats__join2_misses);
6716
6717       VG_(printf)("%s","\n");
6718       VG_(printf)("   libhb: VTSops: tick %'lu,  join %'lu,  cmpLEQ %'lu\n",
6719                   stats__vts__tick, stats__vts__join,  stats__vts__cmpLEQ );
6720       VG_(printf)("   libhb: VTSops: cmp_structural %'lu (%'lu slow)\n",
6721                   stats__vts__cmp_structural, stats__vts__cmp_structural_slow);
6722       VG_(printf)("   libhb: VTSset: find__or__clone_and_add %'lu"
6723                   " (%'lu allocd)\n",
6724                    stats__vts_set__focaa, stats__vts_set__focaa_a );
6725       VG_(printf)( "   libhb: VTSops: indexAt_SLOW %'lu\n",
6726                    stats__vts__indexat_slow );
6727
6728       VG_(printf)("%s","\n");
6729       VG_(printf)(
6730          "   libhb: %ld entries in vts_table (approximately %lu bytes)\n",
6731          VG_(sizeXA)( vts_tab ), VG_(sizeXA)( vts_tab ) * sizeof(VtsTE)
6732       );
6733       VG_(printf)("   libhb: #%lu vts_tab GC    #%lu vts pruning\n",
6734                   stats__vts_tab_GC, stats__vts_pruning);
6735       VG_(printf)( "   libhb: %lu entries in vts_set\n",
6736                    VG_(sizeFM)( vts_set ) );
6737
6738       VG_(printf)("%s","\n");
6739       {
6740          UInt live = 0;
6741          UInt llexit_done = 0;
6742          UInt joinedwith_done = 0;
6743          UInt llexit_and_joinedwith_done = 0;
6744
6745          Thread* hgthread = get_admin_threads();
6746          tl_assert(hgthread);
6747          while (hgthread) {
6748             Thr* hbthr = hgthread->hbthr;
6749             tl_assert(hbthr);
6750             if (hbthr->llexit_done && hbthr->joinedwith_done)
6751                llexit_and_joinedwith_done++;
6752             else if (hbthr->llexit_done)
6753                llexit_done++;
6754             else if (hbthr->joinedwith_done)
6755                joinedwith_done++;
6756             else
6757                live++;
6758             hgthread = hgthread->admin;
6759          }
6760          VG_(printf)("   libhb: threads live: %u exit_and_joinedwith %u"
6761                      " exit %u joinedwith %u\n",
6762                      live, llexit_and_joinedwith_done,
6763                      llexit_done, joinedwith_done);
6764          VG_(printf)("   libhb: %d verydead_threads, "
6765                      "%d verydead_threads_not_pruned\n",
6766                      (int) VG_(sizeXA)( verydead_thread_table),
6767                      (int) VG_(sizeXA)( verydead_thread_table_not_pruned));
6768          tl_assert (VG_(sizeXA)( verydead_thread_table)
6769                     + VG_(sizeXA)( verydead_thread_table_not_pruned)
6770                     == llexit_and_joinedwith_done);
6771       }
6772
6773       VG_(printf)("%s","\n");
6774       VG_(printf)( "   libhb: oldrefHTN %lu (%'d bytes)\n",
6775                    oldrefHTN, (int)(oldrefHTN * sizeof(OldRef)));
6776       tl_assert (oldrefHTN == VG_(HT_count_nodes) (oldrefHT));
6777       VG_(printf)( "   libhb: oldref lookup found=%lu notfound=%lu\n",
6778                    stats__evm__lookup_found, stats__evm__lookup_notfound);
6779       if (VG_(clo_verbosity) > 1)
6780          VG_(HT_print_stats) (oldrefHT, cmp_oldref_tsw);
6781       VG_(printf)( "   libhb: oldref bind tsw/rcec "
6782                    "==/==:%'lu ==/!=:%'lu !=/!=:%'lu\n",
6783                    stats__ctxt_eq_tsw_eq_rcec, stats__ctxt_eq_tsw_neq_rcec,
6784                    stats__ctxt_neq_tsw_neq_rcec);
6785       VG_(printf)( "   libhb: ctxt__rcdec calls %'lu. rcec gc discards %'lu\n",
6786                    stats__ctxt_rcdec_calls, stats__ctxt_rcec_gc_discards);
6787       VG_(printf)( "   libhb: contextTab: %lu slots,"
6788                    " %lu cur ents(ref'd %lu),"
6789                    " %lu max ents\n",
6790                    (UWord)N_RCEC_TAB,
6791                    stats__ctxt_tab_curr, RCEC_referenced,
6792                    stats__ctxt_tab_max );
6793       VG_(printf) ("   libhb: stats__cached_rcec "
6794                    "identical %'lu updated %'lu fresh %'lu\n",
6795                    stats__cached_rcec_identical, stats__cached_rcec_updated,
6796                    stats__cached_rcec_fresh);
6797       if (stats__cached_rcec_diff > 0)
6798          VG_(printf) ("   libhb: stats__cached_rcec diff unk reason%'lu\n",
6799                       stats__cached_rcec_diff);
6800       if (stats__cached_rcec_diff_known_reason > 0)
6801          VG_(printf) ("   libhb: stats__cached_rcec diff known reason %'lu\n",
6802                       stats__cached_rcec_diff_known_reason);
6803
6804       {
6805 #        define  MAXCHAIN 10
6806          UInt chains[MAXCHAIN+1]; // [MAXCHAIN] gets all chains >= MAXCHAIN
6807          UInt non0chain = 0;
6808          UInt n;
6809          UInt i;
6810          RCEC *p;
6811
6812          for (i = 0; i <= MAXCHAIN; i++) chains[i] = 0;
6813          for (i = 0; i < N_RCEC_TAB; i++) {
6814             n = 0;
6815             for (p = contextTab[i]; p; p = p->next)
6816                n++;
6817             if (n < MAXCHAIN)
6818                chains[n]++;
6819             else
6820                chains[MAXCHAIN]++;
6821             if (n > 0)
6822                non0chain++;
6823          }
6824          VG_(printf)( "   libhb: contextTab chain of [length]=nchain."
6825                       " Avg chain len %3.1f\n"
6826                       "        ",
6827                       (Double)stats__ctxt_tab_curr
6828                       / (Double)(non0chain ? non0chain : 1));
6829          for (i = 0; i <= MAXCHAIN; i++) {
6830             if (chains[i] != 0)
6831                 VG_(printf)( "[%u%s]=%u ",
6832                              i, i == MAXCHAIN ? "+" : "",
6833                              chains[i]);
6834          }
6835          VG_(printf)( "\n");
6836 #        undef MAXCHAIN
6837       }
6838       VG_(printf)( "   libhb: contextTab: %lu queries, %lu cmps\n",
6839                    stats__ctxt_tab_qs,
6840                    stats__ctxt_tab_cmps );
6841 #if 0
6842       VG_(printf)("sizeof(AvlNode)     = %lu\n", sizeof(AvlNode));
6843       VG_(printf)("sizeof(WordBag)     = %lu\n", sizeof(WordBag));
6844       VG_(printf)("sizeof(MaybeWord)   = %lu\n", sizeof(MaybeWord));
6845       VG_(printf)("sizeof(CacheLine)   = %lu\n", sizeof(CacheLine));
6846       VG_(printf)("sizeof(LineZ)       = %lu\n", sizeof(LineZ));
6847       VG_(printf)("sizeof(LineF)       = %lu\n", sizeof(LineF));
6848       VG_(printf)("sizeof(SecMap)      = %lu\n", sizeof(SecMap));
6849       VG_(printf)("sizeof(Cache)       = %lu\n", sizeof(Cache));
6850       VG_(printf)("sizeof(SMCacheEnt)  = %lu\n", sizeof(SMCacheEnt));
6851       VG_(printf)("sizeof(CountedSVal) = %lu\n", sizeof(CountedSVal));
6852       VG_(printf)("sizeof(VTS)         = %lu\n", sizeof(VTS));
6853       VG_(printf)("sizeof(ScalarTS)    = %lu\n", sizeof(ScalarTS));
6854       VG_(printf)("sizeof(VtsTE)       = %lu\n", sizeof(VtsTE));
6855       VG_(printf)("sizeof(MSMInfo)     = %lu\n", sizeof(MSMInfo));
6856
6857       VG_(printf)("sizeof(struct _XArray)     = %lu\n", sizeof(struct _XArray));
6858       VG_(printf)("sizeof(struct _WordFM)     = %lu\n", sizeof(struct _WordFM));
6859       VG_(printf)("sizeof(struct _Thr)     = %lu\n", sizeof(struct _Thr));
6860       VG_(printf)("sizeof(struct _SO)     = %lu\n", sizeof(struct _SO));
6861 #endif
6862
6863       VG_(printf)("%s","<<< END libhb stats >>>\n");
6864       VG_(printf)("%s","\n");
6865
6866    }
6867 }
6868
6869 /* Receive notification that a thread has low level exited.  The
6870    significance here is that we do not expect to see any more memory
6871    references from it. */
6872 void libhb_async_exit ( Thr* thr )
6873 {
6874    tl_assert(thr);
6875    tl_assert(!thr->llexit_done);
6876    thr->llexit_done = True;
6877
6878    /* Check nobody messed up with the cached_rcec */
6879    tl_assert (thr->cached_rcec.magic == RCEC_MAGIC);
6880    tl_assert (thr->cached_rcec.rc == 0);
6881    tl_assert (thr->cached_rcec.rcX == 0);
6882    tl_assert (thr->cached_rcec.next == NULL);
6883
6884    /* Just to be sure, declare the cached stack invalid. */
6885    set_cached_rcec_validity(thr, False);
6886
6887    /* free up Filter and local_Kws_n_stacks (well, actually not the
6888       latter ..) */
6889    tl_assert(thr->filter);
6890    HG_(free)(thr->filter);
6891    thr->filter = NULL;
6892
6893    /* Tell the VTS mechanism this thread has exited, so it can
6894       participate in VTS pruning.  Note this can only happen if the
6895       thread has both ll_exited and has been joined with. */
6896    if (thr->joinedwith_done)
6897       VTS__declare_thread_very_dead(thr);
6898
6899    /* Another space-accuracy tradeoff.  Do we want to be able to show
6900       H1 history for conflicts in threads which have since exited?  If
6901       yes, then we better not free up thr->local_Kws_n_stacks.  The
6902       downside is a potential per-thread leak of up to
6903       N_KWs_N_STACKs_PER_THREAD * sizeof(ULong_n_EC) * whatever the
6904       XArray average overcommit factor is (1.5 I'd guess). */
6905    // hence:
6906    // VG_(deleteXA)(thr->local_Kws_n_stacks);
6907    // thr->local_Kws_n_stacks = NULL;
6908 }
6909
6910 /* Receive notification that a thread has been joined with.  The
6911    significance here is that we do not expect to see any further
6912    references to its vector clocks (Thr::viR and Thr::viW). */
6913 void libhb_joinedwith_done ( Thr* thr )
6914 {
6915    tl_assert(thr);
6916    /* Caller must ensure that this is only ever called once per Thr. */
6917    tl_assert(!thr->joinedwith_done);
6918    thr->joinedwith_done = True;
6919    if (thr->llexit_done)
6920       VTS__declare_thread_very_dead(thr);
6921 }
6922
6923
6924 /* Both Segs and SOs point to VTSs.  However, there is no sharing, so
6925    a Seg that points at a VTS is its one-and-only owner, and ditto for
6926    a SO that points at a VTS. */
6927
6928 SO* libhb_so_alloc ( void )
6929 {
6930    return SO__Alloc();
6931 }
6932
6933 void libhb_so_dealloc ( SO* so )
6934 {
6935    tl_assert(so);
6936    tl_assert(so->magic == SO_MAGIC);
6937    SO__Dealloc(so);
6938 }
6939
6940 /* See comments in libhb.h for details on the meaning of
6941    strong vs weak sends and strong vs weak receives. */
6942 void libhb_so_send ( Thr* thr, SO* so, Bool strong_send )
6943 {
6944    /* Copy the VTSs from 'thr' into the sync object, and then move
6945       the thread along one step. */
6946
6947    tl_assert(so);
6948    tl_assert(so->magic == SO_MAGIC);
6949
6950    /* stay sane .. a thread's read-clock must always lead or be the
6951       same as its write-clock */
6952    { Bool leq = VtsID__cmpLEQ(thr->viW, thr->viR);
6953      tl_assert(leq);
6954    }
6955
6956    /* since we're overwriting the VtsIDs in the SO, we need to drop
6957       any references made by the previous contents thereof */
6958    if (so->viR == VtsID_INVALID) {
6959       tl_assert(so->viW == VtsID_INVALID);
6960       so->viR = thr->viR;
6961       so->viW = thr->viW;
6962       VtsID__rcinc(so->viR);
6963       VtsID__rcinc(so->viW);
6964    } else {
6965       /* In a strong send, we dump any previous VC in the SO and
6966          install the sending thread's VC instead.  For a weak send we
6967          must join2 with what's already there. */
6968       tl_assert(so->viW != VtsID_INVALID);
6969       VtsID__rcdec(so->viR);
6970       VtsID__rcdec(so->viW);
6971       so->viR = strong_send ? thr->viR : VtsID__join2( so->viR, thr->viR );
6972       so->viW = strong_send ? thr->viW : VtsID__join2( so->viW, thr->viW );
6973       VtsID__rcinc(so->viR);
6974       VtsID__rcinc(so->viW);
6975    }
6976
6977    /* move both parent clocks along */
6978    VtsID__rcdec(thr->viR);
6979    VtsID__rcdec(thr->viW);
6980    thr->viR = VtsID__tick( thr->viR, thr );
6981    thr->viW = VtsID__tick( thr->viW, thr );
6982    if (!thr->llexit_done) {
6983       Filter__clear(thr->filter, "libhb_so_send");
6984       note_local_Kw_n_stack_for(thr);
6985    }
6986    VtsID__rcinc(thr->viR);
6987    VtsID__rcinc(thr->viW);
6988
6989    if (strong_send)
6990       show_thread_state("s-send", thr);
6991    else
6992       show_thread_state("w-send", thr);
6993 }
6994
6995 void libhb_so_recv ( Thr* thr, SO* so, Bool strong_recv )
6996 {
6997    tl_assert(so);
6998    tl_assert(so->magic == SO_MAGIC);
6999
7000    if (so->viR != VtsID_INVALID) {
7001       tl_assert(so->viW != VtsID_INVALID);
7002
7003       /* Weak receive (basically, an R-acquisition of a R-W lock).
7004          This advances the read-clock of the receiver, but not the
7005          write-clock. */
7006       VtsID__rcdec(thr->viR);
7007       thr->viR = VtsID__join2( thr->viR, so->viR );
7008       VtsID__rcinc(thr->viR);
7009
7010       /* At one point (r10589) it seemed safest to tick the clocks for
7011          the receiving thread after the join.  But on reflection, I
7012          wonder if that might cause it to 'overtake' constraints,
7013          which could lead to missing races.  So, back out that part of
7014          r10589. */
7015       //VtsID__rcdec(thr->viR);
7016       //thr->viR = VtsID__tick( thr->viR, thr );
7017       //VtsID__rcinc(thr->viR);
7018
7019       /* For a strong receive, we also advance the receiver's write
7020          clock, which means the receive as a whole is essentially
7021          equivalent to a W-acquisition of a R-W lock. */
7022       if (strong_recv) {
7023          VtsID__rcdec(thr->viW);
7024          thr->viW = VtsID__join2( thr->viW, so->viW );
7025          VtsID__rcinc(thr->viW);
7026
7027          /* See comment just above, re r10589. */
7028          //VtsID__rcdec(thr->viW);
7029          //thr->viW = VtsID__tick( thr->viW, thr );
7030          //VtsID__rcinc(thr->viW);
7031       }
7032
7033       if (thr->filter)
7034          Filter__clear(thr->filter, "libhb_so_recv");
7035       note_local_Kw_n_stack_for(thr);
7036
7037       if (strong_recv)
7038          show_thread_state("s-recv", thr);
7039       else
7040          show_thread_state("w-recv", thr);
7041
7042    } else {
7043       tl_assert(so->viW == VtsID_INVALID);
7044       /* Deal with degenerate case: 'so' has no vts, so there has been
7045          no message posted to it.  Just ignore this case. */
7046       show_thread_state("d-recv", thr);
7047    }
7048 }
7049
7050 Bool libhb_so_everSent ( SO* so )
7051 {
7052    if (so->viR == VtsID_INVALID) {
7053       tl_assert(so->viW == VtsID_INVALID);
7054       return False;
7055    } else {
7056       tl_assert(so->viW != VtsID_INVALID);
7057       return True;
7058    }
7059 }
7060
7061 #define XXX1 0 // 0x67a106c
7062 #define XXX2 0
7063
7064 static inline Bool TRACEME(Addr a, SizeT szB) {
7065    if (XXX1 && a <= XXX1 && XXX1 <= a+szB) return True;
7066    if (XXX2 && a <= XXX2 && XXX2 <= a+szB) return True;
7067    return False;
7068 }
7069 static void trace ( Thr* thr, Addr a, SizeT szB, const HChar* s )
7070 {
7071   SVal sv = zsm_sread08(a);
7072   VG_(printf)("thr %p (%#lx,%lu) %s: 0x%016llx ", thr,a,szB,s,sv);
7073   show_thread_state("", thr);
7074   VG_(printf)("%s","\n");
7075 }
7076
7077 void libhb_srange_new ( Thr* thr, Addr a, SizeT szB )
7078 {
7079    SVal sv = SVal__mkC(thr->viW, thr->viW);
7080    tl_assert(is_sane_SVal_C(sv));
7081    if (0 && TRACEME(a,szB)) trace(thr,a,szB,"nw-before");
7082    zsm_sset_range( a, szB, sv );
7083    Filter__clear_range( thr->filter, a, szB );
7084    if (0 && TRACEME(a,szB)) trace(thr,a,szB,"nw-after ");
7085 }
7086
7087 void libhb_srange_noaccess_NoFX ( Thr* thr, Addr a, SizeT szB )
7088 {
7089    /* do nothing */
7090 }
7091
7092
7093 /* Set the lines zix_start till zix_end to NOACCESS. */
7094 static void zsm_secmap_line_range_noaccess (SecMap *sm,
7095                                             UInt zix_start, UInt zix_end)
7096 {
7097    for (UInt lz = zix_start; lz <= zix_end; lz++) {
7098       LineZ* lineZ;
7099       lineZ = &sm->linesZ[lz];
7100       if (lineZ->dict[0] != SVal_INVALID) {
7101          rcdec_LineZ(lineZ);
7102          lineZ->dict[0] = SVal_NOACCESS;
7103          lineZ->dict[1] = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
7104       } else {
7105          clear_LineF_of_Z(lineZ);
7106       }
7107       for (UInt i = 0; i < N_LINE_ARANGE/4; i++)
7108          lineZ->ix2s[i] = 0; /* all refer to dict[0] */
7109    }
7110 }
7111
7112 /* Set the given range to SVal_NOACCESS in-place in the secmap.
7113    a must be cacheline aligned. len must be a multiple of a cacheline
7114    and must be < N_SECMAP_ARANGE. */
7115 static void zsm_sset_range_noaccess_in_secmap(Addr a, SizeT len)
7116 {
7117    tl_assert (is_valid_scache_tag (a));
7118    tl_assert (0 == (len & (N_LINE_ARANGE - 1)));
7119    tl_assert (len < N_SECMAP_ARANGE);
7120
7121    SecMap *sm1 = shmem__find_SecMap (a);
7122    SecMap *sm2 = shmem__find_SecMap (a + len - 1);
7123    UWord zix_start = shmem__get_SecMap_offset(a          ) >> N_LINE_BITS;
7124    UWord zix_end   = shmem__get_SecMap_offset(a + len - 1) >> N_LINE_BITS;
7125
7126    if (sm1) {
7127       if (CHECK_ZSM) tl_assert(is_sane_SecMap(sm1));
7128       zsm_secmap_line_range_noaccess (sm1, zix_start,
7129                                       sm1 == sm2 ? zix_end : N_SECMAP_ZLINES-1);
7130    }
7131    if (sm2 && sm1 != sm2) {
7132       if (CHECK_ZSM) tl_assert(is_sane_SecMap(sm2));
7133       zsm_secmap_line_range_noaccess (sm2, 0, zix_end);
7134    }
7135 }
7136
7137 /* Set the given address range to SVal_NOACCESS.
7138    The SecMaps fully set to SVal_NOACCESS will be pushed in SecMap_freelist. */
7139 static void zsm_sset_range_noaccess (Addr addr, SizeT len)
7140 {
7141    /*
7142        BPC = Before, Partial Cacheline, = addr
7143              (i.e. starting inside a cacheline/inside a SecMap)
7144        BFC = Before, Full Cacheline(s), but not full SecMap
7145              (i.e. starting inside a SecMap)
7146        FSM = Full SecMap(s)
7147              (i.e. starting a SecMap)
7148        AFC = After, Full Cacheline(s), but not full SecMap
7149              (i.e. first address after the full SecMap(s))
7150        APC = After, Partial Cacheline, i.e. first address after the
7151              full CacheLines).
7152        ARE = After Range End = addr+len = first address not part of the range.
7153
7154        If addr     starts a Cacheline, then BPC == BFC.
7155        If addr     starts a SecMap,    then BPC == BFC == FSM.
7156        If addr+len starts a SecMap,    then APC == ARE == AFC
7157        If addr+len starts a Cacheline, then APC == ARE
7158    */
7159    Addr ARE = addr + len;
7160    Addr BPC = addr;
7161    Addr BFC = ROUNDUP(BPC, N_LINE_ARANGE);
7162    Addr FSM = ROUNDUP(BPC, N_SECMAP_ARANGE);
7163    Addr AFC = ROUNDDN(ARE, N_SECMAP_ARANGE);
7164    Addr APC = ROUNDDN(ARE, N_LINE_ARANGE);
7165    SizeT Plen = len; // Plen will be split between the following:
7166    SizeT BPClen;
7167    SizeT BFClen;
7168    SizeT FSMlen;
7169    SizeT AFClen;
7170    SizeT APClen;
7171
7172    /* Consumes from Plen the nr of bytes between from and to.
7173       from and to must be aligned on a multiple of round.
7174       The length consumed will be a multiple of round, with
7175       a maximum of Plen. */
7176 #  define PlenCONSUME(from, to, round, consumed) \
7177    do {                                          \
7178    if (from < to) {                              \
7179       if (to - from < Plen)                      \
7180          consumed = to - from;                   \
7181       else                                       \
7182          consumed = ROUNDDN(Plen, round);        \
7183    } else {                                      \
7184       consumed = 0;                              \
7185    }                                             \
7186    Plen -= consumed; } while (0)
7187
7188    PlenCONSUME(BPC, BFC, 1,               BPClen);
7189    PlenCONSUME(BFC, FSM, N_LINE_ARANGE,   BFClen);
7190    PlenCONSUME(FSM, AFC, N_SECMAP_ARANGE, FSMlen);
7191    PlenCONSUME(AFC, APC, N_LINE_ARANGE,   AFClen);
7192    PlenCONSUME(APC, ARE, 1,               APClen);
7193
7194    if (0)
7195       VG_(printf) ("addr %p[%lu] ARE %p"
7196                    " BPC %p[%lu] BFC %p[%lu] FSM %p[%lu]"
7197                    " AFC %p[%lu] APC %p[%lu]\n",
7198                    (void*)addr, len, (void*)ARE,
7199                    (void*)BPC, BPClen, (void*)BFC, BFClen, (void*)FSM, FSMlen,
7200                    (void*)AFC, AFClen, (void*)APC, APClen);
7201
7202    tl_assert (Plen == 0);
7203
7204    /* Set to NOACCESS pieces before and after not covered by entire SecMaps. */
7205
7206    /* First we set the partial cachelines. This is done through the cache. */
7207    if (BPClen > 0)
7208       zsm_sset_range_SMALL (BPC, BPClen, SVal_NOACCESS);
7209    if (APClen > 0)
7210       zsm_sset_range_SMALL (APC, APClen, SVal_NOACCESS);
7211
7212    /* After this, we will not use the cache anymore. We will directly work
7213       in-place on the z shadow memory in SecMap(s).
7214       So, we invalidate the cachelines for the whole range we are setting
7215       to NOACCESS below. */
7216    shmem__invalidate_scache_range (BFC, APC - BFC);
7217
7218    if (BFClen > 0)
7219       zsm_sset_range_noaccess_in_secmap (BFC, BFClen);
7220    if (AFClen > 0)
7221       zsm_sset_range_noaccess_in_secmap (AFC, AFClen);
7222
7223    if (FSMlen > 0) {
7224       /* Set to NOACCESS all the SecMaps, pushing the SecMaps to the
7225          free list. */
7226       Addr  sm_start = FSM;
7227       while (sm_start < AFC) {
7228          SecMap *sm = shmem__find_SecMap (sm_start);
7229          if (sm) {
7230             Addr gaKey;
7231             SecMap *fm_sm;
7232
7233             if (CHECK_ZSM) tl_assert(is_sane_SecMap(sm));
7234             for (UInt lz = 0; lz < N_SECMAP_ZLINES; lz++) {
7235                LineZ *lineZ = &sm->linesZ[lz];
7236                if (LIKELY(lineZ->dict[0] != SVal_INVALID))
7237                   rcdec_LineZ(lineZ);
7238                else
7239                   clear_LineF_of_Z(lineZ);
7240             }
7241             if (!VG_(delFromFM)(map_shmem, &gaKey, (UWord*)&fm_sm, sm_start))
7242                tl_assert (0);
7243             stats__secmaps_in_map_shmem--;
7244             tl_assert (gaKey == sm_start);
7245             tl_assert (sm == fm_sm);
7246             stats__secmaps_ssetGCed++;
7247             push_SecMap_on_freelist (sm);
7248          }
7249          sm_start += N_SECMAP_ARANGE;
7250       }
7251       tl_assert (sm_start == AFC);
7252
7253       /* The above loop might have kept copies of freed SecMap in the smCache.
7254          => clear them. */
7255       if (address_in_range(smCache[0].gaKey, FSM, FSMlen)) {
7256          smCache[0].gaKey = 1;
7257          smCache[0].sm = NULL;
7258       }
7259       if (address_in_range(smCache[1].gaKey, FSM, FSMlen)) {
7260          smCache[1].gaKey = 1;
7261          smCache[1].sm = NULL;
7262       }
7263       if (address_in_range(smCache[2].gaKey, FSM, FSMlen)) {
7264          smCache[2].gaKey = 1;
7265          smCache[2].sm = NULL;
7266       }
7267       STATIC_ASSERT (3 == sizeof(smCache)/sizeof(SMCacheEnt));
7268    }
7269 }
7270
7271 void libhb_srange_noaccess_AHAE ( Thr* thr, Addr a, SizeT szB )
7272 {
7273    /* This really does put the requested range in NoAccess.  It's
7274       expensive though. */
7275    SVal sv = SVal_NOACCESS;
7276    tl_assert(is_sane_SVal_C(sv));
7277    if (LIKELY(szB < 2 * N_LINE_ARANGE))
7278       zsm_sset_range_SMALL (a, szB, SVal_NOACCESS);
7279    else
7280       zsm_sset_range_noaccess (a, szB);
7281    Filter__clear_range( thr->filter, a, szB );
7282 }
7283
7284 /* Works byte at a time. Can be optimised if needed. */
7285 UWord libhb_srange_get_abits (Addr a, UChar *abits, SizeT len)
7286 {
7287    UWord anr = 0; // nr of bytes addressable.
7288
7289    /* Get the accessibility of each byte. Pay attention to not
7290       create SecMap or LineZ when checking if a byte is addressable.
7291
7292       Note: this is used for client request. Performance deemed not critical.
7293       So for simplicity, we work byte per byte.
7294       Performance could be improved  by working with full cachelines
7295       or with full SecMap, when reaching a cacheline or secmap boundary. */
7296    for (SizeT i = 0; i < len; i++) {
7297       SVal       sv = SVal_INVALID;
7298       Addr       b = a + i;
7299       Addr       tag = b & ~(N_LINE_ARANGE - 1);
7300       UWord      wix = (b >> N_LINE_BITS) & (N_WAY_NENT - 1);
7301       UWord      cloff = get_cacheline_offset(b);
7302
7303       /* Note: we do not use get_cacheline(b) to avoid creating cachelines
7304          and/or SecMap for non addressable bytes. */
7305       if (tag == cache_shmem.tags0[wix]) {
7306          CacheLine copy = cache_shmem.lyns0[wix];
7307          /* We work on a copy of the cacheline, as we do not want to
7308             record the client request as a real read.
7309             The below is somewhat similar to zsm_sapply08__msmcread but
7310             avoids side effects on the cache. */
7311          UWord toff = get_tree_offset(b); /* == 0 .. 7 */
7312          UWord tno  = get_treeno(b);
7313          UShort descr = copy.descrs[tno];
7314          if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
7315             SVal* tree = &copy.svals[tno << 3];
7316             copy.descrs[tno] = pulldown_to_8(tree, toff, descr);
7317          }
7318          sv = copy.svals[cloff];
7319       } else {
7320          /* Byte not found in the cacheline. Search for a SecMap. */
7321          SecMap *sm = shmem__find_SecMap(b);
7322          LineZ *lineZ;
7323          if (sm == NULL)
7324             sv = SVal_NOACCESS;
7325          else {
7326             UWord zix = shmem__get_SecMap_offset(b) >> N_LINE_BITS;
7327             lineZ = &sm->linesZ[zix];
7328             if (lineZ->dict[0] == SVal_INVALID) {
7329                LineF *lineF = SVal2Ptr(lineZ->dict[1]);
7330                sv = lineF->w64s[cloff];
7331             } else {
7332                UWord ix = read_twobit_array( lineZ->ix2s, cloff );
7333                sv = lineZ->dict[ix];
7334             }
7335          }
7336       }
7337
7338       tl_assert (sv != SVal_INVALID);
7339       if (sv == SVal_NOACCESS) {
7340          if (abits)
7341             abits[i] = 0x00;
7342       } else {
7343          if (abits)
7344             abits[i] = 0xff;
7345          anr++;
7346       }
7347    }
7348
7349    return anr;
7350 }
7351
7352
7353 void libhb_srange_untrack ( Thr* thr, Addr a, SizeT szB )
7354 {
7355    SVal sv = SVal_NOACCESS;
7356    tl_assert(is_sane_SVal_C(sv));
7357    if (0 && TRACEME(a,szB)) trace(thr,a,szB,"untrack-before");
7358    if (LIKELY(szB < 2 * N_LINE_ARANGE))
7359       zsm_sset_range_SMALL (a, szB, SVal_NOACCESS);
7360    else
7361       zsm_sset_range_noaccess (a, szB);
7362    Filter__clear_range( thr->filter, a, szB );
7363    if (0 && TRACEME(a,szB)) trace(thr,a,szB,"untrack-after ");
7364 }
7365
7366 Thread* libhb_get_Thr_hgthread ( Thr* thr ) {
7367    tl_assert(thr);
7368    return thr->hgthread;
7369 }
7370
7371 void libhb_set_Thr_hgthread ( Thr* thr, Thread* hgthread ) {
7372    tl_assert(thr);
7373    thr->hgthread = hgthread;
7374 }
7375
7376 void libhb_copy_shadow_state ( Thr* thr, Addr src, Addr dst, SizeT len )
7377 {
7378    zsm_scopy_range(src, dst, len);
7379    Filter__clear_range( thr->filter, dst, len );
7380 }
7381
7382 void libhb_maybe_GC ( void )
7383 {
7384    /* GC the unreferenced (zero rc) RCECs when
7385          (1) reaching a significant nr of RCECs (to avoid scanning a contextTab
7386              with mostly NULL ptr)
7387      and (2) approaching the max nr of RCEC (as we have in any case
7388              at least that amount of RCEC in the pool allocator)
7389              Note: the margin allows to avoid a small but constant increase
7390              of the max nr of RCEC due to the fact that libhb_maybe_GC is
7391              not called when the current nr of RCEC exactly reaches the max.
7392      and (3) the nr of referenced RCECs is less than 75% than total nr RCECs.
7393      Avoid growing too much the nr of RCEC keeps the memory use low,
7394      and avoids to have too many elements in the (fixed) contextTab hashtable.
7395    */
7396    if (UNLIKELY(stats__ctxt_tab_curr > N_RCEC_TAB/2
7397                 && stats__ctxt_tab_curr + 1000 >= stats__ctxt_tab_max
7398                 && (stats__ctxt_tab_curr * 3)/4 > RCEC_referenced))
7399       do_RCEC_GC();
7400
7401    /* If there are still no entries available (all the table entries are full),
7402       and we hit the threshold point, then do a GC */
7403    Bool vts_tab_GC = vts_tab_freelist == VtsID_INVALID
7404       && VG_(sizeXA)( vts_tab ) >= vts_next_GC_at;
7405    if (UNLIKELY (vts_tab_GC))
7406       vts_tab__do_GC( False/*don't show stats*/ );
7407
7408    /* scan GC the SecMaps when
7409           (1) no SecMap in the freelist
7410       and (2) the current nr of live secmaps exceeds the threshold. */
7411    if (UNLIKELY(SecMap_freelist == NULL
7412                 && stats__secmaps_in_map_shmem >= next_SecMap_GC_at)) {
7413       // If we did a vts tab GC, then no need to flush the cache again.
7414       if (!vts_tab_GC)
7415          zsm_flush_cache();
7416       shmem__SecMap_do_GC(True);
7417    }
7418
7419    /* Check the reference counts (expensive) */
7420    if (CHECK_CEM)
7421       event_map__check_reference_counts();
7422 }
7423
7424
7425 /////////////////////////////////////////////////////////////////
7426 /////////////////////////////////////////////////////////////////
7427 //                                                             //
7428 // SECTION END main library                                    //
7429 //                                                             //
7430 /////////////////////////////////////////////////////////////////
7431 /////////////////////////////////////////////////////////////////
7432
7433 /*--------------------------------------------------------------------*/
7434 /*--- end                                             libhb_main.c ---*/
7435 /*--------------------------------------------------------------------*/