// Copyright (c) 2008, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// ---
// Author: Ken Ashcraft <opensource@google.com>

#include <config.h>
#include "thread_cache.h"

#include <errno.h>                      // for errno, used when uname() fails
#include <string.h>                     // for memcpy
#include <algorithm>                    // for max, min
#include "base/commandlineflags.h"      // for SpinLockHolder
#include "base/spinlock.h"              // for SpinLockHolder
#include "central_freelist.h"           // for CentralFreeListPadded
#include "maybe_threads.h"

using std::min;
using std::max;

DEFINE_int64(tcmalloc_max_total_thread_cache_bytes,
             EnvToInt64("TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES",
                        kDefaultOverallThreadCacheSize),
             "Bound on the total amount of bytes allocated to "
             "thread caches. This bound is not strict, so it is possible "
             "for the cache to go over this bound in certain circumstances. "
             "Maximum value of this flag is capped to 1 GB.");
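
// Illustrative note (not part of the original source): because the default is
// wrapped in EnvToInt64, the bound can be tuned at process startup with the
// environment variable, e.g.
//   TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES=67108864
// requests roughly 64 MiB across all thread caches, subject to the 1 GB cap
// described above.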

namespace tcmalloc {

static bool phinited = false;

volatile size_t ThreadCache::per_thread_cache_size_ = kMaxThreadCacheSize;
size_t ThreadCache::overall_thread_cache_size_ = kDefaultOverallThreadCacheSize;
ssize_t ThreadCache::unclaimed_cache_space_ = kDefaultOverallThreadCacheSize;
PageHeapAllocator<ThreadCache> threadcache_allocator;
ThreadCache* ThreadCache::thread_heaps_ = NULL;
int ThreadCache::thread_heap_count_ = 0;
ThreadCache* ThreadCache::next_memory_steal_ = NULL;

__thread ThreadCache* ThreadCache::threadlocal_heap_
// See comments in thread_cache.h about this. Bug here:
// http://code.google.com/p/chromium/issues/detail?id=124489
#if defined(HAVE___ATTRIBUTE__) && !defined(PGO_GENERATE)
__attribute__ ((tls_model ("initial-exec")))
#endif
;

bool ThreadCache::tsd_inited_ = false;
pthread_key_t ThreadCache::heap_key_;

bool kernel_supports_tls = false;      // be conservative
# if defined(_WIN32)    // windows has supported TLS since winnt, I think.
void CheckIfKernelSupportsTLS() {
  kernel_supports_tls = true;
}
# elif !HAVE_DECL_UNAME    // if too old for uname, probably too old for TLS
void CheckIfKernelSupportsTLS() {
  kernel_supports_tls = false;
}
# else
#   include <sys/utsname.h>    // DECL_UNAME checked for <sys/utsname.h> too
void CheckIfKernelSupportsTLS() {
  struct utsname buf;
  if (uname(&buf) < 0) {   // should be impossible
    Log(kLog, __FILE__, __LINE__,
        "uname failed assuming no TLS support (errno)", errno);
    kernel_supports_tls = false;
  } else if (strcasecmp(buf.sysname, "linux") == 0) {
    // The linux case: the first kernel to support TLS was 2.6.0
    if (buf.release[0] < '2' && buf.release[1] == '.')    // 0.x or 1.x
      kernel_supports_tls = false;
    else if (buf.release[0] == '2' && buf.release[1] == '.' &&
             buf.release[2] >= '0' && buf.release[2] < '6' &&
             buf.release[3] == '.')                       // 2.0 - 2.5
      kernel_supports_tls = false;
    else
      kernel_supports_tls = true;
  } else if (strcasecmp(buf.sysname, "CYGWIN_NT-6.1-WOW64") == 0) {
    // In my testing, this version of cygwin, at least, would hang
    // when reading the TLS.
    kernel_supports_tls = false;
  } else {        // some other kernel, we'll be optimistic
    kernel_supports_tls = true;
  }
  // TODO(csilvers): VLOG(1) the tls status once we support RAW_VLOG
}
# endif  // HAVE_DECL_UNAME
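
// Illustrative examples of the release-string check above (not in the
// original source): "1.2.13" and "2.4.20" take the "no TLS" branches, while
// "2.6.32" and "3.10.0" fall through to kernel_supports_tls = true.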

void ThreadCache::Init(pthread_t tid) {
  size_ = 0;
  total_bytes_allocated_ = 0;

  max_size_ = 0;
  IncreaseCacheLimitLocked();
  if (max_size_ == 0) {
    // There isn't enough memory to go around.  Just give the minimum to
    // this thread.
    max_size_ = kMinThreadCacheSize;

    // Take unclaimed_cache_space_ negative.
    unclaimed_cache_space_ -= kMinThreadCacheSize;
    ASSERT(unclaimed_cache_space_ < 0);
  }

  next_ = NULL;
  prev_ = NULL;
  tid_  = tid;
  in_setspecific_ = false;
  for (size_t cl = 0; cl < kNumClasses; ++cl) {
    list_[cl].Init();
  }

  uint32_t sampler_seed;
  memcpy(&sampler_seed, &tid, sizeof(sampler_seed));
  sampler_.Init(sampler_seed);
}
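
// Note (not part of the original source): IncreaseCacheLimitLocked() normally
// grants a new thread kStealAmount of budget, either from
// unclaimed_cache_space_ or by stealing from another heap; the max_size_ == 0
// branch above only fires when neither source has anything to give, so the
// kMinThreadCacheSize grant is taken "on credit" and unclaimed_cache_space_
// goes negative.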

void ThreadCache::Cleanup() {
  // Put unused memory back into central cache
  for (int cl = 0; cl < kNumClasses; ++cl) {
    if (list_[cl].length() > 0) {
      ReleaseToCentralCache(&list_[cl], cl, list_[cl].length());
    }
  }
}

// Remove some objects of class "cl" from central cache and add to thread heap.
// On success, return the first object for immediate use; otherwise return NULL.
void* ThreadCache::FetchFromCentralCache(size_t cl, size_t byte_size) {
  FreeList* list = &list_[cl];
  ASSERT(list->empty());
  const int batch_size = Static::sizemap()->num_objects_to_move(cl);

  const int num_to_move = min<int>(list->max_length(), batch_size);
  void *start, *end;
  int fetch_count = Static::central_cache()[cl].RemoveRange(
      &start, &end, num_to_move);

  ASSERT((start == NULL) == (fetch_count == 0));
  if (--fetch_count >= 0) {
    size_ += byte_size * fetch_count;
    // Pop the top of the list and add the rest to the freelist.
    void *second = start;
    start = FL_Pop(&second);
    list->PushRange(fetch_count, second, end);
  }

  // Increase max length slowly up to batch_size.  After that,
  // increase by batch_size in one shot so that the length is a
  // multiple of batch_size.
  if (list->max_length() < batch_size) {
    list->set_max_length(list->max_length() + 1);
  } else {
    // Don't let the list get too long.  In 32 bit builds, the length
    // is represented by a 16 bit int, so we need to watch out for
    // integer overflow.
    int new_length = min<int>(list->max_length() + batch_size,
                              kMaxDynamicFreeListLength);
    // The list's max_length must always be a multiple of batch_size,
    // and kMaxDynamicFreeListLength is not necessarily a multiple
    // of batch_size.
    new_length -= new_length % batch_size;
    ASSERT(new_length % batch_size == 0);
    list->set_max_length(new_length);
  }
  return start;
}
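
// Worked example of the growth rule above (not in the original source): with
// batch_size == 32, max_length grows 1, 2, ..., 32 one element at a time,
// then jumps 32 -> 64 -> 96 -> ... until it reaches kMaxDynamicFreeListLength
// rounded down to a multiple of batch_size.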

void ThreadCache::ListTooLong(FreeList* list, size_t cl) {
  const int batch_size = Static::sizemap()->num_objects_to_move(cl);
  ReleaseToCentralCache(list, cl, batch_size);

  // If the list is too long, we need to transfer some number of
  // objects to the central cache.  Ideally, we would transfer
  // num_objects_to_move, so the code below tries to make max_length
  // converge on num_objects_to_move.

  if (list->max_length() < batch_size) {
    // Slow start the max_length so we don't overreserve.
    list->set_max_length(list->max_length() + 1);
  } else if (list->max_length() > batch_size) {
    // If we consistently go over max_length, shrink max_length.  If we don't
    // shrink it, some amount of memory will always stay in this freelist.
    list->set_length_overages(list->length_overages() + 1);
    if (list->length_overages() > kMaxOverages) {
      ASSERT(list->max_length() > batch_size);
      list->set_max_length(list->max_length() - batch_size);
      list->set_length_overages(0);
    }
  }
}
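
// Illustrative note (not in the original source): a list with
// max_length == 96 and batch_size == 32 that keeps overflowing is trimmed one
// batch at a time (96 -> 64 -> 32) as length_overages passes kMaxOverages;
// shrinking stops once max_length == batch_size.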

// Remove some objects of class "cl" from thread heap and add to central cache
void ThreadCache::ReleaseToCentralCache(FreeList* src, size_t cl, int N) {
  ASSERT(src == &list_[cl]);
  if (N > src->length()) N = src->length();
  size_t delta_bytes = N * Static::sizemap()->ByteSizeForClass(cl);

  // We return prepackaged chains of the correct size to the central cache.
  // TODO: Use the same format internally in the thread caches?
  int batch_size = Static::sizemap()->num_objects_to_move(cl);
  while (N > batch_size) {
    void *tail, *head;
    src->PopRange(batch_size, &head, &tail);
    Static::central_cache()[cl].InsertRange(head, tail, batch_size);
    N -= batch_size;
  }
  void *tail, *head;
  src->PopRange(N, &head, &tail);
  Static::central_cache()[cl].InsertRange(head, tail, N);
  size_ -= delta_bytes;
}
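
// Worked example (not in the original source): releasing N == 70 objects with
// batch_size == 32 hands the central free list two full batches of 32
// followed by a final chain of 6, matching the prepackaged-chain format
// described above.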

// Release idle memory to the central cache
void ThreadCache::Scavenge() {
  // If the low-water mark for the free list is L, it means we would
  // not have had to allocate anything from the central cache even if
  // we had reduced the free list size by L.  We aim to get closer to
  // that situation by dropping L/2 nodes from the free list.  This
  // may not release much memory, but if so we will call scavenge again
  // pretty soon and the low-water marks will be high on that call.
  //int64 start = CycleClock::Now();
  for (int cl = 0; cl < kNumClasses; cl++) {
    FreeList* list = &list_[cl];
    const int lowmark = list->lowwatermark();
    if (lowmark > 0) {
      const int drop = (lowmark > 1) ? lowmark/2 : 1;
      ReleaseToCentralCache(list, cl, drop);

      // Shrink the max length if it isn't used.  Only shrink down to
      // batch_size -- if the thread was active enough to get the max_length
      // above batch_size, it will likely be that active again.  If
      // max_length shrinks below batch_size, the thread will have to
      // go through the slow-start behavior again.  The slow-start is useful
      // mainly for threads that stay relatively idle for their entire
      // lifetime.
      const int batch_size = Static::sizemap()->num_objects_to_move(cl);
      if (list->max_length() > batch_size) {
        list->set_max_length(
            max<int>(list->max_length() - batch_size, batch_size));
      }
    }
    list->clear_lowwatermark();
  }

  IncreaseCacheLimit();
}
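
// Worked example of the low-water-mark policy above (not in the original
// source): if a free list never dipped below 8 objects since the last
// scavenge, lowmark == 8 and 4 objects are returned to the central cache on
// this pass; the rest is only trimmed on later passes if the list stays idle.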

void ThreadCache::IncreaseCacheLimit() {
  SpinLockHolder h(Static::pageheap_lock());
  IncreaseCacheLimitLocked();
}

void ThreadCache::IncreaseCacheLimitLocked() {
  if (unclaimed_cache_space_ > 0) {
    // Possibly make unclaimed_cache_space_ negative.
    unclaimed_cache_space_ -= kStealAmount;
    max_size_ += kStealAmount;
    return;
  }
  // Don't hold pageheap_lock too long.  Try to steal from 10 other
  // threads before giving up.  The i < 10 condition also prevents an
  // infinite loop in case none of the existing thread heaps are
  // suitable places to steal from.
  for (int i = 0; i < 10;
       ++i, next_memory_steal_ = next_memory_steal_->next_) {
    // Reached the end of the linked list.  Start at the beginning.
    if (next_memory_steal_ == NULL) {
      ASSERT(thread_heaps_ != NULL);
      next_memory_steal_ = thread_heaps_;
    }
    if (next_memory_steal_ == this ||
        next_memory_steal_->max_size_ <= kMinThreadCacheSize) {
      continue;
    }
    next_memory_steal_->max_size_ -= kStealAmount;
    max_size_ += kStealAmount;
    next_memory_steal_ = next_memory_steal_->next_;
    return;
  }
}

int ThreadCache::GetSamplePeriod() {
  return sampler_.GetSamplePeriod();
}

unsigned int ThreadCache::GetBytesAllocatedOnCurrentThread() {
  return ThreadCache::GetThreadHeap()->GetTotalBytesAllocated();
}

void ThreadCache::InitModule() {
  SpinLockHolder h(Static::pageheap_lock());
  if (!phinited) {
    Static::InitStaticVars();
    threadcache_allocator.Init();
    phinited = 1;
  }
}

void ThreadCache::InitTSD() {
  ASSERT(!tsd_inited_);
  perftools_pthread_key_create(&heap_key_, DestroyThreadCache);
  tsd_inited_ = true;

#ifdef PTHREADS_CRASHES_IF_RUN_TOO_EARLY
  // We may have used a fake pthread_t for the main thread.  Fix it.
  pthread_t zero;
  memset(&zero, 0, sizeof(zero));
  SpinLockHolder h(Static::pageheap_lock());
  for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
    if (h->tid_ == zero) {
      h->tid_ = pthread_self();
    }
  }
#endif
}

ThreadCache* ThreadCache::CreateCacheIfNecessary() {
  // Initialize per-thread data if necessary
  ThreadCache* heap = NULL;
  {
    SpinLockHolder h(Static::pageheap_lock());
    // On some old glibc's, and on freebsd's libc (as of freebsd 8.1),
    // calling pthread routines (even pthread_self) too early could
    // cause a segfault.  Since we can call pthreads quite early, we
    // have to protect against that in such situations by making a
    // 'fake' pthread.  This is not ideal since it doesn't work well
    // when linking tcmalloc statically with apps that create threads
    // before main, so we only do it if we have to.
#ifdef PTHREADS_CRASHES_IF_RUN_TOO_EARLY
    pthread_t me;
    if (!tsd_inited_) {
      memset(&me, 0, sizeof(me));
    } else {
      me = pthread_self();
    }
#else
    const pthread_t me = pthread_self();
#endif

    // This may be a recursive malloc call from pthread_setspecific()
    // In that case, the heap for this thread has already been created
    // and added to the linked list.  So we search for that first.
    for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
      if (h->tid_ == me) {
        heap = h;
        break;
      }
    }

    if (heap == NULL) heap = NewHeap(me);
  }

  // We call pthread_setspecific() outside the lock because it may
  // call malloc() recursively.  We check for the recursive call using
  // the "in_setspecific_" flag so that we can avoid calling
  // pthread_setspecific() if we are already inside pthread_setspecific().
  if (!heap->in_setspecific_ && tsd_inited_) {
    heap->in_setspecific_ = true;
    perftools_pthread_setspecific(heap_key_, heap);
    // Also keep a copy in __thread for faster retrieval
    threadlocal_heap_ = heap;
    heap->in_setspecific_ = false;
  }
  return heap;
}
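
// Note (not in the original source): under PTHREADS_CRASHES_IF_RUN_TOO_EARLY
// the heap may be registered with a zeroed, fake pthread_t; InitTSD() later
// walks thread_heaps_ and patches such entries with the real pthread_self().
// The pageheap lock is released at the end of the inner scope above before
// pthread_setspecific() runs, because that call may allocate and re-enter
// malloc; the in_setspecific_ flag is what breaks that recursion.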

ThreadCache* ThreadCache::NewHeap(pthread_t tid) {
  // Create the heap and add it to the linked list
  ThreadCache *heap = threadcache_allocator.New();
  heap->Init(tid);
  heap->next_ = thread_heaps_;
  heap->prev_ = NULL;
  if (thread_heaps_ != NULL) {
    thread_heaps_->prev_ = heap;
  } else {
    // This is the only thread heap at the moment.
    ASSERT(next_memory_steal_ == NULL);
    next_memory_steal_ = heap;
  }
  thread_heaps_ = heap;
  thread_heap_count_++;
  return heap;
}

void ThreadCache::BecomeIdle() {
  if (!tsd_inited_) return;              // No caches yet
  ThreadCache* heap = GetThreadHeap();
  if (heap == NULL) return;              // No thread cache to remove
  if (heap->in_setspecific_) return;     // Do not disturb the active caller

  heap->in_setspecific_ = true;
  perftools_pthread_setspecific(heap_key_, NULL);
  // Also update the copy in __thread
  threadlocal_heap_ = NULL;
  heap->in_setspecific_ = false;
  if (GetThreadHeap() == heap) {
    // Somehow heap got reinstated by a recursive call to malloc
    // from pthread_setspecific.  We give up in this case.
    return;
  }

  // We can now get rid of the heap
  DeleteCache(heap);
}

void ThreadCache::DestroyThreadCache(void* ptr) {
  // Note that "ptr" cannot be NULL since pthread promises not
  // to invoke the destructor on NULL values, but for safety,
  // we check anyway.
  if (ptr == NULL) return;
  // Prevent fast path of GetThreadHeap() from returning heap.
  threadlocal_heap_ = NULL;
  DeleteCache(reinterpret_cast<ThreadCache*>(ptr));
}

void ThreadCache::DeleteCache(ThreadCache* heap) {
  // Remove all memory from heap
  heap->Cleanup();

  // Remove from linked list
  SpinLockHolder h(Static::pageheap_lock());
  if (heap->next_ != NULL) heap->next_->prev_ = heap->prev_;
  if (heap->prev_ != NULL) heap->prev_->next_ = heap->next_;
  if (thread_heaps_ == heap) thread_heaps_ = heap->next_;
  thread_heap_count_--;

  if (next_memory_steal_ == heap) next_memory_steal_ = heap->next_;
  if (next_memory_steal_ == NULL) next_memory_steal_ = thread_heaps_;
  unclaimed_cache_space_ += heap->max_size_;

  threadcache_allocator.Delete(heap);
}
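
// Note (not in the original source): returning heap->max_size_ to
// unclaimed_cache_space_ here is what lets surviving threads reclaim a dead
// thread's cache budget through IncreaseCacheLimitLocked().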

void ThreadCache::RecomputePerThreadCacheSize() {
  // Divide available space across threads
  int n = thread_heap_count_ > 0 ? thread_heap_count_ : 1;
  size_t space = overall_thread_cache_size_ / n;

  // Limit to allowed range
  if (space < kMinThreadCacheSize) space = kMinThreadCacheSize;
  if (space > kMaxThreadCacheSize) space = kMaxThreadCacheSize;

  double ratio = space / max<double>(1, per_thread_cache_size_);
  size_t claimed = 0;
  for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
    // Increasing the total cache size should not circumvent the
    // slow-start growth of max_size_.
    if (ratio < 1.0) {
      h->max_size_ = static_cast<size_t>(h->max_size_ * ratio);
    }
    claimed += h->max_size_;
  }
  unclaimed_cache_space_ = overall_thread_cache_size_ - claimed;
  per_thread_cache_size_ = space;
}
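
// Worked example (not in the original source, sizes are illustrative): with a
// 32 MiB overall budget and 8 thread heaps, space is 4 MiB per thread;
// recomputing with 10 heaps gives 3.2 MiB, so ratio < 1.0, every existing
// max_size_ is scaled down, and the difference flows back into
// unclaimed_cache_space_.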

void ThreadCache::GetThreadStats(uint64_t* total_bytes, uint64_t* class_count) {
  for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
    *total_bytes += h->Size();
    if (class_count) {
      for (int cl = 0; cl < kNumClasses; ++cl) {
        class_count[cl] += h->freelist_length(cl);
      }
    }
  }
}

void ThreadCache::set_overall_thread_cache_size(size_t new_size) {
  // Clip the value to a reasonable range
  if (new_size < kMinThreadCacheSize) new_size = kMinThreadCacheSize;
  if (new_size > (1<<30)) new_size = (1<<30);     // Limit to 1GB
  overall_thread_cache_size_ = new_size;

  RecomputePerThreadCacheSize();
}

}  // namespace tcmalloc