// Copyright (c) 2008, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// ---
// Author: Ken Ashcraft <opensource@google.com>

#include <config.h>
#include "thread_cache.h"

#include <errno.h>                      // for errno, used when uname() fails
#include <string.h>                     // for memcpy
#include <algorithm>                    // for max, min
#include "base/commandlineflags.h"      // for SpinLockHolder
#include "base/spinlock.h"              // for SpinLockHolder
#include "central_freelist.h"           // for CentralFreeListPadded
#include "maybe_threads.h"

using std::min;
using std::max;

DEFINE_int64(tcmalloc_max_total_thread_cache_bytes,
             EnvToInt64("TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES",
                        kDefaultOverallThreadCacheSize),
             "Bound on the total amount of bytes allocated to "
             "thread caches. This bound is not strict, so it is possible "
             "for the cache to go over this bound in certain circumstances. "
             "Maximum value of this flag is capped to 1 GB.");
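
// Illustrative note (not part of the original source): because the default is
// wrapped in EnvToInt64, the bound can be tuned at process startup with the
// environment variable, e.g.
//   TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES=67108864
// requests roughly 64 MiB across all thread caches, subject to the 1 GB cap
// described above.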

namespace tcmalloc {

static bool phinited = false;

volatile size_t ThreadCache::per_thread_cache_size_ = kMaxThreadCacheSize;
size_t ThreadCache::overall_thread_cache_size_ = kDefaultOverallThreadCacheSize;
ssize_t ThreadCache::unclaimed_cache_space_ = kDefaultOverallThreadCacheSize;
PageHeapAllocator<ThreadCache> threadcache_allocator;
ThreadCache* ThreadCache::thread_heaps_ = NULL;
int ThreadCache::thread_heap_count_ = 0;
ThreadCache* ThreadCache::next_memory_steal_ = NULL;

__thread ThreadCache* ThreadCache::threadlocal_heap_
// See comments in thread_cache.h about this. Bug here:
// http://code.google.com/p/chromium/issues/detail?id=124489
#if defined(HAVE___ATTRIBUTE__) && !defined(PGO_GENERATE)
__attribute__ ((tls_model ("initial-exec")))
#endif
;

bool ThreadCache::tsd_inited_ = false;
pthread_key_t ThreadCache::heap_key_;

bool kernel_supports_tls = false;      // be conservative
# if defined(_WIN32)    // windows has supported TLS since winnt, I think.
void CheckIfKernelSupportsTLS() {
  kernel_supports_tls = true;
}
# elif !HAVE_DECL_UNAME    // if too old for uname, probably too old for TLS
void CheckIfKernelSupportsTLS() {
  kernel_supports_tls = false;
}
# else
#   include <sys/utsname.h>    // DECL_UNAME checked for <sys/utsname.h> too
void CheckIfKernelSupportsTLS() {
  struct utsname buf;
  if (uname(&buf) < 0) {   // should be impossible
    Log(kLog, __FILE__, __LINE__,
        "uname failed assuming no TLS support (errno)", errno);
    kernel_supports_tls = false;
  } else if (strcasecmp(buf.sysname, "linux") == 0) {
    // The linux case: the first kernel to support TLS was 2.6.0
    if (buf.release[0] < '2' && buf.release[1] == '.')    // 0.x or 1.x
      kernel_supports_tls = false;
    else if (buf.release[0] == '2' && buf.release[1] == '.' &&
             buf.release[2] >= '0' && buf.release[2] < '6' &&
             buf.release[3] == '.')                       // 2.0 - 2.5
      kernel_supports_tls = false;
    else
      kernel_supports_tls = true;
  } else if (strcasecmp(buf.sysname, "CYGWIN_NT-6.1-WOW64") == 0) {
    // In my testing, this version of cygwin, at least, would hang
    // when reading the TLS.
    kernel_supports_tls = false;
  } else {        // some other kernel, we'll be optimistic
    kernel_supports_tls = true;
  }
  // TODO(csilvers): VLOG(1) the tls status once we support RAW_VLOG
}
# endif  // HAVE_DECL_UNAME
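
// Illustrative examples of the release-string check above (not in the
// original source): "1.2.13" and "2.4.20" take the "no TLS" branches, while
// "2.6.32" and "3.10.0" fall through to kernel_supports_tls = true.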

void ThreadCache::Init(pthread_t tid) {
  size_ = 0;
  total_bytes_allocated_ = 0;

  max_size_ = 0;
  IncreaseCacheLimitLocked();
  if (max_size_ == 0) {
    // There isn't enough memory to go around.  Just give the minimum to
    // this thread.
    max_size_ = kMinThreadCacheSize;

    // Take unclaimed_cache_space_ negative.
    unclaimed_cache_space_ -= kMinThreadCacheSize;
    ASSERT(unclaimed_cache_space_ < 0);
  }

  next_ = NULL;
  prev_ = NULL;
  tid_  = tid;
  in_setspecific_ = false;
  for (size_t cl = 0; cl < kNumClasses; ++cl) {
    list_[cl].Init();
  }

  uint32_t sampler_seed;
  memcpy(&sampler_seed, &tid, sizeof(sampler_seed));
  sampler_.Init(sampler_seed);
}
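
// Note (not part of the original source): IncreaseCacheLimitLocked() normally
// grants a new thread kStealAmount of budget, either from
// unclaimed_cache_space_ or by stealing from another heap; the max_size_ == 0
// branch above only fires when neither source has anything to give, so the
// kMinThreadCacheSize grant is taken "on credit" and unclaimed_cache_space_
// goes negative.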

void ThreadCache::Cleanup() {
  // Put unused memory back into central cache
  for (int cl = 0; cl < kNumClasses; ++cl) {
    if (list_[cl].length() > 0) {
      ReleaseToCentralCache(&list_[cl], cl, list_[cl].length());
    }
  }
}

// Remove some objects of class "cl" from central cache and add to thread heap.
// On success, return the first object for immediate use; otherwise return NULL.
void* ThreadCache::FetchFromCentralCache(size_t cl, size_t byte_size) {
  FreeList* list = &list_[cl];
  ASSERT(list->empty());
  const int batch_size = Static::sizemap()->num_objects_to_move(cl);

  const int num_to_move = min<int>(list->max_length(), batch_size);
  void *start, *end;
  int fetch_count = Static::central_cache()[cl].RemoveRange(
      &start, &end, num_to_move);

  ASSERT((start == NULL) == (fetch_count == 0));
  if (--fetch_count >= 0) {
    size_ += byte_size * fetch_count;
    // Pop the top of the list and add the rest to the freelist.
    void *second = start;
    start = FL_Pop(&second);
    list->PushRange(fetch_count, second, end);
  }

  // Increase max length slowly up to batch_size.  After that,
  // increase by batch_size in one shot so that the length is a
  // multiple of batch_size.
  if (list->max_length() < batch_size) {
    list->set_max_length(list->max_length() + 1);
  } else {
    // Don't let the list get too long.  In 32 bit builds, the length
    // is represented by a 16 bit int, so we need to watch out for
    // integer overflow.
    int new_length = min<int>(list->max_length() + batch_size,
                              kMaxDynamicFreeListLength);
    // The list's max_length must always be a multiple of batch_size,
    // and kMaxDynamicFreeListLength is not necessarily a multiple
    // of batch_size.
    new_length -= new_length % batch_size;
    ASSERT(new_length % batch_size == 0);
    list->set_max_length(new_length);
  }
  return start;
}
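
// Worked example of the growth rule above (not in the original source): with
// batch_size == 32, max_length grows 1, 2, ..., 32 one element at a time,
// then jumps 32 -> 64 -> 96 -> ... until it reaches kMaxDynamicFreeListLength
// rounded down to a multiple of batch_size.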

void ThreadCache::ListTooLong(FreeList* list, size_t cl) {
  const int batch_size = Static::sizemap()->num_objects_to_move(cl);
  ReleaseToCentralCache(list, cl, batch_size);

  // If the list is too long, we need to transfer some number of
  // objects to the central cache.  Ideally, we would transfer
  // num_objects_to_move, so the code below tries to make max_length
  // converge on num_objects_to_move.

  if (list->max_length() < batch_size) {
    // Slow start the max_length so we don't overreserve.
    list->set_max_length(list->max_length() + 1);
  } else if (list->max_length() > batch_size) {
    // If we consistently go over max_length, shrink max_length.  If we don't
    // shrink it, some amount of memory will always stay in this freelist.
    list->set_length_overages(list->length_overages() + 1);
    if (list->length_overages() > kMaxOverages) {
      ASSERT(list->max_length() > batch_size);
      list->set_max_length(list->max_length() - batch_size);
      list->set_length_overages(0);
    }
  }
}
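
// Illustrative note (not in the original source): a list with
// max_length == 96 and batch_size == 32 that keeps overflowing is trimmed one
// batch at a time (96 -> 64 -> 32) as length_overages passes kMaxOverages;
// shrinking stops once max_length == batch_size.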

// Remove some objects of class "cl" from thread heap and add to central cache
void ThreadCache::ReleaseToCentralCache(FreeList* src, size_t cl, int N) {
  ASSERT(src == &list_[cl]);
  if (N > src->length()) N = src->length();
  size_t delta_bytes = N * Static::sizemap()->ByteSizeForClass(cl);

  // We return prepackaged chains of the correct size to the central cache.
  // TODO: Use the same format internally in the thread caches?
  int batch_size = Static::sizemap()->num_objects_to_move(cl);
  while (N > batch_size) {
    void *tail, *head;
    src->PopRange(batch_size, &head, &tail);
    Static::central_cache()[cl].InsertRange(head, tail, batch_size);
    N -= batch_size;
  }
  void *tail, *head;
  src->PopRange(N, &head, &tail);
  Static::central_cache()[cl].InsertRange(head, tail, N);
  size_ -= delta_bytes;
}
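
// Worked example (not in the original source): releasing N == 70 objects with
// batch_size == 32 hands the central free list two full batches of 32
// followed by a final chain of 6, matching the prepackaged-chain format
// described above.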

// Release idle memory to the central cache
void ThreadCache::Scavenge() {
  // If the low-water mark for the free list is L, it means we would
  // not have had to allocate anything from the central cache even if
  // we had reduced the free list size by L.  We aim to get closer to
  // that situation by dropping L/2 nodes from the free list.  This
  // may not release much memory, but if so we will call scavenge again
  // pretty soon and the low-water marks will be high on that call.
  //int64 start = CycleClock::Now();
  for (int cl = 0; cl < kNumClasses; cl++) {
    FreeList* list = &list_[cl];
    const int lowmark = list->lowwatermark();
    if (lowmark > 0) {
      const int drop = (lowmark > 1) ? lowmark/2 : 1;
      ReleaseToCentralCache(list, cl, drop);

      // Shrink the max length if it isn't used.  Only shrink down to
      // batch_size -- if the thread was active enough to get the max_length
      // above batch_size, it will likely be that active again.  If
      // max_length shrinks below batch_size, the thread will have to
      // go through the slow-start behavior again.  The slow-start is useful
      // mainly for threads that stay relatively idle for their entire
      // lifetime.
      const int batch_size = Static::sizemap()->num_objects_to_move(cl);
      if (list->max_length() > batch_size) {
        list->set_max_length(
            max<int>(list->max_length() - batch_size, batch_size));
      }
    }
    list->clear_lowwatermark();
  }

  IncreaseCacheLimit();
}
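
// Worked example of the low-water-mark policy above (not in the original
// source): if a free list never dipped below 8 objects since the last
// scavenge, lowmark == 8 and 4 objects are returned to the central cache on
// this pass; the rest is only trimmed on later passes if the list stays idle.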

void ThreadCache::IncreaseCacheLimit() {
  SpinLockHolder h(Static::pageheap_lock());
  IncreaseCacheLimitLocked();
}

void ThreadCache::IncreaseCacheLimitLocked() {
  if (unclaimed_cache_space_ > 0) {
    // Possibly make unclaimed_cache_space_ negative.
    unclaimed_cache_space_ -= kStealAmount;
    max_size_ += kStealAmount;
    return;
  }
  // Don't hold pageheap_lock too long.  Try to steal from 10 other
  // threads before giving up.  The i < 10 condition also prevents an
  // infinite loop in case none of the existing thread heaps are
  // suitable places to steal from.
  for (int i = 0; i < 10;
       ++i, next_memory_steal_ = next_memory_steal_->next_) {
    // Reached the end of the linked list.  Start at the beginning.
    if (next_memory_steal_ == NULL) {
      ASSERT(thread_heaps_ != NULL);
      next_memory_steal_ = thread_heaps_;
    }
    if (next_memory_steal_ == this ||
        next_memory_steal_->max_size_ <= kMinThreadCacheSize) {
      continue;
    }
    next_memory_steal_->max_size_ -= kStealAmount;
    max_size_ += kStealAmount;
    next_memory_steal_ = next_memory_steal_->next_;
    return;
  }
}

int ThreadCache::GetSamplePeriod() {
  return sampler_.GetSamplePeriod();
}

unsigned int ThreadCache::GetBytesAllocatedOnCurrentThread() {
  return ThreadCache::GetThreadHeap()->GetTotalBytesAllocated();
}

void ThreadCache::InitModule() {
  SpinLockHolder h(Static::pageheap_lock());
  if (!phinited) {
    Static::InitStaticVars();
    threadcache_allocator.Init();
    phinited = 1;
  }
}

void ThreadCache::InitTSD() {
  ASSERT(!tsd_inited_);
  perftools_pthread_key_create(&heap_key_, DestroyThreadCache);
  tsd_inited_ = true;

#ifdef PTHREADS_CRASHES_IF_RUN_TOO_EARLY
  // We may have used a fake pthread_t for the main thread.  Fix it.
  pthread_t zero;
  memset(&zero, 0, sizeof(zero));
  SpinLockHolder h(Static::pageheap_lock());
  for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
    if (h->tid_ == zero) {
      h->tid_ = pthread_self();
    }
  }
#endif
}

ThreadCache* ThreadCache::CreateCacheIfNecessary() {
  // Initialize per-thread data if necessary
  ThreadCache* heap = NULL;
  {
    SpinLockHolder h(Static::pageheap_lock());
    // On some old glibc's, and on freebsd's libc (as of freebsd 8.1),
    // calling pthread routines (even pthread_self) too early could
    // cause a segfault.  Since we can call pthreads quite early, we
    // have to protect against that in such situations by making a
    // 'fake' pthread.  This is not ideal since it doesn't work well
    // when linking tcmalloc statically with apps that create threads
    // before main, so we only do it if we have to.
#ifdef PTHREADS_CRASHES_IF_RUN_TOO_EARLY
    pthread_t me;
    if (!tsd_inited_) {
      memset(&me, 0, sizeof(me));
    } else {
      me = pthread_self();
    }
#else
    const pthread_t me = pthread_self();
#endif

    // This may be a recursive malloc call from pthread_setspecific()
    // In that case, the heap for this thread has already been created
    // and added to the linked list.  So we search for that first.
    for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
      if (h->tid_ == me) {
        heap = h;
        break;
      }
    }

    if (heap == NULL) heap = NewHeap(me);
  }

  // We call pthread_setspecific() outside the lock because it may
  // call malloc() recursively.  We check for the recursive call using
  // the "in_setspecific_" flag so that we can avoid calling
  // pthread_setspecific() if we are already inside pthread_setspecific().
  if (!heap->in_setspecific_ && tsd_inited_) {
    heap->in_setspecific_ = true;
    perftools_pthread_setspecific(heap_key_, heap);
    // Also keep a copy in __thread for faster retrieval
    threadlocal_heap_ = heap;
    heap->in_setspecific_ = false;
  }
  return heap;
}
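
// Note (not in the original source): under PTHREADS_CRASHES_IF_RUN_TOO_EARLY
// the heap may be registered with a zeroed, fake pthread_t; InitTSD() later
// walks thread_heaps_ and patches such entries with the real pthread_self().
// The pageheap lock is released at the end of the inner scope above before
// pthread_setspecific() runs, because that call may allocate and re-enter
// malloc; the in_setspecific_ flag is what breaks that recursion.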

ThreadCache* ThreadCache::NewHeap(pthread_t tid) {
  // Create the heap and add it to the linked list
  ThreadCache *heap = threadcache_allocator.New();
  heap->Init(tid);
  heap->next_ = thread_heaps_;
  heap->prev_ = NULL;
  if (thread_heaps_ != NULL) {
    thread_heaps_->prev_ = heap;
  } else {
    // This is the only thread heap at the moment.
    ASSERT(next_memory_steal_ == NULL);
    next_memory_steal_ = heap;
  }
  thread_heaps_ = heap;
  thread_heap_count_++;
  return heap;
}

void ThreadCache::BecomeIdle() {
  if (!tsd_inited_) return;              // No caches yet
  ThreadCache* heap = GetThreadHeap();
  if (heap == NULL) return;              // No thread cache to remove
  if (heap->in_setspecific_) return;     // Do not disturb the active caller

  heap->in_setspecific_ = true;
  perftools_pthread_setspecific(heap_key_, NULL);
  // Also update the copy in __thread
  threadlocal_heap_ = NULL;
  heap->in_setspecific_ = false;
  if (GetThreadHeap() == heap) {
    // Somehow heap got reinstated by a recursive call to malloc
    // from pthread_setspecific.  We give up in this case.
    return;
  }

  // We can now get rid of the heap
  DeleteCache(heap);
}

void ThreadCache::DestroyThreadCache(void* ptr) {
  // Note that "ptr" cannot be NULL since pthread promises not
  // to invoke the destructor on NULL values, but for safety,
  // we check anyway.
  if (ptr == NULL) return;
  // Prevent fast path of GetThreadHeap() from returning heap.
  threadlocal_heap_ = NULL;
  DeleteCache(reinterpret_cast<ThreadCache*>(ptr));
}

void ThreadCache::DeleteCache(ThreadCache* heap) {
  // Remove all memory from heap
  heap->Cleanup();

  // Remove from linked list
  SpinLockHolder h(Static::pageheap_lock());
  if (heap->next_ != NULL) heap->next_->prev_ = heap->prev_;
  if (heap->prev_ != NULL) heap->prev_->next_ = heap->next_;
  if (thread_heaps_ == heap) thread_heaps_ = heap->next_;
  thread_heap_count_--;

  if (next_memory_steal_ == heap) next_memory_steal_ = heap->next_;
  if (next_memory_steal_ == NULL) next_memory_steal_ = thread_heaps_;
  unclaimed_cache_space_ += heap->max_size_;

  threadcache_allocator.Delete(heap);
}
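
// Note (not in the original source): returning heap->max_size_ to
// unclaimed_cache_space_ here is what lets surviving threads reclaim a dead
// thread's cache budget through IncreaseCacheLimitLocked().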

void ThreadCache::RecomputePerThreadCacheSize() {
  // Divide available space across threads
  int n = thread_heap_count_ > 0 ? thread_heap_count_ : 1;
  size_t space = overall_thread_cache_size_ / n;

  // Limit to allowed range
  if (space < kMinThreadCacheSize) space = kMinThreadCacheSize;
  if (space > kMaxThreadCacheSize) space = kMaxThreadCacheSize;

  double ratio = space / max<double>(1, per_thread_cache_size_);
  size_t claimed = 0;
  for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
    // Increasing the total cache size should not circumvent the
    // slow-start growth of max_size_.
    if (ratio < 1.0) {
      h->max_size_ = static_cast<size_t>(h->max_size_ * ratio);
    }
    claimed += h->max_size_;
  }
  unclaimed_cache_space_ = overall_thread_cache_size_ - claimed;
  per_thread_cache_size_ = space;
}
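
// Worked example (not in the original source, sizes are illustrative): with a
// 32 MiB overall budget and 8 thread heaps, space is 4 MiB per thread;
// recomputing with 10 heaps gives 3.2 MiB, so ratio < 1.0, every existing
// max_size_ is scaled down, and the difference flows back into
// unclaimed_cache_space_.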

void ThreadCache::GetThreadStats(uint64_t* total_bytes, uint64_t* class_count) {
  for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
    *total_bytes += h->Size();
    if (class_count) {
      for (int cl = 0; cl < kNumClasses; ++cl) {
        class_count[cl] += h->freelist_length(cl);
      }
    }
  }
}

void ThreadCache::set_overall_thread_cache_size(size_t new_size) {
  // Clip the value to a reasonable range
  if (new_size < kMinThreadCacheSize) new_size = kMinThreadCacheSize;
  if (new_size > (1<<30)) new_size = (1<<30);     // Limit to 1GB
  overall_thread_cache_size_ = new_size;

  RecomputePerThreadCacheSize();
}

}  // namespace tcmalloc