// Copyright (c) 2008, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// ---
// Author: Ken Ashcraft <opensource@google.com>

#include <config.h>
#include "thread_cache.h"

#include <errno.h>                      // for errno
#include <string.h>                     // for memcpy
#include <algorithm>                    // for max, min
#include "base/commandlineflags.h"      // for DEFINE_int64, EnvToInt64
#include "base/spinlock.h"              // for SpinLockHolder
#include "central_freelist.h"           // for CentralFreeListPadded
#include "maybe_threads.h"

using std::min;
using std::max;

DEFINE_int64(tcmalloc_max_total_thread_cache_bytes,
             EnvToInt64("TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES",
                        kDefaultOverallThreadCacheSize),
             "Bound on the total amount of bytes allocated to "
             "thread caches. This bound is not strict, so it is possible "
             "for the cache to go over this bound in certain circumstances. "
             "Maximum value of this flag is capped to 1 GB.");

namespace tcmalloc {

static bool phinited = false;

volatile size_t ThreadCache::per_thread_cache_size_ = kMaxThreadCacheSize;
size_t ThreadCache::overall_thread_cache_size_ = kDefaultOverallThreadCacheSize;
ssize_t ThreadCache::unclaimed_cache_space_ = kDefaultOverallThreadCacheSize;
PageHeapAllocator<ThreadCache> threadcache_allocator;
ThreadCache* ThreadCache::thread_heaps_ = NULL;
int ThreadCache::thread_heap_count_ = 0;
ThreadCache* ThreadCache::next_memory_steal_ = NULL;
#ifdef HAVE_TLS
__thread ThreadCache* ThreadCache::threadlocal_heap_
# ifdef HAVE___ATTRIBUTE__
   __attribute__ ((tls_model ("initial-exec")))
# endif
   ;
#endif
bool ThreadCache::tsd_inited_ = false;
pthread_key_t ThreadCache::heap_key_;

#if defined(HAVE_TLS)
bool kernel_supports_tls = false;      // be conservative
# if defined(_WIN32)    // windows has supported TLS since winnt, I think.
  void CheckIfKernelSupportsTLS() {
    kernel_supports_tls = true;
  }
# elif !HAVE_DECL_UNAME    // if too old for uname, probably too old for TLS
  void CheckIfKernelSupportsTLS() {
    kernel_supports_tls = false;
  }
# else
#   include <sys/utsname.h>    // DECL_UNAME checked for <sys/utsname.h> too
  void CheckIfKernelSupportsTLS() {
    struct utsname buf;
    if (uname(&buf) < 0) {   // should be impossible
      Log(kLog, __FILE__, __LINE__,
          "uname failed assuming no TLS support (errno)", errno);
      kernel_supports_tls = false;
    } else if (strcasecmp(buf.sysname, "linux") == 0) {
      // The linux case: the first kernel to support TLS was 2.6.0
      if (buf.release[0] < '2' && buf.release[1] == '.')    // 0.x or 1.x
        kernel_supports_tls = false;
      else if (buf.release[0] == '2' && buf.release[1] == '.' &&
               buf.release[2] >= '0' && buf.release[2] < '6' &&
               buf.release[3] == '.')                       // 2.0 - 2.5
        kernel_supports_tls = false;
      else
        kernel_supports_tls = true;
    } else if (strcasecmp(buf.sysname, "CYGWIN_NT-6.1-WOW64") == 0) {
      // In my testing, this version of cygwin, at least, would hang
      // when using TLS.
      kernel_supports_tls = false;
    } else {        // some other kernel, we'll be optimistic
      kernel_supports_tls = true;
    }
    // TODO(csilvers): VLOG(1) the tls status once we support RAW_VLOG
  }
# endif  // HAVE_DECL_UNAME
#endif   // HAVE_TLS

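// Per-thread initialization: claim an initial max_size_ budget from the
// shared pool, reset every per-class free list, and seed the allocation
// sampler from the thread id.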
void ThreadCache::Init(pthread_t tid) {
  size_ = 0;

  max_size_ = 0;
  IncreaseCacheLimitLocked();
  if (max_size_ == 0) {
    // There isn't enough memory to go around.  Just give the minimum to
    // this thread.
    max_size_ = kMinThreadCacheSize;

    // Take unclaimed_cache_space_ negative.
    unclaimed_cache_space_ -= kMinThreadCacheSize;
    ASSERT(unclaimed_cache_space_ < 0);
  }

  next_ = NULL;
  prev_ = NULL;
  tid_  = tid;
  in_setspecific_ = false;
  for (size_t cl = 0; cl < kNumClasses; ++cl) {
    list_[cl].Init();
  }

  uint32_t sampler_seed;
  memcpy(&sampler_seed, &tid, sizeof(sampler_seed));
  sampler_.Init(sampler_seed);
}

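// Tear-down helper used by DeleteCache(): return everything still sitting on
// the per-class free lists to the central cache.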
void ThreadCache::Cleanup() {
  // Put unused memory back into central cache
  for (int cl = 0; cl < kNumClasses; ++cl) {
    if (list_[cl].length() > 0) {
      ReleaseToCentralCache(&list_[cl], cl, list_[cl].length());
    }
  }
}

// Remove some objects of class "cl" from central cache and add to thread heap.
// On success, return the first object for immediate use; otherwise return NULL.
void* ThreadCache::FetchFromCentralCache(size_t cl, size_t byte_size) {
  FreeList* list = &list_[cl];
  ASSERT(list->empty());
  const int batch_size = Static::sizemap()->num_objects_to_move(cl);

  const int num_to_move = min<int>(list->max_length(), batch_size);
  void *start, *end;
  int fetch_count = Static::central_cache()[cl].RemoveRange(
      &start, &end, num_to_move);

  ASSERT((start == NULL) == (fetch_count == 0));
  if (--fetch_count >= 0) {
    size_ += byte_size * fetch_count;
    list->PushRange(fetch_count, SLL_Next(start), end);
  }

  // Increase max length slowly up to batch_size.  After that,
  // increase by batch_size in one shot so that the length is a
  // multiple of batch_size.
  if (list->max_length() < batch_size) {
    list->set_max_length(list->max_length() + 1);
  } else {
    // Don't let the list get too long.  In 32 bit builds, the length
    // is represented by a 16 bit int, so we need to watch out for
    // integer overflow.
    int new_length = min<int>(list->max_length() + batch_size,
                              kMaxDynamicFreeListLength);
    // The list's max_length must always be a multiple of batch_size,
    // and kMaxDynamicFreeListLength is not necessarily a multiple
    // of batch_size.
    new_length -= new_length % batch_size;
    ASSERT(new_length % batch_size == 0);
    list->set_max_length(new_length);
  }
  return start;
}

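// Called from the deallocation slow path when the free list for class "cl"
// has grown past its max_length.  Releases one batch back to the central
// cache, then adapts max_length: grow it by 1 while it is still below
// batch_size (slow start), and shrink it by one batch once the list has
// exceeded its limit more than kMaxOverages times.  For example, with a
// batch_size of 32, a fresh list's limit climbs 1, 2, 3, ... up to 32, while
// a persistently overflowing list steps back down 32 at a time.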
void ThreadCache::ListTooLong(FreeList* list, size_t cl) {
  const int batch_size = Static::sizemap()->num_objects_to_move(cl);
  ReleaseToCentralCache(list, cl, batch_size);

  // If the list is too long, we need to transfer some number of
  // objects to the central cache.  Ideally, we would transfer
  // num_objects_to_move, so the code below tries to make max_length
  // converge on num_objects_to_move.

  if (list->max_length() < batch_size) {
    // Slow start the max_length so we don't overreserve.
    list->set_max_length(list->max_length() + 1);
  } else if (list->max_length() > batch_size) {
    // If we consistently go over max_length, shrink max_length.  If we don't
    // shrink it, some amount of memory will always stay in this freelist.
    list->set_length_overages(list->length_overages() + 1);
    if (list->length_overages() > kMaxOverages) {
      ASSERT(list->max_length() > batch_size);
      list->set_max_length(list->max_length() - batch_size);
      list->set_length_overages(0);
    }
  }
}

// Remove some objects of class "cl" from thread heap and add to central cache
void ThreadCache::ReleaseToCentralCache(FreeList* src, size_t cl, int N) {
  ASSERT(src == &list_[cl]);
  if (N > src->length()) N = src->length();
  size_t delta_bytes = N * Static::sizemap()->ByteSizeForClass(cl);

  // We return prepackaged chains of the correct size to the central cache.
  // TODO: Use the same format internally in the thread caches?
  int batch_size = Static::sizemap()->num_objects_to_move(cl);
  while (N > batch_size) {
    void *tail, *head;
    src->PopRange(batch_size, &head, &tail);
    Static::central_cache()[cl].InsertRange(head, tail, batch_size);
    N -= batch_size;
  }
  void *tail, *head;
  src->PopRange(N, &head, &tail);
  Static::central_cache()[cl].InsertRange(head, tail, N);
  size_ -= delta_bytes;
}

// Release idle memory to the central cache
void ThreadCache::Scavenge() {
  // If the low-water mark for the free list is L, it means we would
  // not have had to allocate anything from the central cache even if
  // we had reduced the free list size by L.  We aim to get closer to
  // that situation by dropping L/2 nodes from the free list.  This
  // may not release much memory, but if so we will call scavenge again
  // pretty soon and the low-water marks will be high on that call.
  //int64 start = CycleClock::Now();
  for (int cl = 0; cl < kNumClasses; cl++) {
    FreeList* list = &list_[cl];
    const int lowmark = list->lowwatermark();
    if (lowmark > 0) {
      const int drop = (lowmark > 1) ? lowmark/2 : 1;
      ReleaseToCentralCache(list, cl, drop);

      // Shrink the max length if it isn't used.  Only shrink down to
      // batch_size -- if the thread was active enough to get the max_length
      // above batch_size, it will likely be that active again.  If
      // max_length shrinks below batch_size, the thread will have to
      // go through the slow-start behavior again.  The slow-start is useful
      // mainly for threads that stay relatively idle for their entire
      // lifetime.
      const int batch_size = Static::sizemap()->num_objects_to_move(cl);
      if (list->max_length() > batch_size) {
        list->set_max_length(
            max<int>(list->max_length() - batch_size, batch_size));
      }
    }
    list->clear_lowwatermark();
  }

  IncreaseCacheLimit();
}

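// Grab the pageheap lock and try to grow this thread's max_size_ budget.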
void ThreadCache::IncreaseCacheLimit() {
  SpinLockHolder h(Static::pageheap_lock());
  IncreaseCacheLimitLocked();
}

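// With the pageheap lock held, grant this thread kStealAmount more budget:
// take it from the unclaimed pool if any remains, otherwise steal it,
// round-robin, from another thread whose cache is above the minimum size.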
void ThreadCache::IncreaseCacheLimitLocked() {
  if (unclaimed_cache_space_ > 0) {
    // Possibly make unclaimed_cache_space_ negative.
    unclaimed_cache_space_ -= kStealAmount;
    max_size_ += kStealAmount;
    return;
  }
  // Don't hold pageheap_lock too long.  Try to steal from 10 other
  // threads before giving up.  The i < 10 condition also prevents an
  // infinite loop in case none of the existing thread heaps are
  // suitable places to steal from.
  for (int i = 0; i < 10;
       ++i, next_memory_steal_ = next_memory_steal_->next_) {
    // Reached the end of the linked list.  Start at the beginning.
    if (next_memory_steal_ == NULL) {
      ASSERT(thread_heaps_ != NULL);
      next_memory_steal_ = thread_heaps_;
    }
    if (next_memory_steal_ == this ||
        next_memory_steal_->max_size_ <= kMinThreadCacheSize) {
      continue;
    }
    next_memory_steal_->max_size_ -= kStealAmount;
    max_size_ += kStealAmount;

    next_memory_steal_ = next_memory_steal_->next_;
    return;
  }
}

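// Report the sampling period of the allocation sampler seeded in Init().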
int ThreadCache::GetSamplePeriod() {
  return sampler_.GetSamplePeriod();
}

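// One-time, lock-protected setup of the shared tcmalloc state and of the
// allocator that hands out ThreadCache objects.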
void ThreadCache::InitModule() {
  SpinLockHolder h(Static::pageheap_lock());
  if (!phinited) {
    Static::InitStaticVars();
    threadcache_allocator.Init();
    phinited = 1;
  }
}

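// Create the pthread TSD key whose destructor (DestroyThreadCache) reclaims a
// thread's cache when the thread exits.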
void ThreadCache::InitTSD() {
  ASSERT(!tsd_inited_);
  perftools_pthread_key_create(&heap_key_, DestroyThreadCache);
  tsd_inited_ = true;

#ifdef PTHREADS_CRASHES_IF_RUN_TOO_EARLY
  // We may have used a fake pthread_t for the main thread.  Fix it.
  pthread_t zero;
  memset(&zero, 0, sizeof(zero));
  SpinLockHolder h(Static::pageheap_lock());
  for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
    if (h->tid_ == zero) {
      h->tid_ = pthread_self();
    }
  }
#endif
}

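// Look up the calling thread's heap in the global list, creating one if it
// does not exist yet, and publish it through pthread_setspecific (plus the
// __thread copy when TLS is available).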
ThreadCache* ThreadCache::CreateCacheIfNecessary() {
  // Initialize per-thread data if necessary
  ThreadCache* heap = NULL;
  {
    SpinLockHolder h(Static::pageheap_lock());
    // On some old glibc's, and on freebsd's libc (as of freebsd 8.1),
    // calling pthread routines (even pthread_self) too early could
    // cause a segfault.  Since we can call pthreads quite early, we
    // have to protect against that in such situations by making a
    // 'fake' pthread.  This is not ideal since it doesn't work well
    // when linking tcmalloc statically with apps that create threads
    // before main, so we only do it if we have to.
#ifdef PTHREADS_CRASHES_IF_RUN_TOO_EARLY
    pthread_t me;
    if (!tsd_inited_) {
      memset(&me, 0, sizeof(me));
    } else {
      me = pthread_self();
    }
#else
    const pthread_t me = pthread_self();
#endif

    // This may be a recursive malloc call from pthread_setspecific()
    // In that case, the heap for this thread has already been created
    // and added to the linked list.  So we search for that first.
    for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
      if (h->tid_ == me) {
        heap = h;
        break;
      }
    }

    if (heap == NULL) heap = NewHeap(me);
  }

  // We call pthread_setspecific() outside the lock because it may
  // call malloc() recursively.  We check for the recursive call using
  // the "in_setspecific_" flag so that we can avoid calling
  // pthread_setspecific() if we are already inside pthread_setspecific().
  if (!heap->in_setspecific_ && tsd_inited_) {
    heap->in_setspecific_ = true;
    perftools_pthread_setspecific(heap_key_, heap);
#ifdef HAVE_TLS
    // Also keep a copy in __thread for faster retrieval
    threadlocal_heap_ = heap;
#endif
    heap->in_setspecific_ = false;
  }
  return heap;
}

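// Allocate and initialize a ThreadCache for thread "tid" and push it onto the
// front of the global thread_heaps_ list; expects the pageheap lock to be
// held by the caller.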
ThreadCache* ThreadCache::NewHeap(pthread_t tid) {
  // Create the heap and add it to the linked list
  ThreadCache *heap = threadcache_allocator.New();
  heap->Init(tid);
  heap->next_ = thread_heaps_;
  heap->prev_ = NULL;
  if (thread_heaps_ != NULL) {
    thread_heaps_->prev_ = heap;
  } else {
    // This is the only thread heap at the moment.
    ASSERT(next_memory_steal_ == NULL);
    next_memory_steal_ = heap;
  }
  thread_heaps_ = heap;
  thread_heap_count_++;
  return heap;
}

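// Called when a thread is about to go idle: detach its cache and delete it so
// the memory can be reclaimed, unless a recursive malloc triggered by
// pthread_setspecific reinstates the heap first.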
void ThreadCache::BecomeIdle() {
  if (!tsd_inited_) return;              // No caches yet
  ThreadCache* heap = GetThreadHeap();
  if (heap == NULL) return;             // No thread cache to remove
  if (heap->in_setspecific_) return;    // Do not disturb the active caller

  heap->in_setspecific_ = true;
  perftools_pthread_setspecific(heap_key_, NULL);
#ifdef HAVE_TLS
  // Also update the copy in __thread
  threadlocal_heap_ = NULL;
#endif
  heap->in_setspecific_ = false;
  if (GetThreadHeap() == heap) {
    // Somehow heap got reinstated by a recursive call to malloc
    // from pthread_setspecific.  We give up in this case.
    return;
  }

  // We can now get rid of the heap
  DeleteCache(heap);
}

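// TSD destructor registered by InitTSD(); receives the exiting thread's heap
// pointer.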
void ThreadCache::DestroyThreadCache(void* ptr) {
  // Note that "ptr" cannot be NULL since pthread promises not
  // to invoke the destructor on NULL values, but for safety,
  // we check anyway.
  if (ptr == NULL) return;
#ifdef HAVE_TLS
  // Prevent fast path of GetThreadHeap() from returning heap.
  threadlocal_heap_ = NULL;
#endif
  DeleteCache(reinterpret_cast<ThreadCache*>(ptr));
}

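// Fully dismantle a cache: flush its free lists, unlink it from the global
// list, hand its max_size_ budget back to the unclaimed pool, and release the
// ThreadCache object itself.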
void ThreadCache::DeleteCache(ThreadCache* heap) {
  // Remove all memory from heap
  heap->Cleanup();

  // Remove from linked list
  SpinLockHolder h(Static::pageheap_lock());
  if (heap->next_ != NULL) heap->next_->prev_ = heap->prev_;
  if (heap->prev_ != NULL) heap->prev_->next_ = heap->next_;
  if (thread_heaps_ == heap) thread_heaps_ = heap->next_;
  thread_heap_count_--;

  if (next_memory_steal_ == heap) next_memory_steal_ = heap->next_;
  if (next_memory_steal_ == NULL) next_memory_steal_ = thread_heaps_;
  unclaimed_cache_space_ += heap->max_size_;

  threadcache_allocator.Delete(heap);
}

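// Recompute the per-thread budget after the thread count or the overall limit
// changes.  Existing caches are only ever scaled down (ratio < 1.0), so
// raising the overall limit cannot bypass the slow-start growth of max_size_;
// for example, halving the per-thread target halves every existing max_size_.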
void ThreadCache::RecomputePerThreadCacheSize() {
  // Divide available space across threads
  int n = thread_heap_count_ > 0 ? thread_heap_count_ : 1;
  size_t space = overall_thread_cache_size_ / n;

  // Limit to allowed range
  if (space < kMinThreadCacheSize) space = kMinThreadCacheSize;
  if (space > kMaxThreadCacheSize) space = kMaxThreadCacheSize;

  double ratio = space / max<double>(1, per_thread_cache_size_);
  size_t claimed = 0;
  for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
    // Increasing the total cache size should not circumvent the
    // slow-start growth of max_size_.
    if (ratio < 1.0) {
      h->max_size_ = static_cast<size_t>(h->max_size_ * ratio);
    }
    claimed += h->max_size_;
  }
  unclaimed_cache_space_ = overall_thread_cache_size_ - claimed;
  per_thread_cache_size_ = space;
}

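// Sum statistics over all live thread caches: total cached bytes and, when
// class_count is non-NULL, the per-size-class free list lengths.  Walks
// thread_heaps_ without locking, so the caller is expected to hold the
// pageheap lock.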
void ThreadCache::GetThreadStats(uint64_t* total_bytes, uint64_t* class_count) {
  for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
    *total_bytes += h->Size();
    if (class_count) {
      for (int cl = 0; cl < kNumClasses; ++cl) {
        class_count[cl] += h->freelist_length(cl);
      }
    }
  }
}

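// Set the process-wide thread cache budget, clipped to
// [kMinThreadCacheSize, 1GB], and redistribute it across existing threads.
// A minimal usage sketch, assuming the gperftools MallocExtension API:
//   MallocExtension::instance()->SetNumericProperty(
//       "tcmalloc.max_total_thread_cache_bytes", 64 << 20);
// which is expected to eventually funnel into this setter.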
void ThreadCache::set_overall_thread_cache_size(size_t new_size) {
  // Clip the value to a reasonable range
  if (new_size < kMinThreadCacheSize) new_size = kMinThreadCacheSize;
  if (new_size > (1<<30)) new_size = (1<<30);     // Limit to 1GB
  overall_thread_cache_size_ = new_size;

  RecomputePerThreadCacheSize();
}

}  // namespace tcmalloc