1 // Copyright (c) 2008, Google Inc.
2 // All rights reserved.
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
14 // * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 // Author: Sanjay Ghemawat <opensource@google.com>
33 #ifndef TCMALLOC_THREAD_CACHE_H_
34 #define TCMALLOC_THREAD_CACHE_H_
38 #include <pthread.h> // for pthread_t, pthread_key_t
40 #include <stddef.h> // for size_t, NULL
42 #include <stdint.h> // for uint32_t, uint64_t
44 #include <sys/types.h> // for ssize_t
45 #include "common.h" // for SizeMap, kMaxSize, etc
46 #include "free_list.h" // for FL_Pop, FL_PopRange, etc
47 #include "internal_logging.h" // for ASSERT, etc
48 #include "maybe_threads.h"
49 #include "page_heap_allocator.h" // for PageHeapAllocator
50 #include "sampler.h" // for Sampler
51 #include "static_vars.h" // for Static
55 // Even if we have support for thread-local storage in the compiler
56 // and linker, the OS may not support it. We need to check that at
57 // runtime. Right now, we have to keep a manual set of "bad" OSes.
59 extern bool kernel_supports_tls
; // defined in thread_cache.cc
60 void CheckIfKernelSupportsTLS();
61 inline bool KernelSupportsTLS() {
62 return kernel_supports_tls
;
66 //-------------------------------------------------------------------
67 // Data kept per thread
68 //-------------------------------------------------------------------
72 // All ThreadCache objects are kept in a linked list (for stats collection)
76 void Init(pthread_t tid
);
79 // Accessors (mostly just for printing stats)
80 int freelist_length(size_t cl
) const { return list_
[cl
].length(); }
82 // Total byte size in cache
83 size_t Size() const { return size_
; }
85 // Allocate an object of the given size and class. The size given
86 // must be the same as the size of the class in the size map.
87 void* Allocate(size_t size
, size_t cl
);
88 void Deallocate(void* ptr
, size_t size_class
);
92 int GetSamplePeriod();
94 // Record allocation of "k" bytes. Return true iff allocation
96 bool SampleAllocation(size_t k
);
98 // Record additional bytes allocated.
99 void AddToByteAllocatedTotal(size_t k
) { total_bytes_allocated_
+= k
; }
101 // Return the total number of bytes allocated from this heap. The value will
102 // wrap when there is an overflow, and so only the differences between two
103 // values should be relied on (and even then, modulo 2^32).
104 uint32
GetTotalBytesAllocated() const;
106 // On the current thread, return GetTotalBytesAllocated().
107 static uint32
GetBytesAllocatedOnCurrentThread();
109 static void InitModule();
110 static void InitTSD();
111 static ThreadCache
* GetThreadHeap();
112 static ThreadCache
* GetCache();
113 static ThreadCache
* GetCacheIfPresent();
114 static ThreadCache
* CreateCacheIfNecessary();
115 static void BecomeIdle();
117 // Return the number of thread heaps in use.
118 static inline int HeapsInUse();
120 // Writes to total_bytes the total number of bytes used by all thread heaps.
121 // class_count must be an array of size kNumClasses. Writes the number of
122 // items on the corresponding freelist. class_count may be NULL.
123 // The storage of both parameters must be zero initialized.
124 // REQUIRES: Static::pageheap_lock is held.
125 static void GetThreadStats(uint64_t* total_bytes
, uint64_t* class_count
);
127 // Sets the total thread cache size to new_size, recomputing the
128 // individual thread cache sizes as necessary.
129 // REQUIRES: Static::pageheap lock is held.
130 static void set_overall_thread_cache_size(size_t new_size
);
131 static size_t overall_thread_cache_size() {
132 return overall_thread_cache_size_
;
138 void* list_
; // Linked list of nodes
141 // On 64-bit hardware, manipulating 16-bit values may be slightly slow.
142 uint32_t length_
; // Current length.
143 uint32_t lowater_
; // Low water mark for list length.
144 uint32_t max_length_
; // Dynamic max list length based on usage.
145 // Tracks the number of times a deallocation has caused
146 // length_ > max_length_. After the kMaxOverages'th time, max_length_
147 // shrinks and length_overages_ is reset to zero.
148 uint32_t length_overages_
;
150 // If we aren't using 64-bit pointers then pack these into less space.
153 uint16_t max_length_
;
154 uint16_t length_overages_
;
163 length_overages_
= 0;
166 // Return current length of list
167 size_t length() const {
171 // Return the maximum length of the list.
172 size_t max_length() const {
176 // Set the maximum length of the list. If 'new_max' > length(), the
177 // client is responsible for removing objects from the list.
178 void set_max_length(size_t new_max
) {
179 max_length_
= new_max
;
182 // Return the number of times that length() has gone over max_length().
183 size_t length_overages() const {
184 return length_overages_
;
187 void set_length_overages(size_t new_count
) {
188 length_overages_
= new_count
;
193 return list_
== NULL
;
196 // Low-water mark management
197 int lowwatermark() const { return lowater_
; }
198 void clear_lowwatermark() { lowater_
= length_
; }
200 void Push(void* ptr
) {
201 FL_Push(&list_
, ptr
);
206 ASSERT(list_
!= NULL
);
208 if (length_
< lowater_
) lowater_
= length_
;
209 return FL_Pop(&list_
);
213 if (list_
== NULL
) return NULL
;
214 return FL_Next(list_
);
217 void PushRange(int N
, void *start
, void *end
) {
218 FL_PushRange(&list_
, start
, end
);
222 void PopRange(int N
, void **start
, void **end
) {
223 FL_PopRange(&list_
, N
, start
, end
);
224 ASSERT(length_
>= N
);
226 if (length_
< lowater_
) lowater_
= length_
;
230 // Gets and returns an object from the central cache, and, if possible,
231 // also adds some objects of that size class to this thread cache.
232 void* FetchFromCentralCache(size_t cl
, size_t byte_size
);
234 // Releases some number of items from src. Adjusts the list's max_length
235 // to eventually converge on num_objects_to_move(cl).
236 void ListTooLong(FreeList
* src
, size_t cl
);
238 // Releases N items from this thread cache.
239 void ReleaseToCentralCache(FreeList
* src
, size_t cl
, int N
);
241 // Increase max_size_ by reducing unclaimed_cache_space_ or by
242 // reducing the max_size_ of some other thread. In both cases,
243 // the delta is kStealAmount.
244 void IncreaseCacheLimit();
245 // Same as above but requires Static::pageheap_lock() is held.
246 void IncreaseCacheLimitLocked();
248 // If TLS is available, we also store a copy of the per-thread object
249 // in a __thread variable since __thread variables are faster to read
250 // than pthread_getspecific(). We still need pthread_setspecific()
251 // because __thread variables provide no way to run cleanup code when
252 // a thread is destroyed.
253 // We also give a hint to the compiler to use the "initial exec" TLS
254 // model. This is faster than the default TLS model, at the cost that
255 // you cannot dlopen this library. (To see the difference, look at
256 // the CPU use of __tls_get_addr with and without this attribute.)
257 // Since we don't really use dlopen in google code -- and using dlopen
258 // on a malloc replacement is asking for trouble in any case -- that's
259 // a good tradeoff for us.
261 static __thread ThreadCache
* threadlocal_heap_
262 // This code links against pyautolib.so, which causes dlopen() on that shared
263 // object to fail when -fprofile-generate is used with it. Ideally
264 // pyautolib.so should not link against this code. There is a bug filed for
266 // http://code.google.com/p/chromium/issues/detail?id=124489
267 // For now the workaround is to pass in -DPGO_GENERATE when building Chrome
268 // for instrumentation (-fprofile-generate).
269 // For all non-instrumentation builds, this define will not be set and the
270 // performance benefit of "initial-exec" will be achieved.
271 #if defined(HAVE___ATTRIBUTE__) && !defined(PGO_GENERATE)
272 __attribute__ ((tls_model ("initial-exec")))
277 // Thread-specific key. Initialization here is somewhat tricky
278 // because some Linux startup code invokes malloc() before it
279 // is in a good enough state to handle pthread_keycreate().
280 // Therefore, we use TSD keys only after tsd_inited is set to true.
281 // Until then, we use a slow path to get the heap object.
282 static bool tsd_inited_
;
283 static pthread_key_t heap_key_
;
285 // Linked list of heap objects. Protected by Static::pageheap_lock.
286 static ThreadCache
* thread_heaps_
;
287 static int thread_heap_count_
;
289 // A pointer to one of the objects in thread_heaps_. Represents
290 // the next ThreadCache from which a thread over its max_size_ should
291 // steal memory limit. Round-robin through all of the objects in
292 // thread_heaps_. Protected by Static::pageheap_lock.
293 static ThreadCache
* next_memory_steal_
;
295 // Overall thread cache size. Protected by Static::pageheap_lock.
296 static size_t overall_thread_cache_size_
;
298 // Global per-thread cache size. Writes are protected by
299 // Static::pageheap_lock. Reads are done without any locking, which should be
300 // fine as long as size_t can be written atomically and we don't place
301 // invariants between this variable and other pieces of state.
302 static volatile size_t per_thread_cache_size_
;
304 // Represents overall_thread_cache_size_ minus the sum of max_size_
305 // across all ThreadCaches. Protected by Static::pageheap_lock.
306 static ssize_t unclaimed_cache_space_
;
308 // This class is laid out with the most frequently used fields
309 // first so that hot elements are placed on the same cache line.
311 size_t size_
; // Combined size of data
312 size_t max_size_
; // size_ > max_size_ --> Scavenge()
314 // The following is the tally of bytes allocated on a thread as a response to
315 // any flavor of malloc() call. The aggregated amount includes all padding to
316 // the smallest class that can hold the request, or to the nearest whole page
317 // when a large allocation is made without using a class. This sum is
318 // currently used for Chromium profiling, where tallies are kept of the amount
319 // of memory allocated during the running of each task on each thread.
320 uint32 total_bytes_allocated_
; // Total, modulo 2^32.
322 // We sample allocations, biased by the size of the allocation
323 Sampler sampler_
; // A sampler
325 FreeList list_
[kNumClasses
]; // Array indexed by size-class
327 pthread_t tid_
; // Which thread owns it
328 bool in_setspecific_
; // In call to pthread_setspecific?
330 // Allocate a new heap. REQUIRES: Static::pageheap_lock is held.
331 static ThreadCache
* NewHeap(pthread_t tid
);
333 // Use only as pthread thread-specific destructor function.
334 static void DestroyThreadCache(void* ptr
);
336 static void DeleteCache(ThreadCache
* heap
);
337 static void RecomputePerThreadCacheSize();
339 // Ensure that this class is cacheline-aligned. This is critical for
340 // performance, as false sharing would negate many of the benefits
341 // of a per-thread cache.
344 // Allocator for thread heaps
345 // This is logically part of the ThreadCache class, but MSVC, at
346 // least, does not like using ThreadCache as a template argument
347 // before the class is fully defined. So we put it outside the class.
348 extern PageHeapAllocator
<ThreadCache
> threadcache_allocator
;
350 inline int ThreadCache::HeapsInUse() {
351 return threadcache_allocator
.inuse();
354 inline bool ThreadCache::SampleAllocation(size_t k
) {
355 return sampler_
.SampleAllocation(k
);
358 inline uint32
ThreadCache::GetTotalBytesAllocated() const {
359 return total_bytes_allocated_
;
362 inline void* ThreadCache::Allocate(size_t size
, size_t cl
) {
363 ASSERT(size
<= kMaxSize
);
364 ASSERT(size
== Static::sizemap()->ByteSizeForClass(cl
));
366 FreeList
* list
= &list_
[cl
];
368 return FetchFromCentralCache(cl
, size
);
374 inline void ThreadCache::Deallocate(void* ptr
, size_t cl
) {
375 FreeList
* list
= &list_
[cl
];
376 size_
+= Static::sizemap()->ByteSizeForClass(cl
);
377 ssize_t size_headroom
= max_size_
- size_
- 1;
379 // This catches back-to-back frees of allocs in the same size
380 // class. A more comprehensive (and expensive) test would be to walk
381 // the entire freelist. But this might be enough to find some bugs.
382 ASSERT(ptr
!= list
->Next());
385 ssize_t list_headroom
=
386 static_cast<ssize_t
>(list
->max_length()) - list
->length();
388 // There are two relatively uncommon things that require further work.
389 // In the common case we're done, and in that case we need a single branch
390 // because of the bitwise-or trick that follows.
391 if ((list_headroom
| size_headroom
) < 0) {
392 if (list_headroom
< 0) {
393 ListTooLong(list
, cl
);
395 if (size_
>= max_size_
) Scavenge();
399 inline ThreadCache
* ThreadCache::GetThreadHeap() {
401 // __thread is faster, but only when the kernel supports it
402 if (KernelSupportsTLS())
403 return threadlocal_heap_
;
405 return reinterpret_cast<ThreadCache
*>(
406 perftools_pthread_getspecific(heap_key_
));
409 inline ThreadCache
* ThreadCache::GetCache() {
410 ThreadCache
* ptr
= NULL
;
414 ptr
= GetThreadHeap();
416 if (ptr
== NULL
) ptr
= CreateCacheIfNecessary();
420 // In deletion paths, we do not try to create a thread-cache. This is
421 // because we may be in the thread destruction code and may have
422 // already cleaned up the cache for this thread.
423 inline ThreadCache
* ThreadCache::GetCacheIfPresent() {
424 if (!tsd_inited_
) return NULL
;
425 return GetThreadHeap();
428 } // namespace tcmalloc
430 #endif // TCMALLOC_THREAD_CACHE_H_