// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2023 Red Hat
 */

#include <linux/delay.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

#include "logger.h"
#include "memory-alloc.h"
#include "permassert.h"

/*
 * UDS and VDO keep track of which threads are allowed to allocate memory freely, and which
 * threads must be careful not to do a memory allocation that does an I/O request. The
 * 'allocating_threads' thread_registry and its associated methods implement this tracking.
 */
static struct thread_registry allocating_threads;

static inline bool allocations_allowed(void)
{
	return vdo_lookup_thread(&allocating_threads) != NULL;
}

/*
 * Register the current thread as an allocating thread.
 *
 * An optional flag location can be supplied indicating whether, at any given point in time, the
 * threads associated with that flag should be allocating storage. If the flag is false, a message
 * will be logged.
 *
 * If no flag is supplied, the thread is always allowed to allocate storage without complaint.
 *
 * @new_thread: registered_thread structure to use for the current thread
 * @flag_ptr: Location of the allocation-allowed flag
 */
void vdo_register_allocating_thread(struct registered_thread *new_thread,
				    const bool *flag_ptr)
{
	if (flag_ptr == NULL) {
		static const bool allocation_always_allowed = true;

		flag_ptr = &allocation_always_allowed;
	}

	vdo_register_thread(&allocating_threads, new_thread, flag_ptr);
}

/* Unregister the current thread as an allocating thread. */
void vdo_unregister_allocating_thread(void)
{
	vdo_unregister_thread(&allocating_threads);
}

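/*
 * A minimal usage sketch (illustrative only; the 'thread' variable below is hypothetical and not
 * part of this file): a thread that is allowed to allocate freely registers itself before its
 * allocations and unregisters when it is done. Threads that never register are treated as
 * restricted, and vdo_allocate_memory() wraps their allocations in memalloc_noio_save()/restore()
 * so that the allocation cannot recurse into I/O.
 *
 *	static struct registered_thread thread;
 *
 *	vdo_register_allocating_thread(&thread, NULL);
 *	... calls to vdo_allocate() ...
 *	vdo_unregister_allocating_thread();
 */
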
/*
 * We track how much memory has been allocated and freed. When we unload the module, we log an
 * error if we have not freed all the memory that we allocated. Nearly all memory allocation and
 * freeing is done using this module.
 *
 * We do not use kernel functions like the kvasprintf() method, which allocate memory indirectly
 * using kmalloc.
 *
 * These data structures and methods are used to track the amount of memory used.
 */

/*
 * We allocate very few large objects, and allocation/deallocation isn't done in a
 * performance-critical stage for us, so a linked list should be fine.
 */
struct vmalloc_block_info {
	void *ptr;
	size_t size;
	struct vmalloc_block_info *next;
};

static struct {
	spinlock_t lock;
	size_t kmalloc_blocks;
	size_t kmalloc_bytes;
	size_t vmalloc_blocks;
	size_t vmalloc_bytes;
	size_t peak_bytes;
	struct vmalloc_block_info *vmalloc_list;
} memory_stats __cacheline_aligned;

static void update_peak_usage(void)
{
	size_t total_bytes = memory_stats.kmalloc_bytes + memory_stats.vmalloc_bytes;

	if (total_bytes > memory_stats.peak_bytes)
		memory_stats.peak_bytes = total_bytes;
}

static void add_kmalloc_block(size_t size)
{
	unsigned long flags;

	spin_lock_irqsave(&memory_stats.lock, flags);
	memory_stats.kmalloc_blocks++;
	memory_stats.kmalloc_bytes += size;
	update_peak_usage();
	spin_unlock_irqrestore(&memory_stats.lock, flags);
}

static void remove_kmalloc_block(size_t size)
{
	unsigned long flags;

	spin_lock_irqsave(&memory_stats.lock, flags);
	memory_stats.kmalloc_blocks--;
	memory_stats.kmalloc_bytes -= size;
	spin_unlock_irqrestore(&memory_stats.lock, flags);
}

static void add_vmalloc_block(struct vmalloc_block_info *block)
{
	unsigned long flags;

	spin_lock_irqsave(&memory_stats.lock, flags);
	block->next = memory_stats.vmalloc_list;
	memory_stats.vmalloc_list = block;
	memory_stats.vmalloc_blocks++;
	memory_stats.vmalloc_bytes += block->size;
	update_peak_usage();
	spin_unlock_irqrestore(&memory_stats.lock, flags);
}

static void remove_vmalloc_block(void *ptr)
{
	struct vmalloc_block_info *block;
	struct vmalloc_block_info **block_ptr;
	unsigned long flags;

	spin_lock_irqsave(&memory_stats.lock, flags);
	for (block_ptr = &memory_stats.vmalloc_list;
	     (block = *block_ptr) != NULL;
	     block_ptr = &block->next) {
		if (block->ptr == ptr) {
			*block_ptr = block->next;
			memory_stats.vmalloc_blocks--;
			memory_stats.vmalloc_bytes -= block->size;
			break;
		}
	}

	spin_unlock_irqrestore(&memory_stats.lock, flags);
	if (block != NULL)
		vdo_free(block);
	else
		vdo_log_info("attempting to remove ptr %px not found in vmalloc list", ptr);
}

/*
 * Determine whether allocating a memory block should use kmalloc or __vmalloc.
 *
 * vmalloc can allocate any integral number of pages.
 *
 * kmalloc can allocate any number of bytes up to a configured limit, which defaults to 8 megabytes
 * on some systems. kmalloc is especially good when memory is being both allocated and freed, and
 * it does this efficiently in a multi CPU environment.
 *
 * kmalloc usually rounds the size of the block up to the next power of two, so when the requested
 * block is bigger than PAGE_SIZE / 2 bytes, kmalloc will never give you less space than the
 * corresponding vmalloc allocation. Sometimes vmalloc will use less overhead than kmalloc.
 *
 * The advantages of kmalloc do not help out UDS or VDO, because we allocate all our memory up
 * front and do not free and reallocate it. Sometimes we have problems using kmalloc, because the
 * Linux memory page map can become so fragmented that kmalloc will not give us a 32KB chunk. We
 * have used vmalloc as a backup to kmalloc in the past, and a follow-up vmalloc of 32KB will work.
 * But there is no strong case to be made for using kmalloc over vmalloc for these size chunks.
 *
 * The kmalloc/vmalloc boundary is set at 4KB, and kmalloc gets the 4KB requests. There is no
 * strong reason for favoring either kmalloc or vmalloc for 4KB requests, except that tracking
 * vmalloc statistics uses a linked list implementation. Using a simple test, this choice of
 * boundary results in 132 vmalloc calls. Using vmalloc for requests of exactly 4KB results in an
 * additional 6374 vmalloc calls, which is much less efficient for tracking.
 *
 * @size: How many bytes to allocate
 */
static inline bool use_kmalloc(size_t size)
{
	return size <= PAGE_SIZE;
}

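/*
 * A worked example of the boundary above (illustrative, assuming 4KB pages and the default
 * alignment used by vdo_allocate()): a 4096-byte request satisfies use_kmalloc() and is served by
 * kmalloc(), with ksize() of the returned block recorded in the kmalloc statistics; a 4097-byte
 * request falls through to __vmalloc(), and its tracked size is rounded up to
 * PAGE_ALIGN(4097) = 8192 bytes.
 */
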
/*
 * Allocate storage based on memory size and alignment, logging an error if the allocation fails.
 * The memory will be zeroed.
 *
 * @size: The size of an object
 * @align: The required alignment
 * @what: What is being allocated (for error logging)
 * @ptr: A pointer to hold the allocated memory
 *
 * Return: VDO_SUCCESS or an error code
 */
int vdo_allocate_memory(size_t size, size_t align, const char *what, void *ptr)
{
	/*
	 * The __GFP_RETRY_MAYFAIL flag means the VM implementation will retry memory reclaim
	 * procedures that have previously failed if there is some indication that progress has
	 * been made elsewhere. It can wait for other tasks to attempt high-level approaches to
	 * freeing memory such as compaction (which removes fragmentation) and page-out. There is
	 * still a definite limit to the number of retries, but it is a larger limit than with
	 * __GFP_NORETRY. Allocations with this flag may fail, but only when there is genuinely
	 * little unused memory. While these allocations do not directly trigger the OOM killer,
	 * their failure indicates that the system is likely to need to use the OOM killer soon.
	 * The caller must handle failure, but can reasonably do so by failing a higher-level
	 * request, or completing it only in a much less efficient manner.
	 */
	const gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL;
	unsigned int noio_flags;
	bool allocations_restricted = !allocations_allowed();
	unsigned long start_time;
	void *p = NULL;

	if (unlikely(ptr == NULL))
		return -EINVAL;

	if (size == 0) {
		*((void **) ptr) = NULL;
		return VDO_SUCCESS;
	}

	if (allocations_restricted)
		noio_flags = memalloc_noio_save();

	start_time = jiffies;
	if (use_kmalloc(size) && (align < PAGE_SIZE)) {
		p = kmalloc(size, gfp_flags | __GFP_NOWARN);
		if (p == NULL) {
			/*
			 * It is possible for kmalloc to fail to allocate memory because there is
			 * no page available. A short sleep may allow the page reclaimer to
			 * free a page.
			 */
			fsleep(1000);
			p = kmalloc(size, gfp_flags);
		}

		if (p != NULL)
			add_kmalloc_block(ksize(p));
	} else {
		struct vmalloc_block_info *block;

		if (vdo_allocate(1, struct vmalloc_block_info, __func__, &block) == VDO_SUCCESS) {
			/*
			 * It is possible for __vmalloc to fail to allocate memory because there
			 * are no pages available. A short sleep may allow the page reclaimer
			 * to free enough pages for a small allocation.
			 *
			 * For larger allocations, the page_alloc code is racing against the page
			 * reclaimer. If the page reclaimer can stay ahead of page_alloc, the
			 * __vmalloc will succeed. But if page_alloc overtakes the page reclaimer,
			 * the allocation fails. It is possible that more retries will succeed.
			 */
			for (;;) {
				p = __vmalloc(size, gfp_flags | __GFP_NOWARN);
				if (p != NULL)
					break;

				if (jiffies_to_msecs(jiffies - start_time) > 1000) {
					/* Try one more time, logging a failure for this call. */
					p = __vmalloc(size, gfp_flags);
					break;
				}

				fsleep(1000);
			}

			if (p == NULL) {
				vdo_free(block);
			} else {
				block->ptr = p;
				block->size = PAGE_ALIGN(size);
				add_vmalloc_block(block);
			}
		}
	}

	if (allocations_restricted)
		memalloc_noio_restore(noio_flags);

	if (unlikely(p == NULL)) {
		vdo_log_error("Could not allocate %zu bytes for %s in %u msecs",
			      size, what, jiffies_to_msecs(jiffies - start_time));
		return -ENOMEM;
	}

	*((void **) ptr) = p;
	return VDO_SUCCESS;
}

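/*
 * Callers normally do not invoke vdo_allocate_memory() directly; they use the vdo_allocate()
 * macro from memory-alloc.h (as the vmalloc_block_info bookkeeping above does), which derives the
 * size and alignment from a type name. A minimal sketch, assuming a hypothetical caller-defined
 * 'struct foo_context':
 *
 *	struct foo_context *context;
 *	int result = vdo_allocate(1, struct foo_context, __func__, &context);
 *
 *	if (result != VDO_SUCCESS)
 *		return result;
 *	...
 *	vdo_free(context);
 */
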
/*
 * Allocate storage based on memory size, failing immediately if the required memory is not
 * available. The memory will be zeroed.
 *
 * @size: The size of an object.
 * @what: What is being allocated (for error logging)
 *
 * Return: pointer to the allocated memory, or NULL if the required space is not available.
 */
void *vdo_allocate_memory_nowait(size_t size, const char *what __maybe_unused)
{
	void *p = kmalloc(size, GFP_NOWAIT | __GFP_ZERO);

	if (p != NULL)
		add_kmalloc_block(ksize(p));

	return p;
}

void vdo_free(void *ptr)
{
	if (ptr != NULL) {
		if (is_vmalloc_addr(ptr)) {
			remove_vmalloc_block(ptr);
			vfree(ptr);
		} else {
			remove_kmalloc_block(ksize(ptr));
			kfree(ptr);
		}
	}
}

/*
 * Reallocate dynamically allocated memory. There are no alignment guarantees for the reallocated
 * memory. If the new memory is larger than the old memory, the new space will be zeroed.
 *
 * @ptr: The memory to reallocate.
 * @old_size: The old size of the memory
 * @size: The new size to allocate
 * @what: What is being allocated (for error logging)
 * @new_ptr: A pointer to hold the reallocated pointer
 *
 * Return: VDO_SUCCESS or an error code
 */
int vdo_reallocate_memory(void *ptr, size_t old_size, size_t size, const char *what,
			  void *new_ptr)
{
	int result;

	if (size == 0) {
		vdo_free(ptr);
		*(void **) new_ptr = NULL;
		return VDO_SUCCESS;
	}

	result = vdo_allocate(size, char, what, new_ptr);
	if (result != VDO_SUCCESS)
		return result;

	if (ptr != NULL) {
		if (old_size < size)
			size = old_size;

		memcpy(*((void **) new_ptr), ptr, size);
		vdo_free(ptr);
	}

	return VDO_SUCCESS;
}

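/*
 * An illustrative sketch (the 'values' buffer below is hypothetical): growing an array of 4 u64s
 * to 8. Only the old 32 bytes are copied, the additional space arrives zeroed, and the old block
 * is freed on success.
 *
 *	u64 *values;		// previously allocated with vdo_allocate(4, u64, ...)
 *	u64 *grown;
 *	int result = vdo_reallocate_memory(values, 4 * sizeof(u64), 8 * sizeof(u64),
 *					   "values", &grown);
 */
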
int vdo_duplicate_string(const char *string, const char *what, char **new_string)
{
	int result;
	u8 *dup;

	result = vdo_allocate(strlen(string) + 1, u8, what, &dup);
	if (result != VDO_SUCCESS)
		return result;

	memcpy(dup, string, strlen(string) + 1);
	*new_string = (char *) dup;
	return VDO_SUCCESS;
}

void vdo_memory_init(void)
{
	spin_lock_init(&memory_stats.lock);
	vdo_initialize_thread_registry(&allocating_threads);
}

void vdo_memory_exit(void)
{
	VDO_ASSERT_LOG_ONLY(memory_stats.kmalloc_bytes == 0,
			    "kmalloc memory used (%zd bytes in %zd blocks) is returned to the kernel",
			    memory_stats.kmalloc_bytes, memory_stats.kmalloc_blocks);
	VDO_ASSERT_LOG_ONLY(memory_stats.vmalloc_bytes == 0,
			    "vmalloc memory used (%zd bytes in %zd blocks) is returned to the kernel",
			    memory_stats.vmalloc_bytes, memory_stats.vmalloc_blocks);
	vdo_log_debug("peak usage %zd bytes", memory_stats.peak_bytes);
}

void vdo_get_memory_stats(u64 *bytes_used, u64 *peak_bytes_used)
{
	unsigned long flags;

	spin_lock_irqsave(&memory_stats.lock, flags);
	*bytes_used = memory_stats.kmalloc_bytes + memory_stats.vmalloc_bytes;
	*peak_bytes_used = memory_stats.peak_bytes;
	spin_unlock_irqrestore(&memory_stats.lock, flags);
}

/*
 * Report stats on any allocated memory that we're tracking. Not all allocation types are
 * guaranteed to be tracked in bytes (e.g., bios).
 */
void vdo_report_memory_usage(void)
{
	unsigned long flags;
	u64 kmalloc_blocks;
	u64 kmalloc_bytes;
	u64 vmalloc_blocks;
	u64 vmalloc_bytes;
	u64 peak_usage;
	u64 total_bytes;

	spin_lock_irqsave(&memory_stats.lock, flags);
	kmalloc_blocks = memory_stats.kmalloc_blocks;
	kmalloc_bytes = memory_stats.kmalloc_bytes;
	vmalloc_blocks = memory_stats.vmalloc_blocks;
	vmalloc_bytes = memory_stats.vmalloc_bytes;
	peak_usage = memory_stats.peak_bytes;
	spin_unlock_irqrestore(&memory_stats.lock, flags);
	total_bytes = kmalloc_bytes + vmalloc_bytes;
	vdo_log_info("current module memory tracking (actual allocation sizes, not requested):");
	vdo_log_info("  %llu bytes in %llu kmalloc blocks",
		     (unsigned long long) kmalloc_bytes,
		     (unsigned long long) kmalloc_blocks);
	vdo_log_info("  %llu bytes in %llu vmalloc blocks",
		     (unsigned long long) vmalloc_bytes,
		     (unsigned long long) vmalloc_blocks);
	vdo_log_info("  total %llu bytes, peak usage %llu bytes",
		     (unsigned long long) total_bytes, (unsigned long long) peak_usage);
}