// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2023 Red Hat
 */

#include <linux/delay.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

#include "logger.h"
#include "memory-alloc.h"
#include "permassert.h"

/*
 * UDS and VDO keep track of which threads are allowed to allocate memory freely, and which
 * threads must be careful not to do a memory allocation that does an I/O request. The
 * 'allocating_threads' thread_registry and its associated methods implement this tracking.
 */
static struct thread_registry allocating_threads;

static inline bool allocations_allowed(void)
{
	return vdo_lookup_thread(&allocating_threads) != NULL;
}

/*
 * Register the current thread as an allocating thread.
 *
 * An optional flag location can be supplied indicating whether, at any given point in time, the
 * threads associated with that flag should be allocating storage. If the flag is false, a message
 * will be logged.
 *
 * If no flag is supplied, the thread is always allowed to allocate storage without complaint.
 *
 * @new_thread: registered_thread structure to use for the current thread
 * @flag_ptr: Location of the allocation-allowed flag
 */
void vdo_register_allocating_thread(struct registered_thread *new_thread,
				    const bool *flag_ptr)
{
	if (flag_ptr == NULL) {
		static const bool allocation_always_allowed = true;

		flag_ptr = &allocation_always_allowed;
	}

	vdo_register_thread(&allocating_threads, new_thread, flag_ptr);
}

/* Unregister the current thread as an allocating thread. */
void vdo_unregister_allocating_thread(void)
{
	vdo_unregister_thread(&allocating_threads);
}

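/*
 * A minimal usage sketch (illustrative only; the 'thread' variable below is hypothetical and not
 * part of this file): a thread that is allowed to allocate freely registers itself before its
 * allocations and unregisters when it is done. Threads that never register are treated as
 * restricted, and vdo_allocate_memory() wraps their allocations in memalloc_noio_save()/restore()
 * so that the allocation cannot recurse into I/O.
 *
 *	static struct registered_thread thread;
 *
 *	vdo_register_allocating_thread(&thread, NULL);
 *	... calls to vdo_allocate() ...
 *	vdo_unregister_allocating_thread();
 */
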
/*
 * We track how much memory has been allocated and freed. When we unload the module, we log an
 * error if we have not freed all the memory that we allocated. Nearly all memory allocation and
 * freeing is done using this module.
 *
 * We do not use kernel functions like the kvasprintf() method, which allocate memory indirectly
 * using kmalloc.
 *
 * These data structures and methods are used to track the amount of memory used.
 */

/*
 * We allocate very few large objects, and allocation/deallocation isn't done in a
 * performance-critical stage for us, so a linked list should be fine.
 */
struct vmalloc_block_info {
	void *ptr;
	size_t size;
	struct vmalloc_block_info *next;
};

static struct {
	spinlock_t lock;
	size_t kmalloc_blocks;
	size_t kmalloc_bytes;
	size_t vmalloc_blocks;
	size_t vmalloc_bytes;
	size_t peak_bytes;
	struct vmalloc_block_info *vmalloc_list;
} memory_stats __cacheline_aligned;

static void update_peak_usage(void)
{
	size_t total_bytes = memory_stats.kmalloc_bytes + memory_stats.vmalloc_bytes;

	if (total_bytes > memory_stats.peak_bytes)
		memory_stats.peak_bytes = total_bytes;
}

static void add_kmalloc_block(size_t size)
{
	unsigned long flags;

	spin_lock_irqsave(&memory_stats.lock, flags);
	memory_stats.kmalloc_blocks++;
	memory_stats.kmalloc_bytes += size;
	update_peak_usage();
	spin_unlock_irqrestore(&memory_stats.lock, flags);
}

static void remove_kmalloc_block(size_t size)
{
	unsigned long flags;

	spin_lock_irqsave(&memory_stats.lock, flags);
	memory_stats.kmalloc_blocks--;
	memory_stats.kmalloc_bytes -= size;
	spin_unlock_irqrestore(&memory_stats.lock, flags);
}

static void add_vmalloc_block(struct vmalloc_block_info *block)
{
	unsigned long flags;

	spin_lock_irqsave(&memory_stats.lock, flags);
	block->next = memory_stats.vmalloc_list;
	memory_stats.vmalloc_list = block;
	memory_stats.vmalloc_blocks++;
	memory_stats.vmalloc_bytes += block->size;
	update_peak_usage();
	spin_unlock_irqrestore(&memory_stats.lock, flags);
}

static void remove_vmalloc_block(void *ptr)
{
	struct vmalloc_block_info *block;
	struct vmalloc_block_info **block_ptr;
	unsigned long flags;

	spin_lock_irqsave(&memory_stats.lock, flags);
	for (block_ptr = &memory_stats.vmalloc_list;
	     (block = *block_ptr) != NULL;
	     block_ptr = &block->next) {
		if (block->ptr == ptr) {
			*block_ptr = block->next;
			memory_stats.vmalloc_blocks--;
			memory_stats.vmalloc_bytes -= block->size;
			break;
		}
	}

	spin_unlock_irqrestore(&memory_stats.lock, flags);
	if (block != NULL)
		vdo_free(block);
	else
		vdo_log_info("attempting to remove ptr %px not found in vmalloc list", ptr);
}

/*
 * Determine whether allocating a memory block should use kmalloc or __vmalloc.
 *
 * vmalloc can allocate any integral number of pages.
 *
 * kmalloc can allocate any number of bytes up to a configured limit, which defaults to 8 megabytes
 * on some systems. kmalloc is especially good when memory is being both allocated and freed, and
 * it does this efficiently in a multi CPU environment.
 *
 * kmalloc usually rounds the size of the block up to the next power of two, so when the requested
 * block is bigger than PAGE_SIZE / 2 bytes, kmalloc will never give you less space than the
 * corresponding vmalloc allocation. Sometimes vmalloc will use less overhead than kmalloc.
 *
 * The advantages of kmalloc do not help out UDS or VDO, because we allocate all our memory up
 * front and do not free and reallocate it. Sometimes we have problems using kmalloc, because the
 * Linux memory page map can become so fragmented that kmalloc will not give us a 32KB chunk. We
 * have used vmalloc as a backup to kmalloc in the past, and a follow-up vmalloc of 32KB will work.
 * But there is no strong case to be made for using kmalloc over vmalloc for these size chunks.
 *
 * The kmalloc/vmalloc boundary is set at 4KB, and kmalloc gets the 4KB requests. There is no
 * strong reason for favoring either kmalloc or vmalloc for 4KB requests, except that tracking
 * vmalloc statistics uses a linked list implementation. Using a simple test, this choice of
 * boundary results in 132 vmalloc calls. Using vmalloc for requests of exactly 4KB results in an
 * additional 6374 vmalloc calls, which is much less efficient for tracking.
 *
 * @size: How many bytes to allocate
 */
static inline bool use_kmalloc(size_t size)
{
	return size <= PAGE_SIZE;
}

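/*
 * A worked example of the boundary above (illustrative, assuming 4KB pages and the default
 * alignment used by vdo_allocate()): a 4096-byte request satisfies use_kmalloc() and is served by
 * kmalloc(), with ksize() of the returned block recorded in the kmalloc statistics; a 4097-byte
 * request falls through to __vmalloc(), and its tracked size is rounded up to
 * PAGE_ALIGN(4097) = 8192 bytes.
 */
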
/*
 * Allocate storage based on memory size and alignment, logging an error if the allocation fails.
 * The memory will be zeroed.
 *
 * @size: The size of an object
 * @align: The required alignment
 * @what: What is being allocated (for error logging)
 * @ptr: A pointer to hold the allocated memory
 *
 * Return: VDO_SUCCESS or an error code
 */
int vdo_allocate_memory(size_t size, size_t align, const char *what, void *ptr)
{
	/*
	 * The __GFP_RETRY_MAYFAIL flag means the VM implementation will retry memory reclaim
	 * procedures that have previously failed if there is some indication that progress has
	 * been made elsewhere. It can wait for other tasks to attempt high-level approaches to
	 * freeing memory such as compaction (which removes fragmentation) and page-out. There is
	 * still a definite limit to the number of retries, but it is a larger limit than with
	 * __GFP_NORETRY. Allocations with this flag may fail, but only when there is genuinely
	 * little unused memory. While these allocations do not directly trigger the OOM killer,
	 * their failure indicates that the system is likely to need to use the OOM killer soon.
	 * The caller must handle failure, but can reasonably do so by failing a higher-level
	 * request, or completing it only in a much less efficient manner.
	 */
	const gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL;
	unsigned int noio_flags;
	bool allocations_restricted = !allocations_allowed();
	unsigned long start_time;
	void *p = NULL;

	if (unlikely(ptr == NULL))
		return -EINVAL;

	if (size == 0) {
		*((void **) ptr) = NULL;
		return VDO_SUCCESS;
	}

	if (allocations_restricted)
		noio_flags = memalloc_noio_save();

	start_time = jiffies;
	if (use_kmalloc(size) && (align < PAGE_SIZE)) {
		p = kmalloc(size, gfp_flags | __GFP_NOWARN);
		if (p == NULL) {
			/*
			 * It is possible for kmalloc to fail to allocate memory because there is
			 * no page available. A short sleep may allow the page reclaimer to
			 * free a page.
			 */
			fsleep(1000);
			p = kmalloc(size, gfp_flags);
		}

		if (p != NULL)
			add_kmalloc_block(ksize(p));
	} else {
		struct vmalloc_block_info *block;

		if (vdo_allocate(1, struct vmalloc_block_info, __func__, &block) == VDO_SUCCESS) {
			/*
			 * It is possible for __vmalloc to fail to allocate memory because there
			 * are no pages available. A short sleep may allow the page reclaimer
			 * to free enough pages for a small allocation.
			 *
			 * For larger allocations, the page_alloc code is racing against the page
			 * reclaimer. If the page reclaimer can stay ahead of page_alloc, the
			 * __vmalloc will succeed. But if page_alloc overtakes the page reclaimer,
			 * the allocation fails. It is possible that more retries will succeed.
			 */
			for (;;) {
				p = __vmalloc(size, gfp_flags | __GFP_NOWARN);
				if (p != NULL)
					break;

				if (jiffies_to_msecs(jiffies - start_time) > 1000) {
					/* Try one more time, logging a failure for this call. */
					p = __vmalloc(size, gfp_flags);
					break;
				}

				fsleep(1000);
			}

			if (p == NULL) {
				vdo_free(block);
			} else {
				block->ptr = p;
				block->size = PAGE_ALIGN(size);
				add_vmalloc_block(block);
			}
		}
	}

	if (allocations_restricted)
		memalloc_noio_restore(noio_flags);

	if (unlikely(p == NULL)) {
		vdo_log_error("Could not allocate %zu bytes for %s in %u msecs",
			      size, what, jiffies_to_msecs(jiffies - start_time));
		return -ENOMEM;
	}

	*((void **) ptr) = p;
	return VDO_SUCCESS;
}

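/*
 * Callers normally do not invoke vdo_allocate_memory() directly; they use the vdo_allocate()
 * macro from memory-alloc.h (as the vmalloc_block_info bookkeeping above does), which derives the
 * size and alignment from a type name. A minimal sketch, assuming a hypothetical caller-defined
 * 'struct foo_context':
 *
 *	struct foo_context *context;
 *	int result = vdo_allocate(1, struct foo_context, __func__, &context);
 *
 *	if (result != VDO_SUCCESS)
 *		return result;
 *	...
 *	vdo_free(context);
 */
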
/*
 * Allocate storage based on memory size, failing immediately if the required memory is not
 * available. The memory will be zeroed.
 *
 * @size: The size of an object.
 * @what: What is being allocated (for error logging)
 *
 * Return: pointer to the allocated memory, or NULL if the required space is not available.
 */
void *vdo_allocate_memory_nowait(size_t size, const char *what __maybe_unused)
{
	void *p = kmalloc(size, GFP_NOWAIT | __GFP_ZERO);

	if (p != NULL)
		add_kmalloc_block(ksize(p));

	return p;
}

void vdo_free(void *ptr)
{
	if (ptr != NULL) {
		if (is_vmalloc_addr(ptr)) {
			remove_vmalloc_block(ptr);
			vfree(ptr);
		} else {
			remove_kmalloc_block(ksize(ptr));
			kfree(ptr);
		}
	}
}

/*
 * Reallocate dynamically allocated memory. There are no alignment guarantees for the reallocated
 * memory. If the new memory is larger than the old memory, the new space will be zeroed.
 *
 * @ptr: The memory to reallocate.
 * @old_size: The old size of the memory
 * @size: The new size to allocate
 * @what: What is being allocated (for error logging)
 * @new_ptr: A pointer to hold the reallocated pointer
 *
 * Return: VDO_SUCCESS or an error code
 */
int vdo_reallocate_memory(void *ptr, size_t old_size, size_t size, const char *what,
			  void *new_ptr)
{
	int result;

	if (size == 0) {
		vdo_free(ptr);
		*(void **) new_ptr = NULL;
		return VDO_SUCCESS;
	}

	result = vdo_allocate(size, char, what, new_ptr);
	if (result != VDO_SUCCESS)
		return result;

	if (ptr != NULL) {
		if (old_size < size)
			size = old_size;

		memcpy(*((void **) new_ptr), ptr, size);
		vdo_free(ptr);
	}

	return VDO_SUCCESS;
}

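/*
 * An illustrative sketch (the 'values' buffer below is hypothetical): growing an array of 4 u64s
 * to 8. Only the old 32 bytes are copied, the additional space arrives zeroed, and the old block
 * is freed on success.
 *
 *	u64 *values;		// previously allocated with vdo_allocate(4, u64, ...)
 *	u64 *grown;
 *	int result = vdo_reallocate_memory(values, 4 * sizeof(u64), 8 * sizeof(u64),
 *					   "values", &grown);
 */
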
int vdo_duplicate_string(const char *string, const char *what, char **new_string)
{
	int result;
	u8 *dup;

	result = vdo_allocate(strlen(string) + 1, u8, what, &dup);
	if (result != VDO_SUCCESS)
		return result;

	memcpy(dup, string, strlen(string) + 1);
	*new_string = (char *) dup;
	return VDO_SUCCESS;
}

void vdo_memory_init(void)
{
	spin_lock_init(&memory_stats.lock);
	vdo_initialize_thread_registry(&allocating_threads);
}

void vdo_memory_exit(void)
{
	VDO_ASSERT_LOG_ONLY(memory_stats.kmalloc_bytes == 0,
			    "kmalloc memory used (%zd bytes in %zd blocks) is returned to the kernel",
			    memory_stats.kmalloc_bytes, memory_stats.kmalloc_blocks);
	VDO_ASSERT_LOG_ONLY(memory_stats.vmalloc_bytes == 0,
			    "vmalloc memory used (%zd bytes in %zd blocks) is returned to the kernel",
			    memory_stats.vmalloc_bytes, memory_stats.vmalloc_blocks);
	vdo_log_debug("peak usage %zd bytes", memory_stats.peak_bytes);
}

void vdo_get_memory_stats(u64 *bytes_used, u64 *peak_bytes_used)
{
	unsigned long flags;

	spin_lock_irqsave(&memory_stats.lock, flags);
	*bytes_used = memory_stats.kmalloc_bytes + memory_stats.vmalloc_bytes;
	*peak_bytes_used = memory_stats.peak_bytes;
	spin_unlock_irqrestore(&memory_stats.lock, flags);
}

/*
 * Report stats on any allocated memory that we're tracking. Not all allocation types are
 * guaranteed to be tracked in bytes (e.g., bios).
 */
void vdo_report_memory_usage(void)
{
	unsigned long flags;
	u64 kmalloc_blocks;
	u64 kmalloc_bytes;
	u64 vmalloc_blocks;
	u64 vmalloc_bytes;
	u64 peak_usage;
	u64 total_bytes;

	spin_lock_irqsave(&memory_stats.lock, flags);
	kmalloc_blocks = memory_stats.kmalloc_blocks;
	kmalloc_bytes = memory_stats.kmalloc_bytes;
	vmalloc_blocks = memory_stats.vmalloc_blocks;
	vmalloc_bytes = memory_stats.vmalloc_bytes;
	peak_usage = memory_stats.peak_bytes;
	spin_unlock_irqrestore(&memory_stats.lock, flags);
	total_bytes = kmalloc_bytes + vmalloc_bytes;
	vdo_log_info("current module memory tracking (actual allocation sizes, not requested):");
	vdo_log_info("  %llu bytes in %llu kmalloc blocks",
		     (unsigned long long) kmalloc_bytes,
		     (unsigned long long) kmalloc_blocks);
	vdo_log_info("  %llu bytes in %llu vmalloc blocks",
		     (unsigned long long) vmalloc_bytes,
		     (unsigned long long) vmalloc_blocks);
	vdo_log_info("  total %llu bytes, peak usage %llu bytes",
		     (unsigned long long) total_bytes, (unsigned long long) peak_usage);
}