1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * Debugging module statistics.
5 * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
8 #include <linux/module.h>
9 #include <uapi/linux/module.h>
10 #include <linux/string.h>
11 #include <linux/printk.h>
12 #include <linux/slab.h>
13 #include <linux/list.h>
14 #include <linux/debugfs.h>
15 #include <linux/rculist.h>
16 #include <linux/math.h>
21 * DOC: module debugging statistics overview
23 * Enabling CONFIG_MODULE_STATS enables module debugging statistics which
24 * are useful to monitor and root cause memory pressure issues with module
25 * loading. These statistics are useful to allow us to improve production
28 * The current module debugging statistics supported help keep track of module
29 * loading failures to enable improvements either for kernel module auto-loading
30 * usage (request_module()) or interactions with userspace. Statistics are
31 * provided to track all possible failures in the finit_module() path and memory
32 * wasted in this process space. Each of the failure counters are associated
33 * to a type of module loading failure which is known to incur a certain amount
34 * of memory allocation loss. In the worst case loading a module will fail after
35 * a 3 step memory allocation process:
37 * a) memory allocated with kernel_read_file_from_fd()
38 * b) module decompression processes the file read from
39 * kernel_read_file_from_fd(), and vmap() is used to map
40 * the decompressed module to a new local buffer which represents
41 * a copy of the decompressed module passed from userspace. The buffer
42 * from kernel_read_file_from_fd() is freed right away.
43 * c) layout_and_allocate() allocates space for the final resting
44 * place where we would keep the module if it were to be processed
47 * If a failure occurs after these three different allocations only one
48 * counter will be incremented with the summation of the allocated bytes freed
49 * incurred during this failure. Likewise, if module loading failed only after
50 * step b) a separate counter is used and incremented for the bytes freed and
51 * not used during both of those allocations.
53 * Virtual memory space can be limited, for example on x86 virtual memory size
54 * defaults to 128 MiB. We should strive to limit and avoid wasting virtual
55 * memory allocations when possible. These module debugging statistics help
56 * to evaluate how much memory is being wasted on bootup due to module loading
59 * All counters are designed to be incremental. Atomic counters are used so to
60 * remain simple and avoid delays and deadlocks.
64 * DOC: dup_failed_modules - tracks duplicate failed modules
66 * Linked list of modules which failed to be loaded because an already existing
67 * module with the same name was already being processed or already loaded.
68 * The finit_module() system call incurs heavy virtual memory allocations. In
69 * the worst case an finit_module() system call can end up allocating virtual
72 * 1) kernel_read_file_from_fd() call uses vmalloc()
73 * 2) optional module decompression uses vmap()
74 * 3) layout_and allocate() can use vzalloc() or an arch specific variation of
75 * vmalloc to deal with ELF sections requiring special permissions
77 * In practice on a typical boot today most finit_module() calls fail due to
78 * the module with the same name already being loaded or about to be processed.
79 * All virtual memory allocated to these failed modules will be freed with
82 * To help with this the dup_failed_modules allows us to track modules which
83 * failed to load due to the fact that a module was already loaded or being
84 * processed. There are only two points at which we can fail such calls,
85 * we list them below along with the number of virtual memory allocation
88 * a) FAIL_DUP_MOD_BECOMING: at the end of early_mod_check() before
89 * layout_and_allocate().
90 * - with module decompression: 2 virtual memory allocation calls
91 * - without module decompression: 1 virtual memory allocation calls
92 * b) FAIL_DUP_MOD_LOAD: after layout_and_allocate() on add_unformed_module()
93 * - with module decompression 3 virtual memory allocation calls
94 * - without module decompression 2 virtual memory allocation calls
96 * We should strive to get this list to be as small as possible. If this list
97 * is not empty it is a reflection of possible work or optimizations possible
98 * either in-kernel or in userspace.
100 static LIST_HEAD(dup_failed_modules
);
103 * DOC: module statistics debugfs counters
105 * The total amount of wasted virtual memory allocation space during module
106 * loading can be computed by adding the total from the summation:
108 * * @invalid_kread_bytes +
109 * @invalid_decompress_bytes +
110 * @invalid_becoming_bytes +
113 * The following debugfs counters are available to inspect module loading
116 * * total_mod_size: total bytes ever used by all modules we've dealt with on
118 * * total_text_size: total bytes of the .text and .init.text ELF section
119 * sizes we've dealt with on this system
120 * * invalid_kread_bytes: bytes allocated and then freed on failures which
121 * happen due to the initial kernel_read_file_from_fd(). kernel_read_file_from_fd()
122 * uses vmalloc(). These should typically not happen unless your system is
123 * under memory pressure.
124 * * invalid_decompress_bytes: number of bytes allocated and freed due to
125 * memory allocations in the module decompression path that use vmap().
126 * These typically should not happen unless your system is under memory
128 * * invalid_becoming_bytes: total number of bytes allocated and freed used
129 * to read the kernel module userspace wants us to read before we
130 * promote it to be processed to be added to our @modules linked list. These
131 * failures can happen if we had a check in between a successful kernel_read_file_from_fd()
132 * call and right before we allocate the our private memory for the module
133 * which would be kept if the module is successfully loaded. The most common
134 * reason for this failure is when userspace is racing to load a module
135 * which it does not yet see loaded. The first module to succeed in
136 * add_unformed_module() will add a module to our &modules list and
137 * subsequent loads of modules with the same name will error out at the
138 * end of early_mod_check(). The check for module_patient_check_exists()
139 * at the end of early_mod_check() prevents duplicate allocations
140 * on layout_and_allocate() for modules already being processed. These
141 * duplicate failed modules are non-fatal, however they typically are
142 * indicative of userspace not seeing a module in userspace loaded yet and
143 * unnecessarily trying to load a module before the kernel even has a chance
144 * to begin to process prior requests. Although duplicate failures can be
145 * non-fatal, we should try to reduce vmalloc() pressure proactively, so
146 * ideally after boot this will be close to as 0 as possible. If module
147 * decompression was used we also add to this counter the cost of the
148 * initial kernel_read_file_from_fd() of the compressed module. If module
149 * decompression was not used the value represents the total allocated and
150 * freed bytes in kernel_read_file_from_fd() calls for these type of
151 * failures. These failures can occur because:
153 * * module_sig_check() - module signature checks
154 * * elf_validity_cache_copy() - some ELF validation issue
155 * * early_mod_check():
158 * * failed to rewrite section headers
160 * * live patch requirements didn't check out
161 * * the module was detected as being already present
163 * * invalid_mod_bytes: these are the total number of bytes allocated and
164 * freed due to failures after we did all the sanity checks of the module
165 * which userspace passed to us and after our first check that the module
166 * is unique. A module can still fail to load if we detect the module is
167 * loaded after we allocate space for it with layout_and_allocate(), we do
168 * this check right before processing the module as live and run its
169 * initialization routines. Note that you have a failure of this type it
170 * also means the respective kernel_read_file_from_fd() memory space was
171 * also freed and not used, and so we increment this counter with twice
172 * the size of the module. Additionally if you used module decompression
173 * the size of the compressed module is also added to this counter.
175 * * modcount: how many modules we've loaded in our kernel life time
176 * * failed_kreads: how many modules failed due to failed kernel_read_file_from_fd()
177 * * failed_decompress: how many failed module decompression attempts we've had.
178 * These really should not happen unless your compression / decompression
180 * * failed_becoming: how many modules failed after we kernel_read_file_from_fd()
181 * it and before we allocate memory for it with layout_and_allocate(). This
182 * counter is never incremented if you manage to validate the module and
183 * call layout_and_allocate() for it.
184 * * failed_load_modules: how many modules failed once we've allocated our
185 * private space for our module using layout_and_allocate(). These failures
186 * should hopefully mostly be dealt with already. Races in theory could
187 * still exist here, but it would just mean the kernel had started processing
188 * two threads concurrently up to early_mod_check() and one thread won.
189 * These failures are good signs the kernel or userspace is doing something
190 * seriously stupid or that could be improved. We should strive to fix these,
191 * but it is perhaps not easy to fix them. A recent example are the modules
192 * requests incurred for frequency modules, a separate module request was
193 * being issued for each CPU on a system.
196 atomic_long_t total_mod_size
;
197 atomic_long_t total_text_size
;
198 atomic_long_t invalid_kread_bytes
;
199 atomic_long_t invalid_decompress_bytes
;
200 static atomic_long_t invalid_becoming_bytes
;
201 static atomic_long_t invalid_mod_bytes
;
203 atomic_t failed_kreads
;
204 atomic_t failed_decompress
;
205 static atomic_t failed_becoming
;
206 static atomic_t failed_load_modules
;
208 static const char *mod_fail_to_str(struct mod_fail_load
*mod_fail
)
210 if (test_bit(FAIL_DUP_MOD_BECOMING
, &mod_fail
->dup_fail_mask
) &&
211 test_bit(FAIL_DUP_MOD_LOAD
, &mod_fail
->dup_fail_mask
))
212 return "Becoming & Load";
213 if (test_bit(FAIL_DUP_MOD_BECOMING
, &mod_fail
->dup_fail_mask
))
215 if (test_bit(FAIL_DUP_MOD_LOAD
, &mod_fail
->dup_fail_mask
))
217 return "Bug-on-stats";
220 void mod_stat_bump_invalid(struct load_info
*info
, int flags
)
222 atomic_long_add(info
->len
* 2, &invalid_mod_bytes
);
223 atomic_inc(&failed_load_modules
);
224 #if defined(CONFIG_MODULE_DECOMPRESS)
225 if (flags
& MODULE_INIT_COMPRESSED_FILE
)
226 atomic_long_add(info
->compressed_len
, &invalid_mod_bytes
);
230 void mod_stat_bump_becoming(struct load_info
*info
, int flags
)
232 atomic_inc(&failed_becoming
);
233 atomic_long_add(info
->len
, &invalid_becoming_bytes
);
234 #if defined(CONFIG_MODULE_DECOMPRESS)
235 if (flags
& MODULE_INIT_COMPRESSED_FILE
)
236 atomic_long_add(info
->compressed_len
, &invalid_becoming_bytes
);
240 int try_add_failed_module(const char *name
, enum fail_dup_mod_reason reason
)
242 struct mod_fail_load
*mod_fail
;
244 list_for_each_entry_rcu(mod_fail
, &dup_failed_modules
, list
,
245 lockdep_is_held(&module_mutex
)) {
246 if (!strcmp(mod_fail
->name
, name
)) {
247 atomic_long_inc(&mod_fail
->count
);
248 __set_bit(reason
, &mod_fail
->dup_fail_mask
);
253 mod_fail
= kzalloc(sizeof(*mod_fail
), GFP_KERNEL
);
256 memcpy(mod_fail
->name
, name
, strlen(name
));
257 __set_bit(reason
, &mod_fail
->dup_fail_mask
);
258 atomic_long_inc(&mod_fail
->count
);
259 list_add_rcu(&mod_fail
->list
, &dup_failed_modules
);
265 * At 64 bytes per module and assuming a 1024 bytes preamble we can fit the
266 * 112 module prints within 8k.
268 * 1024 + (64*112) = 8k
270 #define MAX_PREAMBLE 1024
271 #define MAX_FAILED_MOD_PRINT 112
272 #define MAX_BYTES_PER_MOD 64
273 static ssize_t
read_file_mod_stats(struct file
*file
, char __user
*user_buf
,
274 size_t count
, loff_t
*ppos
)
276 struct mod_fail_load
*mod_fail
;
277 unsigned int len
, size
, count_failed
= 0;
280 u32 live_mod_count
, fkreads
, fdecompress
, fbecoming
, floads
;
281 unsigned long total_size
, text_size
, ikread_bytes
, ibecoming_bytes
,
282 idecompress_bytes
, imod_bytes
, total_virtual_lost
;
284 live_mod_count
= atomic_read(&modcount
);
285 fkreads
= atomic_read(&failed_kreads
);
286 fdecompress
= atomic_read(&failed_decompress
);
287 fbecoming
= atomic_read(&failed_becoming
);
288 floads
= atomic_read(&failed_load_modules
);
290 total_size
= atomic_long_read(&total_mod_size
);
291 text_size
= atomic_long_read(&total_text_size
);
292 ikread_bytes
= atomic_long_read(&invalid_kread_bytes
);
293 idecompress_bytes
= atomic_long_read(&invalid_decompress_bytes
);
294 ibecoming_bytes
= atomic_long_read(&invalid_becoming_bytes
);
295 imod_bytes
= atomic_long_read(&invalid_mod_bytes
);
297 total_virtual_lost
= ikread_bytes
+ idecompress_bytes
+ ibecoming_bytes
+ imod_bytes
;
299 size
= MAX_PREAMBLE
+ min((unsigned int)(floads
+ fbecoming
),
300 (unsigned int)MAX_FAILED_MOD_PRINT
) * MAX_BYTES_PER_MOD
;
301 buf
= kzalloc(size
, GFP_KERNEL
);
305 /* The beginning of our debug preamble */
306 len
= scnprintf(buf
, size
, "%25s\t%u\n", "Mods ever loaded", live_mod_count
);
308 len
+= scnprintf(buf
+ len
, size
- len
, "%25s\t%u\n", "Mods failed on kread", fkreads
);
310 len
+= scnprintf(buf
+ len
, size
- len
, "%25s\t%u\n", "Mods failed on decompress",
312 len
+= scnprintf(buf
+ len
, size
- len
, "%25s\t%u\n", "Mods failed on becoming", fbecoming
);
314 len
+= scnprintf(buf
+ len
, size
- len
, "%25s\t%u\n", "Mods failed on load", floads
);
316 len
+= scnprintf(buf
+ len
, size
- len
, "%25s\t%lu\n", "Total module size", total_size
);
317 len
+= scnprintf(buf
+ len
, size
- len
, "%25s\t%lu\n", "Total mod text size", text_size
);
319 len
+= scnprintf(buf
+ len
, size
- len
, "%25s\t%lu\n", "Failed kread bytes", ikread_bytes
);
321 len
+= scnprintf(buf
+ len
, size
- len
, "%25s\t%lu\n", "Failed decompress bytes",
324 len
+= scnprintf(buf
+ len
, size
- len
, "%25s\t%lu\n", "Failed becoming bytes", ibecoming_bytes
);
326 len
+= scnprintf(buf
+ len
, size
- len
, "%25s\t%lu\n", "Failed kmod bytes", imod_bytes
);
328 len
+= scnprintf(buf
+ len
, size
- len
, "%25s\t%lu\n", "Virtual mem wasted bytes", total_virtual_lost
);
330 if (live_mod_count
&& total_size
) {
331 len
+= scnprintf(buf
+ len
, size
- len
, "%25s\t%lu\n", "Average mod size",
332 DIV_ROUND_UP(total_size
, live_mod_count
));
335 if (live_mod_count
&& text_size
) {
336 len
+= scnprintf(buf
+ len
, size
- len
, "%25s\t%lu\n", "Average mod text size",
337 DIV_ROUND_UP(text_size
, live_mod_count
));
341 * We use WARN_ON_ONCE() for the counters to ensure we always have parity
342 * for keeping tabs on a type of failure with one type of byte counter.
343 * The counters for imod_bytes does not increase for fkreads failures
344 * for example, and so on.
347 WARN_ON_ONCE(ikread_bytes
&& !fkreads
);
348 if (fkreads
&& ikread_bytes
) {
349 len
+= scnprintf(buf
+ len
, size
- len
, "%25s\t%lu\n", "Avg fail kread bytes",
350 DIV_ROUND_UP(ikread_bytes
, fkreads
));
353 WARN_ON_ONCE(ibecoming_bytes
&& !fbecoming
);
354 if (fbecoming
&& ibecoming_bytes
) {
355 len
+= scnprintf(buf
+ len
, size
- len
, "%25s\t%lu\n", "Avg fail becoming bytes",
356 DIV_ROUND_UP(ibecoming_bytes
, fbecoming
));
359 WARN_ON_ONCE(idecompress_bytes
&& !fdecompress
);
360 if (fdecompress
&& idecompress_bytes
) {
361 len
+= scnprintf(buf
+ len
, size
- len
, "%25s\t%lu\n", "Avg fail decomp bytes",
362 DIV_ROUND_UP(idecompress_bytes
, fdecompress
));
365 WARN_ON_ONCE(imod_bytes
&& !floads
);
366 if (floads
&& imod_bytes
) {
367 len
+= scnprintf(buf
+ len
, size
- len
, "%25s\t%lu\n", "Average fail load bytes",
368 DIV_ROUND_UP(imod_bytes
, floads
));
371 /* End of our debug preamble header. */
373 /* Catch when we've gone beyond our expected preamble */
374 WARN_ON_ONCE(len
>= MAX_PREAMBLE
);
376 if (list_empty(&dup_failed_modules
))
379 len
+= scnprintf(buf
+ len
, size
- len
, "Duplicate failed modules:\n");
380 len
+= scnprintf(buf
+ len
, size
- len
, "%25s\t%15s\t%25s\n",
381 "Module-name", "How-many-times", "Reason");
382 mutex_lock(&module_mutex
);
385 list_for_each_entry_rcu(mod_fail
, &dup_failed_modules
, list
) {
386 if (WARN_ON_ONCE(++count_failed
>= MAX_FAILED_MOD_PRINT
))
388 len
+= scnprintf(buf
+ len
, size
- len
, "%25s\t%15lu\t%25s\n", mod_fail
->name
,
389 atomic_long_read(&mod_fail
->count
), mod_fail_to_str(mod_fail
));
392 mutex_unlock(&module_mutex
);
394 ret
= simple_read_from_buffer(user_buf
, count
, ppos
, buf
, len
);
399 #undef MAX_FAILED_MOD_PRINT
400 #undef MAX_BYTES_PER_MOD
402 static const struct file_operations fops_mod_stats
= {
403 .read
= read_file_mod_stats
,
405 .owner
= THIS_MODULE
,
406 .llseek
= default_llseek
,
409 #define mod_debug_add_ulong(name) debugfs_create_ulong(#name, 0400, mod_debugfs_root, (unsigned long *) &name.counter)
410 #define mod_debug_add_atomic(name) debugfs_create_atomic_t(#name, 0400, mod_debugfs_root, &name)
411 static int __init
module_stats_init(void)
413 mod_debug_add_ulong(total_mod_size
);
414 mod_debug_add_ulong(total_text_size
);
415 mod_debug_add_ulong(invalid_kread_bytes
);
416 mod_debug_add_ulong(invalid_decompress_bytes
);
417 mod_debug_add_ulong(invalid_becoming_bytes
);
418 mod_debug_add_ulong(invalid_mod_bytes
);
420 mod_debug_add_atomic(modcount
);
421 mod_debug_add_atomic(failed_kreads
);
422 mod_debug_add_atomic(failed_decompress
);
423 mod_debug_add_atomic(failed_becoming
);
424 mod_debug_add_atomic(failed_load_modules
);
426 debugfs_create_file("stats", 0400, mod_debugfs_root
, mod_debugfs_root
, &fops_mod_stats
);
430 #undef mod_debug_add_ulong
431 #undef mod_debug_add_atomic
432 module_init(module_stats_init
);