// SPDX-License-Identifier: GPL-2.0
/*
 *	fs/proc/kcore.c kernel ELF core dumper
 *
 *	Modelled on fs/exec.c:aout_core_dump()
 *	Jeremy Fitzhardinge <jeremy@sw.oz.au>
 *	ELF version written by David Howells <David.Howells@nexor.co.uk>
 *	Modified and incorporated into 2.3.x by Tigran Aivazian <tigran@veritas.com>
 *	Support to dump vmalloc'd areas (ELF only), Tigran Aivazian <tigran@veritas.com>
 *	Safe accesses to vmalloc/direct-mapped discontiguous areas, Kanoj Sarcar <kanoj@sgi.com>
 */
#include <linux/vmcore_info.h>
#include <linux/mm.h>
#include <linux/proc_fs.h>
#include <linux/kcore.h>
#include <linux/user.h>
#include <linux/capability.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/printk.h>
#include <linux/memblock.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/uio.h>
#include <asm/io.h>
#include <linux/list.h>
#include <linux/ioport.h>
#include <linux/memory.h>
#include <linux/sched/task.h>
#include <linux/security.h>
#include <asm/sections.h>
#include "internal.h"

#define CORE_STR "CORE"

#ifndef ELF_CORE_EFLAGS
#define ELF_CORE_EFLAGS	0
#endif

static struct proc_dir_entry *proc_root_kcore;

#ifndef kc_vaddr_to_offset
#define	kc_vaddr_to_offset(v) ((v) - PAGE_OFFSET)
#endif
#ifndef kc_offset_to_vaddr
#define	kc_offset_to_vaddr(o) ((o) + PAGE_OFFSET)
#endif

#ifndef kc_xlate_dev_mem_ptr
#define kc_xlate_dev_mem_ptr kc_xlate_dev_mem_ptr
static inline void *kc_xlate_dev_mem_ptr(phys_addr_t phys)
{
	return __va(phys);
}
#endif
#ifndef kc_unxlate_dev_mem_ptr
#define kc_unxlate_dev_mem_ptr kc_unxlate_dev_mem_ptr
static inline void kc_unxlate_dev_mem_ptr(phys_addr_t phys, void *virt)
{
}
#endif
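
/*
 * Note on the two fallbacks above (a sketch of the intent, inferred from the
 * #ifndef guards rather than stated anywhere else in this file): the generic
 * versions assume every KCORE_RAM range is covered by the direct map, so
 * "xlate" is just __va() and "unxlate" is a no-op. An architecture whose RAM
 * is not always mapped can define its own kc_xlate_dev_mem_ptr()/
 * kc_unxlate_dev_mem_ptr() pair in its headers to create and tear down a
 * temporary mapping; the guards then skip these fallbacks.
 */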

static LIST_HEAD(kclist_head);
static DECLARE_RWSEM(kclist_lock);
static int kcore_need_update = 1;

/*
 * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error.
 * Same as oldmem_pfn_is_ram in vmcore.
 */
static int (*mem_pfn_is_ram)(unsigned long pfn);

int __init register_mem_pfn_is_ram(int (*fn)(unsigned long pfn))
{
	if (mem_pfn_is_ram)
		return -EBUSY;
	mem_pfn_is_ram = fn;
	return 0;
}

static int pfn_is_ram(unsigned long pfn)
{
	if (mem_pfn_is_ram)
		return mem_pfn_is_ram(pfn);
	else
		return 1;
}
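
/*
 * A minimal sketch of how the hook above is meant to be used (the caller and
 * helper shown here are hypothetical, not from this file): a driver that
 * takes memory away behind the kernel's back registers a predicate at init
 * time so that read_kcore_iter() zero-fills those pages instead of touching
 * them:
 *
 *	static int my_pfn_is_ram(unsigned long pfn)
 *	{
 *		return my_driver_owns_pfn(pfn) ? 0 : 1;
 *	}
 *	...
 *	register_mem_pfn_is_ram(my_pfn_is_ram);
 *
 * Only one callback can be registered; a second call returns -EBUSY.
 */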

/* This doesn't grab kclist_lock, so it should only be used at init time. */
void __init kclist_add(struct kcore_list *new, void *addr, size_t size,
		       int type)
{
	new->addr = (unsigned long)addr;
	new->size = size;
	new->type = type;

	list_add_tail(&new->list, &kclist_head);
}

static size_t get_kcore_size(int *nphdr, size_t *phdrs_len, size_t *notes_len,
			     size_t *data_offset)
{
	size_t try, size = 0;
	struct kcore_list *m;

	*nphdr = 1; /* PT_NOTE */

	list_for_each_entry(m, &kclist_head, list) {
		try = kc_vaddr_to_offset((size_t)m->addr + m->size);
		if (try > size)
			size = try;
		*nphdr = *nphdr + 1;
	}

	*phdrs_len = *nphdr * sizeof(struct elf_phdr);
	*notes_len = (4 * sizeof(struct elf_note) +
		      3 * ALIGN(sizeof(CORE_STR), 4) +
		      VMCOREINFO_NOTE_NAME_BYTES +
		      ALIGN(sizeof(struct elf_prstatus), 4) +
		      ALIGN(sizeof(struct elf_prpsinfo), 4) +
		      ALIGN(arch_task_struct_size, 4) +
		      ALIGN(vmcoreinfo_size, 4));
	*data_offset = PAGE_ALIGN(sizeof(struct elfhdr) + *phdrs_len +
				  *notes_len);
	return *data_offset + size;
}
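
/*
 * Rough sketch of the /proc/kcore layout implied by get_kcore_size() (the
 * column of offsets is illustrative, not an ABI):
 *
 *	0			struct elfhdr
 *	sizeof(elfhdr)		*nphdr program headers (PT_NOTE plus one
 *				PT_LOAD per kcore_list entry)
 *	+ *phdrs_len		ELF note segment, *notes_len bytes
 *	PAGE_ALIGN(...)		*data_offset: start of memory contents; each
 *				segment's file offset is
 *				kc_vaddr_to_offset(addr) + *data_offset
 */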

#ifdef CONFIG_HIGHMEM
/*
 * If there is no highmem, we can assume [0...max_low_pfn) is a continuous
 * range of memory, because the memory hole is not as big as in the !HIGHMEM
 * case. (HIGHMEM is special because part of memory is _invisible_ to the
 * kernel.)
 */
static int kcore_ram_list(struct list_head *head)
{
	struct kcore_list *ent;

	ent = kmalloc(sizeof(*ent), GFP_KERNEL);
	if (!ent)
		return -ENOMEM;
	ent->addr = (unsigned long)__va(0);
	ent->size = max_low_pfn << PAGE_SHIFT;
	ent->type = KCORE_RAM;
	list_add(&ent->list, head);
	return 0;
}

#else /* !CONFIG_HIGHMEM */

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/* calculate vmemmap's address from given system ram pfn and register it */
static int
get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
{
	unsigned long pfn = __pa(ent->addr) >> PAGE_SHIFT;
	unsigned long nr_pages = ent->size >> PAGE_SHIFT;
	unsigned long start, end;
	struct kcore_list *vmm, *tmp;

	start = ((unsigned long)pfn_to_page(pfn)) & PAGE_MASK;
	end = ((unsigned long)pfn_to_page(pfn + nr_pages)) - 1;
	end = PAGE_ALIGN(end);
	/* overlap check (because we have to align the page range) */
	list_for_each_entry(tmp, head, list) {
		if (tmp->type != KCORE_VMEMMAP)
			continue;
		if (start < tmp->addr + tmp->size)
			if (end > tmp->addr)
				end = tmp->addr;
	}
	if (start < end) {
		vmm = kmalloc(sizeof(*vmm), GFP_KERNEL);
		if (!vmm)
			return 0;
		vmm->addr = start;
		vmm->size = end - start;
		vmm->type = KCORE_VMEMMAP;
		list_add_tail(&vmm->list, head);
	}
	return 1;
}
#else
static int
get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
{
	return 1;
}
#endif

static int
kclist_add_private(unsigned long pfn, unsigned long nr_pages, void *arg)
{
	struct list_head *head = (struct list_head *)arg;
	struct kcore_list *ent;
	struct page *p;

	if (!pfn_valid(pfn))
		return 1;

	p = pfn_to_page(pfn);

	ent = kmalloc(sizeof(*ent), GFP_KERNEL);
	if (!ent)
		return -ENOMEM;
	ent->addr = (unsigned long)page_to_virt(p);
	ent->size = nr_pages << PAGE_SHIFT;

	if (!virt_addr_valid((void *)ent->addr))
		goto free_out;

	/* cut the not-mapped area (taken from the ppc-32 code) */
	if (ULONG_MAX - ent->addr < ent->size)
		ent->size = ULONG_MAX - ent->addr;

	/*
	 * We've already checked virt_addr_valid so we know this address
	 * is a valid pointer, therefore we can check against it to determine
	 * if we need to trim.
	 */
	if (VMALLOC_START > ent->addr) {
		if (VMALLOC_START - ent->addr < ent->size)
			ent->size = VMALLOC_START - ent->addr;
	}

	ent->type = KCORE_RAM;
	list_add_tail(&ent->list, head);

	if (!get_sparsemem_vmemmap_info(ent, head)) {
		list_del(&ent->list);
		goto free_out;
	}

	return 0;
free_out:
	kfree(ent);
	return 1;
}

static int kcore_ram_list(struct list_head *list)
{
	int nid, ret;
	unsigned long end_pfn;

	/* Not initialized....update now */
	/* find out the "max pfn" */
	end_pfn = 0;
	for_each_node_state(nid, N_MEMORY) {
		unsigned long node_end;

		node_end = node_end_pfn(nid);
		if (end_pfn < node_end)
			end_pfn = node_end;
	}
	/* scan 0 to max_pfn */
	ret = walk_system_ram_range(0, end_pfn, list, kclist_add_private);
	if (ret)
		return -ENOMEM;
	return 0;
}

#endif /* CONFIG_HIGHMEM */

static int kcore_update_ram(void)
{
	LIST_HEAD(list);
	LIST_HEAD(garbage);
	int nphdr;
	size_t phdrs_len, notes_len, data_offset;
	struct kcore_list *tmp, *pos;
	int ret = 0;

	down_write(&kclist_lock);
	if (!xchg(&kcore_need_update, 0))
		goto out;

	ret = kcore_ram_list(&list);
	if (ret) {
		/* Couldn't get the RAM list, try again next time. */
		WRITE_ONCE(kcore_need_update, 1);
		list_splice_tail(&list, &garbage);
		goto out;
	}

	list_for_each_entry_safe(pos, tmp, &kclist_head, list) {
		if (pos->type == KCORE_RAM || pos->type == KCORE_VMEMMAP)
			list_move(&pos->list, &garbage);
	}
	list_splice_tail(&list, &kclist_head);

	proc_root_kcore->size = get_kcore_size(&nphdr, &phdrs_len, &notes_len,
					       &data_offset);

out:
	up_write(&kclist_lock);
	list_for_each_entry_safe(pos, tmp, &garbage, list) {
		list_del(&pos->list);
		kfree(pos);
	}
	return ret;
}
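
/*
 * A brief note on when the refresh above actually happens, inferred from the
 * code in this file rather than stated elsewhere: kcore_need_update starts
 * at 1 and is set again by the memory hotplug notifier (kcore_callback()
 * below), and open_kcore() calls kcore_update_ram() whenever the flag is
 * set. The RAM/vmemmap entries are therefore rebuilt lazily on the next
 * open, not from the hotplug path itself.
 */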

static void append_kcore_note(char *notes, size_t *i, const char *name,
			      unsigned int type, const void *desc,
			      size_t descsz)
{
	struct elf_note *note = (struct elf_note *)&notes[*i];

	note->n_namesz = strlen(name) + 1;
	note->n_descsz = descsz;
	note->n_type = type;
	*i += sizeof(*note);
	memcpy(&notes[*i], name, note->n_namesz);
	*i = ALIGN(*i + note->n_namesz, 4);
	memcpy(&notes[*i], desc, descsz);
	*i = ALIGN(*i + descsz, 4);
}
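
/*
 * For reference, the record emitted by append_kcore_note() follows the
 * usual ELF note layout (field widths per struct elf_note; this sketch is
 * only illustrative):
 *
 *	n_namesz | n_descsz | n_type
 *	name, NUL-terminated, padded to a 4-byte boundary
 *	desc, descsz bytes, padded to a 4-byte boundary
 */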

static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	struct file *file = iocb->ki_filp;
	char *buf = file->private_data;
	loff_t *fpos = &iocb->ki_pos;
	size_t phdrs_offset, notes_offset, data_offset;
	size_t page_offline_frozen = 1;
	size_t phdrs_len, notes_len;
	struct kcore_list *m;
	size_t tsz;
	int nphdr;
	unsigned long start;
	size_t buflen = iov_iter_count(iter);
	size_t orig_buflen = buflen;
	int ret = 0;

	down_read(&kclist_lock);
	/*
	 * Don't race against drivers that set PageOffline() and expect no
	 * further page access.
	 */
	page_offline_freeze();

	get_kcore_size(&nphdr, &phdrs_len, &notes_len, &data_offset);
	phdrs_offset = sizeof(struct elfhdr);
	notes_offset = phdrs_offset + phdrs_len;

	/* ELF file header. */
	if (buflen && *fpos < sizeof(struct elfhdr)) {
		struct elfhdr ehdr = {
			.e_ident = {
				[EI_MAG0] = ELFMAG0,
				[EI_MAG1] = ELFMAG1,
				[EI_MAG2] = ELFMAG2,
				[EI_MAG3] = ELFMAG3,
				[EI_CLASS] = ELF_CLASS,
				[EI_DATA] = ELF_DATA,
				[EI_VERSION] = EV_CURRENT,
				[EI_OSABI] = ELF_OSABI,
			},
			.e_type = ET_CORE,
			.e_machine = ELF_ARCH,
			.e_version = EV_CURRENT,
			.e_phoff = sizeof(struct elfhdr),
			.e_flags = ELF_CORE_EFLAGS,
			.e_ehsize = sizeof(struct elfhdr),
			.e_phentsize = sizeof(struct elf_phdr),
			.e_phnum = nphdr,
		};

		tsz = min_t(size_t, buflen, sizeof(struct elfhdr) - *fpos);
		if (copy_to_iter((char *)&ehdr + *fpos, tsz, iter) != tsz) {
			ret = -EFAULT;
			goto out;
		}

		buflen -= tsz;
		*fpos += tsz;
	}

	/* ELF program headers. */
	if (buflen && *fpos < phdrs_offset + phdrs_len) {
		struct elf_phdr *phdrs, *phdr;

		phdrs = kzalloc(phdrs_len, GFP_KERNEL);
		if (!phdrs) {
			ret = -ENOMEM;
			goto out;
		}

		phdrs[0].p_type = PT_NOTE;
		phdrs[0].p_offset = notes_offset;
		phdrs[0].p_filesz = notes_len;

		phdr = &phdrs[1];
		list_for_each_entry(m, &kclist_head, list) {
			phdr->p_type = PT_LOAD;
			phdr->p_flags = PF_R | PF_W | PF_X;
			phdr->p_offset = kc_vaddr_to_offset(m->addr) + data_offset;
			phdr->p_vaddr = (size_t)m->addr;
			if (m->type == KCORE_RAM)
				phdr->p_paddr = __pa(m->addr);
			else if (m->type == KCORE_TEXT)
				phdr->p_paddr = __pa_symbol(m->addr);
			else
				phdr->p_paddr = (elf_addr_t)-1;
			phdr->p_filesz = phdr->p_memsz = m->size;
			phdr->p_align = PAGE_SIZE;
			phdr++;
		}

		tsz = min_t(size_t, buflen, phdrs_offset + phdrs_len - *fpos);
		if (copy_to_iter((char *)phdrs + *fpos - phdrs_offset, tsz,
				 iter) != tsz) {
			kfree(phdrs);
			ret = -EFAULT;
			goto out;
		}
		kfree(phdrs);

		buflen -= tsz;
		*fpos += tsz;
	}

	/* ELF note segment. */
	if (buflen && *fpos < notes_offset + notes_len) {
		struct elf_prstatus prstatus = {};
		struct elf_prpsinfo prpsinfo = {
			.pr_sname = 'R',
			.pr_fname = "vmlinux",
		};
		char *notes;
		size_t i = 0;

		strscpy(prpsinfo.pr_psargs, saved_command_line,
			sizeof(prpsinfo.pr_psargs));

		notes = kzalloc(notes_len, GFP_KERNEL);
		if (!notes) {
			ret = -ENOMEM;
			goto out;
		}

		append_kcore_note(notes, &i, CORE_STR, NT_PRSTATUS, &prstatus,
				  sizeof(prstatus));
		append_kcore_note(notes, &i, CORE_STR, NT_PRPSINFO, &prpsinfo,
				  sizeof(prpsinfo));
		append_kcore_note(notes, &i, CORE_STR, NT_TASKSTRUCT, current,
				  arch_task_struct_size);
		/*
		 * vmcoreinfo_size is mostly constant after init time, but it
		 * can be changed by crash_save_vmcoreinfo(). Racing here with a
		 * panic on another CPU before the machine goes down is insanely
		 * unlikely, but it's better to not leave potential buffer
		 * overflows lying around, regardless.
		 */
		append_kcore_note(notes, &i, VMCOREINFO_NOTE_NAME, 0,
				  vmcoreinfo_data,
				  min(vmcoreinfo_size, notes_len - i));

		tsz = min_t(size_t, buflen, notes_offset + notes_len - *fpos);
		if (copy_to_iter(notes + *fpos - notes_offset, tsz, iter) != tsz) {
			kfree(notes);
			ret = -EFAULT;
			goto out;
		}
		kfree(notes);

		buflen -= tsz;
		*fpos += tsz;
	}

	/*
	 * Check to see if our file offset matches with any of
	 * the addresses in the elf_phdr on our list.
	 */
	start = kc_offset_to_vaddr(*fpos - data_offset);
	if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen)
		tsz = buflen;

	m = NULL;
	while (buflen) {
		struct page *page;
		unsigned long pfn;
		phys_addr_t phys;
		void *__start;

		/*
		 * If this is the first iteration or the address is not within
		 * the previous entry, search for a matching entry.
		 */
		if (!m || start < m->addr || start >= m->addr + m->size) {
			struct kcore_list *pos;

			m = NULL;
			list_for_each_entry(pos, &kclist_head, list) {
				if (start >= pos->addr &&
				    start < pos->addr + pos->size) {
					m = pos;
					break;
				}
			}
		}

		if (page_offline_frozen++ % MAX_ORDER_NR_PAGES == 0) {
			page_offline_thaw();
			cond_resched();
			page_offline_freeze();
		}

		if (!m) {
			if (iov_iter_zero(tsz, iter) != tsz) {
				ret = -EFAULT;
				goto out;
			}
			goto skip;
		}

		switch (m->type) {
		case KCORE_VMALLOC:
		{
			const char *src = (char *)start;
			size_t read = 0, left = tsz;

			/*
			 * vmalloc uses spinlocks, so we optimistically try to
			 * read memory. If this fails, fault pages in and try
			 * again until we are done.
			 */
			while (true) {
				read += vread_iter(iter, src, left);
				if (read == tsz)
					break;

				src += read;
				left -= read;

				if (fault_in_iov_iter_writeable(iter, left)) {
					ret = -EFAULT;
					goto out;
				}
			}
			break;
		}
		case KCORE_USER:
			/* User page is handled prior to normal kernel page: */
			if (copy_to_iter((char *)start, tsz, iter) != tsz) {
				ret = -EFAULT;
				goto out;
			}
			break;
		case KCORE_RAM:
			phys = __pa(start);
			pfn = phys >> PAGE_SHIFT;
			page = pfn_to_online_page(pfn);

			/*
			 * Don't read offline sections, logically offline pages
			 * (e.g., inflated in a balloon), hwpoisoned pages,
			 * and explicitly excluded physical ranges.
			 */
			if (!page || PageOffline(page) ||
			    is_page_hwpoison(page) || !pfn_is_ram(pfn) ||
			    pfn_is_unaccepted_memory(pfn)) {
				if (iov_iter_zero(tsz, iter) != tsz) {
					ret = -EFAULT;
					goto out;
				}
				break;
			}
			fallthrough;
		case KCORE_VMEMMAP:
		case KCORE_TEXT:
			if (m->type == KCORE_RAM) {
				__start = kc_xlate_dev_mem_ptr(phys);
				if (!__start) {
					ret = -ENOMEM;
					if (iov_iter_zero(tsz, iter) != tsz)
						ret = -EFAULT;
					goto out;
				}
			} else {
				__start = (void *)start;
			}

			/*
			 * Sadly we must use a bounce buffer here to be able to
			 * make use of copy_from_kernel_nofault(), as these
			 * memory regions might not always be mapped on all
			 * architectures.
			 */
			ret = copy_from_kernel_nofault(buf, __start, tsz);
			if (m->type == KCORE_RAM)
				kc_unxlate_dev_mem_ptr(phys, __start);
			if (ret) {
				if (iov_iter_zero(tsz, iter) != tsz) {
					ret = -EFAULT;
					goto out;
				}
				ret = 0;
			/*
			 * We know the bounce buffer is safe to copy from, so
			 * use _copy_to_iter() directly.
			 */
			} else if (_copy_to_iter(buf, tsz, iter) != tsz) {
				ret = -EFAULT;
				goto out;
			}
			break;
		default:
			pr_warn_once("Unhandled KCORE type: %d\n", m->type);
			if (iov_iter_zero(tsz, iter) != tsz) {
				ret = -EFAULT;
				goto out;
			}
		}
skip:
		buflen -= tsz;
		*fpos += tsz;
		start += tsz;
		tsz = (buflen > PAGE_SIZE ? PAGE_SIZE : buflen);
	}

out:
	page_offline_thaw();
	up_read(&kclist_lock);
	if (ret)
		return ret;
	return orig_buflen - buflen;
}

static int open_kcore(struct inode *inode, struct file *filp)
{
	int ret = security_locked_down(LOCKDOWN_KCORE);

	if (!capable(CAP_SYS_RAWIO))
		return -EPERM;

	if (ret)
		return ret;

	filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL);
	if (!filp->private_data)
		return -ENOMEM;

	if (kcore_need_update)
		kcore_update_ram();
	if (i_size_read(inode) != proc_root_kcore->size) {
		inode_lock(inode);
		i_size_write(inode, proc_root_kcore->size);
		inode_unlock(inode);
	}
	return 0;
}

static int release_kcore(struct inode *inode, struct file *file)
{
	kfree(file->private_data);
	return 0;
}

static const struct proc_ops kcore_proc_ops = {
	.proc_read_iter	= read_kcore_iter,
	.proc_open	= open_kcore,
	.proc_release	= release_kcore,
	.proc_lseek	= default_llseek,
};

/* just remember that we have to update kcore */
static int __meminit kcore_callback(struct notifier_block *self,
				    unsigned long action, void *arg)
{
	switch (action) {
	case MEM_ONLINE:
	case MEM_OFFLINE:
		kcore_need_update = 1;
		break;
	}
	return NOTIFY_OK;
}

static struct kcore_list kcore_vmalloc;

#ifdef CONFIG_ARCH_PROC_KCORE_TEXT
static struct kcore_list kcore_text;
/*
 * If defined, a special segment is used for mapping the kernel text instead
 * of the direct-map area, so we need to create a special TEXT section.
 */
static void __init proc_kcore_text_init(void)
{
	kclist_add(&kcore_text, _text, _end - _text, KCORE_TEXT);
}
#else
static void __init proc_kcore_text_init(void)
{
}
#endif

#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
/*
 * MODULES_VADDR has no intersection with VMALLOC_ADDR.
 */
static struct kcore_list kcore_modules;
static void __init add_modules_range(void)
{
	if (MODULES_VADDR != VMALLOC_START && MODULES_END != VMALLOC_END) {
		kclist_add(&kcore_modules, (void *)MODULES_VADDR,
			   MODULES_END - MODULES_VADDR, KCORE_VMALLOC);
	}
}
#else
static void __init add_modules_range(void)
{
}
#endif

static int __init proc_kcore_init(void)
{
	proc_root_kcore = proc_create("kcore", S_IRUSR, NULL, &kcore_proc_ops);
	if (!proc_root_kcore) {
		pr_err("couldn't create /proc/kcore\n");
		return 0; /* Always returns 0. */
	}
	/* Store text area if it's special */
	proc_kcore_text_init();
	/* Store vmalloc area */
	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
		   VMALLOC_END - VMALLOC_START, KCORE_VMALLOC);
	add_modules_range();
	/* Store direct-map area from physical memory map */
	kcore_update_ram();
	hotplug_memory_notifier(kcore_callback, DEFAULT_CALLBACK_PRI);

	return 0;
}
fs_initcall(proc_kcore_init);