1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * Firmware Assisted dump: A robust mechanism to get reliable kernel crash
4 * dump with assistance from firmware. This approach does not use kexec,
5 * instead firmware assists in booting the kdump kernel while preserving
6 * memory contents. Most of the code implementation has been adapted
7 * from phyp assisted dump implementation written by Linas Vepstas and
10 * Copyright 2011 IBM Corporation
11 * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
15 #define pr_fmt(fmt) "fadump: " fmt
17 #include <linux/string.h>
18 #include <linux/memblock.h>
19 #include <linux/delay.h>
20 #include <linux/seq_file.h>
21 #include <linux/crash_dump.h>
22 #include <linux/kobject.h>
23 #include <linux/sysfs.h>
24 #include <linux/slab.h>
25 #include <linux/cma.h>
26 #include <linux/hugetlb.h>
28 #include <asm/debugfs.h>
31 #include <asm/fadump.h>
32 #include <asm/fadump-internal.h>
33 #include <asm/setup.h>
35 static struct fw_dump fw_dump
;
37 static void __init
fadump_reserve_crash_area(u64 base
);
39 #ifndef CONFIG_PRESERVE_FA_DUMP
40 static DEFINE_MUTEX(fadump_mutex
);
41 struct fadump_mrange_info crash_mrange_info
= { "crash", NULL
, 0, 0, 0 };
42 struct fadump_mrange_info reserved_mrange_info
= { "reserved", NULL
, 0, 0, 0 };
45 static struct cma
*fadump_cma
;
48 * fadump_cma_init() - Initialize CMA area from a fadump reserved memory
50 * This function initializes CMA area from fadump reserved memory.
51 * The total size of fadump reserved memory covers for boot memory size
52 * + cpu data size + hpte size and metadata.
53 * Initialize only the area equivalent to boot memory size for CMA use.
54 * The remaining portion of fadump reserved memory will not be given
55 * to CMA and pages for those will stay reserved. boot memory size is
56 * aligned per CMA requirement to satisfy cma_init_reserved_mem() call.
57 * But for some reason even if it fails we still have the memory reservation
58 * with us and we can still continue doing fadump.
60 int __init
fadump_cma_init(void)
62 unsigned long long base
, size
;
65 if (!fw_dump
.fadump_enabled
)
69 * Do not use CMA if user has provided fadump=nocma kernel parameter.
70 * Return 1 to continue with fadump old behaviour.
75 base
= fw_dump
.reserve_dump_area_start
;
76 size
= fw_dump
.boot_memory_size
;
81 rc
= cma_init_reserved_mem(base
, size
, 0, "fadump_cma", &fadump_cma
);
83 pr_err("Failed to init cma area for firmware-assisted dump,%d\n", rc
);
85 * Though the CMA init has failed we still have memory
86 * reservation with us. The reserved memory will be
87 * blocked from production system usage. Hence return 1,
88 * so that we can continue with fadump.
94 * So we now have successfully initialized cma area for fadump.
96 pr_info("Initialized 0x%lx bytes cma area at %ldMB from 0x%lx "
97 "bytes of memory reserved for firmware-assisted dump\n",
98 cma_get_size(fadump_cma
),
99 (unsigned long)cma_get_base(fadump_cma
) >> 20,
100 fw_dump
.reserve_dump_area_size
);
104 static int __init
fadump_cma_init(void) { return 1; }
105 #endif /* CONFIG_CMA */
107 /* Scan the Firmware Assisted dump configuration details. */
108 int __init
early_init_dt_scan_fw_dump(unsigned long node
, const char *uname
,
109 int depth
, void *data
)
114 if (strcmp(uname
, "rtas") == 0) {
115 rtas_fadump_dt_scan(&fw_dump
, node
);
119 if (strcmp(uname
, "ibm,opal") == 0) {
120 opal_fadump_dt_scan(&fw_dump
, node
);
128 * If fadump is registered, check if the memory provided
129 * falls within boot memory area and reserved memory area.
131 int is_fadump_memory_area(u64 addr
, unsigned long size
)
135 if (!fw_dump
.dump_registered
)
141 d_start
= fw_dump
.reserve_dump_area_start
;
142 d_end
= d_start
+ fw_dump
.reserve_dump_area_size
;
143 if (((addr
+ size
) > d_start
) && (addr
<= d_end
))
146 return (addr
<= fw_dump
.boot_mem_top
);
149 int should_fadump_crash(void)
151 if (!fw_dump
.dump_registered
|| !fw_dump
.fadumphdr_addr
)
156 int is_fadump_active(void)
158 return fw_dump
.dump_active
;
162 * Returns true, if there are no holes in memory area between d_start to d_end,
165 static bool is_fadump_mem_area_contiguous(u64 d_start
, u64 d_end
)
167 struct memblock_region
*reg
;
171 for_each_memblock(memory
, reg
) {
172 start
= max_t(u64
, d_start
, reg
->base
);
173 end
= min_t(u64
, d_end
, (reg
->base
+ reg
->size
));
175 /* Memory hole from d_start to start */
192 * Returns true, if there are no holes in boot memory area,
195 bool is_fadump_boot_mem_contiguous(void)
197 unsigned long d_start
, d_end
;
201 for (i
= 0; i
< fw_dump
.boot_mem_regs_cnt
; i
++) {
202 d_start
= fw_dump
.boot_mem_addr
[i
];
203 d_end
= d_start
+ fw_dump
.boot_mem_sz
[i
];
205 ret
= is_fadump_mem_area_contiguous(d_start
, d_end
);
214 * Returns true, if there are no holes in reserved memory area,
217 bool is_fadump_reserved_mem_contiguous(void)
221 d_start
= fw_dump
.reserve_dump_area_start
;
222 d_end
= d_start
+ fw_dump
.reserve_dump_area_size
;
223 return is_fadump_mem_area_contiguous(d_start
, d_end
);
226 /* Print firmware assisted dump configurations for debugging purpose. */
227 static void fadump_show_config(void)
231 pr_debug("Support for firmware-assisted dump (fadump): %s\n",
232 (fw_dump
.fadump_supported
? "present" : "no support"));
234 if (!fw_dump
.fadump_supported
)
237 pr_debug("Fadump enabled : %s\n",
238 (fw_dump
.fadump_enabled
? "yes" : "no"));
239 pr_debug("Dump Active : %s\n",
240 (fw_dump
.dump_active
? "yes" : "no"));
241 pr_debug("Dump section sizes:\n");
242 pr_debug(" CPU state data size: %lx\n", fw_dump
.cpu_state_data_size
);
243 pr_debug(" HPTE region size : %lx\n", fw_dump
.hpte_region_size
);
244 pr_debug(" Boot memory size : %lx\n", fw_dump
.boot_memory_size
);
245 pr_debug(" Boot memory top : %llx\n", fw_dump
.boot_mem_top
);
246 pr_debug("Boot memory regions cnt: %llx\n", fw_dump
.boot_mem_regs_cnt
);
247 for (i
= 0; i
< fw_dump
.boot_mem_regs_cnt
; i
++) {
248 pr_debug("[%03d] base = %llx, size = %llx\n", i
,
249 fw_dump
.boot_mem_addr
[i
], fw_dump
.boot_mem_sz
[i
]);
254 * fadump_calculate_reserve_size(): reserve variable boot area 5% of System RAM
256 * Function to find the largest memory size we need to reserve during early
257 * boot process. This will be the size of the memory that is required for a
258 * kernel to boot successfully.
260 * This function has been taken from phyp-assisted dump feature implementation.
262 * returns larger of 256MB or 5% rounded down to multiples of 256MB.
264 * TODO: Come up with better approach to find out more accurate memory size
265 * that is required for a kernel to boot successfully.
268 static inline u64
fadump_calculate_reserve_size(void)
270 u64 base
, size
, bootmem_min
;
273 if (fw_dump
.reserve_bootvar
)
274 pr_warn("'fadump_reserve_mem=' parameter is deprecated in favor of 'crashkernel=' parameter.\n");
277 * Check if the size is specified through crashkernel= cmdline
278 * option. If yes, then use that but ignore base as fadump reserves
279 * memory at a predefined offset.
281 ret
= parse_crashkernel(boot_command_line
, memblock_phys_mem_size(),
283 if (ret
== 0 && size
> 0) {
284 unsigned long max_size
;
286 if (fw_dump
.reserve_bootvar
)
287 pr_info("Using 'crashkernel=' parameter for memory reservation.\n");
289 fw_dump
.reserve_bootvar
= (unsigned long)size
;
292 * Adjust if the boot memory size specified is above
295 max_size
= memblock_phys_mem_size() / MAX_BOOT_MEM_RATIO
;
296 if (fw_dump
.reserve_bootvar
> max_size
) {
297 fw_dump
.reserve_bootvar
= max_size
;
298 pr_info("Adjusted boot memory size to %luMB\n",
299 (fw_dump
.reserve_bootvar
>> 20));
302 return fw_dump
.reserve_bootvar
;
303 } else if (fw_dump
.reserve_bootvar
) {
305 * 'fadump_reserve_mem=' is being used to reserve memory
306 * for firmware-assisted dump.
308 return fw_dump
.reserve_bootvar
;
311 /* divide by 20 to get 5% of value */
312 size
= memblock_phys_mem_size() / 20;
314 /* round it down to a multiple of 256MB */
315 size
= size
& ~0x0FFFFFFFUL
;
317 /* Truncate to memory_limit. We don't want to over reserve the memory.*/
318 if (memory_limit
&& size
> memory_limit
)
321 bootmem_min
= fw_dump
.ops
->fadump_get_bootmem_min();
322 return (size
> bootmem_min
? size
: bootmem_min
);
326 * Calculate the total memory size required to be reserved for
327 * firmware-assisted dump registration.
329 static unsigned long get_fadump_area_size(void)
331 unsigned long size
= 0;
333 size
+= fw_dump
.cpu_state_data_size
;
334 size
+= fw_dump
.hpte_region_size
;
335 size
+= fw_dump
.boot_memory_size
;
336 size
+= sizeof(struct fadump_crash_info_header
);
337 size
+= sizeof(struct elfhdr
); /* ELF core header.*/
338 size
+= sizeof(struct elf_phdr
); /* place holder for cpu notes */
339 /* Program headers for crash memory regions. */
340 size
+= sizeof(struct elf_phdr
) * (memblock_num_regions(memory
) + 2);
342 size
= PAGE_ALIGN(size
);
344 /* This is to hold kernel metadata on platforms that support it */
345 size
+= (fw_dump
.ops
->fadump_get_metadata_size
?
346 fw_dump
.ops
->fadump_get_metadata_size() : 0);
350 static int __init
add_boot_mem_region(unsigned long rstart
,
353 int i
= fw_dump
.boot_mem_regs_cnt
++;
355 if (fw_dump
.boot_mem_regs_cnt
> FADUMP_MAX_MEM_REGS
) {
356 fw_dump
.boot_mem_regs_cnt
= FADUMP_MAX_MEM_REGS
;
360 pr_debug("Added boot memory range[%d] [%#016lx-%#016lx)\n",
361 i
, rstart
, (rstart
+ rsize
));
362 fw_dump
.boot_mem_addr
[i
] = rstart
;
363 fw_dump
.boot_mem_sz
[i
] = rsize
;
368 * Firmware usually has a hard limit on the data it can copy per region.
369 * Honour that by splitting a memory range into multiple regions.
371 static int __init
add_boot_mem_regions(unsigned long mstart
,
374 unsigned long rstart
, rsize
, max_size
;
378 max_size
= fw_dump
.max_copy_size
? fw_dump
.max_copy_size
: msize
;
380 if (msize
> max_size
)
385 ret
= add_boot_mem_region(rstart
, rsize
);
396 static int __init
fadump_get_boot_mem_regions(void)
398 unsigned long base
, size
, cur_size
, hole_size
, last_end
;
399 unsigned long mem_size
= fw_dump
.boot_memory_size
;
400 struct memblock_region
*reg
;
403 fw_dump
.boot_mem_regs_cnt
= 0;
408 for_each_memblock(memory
, reg
) {
411 hole_size
+= (base
- last_end
);
413 if ((cur_size
+ size
) >= mem_size
) {
414 size
= (mem_size
- cur_size
);
415 ret
= add_boot_mem_regions(base
, size
);
421 ret
= add_boot_mem_regions(base
, size
);
425 last_end
= base
+ size
;
427 fw_dump
.boot_mem_top
= PAGE_ALIGN(fw_dump
.boot_memory_size
+ hole_size
);
432 int __init
fadump_reserve_mem(void)
434 u64 base
, size
, mem_boundary
, bootmem_min
, align
= PAGE_SIZE
;
435 bool is_memblock_bottom_up
= memblock_bottom_up();
438 if (!fw_dump
.fadump_enabled
)
441 if (!fw_dump
.fadump_supported
) {
442 pr_info("Firmware-Assisted Dump is not supported on this hardware\n");
447 * Initialize boot memory size
448 * If dump is active then we have already calculated the size during
451 if (!fw_dump
.dump_active
) {
452 fw_dump
.boot_memory_size
=
453 PAGE_ALIGN(fadump_calculate_reserve_size());
455 if (!fw_dump
.nocma
) {
456 align
= FADUMP_CMA_ALIGNMENT
;
457 fw_dump
.boot_memory_size
=
458 ALIGN(fw_dump
.boot_memory_size
, align
);
462 bootmem_min
= fw_dump
.ops
->fadump_get_bootmem_min();
463 if (fw_dump
.boot_memory_size
< bootmem_min
) {
464 pr_err("Can't enable fadump with boot memory size (0x%lx) less than 0x%llx\n",
465 fw_dump
.boot_memory_size
, bootmem_min
);
469 if (!fadump_get_boot_mem_regions()) {
470 pr_err("Too many holes in boot memory area to enable fadump\n");
476 * Calculate the memory boundary.
477 * If memory_limit is less than actual memory boundary then reserve
478 * the memory for fadump beyond the memory_limit and adjust the
479 * memory_limit accordingly, so that the running kernel can run with
480 * specified memory_limit.
482 if (memory_limit
&& memory_limit
< memblock_end_of_DRAM()) {
483 size
= get_fadump_area_size();
484 if ((memory_limit
+ size
) < memblock_end_of_DRAM())
485 memory_limit
+= size
;
487 memory_limit
= memblock_end_of_DRAM();
488 printk(KERN_INFO
"Adjusted memory_limit for firmware-assisted"
489 " dump, now %#016llx\n", memory_limit
);
492 mem_boundary
= memory_limit
;
494 mem_boundary
= memblock_end_of_DRAM();
496 base
= fw_dump
.boot_mem_top
;
497 size
= get_fadump_area_size();
498 fw_dump
.reserve_dump_area_size
= size
;
499 if (fw_dump
.dump_active
) {
500 pr_info("Firmware-assisted dump is active.\n");
502 #ifdef CONFIG_HUGETLB_PAGE
504 * FADump capture kernel doesn't care much about hugepages.
505 * In fact, handling hugepages in capture kernel is asking for
506 * trouble. So, disable HugeTLB support when fadump is active.
508 hugetlb_disabled
= true;
511 * If last boot has crashed then reserve all the memory
512 * above boot memory size so that we don't touch it until
513 * dump is written to disk by userspace tool. This memory
514 * can be released for general use by invalidating fadump.
516 fadump_reserve_crash_area(base
);
518 pr_debug("fadumphdr_addr = %#016lx\n", fw_dump
.fadumphdr_addr
);
519 pr_debug("Reserve dump area start address: 0x%lx\n",
520 fw_dump
.reserve_dump_area_start
);
523 * Reserve memory at an offset closer to bottom of the RAM to
524 * minimize the impact of memory hot-remove operation.
526 memblock_set_bottom_up(true);
527 base
= memblock_find_in_range(base
, mem_boundary
, size
, align
);
529 /* Restore the previous allocation mode */
530 memblock_set_bottom_up(is_memblock_bottom_up
);
533 pr_err("Failed to find memory chunk for reservation!\n");
536 fw_dump
.reserve_dump_area_start
= base
;
539 * Calculate the kernel metadata address and register it with
540 * f/w if the platform supports.
542 if (fw_dump
.ops
->fadump_setup_metadata
&&
543 (fw_dump
.ops
->fadump_setup_metadata(&fw_dump
) < 0))
546 if (memblock_reserve(base
, size
)) {
547 pr_err("Failed to reserve memory!\n");
551 pr_info("Reserved %lldMB of memory at %#016llx (System RAM: %lldMB)\n",
552 (size
>> 20), base
, (memblock_phys_mem_size() >> 20));
554 ret
= fadump_cma_init();
559 fw_dump
.fadump_enabled
= 0;
563 /* Look for fadump= cmdline option. */
564 static int __init
early_fadump_param(char *p
)
569 if (strncmp(p
, "on", 2) == 0)
570 fw_dump
.fadump_enabled
= 1;
571 else if (strncmp(p
, "off", 3) == 0)
572 fw_dump
.fadump_enabled
= 0;
573 else if (strncmp(p
, "nocma", 5) == 0) {
574 fw_dump
.fadump_enabled
= 1;
580 early_param("fadump", early_fadump_param
);
583 * Look for fadump_reserve_mem= cmdline option
584 * TODO: Remove references to 'fadump_reserve_mem=' parameter
585 * as soon as users are accustomed to the 'crashkernel=' parameter.
587 static int __init
early_fadump_reserve_mem(char *p
)
590 fw_dump
.reserve_bootvar
= memparse(p
, &p
);
593 early_param("fadump_reserve_mem", early_fadump_reserve_mem
);
595 void crash_fadump(struct pt_regs
*regs
, const char *str
)
597 struct fadump_crash_info_header
*fdh
= NULL
;
598 int old_cpu
, this_cpu
;
600 if (!should_fadump_crash())
604 * old_cpu == -1 means this is the first CPU which has come here,
605 * go ahead and trigger fadump.
607 * old_cpu != -1 means some other CPU is already on its way
608 * to trigger fadump, just keep looping here.
610 this_cpu
= smp_processor_id();
611 old_cpu
= cmpxchg(&crashing_cpu
, -1, this_cpu
);
615 * We can't loop here indefinitely. Wait as long as fadump
616 * is in force. If we race with fadump un-registration this
617 * loop will break and then we go down to normal panic path
618 * and reboot. If fadump is in force the first crashing
619 * cpu will definitely trigger fadump.
621 while (fw_dump
.dump_registered
)
626 fdh
= __va(fw_dump
.fadumphdr_addr
);
627 fdh
->crashing_cpu
= crashing_cpu
;
628 crash_save_vmcoreinfo();
633 ppc_save_regs(&fdh
->regs
);
635 fdh
->online_mask
= *cpu_online_mask
;
637 fw_dump
.ops
->fadump_trigger(fdh
, str
);
640 u32
*fadump_regs_to_elf_notes(u32
*buf
, struct pt_regs
*regs
)
642 struct elf_prstatus prstatus
;
644 memset(&prstatus
, 0, sizeof(prstatus
));
646 * FIXME: How do i get PID? Do I really need it?
647 * prstatus.pr_pid = ????
649 elf_core_copy_kernel_regs(&prstatus
.pr_reg
, regs
);
650 buf
= append_elf_note(buf
, CRASH_CORE_NOTE_NAME
, NT_PRSTATUS
,
651 &prstatus
, sizeof(prstatus
));
655 void fadump_update_elfcore_header(char *bufp
)
658 struct elf_phdr
*phdr
;
660 elf
= (struct elfhdr
*)bufp
;
661 bufp
+= sizeof(struct elfhdr
);
663 /* First note is a place holder for cpu notes info. */
664 phdr
= (struct elf_phdr
*)bufp
;
666 if (phdr
->p_type
== PT_NOTE
) {
667 phdr
->p_paddr
= __pa(fw_dump
.cpu_notes_buf_vaddr
);
668 phdr
->p_offset
= phdr
->p_paddr
;
669 phdr
->p_filesz
= fw_dump
.cpu_notes_buf_size
;
670 phdr
->p_memsz
= fw_dump
.cpu_notes_buf_size
;
675 static void *fadump_alloc_buffer(unsigned long size
)
677 unsigned long count
, i
;
681 vaddr
= alloc_pages_exact(size
, GFP_KERNEL
| __GFP_ZERO
);
685 count
= PAGE_ALIGN(size
) / PAGE_SIZE
;
686 page
= virt_to_page(vaddr
);
687 for (i
= 0; i
< count
; i
++)
688 mark_page_reserved(page
+ i
);
692 static void fadump_free_buffer(unsigned long vaddr
, unsigned long size
)
694 free_reserved_area((void *)vaddr
, (void *)(vaddr
+ size
), -1, NULL
);
697 s32
fadump_setup_cpu_notes_buf(u32 num_cpus
)
699 /* Allocate buffer to hold cpu crash notes. */
700 fw_dump
.cpu_notes_buf_size
= num_cpus
* sizeof(note_buf_t
);
701 fw_dump
.cpu_notes_buf_size
= PAGE_ALIGN(fw_dump
.cpu_notes_buf_size
);
702 fw_dump
.cpu_notes_buf_vaddr
=
703 (unsigned long)fadump_alloc_buffer(fw_dump
.cpu_notes_buf_size
);
704 if (!fw_dump
.cpu_notes_buf_vaddr
) {
705 pr_err("Failed to allocate %ld bytes for CPU notes buffer\n",
706 fw_dump
.cpu_notes_buf_size
);
710 pr_debug("Allocated buffer for cpu notes of size %ld at 0x%lx\n",
711 fw_dump
.cpu_notes_buf_size
,
712 fw_dump
.cpu_notes_buf_vaddr
);
716 void fadump_free_cpu_notes_buf(void)
718 if (!fw_dump
.cpu_notes_buf_vaddr
)
721 fadump_free_buffer(fw_dump
.cpu_notes_buf_vaddr
,
722 fw_dump
.cpu_notes_buf_size
);
723 fw_dump
.cpu_notes_buf_vaddr
= 0;
724 fw_dump
.cpu_notes_buf_size
= 0;
727 static void fadump_free_mem_ranges(struct fadump_mrange_info
*mrange_info
)
729 kfree(mrange_info
->mem_ranges
);
730 mrange_info
->mem_ranges
= NULL
;
731 mrange_info
->mem_ranges_sz
= 0;
732 mrange_info
->max_mem_ranges
= 0;
736 * Allocate or reallocate mem_ranges array in incremental units
739 static int fadump_alloc_mem_ranges(struct fadump_mrange_info
*mrange_info
)
741 struct fadump_memory_range
*new_array
;
744 new_size
= mrange_info
->mem_ranges_sz
+ PAGE_SIZE
;
745 pr_debug("Allocating %llu bytes of memory for %s memory ranges\n",
746 new_size
, mrange_info
->name
);
748 new_array
= krealloc(mrange_info
->mem_ranges
, new_size
, GFP_KERNEL
);
749 if (new_array
== NULL
) {
750 pr_err("Insufficient memory for setting up %s memory ranges\n",
752 fadump_free_mem_ranges(mrange_info
);
756 mrange_info
->mem_ranges
= new_array
;
757 mrange_info
->mem_ranges_sz
= new_size
;
758 mrange_info
->max_mem_ranges
= (new_size
/
759 sizeof(struct fadump_memory_range
));
763 static inline int fadump_add_mem_range(struct fadump_mrange_info
*mrange_info
,
766 struct fadump_memory_range
*mem_ranges
= mrange_info
->mem_ranges
;
767 bool is_adjacent
= false;
774 * Fold adjacent memory ranges to bring down the memory ranges/
775 * PT_LOAD segments count.
777 if (mrange_info
->mem_range_cnt
) {
778 start
= mem_ranges
[mrange_info
->mem_range_cnt
- 1].base
;
779 size
= mem_ranges
[mrange_info
->mem_range_cnt
- 1].size
;
781 if ((start
+ size
) == base
)
785 /* resize the array on reaching the limit */
786 if (mrange_info
->mem_range_cnt
== mrange_info
->max_mem_ranges
) {
789 ret
= fadump_alloc_mem_ranges(mrange_info
);
793 /* Update to the new resized array */
794 mem_ranges
= mrange_info
->mem_ranges
;
798 mem_ranges
[mrange_info
->mem_range_cnt
].base
= start
;
799 mrange_info
->mem_range_cnt
++;
802 mem_ranges
[mrange_info
->mem_range_cnt
- 1].size
= (end
- start
);
803 pr_debug("%s_memory_range[%d] [%#016llx-%#016llx], %#llx bytes\n",
804 mrange_info
->name
, (mrange_info
->mem_range_cnt
- 1),
805 start
, end
- 1, (end
- start
));
809 static int fadump_exclude_reserved_area(u64 start
, u64 end
)
811 u64 ra_start
, ra_end
;
814 ra_start
= fw_dump
.reserve_dump_area_start
;
815 ra_end
= ra_start
+ fw_dump
.reserve_dump_area_size
;
817 if ((ra_start
< end
) && (ra_end
> start
)) {
818 if ((start
< ra_start
) && (end
> ra_end
)) {
819 ret
= fadump_add_mem_range(&crash_mrange_info
,
824 ret
= fadump_add_mem_range(&crash_mrange_info
,
826 } else if (start
< ra_start
) {
827 ret
= fadump_add_mem_range(&crash_mrange_info
,
829 } else if (ra_end
< end
) {
830 ret
= fadump_add_mem_range(&crash_mrange_info
,
834 ret
= fadump_add_mem_range(&crash_mrange_info
, start
, end
);
839 static int fadump_init_elfcore_header(char *bufp
)
843 elf
= (struct elfhdr
*) bufp
;
844 bufp
+= sizeof(struct elfhdr
);
845 memcpy(elf
->e_ident
, ELFMAG
, SELFMAG
);
846 elf
->e_ident
[EI_CLASS
] = ELF_CLASS
;
847 elf
->e_ident
[EI_DATA
] = ELF_DATA
;
848 elf
->e_ident
[EI_VERSION
] = EV_CURRENT
;
849 elf
->e_ident
[EI_OSABI
] = ELF_OSABI
;
850 memset(elf
->e_ident
+EI_PAD
, 0, EI_NIDENT
-EI_PAD
);
851 elf
->e_type
= ET_CORE
;
852 elf
->e_machine
= ELF_ARCH
;
853 elf
->e_version
= EV_CURRENT
;
855 elf
->e_phoff
= sizeof(struct elfhdr
);
857 #if defined(_CALL_ELF)
858 elf
->e_flags
= _CALL_ELF
;
862 elf
->e_ehsize
= sizeof(struct elfhdr
);
863 elf
->e_phentsize
= sizeof(struct elf_phdr
);
865 elf
->e_shentsize
= 0;
873 * Traverse through memblock structure and setup crash memory ranges. These
874 * ranges will be used to create PT_LOAD program headers in elfcore header.
876 static int fadump_setup_crash_memory_ranges(void)
878 struct memblock_region
*reg
;
882 pr_debug("Setup crash memory ranges.\n");
883 crash_mrange_info
.mem_range_cnt
= 0;
886 * Boot memory region(s) registered with firmware are moved to
887 * different location at the time of crash. Create separate program
888 * header(s) for this memory chunk(s) with the correct offset.
890 for (i
= 0; i
< fw_dump
.boot_mem_regs_cnt
; i
++) {
891 start
= fw_dump
.boot_mem_addr
[i
];
892 end
= start
+ fw_dump
.boot_mem_sz
[i
];
893 ret
= fadump_add_mem_range(&crash_mrange_info
, start
, end
);
898 for_each_memblock(memory
, reg
) {
899 start
= (u64
)reg
->base
;
900 end
= start
+ (u64
)reg
->size
;
903 * skip the memory chunk that is already added
904 * (0 through boot_memory_top).
906 if (start
< fw_dump
.boot_mem_top
) {
907 if (end
> fw_dump
.boot_mem_top
)
908 start
= fw_dump
.boot_mem_top
;
913 /* add this range excluding the reserved dump area. */
914 ret
= fadump_exclude_reserved_area(start
, end
);
923 * If the given physical address falls within the boot memory region then
924 * return the relocated address that points to the dump region reserved
925 * for saving initial boot memory contents.
927 static inline unsigned long fadump_relocate(unsigned long paddr
)
929 unsigned long raddr
, rstart
, rend
, rlast
, hole_size
;
935 for (i
= 0; i
< fw_dump
.boot_mem_regs_cnt
; i
++) {
936 rstart
= fw_dump
.boot_mem_addr
[i
];
937 rend
= rstart
+ fw_dump
.boot_mem_sz
[i
];
938 hole_size
+= (rstart
- rlast
);
940 if (paddr
>= rstart
&& paddr
< rend
) {
941 raddr
+= fw_dump
.boot_mem_dest_addr
- hole_size
;
948 pr_debug("vmcoreinfo: paddr = 0x%lx, raddr = 0x%lx\n", paddr
, raddr
);
952 static int fadump_create_elfcore_headers(char *bufp
)
954 unsigned long long raddr
, offset
;
955 struct elf_phdr
*phdr
;
959 fadump_init_elfcore_header(bufp
);
960 elf
= (struct elfhdr
*)bufp
;
961 bufp
+= sizeof(struct elfhdr
);
964 * setup ELF PT_NOTE, place holder for cpu notes info. The notes info
965 * will be populated during second kernel boot after crash. Hence
966 * this PT_NOTE will always be the first elf note.
968 * NOTE: Any new ELF note addition should be placed after this note.
970 phdr
= (struct elf_phdr
*)bufp
;
971 bufp
+= sizeof(struct elf_phdr
);
972 phdr
->p_type
= PT_NOTE
;
984 /* setup ELF PT_NOTE for vmcoreinfo */
985 phdr
= (struct elf_phdr
*)bufp
;
986 bufp
+= sizeof(struct elf_phdr
);
987 phdr
->p_type
= PT_NOTE
;
992 phdr
->p_paddr
= fadump_relocate(paddr_vmcoreinfo_note());
993 phdr
->p_offset
= phdr
->p_paddr
;
994 phdr
->p_memsz
= phdr
->p_filesz
= VMCOREINFO_NOTE_SIZE
;
996 /* Increment number of program headers. */
999 /* setup PT_LOAD sections. */
1002 raddr
= fw_dump
.boot_mem_addr
[0];
1003 for (i
= 0; i
< crash_mrange_info
.mem_range_cnt
; i
++) {
1006 mbase
= crash_mrange_info
.mem_ranges
[i
].base
;
1007 msize
= crash_mrange_info
.mem_ranges
[i
].size
;
1011 phdr
= (struct elf_phdr
*)bufp
;
1012 bufp
+= sizeof(struct elf_phdr
);
1013 phdr
->p_type
= PT_LOAD
;
1014 phdr
->p_flags
= PF_R
|PF_W
|PF_X
;
1015 phdr
->p_offset
= mbase
;
1017 if (mbase
== raddr
) {
1019 * The entire real memory region will be moved by
1020 * firmware to the specified destination_address.
1021 * Hence set the correct offset.
1023 phdr
->p_offset
= fw_dump
.boot_mem_dest_addr
+ offset
;
1024 if (j
< (fw_dump
.boot_mem_regs_cnt
- 1)) {
1025 offset
+= fw_dump
.boot_mem_sz
[j
];
1026 raddr
= fw_dump
.boot_mem_addr
[++j
];
1030 phdr
->p_paddr
= mbase
;
1031 phdr
->p_vaddr
= (unsigned long)__va(mbase
);
1032 phdr
->p_filesz
= msize
;
1033 phdr
->p_memsz
= msize
;
1036 /* Increment number of program headers. */
1042 static unsigned long init_fadump_header(unsigned long addr
)
1044 struct fadump_crash_info_header
*fdh
;
1050 addr
+= sizeof(struct fadump_crash_info_header
);
1052 memset(fdh
, 0, sizeof(struct fadump_crash_info_header
));
1053 fdh
->magic_number
= FADUMP_CRASH_INFO_MAGIC
;
1054 fdh
->elfcorehdr_addr
= addr
;
1055 /* We will set the crashing cpu id in crash_fadump() during crash. */
1056 fdh
->crashing_cpu
= FADUMP_CPU_UNKNOWN
;
1061 static int register_fadump(void)
1068 * If no memory is reserved then we can not register for firmware-
1071 if (!fw_dump
.reserve_dump_area_size
)
1074 ret
= fadump_setup_crash_memory_ranges();
1078 addr
= fw_dump
.fadumphdr_addr
;
1080 /* Initialize fadump crash info header. */
1081 addr
= init_fadump_header(addr
);
1084 pr_debug("Creating ELF core headers at %#016lx\n", addr
);
1085 fadump_create_elfcore_headers(vaddr
);
1087 /* register the future kernel dump with firmware. */
1088 pr_debug("Registering for firmware-assisted kernel dump...\n");
1089 return fw_dump
.ops
->fadump_register(&fw_dump
);
1092 void fadump_cleanup(void)
1094 if (!fw_dump
.fadump_supported
)
1097 /* Invalidate the registration only if dump is active. */
1098 if (fw_dump
.dump_active
) {
1099 pr_debug("Invalidating firmware-assisted dump registration\n");
1100 fw_dump
.ops
->fadump_invalidate(&fw_dump
);
1101 } else if (fw_dump
.dump_registered
) {
1102 /* Un-register Firmware-assisted dump if it was registered. */
1103 fw_dump
.ops
->fadump_unregister(&fw_dump
);
1104 fadump_free_mem_ranges(&crash_mrange_info
);
1107 if (fw_dump
.ops
->fadump_cleanup
)
1108 fw_dump
.ops
->fadump_cleanup(&fw_dump
);
1111 static void fadump_free_reserved_memory(unsigned long start_pfn
,
1112 unsigned long end_pfn
)
1115 unsigned long time_limit
= jiffies
+ HZ
;
1117 pr_info("freeing reserved memory (0x%llx - 0x%llx)\n",
1118 PFN_PHYS(start_pfn
), PFN_PHYS(end_pfn
));
1120 for (pfn
= start_pfn
; pfn
< end_pfn
; pfn
++) {
1121 free_reserved_page(pfn_to_page(pfn
));
1123 if (time_after(jiffies
, time_limit
)) {
1125 time_limit
= jiffies
+ HZ
;
1131 * Skip memory holes and free memory that was actually reserved.
1133 static void fadump_release_reserved_area(u64 start
, u64 end
)
1135 u64 tstart
, tend
, spfn
, epfn
;
1136 struct memblock_region
*reg
;
1138 spfn
= PHYS_PFN(start
);
1139 epfn
= PHYS_PFN(end
);
1140 for_each_memblock(memory
, reg
) {
1141 tstart
= max_t(u64
, spfn
, memblock_region_memory_base_pfn(reg
));
1142 tend
= min_t(u64
, epfn
, memblock_region_memory_end_pfn(reg
));
1143 if (tstart
< tend
) {
1144 fadump_free_reserved_memory(tstart
, tend
);
1155 * Sort the mem ranges in-place and merge adjacent ranges
1156 * to minimize the memory ranges count.
1158 static void sort_and_merge_mem_ranges(struct fadump_mrange_info
*mrange_info
)
1160 struct fadump_memory_range
*mem_ranges
;
1161 struct fadump_memory_range tmp_range
;
1165 if (!reserved_mrange_info
.mem_range_cnt
)
1168 /* Sort the memory ranges */
1169 mem_ranges
= mrange_info
->mem_ranges
;
1170 for (i
= 0; i
< mrange_info
->mem_range_cnt
; i
++) {
1172 for (j
= (i
+ 1); j
< mrange_info
->mem_range_cnt
; j
++) {
1173 if (mem_ranges
[idx
].base
> mem_ranges
[j
].base
)
1177 tmp_range
= mem_ranges
[idx
];
1178 mem_ranges
[idx
] = mem_ranges
[i
];
1179 mem_ranges
[i
] = tmp_range
;
1183 /* Merge adjacent reserved ranges */
1185 for (i
= 1; i
< mrange_info
->mem_range_cnt
; i
++) {
1186 base
= mem_ranges
[i
-1].base
;
1187 size
= mem_ranges
[i
-1].size
;
1188 if (mem_ranges
[i
].base
== (base
+ size
))
1189 mem_ranges
[idx
].size
+= mem_ranges
[i
].size
;
1195 mem_ranges
[idx
] = mem_ranges
[i
];
1198 mrange_info
->mem_range_cnt
= idx
+ 1;
1202 * Scan reserved-ranges to consider them while reserving/releasing
1203 * memory for FADump.
1205 static inline int fadump_scan_reserved_mem_ranges(void)
1207 struct device_node
*root
;
1212 root
= of_find_node_by_path("/");
1216 prop
= of_get_property(root
, "reserved-ranges", &len
);
1221 * Each reserved range is an (address,size) pair, 2 cells each,
1222 * totalling 4 cells per range.
1224 for (i
= 0; i
< len
/ (sizeof(*prop
) * 4); i
++) {
1227 base
= of_read_number(prop
+ (i
* 4) + 0, 2);
1228 size
= of_read_number(prop
+ (i
* 4) + 2, 2);
1231 ret
= fadump_add_mem_range(&reserved_mrange_info
,
1234 pr_warn("some reserved ranges are ignored!\n");
1244 * Release the memory that was reserved during early boot to preserve the
1245 * crash'ed kernel's memory contents except reserved dump area (permanent
1246 * reservation) and reserved ranges used by F/W. The released memory will
1247 * be available for general use.
1249 static void fadump_release_memory(u64 begin
, u64 end
)
1251 u64 ra_start
, ra_end
, tstart
;
1254 fadump_scan_reserved_mem_ranges();
1256 ra_start
= fw_dump
.reserve_dump_area_start
;
1257 ra_end
= ra_start
+ fw_dump
.reserve_dump_area_size
;
1260 * Add reserved dump area to reserved ranges list
1261 * and exclude all these ranges while releasing memory.
1263 ret
= fadump_add_mem_range(&reserved_mrange_info
, ra_start
, ra_end
);
1266 * Not enough memory to setup reserved ranges but the system is
1267 * running short of memory. So, release all the memory except
1268 * Reserved dump area (reused for next fadump registration).
1270 if (begin
< ra_end
&& end
> ra_start
) {
1271 if (begin
< ra_start
)
1272 fadump_release_reserved_area(begin
, ra_start
);
1274 fadump_release_reserved_area(ra_end
, end
);
1276 fadump_release_reserved_area(begin
, end
);
1281 /* Get the reserved ranges list in order first. */
1282 sort_and_merge_mem_ranges(&reserved_mrange_info
);
1284 /* Exclude reserved ranges and release remaining memory */
1286 for (i
= 0; i
< reserved_mrange_info
.mem_range_cnt
; i
++) {
1287 ra_start
= reserved_mrange_info
.mem_ranges
[i
].base
;
1288 ra_end
= ra_start
+ reserved_mrange_info
.mem_ranges
[i
].size
;
1290 if (tstart
>= ra_end
)
1293 if (tstart
< ra_start
)
1294 fadump_release_reserved_area(tstart
, ra_start
);
1299 fadump_release_reserved_area(tstart
, end
);
1302 static void fadump_invalidate_release_mem(void)
1304 mutex_lock(&fadump_mutex
);
1305 if (!fw_dump
.dump_active
) {
1306 mutex_unlock(&fadump_mutex
);
1311 mutex_unlock(&fadump_mutex
);
1313 fadump_release_memory(fw_dump
.boot_mem_top
, memblock_end_of_DRAM());
1314 fadump_free_cpu_notes_buf();
1317 * Setup kernel metadata and initialize the kernel dump
1318 * memory structure for FADump re-registration.
1320 if (fw_dump
.ops
->fadump_setup_metadata
&&
1321 (fw_dump
.ops
->fadump_setup_metadata(&fw_dump
) < 0))
1322 pr_warn("Failed to setup kernel metadata!\n");
1323 fw_dump
.ops
->fadump_init_mem_struct(&fw_dump
);
1326 static ssize_t
fadump_release_memory_store(struct kobject
*kobj
,
1327 struct kobj_attribute
*attr
,
1328 const char *buf
, size_t count
)
1332 if (!fw_dump
.dump_active
)
1335 if (kstrtoint(buf
, 0, &input
))
1340 * Take away the '/proc/vmcore'. We are releasing the dump
1341 * memory, hence it will not be valid anymore.
1343 #ifdef CONFIG_PROC_VMCORE
1346 fadump_invalidate_release_mem();
1353 static ssize_t
fadump_enabled_show(struct kobject
*kobj
,
1354 struct kobj_attribute
*attr
,
1357 return sprintf(buf
, "%d\n", fw_dump
.fadump_enabled
);
1360 static ssize_t
fadump_register_show(struct kobject
*kobj
,
1361 struct kobj_attribute
*attr
,
1364 return sprintf(buf
, "%d\n", fw_dump
.dump_registered
);
1367 static ssize_t
fadump_register_store(struct kobject
*kobj
,
1368 struct kobj_attribute
*attr
,
1369 const char *buf
, size_t count
)
1374 if (!fw_dump
.fadump_enabled
|| fw_dump
.dump_active
)
1377 if (kstrtoint(buf
, 0, &input
))
1380 mutex_lock(&fadump_mutex
);
1384 if (fw_dump
.dump_registered
== 0) {
1388 /* Un-register Firmware-assisted dump */
1389 pr_debug("Un-register firmware-assisted dump\n");
1390 fw_dump
.ops
->fadump_unregister(&fw_dump
);
1393 if (fw_dump
.dump_registered
== 1) {
1394 /* Un-register Firmware-assisted dump */
1395 fw_dump
.ops
->fadump_unregister(&fw_dump
);
1397 /* Register Firmware-assisted dump */
1398 ret
= register_fadump();
1406 mutex_unlock(&fadump_mutex
);
1407 return ret
< 0 ? ret
: count
;
1410 static int fadump_region_show(struct seq_file
*m
, void *private)
1412 if (!fw_dump
.fadump_enabled
)
1415 mutex_lock(&fadump_mutex
);
1416 fw_dump
.ops
->fadump_region_show(&fw_dump
, m
);
1417 mutex_unlock(&fadump_mutex
);
1421 static struct kobj_attribute fadump_release_attr
= __ATTR(fadump_release_mem
,
1423 fadump_release_memory_store
);
1424 static struct kobj_attribute fadump_attr
= __ATTR(fadump_enabled
,
1425 0444, fadump_enabled_show
,
1427 static struct kobj_attribute fadump_register_attr
= __ATTR(fadump_registered
,
1428 0644, fadump_register_show
,
1429 fadump_register_store
);
1431 DEFINE_SHOW_ATTRIBUTE(fadump_region
);
1433 static void fadump_init_files(void)
1435 struct dentry
*debugfs_file
;
1438 rc
= sysfs_create_file(kernel_kobj
, &fadump_attr
.attr
);
1440 printk(KERN_ERR
"fadump: unable to create sysfs file"
1441 " fadump_enabled (%d)\n", rc
);
1443 rc
= sysfs_create_file(kernel_kobj
, &fadump_register_attr
.attr
);
1445 printk(KERN_ERR
"fadump: unable to create sysfs file"
1446 " fadump_registered (%d)\n", rc
);
1448 debugfs_file
= debugfs_create_file("fadump_region", 0444,
1449 powerpc_debugfs_root
, NULL
,
1450 &fadump_region_fops
);
1452 printk(KERN_ERR
"fadump: unable to create debugfs file"
1453 " fadump_region\n");
1455 if (fw_dump
.dump_active
) {
1456 rc
= sysfs_create_file(kernel_kobj
, &fadump_release_attr
.attr
);
1458 printk(KERN_ERR
"fadump: unable to create sysfs file"
1459 " fadump_release_mem (%d)\n", rc
);
1465 * Prepare for firmware-assisted dump.
1467 int __init
setup_fadump(void)
1469 if (!fw_dump
.fadump_supported
)
1472 fadump_init_files();
1473 fadump_show_config();
1475 if (!fw_dump
.fadump_enabled
)
1479 * If dump data is available then see if it is valid and prepare for
1480 * saving it to the disk.
1482 if (fw_dump
.dump_active
) {
1484 * if dump process fails then invalidate the registration
1485 * and release memory before proceeding for re-registration.
1487 if (fw_dump
.ops
->fadump_process(&fw_dump
) < 0)
1488 fadump_invalidate_release_mem();
1490 /* Initialize the kernel dump memory structure for FAD registration. */
1491 else if (fw_dump
.reserve_dump_area_size
)
1492 fw_dump
.ops
->fadump_init_mem_struct(&fw_dump
);
1496 subsys_initcall(setup_fadump
);
1497 #else /* !CONFIG_PRESERVE_FA_DUMP */
1499 /* Scan the Firmware Assisted dump configuration details. */
1500 int __init
early_init_dt_scan_fw_dump(unsigned long node
, const char *uname
,
1501 int depth
, void *data
)
1503 if ((depth
!= 1) || (strcmp(uname
, "ibm,opal") != 0))
1506 opal_fadump_dt_scan(&fw_dump
, node
);
1511 * When dump is active but PRESERVE_FA_DUMP is enabled on the kernel,
1512 * preserve crash data. The subsequent memory preserving kernel boot
1513 * is likely to process this crash data.
1515 int __init
fadump_reserve_mem(void)
1517 if (fw_dump
.dump_active
) {
1519 * If last boot has crashed then reserve all the memory
1520 * above boot memory to preserve crash data.
1522 pr_info("Preserving crash data for processing in next boot.\n");
1523 fadump_reserve_crash_area(fw_dump
.boot_mem_top
);
1525 pr_debug("FADump-aware kernel..\n");
1529 #endif /* CONFIG_PRESERVE_FA_DUMP */
1531 /* Preserve everything above the base address */
1532 static void __init
fadump_reserve_crash_area(u64 base
)
1534 struct memblock_region
*reg
;
1537 for_each_memblock(memory
, reg
) {
1541 if ((mstart
+ msize
) < base
)
1544 if (mstart
< base
) {
1545 msize
-= (base
- mstart
);
1549 pr_info("Reserving %lluMB of memory at %#016llx for preserving crash data",
1550 (msize
>> 20), mstart
);
1551 memblock_reserve(mstart
, msize
);
1555 unsigned long __init
arch_reserved_kernel_pages(void)
1557 return memblock_reserved_size() / PAGE_SIZE
;