// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2016, Rashmica Gupta, IBM Corp.
 *
 * This traverses the kernel pagetables and dumps the
 * information about the used sections of memory to
 * /sys/kernel/debug/kernel_page_tables.
 *
 * Derived from the arm64 implementation:
 * Copyright (c) 2014, The Linux Foundation, Laura Abbott.
 * (C) Copyright 2008 Intel Corporation, Arjan van de Ven.
 */
#include <linux/debugfs.h>
#include <linux/fs.h>
#include <linux/hugetlb.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <asm/fixmap.h>
#include <linux/const.h>
#include <asm/page.h>
#include <asm/hugetlb.h>

#include <mm/mmu_decl.h>

#include "ptdump.h"
/*
 * To visualise what is happening,
 *
 *  - PTRS_PER_P** = how many entries there are in the corresponding P**
 *  - P**_SHIFT = how many bits of the address we use to index into the
 *    corresponding P**
 *  - P**_SIZE is how much memory we can access through the table - not the
 *    size of the table itself.
 * P**={PGD, PUD, PMD, PTE}
 *
 * Each entry of the PGD points to a PUD. Each entry of a PUD points to a
 * PMD. Each entry of a PMD points to a PTE. And every PTE entry points to
 * a page.
 *
 * In the case where there are only 3 levels, the PUD is folded into the
 * PGD: every PUD has only one entry which points to the PMD.
 *
 * The page dumper groups page table entries of the same type into a single
 * description. It uses pg_state to track the range information while
 * iterating over the PTE entries. When the continuity is broken it then
 * dumps out a description of the range - ie PTEs that are virtually contiguous
 * with the same PTE flags are chunked together. This is to make it clear how
 * different areas of the kernel virtual memory are used.
 */
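/*
 * A worked example (illustrative only - the real values depend on the
 * platform and page size config): with 4K pages, PAGE_SHIFT = 12, so a
 * PTE maps 1UL << 12 = 4K. If PTRS_PER_PTE were 512 (9 bits of index),
 * PMD_SHIFT would be 12 + 9 = 21 and PMD_SIZE = 1UL << 21 = 2M - each
 * level up the tree multiplies the reach by its number of entries.
 */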
struct pg_state {
	struct seq_file *seq;
	const struct addr_marker *marker;
	unsigned long start_address;
	unsigned long start_pa;
	unsigned long last_pa;
	unsigned long page_size;
	unsigned int level;
	u64 current_flags;
	bool check_wx;
	unsigned long wx_pages;
};

struct addr_marker {
	unsigned long start_address;
	const char *name;
};
static struct addr_marker address_markers[] = {
	{ 0,	"Start of kernel VM" },
#ifdef MODULES_VADDR
	{ 0,	"modules start" },
	{ 0,	"modules end" },
#endif
	{ 0,	"vmalloc() Area" },
	{ 0,	"vmalloc() End" },
#ifdef CONFIG_PPC64
	{ 0,	"isa I/O start" },
	{ 0,	"isa I/O end" },
	{ 0,	"phb I/O start" },
	{ 0,	"phb I/O end" },
	{ 0,	"I/O remap start" },
	{ 0,	"I/O remap end" },
	{ 0,	"vmemmap start" },
#else
	{ 0,	"Early I/O remap start" },
	{ 0,	"Early I/O remap end" },
#ifdef CONFIG_HIGHMEM
	{ 0,	"Highmem PTEs start" },
	{ 0,	"Highmem PTEs end" },
#endif
	{ 0,	"Fixmap start" },
	{ 0,	"Fixmap end" },
#ifdef CONFIG_KASAN
	{ 0,	"kasan shadow mem start" },
	{ 0,	"kasan shadow mem end" },
#endif
#endif /* CONFIG_PPC64 */
	{ -1,	NULL },
};
#define pt_dump_seq_printf(m, fmt, args...)	\
({						\
	if (m)					\
		seq_printf(m, fmt, ##args);	\
})

#define pt_dump_seq_putc(m, c)	\
({				\
	if (m)			\
		seq_putc(m, c);	\
})
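/*
 * Both wrappers deliberately tolerate a NULL seq_file: ptdump_check_wx()
 * below walks the page tables with no seq_file at all (it only wants the
 * W+X accounting), so all printing in the walker funnels through these
 * guarded macros.
 */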
void pt_dump_size(struct seq_file *m, unsigned long size)
{
	static const char units[] = "KMGTPE";
	const char *unit = units;

	/* Work out what appropriate unit to use */
	while (!(size & 1023) && unit[1]) {
		size >>= 10;
		unit++;
	}
	pt_dump_seq_printf(m, "%9lu%c ", size, *unit);
}
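/*
 * Callers of pt_dump_size() above pass a size already shifted down to
 * KB, hence the unit table starting at 'K'. For example size = 4096
 * prints "4M": 4096 is a multiple of 1024, so the loop shifts it down
 * once and advances to the next unit before printing.
 */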
static void dump_flag_info(struct pg_state *st, const struct flag_info
		*flag, u64 pte, int num)
{
	unsigned int i;

	for (i = 0; i < num; i++, flag++) {
		const char *s = NULL;
		u64 val;

		/* flag not defined so don't check it */
		if (flag->mask == 0)
			continue;
		/* Some 'flags' are actually values */
		if (flag->is_val) {
			val = pte & flag->val;
			if (flag->shift)
				val = val >> flag->shift;
			pt_dump_seq_printf(st->seq, "  %s:%llx", flag->set, val);
		} else {
			if ((pte & flag->mask) == flag->val)
				s = flag->set;
			else
				s = flag->clear;
			if (s)
				pt_dump_seq_printf(st->seq, "  %s", s);
		}
		st->current_flags &= ~flag->mask;
	}
	if (st->current_flags != 0)
		pt_dump_seq_printf(st->seq, "  unknown flags:%llx", st->current_flags);
}
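/*
 * A hypothetical flag_info entry such as { .mask = _PAGE_RW,
 * .val = _PAGE_RW, .set = "rw", .clear = "r " } (illustrative only -
 * the real tables live in the per-platform ptdump sources) would make
 * dump_flag_info() print "rw" for a writable PTE and "r " otherwise.
 */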
static void dump_addr(struct pg_state *st, unsigned long addr)
{
	unsigned long delta;

#ifdef CONFIG_PPC64
#define REG		"0x%016lx"
#else
#define REG		"0x%08lx"
#endif

	pt_dump_seq_printf(st->seq, REG "-" REG " ", st->start_address, addr - 1);
	if (st->start_pa == st->last_pa && st->start_address + st->page_size != addr) {
		pt_dump_seq_printf(st->seq, "[" REG "]", st->start_pa);
		delta = st->page_size >> 10;
	} else {
		pt_dump_seq_printf(st->seq, " " REG " ", st->start_pa);
		delta = (addr - st->start_address) >> 10;
	}
	pt_dump_size(st->seq, delta);
}
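/*
 * Put together, dump_addr() emits lines of the rough shape (addresses
 * illustrative only):
 *   0xc000000000000000-0xc0000000003fffff  0x0000000000000000        4M
 * where the physical address is printed as [pa] when the same physical
 * page is mapped repeatedly across the range.
 */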
static void note_prot_wx(struct pg_state *st, unsigned long addr)
{
	pte_t pte = __pte(st->current_flags);

	if (!IS_ENABLED(CONFIG_PPC_DEBUG_WX) || !st->check_wx)
		return;

	if (!pte_write(pte) || !pte_exec(pte))
		return;

	WARN_ONCE(1, "powerpc/mm: Found insecure W+X mapping at address %p/%pS\n",
		  (void *)st->start_address, (void *)st->start_address);

	st->wx_pages += (addr - st->start_address) / PAGE_SIZE;
}
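/*
 * WARN_ONCE() above fires only for the first offending range, but
 * wx_pages keeps accumulating, so ptdump_check_wx() still reports the
 * total number of W+X pages at the end of the walk.
 */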
static void note_page_update_state(struct pg_state *st, unsigned long addr,
				   unsigned int level, u64 val, unsigned long page_size)
{
	u64 flag = val & pg_level[level].mask;
	u64 pa = val & PTE_RPN_MASK;

	st->level = level;
	st->current_flags = flag;
	st->start_address = addr;
	st->start_pa = pa;
	st->page_size = page_size;

	while (addr >= st->marker[1].start_address) {
		st->marker++;
		pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
	}
}
static void note_page(struct pg_state *st, unsigned long addr,
		      unsigned int level, u64 val, unsigned long page_size)
{
	u64 flag = val & pg_level[level].mask;
	u64 pa = val & PTE_RPN_MASK;

	/* At first no level is set */
	if (!st->level) {
		pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
		note_page_update_state(st, addr, level, val, page_size);
	/*
	 * Dump the section of virtual memory when:
	 *   - the PTE flags from one entry to the next differ.
	 *   - we change levels in the tree.
	 *   - the address is in a different section of memory and is thus
	 *     used for a different purpose, regardless of the flags.
	 *   - the pa of this page is not adjacent to the last inspected page.
	 */
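	/*
	 * For instance (illustrative numbers): two adjacent 4K PTEs mapping
	 * pa 0x1000 then pa 0x2000 with identical flags extend the current
	 * range, while a jump to pa 0x5000 fails the contiguity test below
	 * and flushes the accumulated range out first.
	 */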
	} else if (flag != st->current_flags || level != st->level ||
		   addr >= st->marker[1].start_address ||
		   (pa != st->last_pa + st->page_size &&
		    (pa != st->start_pa || st->start_pa != st->last_pa))) {

		/* Check the PTE flags */
		if (st->current_flags) {
			note_prot_wx(st, addr);
			dump_addr(st, addr);

			/* Dump all the flags */
			if (pg_level[st->level].flag)
				dump_flag_info(st, pg_level[st->level].flag,
					       st->current_flags,
					       pg_level[st->level].num);

			pt_dump_seq_putc(st->seq, '\n');
		}

		/*
		 * Address indicates we have passed the end of the
		 * current section of virtual memory
		 */
		note_page_update_state(st, addr, level, val, page_size);
	}
	st->last_pa = pa;
}
static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start)
{
	pte_t *pte = pte_offset_kernel(pmd, 0);
	unsigned long addr;
	unsigned int i;

	for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
		addr = start + i * PAGE_SIZE;
		note_page(st, addr, 4, pte_val(*pte), PAGE_SIZE);
	}
}
static void walk_hugepd(struct pg_state *st, hugepd_t *phpd, unsigned long start,
			int pdshift, int level)
{
#ifdef CONFIG_ARCH_HAS_HUGEPD
	unsigned int i;
	int shift = hugepd_shift(*phpd);
	int ptrs_per_hpd = pdshift - shift > 0 ? 1 << (pdshift - shift) : 1;

	if (start & ((1 << shift) - 1))
		return;

	for (i = 0; i < ptrs_per_hpd; i++) {
		unsigned long addr = start + (i << shift);
		pte_t *pte = hugepte_offset(*phpd, addr, pdshift);

		note_page(st, addr, level + 1, pte_val(*pte), 1 << shift);
	}
#endif
}
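/*
 * Worked example for the maths above (made-up numbers): pdshift = 24 and
 * hugepd_shift() = 20 give 1 << (24 - 20) = 16 hugepte entries, each
 * covering 1 << 20 = 1M of the 16M slot; when shift >= pdshift the
 * directory collapses to a single entry.
 */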
static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start)
{
	pmd_t *pmd = pmd_offset(pud, 0);
	unsigned long addr;
	unsigned int i;

	for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
		addr = start + i * PMD_SIZE;
		if (!pmd_none(*pmd) && !pmd_is_leaf(*pmd))
			/* pmd exists */
			walk_pte(st, pmd, addr);
		else
			note_page(st, addr, 3, pmd_val(*pmd), PMD_SIZE);
	}
}
static void walk_pud(struct pg_state *st, p4d_t *p4d, unsigned long start)
{
	pud_t *pud = pud_offset(p4d, 0);
	unsigned long addr;
	unsigned int i;

	for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
		addr = start + i * PUD_SIZE;
		if (!pud_none(*pud) && !pud_is_leaf(*pud))
			/* pud exists */
			walk_pmd(st, pud, addr);
		else
			note_page(st, addr, 2, pud_val(*pud), PUD_SIZE);
	}
}
static void walk_pagetables(struct pg_state *st)
{
	unsigned int i;
	unsigned long addr = st->start_address & PGDIR_MASK;
	pgd_t *pgd = pgd_offset_k(addr);

	/*
	 * Traverse the linux pagetable structure and dump pages that are in
	 * the hash pagetable.
	 */
	for (i = pgd_index(addr); i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) {
		p4d_t *p4d = p4d_offset(pgd, 0);

		if (p4d_none(*p4d) || p4d_is_leaf(*p4d))
			note_page(st, addr, 1, p4d_val(*p4d), PGDIR_SIZE);
		else if (is_hugepd(__hugepd(p4d_val(*p4d))))
			walk_hugepd(st, (hugepd_t *)p4d, addr, PGDIR_SHIFT, 1);
		else
			/* p4d exists */
			walk_pud(st, p4d, addr);
	}
}
static void populate_markers(void)
{
	int i = 0;

#ifdef CONFIG_PPC64
	address_markers[i++].start_address = PAGE_OFFSET;
#else
	address_markers[i++].start_address = TASK_SIZE;
#endif
#ifdef MODULES_VADDR
	address_markers[i++].start_address = MODULES_VADDR;
	address_markers[i++].start_address = MODULES_END;
#endif
	address_markers[i++].start_address = VMALLOC_START;
	address_markers[i++].start_address = VMALLOC_END;
#ifdef CONFIG_PPC64
	address_markers[i++].start_address = ISA_IO_BASE;
	address_markers[i++].start_address = ISA_IO_END;
	address_markers[i++].start_address = PHB_IO_BASE;
	address_markers[i++].start_address = PHB_IO_END;
	address_markers[i++].start_address = IOREMAP_BASE;
	address_markers[i++].start_address = IOREMAP_END;
	/* Book3S-64 places its vmemmap marker at the hash MMU layout's base */
#ifdef CONFIG_PPC_BOOK3S_64
	address_markers[i++].start_address = H_VMEMMAP_START;
#else
	address_markers[i++].start_address = VMEMMAP_BASE;
#endif
#else /* !CONFIG_PPC64 */
	address_markers[i++].start_address = ioremap_bot;
	address_markers[i++].start_address = IOREMAP_TOP;
#ifdef CONFIG_HIGHMEM
	address_markers[i++].start_address = PKMAP_BASE;
	address_markers[i++].start_address = PKMAP_ADDR(LAST_PKMAP);
#endif /* CONFIG_HIGHMEM */
	address_markers[i++].start_address = FIXADDR_START;
	address_markers[i++].start_address = FIXADDR_TOP;
#ifdef CONFIG_KASAN
	address_markers[i++].start_address = KASAN_SHADOW_START;
	address_markers[i++].start_address = KASAN_SHADOW_END;
#endif /* CONFIG_KASAN */
#endif /* CONFIG_PPC64 */
}
static int ptdump_show(struct seq_file *m, void *v)
{
	struct pg_state st = {
		.seq = m,
		.marker = address_markers,
		.start_address = IS_ENABLED(CONFIG_PPC64) ? PAGE_OFFSET : TASK_SIZE,
	};

#ifdef CONFIG_PPC64
	if (!radix_enabled())
		st.start_address = KERN_VIRT_START;
#endif

	/* Traverse kernel page tables */
	walk_pagetables(&st);
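	/*
	 * This sentinel call can never match the accumulated state (no
	 * real entry has level 0), so it forces the final pending range
	 * to be flushed out.
	 */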
	note_page(&st, 0, 0, 0, 0);
	return 0;
}
static int ptdump_open(struct inode *inode, struct file *file)
{
	return single_open(file, ptdump_show, NULL);
}
static const struct file_operations ptdump_fops = {
	.open		= ptdump_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
static void build_pgtable_complete_mask(void)
{
	unsigned int i, j;

	for (i = 0; i < ARRAY_SIZE(pg_level); i++)
		if (pg_level[i].flag)
			for (j = 0; j < pg_level[i].num; j++)
				pg_level[i].mask |= pg_level[i].flag[j].mask;
}
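/*
 * pg_level[].mask starts out empty; pre-computing the union of every
 * flag's mask per level lets note_page() reduce a raw PTE value to its
 * known flag bits with a single AND.
 */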
#ifdef CONFIG_PPC_DEBUG_WX
void ptdump_check_wx(void)
{
	struct pg_state st = {
		.seq = NULL,
		.marker = address_markers,
		.check_wx = true,
		.start_address = IS_ENABLED(CONFIG_PPC64) ? PAGE_OFFSET : TASK_SIZE,
	};

#ifdef CONFIG_PPC64
	if (!radix_enabled())
		st.start_address = KERN_VIRT_START;
#endif

	walk_pagetables(&st);

	if (st.wx_pages)
		pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n",
			st.wx_pages);
	else
		pr_info("Checked W+X mappings: passed, no W+X pages found\n");
}
#endif
static int ptdump_init(void)
{
	populate_markers();
	build_pgtable_complete_mask();
	debugfs_create_file("kernel_page_tables", 0400, NULL, NULL,
			    &ptdump_fops);
	return 0;
}
device_initcall(ptdump_init);
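/*
 * With debugfs mounted at the usual location, the dump can then be read
 * (as root, given the 0400 mode) with:
 *   cat /sys/kernel/debug/kernel_page_tables
 */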