// SPDX-License-Identifier: GPL-2.0
/*
 *    Copyright IBM Corp. 2006
 */

#include <linux/memory_hotplug.h>
#include <linux/memblock.h>
#include <linux/pfn.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include <asm/page-states.h>
#include <asm/abs_lowcore.h>
#include <asm/cacheflush.h>
#include <asm/maccess.h>
#include <asm/nospec-branch.h>
#include <asm/ctlreg.h>
#include <asm/pgalloc.h>
#include <asm/setup.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/set_memory.h>
#include <asm/physmem_info.h>

static DEFINE_MUTEX(vmem_mutex);

static void __ref *vmem_alloc_pages(unsigned int order)
{
	unsigned long size = PAGE_SIZE << order;

	if (slab_is_available())
		return (void *)__get_free_pages(GFP_KERNEL, order);
	return memblock_alloc(size, size);
}
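
/*
 * Note (added for clarity): before the slab allocator is up, page table
 * backing store comes from memblock; later allocations use the normal
 * page and page-table allocators. The free paths below mirror this split.
 */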

static void vmem_free_pages(unsigned long addr, int order, struct vmem_altmap *altmap)
{
	if (altmap) {
		vmem_altmap_free(altmap, 1 << order);
		return;
	}
	/* We don't expect boot memory to be removed ever. */
	if (!slab_is_available() ||
	    WARN_ON_ONCE(PageReserved(virt_to_page((void *)addr))))
		return;
	free_pages(addr, order);
}

void *vmem_crst_alloc(unsigned long val)
{
	unsigned long *table;

	table = vmem_alloc_pages(CRST_ALLOC_ORDER);
	if (!table)
		return NULL;
	crst_table_init(table, val);
	__arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER);
	return table;
}

pte_t __ref *vmem_pte_alloc(void)
{
	unsigned long size = PTRS_PER_PTE * sizeof(pte_t);
	pte_t *pte;

	if (slab_is_available())
		pte = (pte_t *) page_table_alloc(&init_mm);
	else
		pte = (pte_t *) memblock_alloc(size, size);
	if (!pte)
		return NULL;
	memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE);
	__arch_set_page_dat(pte, 1);
	return pte;
}

static void vmem_pte_free(unsigned long *table)
{
	/* We don't expect boot memory to be removed ever. */
	if (!slab_is_available() ||
	    WARN_ON_ONCE(PageReserved(virt_to_page(table))))
		return;
	page_table_free(&init_mm, table);
}

#define PAGE_UNUSED 0xFD

/*
 * The unused vmemmap range, which was not yet memset(PAGE_UNUSED), ranges
 * from unused_sub_pmd_start to the next PMD_SIZE boundary.
 */
static unsigned long unused_sub_pmd_start;
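
/*
 * Note (added for clarity): sub-PMD tracking works by poisoning currently
 * unused parts of a PMD-mapped memmap frame with the PAGE_UNUSED byte
 * pattern. A frame whose bytes are all PAGE_UNUSED holds no live struct
 * pages and may be freed; see vmemmap_unuse_sub_pmd() below.
 */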

static void vmemmap_flush_unused_sub_pmd(void)
{
	if (!unused_sub_pmd_start)
		return;
	memset((void *)unused_sub_pmd_start, PAGE_UNUSED,
	       ALIGN(unused_sub_pmd_start, PMD_SIZE) - unused_sub_pmd_start);
	unused_sub_pmd_start = 0;
}

static void vmemmap_mark_sub_pmd_used(unsigned long start, unsigned long end)
{
	/*
	 * As we expect to add in the same granularity as we remove, it's
	 * sufficient to mark only some piece used to block the memmap page from
	 * getting removed (just in case the memmap never gets initialized,
	 * e.g., because the memory block never gets onlined).
	 */
	memset((void *)start, 0, sizeof(struct page));
}

static void vmemmap_use_sub_pmd(unsigned long start, unsigned long end)
{
	/*
	 * We only optimize if the new used range directly follows the
	 * previously unused range (esp., when populating consecutive sections).
	 */
	if (unused_sub_pmd_start == start) {
		unused_sub_pmd_start = end;
		if (likely(IS_ALIGNED(unused_sub_pmd_start, PMD_SIZE)))
			unused_sub_pmd_start = 0;
		return;
	}
	vmemmap_flush_unused_sub_pmd();
	vmemmap_mark_sub_pmd_used(start, end);
}

static void vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end)
{
	unsigned long page = ALIGN_DOWN(start, PMD_SIZE);

	vmemmap_flush_unused_sub_pmd();

	/* Could be our memmap page is filled with PAGE_UNUSED already ... */
	vmemmap_mark_sub_pmd_used(start, end);

	/* Mark the unused parts of the new memmap page PAGE_UNUSED. */
	if (!IS_ALIGNED(start, PMD_SIZE))
		memset((void *)page, PAGE_UNUSED, start - page);
	/*
	 * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of
	 * consecutive sections. Remember for the last added PMD the last
	 * unused range in the populated PMD.
	 */
	if (!IS_ALIGNED(end, PMD_SIZE))
		unused_sub_pmd_start = end;
}

/* Returns true if the PMD is completely unused and can be freed. */
static bool vmemmap_unuse_sub_pmd(unsigned long start, unsigned long end)
{
	unsigned long page = ALIGN_DOWN(start, PMD_SIZE);

	vmemmap_flush_unused_sub_pmd();
	memset((void *)start, PAGE_UNUSED, end - start);
	return !memchr_inv((void *)page, PAGE_UNUSED, PMD_SIZE);
}
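
/*
 * Note (added for clarity): memchr_inv() returns NULL only if every byte of
 * the PMD-sized memmap frame equals PAGE_UNUSED, i.e. no struct page in the
 * frame is still in use, so the whole frame can be freed by the caller.
 */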

/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */
static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr,
				  unsigned long end, bool add, bool direct,
				  struct vmem_altmap *altmap)
{
	unsigned long prot, pages = 0;
	int ret = -ENOMEM;
	pte_t *pte;

	prot = pgprot_val(PAGE_KERNEL);
	pte = pte_offset_kernel(pmd, addr);
	for (; addr < end; addr += PAGE_SIZE, pte++) {
		if (!add) {
			if (pte_none(*pte))
				continue;
			if (!direct)
				vmem_free_pages((unsigned long)pfn_to_virt(pte_pfn(*pte)), get_order(PAGE_SIZE), altmap);
			pte_clear(&init_mm, addr, pte);
		} else if (pte_none(*pte)) {
			if (!direct) {
				void *new_page = vmemmap_alloc_block_buf(PAGE_SIZE, NUMA_NO_NODE, altmap);

				if (!new_page)
					goto out;
				set_pte(pte, __pte(__pa(new_page) | prot));
			} else {
				set_pte(pte, __pte(__pa(addr) | prot));
			}
		} else {
			continue;
		}
		pages++;
	}
	ret = 0;
out:
	if (direct)
		update_page_count(PG_DIRECT_MAP_4K, add ? pages : -pages);
	return ret;
}

static void try_free_pte_table(pmd_t *pmd, unsigned long start)
{
	pte_t *pte;
	int i;

	/* We can safely assume this is fully in 1:1 mapping & vmemmap area */
	pte = pte_offset_kernel(pmd, start);
	for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
		if (!pte_none(*pte))
			return;
	}
	vmem_pte_free((unsigned long *) pmd_deref(*pmd));
	pmd_clear(pmd);
}

/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */
static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
				  unsigned long end, bool add, bool direct,
				  struct vmem_altmap *altmap)
{
	unsigned long next, prot, pages = 0;
	int ret = -ENOMEM;
	pmd_t *pmd;
	pte_t *pte;

	prot = pgprot_val(SEGMENT_KERNEL);
	pmd = pmd_offset(pud, addr);
	for (; addr < end; addr = next, pmd++) {
		next = pmd_addr_end(addr, end);
		if (!add) {
			if (pmd_none(*pmd))
				continue;
			if (pmd_leaf(*pmd)) {
				if (IS_ALIGNED(addr, PMD_SIZE) &&
				    IS_ALIGNED(next, PMD_SIZE)) {
					if (!direct)
						vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE), altmap);
					pmd_clear(pmd);
					pages++;
				} else if (!direct && vmemmap_unuse_sub_pmd(addr, next)) {
					vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE), altmap);
					pmd_clear(pmd);
				}
				continue;
			}
		} else if (pmd_none(*pmd)) {
			if (IS_ALIGNED(addr, PMD_SIZE) &&
			    IS_ALIGNED(next, PMD_SIZE) &&
			    MACHINE_HAS_EDAT1 && direct &&
			    !debug_pagealloc_enabled()) {
				set_pmd(pmd, __pmd(__pa(addr) | prot));
				pages++;
				continue;
			} else if (!direct && MACHINE_HAS_EDAT1) {
				void *new_page;

				/*
				 * Use 1MB frames for vmemmap if available. We
				 * always use large frames even if they are only
				 * partially used. Otherwise we would also end up
				 * with additional page tables, since
				 * vmemmap_populate gets called for each section
				 * separately.
				 */
				new_page = vmemmap_alloc_block_buf(PMD_SIZE, NUMA_NO_NODE, altmap);
				if (new_page) {
					set_pmd(pmd, __pmd(__pa(new_page) | prot));
					if (!IS_ALIGNED(addr, PMD_SIZE) ||
					    !IS_ALIGNED(next, PMD_SIZE)) {
						vmemmap_use_new_sub_pmd(addr, next);
					}
					continue;
				}
			}
			pte = vmem_pte_alloc();
			if (!pte)
				goto out;
			pmd_populate(&init_mm, pmd, pte);
		} else if (pmd_leaf(*pmd)) {
			if (!direct)
				vmemmap_use_sub_pmd(addr, next);
			continue;
		}
		ret = modify_pte_table(pmd, addr, next, add, direct, altmap);
		if (ret)
			goto out;
		if (!add)
			try_free_pte_table(pmd, addr & PMD_MASK);
	}
	ret = 0;
out:
	if (direct)
		update_page_count(PG_DIRECT_MAP_1M, add ? pages : -pages);
	return ret;
}

static void try_free_pmd_table(pud_t *pud, unsigned long start)
{
	pmd_t *pmd;
	int i;

	pmd = pmd_offset(pud, start);
	for (i = 0; i < PTRS_PER_PMD; i++, pmd++)
		if (!pmd_none(*pmd))
			return;
	vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER, NULL);
	pud_clear(pud);
}

static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end,
			    bool add, bool direct, struct vmem_altmap *altmap)
{
	unsigned long next, prot, pages = 0;
	int ret = -ENOMEM;
	pud_t *pud;
	pmd_t *pmd;

	prot = pgprot_val(REGION3_KERNEL);
	pud = pud_offset(p4d, addr);
	for (; addr < end; addr = next, pud++) {
		next = pud_addr_end(addr, end);
		if (!add) {
			if (pud_none(*pud))
				continue;
			if (pud_leaf(*pud)) {
				if (IS_ALIGNED(addr, PUD_SIZE) &&
				    IS_ALIGNED(next, PUD_SIZE)) {
					pud_clear(pud);
					pages++;
				}
				continue;
			}
		} else if (pud_none(*pud)) {
			if (IS_ALIGNED(addr, PUD_SIZE) &&
			    IS_ALIGNED(next, PUD_SIZE) &&
			    MACHINE_HAS_EDAT2 && direct &&
			    !debug_pagealloc_enabled()) {
				set_pud(pud, __pud(__pa(addr) | prot));
				pages++;
				continue;
			}
			pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
			if (!pmd)
				goto out;
			pud_populate(&init_mm, pud, pmd);
		} else if (pud_leaf(*pud)) {
			continue;
		}
		ret = modify_pmd_table(pud, addr, next, add, direct, altmap);
		if (ret)
			goto out;
		if (!add)
			try_free_pmd_table(pud, addr & PUD_MASK);
	}
	ret = 0;
out:
	if (direct)
		update_page_count(PG_DIRECT_MAP_2G, add ? pages : -pages);
	return ret;
}

static void try_free_pud_table(p4d_t *p4d, unsigned long start)
{
	pud_t *pud;
	int i;

	pud = pud_offset(p4d, start);
	for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
		if (!pud_none(*pud))
			return;
	}
	vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER, NULL);
	p4d_clear(p4d);
}

static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end,
			    bool add, bool direct, struct vmem_altmap *altmap)
{
	unsigned long next;
	int ret = -ENOMEM;
	p4d_t *p4d;
	pud_t *pud;

	p4d = p4d_offset(pgd, addr);
	for (; addr < end; addr = next, p4d++) {
		next = p4d_addr_end(addr, end);
		if (!add) {
			if (p4d_none(*p4d))
				continue;
		} else if (p4d_none(*p4d)) {
			pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
			if (!pud)
				goto out;
			p4d_populate(&init_mm, p4d, pud);
		}
		ret = modify_pud_table(p4d, addr, next, add, direct, altmap);
		if (ret)
			goto out;
		if (!add)
			try_free_pud_table(p4d, addr & P4D_MASK);
	}
	ret = 0;
out:
	return ret;
}

static void try_free_p4d_table(pgd_t *pgd, unsigned long start)
{
	p4d_t *p4d;
	int i;

	p4d = p4d_offset(pgd, start);
	for (i = 0; i < PTRS_PER_P4D; i++, p4d++) {
		if (!p4d_none(*p4d))
			return;
	}
	vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER, NULL);
	pgd_clear(pgd);
}

static int modify_pagetable(unsigned long start, unsigned long end, bool add,
			    bool direct, struct vmem_altmap *altmap)
{
	unsigned long addr, next;
	int ret = -ENOMEM;
	pgd_t *pgd;
	p4d_t *p4d;

	if (WARN_ON_ONCE(!PAGE_ALIGNED(start | end)))
		return -EINVAL;
	/* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */
	if (WARN_ON_ONCE(end > __abs_lowcore))
		return -EINVAL;
	for (addr = start; addr < end; addr = next) {
		next = pgd_addr_end(addr, end);
		pgd = pgd_offset_k(addr);

		if (!add) {
			if (pgd_none(*pgd))
				continue;
		} else if (pgd_none(*pgd)) {
			p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
			if (!p4d)
				goto out;
			pgd_populate(&init_mm, pgd, p4d);
		}
		ret = modify_p4d_table(pgd, addr, next, add, direct, altmap);
		if (ret)
			goto out;
		if (!add)
			try_free_p4d_table(pgd, addr & PGDIR_MASK);
	}
	ret = 0;
out:
	if (!add)
		flush_tlb_kernel_range(start, end);
	return ret;
}

static int add_pagetable(unsigned long start, unsigned long end, bool direct,
			 struct vmem_altmap *altmap)
{
	return modify_pagetable(start, end, true, direct, altmap);
}

static int remove_pagetable(unsigned long start, unsigned long end, bool direct,
			    struct vmem_altmap *altmap)
{
	return modify_pagetable(start, end, false, direct, altmap);
}

/*
 * Add a physical memory range to the 1:1 mapping.
 */
static int vmem_add_range(unsigned long start, unsigned long size)
{
	start = (unsigned long)__va(start);
	return add_pagetable(start, start + size, true, NULL);
}

/*
 * Remove a physical memory range from the 1:1 mapping.
 */
static void vmem_remove_range(unsigned long start, unsigned long size)
{
	start = (unsigned long)__va(start);
	remove_pagetable(start, start + size, true, NULL);
}

/*
 * Add a backed mem_map array to the virtual mem_map array.
 */
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
			       struct vmem_altmap *altmap)
{
	int ret;

	mutex_lock(&vmem_mutex);
	/* We don't care about the node, just use NUMA_NO_NODE on allocations */
	ret = add_pagetable(start, end, false, altmap);
	if (ret)
		remove_pagetable(start, end, false, altmap);
	mutex_unlock(&vmem_mutex);
	return ret;
}
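
/*
 * Note (added for clarity): if populating the vmemmap range fails, the
 * partially created mappings are torn down again before returning, so a
 * caller never sees a half-populated memmap for this range.
 */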

#ifdef CONFIG_MEMORY_HOTPLUG

void vmemmap_free(unsigned long start, unsigned long end,
		  struct vmem_altmap *altmap)
{
	mutex_lock(&vmem_mutex);
	remove_pagetable(start, end, false, altmap);
	mutex_unlock(&vmem_mutex);
}

#endif

void vmem_remove_mapping(unsigned long start, unsigned long size)
{
	mutex_lock(&vmem_mutex);
	vmem_remove_range(start, size);
	mutex_unlock(&vmem_mutex);
}

struct range arch_get_mappable_range(void)
{
	struct range mhp_range;

	mhp_range.start = 0;
	mhp_range.end = max_mappable - 1;
	return mhp_range;
}
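
/*
 * Note (added for clarity): the memory hotplug core consults
 * arch_get_mappable_range() to reject physical ranges the architecture
 * cannot map; max_mappable is established early during boot.
 */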

int vmem_add_mapping(unsigned long start, unsigned long size)
{
	struct range range = arch_get_mappable_range();
	int ret;

	if (start < range.start ||
	    start + size > range.end + 1 ||
	    start + size < start)
		return -ERANGE;

	mutex_lock(&vmem_mutex);
	ret = vmem_add_range(start, size);
	if (ret)
		vmem_remove_range(start, size);
	mutex_unlock(&vmem_mutex);
	return ret;
}

/*
 * Allocate new or return existing page-table entry, but do not map it
 * to any physical address. If missing, allocate segment- and region-
 * table entries along the way. Meeting a large segment- or region-table
 * entry while traversing is an error, since the function is expected to
 * be called against virtual regions reserved for 4KB mappings only.
 */
pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc)
{
	pte_t *ptep = NULL;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd)) {
		if (!alloc)
			goto out;
		p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
		if (!p4d)
			goto out;
		pgd_populate(&init_mm, pgd, p4d);
	}
	p4d = p4d_offset(pgd, addr);
	if (p4d_none(*p4d)) {
		if (!alloc)
			goto out;
		pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
		if (!pud)
			goto out;
		p4d_populate(&init_mm, p4d, pud);
	}
	pud = pud_offset(p4d, addr);
	if (pud_none(*pud)) {
		if (!alloc)
			goto out;
		pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
		if (!pmd)
			goto out;
		pud_populate(&init_mm, pud, pmd);
	} else if (WARN_ON_ONCE(pud_leaf(*pud))) {
		goto out;
	}
	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd)) {
		if (!alloc)
			goto out;
		pte = vmem_pte_alloc();
		if (!pte)
			goto out;
		pmd_populate(&init_mm, pmd, pte);
	} else if (WARN_ON_ONCE(pmd_leaf(*pmd))) {
		goto out;
	}
	ptep = pte_offset_kernel(pmd, addr);
out:
	return ptep;
}
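
/*
 * Note (added for clarity): with @alloc set to false the function acts as a
 * pure lookup and returns NULL whenever an intermediate table level is not
 * yet populated; no allocations are performed in that case.
 */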

int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot, bool alloc)
{
	pte_t *ptep, pte;

	if (!IS_ALIGNED(addr, PAGE_SIZE))
		return -EINVAL;
	ptep = vmem_get_alloc_pte(addr, alloc);
	if (!ptep)
		return -ENOMEM;
	__ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL);
	pte = mk_pte_phys(phys, prot);
	set_pte(ptep, pte);
	return 0;
}

int vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot)
{
	int rc;

	mutex_lock(&vmem_mutex);
	rc = __vmem_map_4k_page(addr, phys, prot, true);
	mutex_unlock(&vmem_mutex);
	return rc;
}

void vmem_unmap_4k_page(unsigned long addr)
{
	pte_t *ptep;

	mutex_lock(&vmem_mutex);
	ptep = virt_to_kpte(addr);
	__ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL);
	pte_clear(&init_mm, addr, ptep);
	mutex_unlock(&vmem_mutex);
}
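
/*
 * Example (illustrative, not part of the original source): map a single 4K
 * page read-only at a page-aligned kernel address and tear it down again.
 * PAGE_KERNEL_RO is used here merely as an example protection value:
 *
 *	rc = vmem_map_4k_page(addr, phys, PAGE_KERNEL_RO);
 *	if (!rc)
 *		vmem_unmap_4k_page(addr);
 */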

void __init vmem_map_init(void)
{
	__set_memory_rox(_stext, _etext);
	__set_memory_ro(_etext, __end_rodata);
	__set_memory_rox(__stext_amode31, __etext_amode31);

	/*
	 * If the BEAR-enhancement facility is not installed the first
	 * prefix page is used to return to the previous context with
	 * an LPSWE instruction and therefore must be executable.
	 */
	if (!static_key_enabled(&cpu_has_bear))
		set_memory_x(0, 1);
	if (debug_pagealloc_enabled())
		__set_memory_4k(__va(0), __va(0) + ident_map_size);
	pr_info("Write protected kernel read-only data: %luk\n",
		(unsigned long)(__end_rodata - _stext) >> 10);
}