// SPDX-License-Identifier: GPL-2.0
/*
 *    Copyright IBM Corp. 2006
 *    Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>
 */

#include <linux/memblock.h>
#include <linux/pfn.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <asm/cacheflush.h>
#include <asm/pgalloc.h>
#include <asm/setup.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/set_memory.h>

static DEFINE_MUTEX(vmem_mutex);
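
/*
 * Allocate naturally aligned pages for page tables: use the buddy allocator
 * once the slab allocator is available, fall back to memblock during early
 * boot.
 */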

static void __ref *vmem_alloc_pages(unsigned int order)
{
	unsigned long size = PAGE_SIZE << order;

	if (slab_is_available())
		return (void *)__get_free_pages(GFP_KERNEL, order);
	return (void *) memblock_phys_alloc(size, size);
}

static void vmem_free_pages(unsigned long addr, int order)
{
	/* We don't expect boot memory to be removed ever. */
	if (!slab_is_available() ||
	    WARN_ON_ONCE(PageReserved(phys_to_page(addr))))
		return;
	free_pages(addr, order);
}
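
/*
 * Allocate a CRST (region or segment) table and initialize all entries with
 * the given value (e.g. _SEGMENT_ENTRY_EMPTY or _REGION3_ENTRY_EMPTY).
 */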

void *vmem_crst_alloc(unsigned long val)
{
	unsigned long *table;

	table = vmem_alloc_pages(CRST_ALLOC_ORDER);
	if (table)
		crst_table_init(table, val);
	return table;
}

pte_t __ref *vmem_pte_alloc(void)
{
	unsigned long size = PTRS_PER_PTE * sizeof(pte_t);
	pte_t *pte;

	if (slab_is_available())
		pte = (pte_t *) page_table_alloc(&init_mm);
	else
		pte = (pte_t *) memblock_phys_alloc(size, size);
	if (!pte)
		return NULL;
	memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE);
	return pte;
}

static void vmem_pte_free(unsigned long *table)
{
	/* We don't expect boot memory to be removed ever. */
	if (!slab_is_available() ||
	    WARN_ON_ONCE(PageReserved(virt_to_page(table))))
		return;
	page_table_free(&init_mm, table);
}
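
/*
 * vmemmap pages that back only partially present sections are tracked with
 * the PAGE_UNUSED marker: ranges of a memmap page that do not belong to any
 * added section are filled with this byte, so the page can be freed once it
 * consists entirely of unused ranges.
 */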

#define PAGE_UNUSED 0xFD

/*
 * The unused vmemmap range, which was not yet memset(PAGE_UNUSED), ranges
 * from unused_sub_pmd_start to the next PMD_SIZE boundary.
 */
static unsigned long unused_sub_pmd_start;

static void vmemmap_flush_unused_sub_pmd(void)
{
	if (!unused_sub_pmd_start)
		return;
	memset(__va(unused_sub_pmd_start), PAGE_UNUSED,
	       ALIGN(unused_sub_pmd_start, PMD_SIZE) - unused_sub_pmd_start);
	unused_sub_pmd_start = 0;
}

static void vmemmap_mark_sub_pmd_used(unsigned long start, unsigned long end)
{
	/*
	 * As we expect to add in the same granularity as we remove, it's
	 * sufficient to mark only some piece used to block the memmap page from
	 * getting removed (just in case the memmap never gets initialized,
	 * e.g., because the memory block never gets onlined).
	 */
	memset(__va(start), 0, sizeof(struct page));
}

static void vmemmap_use_sub_pmd(unsigned long start, unsigned long end)
{
	/*
	 * We only optimize if the new used range directly follows the
	 * previously unused range (esp., when populating consecutive sections).
	 */
	if (unused_sub_pmd_start == start) {
		unused_sub_pmd_start = end;
		if (likely(IS_ALIGNED(unused_sub_pmd_start, PMD_SIZE)))
			unused_sub_pmd_start = 0;
		return;
	}
	vmemmap_flush_unused_sub_pmd();
	vmemmap_mark_sub_pmd_used(start, end);
}

static void vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end)
{
	void *page = __va(ALIGN_DOWN(start, PMD_SIZE));

	vmemmap_flush_unused_sub_pmd();

	/* Could be our memmap page is filled with PAGE_UNUSED already ... */
	vmemmap_mark_sub_pmd_used(start, end);

	/* Mark the unused parts of the new memmap page PAGE_UNUSED. */
	if (!IS_ALIGNED(start, PMD_SIZE))
		memset(page, PAGE_UNUSED, start - __pa(page));
	/*
	 * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of
	 * consecutive sections. Remember for the last added PMD the last
	 * unused range in the populated PMD.
	 */
	if (!IS_ALIGNED(end, PMD_SIZE))
		unused_sub_pmd_start = end;
}

/* Returns true if the PMD is completely unused and can be freed. */
static bool vmemmap_unuse_sub_pmd(unsigned long start, unsigned long end)
{
	void *page = __va(ALIGN_DOWN(start, PMD_SIZE));

	vmemmap_flush_unused_sub_pmd();
	memset(__va(start), PAGE_UNUSED, end - start);
	return !memchr_inv(page, PAGE_UNUSED, PMD_SIZE);
}
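
/*
 * The modify_*_table() helpers below each walk one level of the kernel page
 * tables: "add" selects between populating and unpopulating the range,
 * "direct" distinguishes the 1:1 (identity) mapping from the vmemmap, which
 * may use dynamically allocated backing pages.
 */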

/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */
static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr,
				  unsigned long end, bool add, bool direct)
{
	unsigned long prot, pages = 0;
	int ret = -ENOMEM;
	pte_t *pte;

	prot = pgprot_val(PAGE_KERNEL);
	if (!MACHINE_HAS_NX)
		prot &= ~_PAGE_NOEXEC;

	pte = pte_offset_kernel(pmd, addr);
	for (; addr < end; addr += PAGE_SIZE, pte++) {
		if (!add) {
			if (pte_none(*pte))
				continue;
			if (!direct)
				vmem_free_pages(pfn_to_phys(pte_pfn(*pte)), 0);
			pte_clear(&init_mm, addr, pte);
		} else if (pte_none(*pte)) {
			if (!direct) {
				void *new_page = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE);

				if (!new_page)
					goto out;
				pte_val(*pte) = __pa(new_page) | prot;
			} else {
				pte_val(*pte) = addr | prot;
			}
		} else {
			continue;
		}
		pages++;
	}
	ret = 0;
out:
	if (direct)
		update_page_count(PG_DIRECT_MAP_4K, add ? pages : -pages);
	return ret;
}

static void try_free_pte_table(pmd_t *pmd, unsigned long start)
{
	pte_t *pte;
	int i;

	/* We can safely assume this is fully in 1:1 mapping & vmemmap area */
	pte = pte_offset_kernel(pmd, start);
	for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
		if (!pte_none(*pte))
			return;
	}
	vmem_pte_free(__va(pmd_deref(*pmd)));
	pmd_clear(pmd);
}

/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */
static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
				  unsigned long end, bool add, bool direct)
{
	unsigned long next, prot, pages = 0;
	int ret = -ENOMEM;
	pmd_t *pmd;
	pte_t *pte;

	prot = pgprot_val(SEGMENT_KERNEL);
	if (!MACHINE_HAS_NX)
		prot &= ~_SEGMENT_ENTRY_NOEXEC;

	pmd = pmd_offset(pud, addr);
	for (; addr < end; addr = next, pmd++) {
		next = pmd_addr_end(addr, end);
		if (!add) {
			if (pmd_none(*pmd))
				continue;
			if (pmd_large(*pmd)) {
				if (IS_ALIGNED(addr, PMD_SIZE) &&
				    IS_ALIGNED(next, PMD_SIZE)) {
					if (!direct)
						vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE));
					pmd_clear(pmd);
					pages++;
				} else if (!direct && vmemmap_unuse_sub_pmd(addr, next)) {
					vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE));
					pmd_clear(pmd);
				}
				continue;
			}
		} else if (pmd_none(*pmd)) {
			if (IS_ALIGNED(addr, PMD_SIZE) &&
			    IS_ALIGNED(next, PMD_SIZE) &&
			    MACHINE_HAS_EDAT1 && addr && direct &&
			    !debug_pagealloc_enabled()) {
				pmd_val(*pmd) = addr | prot;
				pages++;
				continue;
			} else if (!direct && MACHINE_HAS_EDAT1) {
				void *new_page;

				/*
				 * Use 1MB frames for vmemmap if available. We
				 * always use large frames, even if they are only
				 * partially used. Otherwise we would also need
				 * page tables, since vmemmap_populate gets
				 * called for each section separately.
				 */
				new_page = vmemmap_alloc_block(PMD_SIZE, NUMA_NO_NODE);
				if (new_page) {
					pmd_val(*pmd) = __pa(new_page) | prot;
					if (!IS_ALIGNED(addr, PMD_SIZE) ||
					    !IS_ALIGNED(next, PMD_SIZE)) {
						vmemmap_use_new_sub_pmd(addr, next);
					}
					continue;
				}
			}
			pte = vmem_pte_alloc();
			if (!pte)
				goto out;
			pmd_populate(&init_mm, pmd, pte);
		} else if (pmd_large(*pmd)) {
			if (!direct)
				vmemmap_use_sub_pmd(addr, next);
			continue;
		}
		ret = modify_pte_table(pmd, addr, next, add, direct);
		if (ret)
			goto out;
		if (!add)
			try_free_pte_table(pmd, addr & PMD_MASK);
	}
	ret = 0;
out:
	if (direct)
		update_page_count(PG_DIRECT_MAP_1M, add ? pages : -pages);
	return ret;
}

static void try_free_pmd_table(pud_t *pud, unsigned long start)
{
	const unsigned long end = start + PUD_SIZE;
	pmd_t *pmd;
	int i;

	/* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */
	if (end > VMALLOC_START)
		return;
#ifdef CONFIG_KASAN
	if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end)
		return;
#endif
	pmd = pmd_offset(pud, start);
	for (i = 0; i < PTRS_PER_PMD; i++, pmd++)
		if (!pmd_none(*pmd))
			return;
	vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER);
	pud_clear(pud);
}

static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end,
			    bool add, bool direct)
{
	unsigned long next, prot, pages = 0;
	int ret = -ENOMEM;
	pud_t *pud;
	pmd_t *pmd;

	prot = pgprot_val(REGION3_KERNEL);
	if (!MACHINE_HAS_NX)
		prot &= ~_REGION_ENTRY_NOEXEC;
	pud = pud_offset(p4d, addr);
	for (; addr < end; addr = next, pud++) {
		next = pud_addr_end(addr, end);
		if (!add) {
			if (pud_none(*pud))
				continue;
			if (pud_large(*pud)) {
				if (IS_ALIGNED(addr, PUD_SIZE) &&
				    IS_ALIGNED(next, PUD_SIZE)) {
					pud_clear(pud);
					pages++;
				}
				continue;
			}
		} else if (pud_none(*pud)) {
			if (IS_ALIGNED(addr, PUD_SIZE) &&
			    IS_ALIGNED(next, PUD_SIZE) &&
			    MACHINE_HAS_EDAT2 && addr && direct &&
			    !debug_pagealloc_enabled()) {
				pud_val(*pud) = addr | prot;
				pages++;
				continue;
			}
			pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
			if (!pmd)
				goto out;
			pud_populate(&init_mm, pud, pmd);
		} else if (pud_large(*pud)) {
			continue;
		}
		ret = modify_pmd_table(pud, addr, next, add, direct);
		if (ret)
			goto out;
		if (!add)
			try_free_pmd_table(pud, addr & PUD_MASK);
	}
	ret = 0;
out:
	if (direct)
		update_page_count(PG_DIRECT_MAP_2G, add ? pages : -pages);
	return ret;
}

static void try_free_pud_table(p4d_t *p4d, unsigned long start)
{
	const unsigned long end = start + P4D_SIZE;
	pud_t *pud;
	int i;

	/* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */
	if (end > VMALLOC_START)
		return;
#ifdef CONFIG_KASAN
	if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end)
		return;
#endif

	pud = pud_offset(p4d, start);
	for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
		if (!pud_none(*pud))
			return;
	}
	vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER);
	p4d_clear(p4d);
}

static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end,
			    bool add, bool direct)
{
	unsigned long next;
	int ret = -ENOMEM;
	p4d_t *p4d;
	pud_t *pud;

	p4d = p4d_offset(pgd, addr);
	for (; addr < end; addr = next, p4d++) {
		next = p4d_addr_end(addr, end);
		if (!add) {
			if (p4d_none(*p4d))
				continue;
		} else if (p4d_none(*p4d)) {
			pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
			if (!pud)
				goto out;
			p4d_populate(&init_mm, p4d, pud);
		}
		ret = modify_pud_table(p4d, addr, next, add, direct);
		if (ret)
			goto out;
		if (!add)
			try_free_pud_table(p4d, addr & P4D_MASK);
	}
	ret = 0;
out:
	return ret;
}

static void try_free_p4d_table(pgd_t *pgd, unsigned long start)
{
	const unsigned long end = start + PGDIR_SIZE;
	p4d_t *p4d;
	int i;

	/* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */
	if (end > VMALLOC_START)
		return;
#ifdef CONFIG_KASAN
	if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end)
		return;
#endif

	p4d = p4d_offset(pgd, start);
	for (i = 0; i < PTRS_PER_P4D; i++, p4d++) {
		if (!p4d_none(*p4d))
			return;
	}
	vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER);
	pgd_clear(pgd);
}
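
/*
 * Top-level page table walker: splits [start, end) along pgd boundaries and
 * dispatches to modify_p4d_table(), allocating or freeing intermediate
 * tables as needed. The TLB is flushed once after removals.
 */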

static int modify_pagetable(unsigned long start, unsigned long end, bool add,
			    bool direct)
{
	unsigned long addr, next;
	int ret = -ENOMEM;
	pgd_t *pgd;
	p4d_t *p4d;

	if (WARN_ON_ONCE(!PAGE_ALIGNED(start | end)))
		return -EINVAL;
	for (addr = start; addr < end; addr = next) {
		next = pgd_addr_end(addr, end);
		pgd = pgd_offset_k(addr);

		if (!add) {
			if (pgd_none(*pgd))
				continue;
		} else if (pgd_none(*pgd)) {
			p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
			if (!p4d)
				goto out;
			pgd_populate(&init_mm, pgd, p4d);
		}
		ret = modify_p4d_table(pgd, addr, next, add, direct);
		if (ret)
			goto out;
		if (!add)
			try_free_p4d_table(pgd, addr & PGDIR_MASK);
	}
	ret = 0;
out:
	if (!add)
		flush_tlb_kernel_range(start, end);
	return ret;
}

static int add_pagetable(unsigned long start, unsigned long end, bool direct)
{
	return modify_pagetable(start, end, true, direct);
}

static int remove_pagetable(unsigned long start, unsigned long end, bool direct)
{
	return modify_pagetable(start, end, false, direct);
}

/*
 * Add a physical memory range to the 1:1 mapping.
 */
static int vmem_add_range(unsigned long start, unsigned long size)
{
	return add_pagetable(start, start + size, true);
}

/*
 * Remove a physical memory range from the 1:1 mapping.
 */
static void vmem_remove_range(unsigned long start, unsigned long size)
{
	remove_pagetable(start, start + size, true);
}

/*
 * Add a backed mem_map array to the virtual mem_map array.
 */
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
			       struct vmem_altmap *altmap)
{
	int ret;

	mutex_lock(&vmem_mutex);
	/* We don't care about the node, just use NUMA_NO_NODE on allocations */
	ret = add_pagetable(start, end, false);
	if (ret)
		remove_pagetable(start, end, false);
	mutex_unlock(&vmem_mutex);
	return ret;
}

void vmemmap_free(unsigned long start, unsigned long end,
		  struct vmem_altmap *altmap)
{
	mutex_lock(&vmem_mutex);
	remove_pagetable(start, end, false);
	mutex_unlock(&vmem_mutex);
}

void vmem_remove_mapping(unsigned long start, unsigned long size)
{
	mutex_lock(&vmem_mutex);
	vmem_remove_range(start, size);
	mutex_unlock(&vmem_mutex);
}
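
/*
 * Add a physical memory range to the 1:1 mapping under vmem_mutex. The range
 * must lie below VMEM_MAX_PHYS and must not wrap around.
 */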

int vmem_add_mapping(unsigned long start, unsigned long size)
{
	int ret;

	if (start + size > VMEM_MAX_PHYS ||
	    start + size < start)
		return -ERANGE;

	mutex_lock(&vmem_mutex);
	ret = vmem_add_range(start, size);
	if (ret)
		vmem_remove_range(start, size);
	mutex_unlock(&vmem_mutex);
	return ret;
}

/*
 * Map whole physical memory to virtual memory (identity mapping).
 * We reserve enough space in the vmalloc area for vmemmap to hotplug
 * additional memory segments.
 */
void __init vmem_map_init(void)
{
	phys_addr_t base, end;
	u64 i;

	for_each_mem_range(i, &base, &end)
		vmem_add_range(base, end - base);
	__set_memory((unsigned long)_stext,
		     (unsigned long)(_etext - _stext) >> PAGE_SHIFT,
		     SET_MEMORY_RO | SET_MEMORY_X);
	__set_memory((unsigned long)_etext,
		     (unsigned long)(__end_rodata - _etext) >> PAGE_SHIFT,
		     SET_MEMORY_RO);
	__set_memory((unsigned long)_sinittext,
		     (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT,
		     SET_MEMORY_RO | SET_MEMORY_X);
	__set_memory(__stext_dma, (__etext_dma - __stext_dma) >> PAGE_SHIFT,
		     SET_MEMORY_RO | SET_MEMORY_X);

	/* we need lowcore executable for our LPSWE instructions */
	set_memory_x(0, 1);

	pr_info("Write protected kernel read-only data: %luk\n",
		(unsigned long)(__end_rodata - _stext) >> 10);
}