// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
 */

#include <linux/sched.h>
#include <linux/mm_types.h>
#include <linux/memblock.h>
#include <misc/cxl-base.h>

#include <asm/debugfs.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/trace.h>
#include <asm/powernv.h>
#include <asm/firmware.h>
#include <asm/ultravisor.h>
#include <asm/kexec.h>

#include <mm/mmu_decl.h>
#include <trace/events/thp.h>

unsigned long __pmd_frag_nr;
EXPORT_SYMBOL(__pmd_frag_nr);
unsigned long __pmd_frag_size_shift;
EXPORT_SYMBOL(__pmd_frag_size_shift);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * This is called when relaxing access to a hugepage. It's also called in the page
 * fault path when we don't hit any of the major fault cases, ie, a minor
 * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have
 * handled those two for us, we additionally deal with missing execute
 * permission here on some processors
 */
int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
			  pmd_t *pmdp, pmd_t entry, int dirty)
{
	int changed;
#ifdef CONFIG_DEBUG_VM
	WARN_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
	assert_spin_locked(pmd_lockptr(vma->vm_mm, pmdp));
#endif
	changed = !pmd_same(*(pmdp), entry);
	if (changed) {
		/*
		 * We can use MMU_PAGE_2M here, because only the radix
		 * path looks at the psize.
		 */
		__ptep_set_access_flags(vma, pmdp_ptep(pmdp),
					pmd_pte(entry), address, MMU_PAGE_2M);
	}
	return changed;
}

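/*
 * Test and clear the "young" (accessed) bit of a huge pmd, reusing the
 * pte-level helper on the pmd entry.
 */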
int pmdp_test_and_clear_young(struct vm_area_struct *vma,
			      unsigned long address, pmd_t *pmdp)
{
	return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
}

/*
 * set a new huge pmd. We should not be called for updating
 * an existing pmd entry. That should go via pmd_hugepage_update.
 */
void set_pmd_at(struct mm_struct *mm, unsigned long addr,
		pmd_t *pmdp, pmd_t pmd)
{
#ifdef CONFIG_DEBUG_VM
	/*
	 * Make sure hardware valid bit is not set. We don't do
	 * tlb flush for this update.
	 */

	WARN_ON(pte_hw_valid(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp)));
	assert_spin_locked(pmd_lockptr(mm, pmdp));
	WARN_ON(!(pmd_large(pmd)));
#endif
	trace_hugepage_set_pmd(addr, pmd_val(pmd));
	return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
}

static void do_nothing(void *unused)
{

}

/*
 * Serialize against find_current_mm_pte which does lock-less
 * lookup in page tables with local interrupts disabled. For huge pages
 * it casts pmd_t to pte_t. Since format of pte_t is different from
 * pmd_t we want to prevent transit from pmd pointing to page table
 * to pmd pointing to huge page (and back) while interrupts are disabled.
 * We clear pmd to possibly replace it with page table pointer in
 * different code paths. So make sure we wait for the parallel
 * find_current_mm_pte to finish.
 */
void serialize_against_pte_lookup(struct mm_struct *mm)
{
	smp_mb();
	smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1);
}

/*
 * We use this to invalidate a pmdp entry before switching from a
 * hugepte to regular pmd entry.
 */
pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
		      pmd_t *pmdp)
{
	unsigned long old_pmd;

	old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, _PAGE_INVALID);
	flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
	return __pmd(old_pmd);
}

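/*
 * Clear a huge pmd when tearing down mappings. If this is not a full-mm
 * teardown, a parallel fault could repopulate the slot with a regular
 * page-table pointer, so the TLB must be flushed here.
 */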
pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
				   unsigned long addr, pmd_t *pmdp, int full)
{
	pmd_t pmd;

	VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
	VM_BUG_ON((pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) &&
		   !pmd_devmap(*pmdp)) || !pmd_present(*pmdp));
	pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp);
	/*
	 * if it is not a fullmm flush, then we can possibly end up converting
	 * this PMD pte entry to a regular level 0 PTE by a parallel page fault.
	 * Make sure we flush the tlb in this case.
	 */
	if (!full)
		flush_pmd_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE);

	return pmd;
}

static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
{
	return __pmd(pmd_val(pmd) | pgprot_val(pgprot));
}

/*
 * At some point we should be able to get rid of
 * pmd_mkhuge() and mk_huge_pmd() when we update all the
 * other archs to mark the pmd huge in pfn_pmd()
 */
pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
{
	unsigned long pmdv;

	pmdv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK;

	return __pmd_mkhuge(pmd_set_protbits(__pmd(pmdv), pgprot));
}

pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
{
	return pfn_pmd(page_to_pfn(page), pgprot);
}

pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
{
	unsigned long pmdv;

	pmdv = pmd_val(pmd);
	pmdv &= _HPAGE_CHG_MASK;
	return pmd_set_protbits(__pmd(pmdv), newprot);
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/* For use by kexec */
void mmu_cleanup_all(void)
{
	if (radix_enabled())
		radix__mmu_cleanup_all();
	else if (mmu_hash_ops.hpte_clear_all)
		mmu_hash_ops.hpte_clear_all();

	reset_sprs();
}

#ifdef CONFIG_MEMORY_HOTPLUG
int __meminit create_section_mapping(unsigned long start, unsigned long end,
				     int nid, pgprot_t prot)
{
	if (radix_enabled())
		return radix__create_section_mapping(start, end, nid, prot);

	return hash__create_section_mapping(start, end, nid, prot);
}

int __meminit remove_section_mapping(unsigned long start, unsigned long end)
{
	if (radix_enabled())
		return radix__remove_section_mapping(start, end);

	return hash__remove_section_mapping(start, end);
}
#endif /* CONFIG_MEMORY_HOTPLUG */

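/*
 * Allocate the partition table at boot and point the partition table
 * control register (PTCR) at it, for both the core MMU and the nest MMU.
 */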
void __init mmu_partition_table_init(void)
{
	unsigned long patb_size = 1UL << PATB_SIZE_SHIFT;
	unsigned long ptcr;

	BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 36), "Partition table size too large.");
	/* Initialize the Partition Table with no entries */
	partition_tb = memblock_alloc(patb_size, patb_size);
	if (!partition_tb)
		panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
		      __func__, patb_size, patb_size);

	/*
	 * update partition table control register,
	 * 64 K size.
	 */
	ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12);
	set_ptcr_when_no_uv(ptcr);
	powernv_set_nmmu_ptcr(ptcr);
}

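/*
 * Flush all translations for the given LPID: radix uses the per-LPID
 * flush primitives, hash issues a tlbie targeting that LPID.
 */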
static void flush_partition(unsigned int lpid, bool radix)
{
	if (radix) {
		radix__flush_all_lpid(lpid);
		radix__flush_all_lpid_guest(lpid);
	} else {
		asm volatile("ptesync" : : : "memory");
		asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : :
			     "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
		/* do we need fixup here ?*/
		asm volatile("eieio; tlbsync; ptesync" : : : "memory");
		trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 0);
	}
}

void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0,
				   unsigned long dw1, bool flush)
{
	unsigned long old = be64_to_cpu(partition_tb[lpid].patb0);

	/*
	 * When ultravisor is enabled, the partition table is stored in secure
	 * memory and can only be accessed doing an ultravisor call. However, we
	 * maintain a copy of the partition table in normal memory to allow Nest
	 * MMU translations to occur (for normal VMs).
	 *
	 * Therefore, here we always update partition_tb, regardless of whether
	 * we are running under an ultravisor or not.
	 */
	partition_tb[lpid].patb0 = cpu_to_be64(dw0);
	partition_tb[lpid].patb1 = cpu_to_be64(dw1);

	/*
	 * If ultravisor is enabled, we do an ultravisor call to register the
	 * partition table entry (PATE), which also does a global flush of TLBs
	 * and partition table caches for the lpid. Otherwise, just do the
	 * flush. The type of flush (hash or radix) depends on what the previous
	 * use of the partition ID was, not the new use.
	 */
	if (firmware_has_feature(FW_FEATURE_ULTRAVISOR)) {
		uv_register_pate(lpid, dw0, dw1);
		pr_info("PATE registered by ultravisor: dw0 = 0x%lx, dw1 = 0x%lx\n",
			dw0, dw1);
	} else if (flush) {
		/*
		 * Boot does not need to flush, because MMU is off and each
		 * CPU does a tlbiel_all() before switching them on, which
		 * flushes everything.
		 */
		flush_partition(lpid, (old & PATB_HR));
	}
}
EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry);

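/*
 * PMD fragment allocator: a full page is carved into PMD_FRAG_NR
 * fragments of PMD_FRAG_SIZE bytes each, with the page's
 * pt_frag_refcount tracking how many fragments are still in use.
 */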
static pmd_t *get_pmd_from_cache(struct mm_struct *mm)
{
	void *pmd_frag, *ret;

	if (PMD_FRAG_NR == 1)
		return NULL;

	spin_lock(&mm->page_table_lock);
	ret = mm->context.pmd_frag;
	if (ret) {
		pmd_frag = ret + PMD_FRAG_SIZE;
		/*
		 * If we have taken up all the fragments mark PTE page NULL
		 */
		if (((unsigned long)pmd_frag & ~PAGE_MASK) == 0)
			pmd_frag = NULL;
		mm->context.pmd_frag = pmd_frag;
	}
	spin_unlock(&mm->page_table_lock);
	return (pmd_t *)ret;
}

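/*
 * Allocate a fresh page for PMD fragments. The first fragment is returned
 * to the caller; if the per-mm cache slot is free, the remaining fragments
 * are parked in mm->context.pmd_frag for later get_pmd_from_cache() calls.
 */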
static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm)
{
	void *ret = NULL;
	struct page *page;
	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO;

	if (mm == &init_mm)
		gfp &= ~__GFP_ACCOUNT;
	page = alloc_page(gfp);
	if (!page)
		return NULL;
	if (!pgtable_pmd_page_ctor(page)) {
		__free_pages(page, 0);
		return NULL;
	}

	atomic_set(&page->pt_frag_refcount, 1);

	ret = page_address(page);
	/*
	 * if we support only one fragment just return the
	 * allocated page.
	 */
	if (PMD_FRAG_NR == 1)
		return ret;

	spin_lock(&mm->page_table_lock);
	/*
	 * If we find pgtable_page set, we return
	 * the allocated page with single fragment
	 * count.
	 */
	if (likely(!mm->context.pmd_frag)) {
		atomic_set(&page->pt_frag_refcount, PMD_FRAG_NR);
		mm->context.pmd_frag = ret + PMD_FRAG_SIZE;
	}
	spin_unlock(&mm->page_table_lock);

	return (pmd_t *)ret;
}

pmd_t *pmd_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr)
{
	pmd_t *pmd;

	pmd = get_pmd_from_cache(mm);
	if (pmd)
		return pmd;

	return __alloc_for_pmdcache(mm);
}

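/*
 * Drop one fragment reference on the backing page; the page itself is
 * freed once the last fragment is gone.
 */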
void pmd_fragment_free(unsigned long *pmd)
{
	struct page *page = virt_to_page(pmd);

	if (PageReserved(page))
		return free_reserved_page(page);

	BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);
	if (atomic_dec_and_test(&page->pt_frag_refcount)) {
		pgtable_pmd_page_dtor(page);
		__free_page(page);
	}
}

static inline void pgtable_free(void *table, int index)
{
	switch (index) {
	case PTE_INDEX:
		pte_fragment_free(table, 0);
		break;
	case PMD_INDEX:
		pmd_fragment_free(table);
		break;
	case PUD_INDEX:
		__pud_free(table);
		break;
#if defined(CONFIG_PPC_4K_PAGES) && defined(CONFIG_HUGETLB_PAGE)
		/* 16M hugepd directory at pud level */
	case HTLB_16M_INDEX:
		BUILD_BUG_ON(H_16M_CACHE_INDEX <= 0);
		kmem_cache_free(PGT_CACHE(H_16M_CACHE_INDEX), table);
		break;
		/* 16G hugepd directory at the pgd level */
	case HTLB_16G_INDEX:
		BUILD_BUG_ON(H_16G_CACHE_INDEX <= 0);
		kmem_cache_free(PGT_CACHE(H_16G_CACHE_INDEX), table);
		break;
#endif
		/* We don't free pgd table via RCU callback */
	default:
		BUG();
	}
}

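/*
 * Defer freeing of a page table page until after the TLB has been
 * flushed. The table type is encoded in the low bits of the pointer and
 * recovered in __tlb_remove_table().
 */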
void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int index)
{
	unsigned long pgf = (unsigned long)table;

	BUG_ON(index > MAX_PGTABLE_INDEX_SIZE);
	pgf |= index;
	tlb_remove_table(tlb, (void *)pgf);
}

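/*
 * mmu_gather callback: split the encoded pointer back into table address
 * and type index, then free the table.
 */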
void __tlb_remove_table(void *_table)
{
	void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
	unsigned int index = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;

	return pgtable_free(table, index);
}

#ifdef CONFIG_PROC_FS
atomic_long_t direct_pages_count[MMU_PAGE_COUNT];

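/*
 * Report the size of the kernel linear mapping broken down by page size
 * in /proc/meminfo (radix only; hash uses a single linear page size).
 */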
void arch_report_meminfo(struct seq_file *m)
{
	/*
	 * Hash maps the memory with one size mmu_linear_psize.
	 * So don't bother to print these on hash
	 */
	if (!radix_enabled())
		return;
	seq_printf(m, "DirectMap4k:    %8lu kB\n",
		   atomic_long_read(&direct_pages_count[MMU_PAGE_4K]) << 2);
	seq_printf(m, "DirectMap64k:    %8lu kB\n",
		   atomic_long_read(&direct_pages_count[MMU_PAGE_64K]) << 6);
	seq_printf(m, "DirectMap2M:    %8lu kB\n",
		   atomic_long_read(&direct_pages_count[MMU_PAGE_2M]) << 11);
	seq_printf(m, "DirectMap1G:    %8lu kB\n",
		   atomic_long_read(&direct_pages_count[MMU_PAGE_1G]) << 20);
}
#endif /* CONFIG_PROC_FS */

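/*
 * Transactional PTE protection update: ptep_modify_prot_start() invalidates
 * the entry against concurrent hardware updates, ptep_modify_prot_commit()
 * installs the new PTE.
 */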
pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
			     pte_t *ptep)
{
	unsigned long pte_val;

	/*
	 * Clear the _PAGE_PRESENT so that no hardware parallel update is
	 * possible. Also keep the pte_present true so that we don't take
	 * wrong fault.
	 */
	pte_val = pte_update(vma->vm_mm, addr, ptep, _PAGE_PRESENT, _PAGE_INVALID, 0);

	return __pte(pte_val);
}

void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
			     pte_t *ptep, pte_t old_pte, pte_t pte)
{
	if (radix_enabled())
		return radix__ptep_modify_prot_commit(vma, addr,
						      ptep, old_pte, pte);
	set_pte_at(vma->vm_mm, addr, ptep, pte);
}

/*
 * For hash translation mode, we use the deposited table to store hash slot
 * information and they are stored at PTRS_PER_PMD offset from related pmd
 * location. Hence a pmd move requires deposit and withdraw.
 *
 * For radix translation with split pmd ptl, we store the deposited table in the
 * pmd page. Hence if we have different pmd page we need to withdraw during pmd
 * move.
 *
 * With hash we use deposited table always irrespective of anon or not.
 * With radix we use deposited table only for anonymous mapping.
 */
int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
			   struct spinlock *old_pmd_ptl,
			   struct vm_area_struct *vma)
{
	if (radix_enabled())
		return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);

	return true;
}

/*
 * Does the CPU support tlbie?
 */
bool tlbie_capable __read_mostly = true;
EXPORT_SYMBOL(tlbie_capable);

/*
 * Should tlbie be used for management of CPU TLBs, for kernel and process
 * address spaces? tlbie may still be used for nMMU accelerators, and for KVM
 * guest address spaces.
 */
bool tlbie_enabled __read_mostly = true;

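/*
 * "disable_tlbie" kernel command line handler: force IPI-based (tlbiel)
 * invalidation. Only supported with the radix MMU.
 */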
static int __init setup_disable_tlbie(char *str)
{
	if (!radix_enabled()) {
		pr_err("disable_tlbie: Unable to disable TLBIE with Hash MMU.\n");
		return 1;
	}

	tlbie_capable = false;
	tlbie_enabled = false;

	return 1;
}
__setup("disable_tlbie", setup_disable_tlbie);

static int __init pgtable_debugfs_setup(void)
{
	if (!tlbie_capable)
		return 0;

	/*
	 * There is no locking vs tlb flushing when changing this value.
	 * The tlb flushers will see one value or another, and use either
	 * tlbie or tlbiel with IPIs. In both cases the TLBs will be
	 * invalidated as expected.
	 */
	debugfs_create_bool("tlbie_enabled", 0600,
			powerpc_debugfs_root,
			&tlbie_enabled);

	return 0;
}
arch_initcall(pgtable_debugfs_setup);