/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 *
 * Copyright 2016 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */
#include <linux/types.h>
#include <linux/string.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>

#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/pte-walk.h>
/*
 * Supported radix tree geometry.
 * Like p9, we support either 5 or 9 bits at the first (lowest) level,
 * for a page size of 64k or 4k.
 */
static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };
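
/*
 * Illustrative sketch, not part of the original driver (the helper name is
 * made up): with the geometry above, levels 3..1 contribute 13 + 9 + 9
 * index bits and the lowest level adds 5 bits over a 64k page (16 offset
 * bits) or 9 bits over a 4k page (12 offset bits), so either way the tree
 * translates a 52-bit address space.
 */
static inline unsigned int radix_geometry_total_bits(unsigned int page_shift)
{
        unsigned int level, total = page_shift;

        for (level = 1; level < 4; level++)
                total += p9_supported_radix_bits[level];   /* 9 + 9 + 13 */
        /* lowest level: 5 bits for 64k pages, 9 bits for 4k pages */
        total += (page_shift == 16) ? 5 : 9;
        return total;                                      /* 52 either way */
}
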
int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
                           struct kvmppc_pte *gpte, bool data, bool iswrite)
{
        struct kvm *kvm = vcpu->kvm;
        unsigned long root, pte, index;
        unsigned long rts, bits, offset;
        unsigned long proc_tbl_size;
        /* Work out effective PID */
        switch (eaddr >> 62) {
        case 0:
                pid = vcpu->arch.pid;
                break;
        case 3:
                pid = 0;
                break;
        default:
                return -EINVAL;
        }
        proc_tbl_size = 1 << ((kvm->arch.process_table & PRTS_MASK) + 12);
        if (pid * 16 >= proc_tbl_size)
                return -EINVAL;
        /* Read partition table to find root of tree for effective PID */
        ptbl = (kvm->arch.process_table & PRTB_MASK) + (pid * 16);
        ret = kvm_read_guest(kvm, ptbl, &prte, sizeof(prte));

        root = be64_to_cpu(prte);
        rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
                ((root & RTS2_MASK) >> RTS2_SHIFT);
        bits = root & RPDS_MASK;
        root = root & RPDB_MASK;
        /* P9 DD1 interprets RTS (radix tree size) differently */
        offset = rts + 31;
        if (cpu_has_feature(CPU_FTR_POWER9_DD1))
                offset -= 3;

        /* current implementations only support 52-bit space */
        if (offset != 52)
                return -EINVAL;
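
        /*
         * Worked example (assuming the ISA convention that the tree spans
         * 2^(RTS+31) bytes): a process-table entry for the 52-bit space
         * checked above carries RTS = 21, split across the RTS1/RTS2
         * fields reassembled into 'rts' earlier, so the walk below starts
         * with offset = 21 + 31 = 52 and peels off each level's index
         * bits in turn.
         */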
        for (level = 3; level >= 0; --level) {
                if (level && bits != p9_supported_radix_bits[level])
                        return -EINVAL;
                if (level == 0 && !(bits == 5 || bits == 9))
                        return -EINVAL;
                offset -= bits;
                index = (eaddr >> offset) & ((1UL << bits) - 1);
                /* check that low bits of page table base are zero */
                if (root & ((1UL << (bits + 3)) - 1))
                        return -EINVAL;
                ret = kvm_read_guest(kvm, root + index * 8,
                                     &rpte, sizeof(rpte));
                if (ret)
                        return ret;
                pte = __be64_to_cpu(rpte);
                if (!(pte & _PAGE_PRESENT))
                        return -ENOENT;
                /* a leaf (_PAGE_PTE) entry ends the walk */
                if (pte & _PAGE_PTE)
                        break;
                bits = pte & RPDS_MASK;
                root = pte & 0x0fffffffffffff00ul;
        }
        /* need a leaf at lowest level; 512GB pages not supported */
        if (level < 0 || level == 3)
                return -EINVAL;
        /* offset is now log base 2 of the page size */
        gpa = pte & 0x01fffffffffff000ul;
        if (gpa & ((1ul << offset) - 1))
                return -EINVAL;
        gpa += eaddr & ((1ul << offset) - 1);
        for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps)
                if (offset == mmu_psize_defs[ps].shift)
                        break;
        gpte->page_size = ps;
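
        /*
         * Worked example (assuming the 4k geometry above): by the time a
         * leaf is found at level 1 the walk has consumed 13 + 9 + 9 index
         * bits, leaving offset = 52 - 31 = 21, i.e. a 2MB page; a leaf at
         * level 0 leaves offset = 12, a normal 4k page.  The loop above
         * simply maps that shift back to the matching MMU_PAGE_* index.
         */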
        /* Work out permissions */
        gpte->may_read = !!(pte & _PAGE_READ);
        gpte->may_write = !!(pte & _PAGE_WRITE);
        gpte->may_execute = !!(pte & _PAGE_EXEC);
        if (kvmppc_get_msr(vcpu) & MSR_PR) {
                if (pte & _PAGE_PRIVILEGED) {
                        gpte->may_read = 0;
                        gpte->may_write = 0;
                        gpte->may_execute = 0;
                }
        } else {
                if (!(pte & _PAGE_PRIVILEGED)) {
                        /* Check AMR/IAMR to see if strict mode is in force */
                        if (vcpu->arch.amr & (1ul << 62))
                                gpte->may_read = 0;
                        if (vcpu->arch.amr & (1ul << 63))
                                gpte->may_write = 0;
                        if (vcpu->arch.iamr & (1ul << 62))
                                gpte->may_execute = 0;
                }
        }

        return 0;
}
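
/*
 * Illustrative sketch, not part of the original driver (the helper name is
 * made up): the AMR/IAMR tests above only look at storage key 0, whose two
 * control bits sit at the top of each register, as restated here for a
 * caller-supplied translation.
 */
static inline void key0_apply_amr(unsigned long amr, unsigned long iamr,
                                  struct kvmppc_pte *gpte)
{
        if (amr & (1ul << 62))          /* key 0: loads disallowed */
                gpte->may_read = 0;
        if (amr & (1ul << 63))          /* key 0: stores disallowed */
                gpte->may_write = 0;
        if (iamr & (1ul << 62))         /* key 0: execution disallowed */
                gpte->may_execute = 0;
}
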
#ifdef CONFIG_PPC_64K_PAGES
#define MMU_BASE_PSIZE  MMU_PAGE_64K
#else
#define MMU_BASE_PSIZE  MMU_PAGE_4K
#endif
static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
                                    unsigned int pshift)
{
        int psize = MMU_BASE_PSIZE;

        if (pshift >= PMD_SHIFT)
                psize = MMU_PAGE_2M;
        addr &= ~0xfffUL;
        addr |= mmu_psize_defs[psize].ap << 5;
        asm volatile("ptesync": : :"memory");
        asm volatile(PPC_TLBIE_5(%0, %1, 0, 0, 1)
                     : : "r" (addr), "r" (kvm->arch.lpid) : "memory");
        asm volatile("ptesync": : :"memory");
}
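
/*
 * Illustrative sketch, not part of the original driver (the helper name is
 * made up): the RB operand built above is the page-aligned effective
 * address with the "actual page size" (AP) code from mmu_psize_defs[]
 * inserted at bits 5-7, while the LPID goes in RS and RIC=0/PRS=0/R=1
 * request a partition-scoped, radix-format TLB entry invalidation.
 */
static inline unsigned long tlbie_rb_for(unsigned long ea, int psize)
{
        unsigned long rb = ea & ~0xfffUL;       /* effective page number */

        rb |= mmu_psize_defs[psize].ap << 5;    /* actual page size code */
        return rb;
}
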
unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
                                      unsigned long clr, unsigned long set,
                                      unsigned long addr, unsigned int shift)
{
        unsigned long old = 0;

        if (!(clr & _PAGE_PRESENT) && cpu_has_feature(CPU_FTR_POWER9_DD1) &&
            pte_present(*ptep)) {
                /* have to invalidate it first */
                old = __radix_pte_update(ptep, _PAGE_PRESENT, 0);
                kvmppc_radix_tlbie_page(kvm, addr, shift);
                set |= _PAGE_PRESENT;
                old &= _PAGE_PRESENT;
        }
        return __radix_pte_update(ptep, clr, set) | old;
}
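
/*
 * Illustrative sketch, not part of the original driver (the helper name is
 * made up): leaving aside the POWER9 DD1 workaround above, the update is a
 * read-modify-write that clears 'clr', sets 'set' and returns the previous
 * PTE image so callers can look at the old R/C bits.  A plain, non-atomic
 * model of that contract (the real code goes through __radix_pte_update):
 */
static inline unsigned long pte_update_model(unsigned long *word,
                                             unsigned long clr,
                                             unsigned long set)
{
        unsigned long old = *word;

        *word = (old & ~clr) | set;
        return old;
}
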
void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr,
                             pte_t *ptep, pte_t pte)
{
        radix__set_pte_at(kvm->mm, addr, ptep, pte, 0);
}
static struct kmem_cache *kvm_pte_cache;

static pte_t *kvmppc_pte_alloc(void)
{
        return kmem_cache_alloc(kvm_pte_cache, GFP_KERNEL);
}

static void kvmppc_pte_free(pte_t *ptep)
{
        kmem_cache_free(kvm_pte_cache, ptep);
}
static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
                             unsigned int level, unsigned long mmu_seq)
{
        pgd_t *pgd;
        pud_t *pud, *new_pud = NULL;
        pmd_t *pmd, *new_pmd = NULL;
        pte_t *ptep, *new_ptep = NULL;
        unsigned long old;
        int ret;

        /* Traverse the guest's 2nd-level tree, allocate new levels needed */
        pgd = kvm->arch.pgtable + pgd_index(gpa);
        pud = NULL;
        if (pgd_present(*pgd))
                pud = pud_offset(pgd, gpa);
        else
                new_pud = pud_alloc_one(kvm->mm, gpa);
        pmd = NULL;
        if (pud && pud_present(*pud))
                pmd = pmd_offset(pud, gpa);
        else
                new_pmd = pmd_alloc_one(kvm->mm, gpa);
        if (level == 0 && !(pmd && pmd_present(*pmd)))
                new_ptep = kvmppc_pte_alloc();

        /* Check if we might have been invalidated; let the guest retry if so */
        spin_lock(&kvm->mmu_lock);
        ret = -EAGAIN;
        if (mmu_notifier_retry(kvm, mmu_seq))
                goto out_unlock;

        /* Now traverse again under the lock and change the tree */
        ret = -ENOMEM;
        if (pgd_none(*pgd)) {
                if (!new_pud)
                        goto out_unlock;
                pgd_populate(kvm->mm, pgd, new_pud);
                new_pud = NULL;
        }
        pud = pud_offset(pgd, gpa);
        if (pud_none(*pud)) {
                if (!new_pmd)
                        goto out_unlock;
                pud_populate(kvm->mm, pud, new_pmd);
                new_pmd = NULL;
        }
        pmd = pmd_offset(pud, gpa);
        if (pmd_large(*pmd)) {
                /* Someone else has instantiated a large page here; retry */
                ret = -EAGAIN;
                goto out_unlock;
        }
        if (level == 1 && !pmd_none(*pmd)) {
                /*
                 * There's a page table page here, but we wanted
                 * to install a large page. Tell the caller and let
                 * it try installing a normal page if it wants.
                 */
                ret = -EBUSY;
                goto out_unlock;
        }
        if (level == 0) {
                if (pmd_none(*pmd)) {
                        if (!new_ptep)
                                goto out_unlock;
                        pmd_populate(kvm->mm, pmd, new_ptep);
                        new_ptep = NULL;
                }
                ptep = pte_offset_kernel(pmd, gpa);
                if (pte_present(*ptep)) {
                        /* PTE was previously valid, so invalidate it */
                        old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT,
                                                      0, gpa, 0);
                        kvmppc_radix_tlbie_page(kvm, gpa, 0);
                        if (old & _PAGE_DIRTY)
                                mark_page_dirty(kvm, gpa >> PAGE_SHIFT);
                }
                kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
        } else {
                kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
        }
        ret = 0;

 out_unlock:
        spin_unlock(&kvm->mmu_lock);
        if (new_pud)
                pud_free(kvm->mm, new_pud);
        if (new_pmd)
                pmd_free(kvm->mm, new_pmd);
        if (new_ptep)
                kvmppc_pte_free(new_ptep);
        return ret;
}
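
/*
 * Note on the function above (a reading of the code, not new behaviour):
 * the pud/pmd/pte pages are allocated up front with no locks held, the
 * tree is re-walked and modified under kvm->mmu_lock after an
 * mmu_notifier_retry() check, and any preallocated level that turned out
 * to be unnecessary, because a racing fault installed it first, is freed
 * once the lock is dropped.
 */
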
int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
                                   unsigned long ea, unsigned long dsisr)
{
        struct kvm *kvm = vcpu->kvm;
        unsigned long mmu_seq, pte_size;
        unsigned long gpa, gfn, hva, pfn;
        struct kvm_memory_slot *memslot;
        struct page *page = NULL, *pages[1];
        long ret, npages, ok;
        unsigned int writing;
        struct vm_area_struct *vma;
        unsigned long flags;
        pte_t pte, *ptep;
        unsigned long pgflags;
        unsigned int shift, level;
        /* Check for unusual errors */
        if (dsisr & DSISR_UNSUPP_MMU) {
                pr_err("KVM: Got unsupported MMU fault\n");
                return -EFAULT;
        }
        if (dsisr & DSISR_BADACCESS) {
                /* Reflect to the guest as DSI */
                pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
                kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
                return RESUME_GUEST;
        }
        /* Translate the logical address and get the page */
        gpa = vcpu->arch.fault_gpa & ~0xfffUL;
        gpa &= ~0xF000000000000000ul;
        gfn = gpa >> PAGE_SHIFT;
        if (!(dsisr & DSISR_PRTABLE_FAULT))
                gpa |= ea & 0xfff;
        memslot = gfn_to_memslot(kvm, gfn);

        /* No memslot means it's an emulated MMIO region */
        if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
                if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS |
                             DSISR_SET_RC)) {
                        /*
                         * Bad address in guest page table tree, or other
                         * unusual error - reflect it to the guest as DSI.
                         */
                        kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
                        return RESUME_GUEST;
                }
                return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
                                              dsisr & DSISR_ISSTORE);
        }
        /* used to check for invalidations in progress */
        mmu_seq = kvm->mmu_notifier_seq;
        smp_rmb();

        writing = (dsisr & DSISR_ISSTORE) != 0;
        hva = gfn_to_hva_memslot(memslot, gfn);
        if (dsisr & DSISR_SET_RC) {
                /*
                 * Need to set an R or C bit in the 2nd-level tables;
                 * if the relevant bits aren't already set in the linux
                 * page tables, fall through to do the gup_fast to
                 * set them in the linux page tables too.
                 */
                ok = 0;
                pgflags = _PAGE_ACCESSED;
                if (writing)
                        pgflags |= _PAGE_DIRTY;
                local_irq_save(flags);
                ptep = find_current_mm_pte(current->mm->pgd, hva, NULL, NULL);
                if (ptep) {
                        pte = READ_ONCE(*ptep);
                        if (pte_present(pte) &&
                            (pte_val(pte) & pgflags) == pgflags)
                                ok = 1;
                }
                local_irq_restore(flags);
                if (ok) {
                        spin_lock(&kvm->mmu_lock);
                        if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) {
                                spin_unlock(&kvm->mmu_lock);
                                return RESUME_GUEST;
                        }
                        /*
                         * We are walking the secondary page table here. We can do this
                         * without disabling irq.
                         */
                        ptep = __find_linux_pte(kvm->arch.pgtable,
                                                gpa, NULL, &shift);
                        if (ptep && pte_present(*ptep)) {
                                kvmppc_radix_update_pte(kvm, ptep, 0, pgflags,
                                                        gpa, shift);
                                spin_unlock(&kvm->mmu_lock);
                                return RESUME_GUEST;
                        }
                        spin_unlock(&kvm->mmu_lock);
                }
        }
        pte_size = PAGE_SIZE;
        pgflags = _PAGE_READ | _PAGE_EXEC;
        level = 0;
        npages = get_user_pages_fast(hva, 1, writing, pages);
        if (npages < 1) {
                /* Check if it's an I/O mapping */
                down_read(&current->mm->mmap_sem);
                vma = find_vma(current->mm, hva);
                if (vma && vma->vm_start <= hva && hva < vma->vm_end &&
                    (vma->vm_flags & VM_PFNMAP)) {
                        pfn = vma->vm_pgoff +
                                ((hva - vma->vm_start) >> PAGE_SHIFT);
                        pgflags = pgprot_val(vma->vm_page_prot);
                }
                up_read(&current->mm->mmap_sem);
                if (!pfn)
                        return -EFAULT;
        } else {
                page = pages[0];
                pfn = page_to_pfn(page);
                if (PageHuge(page)) {
                        page = compound_head(page);
                        pte_size <<= compound_order(page);
                        /* See if we can insert a 2MB large-page PTE here */
                        if (pte_size >= PMD_SIZE &&
                            (gpa & PMD_MASK & PAGE_MASK) ==
                            (hva & PMD_MASK & PAGE_MASK)) {
                                level = 1;
                                pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1);
                        }
                }
                /* See if we can provide write access */
                if (writing) {
                        /*
                         * We assume gup_fast has set dirty on the host PTE.
                         */
                        pgflags |= _PAGE_WRITE;
                } else {
                        local_irq_save(flags);
                        ptep = find_current_mm_pte(current->mm->pgd,
                                                   hva, NULL, NULL);
                        if (ptep && pte_write(*ptep) && pte_dirty(*ptep))
                                pgflags |= _PAGE_WRITE;
                        local_irq_restore(flags);
                }
        }

        /*
         * Compute the PTE value that we need to insert.
         */
        pgflags |= _PAGE_PRESENT | _PAGE_PTE | _PAGE_ACCESSED;
        if (pgflags & _PAGE_WRITE)
                pgflags |= _PAGE_DIRTY;
        pte = pfn_pte(pfn, __pgprot(pgflags));

        /* Allocate space in the tree and write the PTE */
        ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq);
        if (ret == -EBUSY) {
                /*
                 * There's already a PMD where we wanted to install a large page;
                 * for now, fall back to installing a small page.
                 */
                level = 0;
                pfn |= gfn & ((PMD_SIZE >> PAGE_SHIFT) - 1);
                pte = pfn_pte(pfn, __pgprot(pgflags));
                ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq);
        }
        if (ret == 0 || ret == -EAGAIN)
                ret = RESUME_GUEST;

        if (page) {
                /*
                 * We drop pages[0] here, not page because page might
                 * have been set to the head page of a compound, but
                 * we have to drop the reference on the correct tail
                 * page to match the get inside gup()
                 */
                put_page(pages[0]);
        }
        return ret;
}
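
/*
 * Illustrative sketch, not part of the original driver (the helper name is
 * made up): the 2MB path in the fault handler above requires the backing
 * page to span at least PMD_SIZE and the guest-physical and host-virtual
 * addresses to be congruent modulo 2MB, so that one large PTE can cover
 * the whole aligned block.
 */
static inline bool can_map_2mb(unsigned long gpa, unsigned long hva,
                               unsigned long pte_size)
{
        return pte_size >= PMD_SIZE &&
               (gpa & PMD_MASK & PAGE_MASK) == (hva & PMD_MASK & PAGE_MASK);
}
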
/* Called with kvm->lock held */
int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
                    unsigned long gfn)
{
        pte_t *ptep;
        unsigned long gpa = gfn << PAGE_SHIFT;
        unsigned int shift;
        unsigned long old;

        ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
        if (ptep && pte_present(*ptep)) {
                old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, 0,
                                              gpa, shift);
                kvmppc_radix_tlbie_page(kvm, gpa, shift);
                if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap) {
                        unsigned long npages = 1;

                        if (shift)
                                npages = 1ul << (shift - PAGE_SHIFT);
                        kvmppc_update_dirty_map(memslot, gfn, npages);
                }
        }
        return 0;
}
/* Called with kvm->lock held */
int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
                  unsigned long gfn)
{
        pte_t *ptep;
        unsigned long gpa = gfn << PAGE_SHIFT;
        unsigned int shift;
        int ref = 0;

        ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
        if (ptep && pte_present(*ptep) && pte_young(*ptep)) {
                kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0,
                                        gpa, shift);
                /* XXX need to flush tlb here? */
                ref = 1;
        }
        return ref;
}
/* Called with kvm->lock held */
int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
                       unsigned long gfn)
{
        pte_t *ptep;
        unsigned long gpa = gfn << PAGE_SHIFT;
        unsigned int shift;
        int ref = 0;

        ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
        if (ptep && pte_present(*ptep) && pte_young(*ptep))
                ref = 1;
        return ref;
}
/* Returns the number of PAGE_SIZE pages that are dirty */
static int kvm_radix_test_clear_dirty(struct kvm *kvm,
                                struct kvm_memory_slot *memslot, int pagenum)
{
        unsigned long gfn = memslot->base_gfn + pagenum;
        unsigned long gpa = gfn << PAGE_SHIFT;
        pte_t *ptep;
        unsigned int shift;
        int ret = 0;

        ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
        if (ptep && pte_present(*ptep) && pte_dirty(*ptep)) {
                ret = 1;
                if (shift)
                        ret = 1 << (shift - PAGE_SHIFT);
                kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
                                        gpa, shift);
                kvmppc_radix_tlbie_page(kvm, gpa, shift);
        }
        return ret;
}
long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
                        struct kvm_memory_slot *memslot, unsigned long *map)
{
        unsigned long i, j;
        int npages;

        for (i = 0; i < memslot->npages; i = j) {
                npages = kvm_radix_test_clear_dirty(kvm, memslot, i);

                /*
                 * Note that if npages > 0 then i must be a multiple of npages,
                 * since huge pages are only used to back the guest at guest
                 * real addresses that are a multiple of their size.
                 * Since we have at most one PTE covering any given guest
                 * real address, if npages > 1 we can skip to i + npages.
                 */
                j = i + 1;
                if (npages) {
                        set_dirty_bits(map, i, npages);
                        j = i + npages;
                }
        }
        return 0;
}
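
/*
 * Illustrative sketch, not part of the original driver: because a dirty
 * huge PTE reports all of its PAGE_SIZE-sized subpages at once, the scan
 * above can jump ahead by that count instead of testing every 4k page.
 * A standalone model of the loop; dirty_in() stands in for
 * kvm_radix_test_clear_dirty() and is hypothetical.
 */
static inline unsigned long count_dirty_model(int (*dirty_in)(unsigned long),
                                              unsigned long npages_total)
{
        unsigned long i, j, total = 0;
        int npages;

        for (i = 0; i < npages_total; i = j) {
                npages = dirty_in(i);
                j = i + 1;
                if (npages) {
                        total += npages;
                        j = i + npages; /* skip the rest of the huge page */
                }
        }
        return total;
}
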
static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info,
                                 int psize, int *indexp)
{
        if (!mmu_psize_defs[psize].shift)
                return;
        info->ap_encodings[*indexp] = mmu_psize_defs[psize].shift |
                (mmu_psize_defs[psize].ap << 29);
        ++(*indexp);
}

int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info)
{
        int i;

        if (!radix_enabled())
                return -EINVAL;
        memset(info, 0, sizeof(*info));

        /* 4k page size */
        info->geometries[0].page_shift = 12;
        info->geometries[0].level_bits[0] = 9;
        for (i = 1; i < 4; ++i)
                info->geometries[0].level_bits[i] = p9_supported_radix_bits[i];
        /* 64k page size */
        info->geometries[1].page_shift = 16;
        for (i = 0; i < 4; ++i)
                info->geometries[1].level_bits[i] = p9_supported_radix_bits[i];

        i = 0;
        add_rmmu_ap_encoding(info, MMU_PAGE_4K, &i);
        add_rmmu_ap_encoding(info, MMU_PAGE_64K, &i);
        add_rmmu_ap_encoding(info, MMU_PAGE_2M, &i);
        add_rmmu_ap_encoding(info, MMU_PAGE_1G, &i);
        return 0;
}
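
/*
 * Illustrative sketch, not part of the original driver (the helper name is
 * made up): each ap_encodings[] entry advertised to userspace packs the
 * page shift into the low bits and the hardware "actual page size" (AP)
 * code into bits 29-31, e.g. a 2MB page is encoded as 21 | (ap << 29).
 */
static inline u32 pack_ap_encoding(unsigned int page_shift, unsigned int ap)
{
        return page_shift | (ap << 29);
}
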
int kvmppc_init_vm_radix(struct kvm *kvm)
{
        kvm->arch.pgtable = pgd_alloc(kvm->mm);
        if (!kvm->arch.pgtable)
                return -ENOMEM;
        return 0;
}
void kvmppc_free_radix(struct kvm *kvm)
{
        unsigned long ig, iu, im;
        pte_t *pte;
        pmd_t *pmd;
        pud_t *pud;
        pgd_t *pgd;

        if (!kvm->arch.pgtable)
                return;
        pgd = kvm->arch.pgtable;
        for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
                if (!pgd_present(*pgd))
                        continue;
                pud = pud_offset(pgd, 0);
                for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++pud) {
                        if (!pud_present(*pud))
                                continue;
                        pmd = pmd_offset(pud, 0);
                        for (im = 0; im < PTRS_PER_PMD; ++im, ++pmd) {
                                if (pmd_huge(*pmd)) {
                                        pmd_clear(pmd);
                                        continue;
                                }
                                if (!pmd_present(*pmd))
                                        continue;
                                pte = pte_offset_map(pmd, 0);
                                memset(pte, 0, sizeof(long) << PTE_INDEX_SIZE);
                                kvmppc_pte_free(pte);
                                pmd_clear(pmd);
                        }
                        pmd_free(kvm->mm, pmd_offset(pud, 0));
                        pud_clear(pud);
                }
                pud_free(kvm->mm, pud_offset(pgd, 0));
                pgd_clear(pgd);
        }
        pgd_free(kvm->mm, kvm->arch.pgtable);
        kvm->arch.pgtable = NULL;
}
static void pte_ctor(void *addr)
{
        memset(addr, 0, PTE_TABLE_SIZE);
}
int kvmppc_radix_init(void)
{
        unsigned long size = sizeof(void *) << PTE_INDEX_SIZE;

        kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor);
        if (!kvm_pte_cache)
                return -ENOMEM;
        return 0;
}
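
/*
 * Note on the cache above (an observation, with PTE_INDEX_SIZE = 9 used
 * purely as an example): each "kvm-pte" object is one complete lowest-level
 * radix page table, 2^PTE_INDEX_SIZE eight-byte entries (512 * 8 = 4096
 * bytes in the example), and it is aligned to its own size because
 * kmem_cache_create() is passed the same value for size and align, which
 * keeps the low bits of the page-table base clear as the tree format
 * requires.
 */
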
void kvmppc_radix_exit(void)
{
        kmem_cache_destroy(kvm_pte_cache);
}