// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2019 Western Digital Corporation or its affiliates.
 *
 * Authors:
 *	Anup Patel <anup.patel@wdc.com>
 */

#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/hugetlb.h>
#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/kvm_host.h>
#include <linux/sched/signal.h>
#include <asm/kvm_nacl.h>
#include <asm/page.h>
#include <asm/pgtable.h>

#ifdef CONFIG_64BIT
static unsigned long gstage_mode __ro_after_init = (HGATP_MODE_SV39X4 << HGATP_MODE_SHIFT);
static unsigned long gstage_pgd_levels __ro_after_init = 3;
#define gstage_index_bits	9
#else
static unsigned long gstage_mode __ro_after_init = (HGATP_MODE_SV32X4 << HGATP_MODE_SHIFT);
static unsigned long gstage_pgd_levels __ro_after_init = 2;
#define gstage_index_bits	10
#endif

#define gstage_pgd_xbits	2
#define gstage_pgd_size	(1UL << (HGATP_PAGE_SHIFT + gstage_pgd_xbits))
#define gstage_gpa_bits	(HGATP_PAGE_SHIFT + \
			 (gstage_pgd_levels * gstage_index_bits) + \
			 gstage_pgd_xbits)
#define gstage_gpa_size	((gpa_t)(1ULL << gstage_gpa_bits))

#define gstage_pte_leaf(__ptep)	\
	(pte_val(*(__ptep)) & (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC))

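/*
 * Return the page table index of @addr at @level. The root level table
 * of the G-stage is four pages wide (the "x4" schemes), so its index
 * covers gstage_pgd_xbits extra bits.
 */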
static inline unsigned long gstage_pte_index(gpa_t addr, u32 level)
{
	unsigned long mask;
	unsigned long shift = HGATP_PAGE_SHIFT + (gstage_index_bits * level);

	if (level == (gstage_pgd_levels - 1))
		mask = (PTRS_PER_PTE * (1UL << gstage_pgd_xbits)) - 1;
	else
		mask = PTRS_PER_PTE - 1;

	return (addr >> shift) & mask;
}

static inline unsigned long gstage_pte_page_vaddr(pte_t pte)
{
	return (unsigned long)pfn_to_virt(__page_val_to_pfn(pte_val(pte)));
}

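/*
 * Convert a mapping size into the G-stage page table level whose leaf
 * PTE covers exactly that size; returns -EINVAL if no level matches.
 */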
static int gstage_page_size_to_level(unsigned long page_size, u32 *out_level)
{
	u32 i;
	unsigned long psz = 1UL << 12;

	for (i = 0; i < gstage_pgd_levels; i++) {
		if (page_size == (psz << (i * gstage_index_bits))) {
			*out_level = i;
			return 0;
		}
	}

	return -EINVAL;
}

static int gstage_level_to_page_order(u32 level, unsigned long *out_pgorder)
{
	if (gstage_pgd_levels < level)
		return -EINVAL;

	*out_pgorder = 12 + (level * gstage_index_bits);
	return 0;
}

static int gstage_level_to_page_size(u32 level, unsigned long *out_pgsize)
{
	int rc;
	unsigned long page_order = PAGE_SHIFT;

	rc = gstage_level_to_page_order(level, &page_order);
	if (rc)
		return rc;

	*out_pgsize = BIT(page_order);
	return 0;
}

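/*
 * Walk the G-stage page table and return the leaf PTE (and its level)
 * mapping @addr. Returns false if the walk ends on a non-present or
 * non-leaf entry.
 */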
static bool gstage_get_leaf_entry(struct kvm *kvm, gpa_t addr,
				  pte_t **ptepp, u32 *ptep_level)
{
	pte_t *ptep;
	u32 current_level = gstage_pgd_levels - 1;

	*ptep_level = current_level;
	ptep = (pte_t *)kvm->arch.pgd;
	ptep = &ptep[gstage_pte_index(addr, current_level)];
	while (ptep && pte_val(ptep_get(ptep))) {
		if (gstage_pte_leaf(ptep)) {
			*ptep_level = current_level;
			*ptepp = ptep;
			return true;
		}

		if (current_level) {
			current_level--;
			*ptep_level = current_level;
			ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
			ptep = &ptep[gstage_pte_index(addr, current_level)];
		} else {
			ptep = NULL;
		}
	}

	return false;
}

static void gstage_remote_tlb_flush(struct kvm *kvm, u32 level, gpa_t addr)
{
	unsigned long order = PAGE_SHIFT;

	if (gstage_level_to_page_order(level, &order))
		return;

	addr &= ~(BIT(order) - 1);

	kvm_riscv_hfence_gvma_vmid_gpa(kvm, -1UL, 0, addr, BIT(order), order);
}

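/*
 * Install @new_pte at @level for @addr, allocating intermediate page
 * table pages from @pcache as needed. Fails with -EEXIST if a leaf
 * mapping already exists at a higher level.
 */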
static int gstage_set_pte(struct kvm *kvm, u32 level,
			  struct kvm_mmu_memory_cache *pcache,
			  gpa_t addr, const pte_t *new_pte)
{
	u32 current_level = gstage_pgd_levels - 1;
	pte_t *next_ptep = (pte_t *)kvm->arch.pgd;
	pte_t *ptep = &next_ptep[gstage_pte_index(addr, current_level)];

	if (current_level < level)
		return -EINVAL;

	while (current_level != level) {
		if (gstage_pte_leaf(ptep))
			return -EEXIST;

		if (!pte_val(ptep_get(ptep))) {
			if (!pcache)
				return -ENOMEM;
			next_ptep = kvm_mmu_memory_cache_alloc(pcache);
			if (!next_ptep)
				return -ENOMEM;
			set_pte(ptep, pfn_pte(PFN_DOWN(__pa(next_ptep)),
					      __pgprot(_PAGE_TABLE)));
		} else {
			if (gstage_pte_leaf(ptep))
				return -EEXIST;
			next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
		}

		current_level--;
		ptep = &next_ptep[gstage_pte_index(addr, current_level)];
	}

	set_pte(ptep, *new_pte);
	if (gstage_pte_leaf(ptep))
		gstage_remote_tlb_flush(kvm, current_level, addr);

	return 0;
}

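/*
 * Create a leaf G-stage mapping of @page_size bytes for @gpa -> @hpa,
 * with permissions derived from @page_rdonly and @page_exec.
 */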
static int gstage_map_page(struct kvm *kvm,
			   struct kvm_mmu_memory_cache *pcache,
			   gpa_t gpa, phys_addr_t hpa,
			   unsigned long page_size,
			   bool page_rdonly, bool page_exec)
{
	int ret;
	u32 level = 0;
	pte_t new_pte;
	pgprot_t prot;

	ret = gstage_page_size_to_level(page_size, &level);
	if (ret)
		return ret;

	/*
	 * A RISC-V implementation can choose to either:
	 * 1) Update 'A' and 'D' PTE bits in hardware
	 * 2) Generate page fault when 'A' and/or 'D' bits are not set in
	 *    PTE so that software can update these bits.
	 *
	 * We support both options mentioned above. To achieve this, we
	 * always set 'A' and 'D' PTE bits at time of creating G-stage
	 * mapping. To support KVM dirty page logging with both options
	 * mentioned above, we will write-protect G-stage PTEs to track
	 * dirty pages.
	 */

	if (page_exec) {
		if (page_rdonly)
			prot = PAGE_READ_EXEC;
		else
			prot = PAGE_WRITE_EXEC;
	} else {
		if (page_rdonly)
			prot = PAGE_READ;
		else
			prot = PAGE_WRITE;
	}
	new_pte = pfn_pte(PFN_DOWN(hpa), prot);
	new_pte = pte_mkdirty(new_pte);

	return gstage_set_pte(kvm, level, pcache, gpa, &new_pte);
}

enum gstage_op {
	GSTAGE_OP_NOP = 0,	/* Nothing */
	GSTAGE_OP_CLEAR,	/* Clear/Unmap */
	GSTAGE_OP_WP,		/* Write-protect */
};

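/*
 * Apply @op to the PTE at @ptep (which sits at @ptep_level). Non-leaf
 * entries are handled recursively; modified leaf entries are flushed
 * from the remote TLBs.
 */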
static void gstage_op_pte(struct kvm *kvm, gpa_t addr,
			  pte_t *ptep, u32 ptep_level, enum gstage_op op)
{
	int i, ret;
	pte_t *next_ptep;
	u32 next_ptep_level;
	unsigned long next_page_size, page_size;

	ret = gstage_level_to_page_size(ptep_level, &page_size);
	if (ret)
		return;

	BUG_ON(addr & (page_size - 1));

	if (!pte_val(ptep_get(ptep)))
		return;

	if (ptep_level && !gstage_pte_leaf(ptep)) {
		next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
		next_ptep_level = ptep_level - 1;
		ret = gstage_level_to_page_size(next_ptep_level,
						&next_page_size);
		if (ret)
			return;

		if (op == GSTAGE_OP_CLEAR)
			set_pte(ptep, __pte(0));
		for (i = 0; i < PTRS_PER_PTE; i++)
			gstage_op_pte(kvm, addr + i * next_page_size,
				      &next_ptep[i], next_ptep_level, op);
		if (op == GSTAGE_OP_CLEAR)
			put_page(virt_to_page(next_ptep));
	} else {
		if (op == GSTAGE_OP_CLEAR)
			set_pte(ptep, __pte(0));
		else if (op == GSTAGE_OP_WP)
			set_pte(ptep, __pte(pte_val(ptep_get(ptep)) & ~_PAGE_WRITE));
		gstage_remote_tlb_flush(kvm, ptep_level, addr);
	}
}

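/*
 * Unmap the GPA range [start, start + size). When @may_block is set,
 * the mmu_lock is dropped periodically so large ranges do not starve
 * other users or trip the lockup detector.
 */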
static void gstage_unmap_range(struct kvm *kvm, gpa_t start,
			       gpa_t size, bool may_block)
{
	int ret;
	pte_t *ptep;
	u32 ptep_level;
	bool found_leaf;
	unsigned long page_size;
	gpa_t addr = start, end = start + size;

	while (addr < end) {
		found_leaf = gstage_get_leaf_entry(kvm, addr,
						   &ptep, &ptep_level);
		ret = gstage_level_to_page_size(ptep_level, &page_size);
		if (ret)
			break;

		if (!found_leaf)
			goto next;

		if (!(addr & (page_size - 1)) && ((end - addr) >= page_size))
			gstage_op_pte(kvm, addr, ptep,
				      ptep_level, GSTAGE_OP_CLEAR);

next:
		addr += page_size;

		/*
		 * If the range is too large, release the kvm->mmu_lock
		 * to prevent starvation and lockup detector warnings.
		 */
		if (may_block && addr < end)
			cond_resched_lock(&kvm->mmu_lock);
	}
}

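/* Write-protect all leaf PTEs mapping the GPA range [start, end). */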
static void gstage_wp_range(struct kvm *kvm, gpa_t start, gpa_t end)
{
	int ret;
	pte_t *ptep;
	u32 ptep_level;
	bool found_leaf;
	gpa_t addr = start;
	unsigned long page_size;

	while (addr < end) {
		found_leaf = gstage_get_leaf_entry(kvm, addr,
						   &ptep, &ptep_level);
		ret = gstage_level_to_page_size(ptep_level, &page_size);
		if (ret)
			break;

		if (!found_leaf)
			goto next;

		if (!(addr & (page_size - 1)) && ((end - addr) >= page_size))
			gstage_op_pte(kvm, addr, ptep,
				      ptep_level, GSTAGE_OP_WP);

next:
		addr += page_size;
	}
}

static void gstage_wp_memory_region(struct kvm *kvm, int slot)
{
	struct kvm_memslots *slots = kvm_memslots(kvm);
	struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
	phys_addr_t start = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;

	spin_lock(&kvm->mmu_lock);
	gstage_wp_range(kvm, start, end);
	spin_unlock(&kvm->mmu_lock);
	kvm_flush_remote_tlbs(kvm);
}

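/*
 * Map an I/O region (@hpa, @size) into the guest at @gpa using 4K PTEs.
 * When @in_atomic is set, page table pages are topped up with GFP_ATOMIC
 * because the caller cannot sleep.
 */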
int kvm_riscv_gstage_ioremap(struct kvm *kvm, gpa_t gpa,
			     phys_addr_t hpa, unsigned long size,
			     bool writable, bool in_atomic)
{
	pte_t pte;
	int ret = 0;
	unsigned long pfn;
	phys_addr_t addr, end;
	struct kvm_mmu_memory_cache pcache = {
		.gfp_custom = (in_atomic) ? GFP_ATOMIC | __GFP_ACCOUNT : 0,
		.gfp_zero = __GFP_ZERO,
	};

	end = (gpa + size + PAGE_SIZE - 1) & PAGE_MASK;
	pfn = __phys_to_pfn(hpa);

	for (addr = gpa; addr < end; addr += PAGE_SIZE) {
		pte = pfn_pte(pfn, PAGE_KERNEL_IO);

		if (!writable)
			pte = pte_wrprotect(pte);

		ret = kvm_mmu_topup_memory_cache(&pcache, gstage_pgd_levels);
		if (ret)
			goto out;

		spin_lock(&kvm->mmu_lock);
		ret = gstage_set_pte(kvm, 0, &pcache, addr, &pte);
		spin_unlock(&kvm->mmu_lock);
		if (ret)
			goto out;

		pfn++;
	}

out:
	kvm_mmu_free_memory_cache(&pcache);
	return ret;
}

void kvm_riscv_gstage_iounmap(struct kvm *kvm, gpa_t gpa, unsigned long size)
{
	spin_lock(&kvm->mmu_lock);
	gstage_unmap_range(kvm, gpa, size, false);
	spin_unlock(&kvm->mmu_lock);
}

void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
					     struct kvm_memory_slot *slot,
					     gfn_t gfn_offset,
					     unsigned long mask)
{
	phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
	phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
	phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;

	gstage_wp_range(kvm, start, end);
}

void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
}

void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free)
{
}

void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
{
}

void kvm_arch_flush_shadow_all(struct kvm *kvm)
{
	kvm_riscv_gstage_free_pgd(kvm);
}

void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
				   struct kvm_memory_slot *slot)
{
	gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
	phys_addr_t size = slot->npages << PAGE_SHIFT;

	spin_lock(&kvm->mmu_lock);
	gstage_unmap_range(kvm, gpa, size, false);
	spin_unlock(&kvm->mmu_lock);
}

void kvm_arch_commit_memory_region(struct kvm *kvm,
				   struct kvm_memory_slot *old,
				   const struct kvm_memory_slot *new,
				   enum kvm_mr_change change)
{
	/*
	 * At this point memslot has been committed and there is an
	 * allocated dirty_bitmap[], dirty pages will be tracked while
	 * the memory slot is write protected.
	 */
	if (change != KVM_MR_DELETE && new->flags & KVM_MEM_LOG_DIRTY_PAGES)
		gstage_wp_memory_region(kvm, new->id);
}

int kvm_arch_prepare_memory_region(struct kvm *kvm,
				   const struct kvm_memory_slot *old,
				   struct kvm_memory_slot *new,
				   enum kvm_mr_change change)
{
	hva_t hva, reg_end, size;
	gpa_t base_gpa;
	bool writable;
	int ret = 0;

	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
	    change != KVM_MR_FLAGS_ONLY)
		return 0;

	/*
	 * Prevent userspace from creating a memory region outside of the GPA
	 * space addressable by the KVM guest GPA space.
	 */
	if ((new->base_gfn + new->npages) >=
	    (gstage_gpa_size >> PAGE_SHIFT))
		return -EFAULT;

	hva = new->userspace_addr;
	size = new->npages << PAGE_SHIFT;
	reg_end = hva + size;
	base_gpa = new->base_gfn << PAGE_SHIFT;
	writable = !(new->flags & KVM_MEM_READONLY);

	mmap_read_lock(current->mm);

	/*
	 * A memory region could potentially cover multiple VMAs, and
	 * any holes between them, so iterate over all of them to find
	 * out if we can map any of them right now.
	 *
	 *     +--------------------------------------------+
	 * +---------------+----------------+   +----------------+
	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
	 * +---------------+----------------+   +----------------+
	 *     |               memory region                |
	 *     +--------------------------------------------+
	 */
	do {
		struct vm_area_struct *vma = find_vma(current->mm, hva);
		hva_t vm_start, vm_end;

		if (!vma || vma->vm_start >= reg_end)
			break;

		/*
		 * Mapping a read-only VMA is only allowed if the
		 * memory region is configured as read-only.
		 */
		if (writable && !(vma->vm_flags & VM_WRITE)) {
			ret = -EPERM;
			break;
		}

		/* Take the intersection of this VMA with the memory region */
		vm_start = max(hva, vma->vm_start);
		vm_end = min(reg_end, vma->vm_end);

		if (vma->vm_flags & VM_PFNMAP) {
			gpa_t gpa = base_gpa + (vm_start - hva);
			phys_addr_t pa;

			pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
			pa += vm_start - vma->vm_start;

			/* IO region dirty page logging not allowed */
			if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
				ret = -EINVAL;
				goto out;
			}

			ret = kvm_riscv_gstage_ioremap(kvm, gpa, pa,
						       vm_end - vm_start,
						       writable, false);
			if (ret)
				break;
		}
		hva = vm_end;
	} while (hva < reg_end);

	if (change == KVM_MR_FLAGS_ONLY)
		goto out;

	if (ret)
		kvm_riscv_gstage_iounmap(kvm, base_gpa, size);

out:
	mmap_read_unlock(current->mm);
	return ret;
}

bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	if (!kvm->arch.pgd)
		return false;

	gstage_unmap_range(kvm, range->start << PAGE_SHIFT,
			   (range->end - range->start) << PAGE_SHIFT,
			   range->may_block);
	return false;
}

bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	pte_t *ptep;
	u32 ptep_level = 0;
	u64 size = (range->end - range->start) << PAGE_SHIFT;

	if (!kvm->arch.pgd)
		return false;

	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);

	if (!gstage_get_leaf_entry(kvm, range->start << PAGE_SHIFT,
				   &ptep, &ptep_level))
		return false;

	return ptep_test_and_clear_young(NULL, 0, ptep);
}

bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	pte_t *ptep;
	u32 ptep_level = 0;
	u64 size = (range->end - range->start) << PAGE_SHIFT;

	if (!kvm->arch.pgd)
		return false;

	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);

	if (!gstage_get_leaf_entry(kvm, range->start << PAGE_SHIFT,
				   &ptep, &ptep_level))
		return false;

	return pte_young(ptep_get(ptep));
}

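/*
 * Handle a G-stage page fault for @gpa/@hva: fault in the backing page,
 * pick the mapping size from the VMA, and install the mapping while
 * honouring dirty logging (only write faults get writable mappings).
 */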
int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu,
			 struct kvm_memory_slot *memslot,
			 gpa_t gpa, unsigned long hva, bool is_write)
{
	int ret;
	kvm_pfn_t hfn;
	bool writable;
	short vma_pageshift;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	struct vm_area_struct *vma;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_memory_cache *pcache = &vcpu->arch.mmu_page_cache;
	bool logging = (memslot->dirty_bitmap &&
			!(memslot->flags & KVM_MEM_READONLY)) ? true : false;
	unsigned long vma_pagesize, mmu_seq;
	struct page *page;

	/* We need minimum second+third level pages */
	ret = kvm_mmu_topup_memory_cache(pcache, gstage_pgd_levels);
	if (ret) {
		kvm_err("Failed to topup G-stage cache\n");
		return ret;
	}

	mmap_read_lock(current->mm);

	vma = vma_lookup(current->mm, hva);
	if (unlikely(!vma)) {
		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
		mmap_read_unlock(current->mm);
		return -EFAULT;
	}

	if (is_vm_hugetlb_page(vma))
		vma_pageshift = huge_page_shift(hstate_vma(vma));
	else
		vma_pageshift = PAGE_SHIFT;
	vma_pagesize = 1ULL << vma_pageshift;
	if (logging || (vma->vm_flags & VM_PFNMAP))
		vma_pagesize = PAGE_SIZE;

	if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
		gfn = (gpa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;

	/*
	 * Read mmu_invalidate_seq so that KVM can detect if the results of
	 * vma_lookup() or __kvm_faultin_pfn() become stale prior to acquiring
	 * kvm->mmu_lock.
	 *
	 * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
	 * with the smp_wmb() in kvm_mmu_invalidate_end().
	 */
	mmu_seq = kvm->mmu_invalidate_seq;
	mmap_read_unlock(current->mm);

	if (vma_pagesize != PUD_SIZE &&
	    vma_pagesize != PMD_SIZE &&
	    vma_pagesize != PAGE_SIZE) {
		kvm_err("Invalid VMA page size 0x%lx\n", vma_pagesize);
		return -EFAULT;
	}

	hfn = kvm_faultin_pfn(vcpu, gfn, is_write, &writable, &page);
	if (hfn == KVM_PFN_ERR_HWPOISON) {
		send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva,
				vma_pageshift, current);
		return 0;
	}
	if (is_error_noslot_pfn(hfn))
		return -EFAULT;

	/*
	 * If logging is active then we allow writable pages only
	 * for write faults.
	 */
	if (logging && !is_write)
		writable = false;

	spin_lock(&kvm->mmu_lock);

	if (mmu_invalidate_retry(kvm, mmu_seq))
		goto out_unlock;

	if (writable) {
		mark_page_dirty(kvm, gfn);
		ret = gstage_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT,
				      vma_pagesize, false, true);
	} else {
		ret = gstage_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT,
				      vma_pagesize, true, true);
	}

	if (ret)
		kvm_err("Failed to map in G-stage\n");

out_unlock:
	kvm_release_faultin_page(kvm, page, ret && ret != -EEXIST, writable);
	spin_unlock(&kvm->mmu_lock);
	return ret;
}

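/* Allocate the (four page wide) G-stage root page table for @kvm. */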
int kvm_riscv_gstage_alloc_pgd(struct kvm *kvm)
{
	struct page *pgd_page;

	if (kvm->arch.pgd != NULL) {
		kvm_err("kvm_arch already initialized?\n");
		return -EINVAL;
	}

	pgd_page = alloc_pages(GFP_KERNEL | __GFP_ZERO,
			       get_order(gstage_pgd_size));
	if (!pgd_page)
		return -ENOMEM;
	kvm->arch.pgd = page_to_virt(pgd_page);
	kvm->arch.pgd_phys = page_to_phys(pgd_page);

	return 0;
}

void kvm_riscv_gstage_free_pgd(struct kvm *kvm)
{
	void *pgd = NULL;

	spin_lock(&kvm->mmu_lock);
	if (kvm->arch.pgd) {
		gstage_unmap_range(kvm, 0UL, gstage_gpa_size, false);
		pgd = READ_ONCE(kvm->arch.pgd);
		kvm->arch.pgd = NULL;
		kvm->arch.pgd_phys = 0;
	}
	spin_unlock(&kvm->mmu_lock);

	if (pgd)
		free_pages((unsigned long)pgd, get_order(gstage_pgd_size));
}

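/*
 * Program the HGATP CSR with this VM's G-stage mode, VMID, and root
 * page table. Without VMID support, flush the local G-stage TLB since
 * translations cannot be distinguished by VMID.
 */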
void kvm_riscv_gstage_update_hgatp(struct kvm_vcpu *vcpu)
{
	unsigned long hgatp = gstage_mode;
	struct kvm_arch *k = &vcpu->kvm->arch;

	hgatp |= (READ_ONCE(k->vmid.vmid) << HGATP_VMID_SHIFT) & HGATP_VMID;
	hgatp |= (k->pgd_phys >> PAGE_SHIFT) & HGATP_PPN;

	ncsr_write(CSR_HGATP, hgatp);

	if (!kvm_riscv_gstage_vmid_bits())
		kvm_riscv_local_hfence_gvma_all();
}

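/*
 * Probe the widest G-stage mode supported by the hardware (Sv57x4, then
 * Sv48x4) by writing HGATP and reading it back; otherwise keep the
 * compile-time default.
 */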
void __init kvm_riscv_gstage_mode_detect(void)
{
#ifdef CONFIG_64BIT
	/* Try Sv57x4 G-stage mode */
	csr_write(CSR_HGATP, HGATP_MODE_SV57X4 << HGATP_MODE_SHIFT);
	if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV57X4) {
		gstage_mode = (HGATP_MODE_SV57X4 << HGATP_MODE_SHIFT);
		gstage_pgd_levels = 5;
		goto skip_sv48x4_test;
	}

	/* Try Sv48x4 G-stage mode */
	csr_write(CSR_HGATP, HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT);
	if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV48X4) {
		gstage_mode = (HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT);
		gstage_pgd_levels = 4;
	}
skip_sv48x4_test:

	csr_write(CSR_HGATP, 0);
	kvm_riscv_local_hfence_gvma_all();
#endif
}

unsigned long __init kvm_riscv_gstage_mode(void)
{
	return gstage_mode >> HGATP_MODE_SHIFT;
}

int kvm_riscv_gstage_gpa_bits(void)
{
	return gstage_gpa_bits;
}