/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 *
 * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */

#include <linux/types.h>
#include <linux/string.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/highmem.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
#include <linux/srcu.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/debugfs.h>

#include <asm/tlbflush.h>
#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/book3s/64/mmu-hash.h>
#include <asm/hvcall.h>
#include <asm/synch.h>
#include <asm/ppc-opcode.h>
#include <asm/cputable.h>

#include "trace_hv.h"

/* Power architecture requires HPT is at least 256kB */
#define PPC_MIN_HPT_ORDER	18

static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
				long pte_index, unsigned long pteh,
				unsigned long ptel, unsigned long *pte_idx_ret);
static void kvmppc_rmap_reset(struct kvm *kvm);
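
/*
 * Allocate the guest hashed page table (HPT) and its reverse-map
 * (revmap) array.  The HPT comes from the CMA pool when possible,
 * otherwise from the page allocator, falling back to successively
 * smaller orders (down to PPC_MIN_HPT_ORDER) when userspace did not
 * request a specific size.
 */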
long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
{
	unsigned long hpt = 0;
	struct revmap_entry *rev;
	struct page *page = NULL;
	long order = KVM_DEFAULT_HPT_ORDER;

	if (htab_orderp) {
		order = *htab_orderp;
		if (order < PPC_MIN_HPT_ORDER)
			order = PPC_MIN_HPT_ORDER;
	}

	kvm->arch.hpt_cma_alloc = 0;
	page = kvm_alloc_hpt(1ul << (order - PAGE_SHIFT));
	if (page) {
		hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
		memset((void *)hpt, 0, (1ul << order));
		kvm->arch.hpt_cma_alloc = 1;
	}

	/* Lastly try successively smaller sizes from the page allocator */
	/* Only do this if userspace didn't specify a size via ioctl */
	while (!hpt && order > PPC_MIN_HPT_ORDER && !htab_orderp) {
		hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|
				       __GFP_NOWARN, order - PAGE_SHIFT);
		if (!hpt)
			--order;
	}

	if (!hpt)
		return -ENOMEM;

	kvm->arch.hpt_virt = hpt;
	kvm->arch.hpt_order = order;
	/* HPTEs are 2**4 bytes long */
	kvm->arch.hpt_npte = 1ul << (order - 4);
	/* 128 (2**7) bytes in each HPTEG */
	kvm->arch.hpt_mask = (1ul << (order - 7)) - 1;

	atomic64_set(&kvm->arch.mmio_update, 0);

	/* Allocate reverse map array */
	rev = vmalloc(sizeof(struct revmap_entry) * kvm->arch.hpt_npte);
	if (!rev) {
		pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n");
		goto out_freehpt;
	}
	kvm->arch.revmap = rev;
	kvm->arch.sdr1 = __pa(hpt) | (order - 18);

	pr_info("KVM guest htab at %lx (order %ld), LPID %x\n",
		hpt, order, kvm->arch.lpid);

	if (htab_orderp)
		*htab_orderp = order;
	return 0;

 out_freehpt:
	if (kvm->arch.hpt_cma_alloc)
		kvm_release_hpt(page, 1 << (order - PAGE_SHIFT));
	else
		free_pages(hpt, order - PAGE_SHIFT);
	return -ENOMEM;
}

long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp)
{
	long err = -EBUSY;
	long order;

	mutex_lock(&kvm->lock);
	if (kvm->arch.hpte_setup_done) {
		kvm->arch.hpte_setup_done = 0;
		/* order hpte_setup_done vs. vcpus_running */
		smp_mb();
		if (atomic_read(&kvm->arch.vcpus_running)) {
			kvm->arch.hpte_setup_done = 1;
			goto out;
		}
	}
	if (kvm->arch.hpt_virt) {
		order = kvm->arch.hpt_order;
		/* Set the entire HPT to 0, i.e. invalid HPTEs */
		memset((void *)kvm->arch.hpt_virt, 0, 1ul << order);
		/*
		 * Reset all the reverse-mapping chains for all memslots
		 */
		kvmppc_rmap_reset(kvm);
		/* Ensure that each vcpu will flush its TLB on next entry. */
		cpumask_setall(&kvm->arch.need_tlb_flush);
		*htab_orderp = order;
		err = 0;
	} else {
		err = kvmppc_alloc_hpt(kvm, htab_orderp);
		order = *htab_orderp;
	}
 out:
	mutex_unlock(&kvm->lock);
	return err;
}

void kvmppc_free_hpt(struct kvm *kvm)
{
	kvmppc_free_lpid(kvm->arch.lpid);
	vfree(kvm->arch.revmap);
	if (kvm->arch.hpt_cma_alloc)
		kvm_release_hpt(virt_to_page(kvm->arch.hpt_virt),
				1 << (kvm->arch.hpt_order - PAGE_SHIFT));
	else
		free_pages(kvm->arch.hpt_virt,
			   kvm->arch.hpt_order - PAGE_SHIFT);
}

/* Bits in first HPTE dword for pagesize 4k, 64k or 16M */
static inline unsigned long hpte0_pgsize_encoding(unsigned long pgsize)
{
	return (pgsize > 0x1000) ? HPTE_V_LARGE : 0;
}

/* Bits in second HPTE dword for pagesize 4k, 64k or 16M */
static inline unsigned long hpte1_pgsize_encoding(unsigned long pgsize)
{
	return (pgsize == 0x10000) ? 0x1000 : 0;
}
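
/*
 * Create bolted HPT entries mapping the guest's virtual real mode area
 * (VRMA).  One HPTE is installed per HPTEG (always in slot 7), covering
 * at most 1TB of guest real address space.
 */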
void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
		     unsigned long porder)
{
	unsigned long i;
	unsigned long npages;
	unsigned long hp_v, hp_r;
	unsigned long addr, hash;
	unsigned long psize;
	unsigned long hp0, hp1;
	unsigned long idx_ret;
	long ret;
	struct kvm *kvm = vcpu->kvm;

	psize = 1ul << porder;
	npages = memslot->npages >> (porder - PAGE_SHIFT);

	/* VRMA can't be > 1TB */
	if (npages > 1ul << (40 - porder))
		npages = 1ul << (40 - porder);
	/* Can't use more than 1 HPTE per HPTEG */
	if (npages > kvm->arch.hpt_mask + 1)
		npages = kvm->arch.hpt_mask + 1;

	hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
		HPTE_V_BOLTED | hpte0_pgsize_encoding(psize);
	hp1 = hpte1_pgsize_encoding(psize) |
		HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX;

	for (i = 0; i < npages; ++i) {
		addr = i << porder;
		/* can't use hpt_hash since va > 64 bits */
		hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & kvm->arch.hpt_mask;
		/*
		 * We assume that the hash table is empty and no
		 * vcpus are using it at this stage.  Since we create
		 * at most one HPTE per HPTEG, we just assume entry 7
		 * is available and use it.
		 */
		hash = (hash << 3) + 7;
		hp_v = hp0 | ((addr >> 16) & ~0x7fUL);
		hp_r = hp1 | addr;
		ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, hash, hp_v, hp_r,
						 &idx_ret);
		if (ret != H_SUCCESS) {
			pr_err("KVM: map_vrma at %lx failed, ret=%ld\n",
			       addr, ret);
			break;
		}
	}
}

int kvmppc_mmu_hv_init(void)
{
	unsigned long host_lpid, rsvd_lpid;

	if (!cpu_has_feature(CPU_FTR_HVMODE))
		return -EINVAL;

	/* POWER7 has 10-bit LPIDs (12-bit in POWER8) */
	host_lpid = mfspr(SPRN_LPID);
	rsvd_lpid = LPID_RSVD;

	kvmppc_init_lpid(rsvd_lpid + 1);

	kvmppc_claim_lpid(host_lpid);
	/* rsvd_lpid is reserved for use in partition switching */
	kvmppc_claim_lpid(rsvd_lpid);

	return 0;
}

static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
{
	unsigned long msr = vcpu->arch.intr_msr;

	/* If transactional, change to suspend mode on IRQ delivery */
	if (MSR_TM_TRANSACTIONAL(vcpu->arch.shregs.msr))
		msr |= MSR_TS_S;
	else
		msr |= vcpu->arch.shregs.msr & MSR_TS_MASK;
	kvmppc_set_msr(vcpu, msr);
}

static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
				long pte_index, unsigned long pteh,
				unsigned long ptel, unsigned long *pte_idx_ret)
{
	long ret;

	/* Protect linux PTE lookup from page table destruction */
	rcu_read_lock_sched();	/* this disables preemption too */
	ret = kvmppc_do_h_enter(kvm, flags, pte_index, pteh, ptel,
				current->mm->pgd, false, pte_idx_ret);
	rcu_read_unlock_sched();
	if (ret == H_TOO_HARD) {
		/* this can't happen */
		pr_err("KVM: Oops, kvmppc_h_enter returned too hard!\n");
		ret = H_RESOURCE;	/* or something */
	}
	return ret;
}

static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu,
							 gva_t eaddr)
{
	u64 mask;
	int i;

	for (i = 0; i < vcpu->arch.slb_nr; i++) {
		if (!(vcpu->arch.slb[i].orige & SLB_ESID_V))
			continue;

		if (vcpu->arch.slb[i].origv & SLB_VSID_B_1T)
			mask = ESID_MASK_1T;
		else
			mask = ESID_MASK;

		if (((vcpu->arch.slb[i].orige ^ eaddr) & mask) == 0)
			return &vcpu->arch.slb[i];
	}
	return NULL;
}

static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r,
			unsigned long ea)
{
	unsigned long ra_mask;

	ra_mask = hpte_page_size(v, r) - 1;
	return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask);
}

static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
			struct kvmppc_pte *gpte, bool data, bool iswrite)
{
	struct kvm *kvm = vcpu->kvm;
	struct kvmppc_slb *slbe;
	unsigned long slb_v;
	unsigned long pp, key;
	unsigned long v, orig_v, gr;
	__be64 *hptep;
	int index;
	int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR);

	/* Get SLB entry */
	if (virtmode) {
		slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr);
		if (!slbe)
			return -EINVAL;
		slb_v = slbe->origv;
	} else {
		/* real mode access */
		slb_v = vcpu->kvm->arch.vrma_slb_v;
	}

	preempt_disable();
	/* Find the HPTE in the hash table */
	index = kvmppc_hv_find_lock_hpte(kvm, eaddr, slb_v,
					 HPTE_V_VALID | HPTE_V_ABSENT);
	if (index < 0) {
		preempt_enable();
		return -ENOENT;
	}
	hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4));
	v = orig_v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
	if (cpu_has_feature(CPU_FTR_ARCH_300))
		v = hpte_new_to_old_v(v, be64_to_cpu(hptep[1]));
	gr = kvm->arch.revmap[index].guest_rpte;

	unlock_hpte(hptep, orig_v);
	preempt_enable();

	gpte->eaddr = eaddr;
	gpte->vpage = ((v & HPTE_V_AVPN) << 4) | ((eaddr >> 12) & 0xfff);

	/* Get PP bits and key for permission check */
	pp = gr & (HPTE_R_PP0 | HPTE_R_PP);
	key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS;
	key &= slb_v;

	/* Calculate permissions */
	gpte->may_read = hpte_read_permission(pp, key);
	gpte->may_write = hpte_write_permission(pp, key);
	gpte->may_execute = gpte->may_read && !(gr & (HPTE_R_N | HPTE_R_G));

	/* Storage key permission check for POWER7 */
	if (data && virtmode) {
		int amrfield = hpte_get_skey_perm(gr, vcpu->arch.amr);
		if (amrfield & 1)
			gpte->may_read = 0;
		if (amrfield & 2)
			gpte->may_write = 0;
	}

	/* Get the guest physical address */
	gpte->raddr = kvmppc_mmu_get_real_addr(v, gr, eaddr);
	return 0;
}

/*
 * Quick test for whether an instruction is a load or a store.
 * If the instruction is a load or a store, then this will indicate
 * which it is, at least on server processors.  (Embedded processors
 * have some external PID instructions that don't follow the rule
 * embodied here.)  If the instruction isn't a load or store, then
 * this doesn't return anything useful.
 */
static int instruction_is_store(unsigned int instr)
{
	unsigned int mask;

	mask = 0x10000000;
	if ((instr & 0xfc000000) == 0x7c000000)
		mask = 0x100;		/* major opcode 31 */
	return (instr & mask) != 0;
}

static int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu,
				  unsigned long gpa, gva_t ea, int is_store)
{
	u32 last_inst;

	/*
	 * If we fail, we just return to the guest and try executing it again.
	 */
	if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst) !=
		EMULATE_DONE)
		return RESUME_GUEST;

	/*
	 * WARNING: We do not know for sure whether the instruction we just
	 * read from memory is the same that caused the fault in the first
	 * place.  If the instruction we read is neither a load nor a store,
	 * then it can't access memory, so we don't need to worry about
	 * enforcing access permissions.  So, assuming it is a load or
	 * store, we just check that its direction (load or store) is
	 * consistent with the original fault, since that's what we
	 * checked the access permissions against.  If there is a mismatch
	 * we just return and retry the instruction.
	 */

	if (instruction_is_store(last_inst) != !!is_store)
		return RESUME_GUEST;

	/*
	 * Emulated accesses are emulated by looking at the hash for
	 * translation once, then performing the access later.  The
	 * translation could be invalidated in the meantime, at which
	 * point performing the subsequent memory access on the old
	 * physical address could possibly be a security hole for the
	 * guest (but not the host).
	 *
	 * This is less of an issue for MMIO stores since they aren't
	 * globally visible.  It could be an issue for MMIO loads to
	 * a certain extent but we'll ignore it for now.
	 */

	vcpu->arch.paddr_accessed = gpa;
	vcpu->arch.vaddr_accessed = ea;
	return kvmppc_emulate_mmio(run, vcpu);
}
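
/*
 * Handle a guest access that faulted on an HPTE the real-mode handler
 * could not resolve: re-check the HPTE, find the backing host page
 * (or PFN-mapped I/O range), and make the HPTE valid, or fall back to
 * MMIO emulation when no memslot covers the guest physical address.
 */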
int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
				unsigned long ea, unsigned long dsisr)
{
	struct kvm *kvm = vcpu->kvm;
	unsigned long hpte[3], r;
	unsigned long hnow_v, hnow_r;
	__be64 *hptep;
	unsigned long mmu_seq, psize, pte_size;
	unsigned long gpa_base, gfn_base;
	unsigned long gpa, gfn, hva, pfn;
	struct kvm_memory_slot *memslot;
	unsigned long *rmap;
	struct revmap_entry *rev;
	struct page *page, *pages[1];
	long index, ret, npages;
	bool is_ci;
	unsigned int writing, write_ok;
	struct vm_area_struct *vma;
	unsigned long rcbits;
	long mmio_update;

	/*
	 * Real-mode code has already searched the HPT and found the
	 * entry we're interested in.  Lock the entry and check that
	 * it hasn't changed.  If it has, just return and re-execute the
	 * instruction.
	 */
	if (ea != vcpu->arch.pgfault_addr)
		return RESUME_GUEST;

	if (vcpu->arch.pgfault_cache) {
		mmio_update = atomic64_read(&kvm->arch.mmio_update);
		if (mmio_update == vcpu->arch.pgfault_cache->mmio_update) {
			r = vcpu->arch.pgfault_cache->rpte;
			psize = hpte_page_size(vcpu->arch.pgfault_hpte[0], r);
			gpa_base = r & HPTE_R_RPN & ~(psize - 1);
			gfn_base = gpa_base >> PAGE_SHIFT;
			gpa = gpa_base | (ea & (psize - 1));
			return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
						      dsisr & DSISR_ISSTORE);
		}
	}
	index = vcpu->arch.pgfault_index;
	hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4));
	rev = &kvm->arch.revmap[index];
	preempt_disable();
	while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
		cpu_relax();
	hpte[0] = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
	hpte[1] = be64_to_cpu(hptep[1]);
	hpte[2] = r = rev->guest_rpte;
	unlock_hpte(hptep, hpte[0]);
	preempt_enable();

	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
		hpte[0] = hpte_new_to_old_v(hpte[0], hpte[1]);
		hpte[1] = hpte_new_to_old_r(hpte[1]);
	}
	if (hpte[0] != vcpu->arch.pgfault_hpte[0] ||
	    hpte[1] != vcpu->arch.pgfault_hpte[1])
		return RESUME_GUEST;

	/* Translate the logical address and get the page */
	psize = hpte_page_size(hpte[0], r);
	gpa_base = r & HPTE_R_RPN & ~(psize - 1);
	gfn_base = gpa_base >> PAGE_SHIFT;
	gpa = gpa_base | (ea & (psize - 1));
	gfn = gpa >> PAGE_SHIFT;
	memslot = gfn_to_memslot(kvm, gfn);

	trace_kvm_page_fault_enter(vcpu, hpte, memslot, ea, dsisr);

	/* No memslot means it's an emulated MMIO region */
	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
		return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
					      dsisr & DSISR_ISSTORE);

	/*
	 * This should never happen, because of the slot_is_aligned()
	 * check in kvmppc_do_h_enter().
	 */
	if (gfn_base < memslot->base_gfn)
		return -EFAULT;

	/* used to check for invalidations in progress */
	mmu_seq = kvm->mmu_notifier_seq;
	smp_rmb();

	ret = -EFAULT;
	is_ci = false;
	pfn = 0;
	page = NULL;
	pte_size = PAGE_SIZE;
	writing = (dsisr & DSISR_ISSTORE) != 0;
	/* If writing != 0, then the HPTE must allow writing, if we get here */
	write_ok = writing;
	hva = gfn_to_hva_memslot(memslot, gfn);
	npages = get_user_pages_fast(hva, 1, writing, pages);
	if (npages < 1) {
		/* Check if it's an I/O mapping */
		down_read(&current->mm->mmap_sem);
		vma = find_vma(current->mm, hva);
		if (vma && vma->vm_start <= hva && hva + psize <= vma->vm_end &&
		    (vma->vm_flags & VM_PFNMAP)) {
			pfn = vma->vm_pgoff +
				((hva - vma->vm_start) >> PAGE_SHIFT);
			pte_size = psize;
			is_ci = pte_ci(__pte((pgprot_val(vma->vm_page_prot))));
			write_ok = vma->vm_flags & VM_WRITE;
		}
		up_read(&current->mm->mmap_sem);
		if (!pfn)
			goto out_put;
	} else {
		page = pages[0];
		pfn = page_to_pfn(page);
		if (PageHuge(page)) {
			page = compound_head(page);
			pte_size <<= compound_order(page);
		}
		/* if the guest wants write access, see if that is OK */
		if (!writing && hpte_is_writable(r)) {
			pte_t *ptep, pte;
			unsigned long flags;
			/*
			 * We need to protect against page table destruction,
			 * hugepage split and collapse.
			 */
			local_irq_save(flags);
			ptep = find_linux_pte_or_hugepte(current->mm->pgd,
							 hva, NULL, NULL);
			if (ptep) {
				pte = kvmppc_read_update_linux_pte(ptep, 1);
				if (pte_write(pte))
					write_ok = 1;
			}
			local_irq_restore(flags);
		}
	}

	if (psize > pte_size)
		goto out_put;

	/* Check WIMG vs. the actual page we're accessing */
	if (!hpte_cache_flags_ok(r, is_ci)) {
		if (is_ci)
			goto out_put;
		/*
		 * Allow guest to map emulated device memory as
		 * uncacheable, but actually make it cacheable.
		 */
		r = (r & ~(HPTE_R_W|HPTE_R_I|HPTE_R_G)) | HPTE_R_M;
	}

	/*
	 * Set the HPTE to point to pfn.
	 * Since the pfn is at PAGE_SIZE granularity, make sure we
	 * don't mask out lower-order bits if psize < PAGE_SIZE.
	 */
	if (psize < PAGE_SIZE)
		psize = PAGE_SIZE;
	r = (r & HPTE_R_KEY_HI) | (r & ~(HPTE_R_PP0 - psize)) |
					((pfn << PAGE_SHIFT) & ~(psize - 1));
	if (hpte_is_writable(r) && !write_ok)
		r = hpte_make_readonly(r);
	ret = RESUME_GUEST;
	preempt_disable();
	while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
		cpu_relax();
	hnow_v = be64_to_cpu(hptep[0]);
	hnow_r = be64_to_cpu(hptep[1]);
	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
		hnow_v = hpte_new_to_old_v(hnow_v, hnow_r);
		hnow_r = hpte_new_to_old_r(hnow_r);
	}
	if ((hnow_v & ~HPTE_V_HVLOCK) != hpte[0] || hnow_r != hpte[1] ||
	    rev->guest_rpte != hpte[2])
		/* HPTE has been changed under us; let the guest retry */
		goto out_unlock;
	hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;

	/* Always put the HPTE in the rmap chain for the page base address */
	rmap = &memslot->arch.rmap[gfn_base - memslot->base_gfn];
	lock_rmap(rmap);

	/* Check if we might have been invalidated; let the guest retry if so */
	ret = RESUME_GUEST;
	if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) {
		unlock_rmap(rmap);
		goto out_unlock;
	}

	/* Only set R/C in real HPTE if set in both *rmap and guest_rpte */
	rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT;
	r &= rcbits | ~(HPTE_R_R | HPTE_R_C);

	if (be64_to_cpu(hptep[0]) & HPTE_V_VALID) {
		/* HPTE was previously valid, so we need to invalidate it */
		unlock_rmap(rmap);
		hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
		kvmppc_invalidate_hpte(kvm, hptep, index);
		/* don't lose previous R and C bits */
		r |= be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
	} else {
		kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0);
	}

	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
		r = hpte_old_to_new_r(hpte[0], r);
		hpte[0] = hpte_old_to_new_v(hpte[0]);
	}
	hptep[1] = cpu_to_be64(r);
	eieio();
	__unlock_hpte(hptep, hpte[0]);
	asm volatile("ptesync" : : : "memory");
	preempt_enable();
	if (page && hpte_is_writable(r))
		SetPageDirty(page);

 out_put:
	trace_kvm_page_fault_exit(vcpu, hpte, ret);

	if (page) {
		/*
		 * We drop pages[0] here, not page because page might
		 * have been set to the head page of a compound, but
		 * we have to drop the reference on the correct tail
		 * page to match the get inside gup()
		 */
		put_page(pages[0]);
	}
	return ret;

 out_unlock:
	__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
	preempt_enable();
	goto out_put;
}

static void kvmppc_rmap_reset(struct kvm *kvm)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int srcu_idx;

	srcu_idx = srcu_read_lock(&kvm->srcu);
	slots = kvm_memslots(kvm);
	kvm_for_each_memslot(memslot, slots) {
		/*
		 * This assumes it is acceptable to lose reference and
		 * change bits across a reset.
		 */
		memset(memslot->arch.rmap, 0,
		       memslot->npages * sizeof(*memslot->arch.rmap));
	}
	srcu_read_unlock(&kvm->srcu, srcu_idx);
}
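
/*
 * Walk all memslot pages whose host virtual addresses fall within
 * [start, end) and apply the given handler to each page's rmap chain.
 * This is the common helper behind the MMU notifier callbacks below.
 */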
static int kvm_handle_hva_range(struct kvm *kvm,
				unsigned long start,
				unsigned long end,
				int (*handler)(struct kvm *kvm,
					       unsigned long *rmapp,
					       unsigned long gfn))
{
	int ret;
	int retval = 0;
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;

	slots = kvm_memslots(kvm);
	kvm_for_each_memslot(memslot, slots) {
		unsigned long hva_start, hva_end;
		gfn_t gfn, gfn_end;

		hva_start = max(start, memslot->userspace_addr);
		hva_end = min(end, memslot->userspace_addr +
					(memslot->npages << PAGE_SHIFT));
		if (hva_start >= hva_end)
			continue;
		/*
		 * {gfn(page) | page intersects with [hva_start, hva_end)} =
		 * {gfn, gfn+1, ..., gfn_end-1}.
		 */
		gfn = hva_to_gfn_memslot(hva_start, memslot);
		gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);

		for (; gfn < gfn_end; ++gfn) {
			gfn_t gfn_offset = gfn - memslot->base_gfn;

			ret = handler(kvm, &memslot->arch.rmap[gfn_offset], gfn);
			retval |= ret;
		}
	}

	return retval;
}

static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
			  int (*handler)(struct kvm *kvm, unsigned long *rmapp,
					 unsigned long gfn))
{
	return kvm_handle_hva_range(kvm, hva, hva + 1, handler);
}

static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
			   unsigned long gfn)
{
	struct revmap_entry *rev = kvm->arch.revmap;
	unsigned long h, i, j;
	__be64 *hptep;
	unsigned long ptel, psize, rcbits;

	for (;;) {
		lock_rmap(rmapp);
		if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
			unlock_rmap(rmapp);
			break;
		}

		/*
		 * To avoid an ABBA deadlock with the HPTE lock bit,
		 * we can't spin on the HPTE lock while holding the
		 * rmap chain lock.
		 */
		i = *rmapp & KVMPPC_RMAP_INDEX;
		hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4));
		if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
			/* unlock rmap before spinning on the HPTE lock */
			unlock_rmap(rmapp);
			while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK)
				cpu_relax();
			continue;
		}
		j = rev[i].forw;
		if (j == i) {
			/* chain is now empty */
			*rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX);
		} else {
			/* remove i from chain */
			h = rev[i].back;
			rev[h].forw = j;
			rev[j].back = h;
			rev[i].forw = rev[i].back = i;
			*rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j;
		}

		/* Now check and modify the HPTE */
		ptel = rev[i].guest_rpte;
		psize = hpte_page_size(be64_to_cpu(hptep[0]), ptel);
		if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) &&
		    hpte_rpn(ptel, psize) == gfn) {
			hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
			kvmppc_invalidate_hpte(kvm, hptep, i);
			hptep[1] &= ~cpu_to_be64(HPTE_R_KEY_HI | HPTE_R_KEY_LO);
			/* Harvest R and C */
			rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
			*rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
			if (rcbits & HPTE_R_C)
				kvmppc_update_rmap_change(rmapp, psize);
			if (rcbits & ~rev[i].guest_rpte) {
				rev[i].guest_rpte = ptel | rcbits;
				note_hpte_modification(kvm, &rev[i]);
			}
		}
		unlock_rmap(rmapp);
		__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
	}
	return 0;
}

int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva)
{
	kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
	return 0;
}

int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end)
{
	kvm_handle_hva_range(kvm, start, end, kvm_unmap_rmapp);
	return 0;
}

void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
				  struct kvm_memory_slot *memslot)
{
	unsigned long *rmapp;
	unsigned long gfn;
	unsigned long n;

	rmapp = memslot->arch.rmap;
	gfn = memslot->base_gfn;
	for (n = memslot->npages; n; --n) {
		/*
		 * Testing the present bit without locking is OK because
		 * the memslot has been marked invalid already, and hence
		 * no new HPTEs referencing this page can be created,
		 * thus the present bit can't go from 0 to 1.
		 */
		if (*rmapp & KVMPPC_RMAP_PRESENT)
			kvm_unmap_rmapp(kvm, rmapp, gfn);
		++rmapp;
		++gfn;
	}
}

static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
			 unsigned long gfn)
{
	struct revmap_entry *rev = kvm->arch.revmap;
	unsigned long head, i, j;
	__be64 *hptep;
	int ret = 0;

 retry:
	lock_rmap(rmapp);
	if (*rmapp & KVMPPC_RMAP_REFERENCED) {
		*rmapp &= ~KVMPPC_RMAP_REFERENCED;
		ret = 1;
	}
	if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
		unlock_rmap(rmapp);
		return ret;
	}

	i = head = *rmapp & KVMPPC_RMAP_INDEX;
	do {
		hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4));
		j = rev[i].forw;

		/* If this HPTE isn't referenced, ignore it */
		if (!(be64_to_cpu(hptep[1]) & HPTE_R_R))
			continue;

		if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
			/* unlock rmap before spinning on the HPTE lock */
			unlock_rmap(rmapp);
			while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK)
				cpu_relax();
			goto retry;
		}

		/* Now check and modify the HPTE */
		if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) &&
		    (be64_to_cpu(hptep[1]) & HPTE_R_R)) {
			kvmppc_clear_ref_hpte(kvm, hptep, i);
			if (!(rev[i].guest_rpte & HPTE_R_R)) {
				rev[i].guest_rpte |= HPTE_R_R;
				note_hpte_modification(kvm, &rev[i]);
			}
			ret = 1;
		}
		__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
	} while ((i = j) != head);

	unlock_rmap(rmapp);
	return ret;
}

int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, unsigned long end)
{
	return kvm_handle_hva_range(kvm, start, end, kvm_age_rmapp);
}

static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
			      unsigned long gfn)
{
	struct revmap_entry *rev = kvm->arch.revmap;
	unsigned long head, i, j;
	unsigned long *hp;
	int ret = 1;

	if (*rmapp & KVMPPC_RMAP_REFERENCED)
		return 1;

	lock_rmap(rmapp);
	if (*rmapp & KVMPPC_RMAP_REFERENCED)
		goto out;

	if (*rmapp & KVMPPC_RMAP_PRESENT) {
		i = head = *rmapp & KVMPPC_RMAP_INDEX;
		do {
			hp = (unsigned long *)(kvm->arch.hpt_virt + (i << 4));
			j = rev[i].forw;
			if (be64_to_cpu(hp[1]) & HPTE_R_R)
				goto out;
		} while ((i = j) != head);
	}
	ret = 0;

 out:
	unlock_rmap(rmapp);
	return ret;
}

int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva)
{
	return kvm_handle_hva(kvm, hva, kvm_test_age_rmapp);
}

void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte)
{
	kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
}

static int vcpus_running(struct kvm *kvm)
{
	return atomic_read(&kvm->arch.vcpus_running) != 0;
}

/*
 * Returns the number of system pages that are dirty.
 * This can be more than 1 if we find a huge-page HPTE.
 */
static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
{
	struct revmap_entry *rev = kvm->arch.revmap;
	unsigned long head, i, j;
	unsigned long n;
	unsigned long v, r;
	__be64 *hptep;
	int npages_dirty = 0;

 retry:
	lock_rmap(rmapp);
	if (*rmapp & KVMPPC_RMAP_CHANGED) {
		long change_order = (*rmapp & KVMPPC_RMAP_CHG_ORDER)
			>> KVMPPC_RMAP_CHG_SHIFT;
		*rmapp &= ~(KVMPPC_RMAP_CHANGED | KVMPPC_RMAP_CHG_ORDER);
		npages_dirty = 1;
		if (change_order > PAGE_SHIFT)
			npages_dirty = 1ul << (change_order - PAGE_SHIFT);
	}
	if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
		unlock_rmap(rmapp);
		return npages_dirty;
	}

	i = head = *rmapp & KVMPPC_RMAP_INDEX;
	do {
		unsigned long hptep1;
		hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4));
		j = rev[i].forw;

		/*
		 * Checking the C (changed) bit here is racy since there
		 * is no guarantee about when the hardware writes it back.
		 * If the HPTE is not writable then it is stable since the
		 * page can't be written to, and we would have done a tlbie
		 * (which forces the hardware to complete any writeback)
		 * when making the HPTE read-only.
		 * If vcpus are running then this call is racy anyway
		 * since the page could get dirtied subsequently, so we
		 * expect there to be a further call which would pick up
		 * any delayed C bit writeback.
		 * Otherwise we need to do the tlbie even if C==0 in
		 * order to pick up any delayed writeback of C.
		 */
		hptep1 = be64_to_cpu(hptep[1]);
		if (!(hptep1 & HPTE_R_C) &&
		    (!hpte_is_writable(hptep1) || vcpus_running(kvm)))
			continue;

		if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
			/* unlock rmap before spinning on the HPTE lock */
			unlock_rmap(rmapp);
			while (hptep[0] & cpu_to_be64(HPTE_V_HVLOCK))
				cpu_relax();
			goto retry;
		}

		/* Now check and modify the HPTE */
		if (!(hptep[0] & cpu_to_be64(HPTE_V_VALID))) {
			__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
			continue;
		}

		/* need to make it temporarily absent so C is stable */
		hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
		kvmppc_invalidate_hpte(kvm, hptep, i);
		v = be64_to_cpu(hptep[0]);
		r = be64_to_cpu(hptep[1]);
		if (r & HPTE_R_C) {
			hptep[1] = cpu_to_be64(r & ~HPTE_R_C);
			if (!(rev[i].guest_rpte & HPTE_R_C)) {
				rev[i].guest_rpte |= HPTE_R_C;
				note_hpte_modification(kvm, &rev[i]);
			}
			n = hpte_page_size(v, r);
			n = (n + PAGE_SIZE - 1) >> PAGE_SHIFT;
			if (n > npages_dirty)
				npages_dirty = n;
			eieio();
		}
		v &= ~HPTE_V_ABSENT;
		v |= HPTE_V_VALID;
		__unlock_hpte(hptep, v);
	} while ((i = j) != head);

	unlock_rmap(rmapp);
	return npages_dirty;
}

static void harvest_vpa_dirty(struct kvmppc_vpa *vpa,
			      struct kvm_memory_slot *memslot,
			      unsigned long *map)
{
	unsigned long gfn;

	if (!vpa->dirty || !vpa->pinned_addr)
		return;
	gfn = vpa->gpa >> PAGE_SHIFT;
	if (gfn < memslot->base_gfn ||
	    gfn >= memslot->base_gfn + memslot->npages)
		return;

	vpa->dirty = false;
	if (map)
		__set_bit_le(gfn - memslot->base_gfn, map);
}
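
/*
 * Collect the dirty log for a memslot: harvest changed (C) bits from
 * the HPT via the rmap chains, and merge in dirty state from each
 * vcpu's VPA and DTL areas.
 */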
long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot,
			     unsigned long *map)
{
	unsigned long i, j;
	unsigned long *rmapp;
	struct kvm_vcpu *vcpu;

	preempt_disable();
	rmapp = memslot->arch.rmap;
	for (i = 0; i < memslot->npages; ++i) {
		int npages = kvm_test_clear_dirty_npages(kvm, rmapp);
		/*
		 * Note that if npages > 0 then i must be a multiple of npages,
		 * since we always put huge-page HPTEs in the rmap chain
		 * corresponding to their page base address.
		 */
		if (npages && map)
			for (j = i; npages; ++j, --npages)
				__set_bit_le(j, map);
		++rmapp;
	}

	/* Harvest dirty bits from VPA and DTL updates */
	/* Note: we never modify the SLB shadow buffer areas */
	kvm_for_each_vcpu(i, vcpu, kvm) {
		spin_lock(&vcpu->arch.vpa_update_lock);
		harvest_vpa_dirty(&vcpu->arch.vpa, memslot, map);
		harvest_vpa_dirty(&vcpu->arch.dtl, memslot, map);
		spin_unlock(&vcpu->arch.vpa_update_lock);
	}
	preempt_enable();
	return 0;
}
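
/*
 * Pin the guest page at guest physical address gpa and return a kernel
 * mapping of it; *nb_ret is set to the number of bytes usable from the
 * returned pointer up to the end of the page.
 */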
void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
			    unsigned long *nb_ret)
{
	struct kvm_memory_slot *memslot;
	unsigned long gfn = gpa >> PAGE_SHIFT;
	struct page *page, *pages[1];
	int npages;
	unsigned long hva, offset;
	int srcu_idx;

	srcu_idx = srcu_read_lock(&kvm->srcu);
	memslot = gfn_to_memslot(kvm, gfn);
	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
		goto err;
	hva = gfn_to_hva_memslot(memslot, gfn);
	npages = get_user_pages_fast(hva, 1, 1, pages);
	if (npages < 1)
		goto err;
	page = pages[0];
	srcu_read_unlock(&kvm->srcu, srcu_idx);

	offset = gpa & (PAGE_SIZE - 1);
	if (nb_ret)
		*nb_ret = PAGE_SIZE - offset;
	return page_address(page) + offset;

 err:
	srcu_read_unlock(&kvm->srcu, srcu_idx);
	return NULL;
}

void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa,
			     bool dirty)
{
	struct page *page = virt_to_page(va);
	struct kvm_memory_slot *memslot;
	unsigned long gfn;
	unsigned long *rmap;
	int srcu_idx;

	put_page(page);

	if (!dirty)
		return;

	/* We need to mark this page dirty in the rmap chain */
	gfn = gpa >> PAGE_SHIFT;
	srcu_idx = srcu_read_lock(&kvm->srcu);
	memslot = gfn_to_memslot(kvm, gfn);
	if (memslot) {
		rmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
		lock_rmap(rmap);
		*rmap |= KVMPPC_RMAP_CHANGED;
		unlock_rmap(rmap);
	}
	srcu_read_unlock(&kvm->srcu, srcu_idx);
}

/*
 * Functions for reading and writing the hash table via reads and
 * writes on a file descriptor.
 *
 * Reads return the guest view of the hash table, which has to be
 * pieced together from the real hash table and the guest_rpte
 * values in the revmap array.
 *
 * On writes, each HPTE written is considered in turn, and if it
 * is valid, it is written to the HPT as if an H_ENTER with the
 * exact flag set was done.  When the invalid count is non-zero
 * in the header written to the stream, the kernel will make
 * sure that that many HPTEs are invalid, and invalidate them
 * if not.
 */

struct kvm_htab_ctx {
	unsigned long	index;
	unsigned long	flags;
	struct kvm	*kvm;
	int		first_pass;
};

#define HPTE_SIZE	(2 * sizeof(unsigned long))

/*
 * Returns 1 if this HPT entry has been modified or has pending
 * R/C bit changes.
 */
static int hpte_dirty(struct revmap_entry *revp, __be64 *hptp)
{
	unsigned long rcbits_unset;

	if (revp->guest_rpte & HPTE_GR_MODIFIED)
		return 1;

	/* Also need to consider changes in reference and changed bits */
	rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C);
	if ((be64_to_cpu(hptp[0]) & HPTE_V_VALID) &&
	    (be64_to_cpu(hptp[1]) & rcbits_unset))
		return 1;

	return 0;
}
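
/*
 * Capture one HPTE, as the guest should see it, into hpte[0..1] for the
 * read path.  Returns 1 if the entry is of the requested kind (valid or
 * invalid) and should be counted in the current group, 0 if it should
 * be skipped.
 */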
static long record_hpte(unsigned long flags, __be64 *hptp,
			unsigned long *hpte, struct revmap_entry *revp,
			int want_valid, int first_pass)
{
	unsigned long v, r, hr;
	unsigned long rcbits_unset;
	int ok = 1;
	int valid, dirty;

	/* Unmodified entries are uninteresting except on the first pass */
	dirty = hpte_dirty(revp, hptp);
	if (!first_pass && !dirty)
		return 0;

	valid = 0;
	if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) {
		valid = 1;
		if ((flags & KVM_GET_HTAB_BOLTED_ONLY) &&
		    !(be64_to_cpu(hptp[0]) & HPTE_V_BOLTED))
			valid = 0;
	}
	if (valid != want_valid)
		return 0;

	v = r = 0;
	if (valid || dirty) {
		/* lock the HPTE so it's stable and read it */
		preempt_disable();
		while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
			cpu_relax();
		v = be64_to_cpu(hptp[0]);
		hr = be64_to_cpu(hptp[1]);
		if (cpu_has_feature(CPU_FTR_ARCH_300)) {
			v = hpte_new_to_old_v(v, hr);
			hr = hpte_new_to_old_r(hr);
		}

		/* re-evaluate valid and dirty from synchronized HPTE value */
		valid = !!(v & HPTE_V_VALID);
		dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED);

		/* Harvest R and C into guest view if necessary */
		rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C);
		if (valid && (rcbits_unset & hr)) {
			revp->guest_rpte |= (hr &
				(HPTE_R_R | HPTE_R_C)) | HPTE_GR_MODIFIED;
			dirty = 1;
		}

		if (v & HPTE_V_ABSENT) {
			v &= ~HPTE_V_ABSENT;
			v |= HPTE_V_VALID;
			valid = 1;
		}
		if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && !(v & HPTE_V_BOLTED))
			valid = 0;

		r = revp->guest_rpte;
		/* only clear modified if this is the right sort of entry */
		if (valid == want_valid && dirty) {
			r &= ~HPTE_GR_MODIFIED;
			revp->guest_rpte = r;
		}
		unlock_hpte(hptp, be64_to_cpu(hptp[0]));
		preempt_enable();
		if (!(valid == want_valid && (first_pass || dirty)))
			ok = 0;
	}
	hpte[0] = cpu_to_be64(v);
	hpte[1] = cpu_to_be64(r);
	return ok;
}

static ssize_t kvm_htab_read(struct file *file, char __user *buf,
			     size_t count, loff_t *ppos)
{
	struct kvm_htab_ctx *ctx = file->private_data;
	struct kvm *kvm = ctx->kvm;
	struct kvm_get_htab_header hdr;
	__be64 *hptp;
	struct revmap_entry *revp;
	unsigned long i, nb, nw;
	unsigned long __user *lbuf;
	struct kvm_get_htab_header __user *hptr;
	unsigned long flags;
	int first_pass;
	unsigned long hpte[2];

	if (!access_ok(VERIFY_WRITE, buf, count))
		return -EFAULT;

	first_pass = ctx->first_pass;
	flags = ctx->flags;

	i = ctx->index;
	hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
	revp = kvm->arch.revmap + i;
	lbuf = (unsigned long __user *)buf;

	nb = 0;
	while (nb + sizeof(hdr) + HPTE_SIZE < count) {
		/* Initialize header */
		hptr = (struct kvm_get_htab_header __user *)buf;
		hdr.n_valid = 0;
		hdr.n_invalid = 0;
		nw = nb;
		nb += sizeof(hdr);
		lbuf = (unsigned long __user *)(buf + sizeof(hdr));

		/* Skip uninteresting entries, i.e. clean on not-first pass */
		if (!first_pass) {
			while (i < kvm->arch.hpt_npte &&
			       !hpte_dirty(revp, hptp)) {
				++i;
				hptp += 2;
				++revp;
			}
		}
		hdr.index = i;

		/* Grab a series of valid entries */
		while (i < kvm->arch.hpt_npte &&
		       hdr.n_valid < 0xffff &&
		       nb + HPTE_SIZE < count &&
		       record_hpte(flags, hptp, hpte, revp, 1, first_pass)) {
			/* valid entry, write it out */
			++hdr.n_valid;
			if (__put_user(hpte[0], lbuf) ||
			    __put_user(hpte[1], lbuf + 1))
				return -EFAULT;
			nb += HPTE_SIZE;
			lbuf += 2;
			++i;
			hptp += 2;
			++revp;
		}
		/* Now skip invalid entries while we can */
		while (i < kvm->arch.hpt_npte &&
		       hdr.n_invalid < 0xffff &&
		       record_hpte(flags, hptp, hpte, revp, 0, first_pass)) {
			/* found an invalid entry */
			++hdr.n_invalid;
			++i;
			hptp += 2;
			++revp;
		}

		if (hdr.n_valid || hdr.n_invalid) {
			/* write back the header */
			if (__copy_to_user(hptr, &hdr, sizeof(hdr)))
				return -EFAULT;
			nw = nb;
			buf = (char __user *)lbuf;
		} else {
			nb = nw;
		}

		/* Check if we've wrapped around the hash table */
		if (i >= kvm->arch.hpt_npte) {
			i = 0;
			ctx->first_pass = 0;
			break;
		}
	}

	ctx->index = i;

	return nb;
}

static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
			      size_t count, loff_t *ppos)
{
	struct kvm_htab_ctx *ctx = file->private_data;
	struct kvm *kvm = ctx->kvm;
	struct kvm_get_htab_header hdr;
	unsigned long i, j;
	unsigned long v, r;
	unsigned long __user *lbuf;
	__be64 *hptp;
	unsigned long tmp[2];
	ssize_t nb;
	long int err, ret;
	int hpte_setup;

	if (!access_ok(VERIFY_READ, buf, count))
		return -EFAULT;

	/* lock out vcpus from running while we're doing this */
	mutex_lock(&kvm->lock);
	hpte_setup = kvm->arch.hpte_setup_done;
	if (hpte_setup) {
		kvm->arch.hpte_setup_done = 0;	/* temporarily */
		/* order hpte_setup_done vs. vcpus_running */
		smp_mb();
		if (atomic_read(&kvm->arch.vcpus_running)) {
			kvm->arch.hpte_setup_done = 1;
			mutex_unlock(&kvm->lock);
			return -EBUSY;
		}
	}

	err = 0;
	for (nb = 0; nb + sizeof(hdr) <= count; ) {
		err = -EFAULT;
		if (__copy_from_user(&hdr, buf, sizeof(hdr)))
			break;

		err = 0;
		if (nb + hdr.n_valid * HPTE_SIZE > count)
			break;

		nb += sizeof(hdr);
		buf += sizeof(hdr);

		err = -EINVAL;
		i = hdr.index;
		if (i >= kvm->arch.hpt_npte ||
		    i + hdr.n_valid + hdr.n_invalid > kvm->arch.hpt_npte)
			break;

		hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
		lbuf = (unsigned long __user *)buf;
		for (j = 0; j < hdr.n_valid; ++j) {
			__be64 hpte_v;
			__be64 hpte_r;

			err = -EFAULT;
			if (__get_user(hpte_v, lbuf) ||
			    __get_user(hpte_r, lbuf + 1))
				goto out;
			v = be64_to_cpu(hpte_v);
			r = be64_to_cpu(hpte_r);
			err = -EINVAL;
			if (!(v & HPTE_V_VALID))
				goto out;
			lbuf += 2;
			nb += HPTE_SIZE;

			if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))
				kvmppc_do_h_remove(kvm, 0, i, 0, tmp);
			err = -EIO;
			ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, i, v, r,
							 tmp);
			if (ret != H_SUCCESS) {
				pr_err("kvm_htab_write ret %ld i=%ld v=%lx "
				       "r=%lx\n", ret, i, v, r);
				goto out;
			}
			if (!hpte_setup && is_vrma_hpte(v)) {
				unsigned long psize = hpte_base_page_size(v, r);
				unsigned long senc = slb_pgsize_encoding(psize);
				unsigned long lpcr;

				kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |
					(VRMA_VSID << SLB_VSID_SHIFT_1T);
				lpcr = senc << (LPCR_VRMASD_SH - 4);
				kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
				hpte_setup = 1;
			}
			++i;
			hptp += 2;
		}

		for (j = 0; j < hdr.n_invalid; ++j) {
			if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))
				kvmppc_do_h_remove(kvm, 0, i, 0, tmp);
			++i;
			hptp += 2;
		}
		err = 0;
	}

 out:
	/* Order HPTE updates vs. hpte_setup_done */
	smp_wmb();
	kvm->arch.hpte_setup_done = hpte_setup;
	mutex_unlock(&kvm->lock);

	if (err)
		return err;
	return nb;
}

static int kvm_htab_release(struct inode *inode, struct file *filp)
{
	struct kvm_htab_ctx *ctx = filp->private_data;

	filp->private_data = NULL;
	if (!(ctx->flags & KVM_GET_HTAB_WRITE))
		atomic_dec(&ctx->kvm->arch.hpte_mod_interest);
	kvm_put_kvm(ctx->kvm);
	kfree(ctx);
	return 0;
}

static const struct file_operations kvm_htab_fops = {
	.read		= kvm_htab_read,
	.write		= kvm_htab_write,
	.llseek		= default_llseek,
	.release	= kvm_htab_release,
};
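
/*
 * Create an anonymous file descriptor through which userspace can read
 * (save/migrate) or write (restore) the guest hash table using the fops
 * above.
 */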
int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf)
{
	int ret;
	struct kvm_htab_ctx *ctx;
	int rwflag;

	/* reject flags we don't recognize */
	if (ghf->flags & ~(KVM_GET_HTAB_BOLTED_ONLY | KVM_GET_HTAB_WRITE))
		return -EINVAL;
	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;
	kvm_get_kvm(kvm);
	ctx->kvm = kvm;
	ctx->index = ghf->start_index;
	ctx->flags = ghf->flags;
	ctx->first_pass = 1;

	rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? O_WRONLY : O_RDONLY;
	ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag | O_CLOEXEC);
	if (ret < 0) {
		kvm_put_kvm(kvm);
		return ret;
	}

	if (rwflag == O_RDONLY) {
		mutex_lock(&kvm->slots_lock);
		atomic_inc(&kvm->arch.hpte_mod_interest);
		/* make sure kvmppc_do_h_enter etc. see the increment */
		synchronize_srcu_expedited(&kvm->srcu);
		mutex_unlock(&kvm->slots_lock);
	}

	return ret;
}

struct debugfs_htab_state {
	struct kvm	*kvm;
	struct mutex	mutex;
	unsigned long	hpt_index;
	int		chars_left;
	int		buf_index;
	char		buf[64];
};

static int debugfs_htab_open(struct inode *inode, struct file *file)
{
	struct kvm *kvm = inode->i_private;
	struct debugfs_htab_state *p;

	p = kzalloc(sizeof(*p), GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	kvm_get_kvm(kvm);
	p->kvm = kvm;
	mutex_init(&p->mutex);
	file->private_data = p;

	return nonseekable_open(inode, file);
}

static int debugfs_htab_release(struct inode *inode, struct file *file)
{
	struct debugfs_htab_state *p = file->private_data;

	kvm_put_kvm(p->kvm);
	kfree(p);
	return 0;
}
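
/*
 * debugfs read: dump the valid (or absent) HPT entries as text,
 * one "index v r guest_rpte" line per entry.
 */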
static ssize_t debugfs_htab_read(struct file *file, char __user *buf,
				 size_t len, loff_t *ppos)
{
	struct debugfs_htab_state *p = file->private_data;
	ssize_t ret, r;
	unsigned long i, n;
	unsigned long v, hr, gr;
	struct kvm *kvm;
	__be64 *hptp;

	ret = mutex_lock_interruptible(&p->mutex);
	if (ret)
		return ret;

	if (p->chars_left) {
		n = p->chars_left;
		if (n > len)
			n = len;
		r = copy_to_user(buf, p->buf + p->buf_index, n);
		n -= r;
		p->chars_left -= n;
		p->buf_index += n;
		buf += n;
		len -= n;
		ret = n;
		if (r) {
			if (!n)
				ret = -EFAULT;
			goto out;
		}
	}

	kvm = p->kvm;
	i = p->hpt_index;
	hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
	for (; len != 0 && i < kvm->arch.hpt_npte; ++i, hptp += 2) {
		if (!(be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)))
			continue;

		/* lock the HPTE so it's stable and read it */
		preempt_disable();
		while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
			cpu_relax();
		v = be64_to_cpu(hptp[0]) & ~HPTE_V_HVLOCK;
		hr = be64_to_cpu(hptp[1]);
		gr = kvm->arch.revmap[i].guest_rpte;
		unlock_hpte(hptp, v);
		preempt_enable();

		if (!(v & (HPTE_V_VALID | HPTE_V_ABSENT)))
			continue;

		n = scnprintf(p->buf, sizeof(p->buf),
			      "%6lx %.16lx %.16lx %.16lx\n",
			      i, v, hr, gr);
		p->chars_left = n;
		if (n > len)
			n = len;
		r = copy_to_user(buf, p->buf, n);
		n -= r;
		p->chars_left -= n;
		p->buf_index = n;
		buf += n;
		len -= n;
		ret += n;
		if (r) {
			if (!ret)
				ret = -EFAULT;
			goto out;
		}
	}
	p->hpt_index = i;

 out:
	mutex_unlock(&p->mutex);
	return ret;
}

static ssize_t debugfs_htab_write(struct file *file, const char __user *buf,
				  size_t len, loff_t *ppos)
{
	return -EACCES;
}

static const struct file_operations debugfs_htab_fops = {
	.owner	 = THIS_MODULE,
	.open	 = debugfs_htab_open,
	.release = debugfs_htab_release,
	.read	 = debugfs_htab_read,
	.write	 = debugfs_htab_write,
	.llseek	 = generic_file_llseek,
};

void kvmppc_mmu_debugfs_init(struct kvm *kvm)
{
	kvm->arch.htab_dentry = debugfs_create_file("htab", 0400,
						    kvm->arch.debugfs_dir, kvm,
						    &debugfs_htab_fops);
}
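
/* Hook up the HV MMU callbacks for a new vcpu. */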
void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
{
	struct kvmppc_mmu *mmu = &vcpu->arch.mmu;

	vcpu->arch.slb_nr = 32;		/* POWER7/POWER8 */

	mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
	mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr;

	vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
}