/*
 * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
 *
 * Paul Mackerras <paulus@au1.ibm.com>
 * Alexander Graf <agraf@suse.de>
 * Kevin Wolf <mail@kevin-wolf.de>
 *
 * Description: KVM functions specific to running on Book 3S
 * processors in hypervisor mode (specifically POWER7 and later).
 *
 * This file is derived from arch/powerpc/kvm/book3s.c,
 * by Alexander Graf <agraf@suse.de>.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 */
#include <linux/kvm_host.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/preempt.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/anon_inodes.h>
#include <linux/cpumask.h>
#include <linux/spinlock.h>
#include <linux/page-flags.h>

#include <asm/cputable.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/mmu_context.h>
#include <asm/lppaca.h>
#include <asm/processor.h>
#include <asm/cputhreads.h>

#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
/*
 * For now, limit memory to 64GB and require it to be large pages.
 * This value is chosen because it makes the ram_pginfo array be
 * 64kB in size, which is about as large as we want to be trying
 * to allocate with kmalloc.
 */
#define MAX_MEM_ORDER		36

#define LARGE_PAGE_ORDER	24	/* 16MB pages */
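/*
 * Rough arithmetic behind the sizing comment above (added note, not from
 * the original source): 1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER) is
 * 1 << 12 = 4096 ram_pginfo entries; assuming each struct kvmppc_pginfo
 * is roughly 16 bytes, the array comes to about 64kB, which is still a
 * reasonable size to ask of kmalloc().
 */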
/* #define EXIT_DEBUG */
/* #define EXIT_DEBUG_SIMPLE */
/* #define EXIT_DEBUG_INT */
static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	local_paca->kvm_hstate.kvm_vcpu = vcpu;
	local_paca->kvm_hstate.kvm_vcore = vcpu->arch.vcore;
}
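/*
 * Note (added, not in the original source): the kvm_hstate fields in the
 * per-cpu paca written above are presumably what the low-level hypervisor
 * entry/exit code uses to find the current vcpu and vcore once the CPU is
 * running in real mode, which is why they are refreshed on every vcpu load.
 */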
void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
{
}
void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
{
	vcpu->arch.shregs.msr = msr;
	kvmppc_end_cede(vcpu);
}
void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
{
	vcpu->arch.pvr = pvr;
}
void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
{
	int r;

	pr_err("vcpu %p (%d):\n", vcpu, vcpu->vcpu_id);
	pr_err("pc = %.16lx msr = %.16llx trap = %x\n",
	       vcpu->arch.pc, vcpu->arch.shregs.msr, vcpu->arch.trap);
	for (r = 0; r < 16; ++r)
		pr_err("r%2d = %.16lx r%d = %.16lx\n",
		       r, kvmppc_get_gpr(vcpu, r),
		       r+16, kvmppc_get_gpr(vcpu, r+16));
	pr_err("ctr = %.16lx lr = %.16lx\n",
	       vcpu->arch.ctr, vcpu->arch.lr);
	pr_err("srr0 = %.16llx srr1 = %.16llx\n",
	       vcpu->arch.shregs.srr0, vcpu->arch.shregs.srr1);
	pr_err("sprg0 = %.16llx sprg1 = %.16llx\n",
	       vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1);
	pr_err("sprg2 = %.16llx sprg3 = %.16llx\n",
	       vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3);
	pr_err("cr = %.8x xer = %.16lx dsisr = %.8x\n",
	       vcpu->arch.cr, vcpu->arch.xer, vcpu->arch.shregs.dsisr);
	pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar);
	pr_err("fault dar = %.16lx dsisr = %.8x\n",
	       vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
	pr_err("SLB (%d entries):\n", vcpu->arch.slb_max);
	for (r = 0; r < vcpu->arch.slb_max; ++r)
		pr_err("  ESID = %.16llx VSID = %.16llx\n",
		       vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv);
	pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n",
	       vcpu->kvm->arch.lpcr, vcpu->kvm->arch.sdr1,
	       vcpu->arch.last_inst);
}
struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id)
{
	int r;
	struct kvm_vcpu *v, *ret = NULL;

	mutex_lock(&kvm->lock);
	kvm_for_each_vcpu(r, v, kvm) {
		if (v->vcpu_id == id) {
			ret = v;
			break;
		}
	}
	mutex_unlock(&kvm->lock);
	return ret;
}
static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa)
{
	vpa->shared_proc = 1;
	vpa->yield_count = 1;
}
static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
				       unsigned long flags,
				       unsigned long vcpuid, unsigned long vpa)
{
	struct kvm *kvm = vcpu->kvm;
	unsigned long pg_index, ra, len;
	unsigned long pg_offset;
	void *va;
	struct kvm_vcpu *tvcpu;

	tvcpu = kvmppc_find_vcpu(kvm, vcpuid);
	if (!tvcpu)
		return H_PARAMETER;

	if (flags == 0 || flags == 4)
		return H_PARAMETER;
	if (flags < 4) {
		/* registering new area; convert logical addr to real */
		pg_index = vpa >> kvm->arch.ram_porder;
		pg_offset = vpa & (kvm->arch.ram_psize - 1);
		if (pg_index >= kvm->arch.ram_npages)
			return H_PARAMETER;
		if (kvm->arch.ram_pginfo[pg_index].pfn == 0)
			return H_PARAMETER;
		ra = kvm->arch.ram_pginfo[pg_index].pfn << PAGE_SHIFT;
		ra |= pg_offset;
		va = __va(ra);
		if (flags <= 1)
			len = *(unsigned short *)(va + 4);
		else
			len = *(unsigned int *)(va + 4);
		if (pg_offset + len > kvm->arch.ram_psize)
			return H_PARAMETER;
		switch (flags) {
		case 1:		/* register VPA */
			tvcpu->arch.vpa = va;
			init_vpa(vcpu, va);
			break;
		case 2:		/* register DTL */
			if (!tvcpu->arch.vpa)
				return H_RESOURCE;
			tvcpu->arch.dtl = va;
			tvcpu->arch.dtl_end = va + len;
			break;
		case 3:		/* register SLB shadow buffer */
			if (!tvcpu->arch.vpa)
				return H_RESOURCE;
			tvcpu->arch.slb_shadow = va;
			len = (len - 16) / 16;
			tvcpu->arch.slb_shadow = va;
			break;
		}
	} else {
		switch (flags) {
		case 5:		/* unregister VPA */
			if (tvcpu->arch.slb_shadow || tvcpu->arch.dtl)
				return H_RESOURCE;
			tvcpu->arch.vpa = NULL;
			break;
		case 6:		/* unregister DTL */
			tvcpu->arch.dtl = NULL;
			break;
		case 7:		/* unregister SLB shadow buffer */
			tvcpu->arch.slb_shadow = NULL;
			break;
		}
	}
	return H_SUCCESS;
}
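/*
 * Summary of the subfunction layout handled above (an interpretation added
 * here, not text from the original source): the flags argument selects what
 * to do with the address passed in 'vpa' -- values 1-3 register the VPA,
 * DTL and SLB shadow buffer respectively, after the logical-to-real
 * conversion at the top of the register path, while 5-7 unregister them,
 * with the VPA only allowed to go away once the DTL and SLB shadow buffer
 * have been unregistered.
 */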
int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
{
	unsigned long req = kvmppc_get_gpr(vcpu, 3);
	unsigned long target, ret = H_SUCCESS;
	struct kvm_vcpu *tvcpu;

	switch (req) {
	case H_CEDE:
		break;
	case H_PROD:
		target = kvmppc_get_gpr(vcpu, 4);
		tvcpu = kvmppc_find_vcpu(vcpu->kvm, target);
		if (!tvcpu) {
			ret = H_PARAMETER;
			break;
		}
		tvcpu->arch.prodded = 1;
		if (vcpu->arch.ceded) {
			if (waitqueue_active(&vcpu->wq)) {
				wake_up_interruptible(&vcpu->wq);
				vcpu->stat.halt_wakeup++;
			}
		}
		break;
	case H_CONFER:
		break;
	case H_REGISTER_VPA:
		ret = do_h_register_vpa(vcpu, kvmppc_get_gpr(vcpu, 4),
					kvmppc_get_gpr(vcpu, 5),
					kvmppc_get_gpr(vcpu, 6));
		break;
	default:
		return RESUME_HOST;
	}
	kvmppc_set_gpr(vcpu, 3, ret);
	vcpu->arch.hcall_needed = 0;
	return RESUME_GUEST;
}
static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
			      struct task_struct *tsk)
{
	int r = RESUME_HOST;

	vcpu->stat.sum_exits++;

	run->exit_reason = KVM_EXIT_UNKNOWN;
	run->ready_for_interrupt_injection = 1;
	switch (vcpu->arch.trap) {
	/* We're good on these - the host merely wanted to get our attention */
	case BOOK3S_INTERRUPT_HV_DECREMENTER:
		vcpu->stat.dec_exits++;
		r = RESUME_GUEST;
		break;
	case BOOK3S_INTERRUPT_EXTERNAL:
		vcpu->stat.ext_intr_exits++;
		r = RESUME_GUEST;
		break;
	case BOOK3S_INTERRUPT_PERFMON:
		r = RESUME_GUEST;
		break;
	case BOOK3S_INTERRUPT_PROGRAM:
	{
		ulong flags;
		/*
		 * Normally program interrupts are delivered directly
		 * to the guest by the hardware, but we can get here
		 * as a result of a hypervisor emulation interrupt
		 * (e40) getting turned into a 700 by BML RTAS.
		 */
		flags = vcpu->arch.shregs.msr & 0x1f0000ull;
		kvmppc_core_queue_program(vcpu, flags);
		r = RESUME_GUEST;
		break;
	}
	case BOOK3S_INTERRUPT_SYSCALL:
	{
		/* hcall - punt to userspace */
		int i;

		if (vcpu->arch.shregs.msr & MSR_PR) {
			/* sc 1 from userspace - reflect to guest syscall */
			kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_SYSCALL);
			r = RESUME_GUEST;
			break;
		}
		run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3);
		for (i = 0; i < 9; ++i)
			run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i);
		run->exit_reason = KVM_EXIT_PAPR_HCALL;
		vcpu->arch.hcall_needed = 1;
		r = RESUME_HOST;
		break;
	}
	/*
	 * We get these next two if the guest does a bad real-mode access,
	 * as we have enabled VRMA (virtualized real mode area) mode in the
	 * LPCR. We just generate an appropriate DSI/ISI to the guest.
	 */
	case BOOK3S_INTERRUPT_H_DATA_STORAGE:
		vcpu->arch.shregs.dsisr = vcpu->arch.fault_dsisr;
		vcpu->arch.shregs.dar = vcpu->arch.fault_dar;
		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, 0);
		r = RESUME_GUEST;
		break;
	case BOOK3S_INTERRUPT_H_INST_STORAGE:
		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_INST_STORAGE,
					0x08000000);
		r = RESUME_GUEST;
		break;
	/*
	 * This occurs if the guest executes an illegal instruction.
	 * We just generate a program interrupt to the guest, since
	 * we don't emulate any guest instructions at this stage.
	 */
	case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
		kvmppc_core_queue_program(vcpu, 0x80000);
		r = RESUME_GUEST;
		break;
	default:
		kvmppc_dump_regs(vcpu);
		printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
			vcpu->arch.trap, kvmppc_get_pc(vcpu),
			vcpu->arch.shregs.msr);
		r = RESUME_HOST;
		break;
	}

	return r;
}
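/*
 * Sketch of the userspace side of the hcall exit above (an assumption added
 * here, not part of this file): on KVM_EXIT_PAPR_HCALL, kvm_run->papr_hcall.nr
 * holds the hcall number copied from GPR3 and papr_hcall.args[] the arguments
 * copied from GPR4-GPR12; userspace handles the call and resumes the vcpu, at
 * which point the hcall result is placed back in the guest's GPR3 before the
 * guest is re-entered.
 */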
int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
				  struct kvm_sregs *sregs)
{
	int i;

	memset(sregs, 0, sizeof(struct kvm_sregs));
	sregs->pvr = vcpu->arch.pvr;
	for (i = 0; i < vcpu->arch.slb_max; i++) {
		sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige;
		sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;
	}

	return 0;
}
int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
				  struct kvm_sregs *sregs)
{
	int i, j;

	kvmppc_set_pvr(vcpu, sregs->pvr);

	j = 0;
	for (i = 0; i < vcpu->arch.slb_nr; i++) {
		if (sregs->u.s.ppc64.slb[i].slbe & SLB_ESID_V) {
			vcpu->arch.slb[j].orige = sregs->u.s.ppc64.slb[i].slbe;
			vcpu->arch.slb[j].origv = sregs->u.s.ppc64.slb[i].slbv;
			++j;
		}
	}
	vcpu->arch.slb_max = j;

	return 0;
}
int kvmppc_core_check_processor_compat(void)
{
	if (cpu_has_feature(CPU_FTR_HVMODE))
		return 0;
	return -EIO;
}
struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
{
	struct kvm_vcpu *vcpu;
	int err = -EINVAL;
	int core;
	struct kvmppc_vcore *vcore;

	core = id / threads_per_core;
	if (core >= KVM_MAX_VCORES)
		goto out;

	err = -ENOMEM;
	vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
	if (!vcpu)
		goto out;

	err = kvm_vcpu_init(vcpu, kvm, id);
	if (err)
		goto free_vcpu;

	vcpu->arch.shared = &vcpu->arch.shregs;
	vcpu->arch.last_cpu = -1;
	vcpu->arch.mmcr[0] = MMCR0_FC;
	vcpu->arch.ctrl = CTRL_RUNLATCH;
	/* default to host PVR, since we can't spoof it */
	vcpu->arch.pvr = mfspr(SPRN_PVR);
	kvmppc_set_pvr(vcpu, vcpu->arch.pvr);

	kvmppc_mmu_book3s_hv_init(vcpu);

	/*
	 * We consider the vcpu stopped until we see the first run ioctl for it.
	 */
	vcpu->arch.state = KVMPPC_VCPU_STOPPED;

	init_waitqueue_head(&vcpu->arch.cpu_run);

	mutex_lock(&kvm->lock);
	vcore = kvm->arch.vcores[core];
	if (!vcore) {
		vcore = kzalloc(sizeof(struct kvmppc_vcore), GFP_KERNEL);
		if (vcore) {
			INIT_LIST_HEAD(&vcore->runnable_threads);
			spin_lock_init(&vcore->lock);
			init_waitqueue_head(&vcore->wq);
		}
		kvm->arch.vcores[core] = vcore;
	}
	mutex_unlock(&kvm->lock);

	if (!vcore)
		goto free_vcpu;

	spin_lock(&vcore->lock);
	++vcore->num_threads;
	spin_unlock(&vcore->lock);
	vcpu->arch.vcore = vcore;

	vcpu->arch.cpu_type = KVM_CPU_3S_64;
	kvmppc_sanity_check(vcpu);

	return vcpu;

free_vcpu:
	kfree(vcpu);
out:
	return ERR_PTR(err);
}
void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
{
	kvm_vcpu_uninit(vcpu);
	kfree(vcpu);
}
static void kvmppc_set_timer(struct kvm_vcpu *vcpu)
{
	unsigned long dec_nsec, now;

	now = get_tb();
	if (now > vcpu->arch.dec_expires) {
		/* decrementer has already gone negative */
		kvmppc_core_queue_dec(vcpu);
		kvmppc_core_deliver_interrupts(vcpu);
		return;
	}
	dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC
		/ tb_ticks_per_sec;
	hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec),
		      HRTIMER_MODE_REL);
	vcpu->arch.timer_running = 1;
}
static void kvmppc_end_cede(struct kvm_vcpu *vcpu)
{
	vcpu->arch.ceded = 0;
	if (vcpu->arch.timer_running) {
		hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
		vcpu->arch.timer_running = 0;
	}
}
extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
extern void xics_wake_cpu(int cpu);
static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
				   struct kvm_vcpu *vcpu)
{
	struct kvm_vcpu *v;

	if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
		return;
	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
	--vc->n_runnable;
	++vc->n_busy;
	/* decrement the physical thread id of each following vcpu */
	v = vcpu;
	list_for_each_entry_continue(v, &vc->runnable_threads, arch.run_list)
		--v->arch.ptid;
	list_del(&vcpu->arch.run_list);
}
static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
{
	int cpu;
	struct paca_struct *tpaca;
	struct kvmppc_vcore *vc = vcpu->arch.vcore;

	if (vcpu->arch.timer_running) {
		hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
		vcpu->arch.timer_running = 0;
	}
	cpu = vc->pcpu + vcpu->arch.ptid;
	tpaca = &paca[cpu];
	tpaca->kvm_hstate.kvm_vcpu = vcpu;
	tpaca->kvm_hstate.kvm_vcore = vc;
	tpaca->kvm_hstate.napping = 0;
	vcpu->cpu = vc->pcpu;
#ifdef CONFIG_PPC_ICP_NATIVE
	if (vcpu->arch.ptid) {
		tpaca->cpu_start = 0x80;
		xics_wake_cpu(cpu);
		++vc->n_woken;
	}
#endif
}
static void kvmppc_wait_for_nap(struct kvmppc_vcore *vc)
{
	int i;

	i = 0;
	while (vc->nap_count < vc->n_woken) {
		if (++i >= 1000000) {
			pr_err("kvmppc_wait_for_nap timeout %d %d\n",
			       vc->nap_count, vc->n_woken);
			break;
		}
		cpu_relax();
	}
}
/*
 * Check that we are on thread 0 and that any other threads in
 * this core are off-line.
 */
static int on_primary_thread(void)
{
	int cpu = smp_processor_id();
	int thr = cpu_thread_in_core(cpu);

	if (thr)
		return 0;
	while (++thr < threads_per_core)
		if (cpu_online(cpu + thr))
			return 0;
	return 1;
}
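/*
 * Rationale (added comment, not from the original source): the secondary
 * hardware threads of the core are used to run the other vcpus of the
 * virtual core in hypervisor mode (see kvmppc_start_thread() poking
 * tpaca->cpu_start above), so they must be offline from the host's point
 * of view before the core can enter a guest.
 */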
/*
 * Run a set of guest threads on a physical core.
 * Called with vc->lock held.
 */
static int kvmppc_run_core(struct kvmppc_vcore *vc)
{
	struct kvm_vcpu *vcpu, *vcpu0, *vnext;
	long ret;
	u64 now;
	int ptid;

	/* don't start if any threads have a signal pending */
	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
		if (signal_pending(vcpu->arch.run_task))
			return 0;

	/*
	 * Make sure we are running on thread 0, and that
	 * secondary threads are offline.
	 * XXX we should also block attempts to bring any
	 * secondary threads online.
	 */
	if (threads_per_core > 1 && !on_primary_thread()) {
		list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
			vcpu->arch.ret = -EBUSY;
		goto out;
	}

	/*
	 * Assign physical thread IDs, first to non-ceded vcpus
	 * and then to ceded ones.
	 */
	ptid = 0;
	vcpu0 = NULL;
	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
		if (!vcpu->arch.ceded) {
			if (!ptid)
				vcpu0 = vcpu;
			vcpu->arch.ptid = ptid++;
		}
	}
	if (!vcpu0)
		return 0;		/* nothing to run */
	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
		if (vcpu->arch.ceded)
			vcpu->arch.ptid = ptid++;

	vc->n_woken = 0;
	vc->nap_count = 0;
	vc->entry_exit_count = 0;
	vc->vcore_state = VCORE_RUNNING;
	vc->pcpu = smp_processor_id();
	vc->napping_threads = 0;
	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
		kvmppc_start_thread(vcpu);

	spin_unlock(&vc->lock);

	__kvmppc_vcore_entry(NULL, vcpu0);

	spin_lock(&vc->lock);
	/* disable sending of IPIs on virtual external irqs */
	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
		vcpu->cpu = -1;
	/* wait for secondary threads to finish writing their state to memory */
	if (vc->nap_count < vc->n_woken)
		kvmppc_wait_for_nap(vc);
	/* prevent other vcpu threads from doing kvmppc_start_thread() now */
	vc->vcore_state = VCORE_EXITING;
	spin_unlock(&vc->lock);

	/* make sure updates to secondary vcpu structs are visible now */
	smp_mb();

	now = get_tb();
	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
		/* cancel pending dec exception if dec is positive */
		if (now < vcpu->arch.dec_expires &&
		    kvmppc_core_pending_dec(vcpu))
			kvmppc_core_dequeue_dec(vcpu);

		ret = kvmppc_handle_exit(vcpu->arch.kvm_run, vcpu,
					 vcpu->arch.run_task);

		vcpu->arch.ret = ret;

		if (vcpu->arch.ceded) {
			if (ret != RESUME_GUEST)
				kvmppc_end_cede(vcpu);
			else
				kvmppc_set_timer(vcpu);
		}
	}

	spin_lock(&vc->lock);
 out:
	vc->vcore_state = VCORE_INACTIVE;
	list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
				 arch.run_list) {
		if (vcpu->arch.ret != RESUME_GUEST) {
			kvmppc_remove_runnable(vc, vcpu);
			wake_up(&vcpu->arch.cpu_run);
		}
	}

	return 1;
}
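/*
 * Rough sequence of kvmppc_run_core() above (a reading of the code added
 * here, not a comment from the original source): assign physical thread
 * ids, point each hardware thread's paca at its vcpu via
 * kvmppc_start_thread(), enter the guest through __kvmppc_vcore_entry()
 * on thread 0, wait for the secondary threads to nap and write their
 * state back, then run kvmppc_handle_exit() for each vcpu and wake any
 * vcpu task whose return value is no longer RESUME_GUEST.
 */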
/*
 * Wait for some other vcpu thread to execute us, and
 * wake us up when we need to handle something in the host.
 */
static void kvmppc_wait_for_exec(struct kvm_vcpu *vcpu, int wait_state)
{
	DEFINE_WAIT(wait);

	prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state);
	if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE)
		schedule();
	finish_wait(&vcpu->arch.cpu_run, &wait);
}
/*
 * All the vcpus in this vcore are idle, so wait for a decrementer
 * or external interrupt to one of the vcpus.  vc->lock is held.
 */
static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
{
	DEFINE_WAIT(wait);
	struct kvm_vcpu *v;
	int all_idle = 1;

	prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
	vc->vcore_state = VCORE_SLEEPING;
	spin_unlock(&vc->lock);
	list_for_each_entry(v, &vc->runnable_threads, arch.run_list) {
		if (!v->arch.ceded || v->arch.pending_exceptions) {
			all_idle = 0;
			break;
		}
	}
	if (all_idle)
		schedule();
	finish_wait(&vc->wq, &wait);
	spin_lock(&vc->lock);
	vc->vcore_state = VCORE_INACTIVE;
}
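/*
 * Note (added, not from the original source): a vcore only sleeps here
 * when every runnable vcpu has ceded (H_CEDE) and none has a pending
 * exception; presumably an interrupt, an H_PROD from another vcpu, or
 * kvmppc_set_msr()/kvmppc_end_cede() clears the ceded state, and the
 * wake-up on the vcore waitqueue returns the core to VCORE_INACTIVE so
 * that kvmppc_run_vcpu() can start it again.
 */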
static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
{
	int n_ceded;
	int prev_state;
	struct kvmppc_vcore *vc;
	struct kvm_vcpu *v, *vn;

	kvm_run->exit_reason = 0;
	vcpu->arch.ret = RESUME_GUEST;

	/*
	 * Synchronize with other threads in this virtual core
	 */
	vc = vcpu->arch.vcore;
	spin_lock(&vc->lock);
	vcpu->arch.ceded = 0;
	vcpu->arch.run_task = current;
	vcpu->arch.kvm_run = kvm_run;
	prev_state = vcpu->arch.state;
	vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
	++vc->n_runnable;
	list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads);

	/*
	 * This happens the first time this is called for a vcpu.
	 * If the vcore is already running, we may be able to start
	 * this thread straight away and have it join in.
	 */
	if (prev_state == KVMPPC_VCPU_STOPPED) {
		if (vc->vcore_state == VCORE_RUNNING &&
		    VCORE_EXIT_COUNT(vc) == 0) {
			vcpu->arch.ptid = vc->n_runnable - 1;
			kvmppc_start_thread(vcpu);
		}
	} else if (prev_state == KVMPPC_VCPU_BUSY_IN_HOST)
		--vc->n_busy;

	while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
	       !signal_pending(current)) {
		if (vc->n_busy || vc->vcore_state != VCORE_INACTIVE) {
			spin_unlock(&vc->lock);
			kvmppc_wait_for_exec(vcpu, TASK_INTERRUPTIBLE);
			spin_lock(&vc->lock);
			continue;
		}
		n_ceded = 0;
		list_for_each_entry(v, &vc->runnable_threads, arch.run_list)
			n_ceded += v->arch.ceded;
		if (n_ceded == vc->n_runnable)
			kvmppc_vcore_blocked(vc);
		else
			kvmppc_run_core(vc);

		list_for_each_entry_safe(v, vn, &vc->runnable_threads,
					 arch.run_list) {
			kvmppc_core_deliver_interrupts(v);
			if (signal_pending(v->arch.run_task)) {
				kvmppc_remove_runnable(vc, v);
				v->stat.signal_exits++;
				v->arch.kvm_run->exit_reason = KVM_EXIT_INTR;
				v->arch.ret = -EINTR;
				wake_up(&v->arch.cpu_run);
			}
		}
	}

	if (signal_pending(current)) {
		if (vc->vcore_state == VCORE_RUNNING ||
		    vc->vcore_state == VCORE_EXITING) {
			spin_unlock(&vc->lock);
			kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE);
			spin_lock(&vc->lock);
		}
		if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
			kvmppc_remove_runnable(vc, vcpu);
			vcpu->stat.signal_exits++;
			kvm_run->exit_reason = KVM_EXIT_INTR;
			vcpu->arch.ret = -EINTR;
		}
	}

	spin_unlock(&vc->lock);
	return vcpu->arch.ret;
}
int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
{
	int r;

	if (!vcpu->arch.sane) {
		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
		return -EINVAL;
	}

	/* No need to go into the guest when all we'll do is come back out */
	if (signal_pending(current)) {
		run->exit_reason = KVM_EXIT_INTR;
		return -EINTR;
	}

	/* On PPC970, check that we have an RMA region */
	if (!vcpu->kvm->arch.rma && cpu_has_feature(CPU_FTR_ARCH_201))
		return -EPERM;

	flush_fp_to_thread(current);
	flush_altivec_to_thread(current);
	flush_vsx_to_thread(current);
	vcpu->arch.wqp = &vcpu->arch.vcore->wq;

	do {
		r = kvmppc_run_vcpu(run, vcpu);

		if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
		    !(vcpu->arch.shregs.msr & MSR_PR)) {
			r = kvmppc_pseries_do_hcall(vcpu);
			kvmppc_core_deliver_interrupts(vcpu);
		}
	} while (r == RESUME_GUEST);

	return r;
}
static long kvmppc_stt_npages(unsigned long window_size)
{
	return ALIGN((window_size >> SPAPR_TCE_SHIFT)
		     * sizeof(u64), PAGE_SIZE) / PAGE_SIZE;
}
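/*
 * Worked example for the size calculation above (added note, assuming
 * SPAPR_TCE_SHIFT is 12, i.e. each 8-byte TCE maps a 4kB page): a 256MB
 * DMA window has 256MB >> 12 = 64k TCEs, i.e. 512kB of table, which
 * kvmppc_stt_npages() rounds up to a whole number of host pages.
 */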
static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt)
{
	struct kvm *kvm = stt->kvm;
	int i;

	mutex_lock(&kvm->lock);
	list_del(&stt->list);
	for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++)
		__free_page(stt->pages[i]);
	kfree(stt);
	mutex_unlock(&kvm->lock);

	kvm_put_kvm(kvm);
}
static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data;
	struct page *page;

	if (vmf->pgoff >= kvmppc_stt_npages(stt->window_size))
		return VM_FAULT_SIGBUS;

	page = stt->pages[vmf->pgoff];
	get_page(page);
	vmf->page = page;
	return 0;
}

static const struct vm_operations_struct kvm_spapr_tce_vm_ops = {
	.fault = kvm_spapr_tce_fault,
};
static int kvm_spapr_tce_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &kvm_spapr_tce_vm_ops;
	return 0;
}

static int kvm_spapr_tce_release(struct inode *inode, struct file *filp)
{
	struct kvmppc_spapr_tce_table *stt = filp->private_data;

	release_spapr_tce_table(stt);
	return 0;
}

static struct file_operations kvm_spapr_tce_fops = {
	.mmap		= kvm_spapr_tce_mmap,
	.release	= kvm_spapr_tce_release,
};
long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
				   struct kvm_create_spapr_tce *args)
{
	struct kvmppc_spapr_tce_table *stt = NULL;
	long npages;
	int ret = -ENOMEM;
	int i;

	/* Check this LIOBN hasn't been previously allocated */
	list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
		if (stt->liobn == args->liobn)
			return -EBUSY;
	}

	npages = kvmppc_stt_npages(args->window_size);

	stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *),
		      GFP_KERNEL);
	if (!stt)
		goto fail;

	stt->liobn = args->liobn;
	stt->window_size = args->window_size;
	stt->kvm = kvm;

	for (i = 0; i < npages; i++) {
		stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
		if (!stt->pages[i])
			goto fail;
	}

	kvm_get_kvm(kvm);

	mutex_lock(&kvm->lock);
	list_add(&stt->list, &kvm->arch.spapr_tce_tables);

	mutex_unlock(&kvm->lock);

	return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
				stt, O_RDWR);

fail:
	if (stt) {
		for (i = 0; i < npages; i++)
			if (stt->pages[i])
				__free_page(stt->pages[i]);

		kfree(stt);
	}
	return ret;
}
/*
 * Work out RMLS (real mode limit selector) field value for a given
 * RMA size.  Assumes POWER7 or PPC970.
 */
static inline int lpcr_rmls(unsigned long rma_size)
{
	switch (rma_size) {
	case 32ul << 20:	/* 32 MB */
		if (cpu_has_feature(CPU_FTR_ARCH_206))
			return 8;	/* only supported on POWER7 */
		return -1;
	case 64ul << 20:	/* 64 MB */
		return 3;
	case 128ul << 20:	/* 128 MB */
		return 7;
	case 256ul << 20:	/* 256 MB */
		return 4;
	case 1ul << 30:		/* 1 GB */
		return 2;
	case 16ul << 30:	/* 16 GB */
		return 1;
	case 256ul << 30:	/* 256 GB */
		return 0;
	default:
		return -1;
	}
}
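/*
 * Note (added, not from the original source): the values returned above
 * follow the RMLS encoding of the real-mode limit in the LPCR (and the
 * split RMLS field in HID4 on PPC970), which is why only this fixed set
 * of RMA sizes is accepted and anything else makes lpcr_rmls() fail.
 */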
static int kvm_rma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct kvmppc_rma_info *ri = vma->vm_file->private_data;
	struct page *page;

	if (vmf->pgoff >= ri->npages)
		return VM_FAULT_SIGBUS;

	page = pfn_to_page(ri->base_pfn + vmf->pgoff);
	get_page(page);
	vmf->page = page;
	return 0;
}

static const struct vm_operations_struct kvm_rma_vm_ops = {
	.fault = kvm_rma_fault,
};

static int kvm_rma_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_flags |= VM_RESERVED;
	vma->vm_ops = &kvm_rma_vm_ops;
	return 0;
}

static int kvm_rma_release(struct inode *inode, struct file *filp)
{
	struct kvmppc_rma_info *ri = filp->private_data;

	kvm_release_rma(ri);
	return 0;
}

static struct file_operations kvm_rma_fops = {
	.mmap		= kvm_rma_mmap,
	.release	= kvm_rma_release,
};

long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret)
{
	struct kvmppc_rma_info *ri;
	long fd;

	ri = kvm_alloc_rma();
	if (!ri)
		return -ENOMEM;

	fd = anon_inode_getfd("kvm-rma", &kvm_rma_fops, ri, O_RDWR);
	if (fd < 0)
		kvm_release_rma(ri);

	ret->rma_size = ri->npages << PAGE_SHIFT;
	return fd;
}

static struct page *hva_to_page(unsigned long addr)
{
	struct page *page[1];
	int npages;

	npages = get_user_pages_fast(addr, 1, 1, page);

	if (unlikely(npages != 1))
		return NULL;

	return page[0];
}
*kvm
,
1072 struct kvm_userspace_memory_region
*mem
)
1074 unsigned long psize
, porder
;
1075 unsigned long i
, npages
, totalpages
;
1076 unsigned long pg_ix
;
1077 struct kvmppc_pginfo
*pginfo
;
1079 struct kvmppc_rma_info
*ri
= NULL
;
1082 /* For now, only allow 16MB pages */
1083 porder
= LARGE_PAGE_ORDER
;
1084 psize
= 1ul << porder
;
1085 if ((mem
->memory_size
& (psize
- 1)) ||
1086 (mem
->guest_phys_addr
& (psize
- 1))) {
1087 pr_err("bad memory_size=%llx @ %llx\n",
1088 mem
->memory_size
, mem
->guest_phys_addr
);
1092 npages
= mem
->memory_size
>> porder
;
1093 totalpages
= (mem
->guest_phys_addr
+ mem
->memory_size
) >> porder
;
1095 /* More memory than we have space to track? */
1096 if (totalpages
> (1ul << (MAX_MEM_ORDER
- LARGE_PAGE_ORDER
)))
1099 /* Do we already have an RMA registered? */
1100 if (mem
->guest_phys_addr
== 0 && kvm
->arch
.rma
)
1103 if (totalpages
> kvm
->arch
.ram_npages
)
1104 kvm
->arch
.ram_npages
= totalpages
;
1106 /* Is this one of our preallocated RMAs? */
1107 if (mem
->guest_phys_addr
== 0) {
1108 struct vm_area_struct
*vma
;
1110 down_read(¤t
->mm
->mmap_sem
);
1111 vma
= find_vma(current
->mm
, mem
->userspace_addr
);
1112 if (vma
&& vma
->vm_file
&&
1113 vma
->vm_file
->f_op
== &kvm_rma_fops
&&
1114 mem
->userspace_addr
== vma
->vm_start
)
1115 ri
= vma
->vm_file
->private_data
;
1116 up_read(¤t
->mm
->mmap_sem
);
1117 if (!ri
&& cpu_has_feature(CPU_FTR_ARCH_201
)) {
1118 pr_err("CPU requires an RMO\n");
1124 unsigned long rma_size
;
1128 rma_size
= ri
->npages
<< PAGE_SHIFT
;
1129 if (rma_size
> mem
->memory_size
)
1130 rma_size
= mem
->memory_size
;
1131 rmls
= lpcr_rmls(rma_size
);
1133 pr_err("Can't use RMA of 0x%lx bytes\n", rma_size
);
1136 atomic_inc(&ri
->use_count
);
1138 kvm
->arch
.n_rma_pages
= rma_size
>> porder
;
1140 /* Update LPCR and RMOR */
1141 lpcr
= kvm
->arch
.lpcr
;
1142 if (cpu_has_feature(CPU_FTR_ARCH_201
)) {
1143 /* PPC970; insert RMLS value (split field) in HID4 */
1144 lpcr
&= ~((1ul << HID4_RMLS0_SH
) |
1145 (3ul << HID4_RMLS2_SH
));
1146 lpcr
|= ((rmls
>> 2) << HID4_RMLS0_SH
) |
1147 ((rmls
& 3) << HID4_RMLS2_SH
);
1148 /* RMOR is also in HID4 */
1149 lpcr
|= ((ri
->base_pfn
>> (26 - PAGE_SHIFT
)) & 0xffff)
1153 lpcr
&= ~(LPCR_VPM0
| LPCR_VRMA_L
);
1154 lpcr
|= rmls
<< LPCR_RMLS_SH
;
1155 kvm
->arch
.rmor
= kvm
->arch
.rma
->base_pfn
<< PAGE_SHIFT
;
1157 kvm
->arch
.lpcr
= lpcr
;
1158 pr_info("Using RMO at %lx size %lx (LPCR = %lx)\n",
1159 ri
->base_pfn
<< PAGE_SHIFT
, rma_size
, lpcr
);
1162 pg_ix
= mem
->guest_phys_addr
>> porder
;
1163 pginfo
= kvm
->arch
.ram_pginfo
+ pg_ix
;
1164 for (i
= 0; i
< npages
; ++i
, ++pg_ix
) {
1165 if (ri
&& pg_ix
< kvm
->arch
.n_rma_pages
) {
1166 pginfo
[i
].pfn
= ri
->base_pfn
+
1167 (pg_ix
<< (porder
- PAGE_SHIFT
));
1170 hva
= mem
->userspace_addr
+ (i
<< porder
);
1171 page
= hva_to_page(hva
);
1173 pr_err("oops, no pfn for hva %lx\n", hva
);
1176 /* Check it's a 16MB page */
1177 if (!PageHead(page
) ||
1178 compound_order(page
) != (LARGE_PAGE_ORDER
- PAGE_SHIFT
)) {
1179 pr_err("page at %lx isn't 16MB (o=%d)\n",
1180 hva
, compound_order(page
));
1183 pginfo
[i
].pfn
= page_to_pfn(page
);
1192 void kvmppc_core_commit_memory_region(struct kvm
*kvm
,
1193 struct kvm_userspace_memory_region
*mem
)
1195 if (mem
->guest_phys_addr
== 0 && mem
->memory_size
!= 0 &&
1197 kvmppc_map_vrma(kvm
, mem
);
1200 int kvmppc_core_init_vm(struct kvm
*kvm
)
1203 unsigned long npages
= 1ul << (MAX_MEM_ORDER
- LARGE_PAGE_ORDER
);
1207 /* Allocate hashed page table */
1208 r
= kvmppc_alloc_hpt(kvm
);
1212 INIT_LIST_HEAD(&kvm
->arch
.spapr_tce_tables
);
1214 kvm
->arch
.ram_pginfo
= kzalloc(npages
* sizeof(struct kvmppc_pginfo
),
1216 if (!kvm
->arch
.ram_pginfo
) {
1217 pr_err("kvmppc_core_init_vm: couldn't alloc %lu bytes\n",
1218 npages
* sizeof(struct kvmppc_pginfo
));
1222 kvm
->arch
.ram_npages
= 0;
1223 kvm
->arch
.ram_psize
= 1ul << LARGE_PAGE_ORDER
;
1224 kvm
->arch
.ram_porder
= LARGE_PAGE_ORDER
;
1225 kvm
->arch
.rma
= NULL
;
1226 kvm
->arch
.n_rma_pages
= 0;
1228 kvm
->arch
.host_sdr1
= mfspr(SPRN_SDR1
);
1230 if (cpu_has_feature(CPU_FTR_ARCH_201
)) {
1231 /* PPC970; HID4 is effectively the LPCR */
1232 unsigned long lpid
= kvm
->arch
.lpid
;
1233 kvm
->arch
.host_lpid
= 0;
1234 kvm
->arch
.host_lpcr
= lpcr
= mfspr(SPRN_HID4
);
1235 lpcr
&= ~((3 << HID4_LPID1_SH
) | (0xful
<< HID4_LPID5_SH
));
1236 lpcr
|= ((lpid
>> 4) << HID4_LPID1_SH
) |
1237 ((lpid
& 0xf) << HID4_LPID5_SH
);
1239 /* POWER7; init LPCR for virtual RMA mode */
1240 kvm
->arch
.host_lpid
= mfspr(SPRN_LPID
);
1241 kvm
->arch
.host_lpcr
= lpcr
= mfspr(SPRN_LPCR
);
1242 lpcr
&= LPCR_PECE
| LPCR_LPES
;
1243 lpcr
|= (4UL << LPCR_DPFD_SH
) | LPCR_HDICE
|
1244 LPCR_VPM0
| LPCR_VRMA_L
;
1246 kvm
->arch
.lpcr
= lpcr
;
1251 kvmppc_free_hpt(kvm
);
void kvmppc_core_destroy_vm(struct kvm *kvm)
{
	struct kvmppc_pginfo *pginfo;
	unsigned long i;

	if (kvm->arch.ram_pginfo) {
		pginfo = kvm->arch.ram_pginfo;
		kvm->arch.ram_pginfo = NULL;
		for (i = kvm->arch.n_rma_pages; i < kvm->arch.ram_npages; ++i)
			if (pginfo[i].pfn)
				put_page(pfn_to_page(pginfo[i].pfn));
		kfree(pginfo);
	}
	if (kvm->arch.rma) {
		kvm_release_rma(kvm->arch.rma);
		kvm->arch.rma = NULL;
	}

	kvmppc_free_hpt(kvm);
	WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
}
/* These are stubs for now */
void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
{
}
/* We don't need to emulate any privileged instructions or dcbz */
int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
			   unsigned int inst, int *advance)
{
	return EMULATE_FAIL;
}
int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
{
	return EMULATE_FAIL;
}
int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
{
	return EMULATE_FAIL;
}
static int kvmppc_book3s_hv_init(void)
{
	int r;

	r = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
	if (r)
		return r;

	r = kvmppc_mmu_hv_init();

	return r;
}

static void kvmppc_book3s_hv_exit(void)
{
	kvm_exit();
}

module_init(kvmppc_book3s_hv_init);
module_exit(kvmppc_book3s_hv_exit);