/*
 * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
 *
 * Authors:
 *    Paul Mackerras <paulus@au1.ibm.com>
 *    Alexander Graf <agraf@suse.de>
 *    Kevin Wolf <mail@kevin-wolf.de>
 *
 * Description: KVM functions specific to running on Book 3S
 * processors in hypervisor mode (specifically POWER7 and later).
 *
 * This file is derived from arch/powerpc/kvm/book3s.c,
 * by Alexander Graf <agraf@suse.de>.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 */
#include <linux/kvm_host.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/preempt.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/fs.h>
#include <linux/anon_inodes.h>
#include <linux/cpumask.h>
#include <linux/spinlock.h>
#include <linux/page-flags.h>

#include <asm/reg.h>
#include <asm/cputable.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/mmu_context.h>
#include <asm/lppaca.h>
#include <asm/processor.h>
#include <asm/cputhreads.h>
#include <asm/page.h>
#include <asm/hvcall.h>
#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
/*
 * For now, limit memory to 64GB and require it to be large pages.
 * This value is chosen because it makes the ram_pginfo array be
 * 64kB in size, which is about as large as we want to be trying
 * to allocate with kmalloc.
 */
#define MAX_MEM_ORDER		36

#define LARGE_PAGE_ORDER	24	/* 16MB pages */
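/*
 * Arithmetic behind the limits above: 2^MAX_MEM_ORDER = 2^36 bytes = 64GB
 * of guest RAM, tracked as 2^(36 - 24) = 4096 entries of 16MB
 * (2^LARGE_PAGE_ORDER byte) pages.  The 64kB figure quoted above implies
 * 16 bytes per ram_pginfo entry, i.e. a single kmalloc-sized array.
 */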
/* #define EXIT_DEBUG */
/* #define EXIT_DEBUG_SIMPLE */
/* #define EXIT_DEBUG_INT */
void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	local_paca->kvm_hstate.kvm_vcpu = vcpu;
	local_paca->kvm_hstate.kvm_vcore = vcpu->arch.vcore;
}

void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
{
}
static void kvmppc_vcpu_blocked(struct kvm_vcpu *vcpu);
static void kvmppc_vcpu_unblocked(struct kvm_vcpu *vcpu);
void kvmppc_vcpu_block(struct kvm_vcpu *vcpu)
{
	u64 now;
	unsigned long dec_nsec;

	now = get_tb();
	if (now >= vcpu->arch.dec_expires && !kvmppc_core_pending_dec(vcpu))
		kvmppc_core_queue_dec(vcpu);
	if (vcpu->arch.pending_exceptions)
		return;
	if (vcpu->arch.dec_expires != ~(u64)0) {
		dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC /
			tb_ticks_per_sec;
		hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec),
			      HRTIMER_MODE_REL);
	}

	kvmppc_vcpu_blocked(vcpu);
	kvm_vcpu_block(vcpu);
	vcpu->stat.halt_wakeup++;

	if (vcpu->arch.dec_expires != ~(u64)0)
		hrtimer_try_to_cancel(&vcpu->arch.dec_timer);

	kvmppc_vcpu_unblocked(vcpu);
}
void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
{
	vcpu->arch.shregs.msr = msr;
}
void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
{
	vcpu->arch.pvr = pvr;
}
void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
{
	int r;

	pr_err("vcpu %p (%d):\n", vcpu, vcpu->vcpu_id);
	pr_err("pc = %.16lx msr = %.16llx trap = %x\n",
	       vcpu->arch.pc, vcpu->arch.shregs.msr, vcpu->arch.trap);
	for (r = 0; r < 16; ++r)
		pr_err("r%2d = %.16lx r%d = %.16lx\n",
		       r, kvmppc_get_gpr(vcpu, r),
		       r+16, kvmppc_get_gpr(vcpu, r+16));
	pr_err("ctr = %.16lx lr = %.16lx\n",
	       vcpu->arch.ctr, vcpu->arch.lr);
	pr_err("srr0 = %.16llx srr1 = %.16llx\n",
	       vcpu->arch.shregs.srr0, vcpu->arch.shregs.srr1);
	pr_err("sprg0 = %.16llx sprg1 = %.16llx\n",
	       vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1);
	pr_err("sprg2 = %.16llx sprg3 = %.16llx\n",
	       vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3);
	pr_err("cr = %.8x xer = %.16lx dsisr = %.8x\n",
	       vcpu->arch.cr, vcpu->arch.xer, vcpu->arch.shregs.dsisr);
	pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar);
	pr_err("fault dar = %.16lx dsisr = %.8x\n",
	       vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
	pr_err("SLB (%d entries):\n", vcpu->arch.slb_max);
	for (r = 0; r < vcpu->arch.slb_max; ++r)
		pr_err("  ESID = %.16llx VSID = %.16llx\n",
		       vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv);
	pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n",
	       vcpu->kvm->arch.lpcr, vcpu->kvm->arch.sdr1,
	       vcpu->arch.last_inst);
}
struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id)
{
	int r;
	struct kvm_vcpu *v, *ret = NULL;

	mutex_lock(&kvm->lock);
	kvm_for_each_vcpu(r, v, kvm) {
		if (v->vcpu_id == id) {
			ret = v;
			break;
		}
	}
	mutex_unlock(&kvm->lock);
	return ret;
}
static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa)
{
	vpa->shared_proc = 1;
	vpa->yield_count = 1;
}
static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
				       unsigned long flags,
				       unsigned long vcpuid, unsigned long vpa)
{
	struct kvm *kvm = vcpu->kvm;
	unsigned long pg_index, ra, len;
	unsigned long pg_offset;
	void *va;
	struct kvm_vcpu *tvcpu;

	tvcpu = kvmppc_find_vcpu(kvm, vcpuid);
	if (!tvcpu)
		return H_PARAMETER;

	/* extract the subfunction code from the flags argument */
	flags >>= 63 - 18;
	flags &= 7;
	if (flags == 0 || flags == 4)
		return H_PARAMETER;
	if (flags < 4) {
		/* registering new area; convert logical addr to real */
		pg_index = vpa >> kvm->arch.ram_porder;
		pg_offset = vpa & (kvm->arch.ram_psize - 1);
		if (pg_index >= kvm->arch.ram_npages)
			return H_PARAMETER;
		if (kvm->arch.ram_pginfo[pg_index].pfn == 0)
			return H_PARAMETER;
		ra = kvm->arch.ram_pginfo[pg_index].pfn << PAGE_SHIFT;
		ra |= pg_offset;
		va = __va(ra);
		if (flags <= 1)
			len = *(unsigned short *)(va + 4);
		else
			len = *(unsigned int *)(va + 4);
		if (pg_offset + len > kvm->arch.ram_psize)
			return H_PARAMETER;
		switch (flags) {
		case 1:		/* register VPA */
			tvcpu->arch.vpa = va;
			init_vpa(vcpu, va);
			break;
		case 2:		/* register DTL */
			if (!tvcpu->arch.vpa)
				return H_RESOURCE;
			tvcpu->arch.dtl = va;
			tvcpu->arch.dtl_end = va + len;
			break;
		case 3:		/* register SLB shadow buffer */
			if (!tvcpu->arch.vpa)
				return H_RESOURCE;
			tvcpu->arch.slb_shadow = va;
			len = (len - 16) / 16;
			tvcpu->arch.slb_shadow = va;
			break;
		}
	} else {
		switch (flags) {
		case 5:		/* unregister VPA */
			if (tvcpu->arch.slb_shadow || tvcpu->arch.dtl)
				return H_RESOURCE;
			tvcpu->arch.vpa = NULL;
			break;
		case 6:		/* unregister DTL */
			tvcpu->arch.dtl = NULL;
			break;
		case 7:		/* unregister SLB shadow buffer */
			tvcpu->arch.slb_shadow = NULL;
			break;
		}
	}
	return H_SUCCESS;
}
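/*
 * Summary of the H_REGISTER_VPA subfunction codes handled above, taken
 * from the case labels: 1 = register VPA, 2 = register DTL, 3 = register
 * SLB shadow buffer, and 5/6/7 are the corresponding unregister
 * operations.  The DTL and SLB shadow can only be registered while a VPA
 * is registered, and the VPA can only be unregistered once they are gone.
 */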
int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
{
	unsigned long req = kvmppc_get_gpr(vcpu, 3);
	unsigned long target, ret = H_SUCCESS;
	struct kvm_vcpu *tvcpu;

	switch (req) {
	case H_CEDE:
		vcpu->arch.shregs.msr |= MSR_EE;
		vcpu->arch.ceded = 1;
		smp_mb();
		if (!vcpu->arch.prodded)
			kvmppc_vcpu_block(vcpu);
		else
			vcpu->arch.prodded = 0;
		smp_mb();
		vcpu->arch.ceded = 0;
		break;
	case H_PROD:
		target = kvmppc_get_gpr(vcpu, 4);
		tvcpu = kvmppc_find_vcpu(vcpu->kvm, target);
		if (!tvcpu) {
			ret = H_PARAMETER;
			break;
		}
		tvcpu->arch.prodded = 1;
		smp_mb();
		if (vcpu->arch.ceded) {
			if (waitqueue_active(&vcpu->wq)) {
				wake_up_interruptible(&vcpu->wq);
				vcpu->stat.halt_wakeup++;
			}
		}
		break;
	case H_CONFER:
		break;
	case H_REGISTER_VPA:
		ret = do_h_register_vpa(vcpu, kvmppc_get_gpr(vcpu, 4),
					kvmppc_get_gpr(vcpu, 5),
					kvmppc_get_gpr(vcpu, 6));
		break;
	default:
		return RESUME_HOST;
	}
	kvmppc_set_gpr(vcpu, 3, ret);
	vcpu->arch.hcall_needed = 0;
	return RESUME_GUEST;
}
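/*
 * Note: only the hypercalls handled above are completed in the kernel;
 * anything else comes back to the caller of kvmppc_vcpu_run() as a
 * KVM_EXIT_PAPR_HCALL exit (see the syscall case in kvmppc_handle_exit)
 * so that userspace can emulate it.
 */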
static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
			      struct task_struct *tsk)
{
	int r = RESUME_HOST;

	vcpu->stat.sum_exits++;

	run->exit_reason = KVM_EXIT_UNKNOWN;
	run->ready_for_interrupt_injection = 1;
	switch (vcpu->arch.trap) {
	/* We're good on these - the host merely wanted to get our attention */
	case BOOK3S_INTERRUPT_HV_DECREMENTER:
		vcpu->stat.dec_exits++;
		r = RESUME_GUEST;
		break;
	case BOOK3S_INTERRUPT_EXTERNAL:
		vcpu->stat.ext_intr_exits++;
		r = RESUME_GUEST;
		break;
	case BOOK3S_INTERRUPT_PERFMON:
		r = RESUME_GUEST;
		break;
	case BOOK3S_INTERRUPT_PROGRAM:
	{
		ulong flags;
		/*
		 * Normally program interrupts are delivered directly
		 * to the guest by the hardware, but we can get here
		 * as a result of a hypervisor emulation interrupt
		 * (e40) getting turned into a 700 by BML RTAS.
		 */
		flags = vcpu->arch.shregs.msr & 0x1f0000ull;
		kvmppc_core_queue_program(vcpu, flags);
		r = RESUME_GUEST;
		break;
	}
	case BOOK3S_INTERRUPT_SYSCALL:
	{
		/* hcall - punt to userspace */
		int i;

		if (vcpu->arch.shregs.msr & MSR_PR) {
			/* sc 1 from userspace - reflect to guest syscall */
			kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_SYSCALL);
			r = RESUME_GUEST;
			break;
		}
		run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3);
		for (i = 0; i < 9; ++i)
			run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i);
		run->exit_reason = KVM_EXIT_PAPR_HCALL;
		vcpu->arch.hcall_needed = 1;
		r = RESUME_HOST;
		break;
	}
	/*
	 * We get these next two if the guest does a bad real-mode access,
	 * as we have enabled VRMA (virtualized real mode area) mode in the
	 * LPCR.  We just generate an appropriate DSI/ISI to the guest.
	 */
	case BOOK3S_INTERRUPT_H_DATA_STORAGE:
		vcpu->arch.shregs.dsisr = vcpu->arch.fault_dsisr;
		vcpu->arch.shregs.dar = vcpu->arch.fault_dar;
		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, 0);
		r = RESUME_GUEST;
		break;
	case BOOK3S_INTERRUPT_H_INST_STORAGE:
		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_INST_STORAGE,
					0x08000000);
		r = RESUME_GUEST;
		break;
	/*
	 * This occurs if the guest executes an illegal instruction.
	 * We just generate a program interrupt to the guest, since
	 * we don't emulate any guest instructions at this stage.
	 */
	case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
		kvmppc_core_queue_program(vcpu, 0x80000);
		r = RESUME_GUEST;
		break;
	default:
		kvmppc_dump_regs(vcpu);
		printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
			vcpu->arch.trap, kvmppc_get_pc(vcpu),
			vcpu->arch.shregs.msr);
		r = RESUME_HOST;
		break;
	}

	if (!(r & RESUME_HOST)) {
		/* To avoid clobbering exit_reason, only check for signals if
		 * we aren't already exiting to userspace for some other
		 * reason. */
		if (signal_pending(tsk)) {
			vcpu->stat.signal_exits++;
			run->exit_reason = KVM_EXIT_INTR;
			r = -EINTR;
		} else {
			kvmppc_core_deliver_interrupts(vcpu);
		}
	}

	return r;
}
int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
				  struct kvm_sregs *sregs)
{
	int i;

	/* clear everything first so the pvr we set below survives */
	memset(sregs, 0, sizeof(struct kvm_sregs));
	sregs->pvr = vcpu->arch.pvr;
	for (i = 0; i < vcpu->arch.slb_max; i++) {
		sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige;
		sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;
	}

	return 0;
}
int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
				  struct kvm_sregs *sregs)
{
	int i, j;

	kvmppc_set_pvr(vcpu, sregs->pvr);

	j = 0;
	for (i = 0; i < vcpu->arch.slb_nr; i++) {
		if (sregs->u.s.ppc64.slb[i].slbe & SLB_ESID_V) {
			vcpu->arch.slb[j].orige = sregs->u.s.ppc64.slb[i].slbe;
			vcpu->arch.slb[j].origv = sregs->u.s.ppc64.slb[i].slbv;
			++j;
		}
	}
	vcpu->arch.slb_max = j;

	return 0;
}
int kvmppc_core_check_processor_compat(void)
{
	if (cpu_has_feature(CPU_FTR_HVMODE))
		return 0;
	return -EIO;
}
struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
{
	struct kvm_vcpu *vcpu;
	int err = -EINVAL;
	int core;
	struct kvmppc_vcore *vcore;

	core = id / threads_per_core;
	if (core >= KVM_MAX_VCORES)
		goto out;

	err = -ENOMEM;
	vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
	if (!vcpu)
		goto out;

	err = kvm_vcpu_init(vcpu, kvm, id);
	if (err)
		goto free_vcpu;

	vcpu->arch.shared = &vcpu->arch.shregs;
	vcpu->arch.last_cpu = -1;
	vcpu->arch.mmcr[0] = MMCR0_FC;
	vcpu->arch.ctrl = CTRL_RUNLATCH;
	/* default to host PVR, since we can't spoof it */
	vcpu->arch.pvr = mfspr(SPRN_PVR);
	kvmppc_set_pvr(vcpu, vcpu->arch.pvr);

	kvmppc_mmu_book3s_hv_init(vcpu);

	/*
	 * Some vcpus may start out in stopped state.  If we initialize
	 * them to busy-in-host state they will stop other vcpus in the
	 * vcore from running.  Instead we initialize them to blocked
	 * state, effectively considering them to be stopped until we
	 * see the first run ioctl for them.
	 */
	vcpu->arch.state = KVMPPC_VCPU_BLOCKED;

	init_waitqueue_head(&vcpu->arch.cpu_run);

	mutex_lock(&kvm->lock);
	vcore = kvm->arch.vcores[core];
	if (!vcore) {
		vcore = kzalloc(sizeof(struct kvmppc_vcore), GFP_KERNEL);
		if (vcore) {
			INIT_LIST_HEAD(&vcore->runnable_threads);
			spin_lock_init(&vcore->lock);
		}
		kvm->arch.vcores[core] = vcore;
	}
	mutex_unlock(&kvm->lock);

	if (!vcore)
		goto free_vcpu;

	spin_lock(&vcore->lock);
	++vcore->num_threads;
	spin_unlock(&vcore->lock);
	vcpu->arch.vcore = vcore;

	return vcpu;

free_vcpu:
	kfree(vcpu);
out:
	return ERR_PTR(err);
}
void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
{
	kvm_vcpu_uninit(vcpu);
	kfree(vcpu);
}
static void kvmppc_vcpu_blocked(struct kvm_vcpu *vcpu)
{
	struct kvmppc_vcore *vc = vcpu->arch.vcore;

	spin_lock(&vc->lock);
	vcpu->arch.state = KVMPPC_VCPU_BLOCKED;
	++vc->n_blocked;
	if (vc->n_runnable > 0 &&
	    vc->n_runnable + vc->n_blocked == vc->num_threads) {
		vcpu = list_first_entry(&vc->runnable_threads, struct kvm_vcpu,
					arch.run_list);
		wake_up(&vcpu->arch.cpu_run);
	}
	spin_unlock(&vc->lock);
}
static void kvmppc_vcpu_unblocked(struct kvm_vcpu *vcpu)
{
	struct kvmppc_vcore *vc = vcpu->arch.vcore;

	spin_lock(&vc->lock);
	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
	--vc->n_blocked;
	spin_unlock(&vc->lock);
}
extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
extern void xics_wake_cpu(int cpu);
static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
				   struct kvm_vcpu *vcpu)
{
	struct kvm_vcpu *v;

	if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
		return;
	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
	--vc->n_runnable;
	/* decrement the physical thread id of each following vcpu */
	v = vcpu;
	list_for_each_entry_continue(v, &vc->runnable_threads, arch.run_list)
		--v->arch.ptid;
	list_del(&vcpu->arch.run_list);
}
static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
{
	int cpu;
	struct paca_struct *tpaca;
	struct kvmppc_vcore *vc = vcpu->arch.vcore;

	cpu = vc->pcpu + vcpu->arch.ptid;
	tpaca = &paca[cpu];
	tpaca->kvm_hstate.kvm_vcpu = vcpu;
	tpaca->kvm_hstate.kvm_vcore = vc;
	smp_wmb();
#ifdef CONFIG_PPC_ICP_NATIVE
	if (vcpu->arch.ptid) {
		tpaca->cpu_start = 0x80;
		tpaca->kvm_hstate.in_guest = KVM_GUEST_MODE_GUEST;
		wmb();
		xics_wake_cpu(cpu);
		++vc->n_woken;
	}
#endif
}
static void kvmppc_wait_for_nap(struct kvmppc_vcore *vc)
{
	int i = 0;

	while (vc->nap_count < vc->n_woken) {
		if (++i >= 1000000) {
			pr_err("kvmppc_wait_for_nap timeout %d %d\n",
			       vc->nap_count, vc->n_woken);
			break;
		}
		cpu_relax();
	}
}
/*
 * Check that we are on thread 0 and that any other threads in
 * this core are off-line.
 */
static int on_primary_thread(void)
{
	int cpu = smp_processor_id();
	int thr = cpu_thread_in_core(cpu);

	if (thr)
		return 0;
	while (++thr < threads_per_core)
		if (cpu_online(cpu + thr))
			return 0;
	return 1;
}
/*
 * Run a set of guest threads on a physical core.
 * Called with vc->lock held.
 */
static int kvmppc_run_core(struct kvmppc_vcore *vc)
{
	struct kvm_vcpu *vcpu, *vnext;
	long ret;
	u64 now;

	/* don't start if any threads have a signal pending */
	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
		if (signal_pending(vcpu->arch.run_task))
			return 0;

	/*
	 * Make sure we are running on thread 0, and that
	 * secondary threads are offline.
	 * XXX we should also block attempts to bring any
	 * secondary threads online.
	 */
	if (threads_per_core > 1 && !on_primary_thread()) {
		list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
			vcpu->arch.ret = -EBUSY;
		goto out;
	}

	vc->n_woken = 0;
	vc->nap_count = 0;
	vc->entry_exit_count = 0;
	vc->vcore_running = 1;

	vc->pcpu = smp_processor_id();
	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
		kvmppc_start_thread(vcpu);
	vcpu = list_first_entry(&vc->runnable_threads, struct kvm_vcpu,
				arch.run_list);

	spin_unlock(&vc->lock);

	preempt_disable();
	kvm_guest_enter();
	__kvmppc_vcore_entry(NULL, vcpu);

	/* wait for secondary threads to finish writing their state to memory */
	spin_lock(&vc->lock);
	if (vc->nap_count < vc->n_woken)
		kvmppc_wait_for_nap(vc);
	/* prevent other vcpu threads from doing kvmppc_start_thread() now */
	vc->vcore_running = 2;
	spin_unlock(&vc->lock);

	/* make sure updates to secondary vcpu structs are visible now */
	smp_mb();
	kvm_guest_exit();
	preempt_enable();

	now = get_tb();
	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
		/* cancel pending dec exception if dec is positive */
		if (now < vcpu->arch.dec_expires &&
		    kvmppc_core_pending_dec(vcpu))
			kvmppc_core_dequeue_dec(vcpu);
		if (!vcpu->arch.trap) {
			if (signal_pending(vcpu->arch.run_task)) {
				vcpu->arch.kvm_run->exit_reason = KVM_EXIT_INTR;
				vcpu->arch.ret = -EINTR;
			}
			continue;		/* didn't get to run */
		}
		ret = kvmppc_handle_exit(vcpu->arch.kvm_run, vcpu,
					 vcpu->arch.run_task);
		vcpu->arch.ret = ret;
	}

	spin_lock(&vc->lock);
 out:
	vc->vcore_running = 0;
	list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
				 arch.run_list) {
		if (vcpu->arch.ret != RESUME_GUEST) {
			kvmppc_remove_runnable(vc, vcpu);
			wake_up(&vcpu->arch.cpu_run);
		}
	}

	return 1;
}
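/*
 * Flow of kvmppc_run_core(), for reference: with vc->lock held we mark the
 * vcore running, point each hardware thread's PACA at its vcpu and wake the
 * secondary threads, then thread 0 enters the guest via
 * __kvmppc_vcore_entry().  On the way out we wait for the secondaries to
 * nap, process each vcpu's exit with kvmppc_handle_exit(), and wake any
 * vcpu task whose return code is no longer RESUME_GUEST.
 */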
static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
{
	int ptid;
	int wait_state;
	struct kvmppc_vcore *vc;
	DEFINE_WAIT(wait);

	/* No need to go into the guest when all we do is going out */
	if (signal_pending(current)) {
		kvm_run->exit_reason = KVM_EXIT_INTR;
		return -EINTR;
	}

	/* On PPC970, check that we have an RMA region */
	if (!vcpu->kvm->arch.rma && cpu_has_feature(CPU_FTR_ARCH_201))
		return -EPERM;

	kvm_run->exit_reason = 0;
	vcpu->arch.ret = RESUME_GUEST;
	vcpu->arch.trap = 0;

	flush_fp_to_thread(current);
	flush_altivec_to_thread(current);
	flush_vsx_to_thread(current);

	/*
	 * Synchronize with other threads in this virtual core
	 */
	vc = vcpu->arch.vcore;
	spin_lock(&vc->lock);
	/* This happens the first time this is called for a vcpu */
	if (vcpu->arch.state == KVMPPC_VCPU_BLOCKED)
		--vc->n_blocked;
	vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
	ptid = vc->n_runnable;
	vcpu->arch.run_task = current;
	vcpu->arch.kvm_run = kvm_run;
	vcpu->arch.ptid = ptid;
	list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads);
	++vc->n_runnable;

	wait_state = TASK_INTERRUPTIBLE;
	while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
		if (signal_pending(current)) {
			if (!vc->vcore_running) {
				kvm_run->exit_reason = KVM_EXIT_INTR;
				vcpu->arch.ret = -EINTR;
				break;
			}
			/* have to wait for vcore to stop executing guest */
			wait_state = TASK_UNINTERRUPTIBLE;
			smp_send_reschedule(vc->pcpu);
		}

		if (!vc->vcore_running &&
		    vc->n_runnable + vc->n_blocked == vc->num_threads) {
			/* all threads are in; we can run the core now */
			if (kvmppc_run_core(vc))
				continue;
		}

		if (vc->vcore_running == 1 && VCORE_EXIT_COUNT(vc) == 0)
			kvmppc_start_thread(vcpu);

		/* wait for other threads to come in, or wait for vcore */
		prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state);
		spin_unlock(&vc->lock);
		schedule();
		finish_wait(&vcpu->arch.cpu_run, &wait);
		spin_lock(&vc->lock);
	}

	if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE)
		kvmppc_remove_runnable(vc, vcpu);
	spin_unlock(&vc->lock);

	return vcpu->arch.ret;
}
int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
{
	int r;

	do {
		r = kvmppc_run_vcpu(run, vcpu);

		if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
		    !(vcpu->arch.shregs.msr & MSR_PR)) {
			r = kvmppc_pseries_do_hcall(vcpu);
			kvmppc_core_deliver_interrupts(vcpu);
		}
	} while (r == RESUME_GUEST);

	return r;
}
static long kvmppc_stt_npages(unsigned long window_size)
{
	return ALIGN((window_size >> SPAPR_TCE_SHIFT)
		     * sizeof(u64), PAGE_SIZE) / PAGE_SIZE;
}
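/*
 * Example of the calculation above, assuming SPAPR_TCE_SHIFT is 12 (4kB
 * TCE pages): a 256MB DMA window has 256MB >> 12 = 64k TCEs; at
 * sizeof(u64) = 8 bytes each that is 512kB of table, i.e. 128 host pages.
 */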
static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt)
{
	struct kvm *kvm = stt->kvm;
	int i;

	mutex_lock(&kvm->lock);
	list_del(&stt->list);
	for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++)
		__free_page(stt->pages[i]);
	kfree(stt);
	mutex_unlock(&kvm->lock);

	kvm_put_kvm(kvm);
}
static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data;
	struct page *page;

	if (vmf->pgoff >= kvmppc_stt_npages(stt->window_size))
		return VM_FAULT_SIGBUS;

	page = stt->pages[vmf->pgoff];
	get_page(page);
	vmf->page = page;
	return 0;
}

static const struct vm_operations_struct kvm_spapr_tce_vm_ops = {
	.fault = kvm_spapr_tce_fault,
};

static int kvm_spapr_tce_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &kvm_spapr_tce_vm_ops;
	return 0;
}

static int kvm_spapr_tce_release(struct inode *inode, struct file *filp)
{
	struct kvmppc_spapr_tce_table *stt = filp->private_data;

	release_spapr_tce_table(stt);
	return 0;
}

static struct file_operations kvm_spapr_tce_fops = {
	.mmap		= kvm_spapr_tce_mmap,
	.release	= kvm_spapr_tce_release,
};
long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
				   struct kvm_create_spapr_tce *args)
{
	struct kvmppc_spapr_tce_table *stt = NULL;
	long npages;
	int ret = -ENOMEM;
	int i;

	/* Check this LIOBN hasn't been previously allocated */
	list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
		if (stt->liobn == args->liobn)
			return -EBUSY;
	}

	npages = kvmppc_stt_npages(args->window_size);

	stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *),
		      GFP_KERNEL);
	if (!stt)
		goto fail;

	stt->liobn = args->liobn;
	stt->window_size = args->window_size;
	stt->kvm = kvm;

	for (i = 0; i < npages; i++) {
		stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
		if (!stt->pages[i])
			goto fail;
	}

	kvm_get_kvm(kvm);

	mutex_lock(&kvm->lock);
	list_add(&stt->list, &kvm->arch.spapr_tce_tables);
	mutex_unlock(&kvm->lock);

	return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
				stt, O_RDWR);

fail:
	if (stt) {
		for (i = 0; i < npages; i++)
			if (stt->pages[i])
				__free_page(stt->pages[i]);

		kfree(stt);
	}
	return ret;
}
/* Work out RMLS (real mode limit selector) field value for a given RMA size.
   Assumes POWER7 or PPC970. */
static inline int lpcr_rmls(unsigned long rma_size)
{
	switch (rma_size) {
	case 32ul << 20:	/* 32 MB */
		if (cpu_has_feature(CPU_FTR_ARCH_206))
			return 8;	/* only supported on POWER7 */
		return -1;
	case 64ul << 20:	/* 64 MB */
		return 3;
	case 128ul << 20:	/* 128 MB */
		return 7;
	case 256ul << 20:	/* 256 MB */
		return 4;
	case 1ul << 30:		/* 1 GB */
		return 2;
	case 16ul << 30:	/* 16 GB */
		return 1;
	case 256ul << 30:	/* 256 GB */
		return 0;
	default:
		return -1;
	}
}
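/*
 * The values returned above are encodings of the real-mode limit selector
 * field (LPCR[RMLS] on POWER7, a split field in HID4 on PPC970, as used in
 * kvmppc_core_prepare_memory_region() below); the 32MB encoding exists
 * only on POWER7, hence the CPU_FTR_ARCH_206 check.
 */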
static int kvm_rma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct kvmppc_rma_info *ri = vma->vm_file->private_data;
	struct page *page;

	if (vmf->pgoff >= ri->npages)
		return VM_FAULT_SIGBUS;

	page = pfn_to_page(ri->base_pfn + vmf->pgoff);
	get_page(page);
	vmf->page = page;
	return 0;
}

static const struct vm_operations_struct kvm_rma_vm_ops = {
	.fault = kvm_rma_fault,
};

static int kvm_rma_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_flags |= VM_RESERVED;
	vma->vm_ops = &kvm_rma_vm_ops;
	return 0;
}

static int kvm_rma_release(struct inode *inode, struct file *filp)
{
	struct kvmppc_rma_info *ri = filp->private_data;

	kvm_release_rma(ri);
	return 0;
}

static struct file_operations kvm_rma_fops = {
	.mmap		= kvm_rma_mmap,
	.release	= kvm_rma_release,
};
long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret)
{
	struct kvmppc_rma_info *ri;
	long fd;

	ri = kvm_alloc_rma();
	if (!ri)
		return -ENOMEM;

	fd = anon_inode_getfd("kvm-rma", &kvm_rma_fops, ri, O_RDWR);
	if (fd < 0)
		kvm_release_rma(ri);

	ret->rma_size = ri->npages << PAGE_SHIFT;
	return fd;
}
static struct page *hva_to_page(unsigned long addr)
{
	struct page *page[1];
	int npages;

	npages = get_user_pages_fast(addr, 1, 1, page);

	if (unlikely(npages != 1))
		return NULL;

	return page[0];
}
int kvmppc_core_prepare_memory_region(struct kvm *kvm,
				struct kvm_userspace_memory_region *mem)
{
	unsigned long psize, porder;
	unsigned long i, npages, totalpages;
	unsigned long pg_ix;
	struct kvmppc_pginfo *pginfo;
	unsigned long hva;
	struct kvmppc_rma_info *ri = NULL;
	struct page *page;

	/* For now, only allow 16MB pages */
	porder = LARGE_PAGE_ORDER;
	psize = 1ul << porder;
	if ((mem->memory_size & (psize - 1)) ||
	    (mem->guest_phys_addr & (psize - 1))) {
		pr_err("bad memory_size=%llx @ %llx\n",
		       mem->memory_size, mem->guest_phys_addr);
		return -EINVAL;
	}

	npages = mem->memory_size >> porder;
	totalpages = (mem->guest_phys_addr + mem->memory_size) >> porder;

	/* More memory than we have space to track? */
	if (totalpages > (1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER)))
		return -EINVAL;

	/* Do we already have an RMA registered? */
	if (mem->guest_phys_addr == 0 && kvm->arch.rma)
		return -EINVAL;

	if (totalpages > kvm->arch.ram_npages)
		kvm->arch.ram_npages = totalpages;

	/* Is this one of our preallocated RMAs? */
	if (mem->guest_phys_addr == 0) {
		struct vm_area_struct *vma;

		down_read(&current->mm->mmap_sem);
		vma = find_vma(current->mm, mem->userspace_addr);
		if (vma && vma->vm_file &&
		    vma->vm_file->f_op == &kvm_rma_fops &&
		    mem->userspace_addr == vma->vm_start)
			ri = vma->vm_file->private_data;
		up_read(&current->mm->mmap_sem);
		if (!ri && cpu_has_feature(CPU_FTR_ARCH_201)) {
			pr_err("CPU requires an RMO\n");
			return -EINVAL;
		}
	}

	if (ri) {
		unsigned long rma_size;
		unsigned long lpcr;
		long rmls;

		rma_size = ri->npages << PAGE_SHIFT;
		if (rma_size > mem->memory_size)
			rma_size = mem->memory_size;
		rmls = lpcr_rmls(rma_size);
		if (rmls < 0) {
			pr_err("Can't use RMA of 0x%lx bytes\n", rma_size);
			return -EINVAL;
		}
		atomic_inc(&ri->use_count);
		kvm->arch.rma = ri;
		kvm->arch.n_rma_pages = rma_size >> porder;

		/* Update LPCR and RMOR */
		lpcr = kvm->arch.lpcr;
		if (cpu_has_feature(CPU_FTR_ARCH_201)) {
			/* PPC970; insert RMLS value (split field) in HID4 */
			lpcr &= ~((1ul << HID4_RMLS0_SH) |
				  (3ul << HID4_RMLS2_SH));
			lpcr |= ((rmls >> 2) << HID4_RMLS0_SH) |
				((rmls & 3) << HID4_RMLS2_SH);
			/* RMOR is also in HID4 */
			lpcr |= ((ri->base_pfn >> (26 - PAGE_SHIFT)) & 0xffff)
				<< HID4_RMOR_SH;
		} else {
			/* POWER7 */
			lpcr &= ~(LPCR_VPM0 | LPCR_VRMA_L);
			lpcr |= rmls << LPCR_RMLS_SH;
			kvm->arch.rmor = kvm->arch.rma->base_pfn << PAGE_SHIFT;
		}
		kvm->arch.lpcr = lpcr;
		pr_info("Using RMO at %lx size %lx (LPCR = %lx)\n",
			ri->base_pfn << PAGE_SHIFT, rma_size, lpcr);
	}

	pg_ix = mem->guest_phys_addr >> porder;
	pginfo = kvm->arch.ram_pginfo + pg_ix;
	for (i = 0; i < npages; ++i, ++pg_ix) {
		if (ri && pg_ix < kvm->arch.n_rma_pages) {
			pginfo[i].pfn = ri->base_pfn +
				(pg_ix << (porder - PAGE_SHIFT));
			continue;
		}
		hva = mem->userspace_addr + (i << porder);
		page = hva_to_page(hva);
		if (!page) {
			pr_err("oops, no pfn for hva %lx\n", hva);
			goto err;
		}
		/* Check it's a 16MB page */
		if (!PageHead(page) ||
		    compound_order(page) != (LARGE_PAGE_ORDER - PAGE_SHIFT)) {
			pr_err("page at %lx isn't 16MB (o=%d)\n",
			       hva, compound_order(page));
			goto err;
		}
		pginfo[i].pfn = page_to_pfn(page);
	}

	return 0;

 err:
	return -EINVAL;
}
void kvmppc_core_commit_memory_region(struct kvm *kvm,
				struct kvm_userspace_memory_region *mem)
{
	if (mem->guest_phys_addr == 0 && mem->memory_size != 0 &&
	    !kvm->arch.rma)
		kvmppc_map_vrma(kvm, mem);
}
int kvmppc_core_init_vm(struct kvm *kvm)
{
	long r;
	unsigned long npages = 1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER);
	long err = -ENOMEM;
	unsigned long lpcr;

	/* Allocate hashed page table */
	r = kvmppc_alloc_hpt(kvm);
	if (r)
		return r;

	INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);

	kvm->arch.ram_pginfo = kzalloc(npages * sizeof(struct kvmppc_pginfo),
				       GFP_KERNEL);
	if (!kvm->arch.ram_pginfo) {
		pr_err("kvmppc_core_init_vm: couldn't alloc %lu bytes\n",
		       npages * sizeof(struct kvmppc_pginfo));
		goto out_free;
	}

	kvm->arch.ram_npages = 0;
	kvm->arch.ram_psize = 1ul << LARGE_PAGE_ORDER;
	kvm->arch.ram_porder = LARGE_PAGE_ORDER;
	kvm->arch.rma = NULL;
	kvm->arch.n_rma_pages = 0;

	kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);

	if (cpu_has_feature(CPU_FTR_ARCH_201)) {
		/* PPC970; HID4 is effectively the LPCR */
		unsigned long lpid = kvm->arch.lpid;
		kvm->arch.host_lpid = 0;
		kvm->arch.host_lpcr = lpcr = mfspr(SPRN_HID4);
		lpcr &= ~((3 << HID4_LPID1_SH) | (0xful << HID4_LPID5_SH));
		lpcr |= ((lpid >> 4) << HID4_LPID1_SH) |
			((lpid & 0xf) << HID4_LPID5_SH);
	} else {
		/* POWER7; init LPCR for virtual RMA mode */
		kvm->arch.host_lpid = mfspr(SPRN_LPID);
		kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
		lpcr &= LPCR_PECE | LPCR_LPES;
		lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
			LPCR_VPM0 | LPCR_VRMA_L;
	}
	kvm->arch.lpcr = lpcr;

	return 0;

 out_free:
	kvmppc_free_hpt(kvm);
	return err;
}
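/*
 * Note on the partition setup above: PPC970 has no separate LPCR/LPID
 * registers, so the LPID is inserted into the two split HID4 fields
 * (HID4_LPID1_SH/HID4_LPID5_SH) and HID4 itself acts as the LPCR; POWER7
 * keeps the host LPCR/LPID and builds a guest LPCR configured for virtual
 * RMA (VRMA) mode instead.
 */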
void kvmppc_core_destroy_vm(struct kvm *kvm)
{
	struct kvmppc_pginfo *pginfo;
	unsigned long i;

	if (kvm->arch.ram_pginfo) {
		pginfo = kvm->arch.ram_pginfo;
		kvm->arch.ram_pginfo = NULL;
		for (i = kvm->arch.n_rma_pages; i < kvm->arch.ram_npages; ++i)
			if (pginfo[i].pfn)
				put_page(pfn_to_page(pginfo[i].pfn));
		kfree(pginfo);
	}
	if (kvm->arch.rma) {
		kvm_release_rma(kvm->arch.rma);
		kvm->arch.rma = NULL;
	}

	kvmppc_free_hpt(kvm);
	WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
}
/* These are stubs for now */
void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
{
}
/* We don't need to emulate any privileged instructions or dcbz */
int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
			   unsigned int inst, int *advance)
{
	return EMULATE_FAIL;
}
int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
{
	return EMULATE_FAIL;
}
int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
{
	return EMULATE_FAIL;
}
static int kvmppc_book3s_hv_init(void)
{
	int r;

	r = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
	if (r)
		return r;

	r = kvmppc_mmu_hv_init();

	return r;
}

static void kvmppc_book3s_hv_exit(void)
{
	kvm_exit();
}

module_init(kvmppc_book3s_hv_init);
module_exit(kvmppc_book3s_hv_exit);