2 * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
3 * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
6 * Paul Mackerras <paulus@au1.ibm.com>
7 * Alexander Graf <agraf@suse.de>
8 * Kevin Wolf <mail@kevin-wolf.de>
10 * Description: KVM functions specific to running on Book 3S
11 * processors in hypervisor mode (specifically POWER7 and later).
13 * This file is derived from arch/powerpc/kvm/book3s.c,
14 * by Alexander Graf <agraf@suse.de>.
16 * This program is free software; you can redistribute it and/or modify
17 * it under the terms of the GNU General Public License, version 2, as
18 * published by the Free Software Foundation.
21 #include <linux/kvm_host.h>
22 #include <linux/err.h>
23 #include <linux/slab.h>
24 #include <linux/preempt.h>
25 #include <linux/sched.h>
26 #include <linux/delay.h>
27 #include <linux/export.h>
29 #include <linux/anon_inodes.h>
30 #include <linux/cpumask.h>
31 #include <linux/spinlock.h>
32 #include <linux/page-flags.h>
35 #include <asm/cputable.h>
36 #include <asm/cacheflush.h>
37 #include <asm/tlbflush.h>
38 #include <asm/uaccess.h>
40 #include <asm/kvm_ppc.h>
41 #include <asm/kvm_book3s.h>
42 #include <asm/mmu_context.h>
43 #include <asm/lppaca.h>
44 #include <asm/processor.h>
45 #include <asm/cputhreads.h>
47 #include <linux/gfp.h>
48 #include <linux/sched.h>
49 #include <linux/vmalloc.h>
50 #include <linux/highmem.h>
53 * For now, limit memory to 64GB and require it to be large pages.
54 * This value is chosen because it makes the ram_pginfo array be
55 * 64kB in size, which is about as large as we want to be trying
56 * to allocate with kmalloc.
58 #define MAX_MEM_ORDER 36
60 #define LARGE_PAGE_ORDER 24 /* 16MB pages */
62 /* #define EXIT_DEBUG */
63 /* #define EXIT_DEBUG_SIMPLE */
64 /* #define EXIT_DEBUG_INT */
66 void kvmppc_core_vcpu_load(struct kvm_vcpu
*vcpu
, int cpu
)
68 local_paca
->kvm_hstate
.kvm_vcpu
= vcpu
;
69 local_paca
->kvm_hstate
.kvm_vcore
= vcpu
->arch
.vcore
;
72 void kvmppc_core_vcpu_put(struct kvm_vcpu
*vcpu
)
76 static void kvmppc_vcpu_blocked(struct kvm_vcpu
*vcpu
);
77 static void kvmppc_vcpu_unblocked(struct kvm_vcpu
*vcpu
);
79 void kvmppc_vcpu_block(struct kvm_vcpu
*vcpu
)
82 unsigned long dec_nsec
;
85 if (now
>= vcpu
->arch
.dec_expires
&& !kvmppc_core_pending_dec(vcpu
))
86 kvmppc_core_queue_dec(vcpu
);
87 if (vcpu
->arch
.pending_exceptions
)
89 if (vcpu
->arch
.dec_expires
!= ~(u64
)0) {
90 dec_nsec
= (vcpu
->arch
.dec_expires
- now
) * NSEC_PER_SEC
/
92 hrtimer_start(&vcpu
->arch
.dec_timer
, ktime_set(0, dec_nsec
),
96 kvmppc_vcpu_blocked(vcpu
);
99 vcpu
->stat
.halt_wakeup
++;
101 if (vcpu
->arch
.dec_expires
!= ~(u64
)0)
102 hrtimer_try_to_cancel(&vcpu
->arch
.dec_timer
);
104 kvmppc_vcpu_unblocked(vcpu
);
107 void kvmppc_set_msr(struct kvm_vcpu
*vcpu
, u64 msr
)
109 vcpu
->arch
.shregs
.msr
= msr
;
112 void kvmppc_set_pvr(struct kvm_vcpu
*vcpu
, u32 pvr
)
114 vcpu
->arch
.pvr
= pvr
;
117 void kvmppc_dump_regs(struct kvm_vcpu
*vcpu
)
121 pr_err("vcpu %p (%d):\n", vcpu
, vcpu
->vcpu_id
);
122 pr_err("pc = %.16lx msr = %.16llx trap = %x\n",
123 vcpu
->arch
.pc
, vcpu
->arch
.shregs
.msr
, vcpu
->arch
.trap
);
124 for (r
= 0; r
< 16; ++r
)
125 pr_err("r%2d = %.16lx r%d = %.16lx\n",
126 r
, kvmppc_get_gpr(vcpu
, r
),
127 r
+16, kvmppc_get_gpr(vcpu
, r
+16));
128 pr_err("ctr = %.16lx lr = %.16lx\n",
129 vcpu
->arch
.ctr
, vcpu
->arch
.lr
);
130 pr_err("srr0 = %.16llx srr1 = %.16llx\n",
131 vcpu
->arch
.shregs
.srr0
, vcpu
->arch
.shregs
.srr1
);
132 pr_err("sprg0 = %.16llx sprg1 = %.16llx\n",
133 vcpu
->arch
.shregs
.sprg0
, vcpu
->arch
.shregs
.sprg1
);
134 pr_err("sprg2 = %.16llx sprg3 = %.16llx\n",
135 vcpu
->arch
.shregs
.sprg2
, vcpu
->arch
.shregs
.sprg3
);
136 pr_err("cr = %.8x xer = %.16lx dsisr = %.8x\n",
137 vcpu
->arch
.cr
, vcpu
->arch
.xer
, vcpu
->arch
.shregs
.dsisr
);
138 pr_err("dar = %.16llx\n", vcpu
->arch
.shregs
.dar
);
139 pr_err("fault dar = %.16lx dsisr = %.8x\n",
140 vcpu
->arch
.fault_dar
, vcpu
->arch
.fault_dsisr
);
141 pr_err("SLB (%d entries):\n", vcpu
->arch
.slb_max
);
142 for (r
= 0; r
< vcpu
->arch
.slb_max
; ++r
)
143 pr_err(" ESID = %.16llx VSID = %.16llx\n",
144 vcpu
->arch
.slb
[r
].orige
, vcpu
->arch
.slb
[r
].origv
);
145 pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n",
146 vcpu
->kvm
->arch
.lpcr
, vcpu
->kvm
->arch
.sdr1
,
147 vcpu
->arch
.last_inst
);
150 struct kvm_vcpu
*kvmppc_find_vcpu(struct kvm
*kvm
, int id
)
153 struct kvm_vcpu
*v
, *ret
= NULL
;
155 mutex_lock(&kvm
->lock
);
156 kvm_for_each_vcpu(r
, v
, kvm
) {
157 if (v
->vcpu_id
== id
) {
162 mutex_unlock(&kvm
->lock
);
166 static void init_vpa(struct kvm_vcpu
*vcpu
, struct lppaca
*vpa
)
168 vpa
->shared_proc
= 1;
169 vpa
->yield_count
= 1;
172 static unsigned long do_h_register_vpa(struct kvm_vcpu
*vcpu
,
174 unsigned long vcpuid
, unsigned long vpa
)
176 struct kvm
*kvm
= vcpu
->kvm
;
177 unsigned long pg_index
, ra
, len
;
178 unsigned long pg_offset
;
180 struct kvm_vcpu
*tvcpu
;
182 tvcpu
= kvmppc_find_vcpu(kvm
, vcpuid
);
188 if (flags
== 0 || flags
== 4)
193 /* registering new area; convert logical addr to real */
194 pg_index
= vpa
>> kvm
->arch
.ram_porder
;
195 pg_offset
= vpa
& (kvm
->arch
.ram_psize
- 1);
196 if (pg_index
>= kvm
->arch
.ram_npages
)
198 if (kvm
->arch
.ram_pginfo
[pg_index
].pfn
== 0)
200 ra
= kvm
->arch
.ram_pginfo
[pg_index
].pfn
<< PAGE_SHIFT
;
204 len
= *(unsigned short *)(va
+ 4);
206 len
= *(unsigned int *)(va
+ 4);
207 if (pg_offset
+ len
> kvm
->arch
.ram_psize
)
210 case 1: /* register VPA */
213 tvcpu
->arch
.vpa
= va
;
216 case 2: /* register DTL */
219 if (!tvcpu
->arch
.vpa
)
222 tvcpu
->arch
.dtl
= va
;
223 tvcpu
->arch
.dtl_end
= va
+ len
;
225 case 3: /* register SLB shadow buffer */
228 if (!tvcpu
->arch
.vpa
)
230 tvcpu
->arch
.slb_shadow
= va
;
231 len
= (len
- 16) / 16;
232 tvcpu
->arch
.slb_shadow
= va
;
237 case 5: /* unregister VPA */
238 if (tvcpu
->arch
.slb_shadow
|| tvcpu
->arch
.dtl
)
240 tvcpu
->arch
.vpa
= NULL
;
242 case 6: /* unregister DTL */
243 tvcpu
->arch
.dtl
= NULL
;
245 case 7: /* unregister SLB shadow buffer */
246 tvcpu
->arch
.slb_shadow
= NULL
;
253 int kvmppc_pseries_do_hcall(struct kvm_vcpu
*vcpu
)
255 unsigned long req
= kvmppc_get_gpr(vcpu
, 3);
256 unsigned long target
, ret
= H_SUCCESS
;
257 struct kvm_vcpu
*tvcpu
;
261 vcpu
->arch
.shregs
.msr
|= MSR_EE
;
262 vcpu
->arch
.ceded
= 1;
264 if (!vcpu
->arch
.prodded
)
265 kvmppc_vcpu_block(vcpu
);
267 vcpu
->arch
.prodded
= 0;
269 vcpu
->arch
.ceded
= 0;
272 target
= kvmppc_get_gpr(vcpu
, 4);
273 tvcpu
= kvmppc_find_vcpu(vcpu
->kvm
, target
);
278 tvcpu
->arch
.prodded
= 1;
280 if (vcpu
->arch
.ceded
) {
281 if (waitqueue_active(&vcpu
->wq
)) {
282 wake_up_interruptible(&vcpu
->wq
);
283 vcpu
->stat
.halt_wakeup
++;
290 ret
= do_h_register_vpa(vcpu
, kvmppc_get_gpr(vcpu
, 4),
291 kvmppc_get_gpr(vcpu
, 5),
292 kvmppc_get_gpr(vcpu
, 6));
297 kvmppc_set_gpr(vcpu
, 3, ret
);
298 vcpu
->arch
.hcall_needed
= 0;
302 static int kvmppc_handle_exit(struct kvm_run
*run
, struct kvm_vcpu
*vcpu
,
303 struct task_struct
*tsk
)
307 vcpu
->stat
.sum_exits
++;
309 run
->exit_reason
= KVM_EXIT_UNKNOWN
;
310 run
->ready_for_interrupt_injection
= 1;
311 switch (vcpu
->arch
.trap
) {
312 /* We're good on these - the host merely wanted to get our attention */
313 case BOOK3S_INTERRUPT_HV_DECREMENTER
:
314 vcpu
->stat
.dec_exits
++;
317 case BOOK3S_INTERRUPT_EXTERNAL
:
318 vcpu
->stat
.ext_intr_exits
++;
321 case BOOK3S_INTERRUPT_PERFMON
:
324 case BOOK3S_INTERRUPT_PROGRAM
:
328 * Normally program interrupts are delivered directly
329 * to the guest by the hardware, but we can get here
330 * as a result of a hypervisor emulation interrupt
331 * (e40) getting turned into a 700 by BML RTAS.
333 flags
= vcpu
->arch
.shregs
.msr
& 0x1f0000ull
;
334 kvmppc_core_queue_program(vcpu
, flags
);
338 case BOOK3S_INTERRUPT_SYSCALL
:
340 /* hcall - punt to userspace */
343 if (vcpu
->arch
.shregs
.msr
& MSR_PR
) {
344 /* sc 1 from userspace - reflect to guest syscall */
345 kvmppc_book3s_queue_irqprio(vcpu
, BOOK3S_INTERRUPT_SYSCALL
);
349 run
->papr_hcall
.nr
= kvmppc_get_gpr(vcpu
, 3);
350 for (i
= 0; i
< 9; ++i
)
351 run
->papr_hcall
.args
[i
] = kvmppc_get_gpr(vcpu
, 4 + i
);
352 run
->exit_reason
= KVM_EXIT_PAPR_HCALL
;
353 vcpu
->arch
.hcall_needed
= 1;
358 * We get these next two if the guest does a bad real-mode access,
359 * as we have enabled VRMA (virtualized real mode area) mode in the
360 * LPCR. We just generate an appropriate DSI/ISI to the guest.
362 case BOOK3S_INTERRUPT_H_DATA_STORAGE
:
363 vcpu
->arch
.shregs
.dsisr
= vcpu
->arch
.fault_dsisr
;
364 vcpu
->arch
.shregs
.dar
= vcpu
->arch
.fault_dar
;
365 kvmppc_inject_interrupt(vcpu
, BOOK3S_INTERRUPT_DATA_STORAGE
, 0);
368 case BOOK3S_INTERRUPT_H_INST_STORAGE
:
369 kvmppc_inject_interrupt(vcpu
, BOOK3S_INTERRUPT_INST_STORAGE
,
374 * This occurs if the guest executes an illegal instruction.
375 * We just generate a program interrupt to the guest, since
376 * we don't emulate any guest instructions at this stage.
378 case BOOK3S_INTERRUPT_H_EMUL_ASSIST
:
379 kvmppc_core_queue_program(vcpu
, 0x80000);
383 kvmppc_dump_regs(vcpu
);
384 printk(KERN_EMERG
"trap=0x%x | pc=0x%lx | msr=0x%llx\n",
385 vcpu
->arch
.trap
, kvmppc_get_pc(vcpu
),
386 vcpu
->arch
.shregs
.msr
);
393 if (!(r
& RESUME_HOST
)) {
394 /* To avoid clobbering exit_reason, only check for signals if
395 * we aren't already exiting to userspace for some other
397 if (signal_pending(tsk
)) {
398 vcpu
->stat
.signal_exits
++;
399 run
->exit_reason
= KVM_EXIT_INTR
;
402 kvmppc_core_deliver_interrupts(vcpu
);
409 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu
*vcpu
,
410 struct kvm_sregs
*sregs
)
414 sregs
->pvr
= vcpu
->arch
.pvr
;
416 memset(sregs
, 0, sizeof(struct kvm_sregs
));
417 for (i
= 0; i
< vcpu
->arch
.slb_max
; i
++) {
418 sregs
->u
.s
.ppc64
.slb
[i
].slbe
= vcpu
->arch
.slb
[i
].orige
;
419 sregs
->u
.s
.ppc64
.slb
[i
].slbv
= vcpu
->arch
.slb
[i
].origv
;
425 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu
*vcpu
,
426 struct kvm_sregs
*sregs
)
430 kvmppc_set_pvr(vcpu
, sregs
->pvr
);
433 for (i
= 0; i
< vcpu
->arch
.slb_nr
; i
++) {
434 if (sregs
->u
.s
.ppc64
.slb
[i
].slbe
& SLB_ESID_V
) {
435 vcpu
->arch
.slb
[j
].orige
= sregs
->u
.s
.ppc64
.slb
[i
].slbe
;
436 vcpu
->arch
.slb
[j
].origv
= sregs
->u
.s
.ppc64
.slb
[i
].slbv
;
440 vcpu
->arch
.slb_max
= j
;
445 int kvmppc_core_check_processor_compat(void)
447 if (cpu_has_feature(CPU_FTR_HVMODE
))
452 struct kvm_vcpu
*kvmppc_core_vcpu_create(struct kvm
*kvm
, unsigned int id
)
454 struct kvm_vcpu
*vcpu
;
457 struct kvmppc_vcore
*vcore
;
459 core
= id
/ threads_per_core
;
460 if (core
>= KVM_MAX_VCORES
)
464 vcpu
= kzalloc(sizeof(struct kvm_vcpu
), GFP_KERNEL
);
468 err
= kvm_vcpu_init(vcpu
, kvm
, id
);
472 vcpu
->arch
.shared
= &vcpu
->arch
.shregs
;
473 vcpu
->arch
.last_cpu
= -1;
474 vcpu
->arch
.mmcr
[0] = MMCR0_FC
;
475 vcpu
->arch
.ctrl
= CTRL_RUNLATCH
;
476 /* default to host PVR, since we can't spoof it */
477 vcpu
->arch
.pvr
= mfspr(SPRN_PVR
);
478 kvmppc_set_pvr(vcpu
, vcpu
->arch
.pvr
);
480 kvmppc_mmu_book3s_hv_init(vcpu
);
483 * Some vcpus may start out in stopped state. If we initialize
484 * them to busy-in-host state they will stop other vcpus in the
485 * vcore from running. Instead we initialize them to blocked
486 * state, effectively considering them to be stopped until we
487 * see the first run ioctl for them.
489 vcpu
->arch
.state
= KVMPPC_VCPU_BLOCKED
;
491 init_waitqueue_head(&vcpu
->arch
.cpu_run
);
493 mutex_lock(&kvm
->lock
);
494 vcore
= kvm
->arch
.vcores
[core
];
496 vcore
= kzalloc(sizeof(struct kvmppc_vcore
), GFP_KERNEL
);
498 INIT_LIST_HEAD(&vcore
->runnable_threads
);
499 spin_lock_init(&vcore
->lock
);
501 kvm
->arch
.vcores
[core
] = vcore
;
503 mutex_unlock(&kvm
->lock
);
508 spin_lock(&vcore
->lock
);
509 ++vcore
->num_threads
;
511 spin_unlock(&vcore
->lock
);
512 vcpu
->arch
.vcore
= vcore
;
522 void kvmppc_core_vcpu_free(struct kvm_vcpu
*vcpu
)
524 kvm_vcpu_uninit(vcpu
);
528 static void kvmppc_vcpu_blocked(struct kvm_vcpu
*vcpu
)
530 struct kvmppc_vcore
*vc
= vcpu
->arch
.vcore
;
532 spin_lock(&vc
->lock
);
533 vcpu
->arch
.state
= KVMPPC_VCPU_BLOCKED
;
535 if (vc
->n_runnable
> 0 &&
536 vc
->n_runnable
+ vc
->n_blocked
== vc
->num_threads
) {
537 vcpu
= list_first_entry(&vc
->runnable_threads
, struct kvm_vcpu
,
539 wake_up(&vcpu
->arch
.cpu_run
);
541 spin_unlock(&vc
->lock
);
544 static void kvmppc_vcpu_unblocked(struct kvm_vcpu
*vcpu
)
546 struct kvmppc_vcore
*vc
= vcpu
->arch
.vcore
;
548 spin_lock(&vc
->lock
);
549 vcpu
->arch
.state
= KVMPPC_VCPU_BUSY_IN_HOST
;
551 spin_unlock(&vc
->lock
);
554 extern int __kvmppc_vcore_entry(struct kvm_run
*kvm_run
, struct kvm_vcpu
*vcpu
);
555 extern void xics_wake_cpu(int cpu
);
557 static void kvmppc_remove_runnable(struct kvmppc_vcore
*vc
,
558 struct kvm_vcpu
*vcpu
)
562 if (vcpu
->arch
.state
!= KVMPPC_VCPU_RUNNABLE
)
564 vcpu
->arch
.state
= KVMPPC_VCPU_BUSY_IN_HOST
;
566 /* decrement the physical thread id of each following vcpu */
568 list_for_each_entry_continue(v
, &vc
->runnable_threads
, arch
.run_list
)
570 list_del(&vcpu
->arch
.run_list
);
573 static void kvmppc_start_thread(struct kvm_vcpu
*vcpu
)
576 struct paca_struct
*tpaca
;
577 struct kvmppc_vcore
*vc
= vcpu
->arch
.vcore
;
579 cpu
= vc
->pcpu
+ vcpu
->arch
.ptid
;
581 tpaca
->kvm_hstate
.kvm_vcpu
= vcpu
;
582 tpaca
->kvm_hstate
.kvm_vcore
= vc
;
584 #ifdef CONFIG_PPC_ICP_NATIVE
585 if (vcpu
->arch
.ptid
) {
586 tpaca
->cpu_start
= 0x80;
587 tpaca
->kvm_hstate
.in_guest
= KVM_GUEST_MODE_GUEST
;
595 static void kvmppc_wait_for_nap(struct kvmppc_vcore
*vc
)
601 while (vc
->nap_count
< vc
->n_woken
) {
602 if (++i
>= 1000000) {
603 pr_err("kvmppc_wait_for_nap timeout %d %d\n",
604 vc
->nap_count
, vc
->n_woken
);
613 * Check that we are on thread 0 and that any other threads in
614 * this core are off-line.
616 static int on_primary_thread(void)
618 int cpu
= smp_processor_id();
619 int thr
= cpu_thread_in_core(cpu
);
623 while (++thr
< threads_per_core
)
624 if (cpu_online(cpu
+ thr
))
630 * Run a set of guest threads on a physical core.
631 * Called with vc->lock held.
633 static int kvmppc_run_core(struct kvmppc_vcore
*vc
)
635 struct kvm_vcpu
*vcpu
, *vnext
;
639 /* don't start if any threads have a signal pending */
640 list_for_each_entry(vcpu
, &vc
->runnable_threads
, arch
.run_list
)
641 if (signal_pending(vcpu
->arch
.run_task
))
645 * Make sure we are running on thread 0, and that
646 * secondary threads are offline.
647 * XXX we should also block attempts to bring any
648 * secondary threads online.
650 if (threads_per_core
> 1 && !on_primary_thread()) {
651 list_for_each_entry(vcpu
, &vc
->runnable_threads
, arch
.run_list
)
652 vcpu
->arch
.ret
= -EBUSY
;
658 vc
->entry_exit_count
= 0;
659 vc
->vcore_running
= 1;
661 vc
->pcpu
= smp_processor_id();
662 list_for_each_entry(vcpu
, &vc
->runnable_threads
, arch
.run_list
)
663 kvmppc_start_thread(vcpu
);
664 vcpu
= list_first_entry(&vc
->runnable_threads
, struct kvm_vcpu
,
667 spin_unlock(&vc
->lock
);
671 __kvmppc_vcore_entry(NULL
, vcpu
);
673 /* wait for secondary threads to finish writing their state to memory */
674 spin_lock(&vc
->lock
);
675 if (vc
->nap_count
< vc
->n_woken
)
676 kvmppc_wait_for_nap(vc
);
677 /* prevent other vcpu threads from doing kvmppc_start_thread() now */
678 vc
->vcore_running
= 2;
679 spin_unlock(&vc
->lock
);
681 /* make sure updates to secondary vcpu structs are visible now */
689 list_for_each_entry(vcpu
, &vc
->runnable_threads
, arch
.run_list
) {
690 /* cancel pending dec exception if dec is positive */
691 if (now
< vcpu
->arch
.dec_expires
&&
692 kvmppc_core_pending_dec(vcpu
))
693 kvmppc_core_dequeue_dec(vcpu
);
694 if (!vcpu
->arch
.trap
) {
695 if (signal_pending(vcpu
->arch
.run_task
)) {
696 vcpu
->arch
.kvm_run
->exit_reason
= KVM_EXIT_INTR
;
697 vcpu
->arch
.ret
= -EINTR
;
699 continue; /* didn't get to run */
701 ret
= kvmppc_handle_exit(vcpu
->arch
.kvm_run
, vcpu
,
702 vcpu
->arch
.run_task
);
703 vcpu
->arch
.ret
= ret
;
707 spin_lock(&vc
->lock
);
709 vc
->vcore_running
= 0;
710 list_for_each_entry_safe(vcpu
, vnext
, &vc
->runnable_threads
,
712 if (vcpu
->arch
.ret
!= RESUME_GUEST
) {
713 kvmppc_remove_runnable(vc
, vcpu
);
714 wake_up(&vcpu
->arch
.cpu_run
);
721 static int kvmppc_run_vcpu(struct kvm_run
*kvm_run
, struct kvm_vcpu
*vcpu
)
725 struct kvmppc_vcore
*vc
;
728 /* No need to go into the guest when all we do is going out */
729 if (signal_pending(current
)) {
730 kvm_run
->exit_reason
= KVM_EXIT_INTR
;
734 /* On PPC970, check that we have an RMA region */
735 if (!vcpu
->kvm
->arch
.rma
&& cpu_has_feature(CPU_FTR_ARCH_201
))
738 kvm_run
->exit_reason
= 0;
739 vcpu
->arch
.ret
= RESUME_GUEST
;
742 flush_fp_to_thread(current
);
743 flush_altivec_to_thread(current
);
744 flush_vsx_to_thread(current
);
747 * Synchronize with other threads in this virtual core
749 vc
= vcpu
->arch
.vcore
;
750 spin_lock(&vc
->lock
);
751 /* This happens the first time this is called for a vcpu */
752 if (vcpu
->arch
.state
== KVMPPC_VCPU_BLOCKED
)
754 vcpu
->arch
.state
= KVMPPC_VCPU_RUNNABLE
;
755 ptid
= vc
->n_runnable
;
756 vcpu
->arch
.run_task
= current
;
757 vcpu
->arch
.kvm_run
= kvm_run
;
758 vcpu
->arch
.ptid
= ptid
;
759 list_add_tail(&vcpu
->arch
.run_list
, &vc
->runnable_threads
);
762 wait_state
= TASK_INTERRUPTIBLE
;
763 while (vcpu
->arch
.state
== KVMPPC_VCPU_RUNNABLE
) {
764 if (signal_pending(current
)) {
765 if (!vc
->vcore_running
) {
766 kvm_run
->exit_reason
= KVM_EXIT_INTR
;
767 vcpu
->arch
.ret
= -EINTR
;
770 /* have to wait for vcore to stop executing guest */
771 wait_state
= TASK_UNINTERRUPTIBLE
;
772 smp_send_reschedule(vc
->pcpu
);
775 if (!vc
->vcore_running
&&
776 vc
->n_runnable
+ vc
->n_blocked
== vc
->num_threads
) {
778 if (kvmppc_run_core(vc
))
782 if (vc
->vcore_running
== 1 && VCORE_EXIT_COUNT(vc
) == 0)
783 kvmppc_start_thread(vcpu
);
785 /* wait for other threads to come in, or wait for vcore */
786 prepare_to_wait(&vcpu
->arch
.cpu_run
, &wait
, wait_state
);
787 spin_unlock(&vc
->lock
);
789 finish_wait(&vcpu
->arch
.cpu_run
, &wait
);
790 spin_lock(&vc
->lock
);
793 if (vcpu
->arch
.state
== KVMPPC_VCPU_RUNNABLE
)
794 kvmppc_remove_runnable(vc
, vcpu
);
795 spin_unlock(&vc
->lock
);
797 return vcpu
->arch
.ret
;
800 int kvmppc_vcpu_run(struct kvm_run
*run
, struct kvm_vcpu
*vcpu
)
805 r
= kvmppc_run_vcpu(run
, vcpu
);
807 if (run
->exit_reason
== KVM_EXIT_PAPR_HCALL
&&
808 !(vcpu
->arch
.shregs
.msr
& MSR_PR
)) {
809 r
= kvmppc_pseries_do_hcall(vcpu
);
810 kvmppc_core_deliver_interrupts(vcpu
);
812 } while (r
== RESUME_GUEST
);
816 static long kvmppc_stt_npages(unsigned long window_size
)
818 return ALIGN((window_size
>> SPAPR_TCE_SHIFT
)
819 * sizeof(u64
), PAGE_SIZE
) / PAGE_SIZE
;
822 static void release_spapr_tce_table(struct kvmppc_spapr_tce_table
*stt
)
824 struct kvm
*kvm
= stt
->kvm
;
827 mutex_lock(&kvm
->lock
);
828 list_del(&stt
->list
);
829 for (i
= 0; i
< kvmppc_stt_npages(stt
->window_size
); i
++)
830 __free_page(stt
->pages
[i
]);
832 mutex_unlock(&kvm
->lock
);
837 static int kvm_spapr_tce_fault(struct vm_area_struct
*vma
, struct vm_fault
*vmf
)
839 struct kvmppc_spapr_tce_table
*stt
= vma
->vm_file
->private_data
;
842 if (vmf
->pgoff
>= kvmppc_stt_npages(stt
->window_size
))
843 return VM_FAULT_SIGBUS
;
845 page
= stt
->pages
[vmf
->pgoff
];
851 static const struct vm_operations_struct kvm_spapr_tce_vm_ops
= {
852 .fault
= kvm_spapr_tce_fault
,
855 static int kvm_spapr_tce_mmap(struct file
*file
, struct vm_area_struct
*vma
)
857 vma
->vm_ops
= &kvm_spapr_tce_vm_ops
;
861 static int kvm_spapr_tce_release(struct inode
*inode
, struct file
*filp
)
863 struct kvmppc_spapr_tce_table
*stt
= filp
->private_data
;
865 release_spapr_tce_table(stt
);
869 static struct file_operations kvm_spapr_tce_fops
= {
870 .mmap
= kvm_spapr_tce_mmap
,
871 .release
= kvm_spapr_tce_release
,
874 long kvm_vm_ioctl_create_spapr_tce(struct kvm
*kvm
,
875 struct kvm_create_spapr_tce
*args
)
877 struct kvmppc_spapr_tce_table
*stt
= NULL
;
882 /* Check this LIOBN hasn't been previously allocated */
883 list_for_each_entry(stt
, &kvm
->arch
.spapr_tce_tables
, list
) {
884 if (stt
->liobn
== args
->liobn
)
888 npages
= kvmppc_stt_npages(args
->window_size
);
890 stt
= kzalloc(sizeof(*stt
) + npages
* sizeof(struct page
*),
895 stt
->liobn
= args
->liobn
;
896 stt
->window_size
= args
->window_size
;
899 for (i
= 0; i
< npages
; i
++) {
900 stt
->pages
[i
] = alloc_page(GFP_KERNEL
| __GFP_ZERO
);
907 mutex_lock(&kvm
->lock
);
908 list_add(&stt
->list
, &kvm
->arch
.spapr_tce_tables
);
910 mutex_unlock(&kvm
->lock
);
912 return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops
,
917 for (i
= 0; i
< npages
; i
++)
919 __free_page(stt
->pages
[i
]);
926 /* Work out RMLS (real mode limit selector) field value for a given RMA size.
927 Assumes POWER7 or PPC970. */
928 static inline int lpcr_rmls(unsigned long rma_size
)
931 case 32ul << 20: /* 32 MB */
932 if (cpu_has_feature(CPU_FTR_ARCH_206
))
933 return 8; /* only supported on POWER7 */
935 case 64ul << 20: /* 64 MB */
937 case 128ul << 20: /* 128 MB */
939 case 256ul << 20: /* 256 MB */
941 case 1ul << 30: /* 1 GB */
943 case 16ul << 30: /* 16 GB */
945 case 256ul << 30: /* 256 GB */
952 static int kvm_rma_fault(struct vm_area_struct
*vma
, struct vm_fault
*vmf
)
954 struct kvmppc_rma_info
*ri
= vma
->vm_file
->private_data
;
957 if (vmf
->pgoff
>= ri
->npages
)
958 return VM_FAULT_SIGBUS
;
960 page
= pfn_to_page(ri
->base_pfn
+ vmf
->pgoff
);
966 static const struct vm_operations_struct kvm_rma_vm_ops
= {
967 .fault
= kvm_rma_fault
,
970 static int kvm_rma_mmap(struct file
*file
, struct vm_area_struct
*vma
)
972 vma
->vm_flags
|= VM_RESERVED
;
973 vma
->vm_ops
= &kvm_rma_vm_ops
;
977 static int kvm_rma_release(struct inode
*inode
, struct file
*filp
)
979 struct kvmppc_rma_info
*ri
= filp
->private_data
;
985 static struct file_operations kvm_rma_fops
= {
986 .mmap
= kvm_rma_mmap
,
987 .release
= kvm_rma_release
,
990 long kvm_vm_ioctl_allocate_rma(struct kvm
*kvm
, struct kvm_allocate_rma
*ret
)
992 struct kvmppc_rma_info
*ri
;
995 ri
= kvm_alloc_rma();
999 fd
= anon_inode_getfd("kvm-rma", &kvm_rma_fops
, ri
, O_RDWR
);
1001 kvm_release_rma(ri
);
1003 ret
->rma_size
= ri
->npages
<< PAGE_SHIFT
;
1007 static struct page
*hva_to_page(unsigned long addr
)
1009 struct page
*page
[1];
1014 npages
= get_user_pages_fast(addr
, 1, 1, page
);
1016 if (unlikely(npages
!= 1))
1022 int kvmppc_core_prepare_memory_region(struct kvm
*kvm
,
1023 struct kvm_userspace_memory_region
*mem
)
1025 unsigned long psize
, porder
;
1026 unsigned long i
, npages
, totalpages
;
1027 unsigned long pg_ix
;
1028 struct kvmppc_pginfo
*pginfo
;
1030 struct kvmppc_rma_info
*ri
= NULL
;
1033 /* For now, only allow 16MB pages */
1034 porder
= LARGE_PAGE_ORDER
;
1035 psize
= 1ul << porder
;
1036 if ((mem
->memory_size
& (psize
- 1)) ||
1037 (mem
->guest_phys_addr
& (psize
- 1))) {
1038 pr_err("bad memory_size=%llx @ %llx\n",
1039 mem
->memory_size
, mem
->guest_phys_addr
);
1043 npages
= mem
->memory_size
>> porder
;
1044 totalpages
= (mem
->guest_phys_addr
+ mem
->memory_size
) >> porder
;
1046 /* More memory than we have space to track? */
1047 if (totalpages
> (1ul << (MAX_MEM_ORDER
- LARGE_PAGE_ORDER
)))
1050 /* Do we already have an RMA registered? */
1051 if (mem
->guest_phys_addr
== 0 && kvm
->arch
.rma
)
1054 if (totalpages
> kvm
->arch
.ram_npages
)
1055 kvm
->arch
.ram_npages
= totalpages
;
1057 /* Is this one of our preallocated RMAs? */
1058 if (mem
->guest_phys_addr
== 0) {
1059 struct vm_area_struct
*vma
;
1061 down_read(¤t
->mm
->mmap_sem
);
1062 vma
= find_vma(current
->mm
, mem
->userspace_addr
);
1063 if (vma
&& vma
->vm_file
&&
1064 vma
->vm_file
->f_op
== &kvm_rma_fops
&&
1065 mem
->userspace_addr
== vma
->vm_start
)
1066 ri
= vma
->vm_file
->private_data
;
1067 up_read(¤t
->mm
->mmap_sem
);
1068 if (!ri
&& cpu_has_feature(CPU_FTR_ARCH_201
)) {
1069 pr_err("CPU requires an RMO\n");
1075 unsigned long rma_size
;
1079 rma_size
= ri
->npages
<< PAGE_SHIFT
;
1080 if (rma_size
> mem
->memory_size
)
1081 rma_size
= mem
->memory_size
;
1082 rmls
= lpcr_rmls(rma_size
);
1084 pr_err("Can't use RMA of 0x%lx bytes\n", rma_size
);
1087 atomic_inc(&ri
->use_count
);
1089 kvm
->arch
.n_rma_pages
= rma_size
>> porder
;
1091 /* Update LPCR and RMOR */
1092 lpcr
= kvm
->arch
.lpcr
;
1093 if (cpu_has_feature(CPU_FTR_ARCH_201
)) {
1094 /* PPC970; insert RMLS value (split field) in HID4 */
1095 lpcr
&= ~((1ul << HID4_RMLS0_SH
) |
1096 (3ul << HID4_RMLS2_SH
));
1097 lpcr
|= ((rmls
>> 2) << HID4_RMLS0_SH
) |
1098 ((rmls
& 3) << HID4_RMLS2_SH
);
1099 /* RMOR is also in HID4 */
1100 lpcr
|= ((ri
->base_pfn
>> (26 - PAGE_SHIFT
)) & 0xffff)
1104 lpcr
&= ~(LPCR_VPM0
| LPCR_VRMA_L
);
1105 lpcr
|= rmls
<< LPCR_RMLS_SH
;
1106 kvm
->arch
.rmor
= kvm
->arch
.rma
->base_pfn
<< PAGE_SHIFT
;
1108 kvm
->arch
.lpcr
= lpcr
;
1109 pr_info("Using RMO at %lx size %lx (LPCR = %lx)\n",
1110 ri
->base_pfn
<< PAGE_SHIFT
, rma_size
, lpcr
);
1113 pg_ix
= mem
->guest_phys_addr
>> porder
;
1114 pginfo
= kvm
->arch
.ram_pginfo
+ pg_ix
;
1115 for (i
= 0; i
< npages
; ++i
, ++pg_ix
) {
1116 if (ri
&& pg_ix
< kvm
->arch
.n_rma_pages
) {
1117 pginfo
[i
].pfn
= ri
->base_pfn
+
1118 (pg_ix
<< (porder
- PAGE_SHIFT
));
1121 hva
= mem
->userspace_addr
+ (i
<< porder
);
1122 page
= hva_to_page(hva
);
1124 pr_err("oops, no pfn for hva %lx\n", hva
);
1127 /* Check it's a 16MB page */
1128 if (!PageHead(page
) ||
1129 compound_order(page
) != (LARGE_PAGE_ORDER
- PAGE_SHIFT
)) {
1130 pr_err("page at %lx isn't 16MB (o=%d)\n",
1131 hva
, compound_order(page
));
1134 pginfo
[i
].pfn
= page_to_pfn(page
);
1143 void kvmppc_core_commit_memory_region(struct kvm
*kvm
,
1144 struct kvm_userspace_memory_region
*mem
)
1146 if (mem
->guest_phys_addr
== 0 && mem
->memory_size
!= 0 &&
1148 kvmppc_map_vrma(kvm
, mem
);
1151 int kvmppc_core_init_vm(struct kvm
*kvm
)
1154 unsigned long npages
= 1ul << (MAX_MEM_ORDER
- LARGE_PAGE_ORDER
);
1158 /* Allocate hashed page table */
1159 r
= kvmppc_alloc_hpt(kvm
);
1163 INIT_LIST_HEAD(&kvm
->arch
.spapr_tce_tables
);
1165 kvm
->arch
.ram_pginfo
= kzalloc(npages
* sizeof(struct kvmppc_pginfo
),
1167 if (!kvm
->arch
.ram_pginfo
) {
1168 pr_err("kvmppc_core_init_vm: couldn't alloc %lu bytes\n",
1169 npages
* sizeof(struct kvmppc_pginfo
));
1173 kvm
->arch
.ram_npages
= 0;
1174 kvm
->arch
.ram_psize
= 1ul << LARGE_PAGE_ORDER
;
1175 kvm
->arch
.ram_porder
= LARGE_PAGE_ORDER
;
1176 kvm
->arch
.rma
= NULL
;
1177 kvm
->arch
.n_rma_pages
= 0;
1179 kvm
->arch
.host_sdr1
= mfspr(SPRN_SDR1
);
1181 if (cpu_has_feature(CPU_FTR_ARCH_201
)) {
1182 /* PPC970; HID4 is effectively the LPCR */
1183 unsigned long lpid
= kvm
->arch
.lpid
;
1184 kvm
->arch
.host_lpid
= 0;
1185 kvm
->arch
.host_lpcr
= lpcr
= mfspr(SPRN_HID4
);
1186 lpcr
&= ~((3 << HID4_LPID1_SH
) | (0xful
<< HID4_LPID5_SH
));
1187 lpcr
|= ((lpid
>> 4) << HID4_LPID1_SH
) |
1188 ((lpid
& 0xf) << HID4_LPID5_SH
);
1190 /* POWER7; init LPCR for virtual RMA mode */
1191 kvm
->arch
.host_lpid
= mfspr(SPRN_LPID
);
1192 kvm
->arch
.host_lpcr
= lpcr
= mfspr(SPRN_LPCR
);
1193 lpcr
&= LPCR_PECE
| LPCR_LPES
;
1194 lpcr
|= (4UL << LPCR_DPFD_SH
) | LPCR_HDICE
|
1195 LPCR_VPM0
| LPCR_VRMA_L
;
1197 kvm
->arch
.lpcr
= lpcr
;
1202 kvmppc_free_hpt(kvm
);
1206 void kvmppc_core_destroy_vm(struct kvm
*kvm
)
1208 struct kvmppc_pginfo
*pginfo
;
1211 if (kvm
->arch
.ram_pginfo
) {
1212 pginfo
= kvm
->arch
.ram_pginfo
;
1213 kvm
->arch
.ram_pginfo
= NULL
;
1214 for (i
= kvm
->arch
.n_rma_pages
; i
< kvm
->arch
.ram_npages
; ++i
)
1216 put_page(pfn_to_page(pginfo
[i
].pfn
));
1219 if (kvm
->arch
.rma
) {
1220 kvm_release_rma(kvm
->arch
.rma
);
1221 kvm
->arch
.rma
= NULL
;
1224 kvmppc_free_hpt(kvm
);
1225 WARN_ON(!list_empty(&kvm
->arch
.spapr_tce_tables
));
1228 /* These are stubs for now */
1229 void kvmppc_mmu_pte_pflush(struct kvm_vcpu
*vcpu
, ulong pa_start
, ulong pa_end
)
1233 /* We don't need to emulate any privileged instructions or dcbz */
1234 int kvmppc_core_emulate_op(struct kvm_run
*run
, struct kvm_vcpu
*vcpu
,
1235 unsigned int inst
, int *advance
)
1237 return EMULATE_FAIL
;
1240 int kvmppc_core_emulate_mtspr(struct kvm_vcpu
*vcpu
, int sprn
, int rs
)
1242 return EMULATE_FAIL
;
1245 int kvmppc_core_emulate_mfspr(struct kvm_vcpu
*vcpu
, int sprn
, int rt
)
1247 return EMULATE_FAIL
;
1250 static int kvmppc_book3s_hv_init(void)
1254 r
= kvm_init(NULL
, sizeof(struct kvm_vcpu
), 0, THIS_MODULE
);
1259 r
= kvmppc_mmu_hv_init();
1264 static void kvmppc_book3s_hv_exit(void)
1269 module_init(kvmppc_book3s_hv_init
);
1270 module_exit(kvmppc_book3s_hv_exit
);