/*
 * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
 *
 * Authors:
 *    Paul Mackerras <paulus@au1.ibm.com>
 *    Alexander Graf <agraf@suse.de>
 *    Kevin Wolf <mail@kevin-wolf.de>
 *
 * Description: KVM functions specific to running on Book 3S
 * processors in hypervisor mode (specifically POWER7 and later).
 *
 * This file is derived from arch/powerpc/kvm/book3s.c,
 * by Alexander Graf <agraf@suse.de>.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 */
#include <linux/kvm_host.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/preempt.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/fs.h>
#include <linux/anon_inodes.h>
#include <linux/cpumask.h>
#include <linux/spinlock.h>
#include <linux/page-flags.h>

#include <asm/reg.h>
#include <asm/cputable.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/mmu_context.h>
#include <asm/lppaca.h>
#include <asm/processor.h>
#include <asm/cputhreads.h>
#include <asm/page.h>
#include <asm/hvcall.h>
#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
/*
 * For now, limit memory to 64GB and require it to be large pages.
 * This value is chosen because it makes the ram_pginfo array be
 * 64kB in size, which is about as large as we want to be trying
 * to allocate with kmalloc.
 */
#define MAX_MEM_ORDER		36

#define LARGE_PAGE_ORDER	24	/* 16MB pages */
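/*
 * Sizing note (follows from the comment above): 1 << (MAX_MEM_ORDER -
 * LARGE_PAGE_ORDER) gives 4096 ram_pginfo entries; at the stated 64kB
 * total that is 16 bytes per entry, still a comfortable kmalloc() size.
 */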
/* #define EXIT_DEBUG */
/* #define EXIT_DEBUG_SIMPLE */
/* #define EXIT_DEBUG_INT */
static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
        local_paca->kvm_hstate.kvm_vcpu = vcpu;
        local_paca->kvm_hstate.kvm_vcore = vcpu->arch.vcore;
}
void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
{
}
void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
{
        vcpu->arch.shregs.msr = msr;
        kvmppc_end_cede(vcpu);
}
void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
{
        vcpu->arch.pvr = pvr;
}
void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
{
        int r;

        pr_err("vcpu %p (%d):\n", vcpu, vcpu->vcpu_id);
        pr_err("pc = %.16lx msr = %.16llx trap = %x\n",
               vcpu->arch.pc, vcpu->arch.shregs.msr, vcpu->arch.trap);
        for (r = 0; r < 16; ++r)
                pr_err("r%2d = %.16lx r%d = %.16lx\n",
                       r, kvmppc_get_gpr(vcpu, r),
                       r+16, kvmppc_get_gpr(vcpu, r+16));
        pr_err("ctr = %.16lx lr = %.16lx\n",
               vcpu->arch.ctr, vcpu->arch.lr);
        pr_err("srr0 = %.16llx srr1 = %.16llx\n",
               vcpu->arch.shregs.srr0, vcpu->arch.shregs.srr1);
        pr_err("sprg0 = %.16llx sprg1 = %.16llx\n",
               vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1);
        pr_err("sprg2 = %.16llx sprg3 = %.16llx\n",
               vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3);
        pr_err("cr = %.8x xer = %.16lx dsisr = %.8x\n",
               vcpu->arch.cr, vcpu->arch.xer, vcpu->arch.shregs.dsisr);
        pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar);
        pr_err("fault dar = %.16lx dsisr = %.8x\n",
               vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
        pr_err("SLB (%d entries):\n", vcpu->arch.slb_max);
        for (r = 0; r < vcpu->arch.slb_max; ++r)
                pr_err(" ESID = %.16llx VSID = %.16llx\n",
                       vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv);
        pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n",
               vcpu->kvm->arch.lpcr, vcpu->kvm->arch.sdr1,
               vcpu->arch.last_inst);
}
struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id)
{
        int r;
        struct kvm_vcpu *v, *ret = NULL;

        mutex_lock(&kvm->lock);
        kvm_for_each_vcpu(r, v, kvm) {
                if (v->vcpu_id == id) {
                        ret = v;
                        break;
                }
        }
        mutex_unlock(&kvm->lock);
        return ret;
}
static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa)
{
        vpa->shared_proc = 1;
        vpa->yield_count = 1;
}
static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
                                       unsigned long flags,
                                       unsigned long vcpuid, unsigned long vpa)
{
        struct kvm *kvm = vcpu->kvm;
        unsigned long pg_index, ra, len;
        unsigned long pg_offset;
        void *va;
        struct kvm_vcpu *tvcpu;

        tvcpu = kvmppc_find_vcpu(kvm, vcpuid);
        if (!tvcpu)
                return H_PARAMETER;

        if (flags == 0 || flags == 4)
                return H_PARAMETER;
        if (flags < 4) {
                /* registering new area; convert logical addr to real */
                pg_index = vpa >> kvm->arch.ram_porder;
                pg_offset = vpa & (kvm->arch.ram_psize - 1);
                if (pg_index >= kvm->arch.ram_npages)
                        return H_PARAMETER;
                if (kvm->arch.ram_pginfo[pg_index].pfn == 0)
                        return H_PARAMETER;
                ra = kvm->arch.ram_pginfo[pg_index].pfn << PAGE_SHIFT;
                ra |= pg_offset;
                va = __va(ra);
                if (flags <= 1)
                        len = *(unsigned short *)(va + 4);
                else
                        len = *(unsigned int *)(va + 4);
                if (pg_offset + len > kvm->arch.ram_psize)
                        return H_PARAMETER;
                switch (flags) {
                case 1:         /* register VPA */
                        tvcpu->arch.vpa = va;
                        init_vpa(vcpu, va);
                        break;
                case 2:         /* register DTL */
                        if (!tvcpu->arch.vpa)
                                return H_RESOURCE;
                        tvcpu->arch.dtl = va;
                        tvcpu->arch.dtl_end = va + len;
                        break;
                case 3:         /* register SLB shadow buffer */
                        if (!tvcpu->arch.vpa)
                                return H_RESOURCE;
                        tvcpu->arch.slb_shadow = va;
                        len = (len - 16) / 16;
                        tvcpu->arch.slb_shadow = va;
                        break;
                }
        } else {
                switch (flags) {
                case 5:         /* unregister VPA */
                        if (tvcpu->arch.slb_shadow || tvcpu->arch.dtl)
                                return H_RESOURCE;
                        tvcpu->arch.vpa = NULL;
                        break;
                case 6:         /* unregister DTL */
                        tvcpu->arch.dtl = NULL;
                        break;
                case 7:         /* unregister SLB shadow buffer */
                        tvcpu->arch.slb_shadow = NULL;
                        break;
                }
        }
        return H_SUCCESS;
}
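/*
 * Handle a hypercall that was not handled in real mode.  If the hcall
 * is handled here, R3 is set to the return code and RESUME_GUEST is
 * returned; otherwise the KVM_EXIT_PAPR_HCALL exit set up in
 * kvmppc_handle_exit() is left for userspace to handle.
 */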
int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
{
        unsigned long req = kvmppc_get_gpr(vcpu, 3);
        unsigned long target, ret = H_SUCCESS;
        struct kvm_vcpu *tvcpu;

        switch (req) {
        case H_CEDE:
                break;
        case H_PROD:
                target = kvmppc_get_gpr(vcpu, 4);
                tvcpu = kvmppc_find_vcpu(vcpu->kvm, target);
                if (!tvcpu) {
                        ret = H_PARAMETER;
                        break;
                }
                tvcpu->arch.prodded = 1;
                smp_mb();
                if (vcpu->arch.ceded) {
                        if (waitqueue_active(&vcpu->wq)) {
                                wake_up_interruptible(&vcpu->wq);
                                vcpu->stat.halt_wakeup++;
                        }
                }
                break;
        case H_CONFER:
                break;
        case H_REGISTER_VPA:
                ret = do_h_register_vpa(vcpu, kvmppc_get_gpr(vcpu, 4),
                                        kvmppc_get_gpr(vcpu, 5),
                                        kvmppc_get_gpr(vcpu, 6));
                break;
        default:
                return RESUME_HOST;
        }
        kvmppc_set_gpr(vcpu, 3, ret);
        vcpu->arch.hcall_needed = 0;
        return RESUME_GUEST;
}
static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
                              struct task_struct *tsk)
{
        int r = RESUME_HOST;

        vcpu->stat.sum_exits++;

        run->exit_reason = KVM_EXIT_UNKNOWN;
        run->ready_for_interrupt_injection = 1;
        switch (vcpu->arch.trap) {
        /* We're good on these - the host merely wanted to get our attention */
        case BOOK3S_INTERRUPT_HV_DECREMENTER:
                vcpu->stat.dec_exits++;
                r = RESUME_GUEST;
                break;
        case BOOK3S_INTERRUPT_EXTERNAL:
                vcpu->stat.ext_intr_exits++;
                r = RESUME_GUEST;
                break;
        case BOOK3S_INTERRUPT_PERFMON:
                r = RESUME_GUEST;
                break;
        case BOOK3S_INTERRUPT_PROGRAM:
        {
                ulong flags;
                /*
                 * Normally program interrupts are delivered directly
                 * to the guest by the hardware, but we can get here
                 * as a result of a hypervisor emulation interrupt
                 * (e40) getting turned into a 700 by BML RTAS.
                 */
                flags = vcpu->arch.shregs.msr & 0x1f0000ull;
                kvmppc_core_queue_program(vcpu, flags);
                r = RESUME_GUEST;
                break;
        }
        case BOOK3S_INTERRUPT_SYSCALL:
        {
                /* hcall - punt to userspace */
                int i;

                if (vcpu->arch.shregs.msr & MSR_PR) {
                        /* sc 1 from userspace - reflect to guest syscall */
                        kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_SYSCALL);
                        r = RESUME_GUEST;
                        break;
                }
                run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3);
                for (i = 0; i < 9; ++i)
                        run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i);
                run->exit_reason = KVM_EXIT_PAPR_HCALL;
                vcpu->arch.hcall_needed = 1;
                r = RESUME_HOST;
                break;
        }
        /*
         * We get these next two if the guest does a bad real-mode access,
         * as we have enabled VRMA (virtualized real mode area) mode in the
         * LPCR.  We just generate an appropriate DSI/ISI to the guest.
         */
        case BOOK3S_INTERRUPT_H_DATA_STORAGE:
                vcpu->arch.shregs.dsisr = vcpu->arch.fault_dsisr;
                vcpu->arch.shregs.dar = vcpu->arch.fault_dar;
                kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, 0);
                r = RESUME_GUEST;
                break;
        case BOOK3S_INTERRUPT_H_INST_STORAGE:
                kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_INST_STORAGE,
                                        0x08000000);
                r = RESUME_GUEST;
                break;
        /*
         * This occurs if the guest executes an illegal instruction.
         * We just generate a program interrupt to the guest, since
         * we don't emulate any guest instructions at this stage.
         */
        case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
                kvmppc_core_queue_program(vcpu, 0x80000);
                r = RESUME_GUEST;
                break;
        default:
                kvmppc_dump_regs(vcpu);
                printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
                        vcpu->arch.trap, kvmppc_get_pc(vcpu),
                        vcpu->arch.shregs.msr);
                r = RESUME_HOST;
                break;
        }

        return r;
}
int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
                                  struct kvm_sregs *sregs)
{
        int i;

        /* zero everything first so the PVR set below is not wiped out */
        memset(sregs, 0, sizeof(struct kvm_sregs));
        sregs->pvr = vcpu->arch.pvr;
        for (i = 0; i < vcpu->arch.slb_max; i++) {
                sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige;
                sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;
        }

        return 0;
}
int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
                                  struct kvm_sregs *sregs)
{
        int i, j;

        kvmppc_set_pvr(vcpu, sregs->pvr);

        j = 0;
        for (i = 0; i < vcpu->arch.slb_nr; i++) {
                if (sregs->u.s.ppc64.slb[i].slbe & SLB_ESID_V) {
                        vcpu->arch.slb[j].orige = sregs->u.s.ppc64.slb[i].slbe;
                        vcpu->arch.slb[j].origv = sregs->u.s.ppc64.slb[i].slbv;
                        ++j;
                }
        }
        vcpu->arch.slb_max = j;

        return 0;
}
int kvmppc_core_check_processor_compat(void)
{
        if (cpu_has_feature(CPU_FTR_HVMODE))
                return 0;
        return -EIO;
}
struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
{
        struct kvm_vcpu *vcpu;
        int err = -EINVAL;
        int core;
        struct kvmppc_vcore *vcore;

        core = id / threads_per_core;
        if (core >= KVM_MAX_VCORES)
                goto out;

        err = -ENOMEM;
        vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
        if (!vcpu)
                goto out;

        err = kvm_vcpu_init(vcpu, kvm, id);
        if (err)
                goto free_vcpu;

        vcpu->arch.shared = &vcpu->arch.shregs;
        vcpu->arch.last_cpu = -1;
        vcpu->arch.mmcr[0] = MMCR0_FC;
        vcpu->arch.ctrl = CTRL_RUNLATCH;
        /* default to host PVR, since we can't spoof it */
        vcpu->arch.pvr = mfspr(SPRN_PVR);
        kvmppc_set_pvr(vcpu, vcpu->arch.pvr);

        kvmppc_mmu_book3s_hv_init(vcpu);

        /*
         * We consider the vcpu stopped until we see the first run ioctl for it.
         */
        vcpu->arch.state = KVMPPC_VCPU_STOPPED;

        init_waitqueue_head(&vcpu->arch.cpu_run);

        mutex_lock(&kvm->lock);
        vcore = kvm->arch.vcores[core];
        if (!vcore) {
                vcore = kzalloc(sizeof(struct kvmppc_vcore), GFP_KERNEL);
                if (vcore) {
                        INIT_LIST_HEAD(&vcore->runnable_threads);
                        spin_lock_init(&vcore->lock);
                        init_waitqueue_head(&vcore->wq);
                }
                kvm->arch.vcores[core] = vcore;
        }
        mutex_unlock(&kvm->lock);

        if (!vcore)
                goto free_vcpu;

        spin_lock(&vcore->lock);
        ++vcore->num_threads;
        spin_unlock(&vcore->lock);
        vcpu->arch.vcore = vcore;

        vcpu->arch.cpu_type = KVM_CPU_3S_64;
        kvmppc_sanity_check(vcpu);

        return vcpu;

free_vcpu:
        kfree(vcpu);
out:
        return ERR_PTR(err);
}
void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
{
        kvm_vcpu_uninit(vcpu);
        kfree(vcpu);
}
static void kvmppc_set_timer(struct kvm_vcpu *vcpu)
{
        unsigned long dec_nsec, now;

        now = get_tb();
        if (now > vcpu->arch.dec_expires) {
                /* decrementer has already gone negative */
                kvmppc_core_queue_dec(vcpu);
                kvmppc_core_deliver_interrupts(vcpu);
                return;
        }
        dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC
                / tb_ticks_per_sec;
        hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec),
                      HRTIMER_MODE_REL);
        vcpu->arch.timer_running = 1;
}
static void kvmppc_end_cede(struct kvm_vcpu *vcpu)
{
        vcpu->arch.ceded = 0;
        if (vcpu->arch.timer_running) {
                hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
                vcpu->arch.timer_running = 0;
        }
}
extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
extern void xics_wake_cpu(int cpu);
static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
                                   struct kvm_vcpu *vcpu)
{
        struct kvm_vcpu *v;

        if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
                return;
        vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
        --vc->n_runnable;
        ++vc->n_busy;
        /* decrement the physical thread id of each following vcpu */
        v = vcpu;
        list_for_each_entry_continue(v, &vc->runnable_threads, arch.run_list)
                --v->arch.ptid;
        list_del(&vcpu->arch.run_list);
}
static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
{
        int cpu;
        struct paca_struct *tpaca;
        struct kvmppc_vcore *vc = vcpu->arch.vcore;

        if (vcpu->arch.timer_running) {
                hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
                vcpu->arch.timer_running = 0;
        }
        cpu = vc->pcpu + vcpu->arch.ptid;
        tpaca = &paca[cpu];
        tpaca->kvm_hstate.kvm_vcpu = vcpu;
        tpaca->kvm_hstate.kvm_vcore = vc;
        tpaca->kvm_hstate.napping = 0;
        vcpu->cpu = vc->pcpu;
        smp_wmb();
#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
        if (vcpu->arch.ptid) {
                tpaca->cpu_start = 0x80;
                wmb();
                xics_wake_cpu(cpu);
                ++vc->n_woken;
        }
#endif
}
static void kvmppc_wait_for_nap(struct kvmppc_vcore *vc)
{
        int i = 0;

        while (vc->nap_count < vc->n_woken) {
                if (++i >= 1000000) {
                        pr_err("kvmppc_wait_for_nap timeout %d %d\n",
                               vc->nap_count, vc->n_woken);
                        break;
                }
                cpu_relax();
        }
}
/*
 * Check that we are on thread 0 and that any other threads in
 * this core are off-line.
 */
static int on_primary_thread(void)
{
        int cpu = smp_processor_id();
        int thr = cpu_thread_in_core(cpu);

        if (thr)
                return 0;
        while (++thr < threads_per_core)
                if (cpu_online(cpu + thr))
                        return 0;
        return 1;
}
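/*
 * On POWER7-style cores the secondary hardware threads must be offline
 * in the host (checked by on_primary_thread() above).  kvmppc_run_core()
 * wakes them via xics_wake_cpu() to run guest vcpus and then waits, in
 * kvmppc_wait_for_nap(), for them to go back to nap before returning.
 */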
/*
 * Run a set of guest threads on a physical core.
 * Called with vc->lock held.
 */
static int kvmppc_run_core(struct kvmppc_vcore *vc)
{
        struct kvm_vcpu *vcpu, *vcpu0, *vnext;
        long ret;
        u64 now;
        int ptid;

        /* don't start if any threads have a signal pending */
        list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
                if (signal_pending(vcpu->arch.run_task))
                        return 0;

        /*
         * Make sure we are running on thread 0, and that
         * secondary threads are offline.
         * XXX we should also block attempts to bring any
         * secondary threads online.
         */
        if (threads_per_core > 1 && !on_primary_thread()) {
                list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
                        vcpu->arch.ret = -EBUSY;
                goto out;
        }

        /*
         * Assign physical thread IDs, first to non-ceded vcpus
         * and then to ceded ones.
         */
        ptid = 0;
        vcpu0 = NULL;
        list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
                if (!vcpu->arch.ceded) {
                        if (!ptid)
                                vcpu0 = vcpu;
                        vcpu->arch.ptid = ptid++;
                }
        }
        if (!vcpu0)
                return 0;               /* nothing to run */
        list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
                if (vcpu->arch.ceded)
                        vcpu->arch.ptid = ptid++;

        vc->n_woken = 0;
        vc->nap_count = 0;
        vc->entry_exit_count = 0;
        vc->vcore_state = VCORE_RUNNING;
        vc->pcpu = smp_processor_id();
        vc->napping_threads = 0;
        list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
                kvmppc_start_thread(vcpu);

        spin_unlock(&vc->lock);

        __kvmppc_vcore_entry(NULL, vcpu0);

        spin_lock(&vc->lock);
        /* disable sending of IPIs on virtual external irqs */
        list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
                vcpu->cpu = -1;
        /* wait for secondary threads to finish writing their state to memory */
        if (vc->nap_count < vc->n_woken)
                kvmppc_wait_for_nap(vc);
        /* prevent other vcpu threads from doing kvmppc_start_thread() now */
        vc->vcore_state = VCORE_EXITING;
        spin_unlock(&vc->lock);

        /* make sure updates to secondary vcpu structs are visible now */
        smp_mb();

        now = get_tb();
        list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
                /* cancel pending dec exception if dec is positive */
                if (now < vcpu->arch.dec_expires &&
                    kvmppc_core_pending_dec(vcpu))
                        kvmppc_core_dequeue_dec(vcpu);

                ret = RESUME_GUEST;
                if (vcpu->arch.trap)
                        ret = kvmppc_handle_exit(vcpu->arch.kvm_run, vcpu,
                                                 vcpu->arch.run_task);

                vcpu->arch.ret = ret;

                if (vcpu->arch.ceded) {
                        if (ret != RESUME_GUEST)
                                kvmppc_end_cede(vcpu);
                        else
                                kvmppc_set_timer(vcpu);
                }
        }

        spin_lock(&vc->lock);
 out:
        vc->vcore_state = VCORE_INACTIVE;
        list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
                                 arch.run_list) {
                if (vcpu->arch.ret != RESUME_GUEST) {
                        kvmppc_remove_runnable(vc, vcpu);
                        wake_up(&vcpu->arch.cpu_run);
                }
        }

        return 1;
}
/*
 * Wait for some other vcpu thread to execute us, and
 * wake us up when we need to handle something in the host.
 */
static void kvmppc_wait_for_exec(struct kvm_vcpu *vcpu, int wait_state)
{
        DEFINE_WAIT(wait);

        prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state);
        if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE)
                schedule();
        finish_wait(&vcpu->arch.cpu_run, &wait);
}
/*
 * All the vcpus in this vcore are idle, so wait for a decrementer
 * or external interrupt to one of the vcpus.  vc->lock is held.
 */
static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
{
        struct kvm_vcpu *v;
        int all_idle = 1;
        DEFINE_WAIT(wait);

        prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
        vc->vcore_state = VCORE_SLEEPING;
        spin_unlock(&vc->lock);
        list_for_each_entry(v, &vc->runnable_threads, arch.run_list) {
                if (!v->arch.ceded || v->arch.pending_exceptions) {
                        all_idle = 0;
                        break;
                }
        }
        if (all_idle)
                schedule();
        finish_wait(&vc->wq, &wait);
        spin_lock(&vc->lock);
        vc->vcore_state = VCORE_INACTIVE;
}
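/*
 * Each vcpu runs in its own userspace task, but all the vcpus of a
 * virtual core enter the guest together.  kvmppc_run_vcpu() adds this
 * vcpu to its vcore's runnable list and then either runs the whole
 * core (kvmppc_run_core()), sleeps with the other idle vcpus
 * (kvmppc_vcore_blocked()), or waits for another thread to run it.
 */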
static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
{
        int n_ceded;
        int prev_state;
        struct kvmppc_vcore *vc;
        struct kvm_vcpu *v, *vn;

        kvm_run->exit_reason = 0;
        vcpu->arch.ret = RESUME_GUEST;

        /*
         * Synchronize with other threads in this virtual core
         */
        vc = vcpu->arch.vcore;
        spin_lock(&vc->lock);
        vcpu->arch.ceded = 0;
        vcpu->arch.run_task = current;
        vcpu->arch.kvm_run = kvm_run;
        prev_state = vcpu->arch.state;
        vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
        list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads);
        ++vc->n_runnable;

        /*
         * This happens the first time this is called for a vcpu.
         * If the vcore is already running, we may be able to start
         * this thread straight away and have it join in.
         */
        if (prev_state == KVMPPC_VCPU_STOPPED) {
                if (vc->vcore_state == VCORE_RUNNING &&
                    VCORE_EXIT_COUNT(vc) == 0) {
                        vcpu->arch.ptid = vc->n_runnable - 1;
                        kvmppc_start_thread(vcpu);
                }

        } else if (prev_state == KVMPPC_VCPU_BUSY_IN_HOST)
                --vc->n_busy;

        while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
               !signal_pending(current)) {
                if (vc->n_busy || vc->vcore_state != VCORE_INACTIVE) {
                        spin_unlock(&vc->lock);
                        kvmppc_wait_for_exec(vcpu, TASK_INTERRUPTIBLE);
                        spin_lock(&vc->lock);
                        continue;
                }
                n_ceded = 0;
                list_for_each_entry(v, &vc->runnable_threads, arch.run_list)
                        n_ceded += v->arch.ceded;
                if (n_ceded == vc->n_runnable)
                        kvmppc_vcore_blocked(vc);
                else
                        kvmppc_run_core(vc);

                list_for_each_entry_safe(v, vn, &vc->runnable_threads,
                                         arch.run_list) {
                        kvmppc_core_deliver_interrupts(v);
                        if (signal_pending(v->arch.run_task)) {
                                kvmppc_remove_runnable(vc, v);
                                v->stat.signal_exits++;
                                v->arch.kvm_run->exit_reason = KVM_EXIT_INTR;
                                v->arch.ret = -EINTR;
                                wake_up(&v->arch.cpu_run);
                        }
                }
        }

        if (signal_pending(current)) {
                if (vc->vcore_state == VCORE_RUNNING ||
                    vc->vcore_state == VCORE_EXITING) {
                        spin_unlock(&vc->lock);
                        kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE);
                        spin_lock(&vc->lock);
                }
                if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
                        kvmppc_remove_runnable(vc, vcpu);
                        vcpu->stat.signal_exits++;
                        kvm_run->exit_reason = KVM_EXIT_INTR;
                        vcpu->arch.ret = -EINTR;
                }
        }

        spin_unlock(&vc->lock);
        return vcpu->arch.ret;
}
int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
{
        int r;

        if (!vcpu->arch.sane) {
                run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
                return -EINVAL;
        }

        /* No need to go into the guest when all we'll do is come back out */
        if (signal_pending(current)) {
                run->exit_reason = KVM_EXIT_INTR;
                return -EINTR;
        }

        /* On PPC970, check that we have an RMA region */
        if (!vcpu->kvm->arch.rma && cpu_has_feature(CPU_FTR_ARCH_201))
                return -EPERM;

        flush_fp_to_thread(current);
        flush_altivec_to_thread(current);
        flush_vsx_to_thread(current);
        vcpu->arch.wqp = &vcpu->arch.vcore->wq;

        do {
                r = kvmppc_run_vcpu(run, vcpu);

                if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
                    !(vcpu->arch.shregs.msr & MSR_PR)) {
                        r = kvmppc_pseries_do_hcall(vcpu);
                        kvmppc_core_deliver_interrupts(vcpu);
                }
        } while (r == RESUME_GUEST);

        return r;
}
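/*
 * Number of host pages needed to hold the TCE table for a TCE (DMA)
 * window of the given size.  For example, assuming 4kB TCE pages
 * (SPAPR_TCE_SHIFT == 12), a 256MB window needs 64k TCE entries of
 * 8 bytes each, i.e. 512kB of table, rounded up to whole host pages.
 */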
static long kvmppc_stt_npages(unsigned long window_size)
{
        return ALIGN((window_size >> SPAPR_TCE_SHIFT)
                     * sizeof(u64), PAGE_SIZE) / PAGE_SIZE;
}

static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt)
{
        struct kvm *kvm = stt->kvm;
        int i;

        mutex_lock(&kvm->lock);
        list_del(&stt->list);
        for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++)
                __free_page(stt->pages[i]);
        kfree(stt);
        mutex_unlock(&kvm->lock);

        kvm_put_kvm(kvm);
}
static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data;
        struct page *page;

        if (vmf->pgoff >= kvmppc_stt_npages(stt->window_size))
                return VM_FAULT_SIGBUS;

        page = stt->pages[vmf->pgoff];
        get_page(page);
        vmf->page = page;
        return 0;
}

static const struct vm_operations_struct kvm_spapr_tce_vm_ops = {
        .fault = kvm_spapr_tce_fault,
};

static int kvm_spapr_tce_mmap(struct file *file, struct vm_area_struct *vma)
{
        vma->vm_ops = &kvm_spapr_tce_vm_ops;
        return 0;
}

static int kvm_spapr_tce_release(struct inode *inode, struct file *filp)
{
        struct kvmppc_spapr_tce_table *stt = filp->private_data;

        release_spapr_tce_table(stt);
        return 0;
}

static struct file_operations kvm_spapr_tce_fops = {
        .mmap = kvm_spapr_tce_mmap,
        .release = kvm_spapr_tce_release,
};
long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
                                   struct kvm_create_spapr_tce *args)
{
        struct kvmppc_spapr_tce_table *stt = NULL;
        long npages;
        long ret = -ENOMEM;
        int i;

        /* Check this LIOBN hasn't been previously allocated */
        list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
                if (stt->liobn == args->liobn)
                        return -EBUSY;
        }

        npages = kvmppc_stt_npages(args->window_size);

        stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *),
                      GFP_KERNEL);
        if (!stt)
                goto fail;

        stt->liobn = args->liobn;
        stt->window_size = args->window_size;
        stt->kvm = kvm;

        for (i = 0; i < npages; i++) {
                stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
                if (!stt->pages[i])
                        goto fail;
        }

        kvm_get_kvm(kvm);

        mutex_lock(&kvm->lock);
        list_add(&stt->list, &kvm->arch.spapr_tce_tables);
        mutex_unlock(&kvm->lock);

        return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
                                stt, O_RDWR);

fail:
        if (stt) {
                for (i = 0; i < npages; i++)
                        if (stt->pages[i])
                                __free_page(stt->pages[i]);
                kfree(stt);
        }
        return ret;
}
/*
 * Work out RMLS (real mode limit selector) field value for a given
 * RMA size.  Assumes POWER7 or PPC970.
 */
static inline int lpcr_rmls(unsigned long rma_size)
{
        switch (rma_size) {
        case 32ul << 20:        /* 32 MB */
                if (cpu_has_feature(CPU_FTR_ARCH_206))
                        return 8;       /* only supported on POWER7 */
                return -1;
        case 64ul << 20:        /* 64 MB */
                return 3;
        case 128ul << 20:       /* 128 MB */
                return 7;
        case 256ul << 20:       /* 256 MB */
                return 4;
        case 1ul << 30:         /* 1 GB */
                return 2;
        case 16ul << 30:        /* 16 GB */
                return 1;
        case 256ul << 30:       /* 256 GB */
                return 0;
        default:
                return -1;
        }
}

static int kvm_rma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        struct kvmppc_rma_info *ri = vma->vm_file->private_data;
        struct page *page;

        if (vmf->pgoff >= ri->npages)
                return VM_FAULT_SIGBUS;

        page = pfn_to_page(ri->base_pfn + vmf->pgoff);
        get_page(page);
        vmf->page = page;
        return 0;
}

static const struct vm_operations_struct kvm_rma_vm_ops = {
        .fault = kvm_rma_fault,
};
static int kvm_rma_mmap(struct file *file, struct vm_area_struct *vma)
{
        vma->vm_flags |= VM_RESERVED;
        vma->vm_ops = &kvm_rma_vm_ops;
        return 0;
}

static int kvm_rma_release(struct inode *inode, struct file *filp)
{
        struct kvmppc_rma_info *ri = filp->private_data;

        kvm_release_rma(ri);
        return 0;
}

static struct file_operations kvm_rma_fops = {
        .mmap = kvm_rma_mmap,
        .release = kvm_rma_release,
};
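/*
 * KVM_ALLOCATE_RMA: hand a preallocated RMA (real mode area) to
 * userspace as an anonymous-inode fd.  Userspace mmaps the fd (served
 * by kvm_rma_fault() above) and then registers the mapping as guest
 * memory at guest physical address 0.
 */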
long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret)
{
        struct kvmppc_rma_info *ri;
        long fd;

        ri = kvm_alloc_rma();
        if (!ri)
                return -ENOMEM;

        fd = anon_inode_getfd("kvm-rma", &kvm_rma_fops, ri, O_RDWR);
        if (fd < 0)
                kvm_release_rma(ri);

        ret->rma_size = ri->npages << PAGE_SHIFT;
        return fd;
}

static struct page *hva_to_page(unsigned long addr)
{
        struct page *page[1];
        int npages;

        might_sleep();

        npages = get_user_pages_fast(addr, 1, 1, page);

        if (unlikely(npages != 1))
                return NULL;

        return page[0];
}

int kvmppc_core_prepare_memory_region(struct kvm *kvm,
                                struct kvm_userspace_memory_region *mem)
{
        unsigned long psize, porder;
        unsigned long i, npages, totalpages;
        unsigned long pg_ix;
        struct kvmppc_pginfo *pginfo;
        unsigned long hva;
        struct kvmppc_rma_info *ri = NULL;
        struct page *page;

        /* For now, only allow 16MB pages */
        porder = LARGE_PAGE_ORDER;
        psize = 1ul << porder;
        if ((mem->memory_size & (psize - 1)) ||
            (mem->guest_phys_addr & (psize - 1))) {
                pr_err("bad memory_size=%llx @ %llx\n",
                       mem->memory_size, mem->guest_phys_addr);
                return -EINVAL;
        }

        npages = mem->memory_size >> porder;
        totalpages = (mem->guest_phys_addr + mem->memory_size) >> porder;

        /* More memory than we have space to track? */
        if (totalpages > (1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER)))
                return -EINVAL;

        /* Do we already have an RMA registered? */
        if (mem->guest_phys_addr == 0 && kvm->arch.rma)
                return -EINVAL;

        if (totalpages > kvm->arch.ram_npages)
                kvm->arch.ram_npages = totalpages;

        /* Is this one of our preallocated RMAs? */
        if (mem->guest_phys_addr == 0) {
                struct vm_area_struct *vma;

                down_read(&current->mm->mmap_sem);
                vma = find_vma(current->mm, mem->userspace_addr);
                if (vma && vma->vm_file &&
                    vma->vm_file->f_op == &kvm_rma_fops &&
                    mem->userspace_addr == vma->vm_start)
                        ri = vma->vm_file->private_data;
                up_read(&current->mm->mmap_sem);
                if (!ri && cpu_has_feature(CPU_FTR_ARCH_201)) {
                        pr_err("CPU requires an RMO\n");
                        return -EINVAL;
                }
        }

        if (ri) {
                unsigned long rma_size;
                unsigned long lpcr;
                long rmls;

                rma_size = ri->npages << PAGE_SHIFT;
                if (rma_size > mem->memory_size)
                        rma_size = mem->memory_size;
                rmls = lpcr_rmls(rma_size);
                if (rmls < 0) {
                        pr_err("Can't use RMA of 0x%lx bytes\n", rma_size);
                        return -EINVAL;
                }
                atomic_inc(&ri->use_count);
                kvm->arch.rma = ri;
                kvm->arch.n_rma_pages = rma_size >> porder;

                /* Update LPCR and RMOR */
                lpcr = kvm->arch.lpcr;
                if (cpu_has_feature(CPU_FTR_ARCH_201)) {
                        /* PPC970; insert RMLS value (split field) in HID4 */
                        lpcr &= ~((1ul << HID4_RMLS0_SH) |
                                  (3ul << HID4_RMLS2_SH));
                        lpcr |= ((rmls >> 2) << HID4_RMLS0_SH) |
                                ((rmls & 3) << HID4_RMLS2_SH);
                        /* RMOR is also in HID4 */
                        lpcr |= ((ri->base_pfn >> (26 - PAGE_SHIFT)) & 0xffff)
                                << HID4_RMOR_SH;
                } else {
                        lpcr &= ~(LPCR_VPM0 | LPCR_VRMA_L);
                        lpcr |= rmls << LPCR_RMLS_SH;
                        kvm->arch.rmor = kvm->arch.rma->base_pfn << PAGE_SHIFT;
                }
                kvm->arch.lpcr = lpcr;
                pr_info("Using RMO at %lx size %lx (LPCR = %lx)\n",
                        ri->base_pfn << PAGE_SHIFT, rma_size, lpcr);
        }

        pg_ix = mem->guest_phys_addr >> porder;
        pginfo = kvm->arch.ram_pginfo + pg_ix;
        for (i = 0; i < npages; ++i, ++pg_ix) {
                if (ri && pg_ix < kvm->arch.n_rma_pages) {
                        pginfo[i].pfn = ri->base_pfn +
                                (pg_ix << (porder - PAGE_SHIFT));
                        continue;
                }
                hva = mem->userspace_addr + (i << porder);
                page = hva_to_page(hva);
                if (!page) {
                        pr_err("oops, no pfn for hva %lx\n", hva);
                        goto err;
                }
                /* Check it's a 16MB page */
                if (!PageHead(page) ||
                    compound_order(page) != (LARGE_PAGE_ORDER - PAGE_SHIFT)) {
                        pr_err("page at %lx isn't 16MB (o=%d)\n",
                               hva, compound_order(page));
                        goto err;
                }
                pginfo[i].pfn = page_to_pfn(page);
        }

        return 0;

 err:
        return -EINVAL;
}
*kvm
,
1194 struct kvm_userspace_memory_region
*mem
)
1196 if (mem
->guest_phys_addr
== 0 && mem
->memory_size
!= 0 &&
1198 kvmppc_map_vrma(kvm
, mem
);
1201 int kvmppc_core_init_vm(struct kvm
*kvm
)
1204 unsigned long npages
= 1ul << (MAX_MEM_ORDER
- LARGE_PAGE_ORDER
);
1208 /* Allocate hashed page table */
1209 r
= kvmppc_alloc_hpt(kvm
);
1213 INIT_LIST_HEAD(&kvm
->arch
.spapr_tce_tables
);
1215 kvm
->arch
.ram_pginfo
= kzalloc(npages
* sizeof(struct kvmppc_pginfo
),
1217 if (!kvm
->arch
.ram_pginfo
) {
1218 pr_err("kvmppc_core_init_vm: couldn't alloc %lu bytes\n",
1219 npages
* sizeof(struct kvmppc_pginfo
));
1223 kvm
->arch
.ram_npages
= 0;
1224 kvm
->arch
.ram_psize
= 1ul << LARGE_PAGE_ORDER
;
1225 kvm
->arch
.ram_porder
= LARGE_PAGE_ORDER
;
1226 kvm
->arch
.rma
= NULL
;
1227 kvm
->arch
.n_rma_pages
= 0;
1229 kvm
->arch
.host_sdr1
= mfspr(SPRN_SDR1
);
1231 if (cpu_has_feature(CPU_FTR_ARCH_201
)) {
1232 /* PPC970; HID4 is effectively the LPCR */
1233 unsigned long lpid
= kvm
->arch
.lpid
;
1234 kvm
->arch
.host_lpid
= 0;
1235 kvm
->arch
.host_lpcr
= lpcr
= mfspr(SPRN_HID4
);
1236 lpcr
&= ~((3 << HID4_LPID1_SH
) | (0xful
<< HID4_LPID5_SH
));
1237 lpcr
|= ((lpid
>> 4) << HID4_LPID1_SH
) |
1238 ((lpid
& 0xf) << HID4_LPID5_SH
);
1240 /* POWER7; init LPCR for virtual RMA mode */
1241 kvm
->arch
.host_lpid
= mfspr(SPRN_LPID
);
1242 kvm
->arch
.host_lpcr
= lpcr
= mfspr(SPRN_LPCR
);
1243 lpcr
&= LPCR_PECE
| LPCR_LPES
;
1244 lpcr
|= (4UL << LPCR_DPFD_SH
) | LPCR_HDICE
|
1245 LPCR_VPM0
| LPCR_VRMA_L
;
1247 kvm
->arch
.lpcr
= lpcr
;
1252 kvmppc_free_hpt(kvm
);
void kvmppc_core_destroy_vm(struct kvm *kvm)
{
        struct kvmppc_pginfo *pginfo;
        unsigned long i;

        if (kvm->arch.ram_pginfo) {
                pginfo = kvm->arch.ram_pginfo;
                kvm->arch.ram_pginfo = NULL;
                for (i = kvm->arch.n_rma_pages; i < kvm->arch.ram_npages; ++i)
                        if (pginfo[i].pfn)
                                put_page(pfn_to_page(pginfo[i].pfn));
                kfree(pginfo);
        }
        if (kvm->arch.rma) {
                kvm_release_rma(kvm->arch.rma);
                kvm->arch.rma = NULL;
        }

        kvmppc_free_hpt(kvm);
        WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
}
/* These are stubs for now */
void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
{
}
/* We don't need to emulate any privileged instructions or dcbz */
int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
                           unsigned int inst, int *advance)
{
        return EMULATE_FAIL;
}
int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
{
        return EMULATE_FAIL;
}
int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
{
        return EMULATE_FAIL;
}
static int kvmppc_book3s_hv_init(void)
{
        int r;

        r = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
        if (r)
                return r;

        r = kvmppc_mmu_hv_init();

        return r;
}

static void kvmppc_book3s_hv_exit(void)
{
        kvm_exit();
}

module_init(kvmppc_book3s_hv_init);
module_exit(kvmppc_book3s_hv_exit);