// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2017-2019, IBM Corporation.
 */

#define pr_fmt(fmt) "xive-kvm: " fmt

#include <linux/kernel.h>
#include <linux/kvm_host.h>
#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/spinlock.h>
#include <linux/delay.h>
#include <linux/file.h>
#include <asm/uaccess.h>
#include <asm/kvm_book3s.h>
#include <asm/kvm_ppc.h>
#include <asm/hvcall.h>
#include <asm/xive.h>
#include <asm/xive-regs.h>
#include <asm/debug.h>
#include <asm/debugfs.h>
#include <asm/opal.h>

#include <linux/debugfs.h>
#include <linux/seq_file.h>

#include "book3s_xive.h"
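/*
 * Helper to perform an ESB load on a source, working around the
 * XIVE_IRQ_FLAG_SHIFT_BUG firmware quirk by shifting the offset.
 */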
static u8 xive_vm_esb_load(struct xive_irq_data *xd, u32 offset)
{
	u64 val;

	if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG)
		offset |= offset << 4;

	val = in_be64(xd->eoi_mmio + offset);
	return (u8)val;
}
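/*
 * Disable the queue in OPAL and release the page backing it, if any.
 */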
static void kvmppc_xive_native_cleanup_queue(struct kvm_vcpu *vcpu, int prio)
{
	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
	struct xive_q *q = &xc->queues[prio];

	xive_native_disable_queue(xc->vp_id, q, prio);
	if (q->qpage) {
		put_page(virt_to_page(q->qpage));
		q->qpage = NULL;
	}
}
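/*
 * Configure a queue page in OPAL and, on success, drop the reference
 * on the previous queue page.
 */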
static int kvmppc_xive_native_configure_queue(u32 vp_id, struct xive_q *q,
					      u8 prio, __be32 *qpage,
					      u32 order, bool can_escalate)
{
	int rc;
	__be32 *qpage_prev = q->qpage;

	rc = xive_native_configure_queue(vp_id, q, prio, qpage, order,
					 can_escalate);
	if (rc)
		return rc;

	if (qpage_prev)
		put_page(virt_to_page(qpage_prev));

	return rc;
}
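/*
 * Tear down the XIVE native presenter of a vcpu: escalation interrupts,
 * the OPAL VP, and the event queues.
 */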
void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu)
{
	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
	int i;

	if (!kvmppc_xive_enabled(vcpu))
		return;

	if (!xc)
		return;

	pr_devel("native_cleanup_vcpu(cpu=%d)\n", xc->server_num);

	/* Ensure no interrupt is still routed to that VP */
	kvmppc_xive_disable_vcpu_interrupts(vcpu);

	/* Free escalations */
	for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
		/* Free the escalation irq */
		if (xc->esc_virq[i]) {
			if (xc->xive->single_escalation)
				xive_cleanup_single_escalation(vcpu, xc,
							       xc->esc_virq[i]);
			free_irq(xc->esc_virq[i], vcpu);
			irq_dispose_mapping(xc->esc_virq[i]);
			kfree(xc->esc_virq_names[i]);
			xc->esc_virq[i] = 0;
		}
	}

	/* Disable the VP */
	xive_native_disable_vp(xc->vp_id);

	/* Clear the cam word so guest entry won't try to push context */
	vcpu->arch.xive_cam_word = 0;

	/* Free the queues */
	for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++)
		kvmppc_xive_native_cleanup_queue(vcpu, i);

	/* Free the VP structure */
	kfree(xc);

	/* Cleanup the vcpu */
	vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
	vcpu->arch.xive_vcpu = NULL;
}
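/*
 * Attach a vcpu to the XIVE native device: allocate the presenter
 * structure, enable the VP in OPAL and set up the fields used by the
 * assembly push/pull code.
 */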
int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
				    struct kvm_vcpu *vcpu, u32 server_num)
{
	struct kvmppc_xive *xive = dev->private;
	struct kvmppc_xive_vcpu *xc = NULL;
	int rc;
	u32 vp_id;

	pr_devel("native_connect_vcpu(server=%d)\n", server_num);

	if (dev->ops != &kvm_xive_native_ops) {
		pr_devel("Wrong ops !\n");
		return -EPERM;
	}
	if (xive->kvm != vcpu->kvm)
		return -EPERM;
	if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
		return -EBUSY;

	mutex_lock(&xive->lock);

	rc = kvmppc_xive_compute_vp_id(xive, server_num, &vp_id);
	if (rc)
		goto bail;

	xc = kzalloc(sizeof(*xc), GFP_KERNEL);
	if (!xc) {
		rc = -ENOMEM;
		goto bail;
	}

	vcpu->arch.xive_vcpu = xc;
	xc->xive = xive;
	xc->vcpu = vcpu;
	xc->server_num = server_num;
	xc->vp_id = vp_id;
	vcpu->arch.irq_type = KVMPPC_IRQ_XIVE;

	rc = xive_native_get_vp_info(xc->vp_id, &xc->vp_cam, &xc->vp_chip_id);
	if (rc) {
		pr_err("Failed to get VP info from OPAL: %d\n", rc);
		goto bail;
	}

	/*
	 * Enable the VP first as the single escalation mode will
	 * affect escalation interrupts numbering
	 */
	rc = xive_native_enable_vp(xc->vp_id, xive->single_escalation);
	if (rc) {
		pr_err("Failed to enable VP in OPAL: %d\n", rc);
		goto bail;
	}

	/* Configure VCPU fields for use by assembly push/pull */
	vcpu->arch.xive_saved_state.w01 = cpu_to_be64(0xff000000);
	vcpu->arch.xive_cam_word = cpu_to_be32(xc->vp_cam | TM_QW1W2_VO);

	/* TODO: reset all queues to a clean state ? */
bail:
	mutex_unlock(&xive->lock);
	if (rc)
		kvmppc_xive_native_cleanup_vcpu(vcpu);

	return rc;
}
/*
 * Device passthrough support
 */
static int kvmppc_xive_native_reset_mapped(struct kvm *kvm, unsigned long irq)
{
	struct kvmppc_xive *xive = kvm->arch.xive;
	pgoff_t esb_pgoff = KVM_XIVE_ESB_PAGE_OFFSET + irq * 2;

	if (irq >= KVMPPC_XIVE_NR_IRQS)
		return -EINVAL;

	/*
	 * Clear the ESB pages of the IRQ number being mapped (or
	 * unmapped) into the guest and let the VM fault handler
	 * repopulate with the appropriate ESB pages (device or IC)
	 */
	pr_debug("clearing esb pages for girq 0x%lx\n", irq);
	mutex_lock(&xive->mapping_lock);
	if (xive->mapping)
		unmap_mapping_range(xive->mapping,
				    esb_pgoff << PAGE_SHIFT,
				    2ull << PAGE_SHIFT, 1);
	mutex_unlock(&xive->mapping_lock);

	return 0;
}
static struct kvmppc_xive_ops kvmppc_xive_native_ops = {
	.reset_mapped = kvmppc_xive_native_reset_mapped,
};
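/*
 * Fault handler for the ESB mapping: insert the trigger or EOI page
 * of the source corresponding to the faulting page offset.
 */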
static vm_fault_t xive_native_esb_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct kvm_device *dev = vma->vm_file->private_data;
	struct kvmppc_xive *xive = dev->private;
	struct kvmppc_xive_src_block *sb;
	struct kvmppc_xive_irq_state *state;
	struct xive_irq_data *xd;
	u32 hw_num;
	u16 src;
	u64 page;
	unsigned long irq;
	u64 page_offset;

	/*
	 * Linux/KVM uses a two pages ESB setting, one for trigger and
	 * one for EOI
	 */
	page_offset = vmf->pgoff - vma->vm_pgoff;
	irq = page_offset / 2;

	sb = kvmppc_xive_find_source(xive, irq, &src);
	if (!sb) {
		pr_devel("%s: source %lx not found !\n", __func__, irq);
		return VM_FAULT_SIGBUS;
	}

	state = &sb->irq_state[src];
	kvmppc_xive_select_irq(state, &hw_num, &xd);

	arch_spin_lock(&sb->lock);

	/*
	 * first/even page is for trigger
	 * second/odd page is for EOI and management.
	 */
	page = page_offset % 2 ? xd->eoi_page : xd->trig_page;
	arch_spin_unlock(&sb->lock);

	if (WARN_ON(!page)) {
		pr_err("%s: accessing invalid ESB page for source %lx !\n",
		       __func__, irq);
		return VM_FAULT_SIGBUS;
	}

	vmf_insert_pfn(vma, vmf->address, page >> PAGE_SHIFT);
	return VM_FAULT_NOPAGE;
}
static const struct vm_operations_struct xive_native_esb_vmops = {
	.fault = xive_native_esb_fault,
};
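/*
 * Fault handler for the TIMA mapping: only the OS page of the thread
 * interrupt management area can be mapped, the other pages are
 * forbidden.
 */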
static vm_fault_t xive_native_tima_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;

	switch (vmf->pgoff - vma->vm_pgoff) {
	case 0: /* HW - forbid access */
	case 1: /* HV - forbid access */
		return VM_FAULT_SIGBUS;
	case 2: /* OS */
		vmf_insert_pfn(vma, vmf->address, xive_tima_os >> PAGE_SHIFT);
		return VM_FAULT_NOPAGE;
	case 3: /* USER - TODO */
	default:
		return VM_FAULT_SIGBUS;
	}
}
static const struct vm_operations_struct xive_native_tima_vmops = {
	.fault = xive_native_tima_fault,
};
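/*
 * mmap handler of the device: only the TIMA and ESB ranges, at fixed
 * page offsets, can be mapped.
 */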
static int kvmppc_xive_native_mmap(struct kvm_device *dev,
				   struct vm_area_struct *vma)
{
	struct kvmppc_xive *xive = dev->private;

	/* We only allow mappings at fixed offset for now */
	if (vma->vm_pgoff == KVM_XIVE_TIMA_PAGE_OFFSET) {
		if (vma_pages(vma) > 4)
			return -EINVAL;
		vma->vm_ops = &xive_native_tima_vmops;
	} else if (vma->vm_pgoff == KVM_XIVE_ESB_PAGE_OFFSET) {
		if (vma_pages(vma) > KVMPPC_XIVE_NR_IRQS * 2)
			return -EINVAL;
		vma->vm_ops = &xive_native_esb_vmops;
	} else {
		return -EINVAL;
	}

	vma->vm_flags |= VM_IO | VM_PFNMAP;
	vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot);

	/*
	 * Grab the KVM device file address_space to be able to clear
	 * the ESB pages mapping when a device is passed-through into
	 * the guest.
	 */
	xive->mapping = vma->vm_file->f_mapping;
	return 0;
}
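/*
 * KVM_DEV_XIVE_GRP_SOURCE: create the source block if needed, allocate
 * an IPI for the source and leave it masked.
 */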
static int kvmppc_xive_native_set_source(struct kvmppc_xive *xive, long irq,
					 u64 addr)
{
	struct kvmppc_xive_src_block *sb;
	struct kvmppc_xive_irq_state *state;
	u64 __user *ubufp = (u64 __user *) addr;
	u64 val;
	u16 idx;
	int rc = 0;

	pr_devel("%s irq=0x%lx\n", __func__, irq);

	if (irq < KVMPPC_XIVE_FIRST_IRQ || irq >= KVMPPC_XIVE_NR_IRQS)
		return -E2BIG;

	sb = kvmppc_xive_find_source(xive, irq, &idx);
	if (!sb) {
		pr_debug("No source, creating source block...\n");
		sb = kvmppc_xive_create_src_block(xive, irq);
		if (!sb) {
			pr_err("Failed to create block...\n");
			return -ENOMEM;
		}
	}
	state = &sb->irq_state[idx];

	if (get_user(val, ubufp)) {
		pr_err("fault getting user info !\n");
		return -EFAULT;
	}

	arch_spin_lock(&sb->lock);

	/*
	 * If the source doesn't already have an IPI, allocate
	 * one and get the corresponding data
	 */
	if (!state->ipi_number) {
		state->ipi_number = xive_native_alloc_irq();
		if (state->ipi_number == 0) {
			pr_err("Failed to allocate IRQ !\n");
			rc = -ENXIO;
			goto unlock;
		}
		xive_native_populate_irq_data(state->ipi_number,
					      &state->ipi_data);
		pr_debug("%s allocated hw_irq=0x%x for irq=0x%lx\n", __func__,
			 state->ipi_number, irq);
	}

	/* Restore LSI state */
	if (val & KVM_XIVE_LEVEL_SENSITIVE) {
		state->lsi = true;
		if (val & KVM_XIVE_LEVEL_ASSERTED)
			state->asserted = true;
		pr_devel(" LSI ! Asserted=%d\n", state->asserted);
	}

	/* Mask IRQ to start with */
	state->act_server = 0;
	state->act_priority = MASKED;
	xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
	xive_native_configure_irq(state->ipi_number, 0, MASKED, 0);

	/* Increment the number of valid sources and mark this one valid */
	if (!state->valid)
		xive->src_count++;
	state->valid = true;

unlock:
	arch_spin_unlock(&sb->lock);

	return rc;
}
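/*
 * Route a source to a server/priority target, or mask it, under the
 * source block lock.
 */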
static int kvmppc_xive_native_update_source_config(struct kvmppc_xive *xive,
					struct kvmppc_xive_src_block *sb,
					struct kvmppc_xive_irq_state *state,
					u32 server, u8 priority, bool masked,
					u32 eisn)
{
	struct kvm *kvm = xive->kvm;
	u32 hw_num;
	int rc = 0;

	arch_spin_lock(&sb->lock);

	if (state->act_server == server && state->act_priority == priority &&
	    state->eisn == eisn)
		goto unlock;

	pr_devel("new_act_prio=%d new_act_server=%d mask=%d act_server=%d act_prio=%d\n",
		 priority, server, masked, state->act_server,
		 state->act_priority);

	kvmppc_xive_select_irq(state, &hw_num, NULL);

	if (priority != MASKED && !masked) {
		rc = kvmppc_xive_select_target(kvm, &server, priority);
		if (rc)
			goto unlock;

		state->act_priority = priority;
		state->act_server = server;
		state->eisn = eisn;

		rc = xive_native_configure_irq(hw_num,
					       kvmppc_xive_vp(xive, server),
					       priority, eisn);
	} else {
		state->act_priority = MASKED;
		state->act_server = 0;
		state->eisn = 0;

		rc = xive_native_configure_irq(hw_num, 0, MASKED, 0);
	}

unlock:
	arch_spin_unlock(&sb->lock);
	return rc;
}
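/*
 * KVM_DEV_XIVE_GRP_SOURCE_CONFIG: demangle the 64-bit configuration
 * word passed by userspace and apply it to the source.
 */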
static int kvmppc_xive_native_set_source_config(struct kvmppc_xive *xive,
						long irq, u64 addr)
{
	struct kvmppc_xive_src_block *sb;
	struct kvmppc_xive_irq_state *state;
	u64 __user *ubufp = (u64 __user *) addr;
	u16 src;
	u64 kvm_cfg;
	u32 server;
	u8 priority;
	bool masked;
	u32 eisn;

	sb = kvmppc_xive_find_source(xive, irq, &src);
	if (!sb)
		return -ENOENT;

	state = &sb->irq_state[src];

	if (get_user(kvm_cfg, ubufp))
		return -EFAULT;

	pr_devel("%s irq=0x%lx cfg=%016llx\n", __func__, irq, kvm_cfg);

	priority = (kvm_cfg & KVM_XIVE_SOURCE_PRIORITY_MASK) >>
		KVM_XIVE_SOURCE_PRIORITY_SHIFT;
	server = (kvm_cfg & KVM_XIVE_SOURCE_SERVER_MASK) >>
		KVM_XIVE_SOURCE_SERVER_SHIFT;
	masked = (kvm_cfg & KVM_XIVE_SOURCE_MASKED_MASK) >>
		KVM_XIVE_SOURCE_MASKED_SHIFT;
	eisn = (kvm_cfg & KVM_XIVE_SOURCE_EISN_MASK) >>
		KVM_XIVE_SOURCE_EISN_SHIFT;

	if (priority != xive_prio_from_guest(priority)) {
		pr_err("invalid priority for queue %d for VCPU %d\n",
		       priority, server);
		return -EINVAL;
	}

	return kvmppc_xive_native_update_source_config(xive, sb, state, server,
						       priority, masked, eisn);
}
static int kvmppc_xive_native_sync_source(struct kvmppc_xive *xive,
					  long irq, u64 addr)
{
	struct kvmppc_xive_src_block *sb;
	struct kvmppc_xive_irq_state *state;
	struct xive_irq_data *xd;
	u32 hw_num;
	u16 src;

	pr_devel("%s irq=0x%lx", __func__, irq);

	sb = kvmppc_xive_find_source(xive, irq, &src);
	if (!sb)
		return -ENOENT;

	state = &sb->irq_state[src];

	arch_spin_lock(&sb->lock);

	kvmppc_xive_select_irq(state, &hw_num, &xd);
	xive_native_sync_source(hw_num);

	arch_spin_unlock(&sb->lock);

	return 0;
}
static int xive_native_validate_queue_size(u32 qshift)
{
	/*
	 * We only support 64K pages for the moment. This is also
	 * advertised in the DT property "ibm,xive-eq-sizes"
	 */
	switch (qshift) {
	case 0: /* EQ reset */
	case 16: /* 64K page */
		return 0;
	default:
		return -EINVAL;
	}
}
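/*
 * KVM_DEV_XIVE_GRP_EQ_CONFIG: reset or configure the event queue of a
 * server/priority pair from a kvm_ppc_xive_eq passed by userspace.
 */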
static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive,
					       long eq_idx, u64 addr)
{
	struct kvm *kvm = xive->kvm;
	struct kvm_vcpu *vcpu;
	struct kvmppc_xive_vcpu *xc;
	void __user *ubufp = (void __user *) addr;
	u32 server;
	u8 priority;
	struct kvm_ppc_xive_eq kvm_eq;
	int rc;
	__be32 *qaddr = NULL;
	struct page *page;
	struct xive_q *q;
	gfn_t gfn;
	unsigned long page_size;
	int srcu_idx;

	/*
	 * Demangle priority/server tuple from the EQ identifier
	 */
	priority = (eq_idx & KVM_XIVE_EQ_PRIORITY_MASK) >>
		KVM_XIVE_EQ_PRIORITY_SHIFT;
	server = (eq_idx & KVM_XIVE_EQ_SERVER_MASK) >>
		KVM_XIVE_EQ_SERVER_SHIFT;

	if (copy_from_user(&kvm_eq, ubufp, sizeof(kvm_eq)))
		return -EFAULT;

	vcpu = kvmppc_xive_find_server(kvm, server);
	if (!vcpu) {
		pr_err("Can't find server %d\n", server);
		return -ENOENT;
	}
	xc = vcpu->arch.xive_vcpu;

	if (priority != xive_prio_from_guest(priority)) {
		pr_err("Trying to restore invalid queue %d for VCPU %d\n",
		       priority, server);
		return -EINVAL;
	}
	q = &xc->queues[priority];

	pr_devel("%s VCPU %d priority %d fl:%x shift:%d addr:%llx g:%d idx:%d\n",
		 __func__, server, priority, kvm_eq.flags,
		 kvm_eq.qshift, kvm_eq.qaddr, kvm_eq.qtoggle, kvm_eq.qindex);

	/* reset queue and disable queueing */
	if (!kvm_eq.qshift) {
		q->guest_qaddr = 0;
		q->guest_qshift = 0;

		rc = kvmppc_xive_native_configure_queue(xc->vp_id, q, priority,
							NULL, 0, true);
		if (rc) {
			pr_err("Failed to reset queue %d for VCPU %d: %d\n",
			       priority, xc->server_num, rc);
			return rc;
		}

		return 0;
	}

	/*
	 * sPAPR specifies an "Unconditional Notify (n) flag" for the
	 * H_INT_SET_QUEUE_CONFIG hcall which forces notification
	 * without using the coalescing mechanisms provided by the
	 * XIVE END ESBs. This is required on KVM as notification
	 * using the END ESBs is not supported.
	 */
	if (kvm_eq.flags != KVM_XIVE_EQ_ALWAYS_NOTIFY) {
		pr_err("invalid flags %d\n", kvm_eq.flags);
		return -EINVAL;
	}

	rc = xive_native_validate_queue_size(kvm_eq.qshift);
	if (rc) {
		pr_err("invalid queue size %d\n", kvm_eq.qshift);
		return rc;
	}

	if (kvm_eq.qaddr & ((1ull << kvm_eq.qshift) - 1)) {
		pr_err("queue page is not aligned %llx/%llx\n", kvm_eq.qaddr,
		       1ull << kvm_eq.qshift);
		return -EINVAL;
	}

	srcu_idx = srcu_read_lock(&kvm->srcu);
	gfn = gpa_to_gfn(kvm_eq.qaddr);

	page_size = kvm_host_page_size(vcpu, gfn);
	if (1ull << kvm_eq.qshift > page_size) {
		srcu_read_unlock(&kvm->srcu, srcu_idx);
		pr_warn("Incompatible host page size %lx!\n", page_size);
		return -EINVAL;
	}

	page = gfn_to_page(kvm, gfn);
	if (is_error_page(page)) {
		srcu_read_unlock(&kvm->srcu, srcu_idx);
		pr_err("Couldn't get queue page %llx!\n", kvm_eq.qaddr);
		return -EINVAL;
	}

	qaddr = page_to_virt(page) + (kvm_eq.qaddr & ~PAGE_MASK);
	srcu_read_unlock(&kvm->srcu, srcu_idx);

	/*
	 * Back up the queue page guest address to mark the EQ page
	 * dirty for migration.
	 */
	q->guest_qaddr = kvm_eq.qaddr;
	q->guest_qshift = kvm_eq.qshift;

	/*
	 * Unconditional Notification is forced by default at the
	 * OPAL level because the use of END ESBs is not supported by
	 * Linux.
	 */
	rc = kvmppc_xive_native_configure_queue(xc->vp_id, q, priority,
						(__be32 *) qaddr,
						kvm_eq.qshift, true);
	if (rc) {
		pr_err("Failed to configure queue %d for VCPU %d: %d\n",
		       priority, xc->server_num, rc);
		put_page(page);
		return rc;
	}

	/*
	 * Only restore the queue state when needed. When doing the
	 * H_INT_SET_SOURCE_CONFIG hcall, it should not.
	 */
	if (kvm_eq.qtoggle != 1 || kvm_eq.qindex != 0) {
		rc = xive_native_set_queue_state(xc->vp_id, priority,
						 kvm_eq.qtoggle, kvm_eq.qindex);
		if (rc)
			goto error;
	}

	rc = kvmppc_xive_attach_escalation(vcpu, priority,
					   xive->single_escalation);
error:
	if (rc)
		kvmppc_xive_native_cleanup_queue(vcpu, priority);
	return rc;
}
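/*
 * KVM_DEV_XIVE_GRP_EQ_CONFIG (get): return the current event queue
 * configuration and state of a server/priority pair to userspace.
 */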
static int kvmppc_xive_native_get_queue_config(struct kvmppc_xive *xive,
					       long eq_idx, u64 addr)
{
	struct kvm *kvm = xive->kvm;
	struct kvm_vcpu *vcpu;
	struct kvmppc_xive_vcpu *xc;
	struct xive_q *q;
	void __user *ubufp = (u64 __user *) addr;
	u32 server;
	u8 priority;
	struct kvm_ppc_xive_eq kvm_eq;
	u64 qaddr;
	u64 qshift;
	u64 qeoi_page;
	u32 escalate_irq;
	u64 qflags;
	int rc;

	/*
	 * Demangle priority/server tuple from the EQ identifier
	 */
	priority = (eq_idx & KVM_XIVE_EQ_PRIORITY_MASK) >>
		KVM_XIVE_EQ_PRIORITY_SHIFT;
	server = (eq_idx & KVM_XIVE_EQ_SERVER_MASK) >>
		KVM_XIVE_EQ_SERVER_SHIFT;

	vcpu = kvmppc_xive_find_server(kvm, server);
	if (!vcpu) {
		pr_err("Can't find server %d\n", server);
		return -ENOENT;
	}
	xc = vcpu->arch.xive_vcpu;

	if (priority != xive_prio_from_guest(priority)) {
		pr_err("invalid priority for queue %d for VCPU %d\n",
		       priority, server);
		return -EINVAL;
	}
	q = &xc->queues[priority];

	memset(&kvm_eq, 0, sizeof(kvm_eq));

	if (!q->qpage)
		return 0;

	rc = xive_native_get_queue_info(xc->vp_id, priority, &qaddr, &qshift,
					&qeoi_page, &escalate_irq, &qflags);
	if (rc)
		return rc;

	if (qflags & OPAL_XIVE_EQ_ALWAYS_NOTIFY)
		kvm_eq.flags |= KVM_XIVE_EQ_ALWAYS_NOTIFY;

	kvm_eq.qshift = q->guest_qshift;
	kvm_eq.qaddr = q->guest_qaddr;

	rc = xive_native_get_queue_state(xc->vp_id, priority, &kvm_eq.qtoggle,
					 &kvm_eq.qindex);
	if (rc)
		return rc;

	pr_devel("%s VCPU %d priority %d fl:%x shift:%d addr:%llx g:%d idx:%d\n",
		 __func__, server, priority, kvm_eq.flags,
		 kvm_eq.qshift, kvm_eq.qaddr, kvm_eq.qtoggle, kvm_eq.qindex);

	if (copy_to_user(ubufp, &kvm_eq, sizeof(kvm_eq)))
		return -EFAULT;

	return 0;
}
static void kvmppc_xive_reset_sources(struct kvmppc_xive_src_block *sb)
{
	int i;

	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
		struct kvmppc_xive_irq_state *state = &sb->irq_state[i];

		if (!state->valid)
			continue;

		if (state->act_priority == MASKED)
			continue;

		state->act_server = 0;
		state->act_priority = MASKED;
		xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
		xive_native_configure_irq(state->ipi_number, 0, MASKED, 0);
		if (state->pt_number) {
			xive_vm_esb_load(state->pt_data, XIVE_ESB_SET_PQ_01);
			xive_native_configure_irq(state->pt_number,
						  0, MASKED, 0);
		}
	}
}
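/*
 * KVM_DEV_XIVE_RESET: mask all sources and release the escalation
 * interrupts and queues of every vcpu.
 */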
static int kvmppc_xive_reset(struct kvmppc_xive *xive)
{
	struct kvm *kvm = xive->kvm;
	struct kvm_vcpu *vcpu;
	unsigned int i;

	pr_devel("%s\n", __func__);

	mutex_lock(&xive->lock);

	kvm_for_each_vcpu(i, vcpu, kvm) {
		struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
		unsigned int prio;

		if (!xc)
			continue;

		kvmppc_xive_disable_vcpu_interrupts(vcpu);

		for (prio = 0; prio < KVMPPC_XIVE_Q_COUNT; prio++) {

			/* Single escalation, no queue 7 */
			if (prio == 7 && xive->single_escalation)
				break;

			if (xc->esc_virq[prio]) {
				free_irq(xc->esc_virq[prio], vcpu);
				irq_dispose_mapping(xc->esc_virq[prio]);
				kfree(xc->esc_virq_names[prio]);
				xc->esc_virq[prio] = 0;
			}

			kvmppc_xive_native_cleanup_queue(vcpu, prio);
		}
	}

	for (i = 0; i <= xive->max_sbid; i++) {
		struct kvmppc_xive_src_block *sb = xive->src_blocks[i];

		if (sb) {
			arch_spin_lock(&sb->lock);
			kvmppc_xive_reset_sources(sb);
			arch_spin_unlock(&sb->lock);
		}
	}

	mutex_unlock(&xive->lock);

	return 0;
}
static void kvmppc_xive_native_sync_sources(struct kvmppc_xive_src_block *sb)
{
	int j;

	for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) {
		struct kvmppc_xive_irq_state *state = &sb->irq_state[j];
		struct xive_irq_data *xd;
		u32 hw_num;

		if (!state->valid)
			continue;

		/*
		 * The struct kvmppc_xive_irq_state reflects the state
		 * of the EAS configuration and not the state of the
		 * source. The source is masked setting the PQ bits to
		 * '-Q', which is what is being done before calling
		 * the KVM_DEV_XIVE_EQ_SYNC control.
		 *
		 * If a source EAS is configured, OPAL syncs the XIVE
		 * IC of the source and the XIVE IC of the previous
		 * target if any.
		 *
		 * So it should be fine ignoring MASKED sources as
		 * they have been synced already.
		 */
		if (state->act_priority == MASKED)
			continue;

		kvmppc_xive_select_irq(state, &hw_num, &xd);
		xive_native_sync_source(hw_num);
		xive_native_sync_queue(hw_num);
	}
}
static int kvmppc_xive_native_vcpu_eq_sync(struct kvm_vcpu *vcpu)
{
	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
	unsigned int prio;
	int srcu_idx;

	if (!xc)
		return -ENOENT;

	for (prio = 0; prio < KVMPPC_XIVE_Q_COUNT; prio++) {
		struct xive_q *q = &xc->queues[prio];

		if (!q->qpage)
			continue;

		/* Mark EQ page dirty for migration */
		srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
		mark_page_dirty(vcpu->kvm, gpa_to_gfn(q->guest_qaddr));
		srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
	}

	return 0;
}
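/*
 * KVM_DEV_XIVE_EQ_SYNC: sync all sources and queues and mark the EQ
 * pages dirty so they are transferred by migration.
 */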
static int kvmppc_xive_native_eq_sync(struct kvmppc_xive *xive)
{
	struct kvm *kvm = xive->kvm;
	struct kvm_vcpu *vcpu;
	unsigned int i;

	pr_devel("%s\n", __func__);

	mutex_lock(&xive->lock);
	for (i = 0; i <= xive->max_sbid; i++) {
		struct kvmppc_xive_src_block *sb = xive->src_blocks[i];

		if (sb) {
			arch_spin_lock(&sb->lock);
			kvmppc_xive_native_sync_sources(sb);
			arch_spin_unlock(&sb->lock);
		}
	}

	kvm_for_each_vcpu(i, vcpu, kvm) {
		kvmppc_xive_native_vcpu_eq_sync(vcpu);
	}
	mutex_unlock(&xive->lock);

	return 0;
}
static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
				       struct kvm_device_attr *attr)
{
	struct kvmppc_xive *xive = dev->private;

	switch (attr->group) {
	case KVM_DEV_XIVE_GRP_CTRL:
		switch (attr->attr) {
		case KVM_DEV_XIVE_RESET:
			return kvmppc_xive_reset(xive);
		case KVM_DEV_XIVE_EQ_SYNC:
			return kvmppc_xive_native_eq_sync(xive);
		case KVM_DEV_XIVE_NR_SERVERS:
			return kvmppc_xive_set_nr_servers(xive, attr->addr);
		}
		break;
	case KVM_DEV_XIVE_GRP_SOURCE:
		return kvmppc_xive_native_set_source(xive, attr->attr,
						     attr->addr);
	case KVM_DEV_XIVE_GRP_SOURCE_CONFIG:
		return kvmppc_xive_native_set_source_config(xive, attr->attr,
							    attr->addr);
	case KVM_DEV_XIVE_GRP_EQ_CONFIG:
		return kvmppc_xive_native_set_queue_config(xive, attr->attr,
							   attr->addr);
	case KVM_DEV_XIVE_GRP_SOURCE_SYNC:
		return kvmppc_xive_native_sync_source(xive, attr->attr,
						      attr->addr);
	}
	return -ENXIO;
}
static int kvmppc_xive_native_get_attr(struct kvm_device *dev,
				       struct kvm_device_attr *attr)
{
	struct kvmppc_xive *xive = dev->private;

	switch (attr->group) {
	case KVM_DEV_XIVE_GRP_EQ_CONFIG:
		return kvmppc_xive_native_get_queue_config(xive, attr->attr,
							   attr->addr);
	}
	return -ENXIO;
}
static int kvmppc_xive_native_has_attr(struct kvm_device *dev,
				       struct kvm_device_attr *attr)
{
	switch (attr->group) {
	case KVM_DEV_XIVE_GRP_CTRL:
		switch (attr->attr) {
		case KVM_DEV_XIVE_RESET:
		case KVM_DEV_XIVE_EQ_SYNC:
		case KVM_DEV_XIVE_NR_SERVERS:
			return 0;
		}
		break;
	case KVM_DEV_XIVE_GRP_SOURCE:
	case KVM_DEV_XIVE_GRP_SOURCE_CONFIG:
	case KVM_DEV_XIVE_GRP_SOURCE_SYNC:
		if (attr->attr >= KVMPPC_XIVE_FIRST_IRQ &&
		    attr->attr < KVMPPC_XIVE_NR_IRQS)
			return 0;
		break;
	case KVM_DEV_XIVE_GRP_EQ_CONFIG:
		return 0;
	}
	return -ENXIO;
}
/*
 * Called when device fd is closed. kvm->lock is held.
 */
static void kvmppc_xive_native_release(struct kvm_device *dev)
{
	struct kvmppc_xive *xive = dev->private;
	struct kvm *kvm = xive->kvm;
	struct kvm_vcpu *vcpu;
	int i;

	pr_devel("Releasing xive native device\n");

	/*
	 * Clear the KVM device file address_space which is used to
	 * unmap the ESB pages when a device is passed-through.
	 */
	mutex_lock(&xive->mapping_lock);
	xive->mapping = NULL;
	mutex_unlock(&xive->mapping_lock);

	/*
	 * Since this is the device release function, we know that
	 * userspace does not have any open fd or mmap referring to
	 * the device. Therefore there can not be any of the
	 * device attribute set/get, mmap, or page fault functions
	 * being executed concurrently, and similarly, the
	 * connect_vcpu and set/clr_mapped functions also cannot
	 * be being executed.
	 */

	debugfs_remove(xive->dentry);

	/*
	 * We should clean up the vCPU interrupt presenters first.
	 */
	kvm_for_each_vcpu(i, vcpu, kvm) {
		/*
		 * Take vcpu->mutex to ensure that no one_reg get/set ioctl
		 * (i.e. kvmppc_xive_native_[gs]et_vp) can be being done.
		 * Holding the vcpu->mutex also means that the vcpu cannot
		 * be executing the KVM_RUN ioctl, and therefore it cannot
		 * be executing the XIVE push or pull code or accessing
		 * the XIVE MMIO regions.
		 */
		mutex_lock(&vcpu->mutex);
		kvmppc_xive_native_cleanup_vcpu(vcpu);
		mutex_unlock(&vcpu->mutex);
	}

	/*
	 * Now that we have cleared vcpu->arch.xive_vcpu, vcpu->arch.irq_type
	 * and vcpu->arch.xive_esc_[vr]addr on each vcpu, we are safe
	 * against xive code getting called during vcpu execution or
	 * set/get one_reg operations.
	 */
	kvm->arch.xive = NULL;

	for (i = 0; i <= xive->max_sbid; i++) {
		if (xive->src_blocks[i])
			kvmppc_xive_free_sources(xive->src_blocks[i]);
		kfree(xive->src_blocks[i]);
		xive->src_blocks[i] = NULL;
	}

	if (xive->vp_base != XIVE_INVALID_VP)
		xive_native_free_vp_block(xive->vp_base);

	/*
	 * A reference of the kvmppc_xive pointer is now kept under
	 * the xive_devices struct of the machine for reuse. It is
	 * freed when the VM is destroyed for now until we fix all the
	 * execution paths.
	 */

	kfree(dev);
}
/*
 * Create a XIVE device. kvm->lock is held.
 */
static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type)
{
	struct kvmppc_xive *xive;
	struct kvm *kvm = dev->kvm;

	pr_devel("Creating xive native device\n");

	if (kvm->arch.xive)
		return -EEXIST;

	xive = kvmppc_xive_get_device(kvm, type);
	if (!xive)
		return -ENOMEM;

	dev->private = xive;
	xive->dev = dev;
	xive->kvm = kvm;
	mutex_init(&xive->mapping_lock);
	mutex_init(&xive->lock);

	/* VP allocation is delayed to the first call to connect_vcpu */
	xive->vp_base = XIVE_INVALID_VP;
	/* KVM_MAX_VCPUS limits the number of VMs to roughly 64 per socket
	 * on a POWER9 system.
	 */
	xive->nr_servers = KVM_MAX_VCPUS;

	xive->single_escalation = xive_native_has_single_escalation();
	xive->ops = &kvmppc_xive_native_ops;

	kvm->arch.xive = xive;
	return 0;
}
/*
 * Interrupt Pending Buffer (IPB) offset
 */
#define TM_IPB_SHIFT 40
#define TM_IPB_MASK  (((u64) 0xFF) << TM_IPB_SHIFT)
int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val)
{
	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
	u64 opal_state;
	int rc;

	if (!kvmppc_xive_enabled(vcpu))
		return -EPERM;

	if (!xc)
		return -ENOENT;

	/* Thread context registers. We only care about IPB and CPPR */
	val->xive_timaval[0] = vcpu->arch.xive_saved_state.w01;

	/* Get the VP state from OPAL */
	rc = xive_native_get_vp_state(xc->vp_id, &opal_state);
	if (rc)
		return rc;

	/*
	 * Capture the backup of IPB register in the NVT structure and
	 * merge it in our KVM VP state.
	 */
	val->xive_timaval[0] |= cpu_to_be64(opal_state & TM_IPB_MASK);

	pr_devel("%s NSR=%02x CPPR=%02x IPB=%02x PIPR=%02x w01=%016llx w2=%08x opal=%016llx\n",
		 __func__,
		 vcpu->arch.xive_saved_state.nsr,
		 vcpu->arch.xive_saved_state.cppr,
		 vcpu->arch.xive_saved_state.ipb,
		 vcpu->arch.xive_saved_state.pipr,
		 vcpu->arch.xive_saved_state.w01,
		 (u32) vcpu->arch.xive_cam_word, opal_state);

	return 0;
}
int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val)
{
	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
	struct kvmppc_xive *xive = vcpu->kvm->arch.xive;

	pr_devel("%s w01=%016llx vp=%016llx\n", __func__,
		 val->xive_timaval[0], val->xive_timaval[1]);

	if (!kvmppc_xive_enabled(vcpu))
		return -EPERM;

	if (!xc || !xive)
		return -ENOENT;

	/* We can't update the state of a "pushed" VCPU */
	if (WARN_ON(vcpu->arch.xive_pushed))
		return -EBUSY;

	/*
	 * Restore the thread context registers. IPB and CPPR should
	 * be the only ones that matter.
	 */
	vcpu->arch.xive_saved_state.w01 = val->xive_timaval[0];

	/*
	 * There is no need to restore the XIVE internal state (IPB
	 * stored in the NVT) as the IPB register was merged in KVM VP
	 * state when captured.
	 */
	return 0;
}
bool kvmppc_xive_native_supported(void)
{
	return xive_native_has_queue_state_support();
}
static int xive_native_debug_show(struct seq_file *m, void *private)
{
	struct kvmppc_xive *xive = m->private;
	struct kvm *kvm = xive->kvm;
	struct kvm_vcpu *vcpu;
	unsigned int i;

	if (!kvm)
		return 0;

	seq_puts(m, "=========\nVCPU state\n=========\n");

	kvm_for_each_vcpu(i, vcpu, kvm) {
		struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;

		if (!xc)
			continue;

		seq_printf(m, "cpu server %#x VP=%#x NSR=%02x CPPR=%02x IPB=%02x PIPR=%02x w01=%016llx w2=%08x\n",
			   xc->server_num, xc->vp_id,
			   vcpu->arch.xive_saved_state.nsr,
			   vcpu->arch.xive_saved_state.cppr,
			   vcpu->arch.xive_saved_state.ipb,
			   vcpu->arch.xive_saved_state.pipr,
			   vcpu->arch.xive_saved_state.w01,
			   (u32) vcpu->arch.xive_cam_word);

		kvmppc_xive_debug_show_queues(m, vcpu);
	}

	return 0;
}
static int xive_native_debug_open(struct inode *inode, struct file *file)
{
	return single_open(file, xive_native_debug_show, inode->i_private);
}
static const struct file_operations xive_native_debug_fops = {
	.open = xive_native_debug_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
static void xive_native_debugfs_init(struct kvmppc_xive *xive)
{
	char *name;

	name = kasprintf(GFP_KERNEL, "kvm-xive-%p", xive);
	if (!name) {
		pr_err("%s: no memory for name\n", __func__);
		return;
	}

	xive->dentry = debugfs_create_file(name, 0444, powerpc_debugfs_root,
					   xive, &xive_native_debug_fops);

	pr_debug("%s: created %s\n", __func__, name);
	kfree(name);
}
static void kvmppc_xive_native_init(struct kvm_device *dev)
{
	struct kvmppc_xive *xive = (struct kvmppc_xive *)dev->private;

	/* Register some debug interfaces */
	xive_native_debugfs_init(xive);
}
struct kvm_device_ops kvm_xive_native_ops = {
	.name = "kvm-xive-native",
	.create = kvmppc_xive_native_create,
	.init = kvmppc_xive_native_init,
	.release = kvmppc_xive_native_release,
	.set_attr = kvmppc_xive_native_set_attr,
	.get_attr = kvmppc_xive_native_get_attr,
	.has_attr = kvmppc_xive_native_has_attr,
	.mmap = kvmppc_xive_native_mmap,
};
void kvmppc_xive_native_init_module(void)
{
	;
}

void kvmppc_xive_native_exit_module(void)
{
	;
}