/*
 * Kernel-based Virtual Machine - device assignment support
 *
 * Copyright (C) 2010 Red Hat, Inc. and/or its affiliates.
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 */
#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/errno.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/interrupt.h>
#include <linux/slab.h>
#include "irq.h"
static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
						      int assigned_dev_id)
{
	struct list_head *ptr;
	struct kvm_assigned_dev_kernel *match;

	list_for_each(ptr, head) {
		match = list_entry(ptr, struct kvm_assigned_dev_kernel, list);
		if (match->assigned_dev_id == assigned_dev_id)
			return match;
	}
	return NULL;
}
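/*
 * The assigned_dev_id used as the lookup key above is an opaque handle
 * chosen by userspace when the device was assigned; the per-device ioctls
 * below use it to locate the kernel-side state. Callers are expected to
 * hold kvm->lock across the lookup and any use of the result.
 */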
static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
				    *assigned_dev, int irq)
{
	int i, index;
	struct msix_entry *host_msix_entries;

	host_msix_entries = assigned_dev->host_msix_entries;

	index = -1;
	for (i = 0; i < assigned_dev->entries_nr; i++)
		if (irq == host_msix_entries[i].vector) {
			index = i;
			break;
		}
	if (index < 0)
		printk(KERN_WARNING "Failed to find correlated MSI-X entry!\n");

	return index;
}
static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id)
{
	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;

	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) {
		spin_lock(&assigned_dev->intx_lock);
		disable_irq_nosync(irq);
		assigned_dev->host_irq_disabled = true;
		spin_unlock(&assigned_dev->intx_lock);
	}

	kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
		    assigned_dev->guest_irq, 1);

	return IRQ_HANDLED;
}
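/*
 * For host INTx the line is level-triggered, so the handler above leaves the
 * host interrupt disabled until the guest acknowledges its interrupt;
 * kvm_assigned_dev_ack_irq() below re-enables it on that ack. The
 * host_irq_disabled flag, protected by intx_lock, carries that state between
 * the two paths.
 */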
#ifdef __KVM_HAVE_MSIX
static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
{
	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
	int index = find_index_from_host_irq(assigned_dev, irq);
	u32 vector;

	if (index >= 0) {
		vector = assigned_dev->guest_msix_entries[index].vector;
		kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
			    vector, 1);
	}

	return IRQ_HANDLED;
}
#endif
/* Ack the irq line for an assigned device */
static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
{
	struct kvm_assigned_dev_kernel *dev =
		container_of(kian, struct kvm_assigned_dev_kernel,
			     ack_notifier);

	kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);

	/* The guest irq may be shared, so this ack may be
	 * from another device.
	 */
	spin_lock(&dev->intx_lock);
	if (dev->host_irq_disabled) {
		enable_irq(dev->host_irq);
		dev->host_irq_disabled = false;
	}
	spin_unlock(&dev->intx_lock);
}
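/*
 * Since the notifier fires for every ack of ack_notifier.gsi, possibly on
 * behalf of an unrelated device sharing the guest line, host_irq_disabled is
 * checked above instead of unconditionally re-enabling the host interrupt.
 */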
static void deassign_guest_irq(struct kvm *kvm,
			       struct kvm_assigned_dev_kernel *assigned_dev)
{
	if (assigned_dev->ack_notifier.gsi != -1)
		kvm_unregister_irq_ack_notifier(kvm,
						&assigned_dev->ack_notifier);

	kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
		    assigned_dev->guest_irq, 0);

	if (assigned_dev->irq_source_id != -1)
		kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
	assigned_dev->irq_source_id = -1;
	assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK);
}
/* This function implicitly holds the kvm->lock mutex, due to cancel_work_sync(). */
static void deassign_host_irq(struct kvm *kvm,
			      struct kvm_assigned_dev_kernel *assigned_dev)
{
	int i;

	/*
	 * We disable the irq here to prevent further events.
	 *
	 * Note this may result in a nested disable if the interrupt type is
	 * INTx, but that's OK since we are going to free it anyway.
	 *
	 * If this function is part of VM destruction, please ensure that the
	 * kvm state is still valid at this point, since we may have to wait
	 * on a currently running IRQ handler.
	 */
	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
		for (i = 0; i < assigned_dev->entries_nr; i++)
			disable_irq(assigned_dev->host_msix_entries[i].vector);

		for (i = 0; i < assigned_dev->entries_nr; i++)
			free_irq(assigned_dev->host_msix_entries[i].vector,
				 assigned_dev);

		assigned_dev->entries_nr = 0;
		kfree(assigned_dev->host_msix_entries);
		kfree(assigned_dev->guest_msix_entries);
		pci_disable_msix(assigned_dev->dev);
	} else {
		/* Deal with MSI and INTx */
		disable_irq(assigned_dev->host_irq);

		free_irq(assigned_dev->host_irq, assigned_dev);

		if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI)
			pci_disable_msi(assigned_dev->dev);
	}

	assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK);
}
static int kvm_deassign_irq(struct kvm *kvm,
			    struct kvm_assigned_dev_kernel *assigned_dev,
			    unsigned long irq_requested_type)
{
	unsigned long guest_irq_type, host_irq_type;

	if (!irqchip_in_kernel(kvm))
		return -EINVAL;
	/* no irq assignment to deassign */
	if (!assigned_dev->irq_requested_type)
		return -ENXIO;

	host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK;
	guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK;

	if (host_irq_type)
		deassign_host_irq(kvm, assigned_dev);
	if (guest_irq_type)
		deassign_guest_irq(kvm, assigned_dev);

	return 0;
}
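/*
 * The host/guest split lets the two halves be torn down independently: a
 * mask of KVM_DEV_IRQ_HOST_MSI | KVM_DEV_IRQ_GUEST_MSI releases both, while
 * a guest-only mask leaves the host-side IRQ requested.
 */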
static void kvm_free_assigned_irq(struct kvm *kvm,
				  struct kvm_assigned_dev_kernel *assigned_dev)
{
	kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
}
static void kvm_free_assigned_device(struct kvm *kvm,
				     struct kvm_assigned_dev_kernel
				     *assigned_dev)
{
	kvm_free_assigned_irq(kvm, assigned_dev);

	pci_reset_function(assigned_dev->dev);
	if (pci_load_and_free_saved_state(assigned_dev->dev,
					  &assigned_dev->pci_saved_state))
		printk(KERN_INFO "%s: Couldn't reload %s saved state\n",
		       __func__, dev_name(&assigned_dev->dev->dev));
	else
		pci_restore_state(assigned_dev->dev);

	assigned_dev->dev->dev_flags &= ~PCI_DEV_FLAGS_ASSIGNED;

	pci_release_regions(assigned_dev->dev);
	pci_disable_device(assigned_dev->dev);
	pci_dev_put(assigned_dev->dev);

	list_del(&assigned_dev->list);
	kfree(assigned_dev);
}
void kvm_free_all_assigned_devices(struct kvm *kvm)
{
	struct list_head *ptr, *ptr2;
	struct kvm_assigned_dev_kernel *assigned_dev;

	list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
		assigned_dev = list_entry(ptr,
					  struct kvm_assigned_dev_kernel,
					  list);

		kvm_free_assigned_device(kvm, assigned_dev);
	}
}
static int assigned_device_enable_host_intx(struct kvm *kvm,
					    struct kvm_assigned_dev_kernel *dev)
{
	dev->host_irq = dev->dev->irq;
	/* Even though this is PCI, we don't want to use shared
	 * interrupts. Sharing host devices with guest-assigned devices
	 * on the same interrupt line is not a happy situation: there
	 * are going to be long delays in accepting, acking, etc.
	 */
	if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
				 IRQF_ONESHOT, dev->irq_name, dev))
		return -EIO;
	return 0;
}
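/*
 * A NULL primary handler together with IRQF_ONESHOT keeps the line masked
 * until the threaded handler has run, so kvm_assigned_dev_thread() may sleep
 * while injecting the interrupt into the guest.
 */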
#ifdef __KVM_HAVE_MSI
static int assigned_device_enable_host_msi(struct kvm *kvm,
					   struct kvm_assigned_dev_kernel *dev)
{
	int r;

	if (!dev->dev->msi_enabled) {
		r = pci_enable_msi(dev->dev);
		if (r)
			return r;
	}

	dev->host_irq = dev->dev->irq;
	if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
				 0, dev->irq_name, dev)) {
		pci_disable_msi(dev->dev);
		return -EIO;
	}

	return 0;
}
#endif
#ifdef __KVM_HAVE_MSIX
static int assigned_device_enable_host_msix(struct kvm *kvm,
					    struct kvm_assigned_dev_kernel *dev)
{
	int i, r = -EINVAL;

	/* host_msix_entries and guest_msix_entries should have been
	 * initialized */
	if (dev->entries_nr == 0)
		return r;

	r = pci_enable_msix(dev->dev, dev->host_msix_entries, dev->entries_nr);
	if (r)
		return r;

	for (i = 0; i < dev->entries_nr; i++) {
		r = request_threaded_irq(dev->host_msix_entries[i].vector,
					 NULL, kvm_assigned_dev_thread_msix,
					 0, dev->irq_name, dev);
		if (r)
			goto err;
	}

	return 0;
err:
	for (i -= 1; i >= 0; i--)
		free_irq(dev->host_msix_entries[i].vector, dev);
	pci_disable_msix(dev->dev);
	return r;
}
#endif
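/*
 * Note the unwind in the error path: only the vectors in [0, i) that were
 * successfully requested are freed, mirroring the setup order, before MSI-X
 * is disabled on the device.
 */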
static int assigned_device_enable_guest_intx(struct kvm *kvm,
				struct kvm_assigned_dev_kernel *dev,
				struct kvm_assigned_irq *irq)
{
	dev->guest_irq = irq->guest_irq;
	dev->ack_notifier.gsi = irq->guest_irq;
	return 0;
}
#ifdef __KVM_HAVE_MSI
static int assigned_device_enable_guest_msi(struct kvm *kvm,
			struct kvm_assigned_dev_kernel *dev,
			struct kvm_assigned_irq *irq)
{
	dev->guest_irq = irq->guest_irq;
	dev->ack_notifier.gsi = -1;
	dev->host_irq_disabled = false;
	return 0;
}
#endif
#ifdef __KVM_HAVE_MSIX
static int assigned_device_enable_guest_msix(struct kvm *kvm,
			struct kvm_assigned_dev_kernel *dev,
			struct kvm_assigned_irq *irq)
{
	dev->guest_irq = irq->guest_irq;
	dev->ack_notifier.gsi = -1;
	dev->host_irq_disabled = false;
	return 0;
}
#endif
static int assign_host_irq(struct kvm *kvm,
			   struct kvm_assigned_dev_kernel *dev,
			   __u32 host_irq_type)
{
	int r = -EEXIST;

	if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK)
		return r;

	snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s",
		 pci_name(dev->dev));

	switch (host_irq_type) {
	case KVM_DEV_IRQ_HOST_INTX:
		r = assigned_device_enable_host_intx(kvm, dev);
		break;
#ifdef __KVM_HAVE_MSI
	case KVM_DEV_IRQ_HOST_MSI:
		r = assigned_device_enable_host_msi(kvm, dev);
		break;
#endif
#ifdef __KVM_HAVE_MSIX
	case KVM_DEV_IRQ_HOST_MSIX:
		r = assigned_device_enable_host_msix(kvm, dev);
		break;
#endif
	default:
		r = -EINVAL;
	}

	if (!r)
		dev->irq_requested_type |= host_irq_type;

	return r;
}
static int assign_guest_irq(struct kvm *kvm,
			    struct kvm_assigned_dev_kernel *dev,
			    struct kvm_assigned_irq *irq,
			    unsigned long guest_irq_type)
{
	int id;
	int r = -EEXIST;

	if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK)
		return r;

	id = kvm_request_irq_source_id(kvm);
	if (id < 0)
		return id;

	dev->irq_source_id = id;

	switch (guest_irq_type) {
	case KVM_DEV_IRQ_GUEST_INTX:
		r = assigned_device_enable_guest_intx(kvm, dev, irq);
		break;
#ifdef __KVM_HAVE_MSI
	case KVM_DEV_IRQ_GUEST_MSI:
		r = assigned_device_enable_guest_msi(kvm, dev, irq);
		break;
#endif
#ifdef __KVM_HAVE_MSIX
	case KVM_DEV_IRQ_GUEST_MSIX:
		r = assigned_device_enable_guest_msix(kvm, dev, irq);
		break;
#endif
	default:
		r = -EINVAL;
	}

	if (!r) {
		dev->irq_requested_type |= guest_irq_type;
		if (dev->ack_notifier.gsi != -1)
			kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
	} else
		kvm_free_irq_source_id(kvm, dev->irq_source_id);

	return r;
}
/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */
static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
				   struct kvm_assigned_irq *assigned_irq)
{
	int r = -EINVAL;
	struct kvm_assigned_dev_kernel *match;
	unsigned long host_irq_type, guest_irq_type;

	if (!irqchip_in_kernel(kvm))
		return r;

	mutex_lock(&kvm->lock);
	r = -ENODEV;
	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
				      assigned_irq->assigned_dev_id);
	if (!match)
		goto out;

	host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK);
	guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK);

	r = -EINVAL;
	/* can only assign one type at a time */
	if (hweight_long(host_irq_type) > 1)
		goto out;
	if (hweight_long(guest_irq_type) > 1)
		goto out;
	if (host_irq_type == 0 && guest_irq_type == 0)
		goto out;

	r = 0;
	if (host_irq_type)
		r = assign_host_irq(kvm, match, host_irq_type);
	if (r)
		goto out;

	if (guest_irq_type)
		r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type);

out:
	mutex_unlock(&kvm->lock);
	return r;
}
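/*
 * Usage sketch (hypothetical userspace code, not part of this file): wiring
 * an assigned device's MSI to a guest MSI would look roughly like
 *
 *	struct kvm_assigned_irq irq = {
 *		.assigned_dev_id = dev_id,
 *		.guest_irq = guest_gsi,
 *		.flags = KVM_DEV_IRQ_HOST_MSI | KVM_DEV_IRQ_GUEST_MSI,
 *	};
 *	ioctl(vm_fd, KVM_ASSIGN_DEV_IRQ, &irq);
 *
 * The checks above also permit assigning the host and guest halves in two
 * separate calls, as long as each call sets at most one type per mask.
 */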
static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
					 struct kvm_assigned_irq
					 *assigned_irq)
{
	int r = -ENODEV;
	struct kvm_assigned_dev_kernel *match;

	mutex_lock(&kvm->lock);

	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
				      assigned_irq->assigned_dev_id);
	if (!match)
		goto out;

	r = kvm_deassign_irq(kvm, match, assigned_irq->flags);
out:
	mutex_unlock(&kvm->lock);
	return r;
}
static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
				      struct kvm_assigned_pci_dev *assigned_dev)
{
	int r = 0, idx;
	struct kvm_assigned_dev_kernel *match;
	struct pci_dev *dev;

	mutex_lock(&kvm->lock);
	idx = srcu_read_lock(&kvm->srcu);

	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
				      assigned_dev->assigned_dev_id);
	if (match) {
		/* device already assigned */
		r = -EEXIST;
		goto out;
	}

	match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL);
	if (match == NULL) {
		printk(KERN_INFO "%s: Couldn't allocate memory\n",
		       __func__);
		r = -ENOMEM;
		goto out;
	}
	dev = pci_get_domain_bus_and_slot(assigned_dev->segnr,
					  assigned_dev->busnr,
					  assigned_dev->devfn);
	if (!dev) {
		printk(KERN_INFO "%s: host device not found\n", __func__);
		r = -EINVAL;
		goto out_free;
	}
	if (pci_enable_device(dev)) {
		printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
		r = -EBUSY;
		goto out_put;
	}
	r = pci_request_regions(dev, "kvm_assigned_device");
	if (r) {
		printk(KERN_INFO "%s: Could not get access to device regions\n",
		       __func__);
		goto out_disable;
	}

	pci_reset_function(dev);
	pci_save_state(dev);
	match->pci_saved_state = pci_store_saved_state(dev);
	if (!match->pci_saved_state)
		printk(KERN_DEBUG "%s: Couldn't store %s saved state\n",
		       __func__, dev_name(&dev->dev));
	match->assigned_dev_id = assigned_dev->assigned_dev_id;
	match->host_segnr = assigned_dev->segnr;
	match->host_busnr = assigned_dev->busnr;
	match->host_devfn = assigned_dev->devfn;
	match->flags = assigned_dev->flags;
	match->dev = dev;
	spin_lock_init(&match->intx_lock);
	match->irq_source_id = -1;
	match->kvm = kvm;
	match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;

	list_add(&match->list, &kvm->arch.assigned_dev_head);

	if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) {
		if (!kvm->arch.iommu_domain) {
			r = kvm_iommu_map_guest(kvm);
			if (r)
				goto out_list_del;
		}
		r = kvm_assign_device(kvm, match);
		if (r)
			goto out_list_del;
	}

out:
	srcu_read_unlock(&kvm->srcu, idx);
	mutex_unlock(&kvm->lock);
	return r;
out_list_del:
	if (pci_load_and_free_saved_state(dev, &match->pci_saved_state))
		printk(KERN_INFO "%s: Couldn't reload %s saved state\n",
		       __func__, dev_name(&dev->dev));
	list_del(&match->list);
	pci_release_regions(dev);
out_disable:
	pci_disable_device(dev);
out_put:
	pci_dev_put(dev);
out_free:
	kfree(match);
	srcu_read_unlock(&kvm->srcu, idx);
	mutex_unlock(&kvm->lock);
	return r;
}
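/*
 * Usage sketch (hypothetical userspace code): a VMM typically unbinds the
 * device from its host driver and then issues
 *
 *	struct kvm_assigned_pci_dev adev = {
 *		.assigned_dev_id = dev_id,
 *		.segnr = 0, .busnr = bus, .devfn = devfn,
 *		.flags = KVM_DEV_ASSIGN_ENABLE_IOMMU,
 *	};
 *	ioctl(vm_fd, KVM_ASSIGN_PCI_DEVICE, &adev);
 *
 * Omitting KVM_DEV_ASSIGN_ENABLE_IOMMU skips the IOMMU mapping above, which
 * is unsafe for DMA-capable devices.
 */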
static int kvm_vm_ioctl_deassign_device(struct kvm *kvm,
					struct kvm_assigned_pci_dev *assigned_dev)
{
	int r = 0;
	struct kvm_assigned_dev_kernel *match;

	mutex_lock(&kvm->lock);

	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
				      assigned_dev->assigned_dev_id);
	if (!match) {
		printk(KERN_INFO "%s: device hasn't been assigned before, "
		       "so cannot be deassigned\n", __func__);
		r = -EINVAL;
		goto out;
	}

	if (match->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)
		kvm_deassign_device(kvm, match);

	kvm_free_assigned_device(kvm, match);

out:
	mutex_unlock(&kvm->lock);
	return r;
}
#ifdef __KVM_HAVE_MSIX
static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
				    struct kvm_assigned_msix_nr *entry_nr)
{
	int r = 0;
	struct kvm_assigned_dev_kernel *adev;

	mutex_lock(&kvm->lock);

	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
				     entry_nr->assigned_dev_id);
	if (!adev) {
		r = -EINVAL;
		goto msix_nr_out;
	}

	if (adev->entries_nr == 0) {
		adev->entries_nr = entry_nr->entry_nr;
		if (adev->entries_nr == 0 ||
		    adev->entries_nr > KVM_MAX_MSIX_PER_DEV) {
			r = -EINVAL;
			goto msix_nr_out;
		}

		adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) *
						  entry_nr->entry_nr,
						  GFP_KERNEL);
		if (!adev->host_msix_entries) {
			r = -ENOMEM;
			goto msix_nr_out;
		}
		adev->guest_msix_entries =
			kzalloc(sizeof(struct msix_entry) * entry_nr->entry_nr,
				GFP_KERNEL);
		if (!adev->guest_msix_entries) {
			kfree(adev->host_msix_entries);
			r = -ENOMEM;
			goto msix_nr_out;
		}
	} else /* Not allowed to set the MSI-X count twice */
		r = -EINVAL;

msix_nr_out:
	mutex_unlock(&kvm->lock);
	return r;
}
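/*
 * KVM_ASSIGN_SET_MSIX_NR must be issued exactly once, before any
 * KVM_ASSIGN_SET_MSIX_ENTRY call, since it sizes the two per-device tables
 * that kvm_vm_ioctl_set_msix_entry() below fills in.
 */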
static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm,
				       struct kvm_assigned_msix_entry *entry)
{
	int r = 0, i;
	struct kvm_assigned_dev_kernel *adev;

	mutex_lock(&kvm->lock);

	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
				     entry->assigned_dev_id);

	if (!adev) {
		r = -EINVAL;
		goto msix_entry_out;
	}

	for (i = 0; i < adev->entries_nr; i++)
		if (adev->guest_msix_entries[i].vector == 0 ||
		    adev->guest_msix_entries[i].entry == entry->entry) {
			adev->guest_msix_entries[i].entry = entry->entry;
			adev->guest_msix_entries[i].vector = entry->gsi;
			adev->host_msix_entries[i].entry = entry->entry;
			break;
		}
	if (i == adev->entries_nr) {
		r = -ENOSPC;
		goto msix_entry_out;
	}

msix_entry_out:
	mutex_unlock(&kvm->lock);
	return r;
}
#endif
long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
				  unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	int r;

	switch (ioctl) {
	case KVM_ASSIGN_PCI_DEVICE: {
		struct kvm_assigned_pci_dev assigned_dev;

		r = -EFAULT;
		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
			goto out;
		r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev);
		if (r)
			goto out;
		break;
	}
	case KVM_ASSIGN_IRQ: {
		r = -EOPNOTSUPP;
		break;
	}
	case KVM_ASSIGN_DEV_IRQ: {
		struct kvm_assigned_irq assigned_irq;

		r = -EFAULT;
		if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
			goto out;
		r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq);
		if (r)
			goto out;
		break;
	}
	case KVM_DEASSIGN_DEV_IRQ: {
		struct kvm_assigned_irq assigned_irq;

		r = -EFAULT;
		if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
			goto out;
		r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq);
		if (r)
			goto out;
		break;
	}
	case KVM_DEASSIGN_PCI_DEVICE: {
		struct kvm_assigned_pci_dev assigned_dev;

		r = -EFAULT;
		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
			goto out;
		r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev);
		if (r)
			goto out;
		break;
	}
#ifdef KVM_CAP_IRQ_ROUTING
	case KVM_SET_GSI_ROUTING: {
		struct kvm_irq_routing routing;
		struct kvm_irq_routing __user *urouting;
		struct kvm_irq_routing_entry *entries;

		r = -EFAULT;
		if (copy_from_user(&routing, argp, sizeof(routing)))
			goto out;
		r = -EINVAL;
		if (routing.nr >= KVM_MAX_IRQ_ROUTES)
			goto out;
		r = -ENOMEM;
		entries = vmalloc(routing.nr * sizeof(*entries));
		if (!entries)
			goto out;
		r = -EFAULT;
		urouting = argp;
		if (copy_from_user(entries, urouting->entries,
				   routing.nr * sizeof(*entries)))
			goto out_free_irq_routing;
		r = kvm_set_irq_routing(kvm, entries, routing.nr,
					routing.flags);
out_free_irq_routing:
		vfree(entries);
		break;
	}
#endif /* KVM_CAP_IRQ_ROUTING */
#ifdef __KVM_HAVE_MSIX
	case KVM_ASSIGN_SET_MSIX_NR: {
		struct kvm_assigned_msix_nr entry_nr;

		r = -EFAULT;
		if (copy_from_user(&entry_nr, argp, sizeof entry_nr))
			goto out;
		r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr);
		if (r)
			goto out;
		break;
	}
	case KVM_ASSIGN_SET_MSIX_ENTRY: {
		struct kvm_assigned_msix_entry entry;

		r = -EFAULT;
		if (copy_from_user(&entry, argp, sizeof entry))
			goto out;
		r = kvm_vm_ioctl_set_msix_entry(kvm, &entry);
		if (r)
			goto out;
		break;
	}
#endif
	default:
		r = -ENOTTY;
		break;
	}
out:
	return r;
}
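/*
 * Putting the MSI-X pieces together, a plausible userspace sequence
 * (hypothetical, not part of this file) is:
 *
 *	ioctl(vm_fd, KVM_ASSIGN_SET_MSIX_NR, &nr);      - size both tables
 *	ioctl(vm_fd, KVM_ASSIGN_SET_MSIX_ENTRY, &e);    - once per vector
 *	ioctl(vm_fd, KVM_ASSIGN_DEV_IRQ, &irq);         - with the MSIX flags
 */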
);