/*
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */
#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/file.h>
#include <linux/anon_inodes.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>
#define DRIVER_VERSION	"0.3"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"VFIO - User Level meta-driver"
static struct vfio {
	struct class			*class;
	struct list_head		iommu_drivers_list;
	struct mutex			iommu_drivers_lock;
	struct list_head		group_list;
	struct idr			group_idr;
	struct mutex			group_lock;
	struct cdev			group_cdev;
	dev_t				group_devt;
	wait_queue_head_t		release_q;
} vfio;
struct vfio_iommu_driver {
	const struct vfio_iommu_driver_ops	*ops;
	struct list_head			vfio_next;
};
struct vfio_container {
	struct kref			kref;
	struct list_head		group_list;
	struct rw_semaphore		group_lock;
	struct vfio_iommu_driver	*iommu_driver;
	void				*iommu_data;
	bool				noiommu;
};
struct vfio_unbound_dev {
	struct device			*dev;
	struct list_head		unbound_next;
};

struct vfio_group {
	struct kref			kref;
	int				minor;
	atomic_t			container_users;
	struct iommu_group		*iommu_group;
	struct vfio_container		*container;
	struct list_head		device_list;
	struct mutex			device_lock;
	struct notifier_block		nb;
	struct list_head		vfio_next;
	struct list_head		container_next;
	struct list_head		unbound_list;
	struct mutex			unbound_lock;
	atomic_t			opened;
	bool				noiommu;
	struct kvm			*kvm;
	struct blocking_notifier_head	notifier;
};
struct vfio_device {
	struct kref			kref;
	struct device			*dev;
	const struct vfio_device_ops	*ops;
	struct vfio_group		*group;
	struct list_head		group_next;
	void				*device_data;
};
#ifdef CONFIG_VFIO_NOIOMMU
static bool noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
		   noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
#endif
/*
 * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
 * and remove functions; any use case other than acquiring the first
 * reference for the purpose of calling vfio_add_group_dev() or removing
 * that symmetric reference after vfio_del_group_dev() should use the raw
 * iommu_group_{get,put} functions.  In particular, vfio_iommu_group_put()
 * removes the device from the dummy group and cannot be nested.
 */
struct iommu_group *vfio_iommu_group_get(struct device *dev)
{
	struct iommu_group *group;
	int __maybe_unused ret;

	group = iommu_group_get(dev);

#ifdef CONFIG_VFIO_NOIOMMU
	/*
	 * With noiommu enabled, an IOMMU group will be created for a device
	 * that doesn't already have one and doesn't have an iommu_ops on its
	 * bus.  We set iommudata simply to be able to identify these groups
	 * as special use and for reclamation later.
	 */
	if (group || !noiommu || iommu_present(dev->bus))
		return group;

	group = iommu_group_alloc();
	if (IS_ERR(group))
		return NULL;

	iommu_group_set_name(group, "vfio-noiommu");
	iommu_group_set_iommudata(group, &noiommu, NULL);
	ret = iommu_group_add_device(group, dev);
	iommu_group_put(group);
	if (ret)
		return NULL;

	/*
	 * Where to taint?  At this point we've added an IOMMU group for a
	 * device that is not backed by iommu_ops, therefore any iommu_
	 * callback using iommu_ops can legitimately Oops.  So, while we may
	 * be about to give a DMA capable device to a user without IOMMU
	 * protection, which is clearly taint-worthy, let's go ahead and do
	 * it here.
	 */
	add_taint(TAINT_USER, LOCKDEP_STILL_OK);
	dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
#endif

	return group;
}
EXPORT_SYMBOL_GPL(vfio_iommu_group_get);
void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
{
#ifdef CONFIG_VFIO_NOIOMMU
	if (iommu_group_get_iommudata(group) == &noiommu)
		iommu_group_remove_device(dev);
#endif

	iommu_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_iommu_group_put);
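
/*
 * Example (illustrative sketch, not part of this file): how a VFIO bus
 * driver's probe/remove paths are expected to pair the helpers above with
 * vfio_add_group_dev()/vfio_del_group_dev().  The my_* names and the
 * device_data pointer are assumptions for the example.
 *
 *	static int my_vfio_probe(struct device *dev)
 *	{
 *		struct iommu_group *group = vfio_iommu_group_get(dev);
 *		int ret;
 *
 *		if (!group)
 *			return -EINVAL;
 *
 *		ret = vfio_add_group_dev(dev, &my_vfio_dev_ops, my_device_data);
 *		if (ret)
 *			vfio_iommu_group_put(group, dev);
 *		return ret;
 *	}
 *
 *	static void my_vfio_remove(struct device *dev)
 *	{
 *		void *device_data = vfio_del_group_dev(dev);
 *
 *		vfio_iommu_group_put(dev->iommu_group, dev);
 *		kfree(device_data);
 *	}
 */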
#ifdef CONFIG_VFIO_NOIOMMU
static void *vfio_noiommu_open(unsigned long arg)
{
	if (arg != VFIO_NOIOMMU_IOMMU)
		return ERR_PTR(-EINVAL);
	if (!capable(CAP_SYS_RAWIO))
		return ERR_PTR(-EPERM);

	return NULL;
}

static void vfio_noiommu_release(void *iommu_data)
{
}

static long vfio_noiommu_ioctl(void *iommu_data,
			       unsigned int cmd, unsigned long arg)
{
	if (cmd == VFIO_CHECK_EXTENSION)
		return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;

	return -ENOTTY;
}

static int vfio_noiommu_attach_group(void *iommu_data,
				     struct iommu_group *iommu_group)
{
	return iommu_group_get_iommudata(iommu_group) == &noiommu ? 0 : -EINVAL;
}

static void vfio_noiommu_detach_group(void *iommu_data,
				      struct iommu_group *iommu_group)
{
}

static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
	.name = "vfio-noiommu",
	.owner = THIS_MODULE,
	.open = vfio_noiommu_open,
	.release = vfio_noiommu_release,
	.ioctl = vfio_noiommu_ioctl,
	.attach_group = vfio_noiommu_attach_group,
	.detach_group = vfio_noiommu_detach_group,
};
#endif
/*
 * IOMMU driver registration
 */
int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *driver, *tmp;

	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
	if (!driver)
		return -ENOMEM;

	driver->ops = ops;

	mutex_lock(&vfio.iommu_drivers_lock);

	/* Check for duplicates */
	list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
		if (tmp->ops == ops) {
			mutex_unlock(&vfio.iommu_drivers_lock);
			kfree(driver);
			return -EINVAL;
		}
	}

	list_add(&driver->vfio_next, &vfio.iommu_drivers_list);

	mutex_unlock(&vfio.iommu_drivers_lock);

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *driver;

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		if (driver->ops == ops) {
			list_del(&driver->vfio_next);
			mutex_unlock(&vfio.iommu_drivers_lock);
			kfree(driver);
			return;
		}
	}
	mutex_unlock(&vfio.iommu_drivers_lock);
}
EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
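
/*
 * Example (illustrative sketch, not part of this file): an IOMMU backend
 * module registering itself, with the same ops layout used by
 * vfio_noiommu_ops above.  The my_* table and callbacks are assumptions
 * for the example.
 *
 *	static const struct vfio_iommu_driver_ops my_iommu_ops = {
 *		.name		= "my-iommu-backend",
 *		.owner		= THIS_MODULE,
 *		.open		= my_iommu_open,
 *		.release	= my_iommu_release,
 *		.ioctl		= my_iommu_ioctl,
 *		.attach_group	= my_iommu_attach_group,
 *		.detach_group	= my_iommu_detach_group,
 *	};
 *
 *	static int __init my_iommu_init(void)
 *	{
 *		return vfio_register_iommu_driver(&my_iommu_ops);
 *	}
 *
 *	static void __exit my_iommu_exit(void)
 *	{
 *		vfio_unregister_iommu_driver(&my_iommu_ops);
 *	}
 */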
/*
 * Group minor allocation/free - both called with vfio.group_lock held
 */
static int vfio_alloc_group_minor(struct vfio_group *group)
{
	return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
}

static void vfio_free_group_minor(int minor)
{
	idr_remove(&vfio.group_idr, minor);
}
static int vfio_iommu_group_notifier(struct notifier_block *nb,
				     unsigned long action, void *data);
static void vfio_group_get(struct vfio_group *group);
/*
 * Container objects - containers are created when /dev/vfio/vfio is
 * opened, but their lifecycle extends until the last user is done, so
 * it's freed via kref.  Must support container/group/device being
 * closed in any order.
 */
static void vfio_container_get(struct vfio_container *container)
{
	kref_get(&container->kref);
}

static void vfio_container_release(struct kref *kref)
{
	struct vfio_container *container;
	container = container_of(kref, struct vfio_container, kref);

	kfree(container);
}

static void vfio_container_put(struct vfio_container *container)
{
	kref_put(&container->kref, vfio_container_release);
}
static void vfio_group_unlock_and_free(struct vfio_group *group)
{
	mutex_unlock(&vfio.group_lock);
	/*
	 * Unregister outside of lock.  A spurious callback is harmless now
	 * that the group is no longer in vfio.group_list.
	 */
	iommu_group_unregister_notifier(group->iommu_group, &group->nb);
	kfree(group);
}
/*
 * Group objects - create, release, get, put, search
 */
static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
{
	struct vfio_group *group, *tmp;
	struct device *dev;
	int ret, minor;

	group = kzalloc(sizeof(*group), GFP_KERNEL);
	if (!group)
		return ERR_PTR(-ENOMEM);

	kref_init(&group->kref);
	INIT_LIST_HEAD(&group->device_list);
	mutex_init(&group->device_lock);
	INIT_LIST_HEAD(&group->unbound_list);
	mutex_init(&group->unbound_lock);
	atomic_set(&group->container_users, 0);
	atomic_set(&group->opened, 0);
	group->iommu_group = iommu_group;
#ifdef CONFIG_VFIO_NOIOMMU
	group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
#endif
	BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);

	group->nb.notifier_call = vfio_iommu_group_notifier;

	/*
	 * blocking notifiers acquire a rwsem around registering and hold
	 * it around callback.  Therefore, need to register outside of
	 * vfio.group_lock to avoid A-B/B-A contention.  Our callback won't
	 * do anything unless it can find the group in vfio.group_list, so
	 * no harm in registering early.
	 */
	ret = iommu_group_register_notifier(iommu_group, &group->nb);
	if (ret) {
		kfree(group);
		return ERR_PTR(ret);
	}

	mutex_lock(&vfio.group_lock);

	/* Did we race creating this group? */
	list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
		if (tmp->iommu_group == iommu_group) {
			vfio_group_get(tmp);
			vfio_group_unlock_and_free(group);
			return tmp;
		}
	}

	minor = vfio_alloc_group_minor(group);
	if (minor < 0) {
		vfio_group_unlock_and_free(group);
		return ERR_PTR(minor);
	}

	dev = device_create(vfio.class, NULL,
			    MKDEV(MAJOR(vfio.group_devt), minor),
			    group, "%s%d", group->noiommu ? "noiommu-" : "",
			    iommu_group_id(iommu_group));
	if (IS_ERR(dev)) {
		vfio_free_group_minor(minor);
		vfio_group_unlock_and_free(group);
		return (struct vfio_group *)dev; /* ERR_PTR */
	}

	group->minor = minor;

	list_add(&group->vfio_next, &vfio.group_list);

	mutex_unlock(&vfio.group_lock);

	return group;
}
/* called with vfio.group_lock held */
static void vfio_group_release(struct kref *kref)
{
	struct vfio_group *group = container_of(kref, struct vfio_group, kref);
	struct vfio_unbound_dev *unbound, *tmp;
	struct iommu_group *iommu_group = group->iommu_group;

	WARN_ON(!list_empty(&group->device_list));

	list_for_each_entry_safe(unbound, tmp,
				 &group->unbound_list, unbound_next) {
		list_del(&unbound->unbound_next);
		kfree(unbound);
	}

	device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
	list_del(&group->vfio_next);
	vfio_free_group_minor(group->minor);
	vfio_group_unlock_and_free(group);
	iommu_group_put(iommu_group);
}
static void vfio_group_put(struct vfio_group *group)
{
	kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
}

/* Assume group_lock or group reference is held */
static void vfio_group_get(struct vfio_group *group)
{
	kref_get(&group->kref);
}
/*
 * Not really a try as we will sleep for mutex, but we need to make
 * sure the group pointer is valid under lock and get a reference.
 */
static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
{
	struct vfio_group *target = group;

	mutex_lock(&vfio.group_lock);
	list_for_each_entry(group, &vfio.group_list, vfio_next) {
		if (group == target) {
			vfio_group_get(group);
			mutex_unlock(&vfio.group_lock);
			return group;
		}
	}
	mutex_unlock(&vfio.group_lock);

	return NULL;
}
static
struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
{
	struct vfio_group *group;

	mutex_lock(&vfio.group_lock);
	list_for_each_entry(group, &vfio.group_list, vfio_next) {
		if (group->iommu_group == iommu_group) {
			vfio_group_get(group);
			mutex_unlock(&vfio.group_lock);
			return group;
		}
	}
	mutex_unlock(&vfio.group_lock);

	return NULL;
}
static struct vfio_group *vfio_group_get_from_minor(int minor)
{
	struct vfio_group *group;

	mutex_lock(&vfio.group_lock);
	group = idr_find(&vfio.group_idr, minor);
	if (!group) {
		mutex_unlock(&vfio.group_lock);
		return NULL;
	}
	vfio_group_get(group);
	mutex_unlock(&vfio.group_lock);

	return group;
}
static struct vfio_group *vfio_group_get_from_dev(struct device *dev)
{
	struct iommu_group *iommu_group;
	struct vfio_group *group;

	iommu_group = iommu_group_get(dev);
	if (!iommu_group)
		return NULL;

	group = vfio_group_get_from_iommu(iommu_group);
	iommu_group_put(iommu_group);

	return group;
}
/*
 * Device objects - create, release, get, put, search
 */
static
struct vfio_device *vfio_group_create_device(struct vfio_group *group,
					     struct device *dev,
					     const struct vfio_device_ops *ops,
					     void *device_data)
{
	struct vfio_device *device;

	device = kzalloc(sizeof(*device), GFP_KERNEL);
	if (!device)
		return ERR_PTR(-ENOMEM);

	kref_init(&device->kref);
	device->dev = dev;
	device->group = group;
	device->ops = ops;
	device->device_data = device_data;
	dev_set_drvdata(dev, device);

	/* No need to get group_lock, caller has group reference */
	vfio_group_get(group);

	mutex_lock(&group->device_lock);
	list_add(&device->group_next, &group->device_list);
	mutex_unlock(&group->device_lock);

	return device;
}
static void vfio_device_release(struct kref *kref)
{
	struct vfio_device *device = container_of(kref,
						  struct vfio_device, kref);
	struct vfio_group *group = device->group;

	list_del(&device->group_next);
	mutex_unlock(&group->device_lock);

	dev_set_drvdata(device->dev, NULL);

	kfree(device);

	/* vfio_del_group_dev may be waiting for this device */
	wake_up(&vfio.release_q);
}
/* Device reference always implies a group reference */
void vfio_device_put(struct vfio_device *device)
{
	struct vfio_group *group = device->group;
	kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
	vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_device_put);

static void vfio_device_get(struct vfio_device *device)
{
	vfio_group_get(device->group);
	kref_get(&device->kref);
}
static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
						 struct device *dev)
{
	struct vfio_device *device;

	mutex_lock(&group->device_lock);
	list_for_each_entry(device, &group->device_list, group_next) {
		if (device->dev == dev) {
			vfio_device_get(device);
			mutex_unlock(&group->device_lock);
			return device;
		}
	}
	mutex_unlock(&group->device_lock);

	return NULL;
}
/*
 * Some drivers, like pci-stub, are only used to prevent other drivers from
 * claiming a device and are therefore perfectly legitimate for a user owned
 * group.  The pci-stub driver has no dependencies on DMA or the IOVA mapping
 * of the device, but it does prevent the user from having direct access to
 * the device, which is useful in some circumstances.
 *
 * We also assume that we can include PCI interconnect devices, ie. bridges.
 * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
 * then all of the downstream devices will be part of the same IOMMU group as
 * the bridge.  Thus, if placing the bridge into the user owned IOVA space
 * breaks anything, it only does so for user owned devices downstream.  Note
 * that error notification via MSI can be affected for platforms that handle
 * MSI within the same IOVA space as DMA.
 */
static const char * const vfio_driver_whitelist[] = { "pci-stub" };

static bool vfio_dev_whitelisted(struct device *dev, struct device_driver *drv)
{
	int i;

	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
			return true;
	}

	for (i = 0; i < ARRAY_SIZE(vfio_driver_whitelist); i++) {
		if (!strcmp(drv->name, vfio_driver_whitelist[i]))
			return true;
	}

	return false;
}
/*
 * A vfio group is viable for use by userspace if all devices are in
 * one of the following states:
 *  - driver-less
 *  - bound to a vfio driver
 *  - bound to a whitelisted driver
 *  - a PCI interconnect device
 *
 * We use two methods to determine whether a device is bound to a vfio
 * driver.  The first is to test whether the device exists in the vfio
 * group.  The second is to test if the device exists on the group
 * unbound_list, indicating it's in the middle of transitioning from
 * a vfio driver to driver-less.
 */
static int vfio_dev_viable(struct device *dev, void *data)
{
	struct vfio_group *group = data;
	struct vfio_device *device;
	struct device_driver *drv = ACCESS_ONCE(dev->driver);
	struct vfio_unbound_dev *unbound;
	int ret = -EINVAL;

	mutex_lock(&group->unbound_lock);
	list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
		if (dev == unbound->dev) {
			ret = 0;
			break;
		}
	}
	mutex_unlock(&group->unbound_lock);

	if (!ret || !drv || vfio_dev_whitelisted(dev, drv))
		return 0;

	device = vfio_group_get_device(group, dev);
	if (device) {
		vfio_device_put(device);
		return 0;
	}

	return ret;
}
/*
 * Async device support
 */
static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
{
	struct vfio_device *device;

	/* Do we already know about it?  We shouldn't */
	device = vfio_group_get_device(group, dev);
	if (WARN_ON_ONCE(device)) {
		vfio_device_put(device);
		return 0;
	}

	/* Nothing to do for idle groups */
	if (!atomic_read(&group->container_users))
		return 0;

	/* TODO Prevent device auto probing */
	WARN(1, "Device %s added to live group %d!\n", dev_name(dev),
	     iommu_group_id(group->iommu_group));

	return 0;
}
static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
{
	/* We don't care what happens when the group isn't in use */
	if (!atomic_read(&group->container_users))
		return 0;

	return vfio_dev_viable(dev, group);
}
static int vfio_iommu_group_notifier(struct notifier_block *nb,
				     unsigned long action, void *data)
{
	struct vfio_group *group = container_of(nb, struct vfio_group, nb);
	struct device *dev = data;
	struct vfio_unbound_dev *unbound;

	/*
	 * Need to go through a group_lock lookup to get a reference or we
	 * risk racing a group being removed.  Ignore spurious notifies.
	 */
	group = vfio_group_try_get(group);
	if (!group)
		return NOTIFY_OK;

	switch (action) {
	case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
		vfio_group_nb_add_dev(group, dev);
		break;
	case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
		/*
		 * Nothing to do here.  If the device is in use, then the
		 * vfio sub-driver should block the remove callback until
		 * it is unused.  If the device is unused or attached to a
		 * stub driver, then it should be released and we don't
		 * care that it will be going away.
		 */
		break;
	case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
		pr_debug("%s: Device %s, group %d binding to driver\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group));
		break;
	case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
		pr_debug("%s: Device %s, group %d bound to driver %s\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group), dev->driver->name);
		BUG_ON(vfio_group_nb_verify(group, dev));
		break;
	case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
		pr_debug("%s: Device %s, group %d unbinding from driver %s\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group), dev->driver->name);
		break;
	case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
		pr_debug("%s: Device %s, group %d unbound from driver\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group));
		/*
		 * XXX An unbound device in a live group is ok, but we'd
		 * really like to avoid the above BUG_ON by preventing other
		 * drivers from binding to it.  Once that occurs, we have to
		 * stop the system to maintain isolation.  At a minimum, we'd
		 * want a toggle to disable driver auto probe for this device.
		 */

		mutex_lock(&group->unbound_lock);
		list_for_each_entry(unbound,
				    &group->unbound_list, unbound_next) {
			if (dev == unbound->dev) {
				list_del(&unbound->unbound_next);
				kfree(unbound);
				break;
			}
		}
		mutex_unlock(&group->unbound_lock);
		break;
	}

	vfio_group_put(group);
	return NOTIFY_OK;
}
int vfio_add_group_dev(struct device *dev,
		       const struct vfio_device_ops *ops, void *device_data)
{
	struct iommu_group *iommu_group;
	struct vfio_group *group;
	struct vfio_device *device;

	iommu_group = iommu_group_get(dev);
	if (!iommu_group)
		return -EINVAL;

	group = vfio_group_get_from_iommu(iommu_group);
	if (!group) {
		group = vfio_create_group(iommu_group);
		if (IS_ERR(group)) {
			iommu_group_put(iommu_group);
			return PTR_ERR(group);
		}
	} else {
		/*
		 * A found vfio_group already holds a reference to the
		 * iommu_group.  A created vfio_group keeps the reference.
		 */
		iommu_group_put(iommu_group);
	}

	device = vfio_group_get_device(group, dev);
	if (device) {
		WARN(1, "Device %s already exists on group %d\n",
		     dev_name(dev), iommu_group_id(iommu_group));
		vfio_device_put(device);
		vfio_group_put(group);
		return -EBUSY;
	}

	device = vfio_group_create_device(group, dev, ops, device_data);
	if (IS_ERR(device)) {
		vfio_group_put(group);
		return PTR_ERR(device);
	}

	/*
	 * Drop all but the vfio_device reference.  The vfio_device holds
	 * a reference to the vfio_group, which holds a reference to the
	 * iommu_group.
	 */
	vfio_group_put(group);

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_add_group_dev);
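
/*
 * Example (illustrative sketch, not part of this file): the shape of the
 * vfio_device_ops table a bus driver passes to vfio_add_group_dev().  These
 * callbacks are the ones dispatched by vfio_device_fops and the
 * vfio_del_group_dev() request path later in this file; the my_* names are
 * assumptions for the example.
 *
 *	static const struct vfio_device_ops my_vfio_dev_ops = {
 *		.name		= "my-vfio-bus-driver",
 *		.open		= my_open,	// device fd opened
 *		.release	= my_release,	// last device fd use dropped
 *		.ioctl		= my_ioctl,	// VFIO_DEVICE_* ioctls
 *		.read		= my_read,
 *		.write		= my_write,
 *		.mmap		= my_mmap,
 *		.request	= my_request,	// asked to give up the device
 *	};
 *
 *	err = vfio_add_group_dev(dev, &my_vfio_dev_ops, my_device_data);
 */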
/*
 * Get a reference to the vfio_device for a device.  Even if the
 * caller thinks they own the device, they could be racing with a
 * release call path, so we can't trust drvdata for the shortcut.
 * Go the long way around, from the iommu_group to the vfio_group
 * to the vfio_device.
 */
struct vfio_device *vfio_device_get_from_dev(struct device *dev)
{
	struct vfio_group *group;
	struct vfio_device *device;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return NULL;

	device = vfio_group_get_device(group, dev);
	vfio_group_put(group);

	return device;
}
EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
						     char *buf)
{
	struct vfio_device *it, *device = NULL;

	mutex_lock(&group->device_lock);
	list_for_each_entry(it, &group->device_list, group_next) {
		if (!strcmp(dev_name(it->dev), buf)) {
			device = it;
			vfio_device_get(device);
			break;
		}
	}
	mutex_unlock(&group->device_lock);

	return device;
}
/*
 * Caller must hold a reference to the vfio_device
 */
void *vfio_device_data(struct vfio_device *device)
{
	return device->device_data;
}
EXPORT_SYMBOL_GPL(vfio_device_data);
/* Given a referenced group, check if it contains the device */
static bool vfio_dev_present(struct vfio_group *group, struct device *dev)
{
	struct vfio_device *device;

	device = vfio_group_get_device(group, dev);
	if (!device)
		return false;

	vfio_device_put(device);
	return true;
}
/*
 * Decrement the device reference count and wait for the device to be
 * removed.  Open file descriptors for the device... */
void *vfio_del_group_dev(struct device *dev)
{
	struct vfio_device *device = dev_get_drvdata(dev);
	struct vfio_group *group = device->group;
	void *device_data = device->device_data;
	struct vfio_unbound_dev *unbound;
	unsigned int i = 0;
	long ret;
	bool interrupted = false;

	/*
	 * The group exists so long as we have a device reference.  Get
	 * a group reference and use it to scan for the device going away.
	 */
	vfio_group_get(group);

	/*
	 * When the device is removed from the group, the group suddenly
	 * becomes non-viable; the device has a driver (until the unbind
	 * completes), but it's not present in the group.  This is bad news
	 * for any external users that need to re-acquire a group reference
	 * in order to match and release their existing reference.  To
	 * solve this, we track such devices on the unbound_list to bridge
	 * the gap until they're fully unbound.
	 */
	unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
	if (unbound) {
		unbound->dev = dev;
		mutex_lock(&group->unbound_lock);
		list_add(&unbound->unbound_next, &group->unbound_list);
		mutex_unlock(&group->unbound_lock);
	}
	WARN_ON(!unbound);

	vfio_device_put(device);

	/*
	 * If the device is still present in the group after the above
	 * 'put', then it is in use and we need to request it from the
	 * bus driver.  The driver may in turn need to request the
	 * device from the user.  We send the request on an arbitrary
	 * interval with counter to allow the driver to take escalating
	 * measures to release the device if it has the ability to do so.
	 */
	do {
		device = vfio_group_get_device(group, dev);
		if (!device)
			break;

		if (device->ops->request)
			device->ops->request(device_data, i++);

		vfio_device_put(device);

		if (interrupted) {
			ret = wait_event_timeout(vfio.release_q,
					!vfio_dev_present(group, dev), HZ * 10);
		} else {
			ret = wait_event_interruptible_timeout(vfio.release_q,
					!vfio_dev_present(group, dev), HZ * 10);
			if (ret == -ERESTARTSYS) {
				interrupted = true;
				dev_warn(dev,
					 "Device is currently in use, task"
					 " \"%s\" (%d) "
					 "blocked until device is released",
					 current->comm, task_pid_nr(current));
			}
		}
	} while (ret <= 0);

	vfio_group_put(group);

	return device_data;
}
EXPORT_SYMBOL_GPL(vfio_del_group_dev);
/*
 * VFIO base fd, /dev/vfio/vfio
 */
static long vfio_ioctl_check_extension(struct vfio_container *container,
				       unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = 0;

	down_read(&container->group_lock);

	driver = container->iommu_driver;

	switch (arg) {
		/* No base extensions yet */
	default:
		/*
		 * If no driver is set, poll all registered drivers for
		 * extensions and return the first positive result.  If
		 * a driver is already set, further queries will be passed
		 * only to that driver.
		 */
		if (!driver) {
			mutex_lock(&vfio.iommu_drivers_lock);
			list_for_each_entry(driver, &vfio.iommu_drivers_list,
					    vfio_next) {

#ifdef CONFIG_VFIO_NOIOMMU
				if (!list_empty(&container->group_list) &&
				    (container->noiommu !=
				     (driver->ops == &vfio_noiommu_ops)))
					continue;
#endif

				if (!try_module_get(driver->ops->owner))
					continue;

				ret = driver->ops->ioctl(NULL,
							 VFIO_CHECK_EXTENSION,
							 arg);
				module_put(driver->ops->owner);
				if (ret > 0)
					break;
			}
			mutex_unlock(&vfio.iommu_drivers_lock);
		} else
			ret = driver->ops->ioctl(container->iommu_data,
						 VFIO_CHECK_EXTENSION, arg);
	}

	up_read(&container->group_lock);

	return ret;
}
/* hold write lock on container->group_lock */
static int __vfio_container_attach_groups(struct vfio_container *container,
					  struct vfio_iommu_driver *driver,
					  void *data)
{
	struct vfio_group *group;
	int ret = -ENODEV;

	list_for_each_entry(group, &container->group_list, container_next) {
		ret = driver->ops->attach_group(data, group->iommu_group);
		if (ret)
			goto unwind;
	}

	return ret;

unwind:
	list_for_each_entry_continue_reverse(group, &container->group_list,
					     container_next) {
		driver->ops->detach_group(data, group->iommu_group);
	}

	return ret;
}
static long vfio_ioctl_set_iommu(struct vfio_container *container,
				 unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = -ENODEV;

	down_write(&container->group_lock);

	/*
	 * The container is designed to be an unprivileged interface while
	 * the group can be assigned to specific users.  Therefore, only by
	 * adding a group to a container does the user get the privilege of
	 * enabling the iommu, which may allocate finite resources.  There
	 * is no unset_iommu, but by removing all the groups from a container,
	 * the container is deprivileged and returns to an unset state.
	 */
	if (list_empty(&container->group_list) || container->iommu_driver) {
		up_write(&container->group_lock);
		return -EINVAL;
	}

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		void *data;

#ifdef CONFIG_VFIO_NOIOMMU
		/*
		 * Only noiommu containers can use vfio-noiommu and noiommu
		 * containers can only use vfio-noiommu.
		 */
		if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
			continue;
#endif

		if (!try_module_get(driver->ops->owner))
			continue;

		/*
		 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
		 * so test which iommu driver reported support for this
		 * extension and call open on them.  We also pass them the
		 * magic, allowing a single driver to support multiple
		 * interfaces if they'd like.
		 */
		if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
			module_put(driver->ops->owner);
			continue;
		}

		data = driver->ops->open(arg);
		if (IS_ERR(data)) {
			ret = PTR_ERR(data);
			module_put(driver->ops->owner);
			continue;
		}

		ret = __vfio_container_attach_groups(container, driver, data);
		if (ret) {
			driver->ops->release(data);
			module_put(driver->ops->owner);
			continue;
		}

		container->iommu_driver = driver;
		container->iommu_data = data;
		break;
	}

	mutex_unlock(&vfio.iommu_drivers_lock);
	up_write(&container->group_lock);

	return ret;
}
static long vfio_fops_unl_ioctl(struct file *filep,
				unsigned int cmd, unsigned long arg)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	void *data;
	long ret = -EINVAL;

	if (!container)
		return ret;

	switch (cmd) {
	case VFIO_GET_API_VERSION:
		ret = VFIO_API_VERSION;
		break;
	case VFIO_CHECK_EXTENSION:
		ret = vfio_ioctl_check_extension(container, arg);
		break;
	case VFIO_SET_IOMMU:
		ret = vfio_ioctl_set_iommu(container, arg);
		break;
	default:
		down_read(&container->group_lock);

		driver = container->iommu_driver;
		data = container->iommu_data;

		if (driver) /* passthrough all unrecognized ioctls */
			ret = driver->ops->ioctl(data, cmd, arg);

		up_read(&container->group_lock);
	}

	return ret;
}
#ifdef CONFIG_COMPAT
static long vfio_fops_compat_ioctl(struct file *filep,
				   unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */
static int vfio_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_container *container;

	container = kzalloc(sizeof(*container), GFP_KERNEL);
	if (!container)
		return -ENOMEM;

	INIT_LIST_HEAD(&container->group_list);
	init_rwsem(&container->group_lock);
	kref_init(&container->kref);

	filep->private_data = container;

	return 0;
}
static int vfio_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_container *container = filep->private_data;

	filep->private_data = NULL;

	vfio_container_put(container);

	return 0;
}
/*
 * Once an iommu driver is set, we optionally pass read/write/mmap
 * on to the driver, allowing management interfaces beyond ioctl.
 */
static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
			      size_t count, loff_t *ppos)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	ssize_t ret = -EINVAL;

	down_read(&container->group_lock);

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->read))
		ret = driver->ops->read(container->iommu_data,
					buf, count, ppos);

	up_read(&container->group_lock);

	return ret;
}
static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
			       size_t count, loff_t *ppos)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	ssize_t ret = -EINVAL;

	down_read(&container->group_lock);

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->write))
		ret = driver->ops->write(container->iommu_data,
					 buf, count, ppos);

	up_read(&container->group_lock);

	return ret;
}
static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	int ret = -EINVAL;

	down_read(&container->group_lock);

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->mmap))
		ret = driver->ops->mmap(container->iommu_data, vma);

	up_read(&container->group_lock);

	return ret;
}
static const struct file_operations vfio_fops = {
	.owner		= THIS_MODULE,
	.open		= vfio_fops_open,
	.release	= vfio_fops_release,
	.read		= vfio_fops_read,
	.write		= vfio_fops_write,
	.unlocked_ioctl	= vfio_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vfio_fops_compat_ioctl,
#endif
	.mmap		= vfio_fops_mmap,
};
/*
 * VFIO Group fd, /dev/vfio/$GROUP
 */
static void __vfio_group_unset_container(struct vfio_group *group)
{
	struct vfio_container *container = group->container;
	struct vfio_iommu_driver *driver;

	down_write(&container->group_lock);

	driver = container->iommu_driver;
	if (driver)
		driver->ops->detach_group(container->iommu_data,
					  group->iommu_group);

	group->container = NULL;
	list_del(&group->container_next);

	/* Detaching the last group deprivileges a container, remove iommu */
	if (driver && list_empty(&container->group_list)) {
		driver->ops->release(container->iommu_data);
		module_put(driver->ops->owner);
		container->iommu_driver = NULL;
		container->iommu_data = NULL;
	}

	up_write(&container->group_lock);

	vfio_container_put(container);
}
/*
 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
 * if there was no container to unset.  Since the ioctl is called on
 * the group, we know that still exists, therefore the only valid
 * transition here is 1->0.
 */
static int vfio_group_unset_container(struct vfio_group *group)
{
	int users = atomic_cmpxchg(&group->container_users, 1, 0);

	if (!users)
		return -EINVAL;
	if (users != 1)
		return -EBUSY;

	__vfio_group_unset_container(group);

	return 0;
}
/*
 * When removing container users, anything that removes the last user
 * implicitly removes the group from the container.  That is, if the
 * group file descriptor is closed, as well as any device file descriptors,
 * the group is free.
 */
static void vfio_group_try_dissolve_container(struct vfio_group *group)
{
	if (0 == atomic_dec_if_positive(&group->container_users))
		__vfio_group_unset_container(group);
}
static int vfio_group_set_container(struct vfio_group *group, int container_fd)
{
	struct fd f;
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret = 0;

	if (atomic_read(&group->container_users))
		return -EINVAL;

	if (group->noiommu && !capable(CAP_SYS_RAWIO))
		return -EPERM;

	f = fdget(container_fd);
	if (!f.file)
		return -EBADF;

	/* Sanity check, is this really our fd? */
	if (f.file->f_op != &vfio_fops) {
		fdput(f);
		return -EINVAL;
	}

	container = f.file->private_data;
	WARN_ON(!container); /* fget ensures we don't race vfio_release */

	down_write(&container->group_lock);

	/* Real groups and fake groups cannot mix */
	if (!list_empty(&container->group_list) &&
	    container->noiommu != group->noiommu) {
		ret = -EPERM;
		goto unlock_out;
	}

	driver = container->iommu_driver;
	if (driver) {
		ret = driver->ops->attach_group(container->iommu_data,
						group->iommu_group);
		if (ret)
			goto unlock_out;
	}

	group->container = container;
	container->noiommu = group->noiommu;
	list_add(&group->container_next, &container->group_list);

	/* Get a reference on the container and mark a user within the group */
	vfio_container_get(container);
	atomic_inc(&group->container_users);

unlock_out:
	up_write(&container->group_lock);
	fdput(f);
	return ret;
}
static bool vfio_group_viable(struct vfio_group *group)
{
	return (iommu_group_for_each_dev(group->iommu_group,
					 group, vfio_dev_viable) == 0);
}
static int vfio_group_add_container_user(struct vfio_group *group)
{
	if (!atomic_inc_not_zero(&group->container_users))
		return -EINVAL;

	if (group->noiommu) {
		atomic_dec(&group->container_users);
		return -EPERM;
	}
	if (!group->container->iommu_driver || !vfio_group_viable(group)) {
		atomic_dec(&group->container_users);
		return -EINVAL;
	}

	return 0;
}
static const struct file_operations vfio_device_fops;

static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
{
	struct vfio_device *device;
	struct file *filep;
	int ret;

	if (0 == atomic_read(&group->container_users) ||
	    !group->container->iommu_driver || !vfio_group_viable(group))
		return -EINVAL;

	if (group->noiommu && !capable(CAP_SYS_RAWIO))
		return -EPERM;

	device = vfio_device_get_from_name(group, buf);
	if (!device)
		return -ENODEV;

	ret = device->ops->open(device->device_data);
	if (ret) {
		vfio_device_put(device);
		return ret;
	}

	/*
	 * We can't use anon_inode_getfd() because we need to modify
	 * the f_mode flags directly to allow more than just ioctls
	 */
	ret = get_unused_fd_flags(O_CLOEXEC);
	if (ret < 0) {
		device->ops->release(device->device_data);
		vfio_device_put(device);
		return ret;
	}

	filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
				   device, O_RDWR);
	if (IS_ERR(filep)) {
		put_unused_fd(ret);
		ret = PTR_ERR(filep);
		device->ops->release(device->device_data);
		vfio_device_put(device);
		return ret;
	}

	/*
	 * TODO: add an anon_inode interface to do this.
	 * Appears to be missing by lack of need rather than
	 * explicitly prevented.  Now there's need.
	 */
	filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);

	atomic_inc(&group->container_users);

	fd_install(ret, filep);

	if (group->noiommu)
		dev_warn(device->dev, "vfio-noiommu device opened by user "
			 "(%s:%d)\n", current->comm, task_pid_nr(current));

	return ret;
}
static long vfio_group_fops_unl_ioctl(struct file *filep,
				      unsigned int cmd, unsigned long arg)
{
	struct vfio_group *group = filep->private_data;
	long ret = -ENOTTY;

	switch (cmd) {
	case VFIO_GROUP_GET_STATUS:
	{
		struct vfio_group_status status;
		unsigned long minsz;

		minsz = offsetofend(struct vfio_group_status, flags);

		if (copy_from_user(&status, (void __user *)arg, minsz))
			return -EFAULT;

		if (status.argsz < minsz)
			return -EINVAL;

		status.flags = 0;

		if (vfio_group_viable(group))
			status.flags |= VFIO_GROUP_FLAGS_VIABLE;

		if (group->container)
			status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;

		if (copy_to_user((void __user *)arg, &status, minsz))
			return -EFAULT;

		ret = 0;
		break;
	}
	case VFIO_GROUP_SET_CONTAINER:
	{
		int fd;

		if (get_user(fd, (int __user *)arg))
			return -EFAULT;

		if (fd < 0)
			return -EINVAL;

		ret = vfio_group_set_container(group, fd);
		break;
	}
	case VFIO_GROUP_UNSET_CONTAINER:
		ret = vfio_group_unset_container(group);
		break;
	case VFIO_GROUP_GET_DEVICE_FD:
	{
		char *buf;

		buf = strndup_user((const char __user *)arg, PAGE_SIZE);
		if (IS_ERR(buf))
			return PTR_ERR(buf);

		ret = vfio_group_get_device_fd(group, buf);
		kfree(buf);
		break;
	}
	}

	return ret;
}
#ifdef CONFIG_COMPAT
static long vfio_group_fops_compat_ioctl(struct file *filep,
					 unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_group_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */
static int vfio_group_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_group *group;
	int opened;

	group = vfio_group_get_from_minor(iminor(inode));
	if (!group)
		return -ENODEV;

	if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
		vfio_group_put(group);
		return -EPERM;
	}

	/* Do we need multiple instances of the group open?  Seems not. */
	opened = atomic_cmpxchg(&group->opened, 0, 1);
	if (opened) {
		vfio_group_put(group);
		return -EBUSY;
	}

	/* Is something still in use from a previous open? */
	if (group->container) {
		atomic_dec(&group->opened);
		vfio_group_put(group);
		return -EBUSY;
	}

	filep->private_data = group;

	return 0;
}
static int vfio_group_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_group *group = filep->private_data;

	filep->private_data = NULL;

	/* Any user didn't unregister? */
	WARN_ON(group->notifier.head);

	vfio_group_try_dissolve_container(group);

	atomic_dec(&group->opened);

	vfio_group_put(group);

	return 0;
}
static const struct file_operations vfio_group_fops = {
	.owner		= THIS_MODULE,
	.unlocked_ioctl	= vfio_group_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vfio_group_fops_compat_ioctl,
#endif
	.open		= vfio_group_fops_open,
	.release	= vfio_group_fops_release,
};
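
/*
 * Example (illustrative sketch, not part of this file): the userspace side
 * of the container/group/device handshake served by vfio_fops and
 * vfio_group_fops above.  Error handling is omitted; the group number and
 * device name are assumptions for the example.
 *
 *	int container = open("/dev/vfio/vfio", O_RDWR);
 *	if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
 *		return -1;
 *
 *	int group = open("/dev/vfio/26", O_RDWR);
 *	struct vfio_group_status status = { .argsz = sizeof(status) };
 *	ioctl(group, VFIO_GROUP_GET_STATUS, &status);
 *	if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE))
 *		return -1;
 *
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 *	int device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
 */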
/*
 * VFIO Device fd
 */
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_device *device = filep->private_data;

	device->ops->release(device->device_data);

	vfio_group_try_dissolve_container(device->group);

	vfio_device_put(device);

	return 0;
}
static long vfio_device_fops_unl_ioctl(struct file *filep,
				       unsigned int cmd, unsigned long arg)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->ioctl))
		return -EINVAL;

	return device->ops->ioctl(device->device_data, cmd, arg);
}
static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
				     size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->read))
		return -EINVAL;

	return device->ops->read(device->device_data, buf, count, ppos);
}
static ssize_t vfio_device_fops_write(struct file *filep,
				      const char __user *buf,
				      size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->write))
		return -EINVAL;

	return device->ops->write(device->device_data, buf, count, ppos);
}
static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->mmap))
		return -EINVAL;

	return device->ops->mmap(device->device_data, vma);
}
#ifdef CONFIG_COMPAT
static long vfio_device_fops_compat_ioctl(struct file *filep,
					  unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_device_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */
static const struct file_operations vfio_device_fops = {
	.owner		= THIS_MODULE,
	.release	= vfio_device_fops_release,
	.read		= vfio_device_fops_read,
	.write		= vfio_device_fops_write,
	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vfio_device_fops_compat_ioctl,
#endif
	.mmap		= vfio_device_fops_mmap,
};
/*
 * External user API, exported by symbols to be linked dynamically.
 *
 * The protocol includes:
 *  1. do normal VFIO init operation:
 *	- opening a new container;
 *	- attaching group(s) to it;
 *	- setting an IOMMU driver for a container.
 * When IOMMU is set for a container, all groups in it are
 * considered ready to use by an external user.
 *
 * 2. User space passes a group fd to an external user.
 * The external user calls vfio_group_get_external_user()
 * on the fd to verify that:
 *	- the group is initialized;
 *	- IOMMU is set for it.
 * If both checks pass, vfio_group_get_external_user()
 * increments the container user counter to prevent
 * the VFIO group from disposal before KVM exits.
 *
 * 3. The external user calls vfio_external_user_iommu_id()
 * to obtain the IOMMU group ID.
 *
 * 4. When the external KVM finishes, it calls
 * vfio_group_put_external_user() to release the VFIO group.
 * This call decrements the container user counter.
 */
struct vfio_group *vfio_group_get_external_user(struct file *filep)
{
	struct vfio_group *group = filep->private_data;
	int ret;

	if (filep->f_op != &vfio_group_fops)
		return ERR_PTR(-EINVAL);

	ret = vfio_group_add_container_user(group);
	if (ret)
		return ERR_PTR(ret);

	vfio_group_get(group);

	return group;
}
EXPORT_SYMBOL_GPL(vfio_group_get_external_user);

void vfio_group_put_external_user(struct vfio_group *group)
{
	vfio_group_try_dissolve_container(group);
	vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_group_put_external_user);

int vfio_external_user_iommu_id(struct vfio_group *group)
{
	return iommu_group_id(group->iommu_group);
}
EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);

long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
{
	return vfio_ioctl_check_extension(group->container, arg);
}
EXPORT_SYMBOL_GPL(vfio_external_check_extension);
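
/*
 * Example (illustrative sketch, not part of this file): how an in-kernel
 * external user such as a KVM coupling might hold a group reference for
 * its lifetime.  The group_file pointer is assumed to come from a group fd
 * passed in by userspace.
 *
 *	struct vfio_group *grp = vfio_group_get_external_user(group_file);
 *	if (IS_ERR(grp))
 *		return PTR_ERR(grp);
 *
 *	int id = vfio_external_user_iommu_id(grp);
 *	... use id to wire up the IOMMU group ...
 *
 *	vfio_group_put_external_user(grp);
 */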
/*
 * Sub-module support
 */
/*
 * Helper for managing a buffer of info chain capabilities: allocate or
 * reallocate a buffer with additional @size, filling in @id and @version
 * of the new capability.  A pointer to the new capability is returned.
 *
 * NB. The chain is based at the head of the buffer, so new entries are
 * added to the tail; vfio_info_cap_shift() should be called to fix up the
 * next offsets prior to copying to the user buffer.
 */
struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
					       size_t size, u16 id, u16 version)
{
	void *buf;
	struct vfio_info_cap_header *header, *tmp;

	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
	if (!buf) {
		kfree(caps->buf);
		caps->size = 0;
		return ERR_PTR(-ENOMEM);
	}

	caps->buf = buf;
	header = buf + caps->size;

	/* Eventually copied to user buffer, zero */
	memset(header, 0, size);

	header->id = id;
	header->version = version;

	/* Add to the end of the capability chain */
	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
		; /* nothing */

	tmp->next = caps->size;
	caps->size += size;

	return header;
}
EXPORT_SYMBOL_GPL(vfio_info_cap_add);
void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
{
	struct vfio_info_cap_header *tmp;
	void *buf = (void *)caps->buf;

	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
		tmp->next += offset;
}
EXPORT_SYMBOL(vfio_info_cap_shift);
static int sparse_mmap_cap(struct vfio_info_cap *caps, void *cap_type)
{
	struct vfio_info_cap_header *header;
	struct vfio_region_info_cap_sparse_mmap *sparse_cap, *sparse = cap_type;
	size_t size;

	size = sizeof(*sparse) + sparse->nr_areas * sizeof(*sparse->areas);
	header = vfio_info_cap_add(caps, size,
				   VFIO_REGION_INFO_CAP_SPARSE_MMAP, 1);
	if (IS_ERR(header))
		return PTR_ERR(header);

	sparse_cap = container_of(header,
			struct vfio_region_info_cap_sparse_mmap, header);
	sparse_cap->nr_areas = sparse->nr_areas;
	memcpy(sparse_cap->areas, sparse->areas,
	       sparse->nr_areas * sizeof(*sparse->areas));
	return 0;
}
static int region_type_cap(struct vfio_info_cap *caps, void *cap_type)
{
	struct vfio_info_cap_header *header;
	struct vfio_region_info_cap_type *type_cap, *cap = cap_type;

	header = vfio_info_cap_add(caps, sizeof(*cap),
				   VFIO_REGION_INFO_CAP_TYPE, 1);
	if (IS_ERR(header))
		return PTR_ERR(header);

	type_cap = container_of(header, struct vfio_region_info_cap_type,
				header);
	type_cap->type = cap->type;
	type_cap->subtype = cap->subtype;
	return 0;
}
int vfio_info_add_capability(struct vfio_info_cap *caps, int cap_type_id,
			     void *cap_type)
{
	int ret = -EINVAL;

	if (!cap_type)
		return 0;

	switch (cap_type_id) {
	case VFIO_REGION_INFO_CAP_SPARSE_MMAP:
		ret = sparse_mmap_cap(caps, cap_type);
		break;

	case VFIO_REGION_INFO_CAP_TYPE:
		ret = region_type_cap(caps, cap_type);
		break;
	}

	return ret;
}
EXPORT_SYMBOL(vfio_info_add_capability);
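
/*
 * Example (illustrative sketch, not part of this file): building a
 * capability chain for a region-info reply with the helpers above.  The
 * cap contents and the offset passed to vfio_info_cap_shift() are
 * assumptions for the example.
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *	struct vfio_region_info_cap_type cap_type = {
 *		.type = 1,
 *		.subtype = 1,
 *	};
 *	int ret;
 *
 *	ret = vfio_info_add_capability(&caps, VFIO_REGION_INFO_CAP_TYPE,
 *				       &cap_type);
 *	if (ret)
 *		return ret;
 *
 *	// Rebase the chain's next offsets past the fixed region_info header
 *	// before copying caps.buf out to the user buffer, then free it.
 *	vfio_info_cap_shift(&caps, sizeof(struct vfio_region_info));
 *	... copy_to_user() ...
 *	kfree(caps.buf);
 */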
int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
				       int max_irq_type, size_t *data_size)
{
	unsigned long minsz;
	size_t size;

	minsz = offsetofend(struct vfio_irq_set, count);

	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
	    (hdr->count >= (U32_MAX - hdr->start)) ||
	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
			    VFIO_IRQ_SET_ACTION_TYPE_MASK)))
		return -EINVAL;

	if (data_size)
		*data_size = 0;

	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
		return -EINVAL;

	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
	case VFIO_IRQ_SET_DATA_NONE:
		size = 0;
		break;
	case VFIO_IRQ_SET_DATA_BOOL:
		size = sizeof(uint8_t);
		break;
	case VFIO_IRQ_SET_DATA_EVENTFD:
		size = sizeof(int32_t);
		break;
	default:
		return -EINVAL;
	}

	if (size) {
		if (hdr->argsz - minsz < hdr->count * size)
			return -EINVAL;

		if (!data_size)
			return -EINVAL;

		*data_size = hdr->count * size;
	}

	return 0;
}
EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
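
/*
 * Example (illustrative sketch, not part of this file): a bus driver's
 * VFIO_DEVICE_SET_IRQS handler validating the user header with the helper
 * above before copying the variable-length payload.  minsz, num_irqs, arg
 * and the use of VFIO_PCI_NUM_IRQS are assumptions for the example.
 *
 *	struct vfio_irq_set hdr;
 *	size_t data_size = 0;
 *	u8 *data = NULL;
 *	int ret;
 *
 *	minsz = offsetofend(struct vfio_irq_set, count);
 *	if (copy_from_user(&hdr, (void __user *)arg, minsz))
 *		return -EFAULT;
 *
 *	ret = vfio_set_irqs_validate_and_prepare(&hdr, num_irqs,
 *						 VFIO_PCI_NUM_IRQS, &data_size);
 *	if (ret)
 *		return ret;
 *
 *	if (data_size) {
 *		data = memdup_user((void __user *)(arg + minsz), data_size);
 *		if (IS_ERR(data))
 *			return PTR_ERR(data);
 *	}
 */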
/*
 * Pin a set of guest PFNs and return their associated host PFNs for the
 * local domain only.
 * @dev [in]     : device
 * @user_pfn [in]: array of user/guest PFNs to be pinned.
 * @npage [in]   : count of elements in user_pfn array.  This count should
 *		   not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * @prot [in]    : protection flags
 * @phys_pfn[out]: array of host PFNs
 * Return error or number of pages pinned.
 */
int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
		   int prot, unsigned long *phys_pfn)
{
	struct vfio_container *container;
	struct vfio_group *group;
	struct vfio_iommu_driver *driver;
	int ret;

	if (!dev || !user_pfn || !phys_pfn || !npage)
		return -EINVAL;

	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
		return -E2BIG;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	ret = vfio_group_add_container_user(group);
	if (ret)
		goto err_pin_pages;

	container = group->container;
	down_read(&container->group_lock);

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->pin_pages))
		ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
					     npage, prot, phys_pfn);
	else
		ret = -ENOTTY;

	up_read(&container->group_lock);
	vfio_group_try_dissolve_container(group);

err_pin_pages:
	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_pin_pages);
/*
 * Unpin a set of host PFNs for the local domain only.
 * @dev [in]     : device
 * @user_pfn [in]: array of user/guest PFNs to be unpinned.  Number of
 *		   user/guest PFNs should not be greater than
 *		   VFIO_PIN_PAGES_MAX_ENTRIES.
 * @npage [in]   : count of elements in user_pfn array.  This count should
 *		   not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * Return error or number of pages unpinned.
 */
int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
{
	struct vfio_container *container;
	struct vfio_group *group;
	struct vfio_iommu_driver *driver;
	int ret;

	if (!dev || !user_pfn || !npage)
		return -EINVAL;

	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
		return -E2BIG;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	ret = vfio_group_add_container_user(group);
	if (ret)
		goto err_unpin_pages;

	container = group->container;
	down_read(&container->group_lock);

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->unpin_pages))
		ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
					       npage);
	else
		ret = -ENOTTY;

	up_read(&container->group_lock);
	vfio_group_try_dissolve_container(group);

err_unpin_pages:
	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_unpin_pages);
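
/*
 * Example (illustrative sketch, not part of this file): a vendor driver
 * (e.g. a mediated device driver) translating a guest PFN before
 * programming DMA.  The use of mdev_dev(), the gfn/hpfn variables and the
 * prot choice are assumptions for the example.
 *
 *	unsigned long gfn = ..., hpfn;
 *	int ret;
 *
 *	ret = vfio_pin_pages(mdev_dev(mdev), &gfn, 1,
 *			     IOMMU_READ | IOMMU_WRITE, &hpfn);
 *	if (ret != 1)
 *		return ret < 0 ? ret : -EFAULT;
 *
 *	... program hardware with hpfn << PAGE_SHIFT ...
 *
 *	vfio_unpin_pages(mdev_dev(mdev), &gfn, 1);
 */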
static int vfio_register_iommu_notifier(struct vfio_group *group,
					unsigned long *events,
					struct notifier_block *nb)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	container = group->container;
	down_read(&container->group_lock);

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->register_notifier))
		ret = driver->ops->register_notifier(container->iommu_data,
						     events, nb);
	else
		ret = -ENOTTY;

	up_read(&container->group_lock);
	vfio_group_try_dissolve_container(group);

	return ret;
}
static int vfio_unregister_iommu_notifier(struct vfio_group *group,
					  struct notifier_block *nb)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	container = group->container;
	down_read(&container->group_lock);

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->unregister_notifier))
		ret = driver->ops->unregister_notifier(container->iommu_data,
						       nb);
	else
		ret = -ENOTTY;

	up_read(&container->group_lock);
	vfio_group_try_dissolve_container(group);

	return ret;
}
void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
{
	group->kvm = kvm;
	blocking_notifier_call_chain(&group->notifier,
				VFIO_GROUP_NOTIFY_SET_KVM, kvm);
}
EXPORT_SYMBOL_GPL(vfio_group_set_kvm);
static int vfio_register_group_notifier(struct vfio_group *group,
					unsigned long *events,
					struct notifier_block *nb)
{
	struct vfio_container *container;
	int ret;
	bool set_kvm = false;

	if (*events & VFIO_GROUP_NOTIFY_SET_KVM)
		set_kvm = true;

	/* clear known events */
	*events &= ~VFIO_GROUP_NOTIFY_SET_KVM;

	/* refuse to continue if still events remaining */
	if (*events)
		return -EINVAL;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	container = group->container;
	down_read(&container->group_lock);

	ret = blocking_notifier_chain_register(&group->notifier, nb);

	/*
	 * The attach of kvm and vfio_group may already have happened, so
	 * here we replay it once upon registration.
	 */
	if (!ret && set_kvm && group->kvm)
		blocking_notifier_call_chain(&group->notifier,
					VFIO_GROUP_NOTIFY_SET_KVM, group->kvm);

	up_read(&container->group_lock);
	vfio_group_try_dissolve_container(group);

	return ret;
}
static int vfio_unregister_group_notifier(struct vfio_group *group,
					  struct notifier_block *nb)
{
	struct vfio_container *container;
	int ret;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	container = group->container;
	down_read(&container->group_lock);

	ret = blocking_notifier_chain_unregister(&group->notifier, nb);

	up_read(&container->group_lock);
	vfio_group_try_dissolve_container(group);

	return ret;
}
int vfio_register_notifier(struct device *dev, enum vfio_notify_type type,
			   unsigned long *events, struct notifier_block *nb)
{
	struct vfio_group *group;
	int ret;

	if (!dev || !nb || !events || (*events == 0))
		return -EINVAL;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	switch (type) {
	case VFIO_IOMMU_NOTIFY:
		ret = vfio_register_iommu_notifier(group, events, nb);
		break;
	case VFIO_GROUP_NOTIFY:
		ret = vfio_register_group_notifier(group, events, nb);
		break;
	default:
		ret = -EINVAL;
	}

	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_register_notifier);
int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type,
			     struct notifier_block *nb)
{
	struct vfio_group *group;
	int ret;

	if (!dev || !nb)
		return -EINVAL;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	switch (type) {
	case VFIO_IOMMU_NOTIFY:
		ret = vfio_unregister_iommu_notifier(group, nb);
		break;
	case VFIO_GROUP_NOTIFY:
		ret = vfio_unregister_group_notifier(group, nb);
		break;
	default:
		ret = -EINVAL;
	}

	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_unregister_notifier);
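
/*
 * Example (illustrative sketch, not part of this file): a vendor driver
 * subscribing to DMA unmap notifications from the IOMMU backend.  The
 * callback name and the assumption that the notifier data is a
 * struct vfio_iommu_type1_dma_unmap are illustrative only.
 *
 *	static int my_iommu_notify(struct notifier_block *nb,
 *				   unsigned long action, void *data)
 *	{
 *		if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) {
 *			struct vfio_iommu_type1_dma_unmap *unmap = data;
 *			// invalidate pinnings within [iova, iova + size)
 *		}
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block my_nb = {
 *		.notifier_call = my_iommu_notify,
 *	};
 *
 *	unsigned long events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
 *
 *	ret = vfio_register_notifier(dev, VFIO_IOMMU_NOTIFY, &events, &my_nb);
 *	...
 *	vfio_unregister_notifier(dev, VFIO_IOMMU_NOTIFY, &my_nb);
 */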
/*
 * Module/class support
 */
static char *vfio_devnode(struct device *dev, umode_t *mode)
{
	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
}

static struct miscdevice vfio_dev = {
	.minor = VFIO_MINOR,
	.name = "vfio",
	.fops = &vfio_fops,
	.nodename = "vfio/vfio",
	.mode = S_IRUGO | S_IWUGO,
};
static int __init vfio_init(void)
{
	int ret;

	idr_init(&vfio.group_idr);
	mutex_init(&vfio.group_lock);
	mutex_init(&vfio.iommu_drivers_lock);
	INIT_LIST_HEAD(&vfio.group_list);
	INIT_LIST_HEAD(&vfio.iommu_drivers_list);
	init_waitqueue_head(&vfio.release_q);

	ret = misc_register(&vfio_dev);
	if (ret) {
		pr_err("vfio: misc device register failed\n");
		return ret;
	}

	/* /dev/vfio/$GROUP */
	vfio.class = class_create(THIS_MODULE, "vfio");
	if (IS_ERR(vfio.class)) {
		ret = PTR_ERR(vfio.class);
		goto err_class;
	}

	vfio.class->devnode = vfio_devnode;

	ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK, "vfio");
	if (ret)
		goto err_alloc_chrdev;

	cdev_init(&vfio.group_cdev, &vfio_group_fops);
	ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK);
	if (ret)
		goto err_cdev_add;

	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");

	/*
	 * Attempt to load known iommu-drivers.  This gives us a working
	 * environment without the user needing to explicitly load iommu
	 * drivers.
	 */
	request_module_nowait("vfio_iommu_type1");
	request_module_nowait("vfio_iommu_spapr_tce");

#ifdef CONFIG_VFIO_NOIOMMU
	vfio_register_iommu_driver(&vfio_noiommu_ops);
#endif

	return 0;

err_cdev_add:
	unregister_chrdev_region(vfio.group_devt, MINORMASK);
err_alloc_chrdev:
	class_destroy(vfio.class);
err_class:
	misc_deregister(&vfio_dev);
	return ret;
}
static void __exit vfio_cleanup(void)
{
	WARN_ON(!list_empty(&vfio.group_list));

#ifdef CONFIG_VFIO_NOIOMMU
	vfio_unregister_iommu_driver(&vfio_noiommu_ops);
#endif
	idr_destroy(&vfio.group_idr);
	cdev_del(&vfio.group_cdev);
	unregister_chrdev_region(vfio.group_devt, MINORMASK);
	class_destroy(vfio.class);
	misc_deregister(&vfio_dev);
}

module_init(vfio_init);
module_exit(vfio_cleanup);
MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_ALIAS_MISCDEV(VFIO_MINOR);
MODULE_ALIAS("devname:vfio/vfio");