/*
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */
#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/file.h>
#include <linux/anon_inodes.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>

#define DRIVER_VERSION	"0.3"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"VFIO - User Level meta-driver"
static struct vfio {
	struct class		*class;
	struct list_head	iommu_drivers_list;
	struct mutex		iommu_drivers_lock;
	struct list_head	group_list;
	struct idr		group_idr;
	struct mutex		group_lock;
	struct cdev		group_cdev;
	dev_t			group_devt;
	wait_queue_head_t	release_q;
} vfio;
struct vfio_iommu_driver {
	const struct vfio_iommu_driver_ops	*ops;
	struct list_head			vfio_next;
};

struct vfio_container {
	struct kref			kref;
	struct list_head		group_list;
	struct rw_semaphore		group_lock;
	struct vfio_iommu_driver	*iommu_driver;
	void				*iommu_data;
	bool				noiommu;
};

struct vfio_unbound_dev {
	struct device			*dev;
	struct list_head		unbound_next;
};

struct vfio_group {
	struct kref			kref;
	int				minor;
	atomic_t			container_users;
	struct iommu_group		*iommu_group;
	struct vfio_container		*container;
	struct list_head		device_list;
	struct mutex			device_lock;
	struct notifier_block		nb;
	struct list_head		vfio_next;
	struct list_head		container_next;
	struct list_head		unbound_list;
	struct mutex			unbound_lock;
	atomic_t			opened;
	bool				noiommu;
	struct kvm			*kvm;
	struct blocking_notifier_head	notifier;
};

struct vfio_device {
	struct kref			kref;
	struct device			*dev;
	const struct vfio_device_ops	*ops;
	struct vfio_group		*group;
	struct list_head		group_next;
	void				*device_data;
};
#ifdef CONFIG_VFIO_NOIOMMU
static bool noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
		   noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
#endif
/*
 * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
 * and remove functions, any use cases other than acquiring the first
 * reference for the purpose of calling vfio_add_group_dev() or removing
 * that symmetric reference after vfio_del_group_dev() should use the raw
 * iommu_group_{get,put} functions.  In particular, vfio_iommu_group_put()
 * removes the device from the dummy group and cannot be nested.
 */
struct iommu_group *vfio_iommu_group_get(struct device *dev)
{
	struct iommu_group *group;
	int __maybe_unused ret;

	group = iommu_group_get(dev);

#ifdef CONFIG_VFIO_NOIOMMU
	/*
	 * With noiommu enabled, an IOMMU group will be created for a device
	 * that doesn't already have one and doesn't have an iommu_ops on their
	 * bus.  We set iommudata simply to be able to identify these groups
	 * as special use and for reclamation later.
	 */
	if (group || !noiommu || iommu_present(dev->bus))
		return group;

	group = iommu_group_alloc();
	if (IS_ERR(group))
		return NULL;

	iommu_group_set_name(group, "vfio-noiommu");
	iommu_group_set_iommudata(group, &noiommu, NULL);
	ret = iommu_group_add_device(group, dev);
	iommu_group_put(group);
	if (ret)
		return NULL;

	/*
	 * Where to taint?  At this point we've added an IOMMU group for a
	 * device that is not backed by iommu_ops, therefore any iommu_
	 * callback using iommu_ops can legitimately Oops.  So, while we may
	 * be about to give a DMA capable device to a user without IOMMU
	 * protection, which is clearly taint-worthy, let's go ahead and do
	 * it here.
	 */
	add_taint(TAINT_USER, LOCKDEP_STILL_OK);
	dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
#endif

	return group;
}
EXPORT_SYMBOL_GPL(vfio_iommu_group_get);
void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
{
#ifdef CONFIG_VFIO_NOIOMMU
	if (iommu_group_get_iommudata(group) == &noiommu)
		iommu_group_remove_device(dev);
#endif

	iommu_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_iommu_group_put);
#ifdef CONFIG_VFIO_NOIOMMU
static void *vfio_noiommu_open(unsigned long arg)
{
	if (arg != VFIO_NOIOMMU_IOMMU)
		return ERR_PTR(-EINVAL);
	if (!capable(CAP_SYS_RAWIO))
		return ERR_PTR(-EPERM);

	return NULL;
}

static void vfio_noiommu_release(void *iommu_data)
{
}

static long vfio_noiommu_ioctl(void *iommu_data,
			       unsigned int cmd, unsigned long arg)
{
	if (cmd == VFIO_CHECK_EXTENSION)
		return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;

	return -ENOTTY;
}

static int vfio_noiommu_attach_group(void *iommu_data,
				     struct iommu_group *iommu_group)
{
	return iommu_group_get_iommudata(iommu_group) == &noiommu ? 0 : -EINVAL;
}

static void vfio_noiommu_detach_group(void *iommu_data,
				      struct iommu_group *iommu_group)
{
}

static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
	.name = "vfio-noiommu",
	.owner = THIS_MODULE,
	.open = vfio_noiommu_open,
	.release = vfio_noiommu_release,
	.ioctl = vfio_noiommu_ioctl,
	.attach_group = vfio_noiommu_attach_group,
	.detach_group = vfio_noiommu_detach_group,
};
#endif
/**
 * IOMMU driver registration
 */
int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *driver, *tmp;

	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
	if (!driver)
		return -ENOMEM;

	driver->ops = ops;

	mutex_lock(&vfio.iommu_drivers_lock);

	/* Check for duplicates */
	list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
		if (tmp->ops == ops) {
			mutex_unlock(&vfio.iommu_drivers_lock);
			kfree(driver);
			return -EINVAL;
		}
	}

	list_add(&driver->vfio_next, &vfio.iommu_drivers_list);

	mutex_unlock(&vfio.iommu_drivers_lock);

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);

void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *driver;

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		if (driver->ops == ops) {
			list_del(&driver->vfio_next);
			mutex_unlock(&vfio.iommu_drivers_lock);
			kfree(driver);
			return;
		}
	}
	mutex_unlock(&vfio.iommu_drivers_lock);
}
EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
/**
 * Group minor allocation/free - both called with vfio.group_lock held
 */
static int vfio_alloc_group_minor(struct vfio_group *group)
{
	return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
}

static void vfio_free_group_minor(int minor)
{
	idr_remove(&vfio.group_idr, minor);
}

static int vfio_iommu_group_notifier(struct notifier_block *nb,
				     unsigned long action, void *data);
static void vfio_group_get(struct vfio_group *group);
/**
 * Container objects - containers are created when /dev/vfio/vfio is
 * opened, but their lifecycle extends until the last user is done, so
 * it's freed via kref.  Must support container/group/device being
 * closed in any order.
 */
static void vfio_container_get(struct vfio_container *container)
{
	kref_get(&container->kref);
}

static void vfio_container_release(struct kref *kref)
{
	struct vfio_container *container;
	container = container_of(kref, struct vfio_container, kref);

	kfree(container);
}

static void vfio_container_put(struct vfio_container *container)
{
	kref_put(&container->kref, vfio_container_release);
}

static void vfio_group_unlock_and_free(struct vfio_group *group)
{
	mutex_unlock(&vfio.group_lock);
	/*
	 * Unregister outside of lock.  A spurious callback is harmless now
	 * that the group is no longer in vfio.group_list.
	 */
	iommu_group_unregister_notifier(group->iommu_group, &group->nb);
	kfree(group);
}
/**
 * Group objects - create, release, get, put, search
 */
static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
{
	struct vfio_group *group, *tmp;
	struct device *dev;
	int ret, minor;

	group = kzalloc(sizeof(*group), GFP_KERNEL);
	if (!group)
		return ERR_PTR(-ENOMEM);

	kref_init(&group->kref);
	INIT_LIST_HEAD(&group->device_list);
	mutex_init(&group->device_lock);
	INIT_LIST_HEAD(&group->unbound_list);
	mutex_init(&group->unbound_lock);
	atomic_set(&group->container_users, 0);
	atomic_set(&group->opened, 0);
	group->iommu_group = iommu_group;
#ifdef CONFIG_VFIO_NOIOMMU
	group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
#endif
	BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);

	group->nb.notifier_call = vfio_iommu_group_notifier;

	/*
	 * blocking notifiers acquire a rwsem around registering and hold
	 * it around callback.  Therefore, need to register outside of
	 * vfio.group_lock to avoid A-B/B-A contention.  Our callback won't
	 * do anything unless it can find the group in vfio.group_list, so
	 * no harm in registering early.
	 */
	ret = iommu_group_register_notifier(iommu_group, &group->nb);
	if (ret) {
		kfree(group);
		return ERR_PTR(ret);
	}

	mutex_lock(&vfio.group_lock);

	/* Did we race creating this group? */
	list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
		if (tmp->iommu_group == iommu_group) {
			vfio_group_get(tmp);
			vfio_group_unlock_and_free(group);
			return tmp;
		}
	}

	minor = vfio_alloc_group_minor(group);
	if (minor < 0) {
		vfio_group_unlock_and_free(group);
		return ERR_PTR(minor);
	}

	dev = device_create(vfio.class, NULL,
			    MKDEV(MAJOR(vfio.group_devt), minor),
			    group, "%s%d", group->noiommu ? "noiommu-" : "",
			    iommu_group_id(iommu_group));
	if (IS_ERR(dev)) {
		vfio_free_group_minor(minor);
		vfio_group_unlock_and_free(group);
		return (struct vfio_group *)dev; /* ERR_PTR */
	}

	group->minor = minor;

	list_add(&group->vfio_next, &vfio.group_list);

	mutex_unlock(&vfio.group_lock);

	return group;
}
/* called with vfio.group_lock held */
static void vfio_group_release(struct kref *kref)
{
	struct vfio_group *group = container_of(kref, struct vfio_group, kref);
	struct vfio_unbound_dev *unbound, *tmp;
	struct iommu_group *iommu_group = group->iommu_group;

	WARN_ON(!list_empty(&group->device_list));
	WARN_ON(group->notifier.head);

	list_for_each_entry_safe(unbound, tmp,
				 &group->unbound_list, unbound_next) {
		list_del(&unbound->unbound_next);
		kfree(unbound);
	}

	device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
	list_del(&group->vfio_next);
	vfio_free_group_minor(group->minor);
	vfio_group_unlock_and_free(group);
	iommu_group_put(iommu_group);
}

static void vfio_group_put(struct vfio_group *group)
{
	kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
}

/* Assume group_lock or group reference is held */
static void vfio_group_get(struct vfio_group *group)
{
	kref_get(&group->kref);
}
/*
 * Not really a try as we will sleep for mutex, but we need to make
 * sure the group pointer is valid under lock and get a reference.
 */
static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
{
	struct vfio_group *target = group;

	mutex_lock(&vfio.group_lock);
	list_for_each_entry(group, &vfio.group_list, vfio_next) {
		if (group == target) {
			vfio_group_get(group);
			mutex_unlock(&vfio.group_lock);
			return group;
		}
	}
	mutex_unlock(&vfio.group_lock);

	return NULL;
}
struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
{
	struct vfio_group *group;

	mutex_lock(&vfio.group_lock);
	list_for_each_entry(group, &vfio.group_list, vfio_next) {
		if (group->iommu_group == iommu_group) {
			vfio_group_get(group);
			mutex_unlock(&vfio.group_lock);
			return group;
		}
	}
	mutex_unlock(&vfio.group_lock);

	return NULL;
}

static struct vfio_group *vfio_group_get_from_minor(int minor)
{
	struct vfio_group *group;

	mutex_lock(&vfio.group_lock);
	group = idr_find(&vfio.group_idr, minor);
	if (!group) {
		mutex_unlock(&vfio.group_lock);
		return NULL;
	}
	vfio_group_get(group);
	mutex_unlock(&vfio.group_lock);

	return group;
}
static struct vfio_group *vfio_group_get_from_dev(struct device *dev)
{
	struct iommu_group *iommu_group;
	struct vfio_group *group;

	iommu_group = iommu_group_get(dev);
	if (!iommu_group)
		return NULL;

	group = vfio_group_get_from_iommu(iommu_group);
	iommu_group_put(iommu_group);

	return group;
}
/**
 * Device objects - create, release, get, put, search
 */
static struct vfio_device *vfio_group_create_device(struct vfio_group *group,
						    struct device *dev,
						    const struct vfio_device_ops *ops,
						    void *device_data)
{
	struct vfio_device *device;

	device = kzalloc(sizeof(*device), GFP_KERNEL);
	if (!device)
		return ERR_PTR(-ENOMEM);

	kref_init(&device->kref);
	device->dev = dev;
	device->group = group;
	device->ops = ops;
	device->device_data = device_data;
	dev_set_drvdata(dev, device);

	/* No need to get group_lock, caller has group reference */
	vfio_group_get(group);

	mutex_lock(&group->device_lock);
	list_add(&device->group_next, &group->device_list);
	mutex_unlock(&group->device_lock);

	return device;
}
static void vfio_device_release(struct kref *kref)
{
	struct vfio_device *device = container_of(kref,
						  struct vfio_device, kref);
	struct vfio_group *group = device->group;

	list_del(&device->group_next);
	mutex_unlock(&group->device_lock);

	dev_set_drvdata(device->dev, NULL);

	kfree(device);

	/* vfio_del_group_dev may be waiting for this device */
	wake_up(&vfio.release_q);
}

/* Device reference always implies a group reference */
void vfio_device_put(struct vfio_device *device)
{
	struct vfio_group *group = device->group;
	kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
	vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_device_put);

static void vfio_device_get(struct vfio_device *device)
{
	vfio_group_get(device->group);
	kref_get(&device->kref);
}
static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
						 struct device *dev)
{
	struct vfio_device *device;

	mutex_lock(&group->device_lock);
	list_for_each_entry(device, &group->device_list, group_next) {
		if (device->dev == dev) {
			vfio_device_get(device);
			mutex_unlock(&group->device_lock);
			return device;
		}
	}
	mutex_unlock(&group->device_lock);
	return NULL;
}
/*
 * Some drivers, like pci-stub, are only used to prevent other drivers from
 * claiming a device and are therefore perfectly legitimate for a user owned
 * group.  The pci-stub driver has no dependencies on DMA or the IOVA mapping
 * of the device, but it does prevent the user from having direct access to
 * the device, which is useful in some circumstances.
 *
 * We also assume that we can include PCI interconnect devices, ie. bridges.
 * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
 * then all of the downstream devices will be part of the same IOMMU group as
 * the bridge.  Thus, if placing the bridge into the user owned IOVA space
 * breaks anything, it only does so for user owned devices downstream.  Note
 * that error notification via MSI can be affected for platforms that handle
 * MSI within the same IOVA space as DMA.
 */
static const char * const vfio_driver_whitelist[] = { "pci-stub" };

static bool vfio_dev_whitelisted(struct device *dev, struct device_driver *drv)
{
	int i;

	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
			return true;
	}

	for (i = 0; i < ARRAY_SIZE(vfio_driver_whitelist); i++) {
		if (!strcmp(drv->name, vfio_driver_whitelist[i]))
			return true;
	}

	return false;
}
/*
 * A vfio group is viable for use by userspace if all devices are in
 * one of the following states:
 *  - driver-less
 *  - bound to a vfio driver
 *  - bound to a whitelisted driver
 *  - a PCI interconnect device
 *
 * We use two methods to determine whether a device is bound to a vfio
 * driver.  The first is to test whether the device exists in the vfio
 * group.  The second is to test if the device exists on the group
 * unbound_list, indicating it's in the middle of transitioning from
 * a vfio driver to driver-less.
 */
static int vfio_dev_viable(struct device *dev, void *data)
{
	struct vfio_group *group = data;
	struct vfio_device *device;
	struct device_driver *drv = ACCESS_ONCE(dev->driver);
	struct vfio_unbound_dev *unbound;
	int ret = -EINVAL;

	mutex_lock(&group->unbound_lock);
	list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
		if (dev == unbound->dev) {
			ret = 0;
			break;
		}
	}
	mutex_unlock(&group->unbound_lock);

	if (!ret || !drv || vfio_dev_whitelisted(dev, drv))
		return 0;

	device = vfio_group_get_device(group, dev);
	if (device) {
		vfio_device_put(device);
		return 0;
	}

	return ret;
}
/**
 * Async device support
 */
static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
{
	struct vfio_device *device;

	/* Do we already know about it?  We shouldn't */
	device = vfio_group_get_device(group, dev);
	if (WARN_ON_ONCE(device)) {
		vfio_device_put(device);
		return 0;
	}

	/* Nothing to do for idle groups */
	if (!atomic_read(&group->container_users))
		return 0;

	/* TODO Prevent device auto probing */
	WARN(1, "Device %s added to live group %d!\n", dev_name(dev),
	     iommu_group_id(group->iommu_group));

	return 0;
}

static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
{
	/* We don't care what happens when the group isn't in use */
	if (!atomic_read(&group->container_users))
		return 0;

	return vfio_dev_viable(dev, group);
}
static int vfio_iommu_group_notifier(struct notifier_block *nb,
				     unsigned long action, void *data)
{
	struct vfio_group *group = container_of(nb, struct vfio_group, nb);
	struct device *dev = data;
	struct vfio_unbound_dev *unbound;

	/*
	 * Need to go through a group_lock lookup to get a reference or we
	 * risk racing a group being removed.  Ignore spurious notifies.
	 */
	group = vfio_group_try_get(group);
	if (!group)
		return NOTIFY_OK;

	switch (action) {
	case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
		vfio_group_nb_add_dev(group, dev);
		break;
	case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
		/*
		 * Nothing to do here.  If the device is in use, then the
		 * vfio sub-driver should block the remove callback until
		 * it is unused.  If the device is unused or attached to a
		 * stub driver, then it should be released and we don't
		 * care that it will be going away.
		 */
		break;
	case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
		pr_debug("%s: Device %s, group %d binding to driver\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group));
		break;
	case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
		pr_debug("%s: Device %s, group %d bound to driver %s\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group), dev->driver->name);
		BUG_ON(vfio_group_nb_verify(group, dev));
		break;
	case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
		pr_debug("%s: Device %s, group %d unbinding from driver %s\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group), dev->driver->name);
		break;
	case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
		pr_debug("%s: Device %s, group %d unbound from driver\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group));
		/*
		 * XXX An unbound device in a live group is ok, but we'd
		 * really like to avoid the above BUG_ON by preventing other
		 * drivers from binding to it.  Once that occurs, we have to
		 * stop the system to maintain isolation.  At a minimum, we'd
		 * want a toggle to disable driver auto probe for this device.
		 */

		mutex_lock(&group->unbound_lock);
		list_for_each_entry(unbound,
				    &group->unbound_list, unbound_next) {
			if (dev == unbound->dev) {
				list_del(&unbound->unbound_next);
				kfree(unbound);
				break;
			}
		}
		mutex_unlock(&group->unbound_lock);
		break;
	}

	vfio_group_put(group);
	return NOTIFY_OK;
}
int vfio_add_group_dev(struct device *dev,
		       const struct vfio_device_ops *ops, void *device_data)
{
	struct iommu_group *iommu_group;
	struct vfio_group *group;
	struct vfio_device *device;

	iommu_group = iommu_group_get(dev);
	if (!iommu_group)
		return -EINVAL;

	group = vfio_group_get_from_iommu(iommu_group);
	if (!group) {
		group = vfio_create_group(iommu_group);
		if (IS_ERR(group)) {
			iommu_group_put(iommu_group);
			return PTR_ERR(group);
		}
	} else {
		/*
		 * A found vfio_group already holds a reference to the
		 * iommu_group.  A created vfio_group keeps the reference.
		 */
		iommu_group_put(iommu_group);
	}

	device = vfio_group_get_device(group, dev);
	if (device) {
		WARN(1, "Device %s already exists on group %d\n",
		     dev_name(dev), iommu_group_id(iommu_group));
		vfio_device_put(device);
		vfio_group_put(group);
		return -EBUSY;
	}

	device = vfio_group_create_device(group, dev, ops, device_data);
	if (IS_ERR(device)) {
		vfio_group_put(group);
		return PTR_ERR(device);
	}

	/*
	 * Drop all but the vfio_device reference.  The vfio_device holds
	 * a reference to the vfio_group, which holds a reference to the
	 * iommu_group.
	 */
	vfio_group_put(group);

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_add_group_dev);
/**
 * Get a reference to the vfio_device for a device.  Even if the
 * caller thinks they own the device, they could be racing with a
 * release call path, so we can't trust drvdata for the shortcut.
 * Go the long way around, from the iommu_group to the vfio_group
 * to the vfio_device.
 */
struct vfio_device *vfio_device_get_from_dev(struct device *dev)
{
	struct vfio_group *group;
	struct vfio_device *device;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return NULL;

	device = vfio_group_get_device(group, dev);
	vfio_group_put(group);

	return device;
}
EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
						     char *buf)
{
	struct vfio_device *it, *device = NULL;

	mutex_lock(&group->device_lock);
	list_for_each_entry(it, &group->device_list, group_next) {
		if (!strcmp(dev_name(it->dev), buf)) {
			device = it;
			vfio_device_get(device);
			break;
		}
	}
	mutex_unlock(&group->device_lock);

	return device;
}

/*
 * Caller must hold a reference to the vfio_device
 */
void *vfio_device_data(struct vfio_device *device)
{
	return device->device_data;
}
EXPORT_SYMBOL_GPL(vfio_device_data);
/* Given a referenced group, check if it contains the device */
static bool vfio_dev_present(struct vfio_group *group, struct device *dev)
{
	struct vfio_device *device;

	device = vfio_group_get_device(group, dev);
	if (!device)
		return false;

	vfio_device_put(device);
	return true;
}
/*
 * Decrement the device reference count and wait for the device to be
 * removed.  Open file descriptors for the device... */
void *vfio_del_group_dev(struct device *dev)
{
	struct vfio_device *device = dev_get_drvdata(dev);
	struct vfio_group *group = device->group;
	void *device_data = device->device_data;
	struct vfio_unbound_dev *unbound;
	unsigned int i = 0;
	long ret;
	bool interrupted = false;

	/*
	 * The group exists so long as we have a device reference.  Get
	 * a group reference and use it to scan for the device going away.
	 */
	vfio_group_get(group);

	/*
	 * When the device is removed from the group, the group suddenly
	 * becomes non-viable; the device has a driver (until the unbind
	 * completes), but it's not present in the group.  This is bad news
	 * for any external users that need to re-acquire a group reference
	 * in order to match and release their existing reference.  To
	 * solve this, we track such devices on the unbound_list to bridge
	 * the gap until they're fully unbound.
	 */
	unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
	if (unbound) {
		unbound->dev = dev;
		mutex_lock(&group->unbound_lock);
		list_add(&unbound->unbound_next, &group->unbound_list);
		mutex_unlock(&group->unbound_lock);
	}
	WARN_ON(!unbound);

	vfio_device_put(device);

	/*
	 * If the device is still present in the group after the above
	 * 'put', then it is in use and we need to request it from the
	 * bus driver.  The driver may in turn need to request the
	 * device from the user.  We send the request on an arbitrary
	 * interval with counter to allow the driver to take escalating
	 * measures to release the device if it has the ability to do so.
	 */
	do {
		device = vfio_group_get_device(group, dev);
		if (!device)
			break;

		if (device->ops->request)
			device->ops->request(device_data, i++);

		vfio_device_put(device);

		if (interrupted) {
			ret = wait_event_timeout(vfio.release_q,
					!vfio_dev_present(group, dev), HZ * 10);
		} else {
			ret = wait_event_interruptible_timeout(vfio.release_q,
					!vfio_dev_present(group, dev), HZ * 10);
			if (ret == -ERESTARTSYS) {
				interrupted = true;
				dev_warn(dev,
					 "Device is currently in use, task"
					 " \"%s\" (%d) "
					 "blocked until device is released",
					 current->comm, task_pid_nr(current));
			}
		}
	} while (ret <= 0);

	vfio_group_put(group);

	return device_data;
}
EXPORT_SYMBOL_GPL(vfio_del_group_dev);
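
/*
 * Illustrative sketch (not part of this driver): how a VFIO bus driver is
 * expected to pair vfio_iommu_group_get()/vfio_add_group_dev() in its probe
 * path with vfio_del_group_dev()/vfio_iommu_group_put() in remove.  The
 * names my_probe, my_remove, my_vfio_ops and my_priv are hypothetical and
 * error handling is trimmed.
 *
 *	static int my_probe(struct device *dev)
 *	{
 *		struct iommu_group *group = vfio_iommu_group_get(dev);
 *		int ret;
 *
 *		if (!group)
 *			return -EINVAL;
 *
 *		ret = vfio_add_group_dev(dev, &my_vfio_ops, my_priv);
 *		if (ret)
 *			vfio_iommu_group_put(group, dev);
 *		return ret;
 *	}
 *
 *	static void my_remove(struct device *dev)
 *	{
 *		void *priv = vfio_del_group_dev(dev);
 *
 *		vfio_iommu_group_put(dev->iommu_group, dev);
 *		kfree(priv);
 *	}
 */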
/**
 * VFIO base fd, /dev/vfio/vfio
 */
static long vfio_ioctl_check_extension(struct vfio_container *container,
				       unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = 0;

	down_read(&container->group_lock);

	driver = container->iommu_driver;

	switch (arg) {
		/* No base extensions yet */
	default:
		/*
		 * If no driver is set, poll all registered drivers for
		 * extensions and return the first positive result.  If
		 * a driver is already set, further queries will be passed
		 * only to that driver.
		 */
		if (!driver) {
			mutex_lock(&vfio.iommu_drivers_lock);
			list_for_each_entry(driver, &vfio.iommu_drivers_list,
					    vfio_next) {

#ifdef CONFIG_VFIO_NOIOMMU
				if (!list_empty(&container->group_list) &&
				    (container->noiommu !=
				     (driver->ops == &vfio_noiommu_ops)))
					continue;
#endif

				if (!try_module_get(driver->ops->owner))
					continue;

				ret = driver->ops->ioctl(NULL,
							 VFIO_CHECK_EXTENSION,
							 arg);
				module_put(driver->ops->owner);
				if (ret > 0)
					break;
			}
			mutex_unlock(&vfio.iommu_drivers_lock);
		} else
			ret = driver->ops->ioctl(container->iommu_data,
						 VFIO_CHECK_EXTENSION, arg);
	}

	up_read(&container->group_lock);

	return ret;
}
/* hold write lock on container->group_lock */
static int __vfio_container_attach_groups(struct vfio_container *container,
					  struct vfio_iommu_driver *driver,
					  void *data)
{
	struct vfio_group *group;
	int ret = -ENODEV;

	list_for_each_entry(group, &container->group_list, container_next) {
		ret = driver->ops->attach_group(data, group->iommu_group);
		if (ret)
			goto unwind;
	}

	return ret;

unwind:
	list_for_each_entry_continue_reverse(group, &container->group_list,
					     container_next) {
		driver->ops->detach_group(data, group->iommu_group);
	}

	return ret;
}
static long vfio_ioctl_set_iommu(struct vfio_container *container,
				 unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = -ENODEV;

	down_write(&container->group_lock);

	/*
	 * The container is designed to be an unprivileged interface while
	 * the group can be assigned to specific users.  Therefore, only by
	 * adding a group to a container does the user get the privilege of
	 * enabling the iommu, which may allocate finite resources.  There
	 * is no unset_iommu, but by removing all the groups from a container,
	 * the container is deprivileged and returns to an unset state.
	 */
	if (list_empty(&container->group_list) || container->iommu_driver) {
		up_write(&container->group_lock);
		return -EINVAL;
	}

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		void *data;

#ifdef CONFIG_VFIO_NOIOMMU
		/*
		 * Only noiommu containers can use vfio-noiommu and noiommu
		 * containers can only use vfio-noiommu.
		 */
		if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
			continue;
#endif

		if (!try_module_get(driver->ops->owner))
			continue;

		/*
		 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
		 * so test which iommu driver reported support for this
		 * extension and call open on them.  We also pass them the
		 * magic, allowing a single driver to support multiple
		 * interfaces if they'd like.
		 */
		if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
			module_put(driver->ops->owner);
			continue;
		}

		data = driver->ops->open(arg);
		if (IS_ERR(data)) {
			ret = PTR_ERR(data);
			module_put(driver->ops->owner);
			continue;
		}

		ret = __vfio_container_attach_groups(container, driver, data);
		if (ret) {
			driver->ops->release(data);
			module_put(driver->ops->owner);
			continue;
		}

		container->iommu_driver = driver;
		container->iommu_data = data;
		break;
	}

	mutex_unlock(&vfio.iommu_drivers_lock);
	up_write(&container->group_lock);

	return ret;
}
static long vfio_fops_unl_ioctl(struct file *filep,
				unsigned int cmd, unsigned long arg)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	void *data;
	long ret = -EINVAL;

	if (!container)
		return ret;

	switch (cmd) {
	case VFIO_GET_API_VERSION:
		ret = VFIO_API_VERSION;
		break;
	case VFIO_CHECK_EXTENSION:
		ret = vfio_ioctl_check_extension(container, arg);
		break;
	case VFIO_SET_IOMMU:
		ret = vfio_ioctl_set_iommu(container, arg);
		break;
	default:
		down_read(&container->group_lock);

		driver = container->iommu_driver;
		data = container->iommu_data;

		if (driver) /* passthrough all unrecognized ioctls */
			ret = driver->ops->ioctl(data, cmd, arg);

		up_read(&container->group_lock);
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static long vfio_fops_compat_ioctl(struct file *filep,
				   unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */
static int vfio_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_container *container;

	container = kzalloc(sizeof(*container), GFP_KERNEL);
	if (!container)
		return -ENOMEM;

	INIT_LIST_HEAD(&container->group_list);
	init_rwsem(&container->group_lock);
	kref_init(&container->kref);

	filep->private_data = container;

	return 0;
}

static int vfio_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_container *container = filep->private_data;

	filep->private_data = NULL;

	vfio_container_put(container);

	return 0;
}
/*
 * Once an iommu driver is set, we optionally pass read/write/mmap
 * on to the driver, allowing management interfaces beyond ioctl.
 */
static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
			      size_t count, loff_t *ppos)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	ssize_t ret = -EINVAL;

	down_read(&container->group_lock);

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->read))
		ret = driver->ops->read(container->iommu_data,
					buf, count, ppos);

	up_read(&container->group_lock);

	return ret;
}

static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
			       size_t count, loff_t *ppos)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	ssize_t ret = -EINVAL;

	down_read(&container->group_lock);

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->write))
		ret = driver->ops->write(container->iommu_data,
					 buf, count, ppos);

	up_read(&container->group_lock);

	return ret;
}

static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	int ret = -EINVAL;

	down_read(&container->group_lock);

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->mmap))
		ret = driver->ops->mmap(container->iommu_data, vma);

	up_read(&container->group_lock);

	return ret;
}
static const struct file_operations vfio_fops = {
	.owner		= THIS_MODULE,
	.open		= vfio_fops_open,
	.release	= vfio_fops_release,
	.read		= vfio_fops_read,
	.write		= vfio_fops_write,
	.unlocked_ioctl	= vfio_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vfio_fops_compat_ioctl,
#endif
	.mmap		= vfio_fops_mmap,
};
/**
 * VFIO Group fd, /dev/vfio/$GROUP
 */
static void __vfio_group_unset_container(struct vfio_group *group)
{
	struct vfio_container *container = group->container;
	struct vfio_iommu_driver *driver;

	down_write(&container->group_lock);

	driver = container->iommu_driver;
	if (driver)
		driver->ops->detach_group(container->iommu_data,
					  group->iommu_group);

	group->container = NULL;
	list_del(&group->container_next);

	/* Detaching the last group deprivileges a container, remove iommu */
	if (driver && list_empty(&container->group_list)) {
		driver->ops->release(container->iommu_data);
		module_put(driver->ops->owner);
		container->iommu_driver = NULL;
		container->iommu_data = NULL;
	}

	up_write(&container->group_lock);

	vfio_container_put(container);
}
/*
 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
 * if there was no container to unset.  Since the ioctl is called on
 * the group, we know that still exists, therefore the only valid
 * transition here is 1->0.
 */
static int vfio_group_unset_container(struct vfio_group *group)
{
	int users = atomic_cmpxchg(&group->container_users, 1, 0);

	if (!users)
		return -EINVAL;
	if (users != 1)
		return -EBUSY;

	__vfio_group_unset_container(group);

	return 0;
}

/*
 * When removing container users, anything that removes the last user
 * implicitly removes the group from the container.  That is, if the
 * group file descriptor is closed, as well as any device file descriptors,
 * the group is free.
 */
static void vfio_group_try_dissolve_container(struct vfio_group *group)
{
	if (0 == atomic_dec_if_positive(&group->container_users))
		__vfio_group_unset_container(group);
}
static int vfio_group_set_container(struct vfio_group *group, int container_fd)
{
	struct fd f;
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret = 0;

	if (atomic_read(&group->container_users))
		return -EINVAL;

	if (group->noiommu && !capable(CAP_SYS_RAWIO))
		return -EPERM;

	f = fdget(container_fd);
	if (!f.file)
		return -EBADF;

	/* Sanity check, is this really our fd? */
	if (f.file->f_op != &vfio_fops) {
		fdput(f);
		return -EINVAL;
	}

	container = f.file->private_data;
	WARN_ON(!container); /* fget ensures we don't race vfio_release */

	down_write(&container->group_lock);

	/* Real groups and fake groups cannot mix */
	if (!list_empty(&container->group_list) &&
	    container->noiommu != group->noiommu) {
		ret = -EPERM;
		goto unlock_out;
	}

	driver = container->iommu_driver;
	if (driver) {
		ret = driver->ops->attach_group(container->iommu_data,
						group->iommu_group);
		if (ret)
			goto unlock_out;
	}

	group->container = container;
	container->noiommu = group->noiommu;
	list_add(&group->container_next, &container->group_list);

	/* Get a reference on the container and mark a user within the group */
	vfio_container_get(container);
	atomic_inc(&group->container_users);

unlock_out:
	up_write(&container->group_lock);
	fdput(f);
	return ret;
}

static bool vfio_group_viable(struct vfio_group *group)
{
	return (iommu_group_for_each_dev(group->iommu_group,
					 group, vfio_dev_viable) == 0);
}
static int vfio_group_add_container_user(struct vfio_group *group)
{
	if (!atomic_inc_not_zero(&group->container_users))
		return -EINVAL;

	if (group->noiommu) {
		atomic_dec(&group->container_users);
		return -EPERM;
	}
	if (!group->container->iommu_driver || !vfio_group_viable(group)) {
		atomic_dec(&group->container_users);
		return -EINVAL;
	}

	return 0;
}
static const struct file_operations vfio_device_fops;

static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
{
	struct vfio_device *device;
	struct file *filep;
	int ret;

	if (0 == atomic_read(&group->container_users) ||
	    !group->container->iommu_driver || !vfio_group_viable(group))
		return -EINVAL;

	if (group->noiommu && !capable(CAP_SYS_RAWIO))
		return -EPERM;

	device = vfio_device_get_from_name(group, buf);
	if (!device)
		return -ENODEV;

	ret = device->ops->open(device->device_data);
	if (ret) {
		vfio_device_put(device);
		return ret;
	}

	/*
	 * We can't use anon_inode_getfd() because we need to modify
	 * the f_mode flags directly to allow more than just ioctls
	 */
	ret = get_unused_fd_flags(O_CLOEXEC);
	if (ret < 0) {
		device->ops->release(device->device_data);
		vfio_device_put(device);
		return ret;
	}

	filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
				   device, O_RDWR);
	if (IS_ERR(filep)) {
		put_unused_fd(ret);
		ret = PTR_ERR(filep);
		device->ops->release(device->device_data);
		vfio_device_put(device);
		return ret;
	}

	/*
	 * TODO: add an anon_inode interface to do this.
	 * Appears to be missing by lack of need rather than
	 * explicitly prevented.  Now there's need.
	 */
	filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);

	atomic_inc(&group->container_users);

	fd_install(ret, filep);

	if (group->noiommu)
		dev_warn(device->dev, "vfio-noiommu device opened by user "
			 "(%s:%d)\n", current->comm, task_pid_nr(current));

	return ret;
}
static long vfio_group_fops_unl_ioctl(struct file *filep,
				      unsigned int cmd, unsigned long arg)
{
	struct vfio_group *group = filep->private_data;
	long ret = -ENOTTY;

	switch (cmd) {
	case VFIO_GROUP_GET_STATUS:
	{
		struct vfio_group_status status;
		unsigned long minsz;

		minsz = offsetofend(struct vfio_group_status, flags);

		if (copy_from_user(&status, (void __user *)arg, minsz))
			return -EFAULT;

		if (status.argsz < minsz)
			return -EINVAL;

		status.flags = 0;

		if (vfio_group_viable(group))
			status.flags |= VFIO_GROUP_FLAGS_VIABLE;

		if (group->container)
			status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;

		if (copy_to_user((void __user *)arg, &status, minsz))
			return -EFAULT;

		ret = 0;
		break;
	}
	case VFIO_GROUP_SET_CONTAINER:
	{
		int fd;

		if (get_user(fd, (int __user *)arg))
			return -EFAULT;

		if (fd < 0)
			return -EINVAL;

		ret = vfio_group_set_container(group, fd);
		break;
	}
	case VFIO_GROUP_UNSET_CONTAINER:
		ret = vfio_group_unset_container(group);
		break;
	case VFIO_GROUP_GET_DEVICE_FD:
	{
		char *buf;

		buf = strndup_user((const char __user *)arg, PAGE_SIZE);
		if (IS_ERR(buf))
			return PTR_ERR(buf);

		ret = vfio_group_get_device_fd(group, buf);
		kfree(buf);
		break;
	}
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static long vfio_group_fops_compat_ioctl(struct file *filep,
					 unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_group_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */
static int vfio_group_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_group *group;
	int opened;

	group = vfio_group_get_from_minor(iminor(inode));
	if (!group)
		return -ENODEV;

	if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
		vfio_group_put(group);
		return -EPERM;
	}

	/* Do we need multiple instances of the group open?  Seems not. */
	opened = atomic_cmpxchg(&group->opened, 0, 1);
	if (opened) {
		vfio_group_put(group);
		return -EBUSY;
	}

	/* Is something still in use from a previous open? */
	if (group->container) {
		atomic_dec(&group->opened);
		vfio_group_put(group);
		return -EBUSY;
	}

	/* Warn if previous user didn't cleanup and re-init to drop them */
	if (WARN_ON(group->notifier.head))
		BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);

	filep->private_data = group;

	return 0;
}

static int vfio_group_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_group *group = filep->private_data;

	filep->private_data = NULL;

	vfio_group_try_dissolve_container(group);

	atomic_dec(&group->opened);

	vfio_group_put(group);

	return 0;
}
static const struct file_operations vfio_group_fops = {
	.owner		= THIS_MODULE,
	.unlocked_ioctl	= vfio_group_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vfio_group_fops_compat_ioctl,
#endif
	.open		= vfio_group_fops_open,
	.release	= vfio_group_fops_release,
};
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_device *device = filep->private_data;

	device->ops->release(device->device_data);

	vfio_group_try_dissolve_container(device->group);

	vfio_device_put(device);

	return 0;
}

static long vfio_device_fops_unl_ioctl(struct file *filep,
				       unsigned int cmd, unsigned long arg)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->ioctl))
		return -EINVAL;

	return device->ops->ioctl(device->device_data, cmd, arg);
}

static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
				     size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->read))
		return -EINVAL;

	return device->ops->read(device->device_data, buf, count, ppos);
}

static ssize_t vfio_device_fops_write(struct file *filep,
				      const char __user *buf,
				      size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->write))
		return -EINVAL;

	return device->ops->write(device->device_data, buf, count, ppos);
}

static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->mmap))
		return -EINVAL;

	return device->ops->mmap(device->device_data, vma);
}

#ifdef CONFIG_COMPAT
static long vfio_device_fops_compat_ioctl(struct file *filep,
					  unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_device_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */

static const struct file_operations vfio_device_fops = {
	.owner		= THIS_MODULE,
	.release	= vfio_device_fops_release,
	.read		= vfio_device_fops_read,
	.write		= vfio_device_fops_write,
	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vfio_device_fops_compat_ioctl,
#endif
	.mmap		= vfio_device_fops_mmap,
};
/**
 * External user API, exported by symbols to be linked dynamically.
 *
 * The protocol includes:
 *  1. do normal VFIO init operation:
 *	- opening a new container;
 *	- attaching group(s) to it;
 *	- setting an IOMMU driver for a container.
 * When IOMMU is set for a container, all groups in it are
 * considered ready to use by an external user.
 *
 * 2. User space passes a group fd to an external user.
 * The external user calls vfio_group_get_external_user()
 * on it:
 *	- the group is initialized;
 *	- IOMMU is set for it.
 * If both checks passed, vfio_group_get_external_user()
 * increments the container user counter to prevent
 * the VFIO group from disposal before KVM exits.
 *
 * 3. The external user calls vfio_external_user_iommu_id()
 * to know an IOMMU ID.
 *
 * 4. When the external KVM finishes, it calls
 * vfio_group_put_external_user() to release the VFIO group.
 * This call decrements the container user counter.
 */
struct vfio_group *vfio_group_get_external_user(struct file *filep)
{
	struct vfio_group *group = filep->private_data;
	int ret;

	if (filep->f_op != &vfio_group_fops)
		return ERR_PTR(-EINVAL);

	ret = vfio_group_add_container_user(group);
	if (ret)
		return ERR_PTR(ret);

	vfio_group_get(group);

	return group;
}
EXPORT_SYMBOL_GPL(vfio_group_get_external_user);

void vfio_group_put_external_user(struct vfio_group *group)
{
	vfio_group_try_dissolve_container(group);
	vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_group_put_external_user);

int vfio_external_user_iommu_id(struct vfio_group *group)
{
	return iommu_group_id(group->iommu_group);
}
EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);

long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
{
	return vfio_ioctl_check_extension(group->container, arg);
}
EXPORT_SYMBOL_GPL(vfio_external_check_extension);
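
/*
 * Illustrative sketch (not part of this driver): how an external user such
 * as KVM's VFIO pseudo-device might consume the external user API above.
 * The group fd is assumed to arrive from userspace; error handling is
 * trimmed.
 *
 *	struct fd f = fdget(group_fd);
 *	struct vfio_group *grp;
 *
 *	grp = vfio_group_get_external_user(f.file);
 *	if (!IS_ERR(grp)) {
 *		int iommu_id = vfio_external_user_iommu_id(grp);
 *
 *		... hold grp (and therefore a container user) while in use,
 *		    then release it: ...
 *		vfio_group_put_external_user(grp);
 *	}
 *	fdput(f);
 */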
/**
 * Sub-module support
 */
/*
 * Helper for managing a buffer of info chain capabilities, allocate or
 * reallocate a buffer with additional @size, filling in @id and @version
 * of the capability.  A pointer to the new capability is returned.
 *
 * NB. The chain is based at the head of the buffer, so new entries are
 * added to the tail, vfio_info_cap_shift() should be called to fixup the
 * next offsets prior to copying to the user buffer.
 */
struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
					       size_t size, u16 id, u16 version)
{
	void *buf;
	struct vfio_info_cap_header *header, *tmp;

	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
	if (!buf) {
		kfree(caps->buf);
		caps->size = 0;
		return ERR_PTR(-ENOMEM);
	}

	caps->buf = buf;
	header = buf + caps->size;

	/* Eventually copied to user buffer, zero */
	memset(header, 0, size);

	header->id = id;
	header->version = version;

	/* Add to the end of the capability chain */
	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
		; /* nothing */

	tmp->next = caps->size;
	caps->size += size;

	return header;
}
EXPORT_SYMBOL_GPL(vfio_info_cap_add);

void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
{
	struct vfio_info_cap_header *tmp;
	void *buf = (void *)caps->buf;

	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
		tmp->next += offset;
}
EXPORT_SYMBOL(vfio_info_cap_shift);
static int sparse_mmap_cap(struct vfio_info_cap *caps, void *cap_type)
{
	struct vfio_info_cap_header *header;
	struct vfio_region_info_cap_sparse_mmap *sparse_cap, *sparse = cap_type;
	size_t size;

	size = sizeof(*sparse) + sparse->nr_areas * sizeof(*sparse->areas);
	header = vfio_info_cap_add(caps, size,
				   VFIO_REGION_INFO_CAP_SPARSE_MMAP, 1);
	if (IS_ERR(header))
		return PTR_ERR(header);

	sparse_cap = container_of(header,
			struct vfio_region_info_cap_sparse_mmap, header);
	sparse_cap->nr_areas = sparse->nr_areas;
	memcpy(sparse_cap->areas, sparse->areas,
	       sparse->nr_areas * sizeof(*sparse->areas));
	return 0;
}

static int region_type_cap(struct vfio_info_cap *caps, void *cap_type)
{
	struct vfio_info_cap_header *header;
	struct vfio_region_info_cap_type *type_cap, *cap = cap_type;

	header = vfio_info_cap_add(caps, sizeof(*cap),
				   VFIO_REGION_INFO_CAP_TYPE, 1);
	if (IS_ERR(header))
		return PTR_ERR(header);

	type_cap = container_of(header, struct vfio_region_info_cap_type,
				header);
	type_cap->type = cap->type;
	type_cap->subtype = cap->subtype;
	return 0;
}

int vfio_info_add_capability(struct vfio_info_cap *caps, int cap_type_id,
			     void *cap_type)
{
	int ret = -EINVAL;

	if (!cap_type)
		return -EINVAL;

	switch (cap_type_id) {
	case VFIO_REGION_INFO_CAP_SPARSE_MMAP:
		ret = sparse_mmap_cap(caps, cap_type);
		break;

	case VFIO_REGION_INFO_CAP_TYPE:
		ret = region_type_cap(caps, cap_type);
		break;
	}

	return ret;
}
EXPORT_SYMBOL(vfio_info_add_capability);
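
/*
 * Illustrative sketch (not part of this driver): a vfio bus driver building
 * a capability chain while answering VFIO_DEVICE_GET_REGION_INFO.  Variable
 * names (caps, sparse, info, arg) are hypothetical and error handling is
 * trimmed.
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *
 *	ret = vfio_info_add_capability(&caps,
 *				       VFIO_REGION_INFO_CAP_SPARSE_MMAP,
 *				       sparse);
 *	if (caps.size) {
 *		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
 *		info.cap_offset = sizeof(info);
 *		vfio_info_cap_shift(&caps, sizeof(info));
 *		copy_to_user((void __user *)arg + sizeof(info),
 *			     caps.buf, caps.size);
 *		kfree(caps.buf);
 *	}
 */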
int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
				       int max_irq_type, size_t *data_size)
{
	unsigned long minsz;
	size_t size;

	minsz = offsetofend(struct vfio_irq_set, count);

	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
	    (hdr->count >= (U32_MAX - hdr->start)) ||
	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
			    VFIO_IRQ_SET_ACTION_TYPE_MASK)))
		return -EINVAL;

	if (data_size)
		*data_size = 0;

	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
		return -EINVAL;

	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
	case VFIO_IRQ_SET_DATA_NONE:
		size = 0;
		break;
	case VFIO_IRQ_SET_DATA_BOOL:
		size = sizeof(uint8_t);
		break;
	case VFIO_IRQ_SET_DATA_EVENTFD:
		size = sizeof(int32_t);
		break;
	default:
		return -EINVAL;
	}

	if (size) {
		if (hdr->argsz - minsz < hdr->count * size)
			return -EINVAL;

		if (!data_size)
			return -EINVAL;

		*data_size = hdr->count * size;
	}

	return 0;
}
EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
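
/*
 * Illustrative sketch (not part of this driver): a vendor driver validating
 * a VFIO_DEVICE_SET_IRQS request before copying the variable-length payload.
 * The names hdr, minsz, NUM_IRQS and MAX_IRQ_TYPE are placeholders for
 * whatever the caller actually supports; error handling is trimmed.
 *
 *	size_t data_size = 0;
 *	u8 *data = NULL;
 *
 *	ret = vfio_set_irqs_validate_and_prepare(&hdr, NUM_IRQS,
 *						 MAX_IRQ_TYPE, &data_size);
 *	if (!ret && data_size) {
 *		data = memdup_user((void __user *)(arg + minsz), data_size);
 *		if (IS_ERR(data))
 *			return PTR_ERR(data);
 *	}
 */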
/*
 * Pin a set of guest PFNs and return their associated host PFNs for local
 * domain only.
 * @dev [in]     : device
 * @user_pfn [in]: array of user/guest PFNs to be pinned.
 * @npage [in]   : count of elements in user_pfn array.  This count should not
 *		   be greater VFIO_PIN_PAGES_MAX_ENTRIES.
 * @prot [in]    : protection flags
 * @phys_pfn[out]: array of host PFNs
 * Return error or number of pages pinned.
 */
int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
		   int prot, unsigned long *phys_pfn)
{
	struct vfio_container *container;
	struct vfio_group *group;
	struct vfio_iommu_driver *driver;
	int ret;

	if (!dev || !user_pfn || !phys_pfn || !npage)
		return -EINVAL;

	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
		return -E2BIG;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	ret = vfio_group_add_container_user(group);
	if (ret)
		goto err_pin_pages;

	container = group->container;
	down_read(&container->group_lock);

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->pin_pages))
		ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
					     npage, prot, phys_pfn);
	else
		ret = -ENOTTY;

	up_read(&container->group_lock);
	vfio_group_try_dissolve_container(group);

err_pin_pages:
	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_pin_pages);
/*
 * Unpin set of host PFNs for local domain only.
 * @dev [in]     : device
 * @user_pfn [in]: array of user/guest PFNs to be unpinned.  Number of user/guest
 *		   PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * @npage [in]   : count of elements in user_pfn array.  This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * Return error or number of pages unpinned.
 */
int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
{
	struct vfio_container *container;
	struct vfio_group *group;
	struct vfio_iommu_driver *driver;
	int ret;

	if (!dev || !user_pfn || !npage)
		return -EINVAL;

	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
		return -E2BIG;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	ret = vfio_group_add_container_user(group);
	if (ret)
		goto err_unpin_pages;

	container = group->container;
	down_read(&container->group_lock);

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->unpin_pages))
		ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
					       npage);
	else
		ret = -ENOTTY;

	up_read(&container->group_lock);
	vfio_group_try_dissolve_container(group);

err_unpin_pages:
	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_unpin_pages);
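
/*
 * Illustrative sketch (not part of this driver): a mediated-device vendor
 * driver pinning guest pages before programming DMA and unpinning them once
 * the DMA is done.  The array size NR is hypothetical and error handling is
 * trimmed.
 *
 *	unsigned long user_pfn[NR], phys_pfn[NR];
 *
 *	ret = vfio_pin_pages(mdev_dev(mdev), user_pfn, NR,
 *			     IOMMU_READ | IOMMU_WRITE, phys_pfn);
 *	if (ret > 0) {
 *		... program DMA using phys_pfn[0 .. ret - 1] ...
 *		vfio_unpin_pages(mdev_dev(mdev), user_pfn, ret);
 *	}
 */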
static int vfio_register_iommu_notifier(struct vfio_group *group,
					unsigned long *events,
					struct notifier_block *nb)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	container = group->container;
	down_read(&container->group_lock);

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->register_notifier))
		ret = driver->ops->register_notifier(container->iommu_data,
						     events, nb);
	else
		ret = -ENOTTY;

	up_read(&container->group_lock);
	vfio_group_try_dissolve_container(group);

	return ret;
}

static int vfio_unregister_iommu_notifier(struct vfio_group *group,
					  struct notifier_block *nb)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	container = group->container;
	down_read(&container->group_lock);

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->unregister_notifier))
		ret = driver->ops->unregister_notifier(container->iommu_data,
						       nb);
	else
		ret = -ENOTTY;

	up_read(&container->group_lock);
	vfio_group_try_dissolve_container(group);

	return ret;
}
void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
{
	group->kvm = kvm;
	blocking_notifier_call_chain(&group->notifier,
				VFIO_GROUP_NOTIFY_SET_KVM, kvm);
}
EXPORT_SYMBOL_GPL(vfio_group_set_kvm);
static int vfio_register_group_notifier(struct vfio_group *group,
					unsigned long *events,
					struct notifier_block *nb)
{
	struct vfio_container *container;
	int ret;
	bool set_kvm = false;

	if (*events & VFIO_GROUP_NOTIFY_SET_KVM)
		set_kvm = true;

	/* clear known events */
	*events &= ~VFIO_GROUP_NOTIFY_SET_KVM;

	/* refuse to continue if still events remaining */
	if (*events)
		return -EINVAL;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	container = group->container;
	down_read(&container->group_lock);

	ret = blocking_notifier_chain_register(&group->notifier, nb);

	/*
	 * The attaching of kvm and vfio_group might already happen, so
	 * here we replay once upon registration.
	 */
	if (!ret && set_kvm && group->kvm)
		blocking_notifier_call_chain(&group->notifier,
					VFIO_GROUP_NOTIFY_SET_KVM, group->kvm);

	up_read(&container->group_lock);
	vfio_group_try_dissolve_container(group);

	return ret;
}

static int vfio_unregister_group_notifier(struct vfio_group *group,
					  struct notifier_block *nb)
{
	struct vfio_container *container;
	int ret;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	container = group->container;
	down_read(&container->group_lock);

	ret = blocking_notifier_chain_unregister(&group->notifier, nb);

	up_read(&container->group_lock);
	vfio_group_try_dissolve_container(group);

	return ret;
}
int vfio_register_notifier(struct device *dev, enum vfio_notify_type type,
			   unsigned long *events, struct notifier_block *nb)
{
	struct vfio_group *group;
	int ret;

	if (!dev || !nb || !events || (*events == 0))
		return -EINVAL;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	switch (type) {
	case VFIO_IOMMU_NOTIFY:
		ret = vfio_register_iommu_notifier(group, events, nb);
		break;
	case VFIO_GROUP_NOTIFY:
		ret = vfio_register_group_notifier(group, events, nb);
		break;
	default:
		ret = -EINVAL;
	}

	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_register_notifier);

int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type,
			     struct notifier_block *nb)
{
	struct vfio_group *group;
	int ret;

	if (!dev || !nb)
		return -EINVAL;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	switch (type) {
	case VFIO_IOMMU_NOTIFY:
		ret = vfio_unregister_iommu_notifier(group, nb);
		break;
	case VFIO_GROUP_NOTIFY:
		ret = vfio_unregister_group_notifier(group, nb);
		break;
	default:
		ret = -EINVAL;
	}

	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_unregister_notifier);
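
/*
 * Illustrative sketch (not part of this driver): a vendor driver watching
 * for DMA unmaps through the IOMMU notifier type.  The callback name and
 * notifier_block are hypothetical; error handling is trimmed.
 *
 *	static int my_iommu_notifier(struct notifier_block *nb,
 *				     unsigned long action, void *data)
 *	{
 *		if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) {
 *			struct vfio_iommu_type1_dma_unmap *unmap = data;
 *
 *			... invalidate pinnings covered by unmap->iova/size ...
 *		}
 *		return NOTIFY_OK;
 *	}
 *
 *	unsigned long events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
 *
 *	my_nb.notifier_call = my_iommu_notifier;
 *	ret = vfio_register_notifier(dev, VFIO_IOMMU_NOTIFY, &events, &my_nb);
 *	...
 *	vfio_unregister_notifier(dev, VFIO_IOMMU_NOTIFY, &my_nb);
 */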
/**
 * Module/class support
 */
static char *vfio_devnode(struct device *dev, umode_t *mode)
{
	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
}

static struct miscdevice vfio_dev = {
	.minor = VFIO_MINOR,
	.name = "vfio",
	.fops = &vfio_fops,
	.nodename = "vfio/vfio",
	.mode = S_IRUGO | S_IWUGO,
};
static int __init vfio_init(void)
{
	int ret;

	idr_init(&vfio.group_idr);
	mutex_init(&vfio.group_lock);
	mutex_init(&vfio.iommu_drivers_lock);
	INIT_LIST_HEAD(&vfio.group_list);
	INIT_LIST_HEAD(&vfio.iommu_drivers_list);
	init_waitqueue_head(&vfio.release_q);

	ret = misc_register(&vfio_dev);
	if (ret) {
		pr_err("vfio: misc device register failed\n");
		return ret;
	}

	/* /dev/vfio/$GROUP */
	vfio.class = class_create(THIS_MODULE, "vfio");
	if (IS_ERR(vfio.class)) {
		ret = PTR_ERR(vfio.class);
		goto err_class;
	}

	vfio.class->devnode = vfio_devnode;

	ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK, "vfio");
	if (ret)
		goto err_alloc_chrdev;

	cdev_init(&vfio.group_cdev, &vfio_group_fops);
	ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK);
	if (ret)
		goto err_cdev_add;

	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");

#ifdef CONFIG_VFIO_NOIOMMU
	vfio_register_iommu_driver(&vfio_noiommu_ops);
#endif
	return 0;

err_cdev_add:
	unregister_chrdev_region(vfio.group_devt, MINORMASK);
err_alloc_chrdev:
	class_destroy(vfio.class);
	vfio.class = NULL;
err_class:
	misc_deregister(&vfio_dev);
	return ret;
}
static void __exit vfio_cleanup(void)
{
	WARN_ON(!list_empty(&vfio.group_list));

#ifdef CONFIG_VFIO_NOIOMMU
	vfio_unregister_iommu_driver(&vfio_noiommu_ops);
#endif
	idr_destroy(&vfio.group_idr);
	cdev_del(&vfio.group_cdev);
	unregister_chrdev_region(vfio.group_devt, MINORMASK);
	class_destroy(vfio.class);
	vfio.class = NULL;
	misc_deregister(&vfio_dev);
}

module_init(vfio_init);
module_exit(vfio_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_ALIAS_MISCDEV(VFIO_MINOR);
MODULE_ALIAS("devname:vfio/vfio");
MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");