drivers/vfio/vfio.c — VFIO core (User Level meta-driver)
[linux/fpc-iii.git] / drivers / vfio / vfio.c
blob 881fc3a55edce847d9786c58ca74b98060c716d3
/*
 * VFIO core
 *
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */
16 #include <linux/cdev.h>
17 #include <linux/compat.h>
18 #include <linux/device.h>
19 #include <linux/file.h>
20 #include <linux/anon_inodes.h>
21 #include <linux/fs.h>
22 #include <linux/idr.h>
23 #include <linux/iommu.h>
24 #include <linux/list.h>
25 #include <linux/miscdevice.h>
26 #include <linux/module.h>
27 #include <linux/mutex.h>
28 #include <linux/pci.h>
29 #include <linux/rwsem.h>
30 #include <linux/sched.h>
31 #include <linux/slab.h>
32 #include <linux/stat.h>
33 #include <linux/string.h>
34 #include <linux/uaccess.h>
35 #include <linux/vfio.h>
36 #include <linux/wait.h>
38 #define DRIVER_VERSION "0.3"
39 #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
40 #define DRIVER_DESC "VFIO - User Level meta-driver"
42 static struct vfio {
43 struct class *class;
44 struct list_head iommu_drivers_list;
45 struct mutex iommu_drivers_lock;
46 struct list_head group_list;
47 struct idr group_idr;
48 struct mutex group_lock;
49 struct cdev group_cdev;
50 dev_t group_devt;
51 wait_queue_head_t release_q;
52 } vfio;
54 struct vfio_iommu_driver {
55 const struct vfio_iommu_driver_ops *ops;
56 struct list_head vfio_next;
59 struct vfio_container {
60 struct kref kref;
61 struct list_head group_list;
62 struct rw_semaphore group_lock;
63 struct vfio_iommu_driver *iommu_driver;
64 void *iommu_data;
65 bool noiommu;
68 struct vfio_unbound_dev {
69 struct device *dev;
70 struct list_head unbound_next;
73 struct vfio_group {
74 struct kref kref;
75 int minor;
76 atomic_t container_users;
77 struct iommu_group *iommu_group;
78 struct vfio_container *container;
79 struct list_head device_list;
80 struct mutex device_lock;
81 struct device *dev;
82 struct notifier_block nb;
83 struct list_head vfio_next;
84 struct list_head container_next;
85 struct list_head unbound_list;
86 struct mutex unbound_lock;
87 atomic_t opened;
88 bool noiommu;
91 struct vfio_device {
92 struct kref kref;
93 struct device *dev;
94 const struct vfio_device_ops *ops;
95 struct vfio_group *group;
96 struct list_head group_next;
97 void *device_data;
100 #ifdef CONFIG_VFIO_NOIOMMU
101 static bool noiommu __read_mostly;
102 module_param_named(enable_unsafe_noiommu_mode,
103 noiommu, bool, S_IRUGO | S_IWUSR);
104 MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)");
105 #endif
108 * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
109 * and remove functions, any use cases other than acquiring the first
110 * reference for the purpose of calling vfio_add_group_dev() or removing
111 * that symmetric reference after vfio_del_group_dev() should use the raw
112 * iommu_group_{get,put} functions. In particular, vfio_iommu_group_put()
113 * removes the device from the dummy group and cannot be nested.
115 struct iommu_group *vfio_iommu_group_get(struct device *dev)
117 struct iommu_group *group;
118 int __maybe_unused ret;
120 group = iommu_group_get(dev);
122 #ifdef CONFIG_VFIO_NOIOMMU
124 * With noiommu enabled, an IOMMU group will be created for a device
125 * that doesn't already have one and doesn't have an iommu_ops on their
126 * bus. We set iommudata simply to be able to identify these groups
127 * as special use and for reclamation later.
129 if (group || !noiommu || iommu_present(dev->bus))
130 return group;
132 group = iommu_group_alloc();
133 if (IS_ERR(group))
134 return NULL;
136 iommu_group_set_name(group, "vfio-noiommu");
137 iommu_group_set_iommudata(group, &noiommu, NULL);
138 ret = iommu_group_add_device(group, dev);
139 iommu_group_put(group);
140 if (ret)
141 return NULL;
144 * Where to taint? At this point we've added an IOMMU group for a
145 * device that is not backed by iommu_ops, therefore any iommu_
146 * callback using iommu_ops can legitimately Oops. So, while we may
147 * be about to give a DMA capable device to a user without IOMMU
148 * protection, which is clearly taint-worthy, let's go ahead and do
149 * it here.
151 add_taint(TAINT_USER, LOCKDEP_STILL_OK);
152 dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
153 #endif
155 return group;
157 EXPORT_SYMBOL_GPL(vfio_iommu_group_get);
/* Release a reference taken via vfio_iommu_group_get(); also tears the
 * device out of a fake noiommu group that function may have created. */
void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
{
#ifdef CONFIG_VFIO_NOIOMMU
	if (iommu_group_get_iommudata(group) == &noiommu)
		iommu_group_remove_device(dev);
#endif

	iommu_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_iommu_group_put);
#ifdef CONFIG_VFIO_NOIOMMU
/* Minimal "iommu" backend for noiommu groups: no translation, no state */
static void *vfio_noiommu_open(unsigned long arg)
{
	if (arg != VFIO_NOIOMMU_IOMMU)
		return ERR_PTR(-EINVAL);
	if (!capable(CAP_SYS_RAWIO))
		return ERR_PTR(-EPERM);

	return NULL;
}

static void vfio_noiommu_release(void *iommu_data)
{
}

static long vfio_noiommu_ioctl(void *iommu_data,
			       unsigned int cmd, unsigned long arg)
{
	if (cmd == VFIO_CHECK_EXTENSION)
		return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;

	return -ENOTTY;
}

static int vfio_noiommu_attach_group(void *iommu_data,
				     struct iommu_group *iommu_group)
{
	/* Only groups created by vfio_iommu_group_get() may attach */
	return iommu_group_get_iommudata(iommu_group) == &noiommu ? 0 : -EINVAL;
}

static void vfio_noiommu_detach_group(void *iommu_data,
				      struct iommu_group *iommu_group)
{
}

static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
	.name = "vfio-noiommu",
	.owner = THIS_MODULE,
	.open = vfio_noiommu_open,
	.release = vfio_noiommu_release,
	.ioctl = vfio_noiommu_ioctl,
	.attach_group = vfio_noiommu_attach_group,
	.detach_group = vfio_noiommu_detach_group,
};
#endif
218 * IOMMU driver registration
220 int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
222 struct vfio_iommu_driver *driver, *tmp;
224 driver = kzalloc(sizeof(*driver), GFP_KERNEL);
225 if (!driver)
226 return -ENOMEM;
228 driver->ops = ops;
230 mutex_lock(&vfio.iommu_drivers_lock);
232 /* Check for duplicates */
233 list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
234 if (tmp->ops == ops) {
235 mutex_unlock(&vfio.iommu_drivers_lock);
236 kfree(driver);
237 return -EINVAL;
241 list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
243 mutex_unlock(&vfio.iommu_drivers_lock);
245 return 0;
247 EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
249 void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
251 struct vfio_iommu_driver *driver;
253 mutex_lock(&vfio.iommu_drivers_lock);
254 list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
255 if (driver->ops == ops) {
256 list_del(&driver->vfio_next);
257 mutex_unlock(&vfio.iommu_drivers_lock);
258 kfree(driver);
259 return;
262 mutex_unlock(&vfio.iommu_drivers_lock);
264 EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
267 * Group minor allocation/free - both called with vfio.group_lock held
269 static int vfio_alloc_group_minor(struct vfio_group *group)
271 return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
274 static void vfio_free_group_minor(int minor)
276 idr_remove(&vfio.group_idr, minor);
279 static int vfio_iommu_group_notifier(struct notifier_block *nb,
280 unsigned long action, void *data);
281 static void vfio_group_get(struct vfio_group *group);
284 * Container objects - containers are created when /dev/vfio/vfio is
285 * opened, but their lifecycle extends until the last user is done, so
286 * it's freed via kref. Must support container/group/device being
287 * closed in any order.
289 static void vfio_container_get(struct vfio_container *container)
291 kref_get(&container->kref);
294 static void vfio_container_release(struct kref *kref)
296 struct vfio_container *container;
297 container = container_of(kref, struct vfio_container, kref);
299 kfree(container);
302 static void vfio_container_put(struct vfio_container *container)
304 kref_put(&container->kref, vfio_container_release);
307 static void vfio_group_unlock_and_free(struct vfio_group *group)
309 mutex_unlock(&vfio.group_lock);
311 * Unregister outside of lock. A spurious callback is harmless now
312 * that the group is no longer in vfio.group_list.
314 iommu_group_unregister_notifier(group->iommu_group, &group->nb);
315 kfree(group);
319 * Group objects - create, release, get, put, search
321 static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
323 struct vfio_group *group, *tmp;
324 struct device *dev;
325 int ret, minor;
327 group = kzalloc(sizeof(*group), GFP_KERNEL);
328 if (!group)
329 return ERR_PTR(-ENOMEM);
331 kref_init(&group->kref);
332 INIT_LIST_HEAD(&group->device_list);
333 mutex_init(&group->device_lock);
334 INIT_LIST_HEAD(&group->unbound_list);
335 mutex_init(&group->unbound_lock);
336 atomic_set(&group->container_users, 0);
337 atomic_set(&group->opened, 0);
338 group->iommu_group = iommu_group;
339 #ifdef CONFIG_VFIO_NOIOMMU
340 group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
341 #endif
343 group->nb.notifier_call = vfio_iommu_group_notifier;
346 * blocking notifiers acquire a rwsem around registering and hold
347 * it around callback. Therefore, need to register outside of
348 * vfio.group_lock to avoid A-B/B-A contention. Our callback won't
349 * do anything unless it can find the group in vfio.group_list, so
350 * no harm in registering early.
352 ret = iommu_group_register_notifier(iommu_group, &group->nb);
353 if (ret) {
354 kfree(group);
355 return ERR_PTR(ret);
358 mutex_lock(&vfio.group_lock);
360 /* Did we race creating this group? */
361 list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
362 if (tmp->iommu_group == iommu_group) {
363 vfio_group_get(tmp);
364 vfio_group_unlock_and_free(group);
365 return tmp;
369 minor = vfio_alloc_group_minor(group);
370 if (minor < 0) {
371 vfio_group_unlock_and_free(group);
372 return ERR_PTR(minor);
375 dev = device_create(vfio.class, NULL,
376 MKDEV(MAJOR(vfio.group_devt), minor),
377 group, "%s%d", group->noiommu ? "noiommu-" : "",
378 iommu_group_id(iommu_group));
379 if (IS_ERR(dev)) {
380 vfio_free_group_minor(minor);
381 vfio_group_unlock_and_free(group);
382 return (struct vfio_group *)dev; /* ERR_PTR */
385 group->minor = minor;
386 group->dev = dev;
388 list_add(&group->vfio_next, &vfio.group_list);
390 mutex_unlock(&vfio.group_lock);
392 return group;
395 /* called with vfio.group_lock held */
396 static void vfio_group_release(struct kref *kref)
398 struct vfio_group *group = container_of(kref, struct vfio_group, kref);
399 struct vfio_unbound_dev *unbound, *tmp;
400 struct iommu_group *iommu_group = group->iommu_group;
402 WARN_ON(!list_empty(&group->device_list));
404 list_for_each_entry_safe(unbound, tmp,
405 &group->unbound_list, unbound_next) {
406 list_del(&unbound->unbound_next);
407 kfree(unbound);
410 device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
411 list_del(&group->vfio_next);
412 vfio_free_group_minor(group->minor);
413 vfio_group_unlock_and_free(group);
414 iommu_group_put(iommu_group);
417 static void vfio_group_put(struct vfio_group *group)
419 kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
422 struct vfio_group_put_work {
423 struct work_struct work;
424 struct vfio_group *group;
427 static void vfio_group_put_bg(struct work_struct *work)
429 struct vfio_group_put_work *do_work;
431 do_work = container_of(work, struct vfio_group_put_work, work);
433 vfio_group_put(do_work->group);
434 kfree(do_work);
437 static void vfio_group_schedule_put(struct vfio_group *group)
439 struct vfio_group_put_work *do_work;
441 do_work = kmalloc(sizeof(*do_work), GFP_KERNEL);
442 if (WARN_ON(!do_work))
443 return;
445 INIT_WORK(&do_work->work, vfio_group_put_bg);
446 do_work->group = group;
447 schedule_work(&do_work->work);
450 /* Assume group_lock or group reference is held */
451 static void vfio_group_get(struct vfio_group *group)
453 kref_get(&group->kref);
457 * Not really a try as we will sleep for mutex, but we need to make
458 * sure the group pointer is valid under lock and get a reference.
460 static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
462 struct vfio_group *target = group;
464 mutex_lock(&vfio.group_lock);
465 list_for_each_entry(group, &vfio.group_list, vfio_next) {
466 if (group == target) {
467 vfio_group_get(group);
468 mutex_unlock(&vfio.group_lock);
469 return group;
472 mutex_unlock(&vfio.group_lock);
474 return NULL;
477 static
478 struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
480 struct vfio_group *group;
482 mutex_lock(&vfio.group_lock);
483 list_for_each_entry(group, &vfio.group_list, vfio_next) {
484 if (group->iommu_group == iommu_group) {
485 vfio_group_get(group);
486 mutex_unlock(&vfio.group_lock);
487 return group;
490 mutex_unlock(&vfio.group_lock);
492 return NULL;
495 static struct vfio_group *vfio_group_get_from_minor(int minor)
497 struct vfio_group *group;
499 mutex_lock(&vfio.group_lock);
500 group = idr_find(&vfio.group_idr, minor);
501 if (!group) {
502 mutex_unlock(&vfio.group_lock);
503 return NULL;
505 vfio_group_get(group);
506 mutex_unlock(&vfio.group_lock);
508 return group;
512 * Device objects - create, release, get, put, search
514 static
515 struct vfio_device *vfio_group_create_device(struct vfio_group *group,
516 struct device *dev,
517 const struct vfio_device_ops *ops,
518 void *device_data)
520 struct vfio_device *device;
522 device = kzalloc(sizeof(*device), GFP_KERNEL);
523 if (!device)
524 return ERR_PTR(-ENOMEM);
526 kref_init(&device->kref);
527 device->dev = dev;
528 device->group = group;
529 device->ops = ops;
530 device->device_data = device_data;
531 dev_set_drvdata(dev, device);
533 /* No need to get group_lock, caller has group reference */
534 vfio_group_get(group);
536 mutex_lock(&group->device_lock);
537 list_add(&device->group_next, &group->device_list);
538 mutex_unlock(&group->device_lock);
540 return device;
543 static void vfio_device_release(struct kref *kref)
545 struct vfio_device *device = container_of(kref,
546 struct vfio_device, kref);
547 struct vfio_group *group = device->group;
549 list_del(&device->group_next);
550 mutex_unlock(&group->device_lock);
552 dev_set_drvdata(device->dev, NULL);
554 kfree(device);
556 /* vfio_del_group_dev may be waiting for this device */
557 wake_up(&vfio.release_q);
560 /* Device reference always implies a group reference */
561 void vfio_device_put(struct vfio_device *device)
563 struct vfio_group *group = device->group;
564 kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
565 vfio_group_put(group);
567 EXPORT_SYMBOL_GPL(vfio_device_put);
569 static void vfio_device_get(struct vfio_device *device)
571 vfio_group_get(device->group);
572 kref_get(&device->kref);
575 static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
576 struct device *dev)
578 struct vfio_device *device;
580 mutex_lock(&group->device_lock);
581 list_for_each_entry(device, &group->device_list, group_next) {
582 if (device->dev == dev) {
583 vfio_device_get(device);
584 mutex_unlock(&group->device_lock);
585 return device;
588 mutex_unlock(&group->device_lock);
589 return NULL;
593 * Some drivers, like pci-stub, are only used to prevent other drivers from
594 * claiming a device and are therefore perfectly legitimate for a user owned
595 * group. The pci-stub driver has no dependencies on DMA or the IOVA mapping
596 * of the device, but it does prevent the user from having direct access to
597 * the device, which is useful in some circumstances.
599 * We also assume that we can include PCI interconnect devices, ie. bridges.
600 * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
601 * then all of the downstream devices will be part of the same IOMMU group as
602 * the bridge. Thus, if placing the bridge into the user owned IOVA space
603 * breaks anything, it only does so for user owned devices downstream. Note
604 * that error notification via MSI can be affected for platforms that handle
605 * MSI within the same IOVA space as DMA.
607 static const char * const vfio_driver_whitelist[] = { "pci-stub" };
609 static bool vfio_dev_whitelisted(struct device *dev, struct device_driver *drv)
611 int i;
613 if (dev_is_pci(dev)) {
614 struct pci_dev *pdev = to_pci_dev(dev);
616 if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
617 return true;
620 for (i = 0; i < ARRAY_SIZE(vfio_driver_whitelist); i++) {
621 if (!strcmp(drv->name, vfio_driver_whitelist[i]))
622 return true;
625 return false;
629 * A vfio group is viable for use by userspace if all devices are in
630 * one of the following states:
631 * - driver-less
632 * - bound to a vfio driver
633 * - bound to a whitelisted driver
634 * - a PCI interconnect device
636 * We use two methods to determine whether a device is bound to a vfio
637 * driver. The first is to test whether the device exists in the vfio
638 * group. The second is to test if the device exists on the group
639 * unbound_list, indicating it's in the middle of transitioning from
640 * a vfio driver to driver-less.
642 static int vfio_dev_viable(struct device *dev, void *data)
644 struct vfio_group *group = data;
645 struct vfio_device *device;
646 struct device_driver *drv = ACCESS_ONCE(dev->driver);
647 struct vfio_unbound_dev *unbound;
648 int ret = -EINVAL;
650 mutex_lock(&group->unbound_lock);
651 list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
652 if (dev == unbound->dev) {
653 ret = 0;
654 break;
657 mutex_unlock(&group->unbound_lock);
659 if (!ret || !drv || vfio_dev_whitelisted(dev, drv))
660 return 0;
662 device = vfio_group_get_device(group, dev);
663 if (device) {
664 vfio_device_put(device);
665 return 0;
668 return ret;
672 * Async device support
674 static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
676 struct vfio_device *device;
678 /* Do we already know about it? We shouldn't */
679 device = vfio_group_get_device(group, dev);
680 if (WARN_ON_ONCE(device)) {
681 vfio_device_put(device);
682 return 0;
685 /* Nothing to do for idle groups */
686 if (!atomic_read(&group->container_users))
687 return 0;
689 /* TODO Prevent device auto probing */
690 WARN(1, "Device %s added to live group %d!\n", dev_name(dev),
691 iommu_group_id(group->iommu_group));
693 return 0;
696 static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
698 /* We don't care what happens when the group isn't in use */
699 if (!atomic_read(&group->container_users))
700 return 0;
702 return vfio_dev_viable(dev, group);
705 static int vfio_iommu_group_notifier(struct notifier_block *nb,
706 unsigned long action, void *data)
708 struct vfio_group *group = container_of(nb, struct vfio_group, nb);
709 struct device *dev = data;
710 struct vfio_unbound_dev *unbound;
713 * Need to go through a group_lock lookup to get a reference or we
714 * risk racing a group being removed. Ignore spurious notifies.
716 group = vfio_group_try_get(group);
717 if (!group)
718 return NOTIFY_OK;
720 switch (action) {
721 case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
722 vfio_group_nb_add_dev(group, dev);
723 break;
724 case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
726 * Nothing to do here. If the device is in use, then the
727 * vfio sub-driver should block the remove callback until
728 * it is unused. If the device is unused or attached to a
729 * stub driver, then it should be released and we don't
730 * care that it will be going away.
732 break;
733 case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
734 pr_debug("%s: Device %s, group %d binding to driver\n",
735 __func__, dev_name(dev),
736 iommu_group_id(group->iommu_group));
737 break;
738 case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
739 pr_debug("%s: Device %s, group %d bound to driver %s\n",
740 __func__, dev_name(dev),
741 iommu_group_id(group->iommu_group), dev->driver->name);
742 BUG_ON(vfio_group_nb_verify(group, dev));
743 break;
744 case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
745 pr_debug("%s: Device %s, group %d unbinding from driver %s\n",
746 __func__, dev_name(dev),
747 iommu_group_id(group->iommu_group), dev->driver->name);
748 break;
749 case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
750 pr_debug("%s: Device %s, group %d unbound from driver\n",
751 __func__, dev_name(dev),
752 iommu_group_id(group->iommu_group));
754 * XXX An unbound device in a live group is ok, but we'd
755 * really like to avoid the above BUG_ON by preventing other
756 * drivers from binding to it. Once that occurs, we have to
757 * stop the system to maintain isolation. At a minimum, we'd
758 * want a toggle to disable driver auto probe for this device.
761 mutex_lock(&group->unbound_lock);
762 list_for_each_entry(unbound,
763 &group->unbound_list, unbound_next) {
764 if (dev == unbound->dev) {
765 list_del(&unbound->unbound_next);
766 kfree(unbound);
767 break;
770 mutex_unlock(&group->unbound_lock);
771 break;
775 * If we're the last reference to the group, the group will be
776 * released, which includes unregistering the iommu group notifier.
777 * We hold a read-lock on that notifier list, unregistering needs
778 * a write-lock... deadlock. Release our reference asynchronously
779 * to avoid that situation.
781 vfio_group_schedule_put(group);
782 return NOTIFY_OK;
786 * VFIO driver API
788 int vfio_add_group_dev(struct device *dev,
789 const struct vfio_device_ops *ops, void *device_data)
791 struct iommu_group *iommu_group;
792 struct vfio_group *group;
793 struct vfio_device *device;
795 iommu_group = iommu_group_get(dev);
796 if (!iommu_group)
797 return -EINVAL;
799 group = vfio_group_get_from_iommu(iommu_group);
800 if (!group) {
801 group = vfio_create_group(iommu_group);
802 if (IS_ERR(group)) {
803 iommu_group_put(iommu_group);
804 return PTR_ERR(group);
806 } else {
808 * A found vfio_group already holds a reference to the
809 * iommu_group. A created vfio_group keeps the reference.
811 iommu_group_put(iommu_group);
814 device = vfio_group_get_device(group, dev);
815 if (device) {
816 WARN(1, "Device %s already exists on group %d\n",
817 dev_name(dev), iommu_group_id(iommu_group));
818 vfio_device_put(device);
819 vfio_group_put(group);
820 return -EBUSY;
823 device = vfio_group_create_device(group, dev, ops, device_data);
824 if (IS_ERR(device)) {
825 vfio_group_put(group);
826 return PTR_ERR(device);
830 * Drop all but the vfio_device reference. The vfio_device holds
831 * a reference to the vfio_group, which holds a reference to the
832 * iommu_group.
834 vfio_group_put(group);
836 return 0;
838 EXPORT_SYMBOL_GPL(vfio_add_group_dev);
841 * Get a reference to the vfio_device for a device. Even if the
842 * caller thinks they own the device, they could be racing with a
843 * release call path, so we can't trust drvdata for the shortcut.
844 * Go the long way around, from the iommu_group to the vfio_group
845 * to the vfio_device.
847 struct vfio_device *vfio_device_get_from_dev(struct device *dev)
849 struct iommu_group *iommu_group;
850 struct vfio_group *group;
851 struct vfio_device *device;
853 iommu_group = iommu_group_get(dev);
854 if (!iommu_group)
855 return NULL;
857 group = vfio_group_get_from_iommu(iommu_group);
858 iommu_group_put(iommu_group);
859 if (!group)
860 return NULL;
862 device = vfio_group_get_device(group, dev);
863 vfio_group_put(group);
865 return device;
867 EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
869 static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
870 char *buf)
872 struct vfio_device *it, *device = NULL;
874 mutex_lock(&group->device_lock);
875 list_for_each_entry(it, &group->device_list, group_next) {
876 if (!strcmp(dev_name(it->dev), buf)) {
877 device = it;
878 vfio_device_get(device);
879 break;
882 mutex_unlock(&group->device_lock);
884 return device;
888 * Caller must hold a reference to the vfio_device
890 void *vfio_device_data(struct vfio_device *device)
892 return device->device_data;
894 EXPORT_SYMBOL_GPL(vfio_device_data);
896 /* Given a referenced group, check if it contains the device */
897 static bool vfio_dev_present(struct vfio_group *group, struct device *dev)
899 struct vfio_device *device;
901 device = vfio_group_get_device(group, dev);
902 if (!device)
903 return false;
905 vfio_device_put(device);
906 return true;
910 * Decrement the device reference count and wait for the device to be
911 * removed. Open file descriptors for the device... */
912 void *vfio_del_group_dev(struct device *dev)
914 struct vfio_device *device = dev_get_drvdata(dev);
915 struct vfio_group *group = device->group;
916 void *device_data = device->device_data;
917 struct vfio_unbound_dev *unbound;
918 unsigned int i = 0;
919 long ret;
920 bool interrupted = false;
923 * The group exists so long as we have a device reference. Get
924 * a group reference and use it to scan for the device going away.
926 vfio_group_get(group);
929 * When the device is removed from the group, the group suddenly
930 * becomes non-viable; the device has a driver (until the unbind
931 * completes), but it's not present in the group. This is bad news
932 * for any external users that need to re-acquire a group reference
933 * in order to match and release their existing reference. To
934 * solve this, we track such devices on the unbound_list to bridge
935 * the gap until they're fully unbound.
937 unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
938 if (unbound) {
939 unbound->dev = dev;
940 mutex_lock(&group->unbound_lock);
941 list_add(&unbound->unbound_next, &group->unbound_list);
942 mutex_unlock(&group->unbound_lock);
944 WARN_ON(!unbound);
946 vfio_device_put(device);
949 * If the device is still present in the group after the above
950 * 'put', then it is in use and we need to request it from the
951 * bus driver. The driver may in turn need to request the
952 * device from the user. We send the request on an arbitrary
953 * interval with counter to allow the driver to take escalating
954 * measures to release the device if it has the ability to do so.
956 do {
957 device = vfio_group_get_device(group, dev);
958 if (!device)
959 break;
961 if (device->ops->request)
962 device->ops->request(device_data, i++);
964 vfio_device_put(device);
966 if (interrupted) {
967 ret = wait_event_timeout(vfio.release_q,
968 !vfio_dev_present(group, dev), HZ * 10);
969 } else {
970 ret = wait_event_interruptible_timeout(vfio.release_q,
971 !vfio_dev_present(group, dev), HZ * 10);
972 if (ret == -ERESTARTSYS) {
973 interrupted = true;
974 dev_warn(dev,
975 "Device is currently in use, task"
976 " \"%s\" (%d) "
977 "blocked until device is released",
978 current->comm, task_pid_nr(current));
981 } while (ret <= 0);
983 vfio_group_put(group);
985 return device_data;
987 EXPORT_SYMBOL_GPL(vfio_del_group_dev);
990 * VFIO base fd, /dev/vfio/vfio
992 static long vfio_ioctl_check_extension(struct vfio_container *container,
993 unsigned long arg)
995 struct vfio_iommu_driver *driver;
996 long ret = 0;
998 down_read(&container->group_lock);
1000 driver = container->iommu_driver;
1002 switch (arg) {
1003 /* No base extensions yet */
1004 default:
1006 * If no driver is set, poll all registered drivers for
1007 * extensions and return the first positive result. If
1008 * a driver is already set, further queries will be passed
1009 * only to that driver.
1011 if (!driver) {
1012 mutex_lock(&vfio.iommu_drivers_lock);
1013 list_for_each_entry(driver, &vfio.iommu_drivers_list,
1014 vfio_next) {
1016 #ifdef CONFIG_VFIO_NOIOMMU
1017 if (!list_empty(&container->group_list) &&
1018 (container->noiommu !=
1019 (driver->ops == &vfio_noiommu_ops)))
1020 continue;
1021 #endif
1023 if (!try_module_get(driver->ops->owner))
1024 continue;
1026 ret = driver->ops->ioctl(NULL,
1027 VFIO_CHECK_EXTENSION,
1028 arg);
1029 module_put(driver->ops->owner);
1030 if (ret > 0)
1031 break;
1033 mutex_unlock(&vfio.iommu_drivers_lock);
1034 } else
1035 ret = driver->ops->ioctl(container->iommu_data,
1036 VFIO_CHECK_EXTENSION, arg);
1039 up_read(&container->group_lock);
1041 return ret;
1044 /* hold write lock on container->group_lock */
1045 static int __vfio_container_attach_groups(struct vfio_container *container,
1046 struct vfio_iommu_driver *driver,
1047 void *data)
1049 struct vfio_group *group;
1050 int ret = -ENODEV;
1052 list_for_each_entry(group, &container->group_list, container_next) {
1053 ret = driver->ops->attach_group(data, group->iommu_group);
1054 if (ret)
1055 goto unwind;
1058 return ret;
1060 unwind:
1061 list_for_each_entry_continue_reverse(group, &container->group_list,
1062 container_next) {
1063 driver->ops->detach_group(data, group->iommu_group);
1066 return ret;
1069 static long vfio_ioctl_set_iommu(struct vfio_container *container,
1070 unsigned long arg)
1072 struct vfio_iommu_driver *driver;
1073 long ret = -ENODEV;
1075 down_write(&container->group_lock);
1078 * The container is designed to be an unprivileged interface while
1079 * the group can be assigned to specific users. Therefore, only by
1080 * adding a group to a container does the user get the privilege of
1081 * enabling the iommu, which may allocate finite resources. There
1082 * is no unset_iommu, but by removing all the groups from a container,
1083 * the container is deprivileged and returns to an unset state.
1085 if (list_empty(&container->group_list) || container->iommu_driver) {
1086 up_write(&container->group_lock);
1087 return -EINVAL;
1090 mutex_lock(&vfio.iommu_drivers_lock);
1091 list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
1092 void *data;
1094 #ifdef CONFIG_VFIO_NOIOMMU
1096 * Only noiommu containers can use vfio-noiommu and noiommu
1097 * containers can only use vfio-noiommu.
1099 if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
1100 continue;
1101 #endif
1103 if (!try_module_get(driver->ops->owner))
1104 continue;
1107 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
1108 * so test which iommu driver reported support for this
1109 * extension and call open on them. We also pass them the
1110 * magic, allowing a single driver to support multiple
1111 * interfaces if they'd like.
1113 if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
1114 module_put(driver->ops->owner);
1115 continue;
1118 data = driver->ops->open(arg);
1119 if (IS_ERR(data)) {
1120 ret = PTR_ERR(data);
1121 module_put(driver->ops->owner);
1122 continue;
1125 ret = __vfio_container_attach_groups(container, driver, data);
1126 if (ret) {
1127 driver->ops->release(data);
1128 module_put(driver->ops->owner);
1129 continue;
1132 container->iommu_driver = driver;
1133 container->iommu_data = data;
1134 break;
1137 mutex_unlock(&vfio.iommu_drivers_lock);
1138 up_write(&container->group_lock);
1140 return ret;
1143 static long vfio_fops_unl_ioctl(struct file *filep,
1144 unsigned int cmd, unsigned long arg)
1146 struct vfio_container *container = filep->private_data;
1147 struct vfio_iommu_driver *driver;
1148 void *data;
1149 long ret = -EINVAL;
1151 if (!container)
1152 return ret;
1154 switch (cmd) {
1155 case VFIO_GET_API_VERSION:
1156 ret = VFIO_API_VERSION;
1157 break;
1158 case VFIO_CHECK_EXTENSION:
1159 ret = vfio_ioctl_check_extension(container, arg);
1160 break;
1161 case VFIO_SET_IOMMU:
1162 ret = vfio_ioctl_set_iommu(container, arg);
1163 break;
1164 default:
1165 down_read(&container->group_lock);
1167 driver = container->iommu_driver;
1168 data = container->iommu_data;
1170 if (driver) /* passthrough all unrecognized ioctls */
1171 ret = driver->ops->ioctl(data, cmd, arg);
1173 up_read(&container->group_lock);
1176 return ret;
#ifdef CONFIG_COMPAT
/* 32-bit compat entry: convert the pointer argument, then reuse the
 * native ioctl handler. */
static long vfio_fops_compat_ioctl(struct file *filep,
				   unsigned int cmd, unsigned long arg)
{
	return vfio_fops_unl_ioctl(filep, cmd,
				   (unsigned long)compat_ptr(arg));
}
#endif	/* CONFIG_COMPAT */
1188 static int vfio_fops_open(struct inode *inode, struct file *filep)
1190 struct vfio_container *container;
1192 container = kzalloc(sizeof(*container), GFP_KERNEL);
1193 if (!container)
1194 return -ENOMEM;
1196 INIT_LIST_HEAD(&container->group_list);
1197 init_rwsem(&container->group_lock);
1198 kref_init(&container->kref);
1200 filep->private_data = container;
1202 return 0;
1205 static int vfio_fops_release(struct inode *inode, struct file *filep)
1207 struct vfio_container *container = filep->private_data;
1209 filep->private_data = NULL;
1211 vfio_container_put(container);
1213 return 0;
1217 * Once an iommu driver is set, we optionally pass read/write/mmap
1218 * on to the driver, allowing management interfaces beyond ioctl.
1220 static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
1221 size_t count, loff_t *ppos)
1223 struct vfio_container *container = filep->private_data;
1224 struct vfio_iommu_driver *driver;
1225 ssize_t ret = -EINVAL;
1227 down_read(&container->group_lock);
1229 driver = container->iommu_driver;
1230 if (likely(driver && driver->ops->read))
1231 ret = driver->ops->read(container->iommu_data,
1232 buf, count, ppos);
1234 up_read(&container->group_lock);
1236 return ret;
1239 static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
1240 size_t count, loff_t *ppos)
1242 struct vfio_container *container = filep->private_data;
1243 struct vfio_iommu_driver *driver;
1244 ssize_t ret = -EINVAL;
1246 down_read(&container->group_lock);
1248 driver = container->iommu_driver;
1249 if (likely(driver && driver->ops->write))
1250 ret = driver->ops->write(container->iommu_data,
1251 buf, count, ppos);
1253 up_read(&container->group_lock);
1255 return ret;
1258 static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1260 struct vfio_container *container = filep->private_data;
1261 struct vfio_iommu_driver *driver;
1262 int ret = -EINVAL;
1264 down_read(&container->group_lock);
1266 driver = container->iommu_driver;
1267 if (likely(driver && driver->ops->mmap))
1268 ret = driver->ops->mmap(container->iommu_data, vma);
1270 up_read(&container->group_lock);
1272 return ret;
1275 static const struct file_operations vfio_fops = {
1276 .owner = THIS_MODULE,
1277 .open = vfio_fops_open,
1278 .release = vfio_fops_release,
1279 .read = vfio_fops_read,
1280 .write = vfio_fops_write,
1281 .unlocked_ioctl = vfio_fops_unl_ioctl,
1282 #ifdef CONFIG_COMPAT
1283 .compat_ioctl = vfio_fops_compat_ioctl,
1284 #endif
1285 .mmap = vfio_fops_mmap,
1289 * VFIO Group fd, /dev/vfio/$GROUP
1291 static void __vfio_group_unset_container(struct vfio_group *group)
1293 struct vfio_container *container = group->container;
1294 struct vfio_iommu_driver *driver;
1296 down_write(&container->group_lock);
1298 driver = container->iommu_driver;
1299 if (driver)
1300 driver->ops->detach_group(container->iommu_data,
1301 group->iommu_group);
1303 group->container = NULL;
1304 list_del(&group->container_next);
1306 /* Detaching the last group deprivileges a container, remove iommu */
1307 if (driver && list_empty(&container->group_list)) {
1308 driver->ops->release(container->iommu_data);
1309 module_put(driver->ops->owner);
1310 container->iommu_driver = NULL;
1311 container->iommu_data = NULL;
1314 up_write(&container->group_lock);
1316 vfio_container_put(container);
1320 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
1321 * if there was no container to unset. Since the ioctl is called on
1322 * the group, we know that still exists, therefore the only valid
1323 * transition here is 1->0.
1325 static int vfio_group_unset_container(struct vfio_group *group)
1327 int users = atomic_cmpxchg(&group->container_users, 1, 0);
1329 if (!users)
1330 return -EINVAL;
1331 if (users != 1)
1332 return -EBUSY;
1334 __vfio_group_unset_container(group);
1336 return 0;
1340 * When removing container users, anything that removes the last user
1341 * implicitly removes the group from the container. That is, if the
1342 * group file descriptor is closed, as well as any device file descriptors,
1343 * the group is free.
1345 static void vfio_group_try_dissolve_container(struct vfio_group *group)
1347 if (0 == atomic_dec_if_positive(&group->container_users))
1348 __vfio_group_unset_container(group);
1351 static int vfio_group_set_container(struct vfio_group *group, int container_fd)
1353 struct fd f;
1354 struct vfio_container *container;
1355 struct vfio_iommu_driver *driver;
1356 int ret = 0;
1358 if (atomic_read(&group->container_users))
1359 return -EINVAL;
1361 if (group->noiommu && !capable(CAP_SYS_RAWIO))
1362 return -EPERM;
1364 f = fdget(container_fd);
1365 if (!f.file)
1366 return -EBADF;
1368 /* Sanity check, is this really our fd? */
1369 if (f.file->f_op != &vfio_fops) {
1370 fdput(f);
1371 return -EINVAL;
1374 container = f.file->private_data;
1375 WARN_ON(!container); /* fget ensures we don't race vfio_release */
1377 down_write(&container->group_lock);
1379 /* Real groups and fake groups cannot mix */
1380 if (!list_empty(&container->group_list) &&
1381 container->noiommu != group->noiommu) {
1382 ret = -EPERM;
1383 goto unlock_out;
1386 driver = container->iommu_driver;
1387 if (driver) {
1388 ret = driver->ops->attach_group(container->iommu_data,
1389 group->iommu_group);
1390 if (ret)
1391 goto unlock_out;
1394 group->container = container;
1395 container->noiommu = group->noiommu;
1396 list_add(&group->container_next, &container->group_list);
1398 /* Get a reference on the container and mark a user within the group */
1399 vfio_container_get(container);
1400 atomic_inc(&group->container_users);
1402 unlock_out:
1403 up_write(&container->group_lock);
1404 fdput(f);
1405 return ret;
1408 static bool vfio_group_viable(struct vfio_group *group)
1410 return (iommu_group_for_each_dev(group->iommu_group,
1411 group, vfio_dev_viable) == 0);
1414 static const struct file_operations vfio_device_fops;
1416 static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
1418 struct vfio_device *device;
1419 struct file *filep;
1420 int ret;
1422 if (0 == atomic_read(&group->container_users) ||
1423 !group->container->iommu_driver || !vfio_group_viable(group))
1424 return -EINVAL;
1426 if (group->noiommu && !capable(CAP_SYS_RAWIO))
1427 return -EPERM;
1429 device = vfio_device_get_from_name(group, buf);
1430 if (!device)
1431 return -ENODEV;
1433 ret = device->ops->open(device->device_data);
1434 if (ret) {
1435 vfio_device_put(device);
1436 return ret;
1440 * We can't use anon_inode_getfd() because we need to modify
1441 * the f_mode flags directly to allow more than just ioctls
1443 ret = get_unused_fd_flags(O_CLOEXEC);
1444 if (ret < 0) {
1445 device->ops->release(device->device_data);
1446 vfio_device_put(device);
1447 return ret;
1450 filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
1451 device, O_RDWR);
1452 if (IS_ERR(filep)) {
1453 put_unused_fd(ret);
1454 ret = PTR_ERR(filep);
1455 device->ops->release(device->device_data);
1456 vfio_device_put(device);
1457 return ret;
1461 * TODO: add an anon_inode interface to do this.
1462 * Appears to be missing by lack of need rather than
1463 * explicitly prevented. Now there's need.
1465 filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
1467 atomic_inc(&group->container_users);
1469 fd_install(ret, filep);
1471 if (group->noiommu)
1472 dev_warn(device->dev, "vfio-noiommu device opened by user "
1473 "(%s:%d)\n", current->comm, task_pid_nr(current));
1475 return ret;
1478 static long vfio_group_fops_unl_ioctl(struct file *filep,
1479 unsigned int cmd, unsigned long arg)
1481 struct vfio_group *group = filep->private_data;
1482 long ret = -ENOTTY;
1484 switch (cmd) {
1485 case VFIO_GROUP_GET_STATUS:
1487 struct vfio_group_status status;
1488 unsigned long minsz;
1490 minsz = offsetofend(struct vfio_group_status, flags);
1492 if (copy_from_user(&status, (void __user *)arg, minsz))
1493 return -EFAULT;
1495 if (status.argsz < minsz)
1496 return -EINVAL;
1498 status.flags = 0;
1500 if (vfio_group_viable(group))
1501 status.flags |= VFIO_GROUP_FLAGS_VIABLE;
1503 if (group->container)
1504 status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;
1506 if (copy_to_user((void __user *)arg, &status, minsz))
1507 return -EFAULT;
1509 ret = 0;
1510 break;
1512 case VFIO_GROUP_SET_CONTAINER:
1514 int fd;
1516 if (get_user(fd, (int __user *)arg))
1517 return -EFAULT;
1519 if (fd < 0)
1520 return -EINVAL;
1522 ret = vfio_group_set_container(group, fd);
1523 break;
1525 case VFIO_GROUP_UNSET_CONTAINER:
1526 ret = vfio_group_unset_container(group);
1527 break;
1528 case VFIO_GROUP_GET_DEVICE_FD:
1530 char *buf;
1532 buf = strndup_user((const char __user *)arg, PAGE_SIZE);
1533 if (IS_ERR(buf))
1534 return PTR_ERR(buf);
1536 ret = vfio_group_get_device_fd(group, buf);
1537 kfree(buf);
1538 break;
1542 return ret;
#ifdef CONFIG_COMPAT
/* 32-bit compat entry for the group fd: convert the pointer argument
 * and delegate to the native handler. */
static long vfio_group_fops_compat_ioctl(struct file *filep,
					 unsigned int cmd, unsigned long arg)
{
	return vfio_group_fops_unl_ioctl(filep, cmd,
					 (unsigned long)compat_ptr(arg));
}
#endif	/* CONFIG_COMPAT */
1554 static int vfio_group_fops_open(struct inode *inode, struct file *filep)
1556 struct vfio_group *group;
1557 int opened;
1559 group = vfio_group_get_from_minor(iminor(inode));
1560 if (!group)
1561 return -ENODEV;
1563 if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
1564 vfio_group_put(group);
1565 return -EPERM;
1568 /* Do we need multiple instances of the group open? Seems not. */
1569 opened = atomic_cmpxchg(&group->opened, 0, 1);
1570 if (opened) {
1571 vfio_group_put(group);
1572 return -EBUSY;
1575 /* Is something still in use from a previous open? */
1576 if (group->container) {
1577 atomic_dec(&group->opened);
1578 vfio_group_put(group);
1579 return -EBUSY;
1582 filep->private_data = group;
1584 return 0;
1587 static int vfio_group_fops_release(struct inode *inode, struct file *filep)
1589 struct vfio_group *group = filep->private_data;
1591 filep->private_data = NULL;
1593 vfio_group_try_dissolve_container(group);
1595 atomic_dec(&group->opened);
1597 vfio_group_put(group);
1599 return 0;
1602 static const struct file_operations vfio_group_fops = {
1603 .owner = THIS_MODULE,
1604 .unlocked_ioctl = vfio_group_fops_unl_ioctl,
1605 #ifdef CONFIG_COMPAT
1606 .compat_ioctl = vfio_group_fops_compat_ioctl,
1607 #endif
1608 .open = vfio_group_fops_open,
1609 .release = vfio_group_fops_release,
1613 * VFIO Device fd
1615 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
1617 struct vfio_device *device = filep->private_data;
1619 device->ops->release(device->device_data);
1621 vfio_group_try_dissolve_container(device->group);
1623 vfio_device_put(device);
1625 return 0;
1628 static long vfio_device_fops_unl_ioctl(struct file *filep,
1629 unsigned int cmd, unsigned long arg)
1631 struct vfio_device *device = filep->private_data;
1633 if (unlikely(!device->ops->ioctl))
1634 return -EINVAL;
1636 return device->ops->ioctl(device->device_data, cmd, arg);
1639 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1640 size_t count, loff_t *ppos)
1642 struct vfio_device *device = filep->private_data;
1644 if (unlikely(!device->ops->read))
1645 return -EINVAL;
1647 return device->ops->read(device->device_data, buf, count, ppos);
1650 static ssize_t vfio_device_fops_write(struct file *filep,
1651 const char __user *buf,
1652 size_t count, loff_t *ppos)
1654 struct vfio_device *device = filep->private_data;
1656 if (unlikely(!device->ops->write))
1657 return -EINVAL;
1659 return device->ops->write(device->device_data, buf, count, ppos);
1662 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1664 struct vfio_device *device = filep->private_data;
1666 if (unlikely(!device->ops->mmap))
1667 return -EINVAL;
1669 return device->ops->mmap(device->device_data, vma);
#ifdef CONFIG_COMPAT
/* 32-bit compat entry for the device fd: convert the pointer argument
 * and delegate to the native handler. */
static long vfio_device_fops_compat_ioctl(struct file *filep,
					  unsigned int cmd, unsigned long arg)
{
	return vfio_device_fops_unl_ioctl(filep, cmd,
					  (unsigned long)compat_ptr(arg));
}
#endif	/* CONFIG_COMPAT */
1681 static const struct file_operations vfio_device_fops = {
1682 .owner = THIS_MODULE,
1683 .release = vfio_device_fops_release,
1684 .read = vfio_device_fops_read,
1685 .write = vfio_device_fops_write,
1686 .unlocked_ioctl = vfio_device_fops_unl_ioctl,
1687 #ifdef CONFIG_COMPAT
1688 .compat_ioctl = vfio_device_fops_compat_ioctl,
1689 #endif
1690 .mmap = vfio_device_fops_mmap,
1694 * External user API, exported by symbols to be linked dynamically.
1696 * The protocol includes:
1697 * 1. do normal VFIO init operation:
1698 * - opening a new container;
1699 * - attaching group(s) to it;
1700 * - setting an IOMMU driver for a container.
1701 * When IOMMU is set for a container, all groups in it are
1702 * considered ready to use by an external user.
1704 * 2. User space passes a group fd to an external user.
1705 * The external user calls vfio_group_get_external_user()
1706 * to verify that:
1707 * - the group is initialized;
1708 * - IOMMU is set for it.
1709 * If both checks passed, vfio_group_get_external_user()
1710 * increments the container user counter to prevent
1711 * the VFIO group from disposal before KVM exits.
1713 * 3. The external user calls vfio_external_user_iommu_id()
1714 * to know an IOMMU ID.
1716 * 4. When the external KVM finishes, it calls
1717 * vfio_group_put_external_user() to release the VFIO group.
1718 * This call decrements the container user counter.
1720 struct vfio_group *vfio_group_get_external_user(struct file *filep)
1722 struct vfio_group *group = filep->private_data;
1724 if (filep->f_op != &vfio_group_fops)
1725 return ERR_PTR(-EINVAL);
1727 if (!atomic_inc_not_zero(&group->container_users))
1728 return ERR_PTR(-EINVAL);
1730 if (group->noiommu) {
1731 atomic_dec(&group->container_users);
1732 return ERR_PTR(-EPERM);
1735 if (!group->container->iommu_driver ||
1736 !vfio_group_viable(group)) {
1737 atomic_dec(&group->container_users);
1738 return ERR_PTR(-EINVAL);
1741 vfio_group_get(group);
1743 return group;
1745 EXPORT_SYMBOL_GPL(vfio_group_get_external_user);
/* Drop the container user and group reference taken by
 * vfio_group_get_external_user(). */
void vfio_group_put_external_user(struct vfio_group *group)
{
	vfio_group_try_dissolve_container(group);
	vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
1754 bool vfio_external_group_match_file(struct vfio_group *test_group,
1755 struct file *filep)
1757 struct vfio_group *group = filep->private_data;
1759 return (filep->f_op == &vfio_group_fops) && (group == test_group);
1761 EXPORT_SYMBOL_GPL(vfio_external_group_match_file);
1763 int vfio_external_user_iommu_id(struct vfio_group *group)
1765 return iommu_group_id(group->iommu_group);
1767 EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);
1769 long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
1771 return vfio_ioctl_check_extension(group->container, arg);
1773 EXPORT_SYMBOL_GPL(vfio_external_check_extension);
1776 * Sub-module support
1779 * Helper for managing a buffer of info chain capabilities, allocate or
1780 * reallocate a buffer with additional @size, filling in @id and @version
1781 * of the capability. A pointer to the new capability is returned.
1783 * NB. The chain is based at the head of the buffer, so new entries are
1784 * added to the tail, vfio_info_cap_shift() should be called to fixup the
1785 * next offsets prior to copying to the user buffer.
1787 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1788 size_t size, u16 id, u16 version)
1790 void *buf;
1791 struct vfio_info_cap_header *header, *tmp;
1793 buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1794 if (!buf) {
1795 kfree(caps->buf);
1796 caps->size = 0;
1797 return ERR_PTR(-ENOMEM);
1800 caps->buf = buf;
1801 header = buf + caps->size;
1803 /* Eventually copied to user buffer, zero */
1804 memset(header, 0, size);
1806 header->id = id;
1807 header->version = version;
1809 /* Add to the end of the capability chain */
1810 for (tmp = caps->buf; tmp->next; tmp = (void *)tmp + tmp->next)
1811 ; /* nothing */
1813 tmp->next = caps->size;
1814 caps->size += size;
1816 return header;
1818 EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1820 void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1822 struct vfio_info_cap_header *tmp;
1824 for (tmp = caps->buf; tmp->next; tmp = (void *)tmp + tmp->next - offset)
1825 tmp->next += offset;
1827 EXPORT_SYMBOL_GPL(vfio_info_cap_shift);
1830 * Module/class support
1832 static char *vfio_devnode(struct device *dev, umode_t *mode)
1834 return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
1837 static struct miscdevice vfio_dev = {
1838 .minor = VFIO_MINOR,
1839 .name = "vfio",
1840 .fops = &vfio_fops,
1841 .nodename = "vfio/vfio",
1842 .mode = S_IRUGO | S_IWUGO,
1845 static int __init vfio_init(void)
1847 int ret;
1849 idr_init(&vfio.group_idr);
1850 mutex_init(&vfio.group_lock);
1851 mutex_init(&vfio.iommu_drivers_lock);
1852 INIT_LIST_HEAD(&vfio.group_list);
1853 INIT_LIST_HEAD(&vfio.iommu_drivers_list);
1854 init_waitqueue_head(&vfio.release_q);
1856 ret = misc_register(&vfio_dev);
1857 if (ret) {
1858 pr_err("vfio: misc device register failed\n");
1859 return ret;
1862 /* /dev/vfio/$GROUP */
1863 vfio.class = class_create(THIS_MODULE, "vfio");
1864 if (IS_ERR(vfio.class)) {
1865 ret = PTR_ERR(vfio.class);
1866 goto err_class;
1869 vfio.class->devnode = vfio_devnode;
1871 ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK, "vfio");
1872 if (ret)
1873 goto err_alloc_chrdev;
1875 cdev_init(&vfio.group_cdev, &vfio_group_fops);
1876 ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK);
1877 if (ret)
1878 goto err_cdev_add;
1880 pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
1883 * Attempt to load known iommu-drivers. This gives us a working
1884 * environment without the user needing to explicitly load iommu
1885 * drivers.
1887 request_module_nowait("vfio_iommu_type1");
1888 request_module_nowait("vfio_iommu_spapr_tce");
1890 #ifdef CONFIG_VFIO_NOIOMMU
1891 vfio_register_iommu_driver(&vfio_noiommu_ops);
1892 #endif
1893 return 0;
1895 err_cdev_add:
1896 unregister_chrdev_region(vfio.group_devt, MINORMASK);
1897 err_alloc_chrdev:
1898 class_destroy(vfio.class);
1899 vfio.class = NULL;
1900 err_class:
1901 misc_deregister(&vfio_dev);
1902 return ret;
1905 static void __exit vfio_cleanup(void)
1907 WARN_ON(!list_empty(&vfio.group_list));
1909 #ifdef CONFIG_VFIO_NOIOMMU
1910 vfio_unregister_iommu_driver(&vfio_noiommu_ops);
1911 #endif
1912 idr_destroy(&vfio.group_idr);
1913 cdev_del(&vfio.group_cdev);
1914 unregister_chrdev_region(vfio.group_devt, MINORMASK);
1915 class_destroy(vfio.class);
1916 vfio.class = NULL;
1917 misc_deregister(&vfio_dev);
1920 module_init(vfio_init);
1921 module_exit(vfio_cleanup);
1923 MODULE_VERSION(DRIVER_VERSION);
1924 MODULE_LICENSE("GPL v2");
1925 MODULE_AUTHOR(DRIVER_AUTHOR);
1926 MODULE_DESCRIPTION(DRIVER_DESC);
1927 MODULE_ALIAS_MISCDEV(VFIO_MINOR);
1928 MODULE_ALIAS("devname:vfio/vfio");