[linux/fpc-iii.git] drivers/vfio/vfio.c
blob 561084ab387f3fd7c8ae3fa3e91c27d8329f7fe4
1 /*
2 * VFIO core
4 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
5 * Author: Alex Williamson <alex.williamson@redhat.com>
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
11 * Derived from original vfio:
12 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
13 * Author: Tom Lyon, pugs@cisco.com
16 #include <linux/cdev.h>
17 #include <linux/compat.h>
18 #include <linux/device.h>
19 #include <linux/file.h>
20 #include <linux/anon_inodes.h>
21 #include <linux/fs.h>
22 #include <linux/idr.h>
23 #include <linux/iommu.h>
24 #include <linux/list.h>
25 #include <linux/miscdevice.h>
26 #include <linux/module.h>
27 #include <linux/mutex.h>
28 #include <linux/pci.h>
29 #include <linux/rwsem.h>
30 #include <linux/sched.h>
31 #include <linux/slab.h>
32 #include <linux/stat.h>
33 #include <linux/string.h>
34 #include <linux/uaccess.h>
35 #include <linux/vfio.h>
36 #include <linux/wait.h>
38 #define DRIVER_VERSION "0.3"
39 #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
40 #define DRIVER_DESC "VFIO - User Level meta-driver"
42 static struct vfio {
43 struct class *class;
44 struct list_head iommu_drivers_list;
45 struct mutex iommu_drivers_lock;
46 struct list_head group_list;
47 struct idr group_idr;
48 struct mutex group_lock;
49 struct cdev group_cdev;
50 dev_t group_devt;
51 wait_queue_head_t release_q;
52 } vfio;
54 struct vfio_iommu_driver {
55 const struct vfio_iommu_driver_ops *ops;
56 struct list_head vfio_next;
59 struct vfio_container {
60 struct kref kref;
61 struct list_head group_list;
62 struct rw_semaphore group_lock;
63 struct vfio_iommu_driver *iommu_driver;
64 void *iommu_data;
65 bool noiommu;
68 struct vfio_unbound_dev {
69 struct device *dev;
70 struct list_head unbound_next;
73 struct vfio_group {
74 struct kref kref;
75 int minor;
76 atomic_t container_users;
77 struct iommu_group *iommu_group;
78 struct vfio_container *container;
79 struct list_head device_list;
80 struct mutex device_lock;
81 struct device *dev;
82 struct notifier_block nb;
83 struct list_head vfio_next;
84 struct list_head container_next;
85 struct list_head unbound_list;
86 struct mutex unbound_lock;
87 atomic_t opened;
88 bool noiommu;
89 struct kvm *kvm;
90 struct blocking_notifier_head notifier;
93 struct vfio_device {
94 struct kref kref;
95 struct device *dev;
96 const struct vfio_device_ops *ops;
97 struct vfio_group *group;
98 struct list_head group_next;
99 void *device_data;
102 #ifdef CONFIG_VFIO_NOIOMMU
103 static bool noiommu __read_mostly;
104 module_param_named(enable_unsafe_noiommu_mode,
105 noiommu, bool, S_IRUGO | S_IWUSR);
106 MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)");
107 #endif
110 * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
111 * and remove functions, any use cases other than acquiring the first
112 * reference for the purpose of calling vfio_add_group_dev() or removing
113 * that symmetric reference after vfio_del_group_dev() should use the raw
114 * iommu_group_{get,put} functions. In particular, vfio_iommu_group_put()
115 * removes the device from the dummy group and cannot be nested.
117 struct iommu_group *vfio_iommu_group_get(struct device *dev)
119 struct iommu_group *group;
120 int __maybe_unused ret;
122 group = iommu_group_get(dev);
124 #ifdef CONFIG_VFIO_NOIOMMU
126 * With noiommu enabled, an IOMMU group will be created for a device
127  * that doesn't already have one and doesn't have an iommu_ops on its
128 * bus. We set iommudata simply to be able to identify these groups
129 * as special use and for reclamation later.
131 if (group || !noiommu || iommu_present(dev->bus))
132 return group;
134 group = iommu_group_alloc();
135 if (IS_ERR(group))
136 return NULL;
138 iommu_group_set_name(group, "vfio-noiommu");
139 iommu_group_set_iommudata(group, &noiommu, NULL);
140 ret = iommu_group_add_device(group, dev);
141 iommu_group_put(group);
142 if (ret)
143 return NULL;
146 * Where to taint? At this point we've added an IOMMU group for a
147 * device that is not backed by iommu_ops, therefore any iommu_
148 * callback using iommu_ops can legitimately Oops. So, while we may
149 * be about to give a DMA capable device to a user without IOMMU
150 * protection, which is clearly taint-worthy, let's go ahead and do
151 * it here.
153 add_taint(TAINT_USER, LOCKDEP_STILL_OK);
154 dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
155 #endif
157 return group;
159 EXPORT_SYMBOL_GPL(vfio_iommu_group_get);
161 void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
163 #ifdef CONFIG_VFIO_NOIOMMU
164 if (iommu_group_get_iommudata(group) == &noiommu)
165 iommu_group_remove_device(dev);
166 #endif
168 iommu_group_put(group);
170 EXPORT_SYMBOL_GPL(vfio_iommu_group_put);
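/*
 * A minimal usage sketch (modeled on how vfio-pci and vfio-platform use
 * these helpers; the foo_* names and foo_vfio_dev_ops are hypothetical
 * placeholders): the bus driver takes its single vfio_iommu_group_get()
 * reference immediately before vfio_add_group_dev() in probe, and drops
 * it symmetrically after vfio_del_group_dev() in remove.
 */
static int foo_vfio_probe(struct device *dev, struct foo_device *foo)
{
	struct iommu_group *group;
	int ret;

	group = vfio_iommu_group_get(dev);	/* may create a noiommu group */
	if (!group)
		return -EINVAL;

	ret = vfio_add_group_dev(dev, &foo_vfio_dev_ops, foo);
	if (ret)
		vfio_iommu_group_put(group, dev);

	return ret;
}

static void foo_vfio_remove(struct device *dev)
{
	struct foo_device *foo = vfio_del_group_dev(dev);

	vfio_iommu_group_put(dev->iommu_group, dev);
	kfree(foo);
}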
172 #ifdef CONFIG_VFIO_NOIOMMU
173 static void *vfio_noiommu_open(unsigned long arg)
175 if (arg != VFIO_NOIOMMU_IOMMU)
176 return ERR_PTR(-EINVAL);
177 if (!capable(CAP_SYS_RAWIO))
178 return ERR_PTR(-EPERM);
180 return NULL;
183 static void vfio_noiommu_release(void *iommu_data)
187 static long vfio_noiommu_ioctl(void *iommu_data,
188 unsigned int cmd, unsigned long arg)
190 if (cmd == VFIO_CHECK_EXTENSION)
191 return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;
193 return -ENOTTY;
196 static int vfio_noiommu_attach_group(void *iommu_data,
197 struct iommu_group *iommu_group)
199 return iommu_group_get_iommudata(iommu_group) == &noiommu ? 0 : -EINVAL;
202 static void vfio_noiommu_detach_group(void *iommu_data,
203 struct iommu_group *iommu_group)
207 static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
208 .name = "vfio-noiommu",
209 .owner = THIS_MODULE,
210 .open = vfio_noiommu_open,
211 .release = vfio_noiommu_release,
212 .ioctl = vfio_noiommu_ioctl,
213 .attach_group = vfio_noiommu_attach_group,
214 .detach_group = vfio_noiommu_detach_group,
216 #endif
220 * IOMMU driver registration
222 int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
224 struct vfio_iommu_driver *driver, *tmp;
226 driver = kzalloc(sizeof(*driver), GFP_KERNEL);
227 if (!driver)
228 return -ENOMEM;
230 driver->ops = ops;
232 mutex_lock(&vfio.iommu_drivers_lock);
234 /* Check for duplicates */
235 list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
236 if (tmp->ops == ops) {
237 mutex_unlock(&vfio.iommu_drivers_lock);
238 kfree(driver);
239 return -EINVAL;
243 list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
245 mutex_unlock(&vfio.iommu_drivers_lock);
247 return 0;
249 EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
251 void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
253 struct vfio_iommu_driver *driver;
255 mutex_lock(&vfio.iommu_drivers_lock);
256 list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
257 if (driver->ops == ops) {
258 list_del(&driver->vfio_next);
259 mutex_unlock(&vfio.iommu_drivers_lock);
260 kfree(driver);
261 return;
264 mutex_unlock(&vfio.iommu_drivers_lock);
266 EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
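/*
 * A minimal sketch of an IOMMU backend registering itself through this
 * interface, mirroring vfio_noiommu_ops above and what vfio_iommu_type1
 * does at module load.  The foo_iommu_* callbacks are hypothetical; they
 * would implement the container open/release, ioctl (including
 * VFIO_CHECK_EXTENSION) and group attach/detach logic.
 */
static const struct vfio_iommu_driver_ops foo_iommu_ops = {
	.name		= "vfio-foo",
	.owner		= THIS_MODULE,
	.open		= foo_iommu_open,
	.release	= foo_iommu_release,
	.ioctl		= foo_iommu_ioctl,
	.attach_group	= foo_iommu_attach_group,
	.detach_group	= foo_iommu_detach_group,
};

static int __init foo_iommu_init(void)
{
	return vfio_register_iommu_driver(&foo_iommu_ops);
}

static void __exit foo_iommu_exit(void)
{
	vfio_unregister_iommu_driver(&foo_iommu_ops);
}

module_init(foo_iommu_init);
module_exit(foo_iommu_exit);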
269 * Group minor allocation/free - both called with vfio.group_lock held
271 static int vfio_alloc_group_minor(struct vfio_group *group)
273 return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
276 static void vfio_free_group_minor(int minor)
278 idr_remove(&vfio.group_idr, minor);
281 static int vfio_iommu_group_notifier(struct notifier_block *nb,
282 unsigned long action, void *data);
283 static void vfio_group_get(struct vfio_group *group);
286 * Container objects - containers are created when /dev/vfio/vfio is
287 * opened, but their lifecycle extends until the last user is done, so
288 * it's freed via kref. Must support container/group/device being
289 * closed in any order.
291 static void vfio_container_get(struct vfio_container *container)
293 kref_get(&container->kref);
296 static void vfio_container_release(struct kref *kref)
298 struct vfio_container *container;
299 container = container_of(kref, struct vfio_container, kref);
301 kfree(container);
304 static void vfio_container_put(struct vfio_container *container)
306 kref_put(&container->kref, vfio_container_release);
309 static void vfio_group_unlock_and_free(struct vfio_group *group)
311 mutex_unlock(&vfio.group_lock);
313 * Unregister outside of lock. A spurious callback is harmless now
314 * that the group is no longer in vfio.group_list.
316 iommu_group_unregister_notifier(group->iommu_group, &group->nb);
317 kfree(group);
321 * Group objects - create, release, get, put, search
323 static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
325 struct vfio_group *group, *tmp;
326 struct device *dev;
327 int ret, minor;
329 group = kzalloc(sizeof(*group), GFP_KERNEL);
330 if (!group)
331 return ERR_PTR(-ENOMEM);
333 kref_init(&group->kref);
334 INIT_LIST_HEAD(&group->device_list);
335 mutex_init(&group->device_lock);
336 INIT_LIST_HEAD(&group->unbound_list);
337 mutex_init(&group->unbound_lock);
338 atomic_set(&group->container_users, 0);
339 atomic_set(&group->opened, 0);
340 group->iommu_group = iommu_group;
341 #ifdef CONFIG_VFIO_NOIOMMU
342 group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
343 #endif
344 BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
346 group->nb.notifier_call = vfio_iommu_group_notifier;
349 * blocking notifiers acquire a rwsem around registering and hold
350 * it around callback. Therefore, need to register outside of
351 * vfio.group_lock to avoid A-B/B-A contention. Our callback won't
352 * do anything unless it can find the group in vfio.group_list, so
353 * no harm in registering early.
355 ret = iommu_group_register_notifier(iommu_group, &group->nb);
356 if (ret) {
357 kfree(group);
358 return ERR_PTR(ret);
361 mutex_lock(&vfio.group_lock);
363 /* Did we race creating this group? */
364 list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
365 if (tmp->iommu_group == iommu_group) {
366 vfio_group_get(tmp);
367 vfio_group_unlock_and_free(group);
368 return tmp;
372 minor = vfio_alloc_group_minor(group);
373 if (minor < 0) {
374 vfio_group_unlock_and_free(group);
375 return ERR_PTR(minor);
378 dev = device_create(vfio.class, NULL,
379 MKDEV(MAJOR(vfio.group_devt), minor),
380 group, "%s%d", group->noiommu ? "noiommu-" : "",
381 iommu_group_id(iommu_group));
382 if (IS_ERR(dev)) {
383 vfio_free_group_minor(minor);
384 vfio_group_unlock_and_free(group);
385 return (struct vfio_group *)dev; /* ERR_PTR */
388 group->minor = minor;
389 group->dev = dev;
391 list_add(&group->vfio_next, &vfio.group_list);
393 mutex_unlock(&vfio.group_lock);
395 return group;
398 /* called with vfio.group_lock held */
399 static void vfio_group_release(struct kref *kref)
401 struct vfio_group *group = container_of(kref, struct vfio_group, kref);
402 struct vfio_unbound_dev *unbound, *tmp;
403 struct iommu_group *iommu_group = group->iommu_group;
405 WARN_ON(!list_empty(&group->device_list));
406 WARN_ON(group->notifier.head);
408 list_for_each_entry_safe(unbound, tmp,
409 &group->unbound_list, unbound_next) {
410 list_del(&unbound->unbound_next);
411 kfree(unbound);
414 device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
415 list_del(&group->vfio_next);
416 vfio_free_group_minor(group->minor);
417 vfio_group_unlock_and_free(group);
418 iommu_group_put(iommu_group);
421 static void vfio_group_put(struct vfio_group *group)
423 kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
426 /* Assume group_lock or group reference is held */
427 static void vfio_group_get(struct vfio_group *group)
429 kref_get(&group->kref);
433 * Not really a try as we will sleep for mutex, but we need to make
434 * sure the group pointer is valid under lock and get a reference.
436 static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
438 struct vfio_group *target = group;
440 mutex_lock(&vfio.group_lock);
441 list_for_each_entry(group, &vfio.group_list, vfio_next) {
442 if (group == target) {
443 vfio_group_get(group);
444 mutex_unlock(&vfio.group_lock);
445 return group;
448 mutex_unlock(&vfio.group_lock);
450 return NULL;
453 static
454 struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
456 struct vfio_group *group;
458 mutex_lock(&vfio.group_lock);
459 list_for_each_entry(group, &vfio.group_list, vfio_next) {
460 if (group->iommu_group == iommu_group) {
461 vfio_group_get(group);
462 mutex_unlock(&vfio.group_lock);
463 return group;
466 mutex_unlock(&vfio.group_lock);
468 return NULL;
471 static struct vfio_group *vfio_group_get_from_minor(int minor)
473 struct vfio_group *group;
475 mutex_lock(&vfio.group_lock);
476 group = idr_find(&vfio.group_idr, minor);
477 if (!group) {
478 mutex_unlock(&vfio.group_lock);
479 return NULL;
481 vfio_group_get(group);
482 mutex_unlock(&vfio.group_lock);
484 return group;
487 static struct vfio_group *vfio_group_get_from_dev(struct device *dev)
489 struct iommu_group *iommu_group;
490 struct vfio_group *group;
492 iommu_group = iommu_group_get(dev);
493 if (!iommu_group)
494 return NULL;
496 group = vfio_group_get_from_iommu(iommu_group);
497 iommu_group_put(iommu_group);
499 return group;
503 * Device objects - create, release, get, put, search
505 static
506 struct vfio_device *vfio_group_create_device(struct vfio_group *group,
507 struct device *dev,
508 const struct vfio_device_ops *ops,
509 void *device_data)
511 struct vfio_device *device;
513 device = kzalloc(sizeof(*device), GFP_KERNEL);
514 if (!device)
515 return ERR_PTR(-ENOMEM);
517 kref_init(&device->kref);
518 device->dev = dev;
519 device->group = group;
520 device->ops = ops;
521 device->device_data = device_data;
522 dev_set_drvdata(dev, device);
524 /* No need to get group_lock, caller has group reference */
525 vfio_group_get(group);
527 mutex_lock(&group->device_lock);
528 list_add(&device->group_next, &group->device_list);
529 mutex_unlock(&group->device_lock);
531 return device;
534 static void vfio_device_release(struct kref *kref)
536 struct vfio_device *device = container_of(kref,
537 struct vfio_device, kref);
538 struct vfio_group *group = device->group;
540 list_del(&device->group_next);
541 mutex_unlock(&group->device_lock);
543 dev_set_drvdata(device->dev, NULL);
545 kfree(device);
547 /* vfio_del_group_dev may be waiting for this device */
548 wake_up(&vfio.release_q);
551 /* Device reference always implies a group reference */
552 void vfio_device_put(struct vfio_device *device)
554 struct vfio_group *group = device->group;
555 kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
556 vfio_group_put(group);
558 EXPORT_SYMBOL_GPL(vfio_device_put);
560 static void vfio_device_get(struct vfio_device *device)
562 vfio_group_get(device->group);
563 kref_get(&device->kref);
566 static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
567 struct device *dev)
569 struct vfio_device *device;
571 mutex_lock(&group->device_lock);
572 list_for_each_entry(device, &group->device_list, group_next) {
573 if (device->dev == dev) {
574 vfio_device_get(device);
575 mutex_unlock(&group->device_lock);
576 return device;
579 mutex_unlock(&group->device_lock);
580 return NULL;
584 * Some drivers, like pci-stub, are only used to prevent other drivers from
585 * claiming a device and are therefore perfectly legitimate for a user owned
586 * group. The pci-stub driver has no dependencies on DMA or the IOVA mapping
587 * of the device, but it does prevent the user from having direct access to
588 * the device, which is useful in some circumstances.
590 * We also assume that we can include PCI interconnect devices, ie. bridges.
591 * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
592 * then all of the downstream devices will be part of the same IOMMU group as
593 * the bridge. Thus, if placing the bridge into the user owned IOVA space
594 * breaks anything, it only does so for user owned devices downstream. Note
595 * that error notification via MSI can be affected for platforms that handle
596 * MSI within the same IOVA space as DMA.
598 static const char * const vfio_driver_whitelist[] = { "pci-stub" };
600 static bool vfio_dev_whitelisted(struct device *dev, struct device_driver *drv)
602 int i;
604 if (dev_is_pci(dev)) {
605 struct pci_dev *pdev = to_pci_dev(dev);
607 if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
608 return true;
611 for (i = 0; i < ARRAY_SIZE(vfio_driver_whitelist); i++) {
612 if (!strcmp(drv->name, vfio_driver_whitelist[i]))
613 return true;
616 return false;
620 * A vfio group is viable for use by userspace if all devices are in
621 * one of the following states:
622 * - driver-less
623 * - bound to a vfio driver
624 * - bound to a whitelisted driver
625 * - a PCI interconnect device
627 * We use two methods to determine whether a device is bound to a vfio
628 * driver. The first is to test whether the device exists in the vfio
629 * group. The second is to test if the device exists on the group
630 * unbound_list, indicating it's in the middle of transitioning from
631 * a vfio driver to driver-less.
633 static int vfio_dev_viable(struct device *dev, void *data)
635 struct vfio_group *group = data;
636 struct vfio_device *device;
637 struct device_driver *drv = ACCESS_ONCE(dev->driver);
638 struct vfio_unbound_dev *unbound;
639 int ret = -EINVAL;
641 mutex_lock(&group->unbound_lock);
642 list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
643 if (dev == unbound->dev) {
644 ret = 0;
645 break;
648 mutex_unlock(&group->unbound_lock);
650 if (!ret || !drv || vfio_dev_whitelisted(dev, drv))
651 return 0;
653 device = vfio_group_get_device(group, dev);
654 if (device) {
655 vfio_device_put(device);
656 return 0;
659 return ret;
663 * Async device support
665 static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
667 struct vfio_device *device;
669 /* Do we already know about it? We shouldn't */
670 device = vfio_group_get_device(group, dev);
671 if (WARN_ON_ONCE(device)) {
672 vfio_device_put(device);
673 return 0;
676 /* Nothing to do for idle groups */
677 if (!atomic_read(&group->container_users))
678 return 0;
680 /* TODO Prevent device auto probing */
681 WARN(1, "Device %s added to live group %d!\n", dev_name(dev),
682 iommu_group_id(group->iommu_group));
684 return 0;
687 static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
689 /* We don't care what happens when the group isn't in use */
690 if (!atomic_read(&group->container_users))
691 return 0;
693 return vfio_dev_viable(dev, group);
696 static int vfio_iommu_group_notifier(struct notifier_block *nb,
697 unsigned long action, void *data)
699 struct vfio_group *group = container_of(nb, struct vfio_group, nb);
700 struct device *dev = data;
701 struct vfio_unbound_dev *unbound;
704 * Need to go through a group_lock lookup to get a reference or we
705 * risk racing a group being removed. Ignore spurious notifies.
707 group = vfio_group_try_get(group);
708 if (!group)
709 return NOTIFY_OK;
711 switch (action) {
712 case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
713 vfio_group_nb_add_dev(group, dev);
714 break;
715 case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
717 * Nothing to do here. If the device is in use, then the
718 * vfio sub-driver should block the remove callback until
719 * it is unused. If the device is unused or attached to a
720 * stub driver, then it should be released and we don't
721 * care that it will be going away.
723 break;
724 case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
725 pr_debug("%s: Device %s, group %d binding to driver\n",
726 __func__, dev_name(dev),
727 iommu_group_id(group->iommu_group));
728 break;
729 case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
730 pr_debug("%s: Device %s, group %d bound to driver %s\n",
731 __func__, dev_name(dev),
732 iommu_group_id(group->iommu_group), dev->driver->name);
733 BUG_ON(vfio_group_nb_verify(group, dev));
734 break;
735 case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
736 pr_debug("%s: Device %s, group %d unbinding from driver %s\n",
737 __func__, dev_name(dev),
738 iommu_group_id(group->iommu_group), dev->driver->name);
739 break;
740 case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
741 pr_debug("%s: Device %s, group %d unbound from driver\n",
742 __func__, dev_name(dev),
743 iommu_group_id(group->iommu_group));
745 * XXX An unbound device in a live group is ok, but we'd
746 * really like to avoid the above BUG_ON by preventing other
747 * drivers from binding to it. Once that occurs, we have to
748 * stop the system to maintain isolation. At a minimum, we'd
749 * want a toggle to disable driver auto probe for this device.
752 mutex_lock(&group->unbound_lock);
753 list_for_each_entry(unbound,
754 &group->unbound_list, unbound_next) {
755 if (dev == unbound->dev) {
756 list_del(&unbound->unbound_next);
757 kfree(unbound);
758 break;
761 mutex_unlock(&group->unbound_lock);
762 break;
765 vfio_group_put(group);
766 return NOTIFY_OK;
770 * VFIO driver API
772 int vfio_add_group_dev(struct device *dev,
773 const struct vfio_device_ops *ops, void *device_data)
775 struct iommu_group *iommu_group;
776 struct vfio_group *group;
777 struct vfio_device *device;
779 iommu_group = iommu_group_get(dev);
780 if (!iommu_group)
781 return -EINVAL;
783 group = vfio_group_get_from_iommu(iommu_group);
784 if (!group) {
785 group = vfio_create_group(iommu_group);
786 if (IS_ERR(group)) {
787 iommu_group_put(iommu_group);
788 return PTR_ERR(group);
790 } else {
792 * A found vfio_group already holds a reference to the
793 * iommu_group. A created vfio_group keeps the reference.
795 iommu_group_put(iommu_group);
798 device = vfio_group_get_device(group, dev);
799 if (device) {
800 WARN(1, "Device %s already exists on group %d\n",
801 dev_name(dev), iommu_group_id(iommu_group));
802 vfio_device_put(device);
803 vfio_group_put(group);
804 return -EBUSY;
807 device = vfio_group_create_device(group, dev, ops, device_data);
808 if (IS_ERR(device)) {
809 vfio_group_put(group);
810 return PTR_ERR(device);
814 * Drop all but the vfio_device reference. The vfio_device holds
815 * a reference to the vfio_group, which holds a reference to the
816 * iommu_group.
818 vfio_group_put(group);
820 return 0;
822 EXPORT_SYMBOL_GPL(vfio_add_group_dev);
825 * Get a reference to the vfio_device for a device. Even if the
826 * caller thinks they own the device, they could be racing with a
827 * release call path, so we can't trust drvdata for the shortcut.
828 * Go the long way around, from the iommu_group to the vfio_group
829 * to the vfio_device.
831 struct vfio_device *vfio_device_get_from_dev(struct device *dev)
833 struct vfio_group *group;
834 struct vfio_device *device;
836 group = vfio_group_get_from_dev(dev);
837 if (!group)
838 return NULL;
840 device = vfio_group_get_device(group, dev);
841 vfio_group_put(group);
843 return device;
845 EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
847 static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
848 char *buf)
850 struct vfio_device *it, *device = NULL;
852 mutex_lock(&group->device_lock);
853 list_for_each_entry(it, &group->device_list, group_next) {
854 if (!strcmp(dev_name(it->dev), buf)) {
855 device = it;
856 vfio_device_get(device);
857 break;
860 mutex_unlock(&group->device_lock);
862 return device;
866 * Caller must hold a reference to the vfio_device
868 void *vfio_device_data(struct vfio_device *device)
870 return device->device_data;
872 EXPORT_SYMBOL_GPL(vfio_device_data);
874 /* Given a referenced group, check if it contains the device */
875 static bool vfio_dev_present(struct vfio_group *group, struct device *dev)
877 struct vfio_device *device;
879 device = vfio_group_get_device(group, dev);
880 if (!device)
881 return false;
883 vfio_device_put(device);
884 return true;
888 * Decrement the device reference count and wait for the device to be
889 * removed. Open file descriptors for the device... */
890 void *vfio_del_group_dev(struct device *dev)
892 struct vfio_device *device = dev_get_drvdata(dev);
893 struct vfio_group *group = device->group;
894 void *device_data = device->device_data;
895 struct vfio_unbound_dev *unbound;
896 unsigned int i = 0;
897 long ret;
898 bool interrupted = false;
901 * The group exists so long as we have a device reference. Get
902 * a group reference and use it to scan for the device going away.
904 vfio_group_get(group);
907 * When the device is removed from the group, the group suddenly
908 * becomes non-viable; the device has a driver (until the unbind
909 * completes), but it's not present in the group. This is bad news
910 * for any external users that need to re-acquire a group reference
911 * in order to match and release their existing reference. To
912 * solve this, we track such devices on the unbound_list to bridge
913 * the gap until they're fully unbound.
915 unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
916 if (unbound) {
917 unbound->dev = dev;
918 mutex_lock(&group->unbound_lock);
919 list_add(&unbound->unbound_next, &group->unbound_list);
920 mutex_unlock(&group->unbound_lock);
922 WARN_ON(!unbound);
924 vfio_device_put(device);
927 * If the device is still present in the group after the above
928 * 'put', then it is in use and we need to request it from the
929 * bus driver. The driver may in turn need to request the
930 * device from the user. We send the request on an arbitrary
931 * interval with counter to allow the driver to take escalating
932 * measures to release the device if it has the ability to do so.
934 do {
935 device = vfio_group_get_device(group, dev);
936 if (!device)
937 break;
939 if (device->ops->request)
940 device->ops->request(device_data, i++);
942 vfio_device_put(device);
944 if (interrupted) {
945 ret = wait_event_timeout(vfio.release_q,
946 !vfio_dev_present(group, dev), HZ * 10);
947 } else {
948 ret = wait_event_interruptible_timeout(vfio.release_q,
949 !vfio_dev_present(group, dev), HZ * 10);
950 if (ret == -ERESTARTSYS) {
951 interrupted = true;
952 dev_warn(dev,
953 "Device is currently in use, task"
954 " \"%s\" (%d) "
955 "blocked until device is released",
956 current->comm, task_pid_nr(current));
959 } while (ret <= 0);
961 vfio_group_put(group);
963 return device_data;
965 EXPORT_SYMBOL_GPL(vfio_del_group_dev);
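/*
 * One way for a bus driver to honor the escalating ->request() callbacks
 * issued by vfio_del_group_dev() above is to relay them to the user via
 * an eventfd, roughly as vfio-pci does.  struct foo_device and its
 * req_trigger eventfd context are hypothetical here, and <linux/eventfd.h>
 * is assumed; the function would be wired into struct vfio_device_ops
 * as the .request callback.
 */
static void foo_vfio_request(void *device_data, unsigned int count)
{
	struct foo_device *foo = device_data;

	if (!foo->req_trigger)
		return;

	if (!(count % 10))
		dev_notice_ratelimited(foo->dev,
				       "Relaying device request to user (#%u)\n",
				       count);

	eventfd_signal(foo->req_trigger, 1);
}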
968 * VFIO base fd, /dev/vfio/vfio
970 static long vfio_ioctl_check_extension(struct vfio_container *container,
971 unsigned long arg)
973 struct vfio_iommu_driver *driver;
974 long ret = 0;
976 down_read(&container->group_lock);
978 driver = container->iommu_driver;
980 switch (arg) {
981 /* No base extensions yet */
982 default:
984 * If no driver is set, poll all registered drivers for
985 * extensions and return the first positive result. If
986 * a driver is already set, further queries will be passed
987 * only to that driver.
989 if (!driver) {
990 mutex_lock(&vfio.iommu_drivers_lock);
991 list_for_each_entry(driver, &vfio.iommu_drivers_list,
992 vfio_next) {
994 #ifdef CONFIG_VFIO_NOIOMMU
995 if (!list_empty(&container->group_list) &&
996 (container->noiommu !=
997 (driver->ops == &vfio_noiommu_ops)))
998 continue;
999 #endif
1001 if (!try_module_get(driver->ops->owner))
1002 continue;
1004 ret = driver->ops->ioctl(NULL,
1005 VFIO_CHECK_EXTENSION,
1006 arg);
1007 module_put(driver->ops->owner);
1008 if (ret > 0)
1009 break;
1011 mutex_unlock(&vfio.iommu_drivers_lock);
1012 } else
1013 ret = driver->ops->ioctl(container->iommu_data,
1014 VFIO_CHECK_EXTENSION, arg);
1017 up_read(&container->group_lock);
1019 return ret;
1022 /* hold write lock on container->group_lock */
1023 static int __vfio_container_attach_groups(struct vfio_container *container,
1024 struct vfio_iommu_driver *driver,
1025 void *data)
1027 struct vfio_group *group;
1028 int ret = -ENODEV;
1030 list_for_each_entry(group, &container->group_list, container_next) {
1031 ret = driver->ops->attach_group(data, group->iommu_group);
1032 if (ret)
1033 goto unwind;
1036 return ret;
1038 unwind:
1039 list_for_each_entry_continue_reverse(group, &container->group_list,
1040 container_next) {
1041 driver->ops->detach_group(data, group->iommu_group);
1044 return ret;
1047 static long vfio_ioctl_set_iommu(struct vfio_container *container,
1048 unsigned long arg)
1050 struct vfio_iommu_driver *driver;
1051 long ret = -ENODEV;
1053 down_write(&container->group_lock);
1056 * The container is designed to be an unprivileged interface while
1057 * the group can be assigned to specific users. Therefore, only by
1058 * adding a group to a container does the user get the privilege of
1059 * enabling the iommu, which may allocate finite resources. There
1060 * is no unset_iommu, but by removing all the groups from a container,
1061 * the container is deprivileged and returns to an unset state.
1063 if (list_empty(&container->group_list) || container->iommu_driver) {
1064 up_write(&container->group_lock);
1065 return -EINVAL;
1068 mutex_lock(&vfio.iommu_drivers_lock);
1069 list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
1070 void *data;
1072 #ifdef CONFIG_VFIO_NOIOMMU
1074 * Only noiommu containers can use vfio-noiommu and noiommu
1075 * containers can only use vfio-noiommu.
1077 if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
1078 continue;
1079 #endif
1081 if (!try_module_get(driver->ops->owner))
1082 continue;
1085 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
1086 * so test which iommu driver reported support for this
1087 * extension and call open on them. We also pass them the
1088 * magic, allowing a single driver to support multiple
1089 * interfaces if they'd like.
1091 if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
1092 module_put(driver->ops->owner);
1093 continue;
1096 data = driver->ops->open(arg);
1097 if (IS_ERR(data)) {
1098 ret = PTR_ERR(data);
1099 module_put(driver->ops->owner);
1100 continue;
1103 ret = __vfio_container_attach_groups(container, driver, data);
1104 if (ret) {
1105 driver->ops->release(data);
1106 module_put(driver->ops->owner);
1107 continue;
1110 container->iommu_driver = driver;
1111 container->iommu_data = data;
1112 break;
1115 mutex_unlock(&vfio.iommu_drivers_lock);
1116 up_write(&container->group_lock);
1118 return ret;
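/*
 * Seen from user space, the privilege model described above is exercised
 * by the canonical sequence below (error handling omitted; the group
 * number and device name are examples only, cf. Documentation/vfio.txt):
 *
 *	int container, group, device;
 *	struct vfio_group_status status = { .argsz = sizeof(status) };
 *
 *	container = open("/dev/vfio/vfio", O_RDWR);
 *	ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU);
 *
 *	group = open("/dev/vfio/26", O_RDWR);
 *	ioctl(group, VFIO_GROUP_GET_STATUS, &status);
 *	(status.flags should report VFIO_GROUP_FLAGS_VIABLE)
 *
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 *	device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
 */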
1121 static long vfio_fops_unl_ioctl(struct file *filep,
1122 unsigned int cmd, unsigned long arg)
1124 struct vfio_container *container = filep->private_data;
1125 struct vfio_iommu_driver *driver;
1126 void *data;
1127 long ret = -EINVAL;
1129 if (!container)
1130 return ret;
1132 switch (cmd) {
1133 case VFIO_GET_API_VERSION:
1134 ret = VFIO_API_VERSION;
1135 break;
1136 case VFIO_CHECK_EXTENSION:
1137 ret = vfio_ioctl_check_extension(container, arg);
1138 break;
1139 case VFIO_SET_IOMMU:
1140 ret = vfio_ioctl_set_iommu(container, arg);
1141 break;
1142 default:
1143 down_read(&container->group_lock);
1145 driver = container->iommu_driver;
1146 data = container->iommu_data;
1148 if (driver) /* passthrough all unrecognized ioctls */
1149 ret = driver->ops->ioctl(data, cmd, arg);
1151 up_read(&container->group_lock);
1154 return ret;
1157 #ifdef CONFIG_COMPAT
1158 static long vfio_fops_compat_ioctl(struct file *filep,
1159 unsigned int cmd, unsigned long arg)
1161 arg = (unsigned long)compat_ptr(arg);
1162 return vfio_fops_unl_ioctl(filep, cmd, arg);
1164 #endif /* CONFIG_COMPAT */
1166 static int vfio_fops_open(struct inode *inode, struct file *filep)
1168 struct vfio_container *container;
1170 container = kzalloc(sizeof(*container), GFP_KERNEL);
1171 if (!container)
1172 return -ENOMEM;
1174 INIT_LIST_HEAD(&container->group_list);
1175 init_rwsem(&container->group_lock);
1176 kref_init(&container->kref);
1178 filep->private_data = container;
1180 return 0;
1183 static int vfio_fops_release(struct inode *inode, struct file *filep)
1185 struct vfio_container *container = filep->private_data;
1187 filep->private_data = NULL;
1189 vfio_container_put(container);
1191 return 0;
1195 * Once an iommu driver is set, we optionally pass read/write/mmap
1196 * on to the driver, allowing management interfaces beyond ioctl.
1198 static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
1199 size_t count, loff_t *ppos)
1201 struct vfio_container *container = filep->private_data;
1202 struct vfio_iommu_driver *driver;
1203 ssize_t ret = -EINVAL;
1205 down_read(&container->group_lock);
1207 driver = container->iommu_driver;
1208 if (likely(driver && driver->ops->read))
1209 ret = driver->ops->read(container->iommu_data,
1210 buf, count, ppos);
1212 up_read(&container->group_lock);
1214 return ret;
1217 static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
1218 size_t count, loff_t *ppos)
1220 struct vfio_container *container = filep->private_data;
1221 struct vfio_iommu_driver *driver;
1222 ssize_t ret = -EINVAL;
1224 down_read(&container->group_lock);
1226 driver = container->iommu_driver;
1227 if (likely(driver && driver->ops->write))
1228 ret = driver->ops->write(container->iommu_data,
1229 buf, count, ppos);
1231 up_read(&container->group_lock);
1233 return ret;
1236 static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1238 struct vfio_container *container = filep->private_data;
1239 struct vfio_iommu_driver *driver;
1240 int ret = -EINVAL;
1242 down_read(&container->group_lock);
1244 driver = container->iommu_driver;
1245 if (likely(driver && driver->ops->mmap))
1246 ret = driver->ops->mmap(container->iommu_data, vma);
1248 up_read(&container->group_lock);
1250 return ret;
1253 static const struct file_operations vfio_fops = {
1254 .owner = THIS_MODULE,
1255 .open = vfio_fops_open,
1256 .release = vfio_fops_release,
1257 .read = vfio_fops_read,
1258 .write = vfio_fops_write,
1259 .unlocked_ioctl = vfio_fops_unl_ioctl,
1260 #ifdef CONFIG_COMPAT
1261 .compat_ioctl = vfio_fops_compat_ioctl,
1262 #endif
1263 .mmap = vfio_fops_mmap,
1267 * VFIO Group fd, /dev/vfio/$GROUP
1269 static void __vfio_group_unset_container(struct vfio_group *group)
1271 struct vfio_container *container = group->container;
1272 struct vfio_iommu_driver *driver;
1274 down_write(&container->group_lock);
1276 driver = container->iommu_driver;
1277 if (driver)
1278 driver->ops->detach_group(container->iommu_data,
1279 group->iommu_group);
1281 group->container = NULL;
1282 list_del(&group->container_next);
1284 /* Detaching the last group deprivileges a container, remove iommu */
1285 if (driver && list_empty(&container->group_list)) {
1286 driver->ops->release(container->iommu_data);
1287 module_put(driver->ops->owner);
1288 container->iommu_driver = NULL;
1289 container->iommu_data = NULL;
1292 up_write(&container->group_lock);
1294 vfio_container_put(container);
1298 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
1299 * if there was no container to unset. Since the ioctl is called on
1300 * the group, we know that still exists, therefore the only valid
1301 * transition here is 1->0.
1303 static int vfio_group_unset_container(struct vfio_group *group)
1305 int users = atomic_cmpxchg(&group->container_users, 1, 0);
1307 if (!users)
1308 return -EINVAL;
1309 if (users != 1)
1310 return -EBUSY;
1312 __vfio_group_unset_container(group);
1314 return 0;
1318 * When removing container users, anything that removes the last user
1319 * implicitly removes the group from the container. That is, if the
1320 * group file descriptor is closed, as well as any device file descriptors,
1321 * the group is free.
1323 static void vfio_group_try_dissolve_container(struct vfio_group *group)
1325 if (0 == atomic_dec_if_positive(&group->container_users))
1326 __vfio_group_unset_container(group);
1329 static int vfio_group_set_container(struct vfio_group *group, int container_fd)
1331 struct fd f;
1332 struct vfio_container *container;
1333 struct vfio_iommu_driver *driver;
1334 int ret = 0;
1336 if (atomic_read(&group->container_users))
1337 return -EINVAL;
1339 if (group->noiommu && !capable(CAP_SYS_RAWIO))
1340 return -EPERM;
1342 f = fdget(container_fd);
1343 if (!f.file)
1344 return -EBADF;
1346 /* Sanity check, is this really our fd? */
1347 if (f.file->f_op != &vfio_fops) {
1348 fdput(f);
1349 return -EINVAL;
1352 container = f.file->private_data;
1353 WARN_ON(!container); /* fget ensures we don't race vfio_release */
1355 down_write(&container->group_lock);
1357 /* Real groups and fake groups cannot mix */
1358 if (!list_empty(&container->group_list) &&
1359 container->noiommu != group->noiommu) {
1360 ret = -EPERM;
1361 goto unlock_out;
1364 driver = container->iommu_driver;
1365 if (driver) {
1366 ret = driver->ops->attach_group(container->iommu_data,
1367 group->iommu_group);
1368 if (ret)
1369 goto unlock_out;
1372 group->container = container;
1373 container->noiommu = group->noiommu;
1374 list_add(&group->container_next, &container->group_list);
1376 /* Get a reference on the container and mark a user within the group */
1377 vfio_container_get(container);
1378 atomic_inc(&group->container_users);
1380 unlock_out:
1381 up_write(&container->group_lock);
1382 fdput(f);
1383 return ret;
1386 static bool vfio_group_viable(struct vfio_group *group)
1388 return (iommu_group_for_each_dev(group->iommu_group,
1389 group, vfio_dev_viable) == 0);
1392 static int vfio_group_add_container_user(struct vfio_group *group)
1394 if (!atomic_inc_not_zero(&group->container_users))
1395 return -EINVAL;
1397 if (group->noiommu) {
1398 atomic_dec(&group->container_users);
1399 return -EPERM;
1401 if (!group->container->iommu_driver || !vfio_group_viable(group)) {
1402 atomic_dec(&group->container_users);
1403 return -EINVAL;
1406 return 0;
1409 static const struct file_operations vfio_device_fops;
1411 static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
1413 struct vfio_device *device;
1414 struct file *filep;
1415 int ret;
1417 if (0 == atomic_read(&group->container_users) ||
1418 !group->container->iommu_driver || !vfio_group_viable(group))
1419 return -EINVAL;
1421 if (group->noiommu && !capable(CAP_SYS_RAWIO))
1422 return -EPERM;
1424 device = vfio_device_get_from_name(group, buf);
1425 if (!device)
1426 return -ENODEV;
1428 ret = device->ops->open(device->device_data);
1429 if (ret) {
1430 vfio_device_put(device);
1431 return ret;
1435 * We can't use anon_inode_getfd() because we need to modify
1436 * the f_mode flags directly to allow more than just ioctls
1438 ret = get_unused_fd_flags(O_CLOEXEC);
1439 if (ret < 0) {
1440 device->ops->release(device->device_data);
1441 vfio_device_put(device);
1442 return ret;
1445 filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
1446 device, O_RDWR);
1447 if (IS_ERR(filep)) {
1448 put_unused_fd(ret);
1449 ret = PTR_ERR(filep);
1450 device->ops->release(device->device_data);
1451 vfio_device_put(device);
1452 return ret;
1456 * TODO: add an anon_inode interface to do this.
1457 * Appears to be missing by lack of need rather than
1458 * explicitly prevented. Now there's need.
1460 filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
1462 atomic_inc(&group->container_users);
1464 fd_install(ret, filep);
1466 if (group->noiommu)
1467 dev_warn(device->dev, "vfio-noiommu device opened by user "
1468 "(%s:%d)\n", current->comm, task_pid_nr(current));
1470 return ret;
1473 static long vfio_group_fops_unl_ioctl(struct file *filep,
1474 unsigned int cmd, unsigned long arg)
1476 struct vfio_group *group = filep->private_data;
1477 long ret = -ENOTTY;
1479 switch (cmd) {
1480 case VFIO_GROUP_GET_STATUS:
1482 struct vfio_group_status status;
1483 unsigned long minsz;
1485 minsz = offsetofend(struct vfio_group_status, flags);
1487 if (copy_from_user(&status, (void __user *)arg, minsz))
1488 return -EFAULT;
1490 if (status.argsz < minsz)
1491 return -EINVAL;
1493 status.flags = 0;
1495 if (vfio_group_viable(group))
1496 status.flags |= VFIO_GROUP_FLAGS_VIABLE;
1498 if (group->container)
1499 status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;
1501 if (copy_to_user((void __user *)arg, &status, minsz))
1502 return -EFAULT;
1504 ret = 0;
1505 break;
1507 case VFIO_GROUP_SET_CONTAINER:
1509 int fd;
1511 if (get_user(fd, (int __user *)arg))
1512 return -EFAULT;
1514 if (fd < 0)
1515 return -EINVAL;
1517 ret = vfio_group_set_container(group, fd);
1518 break;
1520 case VFIO_GROUP_UNSET_CONTAINER:
1521 ret = vfio_group_unset_container(group);
1522 break;
1523 case VFIO_GROUP_GET_DEVICE_FD:
1525 char *buf;
1527 buf = strndup_user((const char __user *)arg, PAGE_SIZE);
1528 if (IS_ERR(buf))
1529 return PTR_ERR(buf);
1531 ret = vfio_group_get_device_fd(group, buf);
1532 kfree(buf);
1533 break;
1537 return ret;
1540 #ifdef CONFIG_COMPAT
1541 static long vfio_group_fops_compat_ioctl(struct file *filep,
1542 unsigned int cmd, unsigned long arg)
1544 arg = (unsigned long)compat_ptr(arg);
1545 return vfio_group_fops_unl_ioctl(filep, cmd, arg);
1547 #endif /* CONFIG_COMPAT */
1549 static int vfio_group_fops_open(struct inode *inode, struct file *filep)
1551 struct vfio_group *group;
1552 int opened;
1554 group = vfio_group_get_from_minor(iminor(inode));
1555 if (!group)
1556 return -ENODEV;
1558 if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
1559 vfio_group_put(group);
1560 return -EPERM;
1563 /* Do we need multiple instances of the group open? Seems not. */
1564 opened = atomic_cmpxchg(&group->opened, 0, 1);
1565 if (opened) {
1566 vfio_group_put(group);
1567 return -EBUSY;
1570 /* Is something still in use from a previous open? */
1571 if (group->container) {
1572 atomic_dec(&group->opened);
1573 vfio_group_put(group);
1574 return -EBUSY;
1577 /* Warn if previous user didn't cleanup and re-init to drop them */
1578 if (WARN_ON(group->notifier.head))
1579 BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
1581 filep->private_data = group;
1583 return 0;
1586 static int vfio_group_fops_release(struct inode *inode, struct file *filep)
1588 struct vfio_group *group = filep->private_data;
1590 filep->private_data = NULL;
1592 vfio_group_try_dissolve_container(group);
1594 atomic_dec(&group->opened);
1596 vfio_group_put(group);
1598 return 0;
1601 static const struct file_operations vfio_group_fops = {
1602 .owner = THIS_MODULE,
1603 .unlocked_ioctl = vfio_group_fops_unl_ioctl,
1604 #ifdef CONFIG_COMPAT
1605 .compat_ioctl = vfio_group_fops_compat_ioctl,
1606 #endif
1607 .open = vfio_group_fops_open,
1608 .release = vfio_group_fops_release,
1612 * VFIO Device fd
1614 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
1616 struct vfio_device *device = filep->private_data;
1618 device->ops->release(device->device_data);
1620 vfio_group_try_dissolve_container(device->group);
1622 vfio_device_put(device);
1624 return 0;
1627 static long vfio_device_fops_unl_ioctl(struct file *filep,
1628 unsigned int cmd, unsigned long arg)
1630 struct vfio_device *device = filep->private_data;
1632 if (unlikely(!device->ops->ioctl))
1633 return -EINVAL;
1635 return device->ops->ioctl(device->device_data, cmd, arg);
1638 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1639 size_t count, loff_t *ppos)
1641 struct vfio_device *device = filep->private_data;
1643 if (unlikely(!device->ops->read))
1644 return -EINVAL;
1646 return device->ops->read(device->device_data, buf, count, ppos);
1649 static ssize_t vfio_device_fops_write(struct file *filep,
1650 const char __user *buf,
1651 size_t count, loff_t *ppos)
1653 struct vfio_device *device = filep->private_data;
1655 if (unlikely(!device->ops->write))
1656 return -EINVAL;
1658 return device->ops->write(device->device_data, buf, count, ppos);
1661 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1663 struct vfio_device *device = filep->private_data;
1665 if (unlikely(!device->ops->mmap))
1666 return -EINVAL;
1668 return device->ops->mmap(device->device_data, vma);
1671 #ifdef CONFIG_COMPAT
1672 static long vfio_device_fops_compat_ioctl(struct file *filep,
1673 unsigned int cmd, unsigned long arg)
1675 arg = (unsigned long)compat_ptr(arg);
1676 return vfio_device_fops_unl_ioctl(filep, cmd, arg);
1678 #endif /* CONFIG_COMPAT */
1680 static const struct file_operations vfio_device_fops = {
1681 .owner = THIS_MODULE,
1682 .release = vfio_device_fops_release,
1683 .read = vfio_device_fops_read,
1684 .write = vfio_device_fops_write,
1685 .unlocked_ioctl = vfio_device_fops_unl_ioctl,
1686 #ifdef CONFIG_COMPAT
1687 .compat_ioctl = vfio_device_fops_compat_ioctl,
1688 #endif
1689 .mmap = vfio_device_fops_mmap,
1693 * External user API, exported by symbols to be linked dynamically.
1695 * The protocol includes:
1696 * 1. do normal VFIO init operation:
1697 * - opening a new container;
1698 * - attaching group(s) to it;
1699 * - setting an IOMMU driver for a container.
1700 * When IOMMU is set for a container, all groups in it are
1701 * considered ready to use by an external user.
1703 * 2. User space passes a group fd to an external user.
1704 * The external user calls vfio_group_get_external_user()
1705 * to verify that:
1706 * - the group is initialized;
1707 * - IOMMU is set for it.
1708 * If both checks passed, vfio_group_get_external_user()
1709 * increments the container user counter to prevent
1710 * the VFIO group from disposal before KVM exits.
1712 * 3. The external user calls vfio_external_user_iommu_id()
1713 * to know an IOMMU ID.
1715 * 4. When the external KVM finishes, it calls
1716 * vfio_group_put_external_user() to release the VFIO group.
1717 * This call decrements the container user counter.
1719 struct vfio_group *vfio_group_get_external_user(struct file *filep)
1721 struct vfio_group *group = filep->private_data;
1722 int ret;
1724 if (filep->f_op != &vfio_group_fops)
1725 return ERR_PTR(-EINVAL);
1727 ret = vfio_group_add_container_user(group);
1728 if (ret)
1729 return ERR_PTR(ret);
1731 vfio_group_get(group);
1733 return group;
1735 EXPORT_SYMBOL_GPL(vfio_group_get_external_user);
1737 void vfio_group_put_external_user(struct vfio_group *group)
1739 vfio_group_try_dissolve_container(group);
1740 vfio_group_put(group);
1742 EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
1744 int vfio_external_user_iommu_id(struct vfio_group *group)
1746 return iommu_group_id(group->iommu_group);
1748 EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);
1750 long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
1752 return vfio_ioctl_check_extension(group->container, arg);
1754 EXPORT_SYMBOL_GPL(vfio_external_check_extension);
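/*
 * A minimal sketch of the consumer side of the protocol above, roughly
 * what the kvm-vfio device does with a group fd received from user space.
 * The symbol_get() indirection kvm uses to avoid a hard module dependency
 * is omitted, and foo_attach_vfio_group() is a hypothetical helper.
 */
static struct vfio_group *foo_attach_vfio_group(int group_fd)
{
	struct fd f = fdget(group_fd);
	struct vfio_group *group;

	if (!f.file)
		return ERR_PTR(-EBADF);

	group = vfio_group_get_external_user(f.file);
	fdput(f);
	if (IS_ERR(group))
		return group;

	pr_debug("external user holds iommu group %d\n",
		 vfio_external_user_iommu_id(group));

	/* the reference is dropped later with vfio_group_put_external_user() */
	return group;
}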
1757 * Sub-module support
1760 * Helper for managing a buffer of info chain capabilities, allocate or
1761 * reallocate a buffer with additional @size, filling in @id and @version
1762 * of the capability. A pointer to the new capability is returned.
1764 * NB. The chain is based at the head of the buffer, so new entries are
1765 * added to the tail, vfio_info_cap_shift() should be called to fixup the
1766 * next offsets prior to copying to the user buffer.
1768 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1769 size_t size, u16 id, u16 version)
1771 void *buf;
1772 struct vfio_info_cap_header *header, *tmp;
1774 buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1775 if (!buf) {
1776 kfree(caps->buf);
1777 caps->size = 0;
1778 return ERR_PTR(-ENOMEM);
1781 caps->buf = buf;
1782 header = buf + caps->size;
1784 /* Eventually copied to user buffer, zero */
1785 memset(header, 0, size);
1787 header->id = id;
1788 header->version = version;
1790 /* Add to the end of the capability chain */
1791 for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1792 ; /* nothing */
1794 tmp->next = caps->size;
1795 caps->size += size;
1797 return header;
1799 EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1801 void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1803 struct vfio_info_cap_header *tmp;
1804 void *buf = (void *)caps->buf;
1806 for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1807 tmp->next += offset;
1809 EXPORT_SYMBOL(vfio_info_cap_shift);
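/*
 * A rough sketch of how a bus driver consumes the helpers above when
 * answering VFIO_DEVICE_GET_REGION_INFO, along the lines of vfio-pci.
 * It assumes "info" is a struct vfio_region_info already copied in from
 * the user pointer "arg" and that capabilities were appended to "caps"
 * via vfio_info_cap_add() (or vfio_info_add_capability() below):
 *
 *	if (caps.size) {
 *		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
 *		if (info.argsz < sizeof(info) + caps.size) {
 *			info.argsz = sizeof(info) + caps.size;
 *			info.cap_offset = 0;  (user retries with a larger buffer)
 *		} else {
 *			vfio_info_cap_shift(&caps, sizeof(info));
 *			if (copy_to_user((void __user *)arg + sizeof(info),
 *					 caps.buf, caps.size)) {
 *				kfree(caps.buf);
 *				return -EFAULT;
 *			}
 *			info.cap_offset = sizeof(info);
 *		}
 *		kfree(caps.buf);
 *	}
 */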
1811 static int sparse_mmap_cap(struct vfio_info_cap *caps, void *cap_type)
1813 struct vfio_info_cap_header *header;
1814 struct vfio_region_info_cap_sparse_mmap *sparse_cap, *sparse = cap_type;
1815 size_t size;
1817 size = sizeof(*sparse) + sparse->nr_areas * sizeof(*sparse->areas);
1818 header = vfio_info_cap_add(caps, size,
1819 VFIO_REGION_INFO_CAP_SPARSE_MMAP, 1);
1820 if (IS_ERR(header))
1821 return PTR_ERR(header);
1823 sparse_cap = container_of(header,
1824 struct vfio_region_info_cap_sparse_mmap, header);
1825 sparse_cap->nr_areas = sparse->nr_areas;
1826 memcpy(sparse_cap->areas, sparse->areas,
1827 sparse->nr_areas * sizeof(*sparse->areas));
1828 return 0;
1831 static int region_type_cap(struct vfio_info_cap *caps, void *cap_type)
1833 struct vfio_info_cap_header *header;
1834 struct vfio_region_info_cap_type *type_cap, *cap = cap_type;
1836 header = vfio_info_cap_add(caps, sizeof(*cap),
1837 VFIO_REGION_INFO_CAP_TYPE, 1);
1838 if (IS_ERR(header))
1839 return PTR_ERR(header);
1841 type_cap = container_of(header, struct vfio_region_info_cap_type,
1842 header);
1843 type_cap->type = cap->type;
1844 type_cap->subtype = cap->subtype;
1845 return 0;
1848 int vfio_info_add_capability(struct vfio_info_cap *caps, int cap_type_id,
1849 void *cap_type)
1851 int ret = -EINVAL;
1853 if (!cap_type)
1854 return 0;
1856 switch (cap_type_id) {
1857 case VFIO_REGION_INFO_CAP_SPARSE_MMAP:
1858 ret = sparse_mmap_cap(caps, cap_type);
1859 break;
1861 case VFIO_REGION_INFO_CAP_TYPE:
1862 ret = region_type_cap(caps, cap_type);
1863 break;
1866 return ret;
1868 EXPORT_SYMBOL(vfio_info_add_capability);
1870 int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1871 int max_irq_type, size_t *data_size)
1873 unsigned long minsz;
1874 size_t size;
1876 minsz = offsetofend(struct vfio_irq_set, count);
1878 if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1879 (hdr->count >= (U32_MAX - hdr->start)) ||
1880 (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1881 VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1882 return -EINVAL;
1884 if (data_size)
1885 *data_size = 0;
1887 if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1888 return -EINVAL;
1890 switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1891 case VFIO_IRQ_SET_DATA_NONE:
1892 size = 0;
1893 break;
1894 case VFIO_IRQ_SET_DATA_BOOL:
1895 size = sizeof(uint8_t);
1896 break;
1897 case VFIO_IRQ_SET_DATA_EVENTFD:
1898 size = sizeof(int32_t);
1899 break;
1900 default:
1901 return -EINVAL;
1904 if (size) {
1905 if (hdr->argsz - minsz < hdr->count * size)
1906 return -EINVAL;
1908 if (!data_size)
1909 return -EINVAL;
1911 *data_size = hdr->count * size;
1914 return 0;
1916 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
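/*
 * A minimal sketch of a caller: a vendor driver's VFIO_DEVICE_SET_IRQS
 * handler validates the user-supplied header with the helper above and
 * then pulls in the variable-sized payload.  FOO_NUM_IRQS and
 * foo_set_irqs() are hypothetical; VFIO_PCI_NUM_IRQS is the usual
 * max_irq_type for PCI-like devices:
 *
 *	struct vfio_irq_set hdr;
 *	size_t data_size = 0;
 *	u8 *data = NULL;
 *	unsigned long minsz = offsetofend(struct vfio_irq_set, count);
 *	int ret;
 *
 *	if (copy_from_user(&hdr, (void __user *)arg, minsz))
 *		return -EFAULT;
 *
 *	ret = vfio_set_irqs_validate_and_prepare(&hdr, FOO_NUM_IRQS,
 *						 VFIO_PCI_NUM_IRQS, &data_size);
 *	if (ret)
 *		return ret;
 *
 *	if (data_size) {
 *		data = memdup_user((void __user *)(arg + minsz), data_size);
 *		if (IS_ERR(data))
 *			return PTR_ERR(data);
 *	}
 *
 *	ret = foo_set_irqs(foo, hdr.flags, hdr.index, hdr.start, hdr.count, data);
 *	kfree(data);
 */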
1919 * Pin a set of guest PFNs and return their associated host PFNs for local
1920 * domain only.
1921 * @dev [in] : device
1922 * @user_pfn [in]: array of user/guest PFNs to be pinned.
1924  * @npage [in] : count of elements in user_pfn array. This count should not
1925  * be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1925 * @prot [in] : protection flags
1926 * @phys_pfn[out]: array of host PFNs
1927 * Return error or number of pages pinned.
1929 int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
1930 int prot, unsigned long *phys_pfn)
1932 struct vfio_container *container;
1933 struct vfio_group *group;
1934 struct vfio_iommu_driver *driver;
1935 int ret;
1937 if (!dev || !user_pfn || !phys_pfn || !npage)
1938 return -EINVAL;
1940 if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1941 return -E2BIG;
1943 group = vfio_group_get_from_dev(dev);
1944 if (!group)
1945 return -ENODEV;
1947 ret = vfio_group_add_container_user(group);
1948 if (ret)
1949 goto err_pin_pages;
1951 container = group->container;
1952 down_read(&container->group_lock);
1954 driver = container->iommu_driver;
1955 if (likely(driver && driver->ops->pin_pages))
1956 ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
1957 npage, prot, phys_pfn);
1958 else
1959 ret = -ENOTTY;
1961 up_read(&container->group_lock);
1962 vfio_group_try_dissolve_container(group);
1964 err_pin_pages:
1965 vfio_group_put(group);
1966 return ret;
1968 EXPORT_SYMBOL(vfio_pin_pages);
1971 * Unpin set of host PFNs for local domain only.
1972 * @dev [in] : device
1973 * @user_pfn [in]: array of user/guest PFNs to be unpinned. Number of user/guest
1974 * PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1975 * @npage [in] : count of elements in user_pfn array. This count should not
1976 * be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1977 * Return error or number of pages unpinned.
1979 int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
1981 struct vfio_container *container;
1982 struct vfio_group *group;
1983 struct vfio_iommu_driver *driver;
1984 int ret;
1986 if (!dev || !user_pfn || !npage)
1987 return -EINVAL;
1989 if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1990 return -E2BIG;
1992 group = vfio_group_get_from_dev(dev);
1993 if (!group)
1994 return -ENODEV;
1996 ret = vfio_group_add_container_user(group);
1997 if (ret)
1998 goto err_unpin_pages;
2000 container = group->container;
2001 down_read(&container->group_lock);
2003 driver = container->iommu_driver;
2004 if (likely(driver && driver->ops->unpin_pages))
2005 ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
2006 npage);
2007 else
2008 ret = -ENOTTY;
2010 up_read(&container->group_lock);
2011 vfio_group_try_dissolve_container(group);
2013 err_unpin_pages:
2014 vfio_group_put(group);
2015 return ret;
2017 EXPORT_SYMBOL(vfio_unpin_pages);
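/*
 * A small sketch of the pinning interface from a mediated-device vendor
 * driver's perspective: translate one guest PFN before programming a DMA
 * engine, then release it when done.  "dev" (the mdev's device) and "gfn"
 * (the guest page frame number) are assumptions here:
 *
 *	unsigned long user_pfn = gfn, phys_pfn;
 *	int ret;
 *
 *	ret = vfio_pin_pages(dev, &user_pfn, 1, IOMMU_READ | IOMMU_WRITE,
 *			     &phys_pfn);
 *	if (ret != 1)
 *		return ret < 0 ? ret : -EFAULT;
 *
 *	... program hardware with PFN_PHYS(phys_pfn) ...
 *
 *	vfio_unpin_pages(dev, &user_pfn, 1);
 */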
2019 static int vfio_register_iommu_notifier(struct vfio_group *group,
2020 unsigned long *events,
2021 struct notifier_block *nb)
2023 struct vfio_container *container;
2024 struct vfio_iommu_driver *driver;
2025 int ret;
2027 ret = vfio_group_add_container_user(group);
2028 if (ret)
2029 return -EINVAL;
2031 container = group->container;
2032 down_read(&container->group_lock);
2034 driver = container->iommu_driver;
2035 if (likely(driver && driver->ops->register_notifier))
2036 ret = driver->ops->register_notifier(container->iommu_data,
2037 events, nb);
2038 else
2039 ret = -ENOTTY;
2041 up_read(&container->group_lock);
2042 vfio_group_try_dissolve_container(group);
2044 return ret;
2047 static int vfio_unregister_iommu_notifier(struct vfio_group *group,
2048 struct notifier_block *nb)
2050 struct vfio_container *container;
2051 struct vfio_iommu_driver *driver;
2052 int ret;
2054 ret = vfio_group_add_container_user(group);
2055 if (ret)
2056 return -EINVAL;
2058 container = group->container;
2059 down_read(&container->group_lock);
2061 driver = container->iommu_driver;
2062 if (likely(driver && driver->ops->unregister_notifier))
2063 ret = driver->ops->unregister_notifier(container->iommu_data,
2064 nb);
2065 else
2066 ret = -ENOTTY;
2068 up_read(&container->group_lock);
2069 vfio_group_try_dissolve_container(group);
2071 return ret;
2074 void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
2076 group->kvm = kvm;
2077 blocking_notifier_call_chain(&group->notifier,
2078 VFIO_GROUP_NOTIFY_SET_KVM, kvm);
2080 EXPORT_SYMBOL_GPL(vfio_group_set_kvm);
2082 static int vfio_register_group_notifier(struct vfio_group *group,
2083 unsigned long *events,
2084 struct notifier_block *nb)
2086 struct vfio_container *container;
2087 int ret;
2088 bool set_kvm = false;
2090 if (*events & VFIO_GROUP_NOTIFY_SET_KVM)
2091 set_kvm = true;
2093 /* clear known events */
2094 *events &= ~VFIO_GROUP_NOTIFY_SET_KVM;
2096 /* refuse to continue if still events remaining */
2097 if (*events)
2098 return -EINVAL;
2100 ret = vfio_group_add_container_user(group);
2101 if (ret)
2102 return -EINVAL;
2104 container = group->container;
2105 down_read(&container->group_lock);
2107 ret = blocking_notifier_chain_register(&group->notifier, nb);
2110 * The attaching of kvm and vfio_group might already happen, so
2111 * here we replay once upon registration.
2113 if (!ret && set_kvm && group->kvm)
2114 blocking_notifier_call_chain(&group->notifier,
2115 VFIO_GROUP_NOTIFY_SET_KVM, group->kvm);
2117 up_read(&container->group_lock);
2118 vfio_group_try_dissolve_container(group);
2120 return ret;
2123 static int vfio_unregister_group_notifier(struct vfio_group *group,
2124 struct notifier_block *nb)
2126 struct vfio_container *container;
2127 int ret;
2129 ret = vfio_group_add_container_user(group);
2130 if (ret)
2131 return -EINVAL;
2133 container = group->container;
2134 down_read(&container->group_lock);
2136 ret = blocking_notifier_chain_unregister(&group->notifier, nb);
2138 up_read(&container->group_lock);
2139 vfio_group_try_dissolve_container(group);
2141 return ret;
2144 int vfio_register_notifier(struct device *dev, enum vfio_notify_type type,
2145 unsigned long *events, struct notifier_block *nb)
2147 struct vfio_group *group;
2148 int ret;
2150 if (!dev || !nb || !events || (*events == 0))
2151 return -EINVAL;
2153 group = vfio_group_get_from_dev(dev);
2154 if (!group)
2155 return -ENODEV;
2157 switch (type) {
2158 case VFIO_IOMMU_NOTIFY:
2159 ret = vfio_register_iommu_notifier(group, events, nb);
2160 break;
2161 case VFIO_GROUP_NOTIFY:
2162 ret = vfio_register_group_notifier(group, events, nb);
2163 break;
2164 default:
2165 ret = -EINVAL;
2168 vfio_group_put(group);
2169 return ret;
2171 EXPORT_SYMBOL(vfio_register_notifier);
2173 int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type,
2174 struct notifier_block *nb)
2176 struct vfio_group *group;
2177 int ret;
2179 if (!dev || !nb)
2180 return -EINVAL;
2182 group = vfio_group_get_from_dev(dev);
2183 if (!group)
2184 return -ENODEV;
2186 switch (type) {
2187 case VFIO_IOMMU_NOTIFY:
2188 ret = vfio_unregister_iommu_notifier(group, nb);
2189 break;
2190 case VFIO_GROUP_NOTIFY:
2191 ret = vfio_unregister_group_notifier(group, nb);
2192 break;
2193 default:
2194 ret = -EINVAL;
2197 vfio_group_put(group);
2198 return ret;
2200 EXPORT_SYMBOL(vfio_unregister_notifier);
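/*
 * A minimal sketch of consuming the notifier interface above: a vendor
 * driver that wants to learn when a KVM instance is bound to (or removed
 * from) its group registers for VFIO_GROUP_NOTIFY_SET_KVM.  Note that the
 * SET_KVM event may be replayed immediately on registration if a KVM is
 * already attached.  foo_group_notifier() and the embedding struct
 * foo_device are hypothetical.
 */
static int foo_group_notifier(struct notifier_block *nb,
			      unsigned long action, void *data)
{
	struct foo_device *foo = container_of(nb, struct foo_device, group_nb);

	if (action == VFIO_GROUP_NOTIFY_SET_KVM)
		foo->kvm = data;	/* NULL means the KVM reference is gone */

	return NOTIFY_OK;
}

static int foo_register_group_notifier(struct foo_device *foo)
{
	unsigned long events = VFIO_GROUP_NOTIFY_SET_KVM;

	foo->group_nb.notifier_call = foo_group_notifier;

	return vfio_register_notifier(foo->dev, VFIO_GROUP_NOTIFY, &events,
				      &foo->group_nb);
}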
2203 * Module/class support
2205 static char *vfio_devnode(struct device *dev, umode_t *mode)
2207 return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
2210 static struct miscdevice vfio_dev = {
2211 .minor = VFIO_MINOR,
2212 .name = "vfio",
2213 .fops = &vfio_fops,
2214 .nodename = "vfio/vfio",
2215 .mode = S_IRUGO | S_IWUGO,
2218 static int __init vfio_init(void)
2220 int ret;
2222 idr_init(&vfio.group_idr);
2223 mutex_init(&vfio.group_lock);
2224 mutex_init(&vfio.iommu_drivers_lock);
2225 INIT_LIST_HEAD(&vfio.group_list);
2226 INIT_LIST_HEAD(&vfio.iommu_drivers_list);
2227 init_waitqueue_head(&vfio.release_q);
2229 ret = misc_register(&vfio_dev);
2230 if (ret) {
2231 pr_err("vfio: misc device register failed\n");
2232 return ret;
2235 /* /dev/vfio/$GROUP */
2236 vfio.class = class_create(THIS_MODULE, "vfio");
2237 if (IS_ERR(vfio.class)) {
2238 ret = PTR_ERR(vfio.class);
2239 goto err_class;
2242 vfio.class->devnode = vfio_devnode;
2244 ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK, "vfio");
2245 if (ret)
2246 goto err_alloc_chrdev;
2248 cdev_init(&vfio.group_cdev, &vfio_group_fops);
2249 ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK);
2250 if (ret)
2251 goto err_cdev_add;
2253 pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
2255 #ifdef CONFIG_VFIO_NOIOMMU
2256 vfio_register_iommu_driver(&vfio_noiommu_ops);
2257 #endif
2258 return 0;
2260 err_cdev_add:
2261 unregister_chrdev_region(vfio.group_devt, MINORMASK);
2262 err_alloc_chrdev:
2263 class_destroy(vfio.class);
2264 vfio.class = NULL;
2265 err_class:
2266 misc_deregister(&vfio_dev);
2267 return ret;
2270 static void __exit vfio_cleanup(void)
2272 WARN_ON(!list_empty(&vfio.group_list));
2274 #ifdef CONFIG_VFIO_NOIOMMU
2275 vfio_unregister_iommu_driver(&vfio_noiommu_ops);
2276 #endif
2277 idr_destroy(&vfio.group_idr);
2278 cdev_del(&vfio.group_cdev);
2279 unregister_chrdev_region(vfio.group_devt, MINORMASK);
2280 class_destroy(vfio.class);
2281 vfio.class = NULL;
2282 misc_deregister(&vfio_dev);
2285 module_init(vfio_init);
2286 module_exit(vfio_cleanup);
2288 MODULE_VERSION(DRIVER_VERSION);
2289 MODULE_LICENSE("GPL v2");
2290 MODULE_AUTHOR(DRIVER_AUTHOR);
2291 MODULE_DESCRIPTION(DRIVER_DESC);
2292 MODULE_ALIAS_MISCDEV(VFIO_MINOR);
2293 MODULE_ALIAS("devname:vfio/vfio");
2294 MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");