drivers/vfio/vfio.c
/*
 * VFIO core
 *
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/file.h>
#include <linux/anon_inodes.h>
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>

#define DRIVER_VERSION	"0.3"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"VFIO - User Level meta-driver"

static struct vfio {
	struct class			*class;
	struct list_head		iommu_drivers_list;
	struct mutex			iommu_drivers_lock;
	struct list_head		group_list;
	struct idr			group_idr;
	struct mutex			group_lock;
	struct cdev			group_cdev;
	dev_t				group_devt;
	wait_queue_head_t		release_q;
} vfio;
struct vfio_iommu_driver {
	const struct vfio_iommu_driver_ops	*ops;
	struct list_head			vfio_next;
};

struct vfio_container {
	struct kref			kref;
	struct list_head		group_list;
	struct rw_semaphore		group_lock;
	struct vfio_iommu_driver	*iommu_driver;
	void				*iommu_data;
	bool				noiommu;
};

struct vfio_unbound_dev {
	struct device			*dev;
	struct list_head		unbound_next;
};

struct vfio_group {
	struct kref			kref;
	int				minor;
	atomic_t			container_users;
	struct iommu_group		*iommu_group;
	struct vfio_container		*container;
	struct list_head		device_list;
	struct mutex			device_lock;
	struct device			*dev;
	struct notifier_block		nb;
	struct list_head		vfio_next;
	struct list_head		container_next;
	struct list_head		unbound_list;
	struct mutex			unbound_lock;
	atomic_t			opened;
	bool				noiommu;
	struct kvm			*kvm;
	struct blocking_notifier_head	notifier;
};

struct vfio_device {
	struct kref			kref;
	struct device			*dev;
	const struct vfio_device_ops	*ops;
	struct vfio_group		*group;
	struct list_head		group_next;
	void				*device_data;
};

#ifdef CONFIG_VFIO_NOIOMMU
static bool noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
		   noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
#endif
/*
 * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
 * and remove functions, any use cases other than acquiring the first
 * reference for the purpose of calling vfio_add_group_dev() or removing
 * that symmetric reference after vfio_del_group_dev() should use the raw
 * iommu_group_{get,put} functions.  In particular, vfio_iommu_group_put()
 * removes the device from the dummy group and cannot be nested.
 */
struct iommu_group *vfio_iommu_group_get(struct device *dev)
{
	struct iommu_group *group;
	int __maybe_unused ret;

	group = iommu_group_get(dev);

#ifdef CONFIG_VFIO_NOIOMMU
	/*
	 * With noiommu enabled, an IOMMU group will be created for a device
	 * that doesn't already have one and doesn't have an iommu_ops on its
	 * bus.  We set iommudata simply to be able to identify these groups
	 * as special use and for reclamation later.
	 */
	if (group || !noiommu || iommu_present(dev->bus))
		return group;

	group = iommu_group_alloc();
	if (IS_ERR(group))
		return NULL;

	iommu_group_set_name(group, "vfio-noiommu");
	iommu_group_set_iommudata(group, &noiommu, NULL);
	ret = iommu_group_add_device(group, dev);
	iommu_group_put(group);
	if (ret)
		return NULL;

	/*
	 * Where to taint?  At this point we've added an IOMMU group for a
	 * device that is not backed by iommu_ops, therefore any iommu_
	 * callback using iommu_ops can legitimately Oops.  So, while we may
	 * be about to give a DMA capable device to a user without IOMMU
	 * protection, which is clearly taint-worthy, let's go ahead and do
	 * it here.
	 */
	add_taint(TAINT_USER, LOCKDEP_STILL_OK);
	dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
#endif

	return group;
}
EXPORT_SYMBOL_GPL(vfio_iommu_group_get);

void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
{
#ifdef CONFIG_VFIO_NOIOMMU
	if (iommu_group_get_iommudata(group) == &noiommu)
		iommu_group_remove_device(dev);
#endif

	iommu_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_iommu_group_put);
#ifdef CONFIG_VFIO_NOIOMMU
static void *vfio_noiommu_open(unsigned long arg)
{
	if (arg != VFIO_NOIOMMU_IOMMU)
		return ERR_PTR(-EINVAL);
	if (!capable(CAP_SYS_RAWIO))
		return ERR_PTR(-EPERM);

	return NULL;
}

static void vfio_noiommu_release(void *iommu_data)
{
}

static long vfio_noiommu_ioctl(void *iommu_data,
			       unsigned int cmd, unsigned long arg)
{
	if (cmd == VFIO_CHECK_EXTENSION)
		return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;

	return -ENOTTY;
}

static int vfio_noiommu_attach_group(void *iommu_data,
				     struct iommu_group *iommu_group)
{
	return iommu_group_get_iommudata(iommu_group) == &noiommu ? 0 : -EINVAL;
}

static void vfio_noiommu_detach_group(void *iommu_data,
				      struct iommu_group *iommu_group)
{
}

static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
	.name = "vfio-noiommu",
	.owner = THIS_MODULE,
	.open = vfio_noiommu_open,
	.release = vfio_noiommu_release,
	.ioctl = vfio_noiommu_ioctl,
	.attach_group = vfio_noiommu_attach_group,
	.detach_group = vfio_noiommu_detach_group,
};
#endif
/**
 * IOMMU driver registration
 */
int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *driver, *tmp;

	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
	if (!driver)
		return -ENOMEM;

	driver->ops = ops;

	mutex_lock(&vfio.iommu_drivers_lock);

	/* Check for duplicates */
	list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
		if (tmp->ops == ops) {
			mutex_unlock(&vfio.iommu_drivers_lock);
			kfree(driver);
			return -EINVAL;
		}
	}

	list_add(&driver->vfio_next, &vfio.iommu_drivers_list);

	mutex_unlock(&vfio.iommu_drivers_lock);

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);

void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *driver;

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		if (driver->ops == ops) {
			list_del(&driver->vfio_next);
			mutex_unlock(&vfio.iommu_drivers_lock);
			kfree(driver);
			return;
		}
	}
	mutex_unlock(&vfio.iommu_drivers_lock);
}
EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
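
#if 0	/* Illustrative sketch only, not part of this file. */
/*
 * A minimal sketch of how an IOMMU backend registers with the core,
 * mirroring the shape of vfio_noiommu_ops above.  The "foo" names are
 * hypothetical; vfio_iommu_type1 is the in-tree reference backend.
 */
static const struct vfio_iommu_driver_ops foo_iommu_ops = {
	.name		= "vfio-foo",
	.owner		= THIS_MODULE,
	.open		= foo_open,
	.release	= foo_release,
	.ioctl		= foo_ioctl,
	.attach_group	= foo_attach_group,
	.detach_group	= foo_detach_group,
};

static int __init foo_init(void)
{
	return vfio_register_iommu_driver(&foo_iommu_ops);
}

static void __exit foo_exit(void)
{
	vfio_unregister_iommu_driver(&foo_iommu_ops);
}
#endif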
/**
 * Group minor allocation/free - both called with vfio.group_lock held
 */
static int vfio_alloc_group_minor(struct vfio_group *group)
{
	return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
}

static void vfio_free_group_minor(int minor)
{
	idr_remove(&vfio.group_idr, minor);
}

static int vfio_iommu_group_notifier(struct notifier_block *nb,
				     unsigned long action, void *data);
static void vfio_group_get(struct vfio_group *group);

/**
 * Container objects - containers are created when /dev/vfio/vfio is
 * opened, but their lifecycle extends until the last user is done, so
 * it's freed via kref.  Must support container/group/device being
 * closed in any order.
 */
static void vfio_container_get(struct vfio_container *container)
{
	kref_get(&container->kref);
}

static void vfio_container_release(struct kref *kref)
{
	struct vfio_container *container;
	container = container_of(kref, struct vfio_container, kref);

	kfree(container);
}

static void vfio_container_put(struct vfio_container *container)
{
	kref_put(&container->kref, vfio_container_release);
}

static void vfio_group_unlock_and_free(struct vfio_group *group)
{
	mutex_unlock(&vfio.group_lock);
	/*
	 * Unregister outside of lock.  A spurious callback is harmless now
	 * that the group is no longer in vfio.group_list.
	 */
	iommu_group_unregister_notifier(group->iommu_group, &group->nb);
	kfree(group);
}
/**
 * Group objects - create, release, get, put, search
 */
static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
{
	struct vfio_group *group, *tmp;
	struct device *dev;
	int ret, minor;

	group = kzalloc(sizeof(*group), GFP_KERNEL);
	if (!group)
		return ERR_PTR(-ENOMEM);

	kref_init(&group->kref);
	INIT_LIST_HEAD(&group->device_list);
	mutex_init(&group->device_lock);
	INIT_LIST_HEAD(&group->unbound_list);
	mutex_init(&group->unbound_lock);
	atomic_set(&group->container_users, 0);
	atomic_set(&group->opened, 0);
	group->iommu_group = iommu_group;
#ifdef CONFIG_VFIO_NOIOMMU
	group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
#endif
	BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);

	group->nb.notifier_call = vfio_iommu_group_notifier;

	/*
	 * blocking notifiers acquire a rwsem around registering and hold
	 * it around callback.  Therefore, we need to register outside of
	 * vfio.group_lock to avoid A-B/B-A contention.  Our callback won't
	 * do anything unless it can find the group in vfio.group_list, so
	 * no harm in registering early.
	 */
	ret = iommu_group_register_notifier(iommu_group, &group->nb);
	if (ret) {
		kfree(group);
		return ERR_PTR(ret);
	}

	mutex_lock(&vfio.group_lock);

	/* Did we race creating this group? */
	list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
		if (tmp->iommu_group == iommu_group) {
			vfio_group_get(tmp);
			vfio_group_unlock_and_free(group);
			return tmp;
		}
	}

	minor = vfio_alloc_group_minor(group);
	if (minor < 0) {
		vfio_group_unlock_and_free(group);
		return ERR_PTR(minor);
	}

	dev = device_create(vfio.class, NULL,
			    MKDEV(MAJOR(vfio.group_devt), minor),
			    group, "%s%d", group->noiommu ? "noiommu-" : "",
			    iommu_group_id(iommu_group));
	if (IS_ERR(dev)) {
		vfio_free_group_minor(minor);
		vfio_group_unlock_and_free(group);
		return (struct vfio_group *)dev; /* ERR_PTR */
	}

	group->minor = minor;
	group->dev = dev;

	list_add(&group->vfio_next, &vfio.group_list);

	mutex_unlock(&vfio.group_lock);

	return group;
}
/* called with vfio.group_lock held */
static void vfio_group_release(struct kref *kref)
{
	struct vfio_group *group = container_of(kref, struct vfio_group, kref);
	struct vfio_unbound_dev *unbound, *tmp;
	struct iommu_group *iommu_group = group->iommu_group;

	WARN_ON(!list_empty(&group->device_list));

	list_for_each_entry_safe(unbound, tmp,
				 &group->unbound_list, unbound_next) {
		list_del(&unbound->unbound_next);
		kfree(unbound);
	}

	device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
	list_del(&group->vfio_next);
	vfio_free_group_minor(group->minor);
	vfio_group_unlock_and_free(group);
	iommu_group_put(iommu_group);
}

static void vfio_group_put(struct vfio_group *group)
{
	kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
}

/* Assume group_lock or group reference is held */
static void vfio_group_get(struct vfio_group *group)
{
	kref_get(&group->kref);
}

/*
 * Not really a try as we will sleep for mutex, but we need to make
 * sure the group pointer is valid under lock and get a reference.
 */
static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
{
	struct vfio_group *target = group;

	mutex_lock(&vfio.group_lock);
	list_for_each_entry(group, &vfio.group_list, vfio_next) {
		if (group == target) {
			vfio_group_get(group);
			mutex_unlock(&vfio.group_lock);
			return group;
		}
	}
	mutex_unlock(&vfio.group_lock);

	return NULL;
}
static
struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
{
	struct vfio_group *group;

	mutex_lock(&vfio.group_lock);
	list_for_each_entry(group, &vfio.group_list, vfio_next) {
		if (group->iommu_group == iommu_group) {
			vfio_group_get(group);
			mutex_unlock(&vfio.group_lock);
			return group;
		}
	}
	mutex_unlock(&vfio.group_lock);

	return NULL;
}

static struct vfio_group *vfio_group_get_from_minor(int minor)
{
	struct vfio_group *group;

	mutex_lock(&vfio.group_lock);
	group = idr_find(&vfio.group_idr, minor);
	if (!group) {
		mutex_unlock(&vfio.group_lock);
		return NULL;
	}
	vfio_group_get(group);
	mutex_unlock(&vfio.group_lock);

	return group;
}

static struct vfio_group *vfio_group_get_from_dev(struct device *dev)
{
	struct iommu_group *iommu_group;
	struct vfio_group *group;

	iommu_group = iommu_group_get(dev);
	if (!iommu_group)
		return NULL;

	group = vfio_group_get_from_iommu(iommu_group);
	iommu_group_put(iommu_group);

	return group;
}
/**
 * Device objects - create, release, get, put, search
 */
static
struct vfio_device *vfio_group_create_device(struct vfio_group *group,
					     struct device *dev,
					     const struct vfio_device_ops *ops,
					     void *device_data)
{
	struct vfio_device *device;

	device = kzalloc(sizeof(*device), GFP_KERNEL);
	if (!device)
		return ERR_PTR(-ENOMEM);

	kref_init(&device->kref);
	device->dev = dev;
	device->group = group;
	device->ops = ops;
	device->device_data = device_data;
	dev_set_drvdata(dev, device);

	/* No need to get group_lock, caller has group reference */
	vfio_group_get(group);

	mutex_lock(&group->device_lock);
	list_add(&device->group_next, &group->device_list);
	mutex_unlock(&group->device_lock);

	return device;
}

static void vfio_device_release(struct kref *kref)
{
	struct vfio_device *device = container_of(kref,
						  struct vfio_device, kref);
	struct vfio_group *group = device->group;

	list_del(&device->group_next);
	mutex_unlock(&group->device_lock);

	dev_set_drvdata(device->dev, NULL);

	kfree(device);

	/* vfio_del_group_dev may be waiting for this device */
	wake_up(&vfio.release_q);
}

/* Device reference always implies a group reference */
void vfio_device_put(struct vfio_device *device)
{
	struct vfio_group *group = device->group;
	kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
	vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_device_put);

static void vfio_device_get(struct vfio_device *device)
{
	vfio_group_get(device->group);
	kref_get(&device->kref);
}

static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
						 struct device *dev)
{
	struct vfio_device *device;

	mutex_lock(&group->device_lock);
	list_for_each_entry(device, &group->device_list, group_next) {
		if (device->dev == dev) {
			vfio_device_get(device);
			mutex_unlock(&group->device_lock);
			return device;
		}
	}
	mutex_unlock(&group->device_lock);
	return NULL;
}
/*
 * Some drivers, like pci-stub, are only used to prevent other drivers from
 * claiming a device and are therefore perfectly legitimate for a user owned
 * group.  The pci-stub driver has no dependencies on DMA or the IOVA mapping
 * of the device, but it does prevent the user from having direct access to
 * the device, which is useful in some circumstances.
 *
 * We also assume that we can include PCI interconnect devices, ie. bridges.
 * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
 * then all of the downstream devices will be part of the same IOMMU group as
 * the bridge.  Thus, if placing the bridge into the user owned IOVA space
 * breaks anything, it only does so for user owned devices downstream.  Note
 * that error notification via MSI can be affected for platforms that handle
 * MSI within the same IOVA space as DMA.
 */
static const char * const vfio_driver_whitelist[] = { "pci-stub" };

static bool vfio_dev_whitelisted(struct device *dev, struct device_driver *drv)
{
	int i;

	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
			return true;
	}

	for (i = 0; i < ARRAY_SIZE(vfio_driver_whitelist); i++) {
		if (!strcmp(drv->name, vfio_driver_whitelist[i]))
			return true;
	}

	return false;
}

/*
 * A vfio group is viable for use by userspace if all devices are in
 * one of the following states:
 *  - driver-less
 *  - bound to a vfio driver
 *  - bound to a whitelisted driver
 *  - a PCI interconnect device
 *
 * We use two methods to determine whether a device is bound to a vfio
 * driver.  The first is to test whether the device exists in the vfio
 * group.  The second is to test if the device exists on the group
 * unbound_list, indicating it's in the middle of transitioning from
 * a vfio driver to driver-less.
 */
static int vfio_dev_viable(struct device *dev, void *data)
{
	struct vfio_group *group = data;
	struct vfio_device *device;
	struct device_driver *drv = ACCESS_ONCE(dev->driver);
	struct vfio_unbound_dev *unbound;
	int ret = -EINVAL;

	mutex_lock(&group->unbound_lock);
	list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
		if (dev == unbound->dev) {
			ret = 0;
			break;
		}
	}
	mutex_unlock(&group->unbound_lock);

	if (!ret || !drv || vfio_dev_whitelisted(dev, drv))
		return 0;

	device = vfio_group_get_device(group, dev);
	if (device) {
		vfio_device_put(device);
		return 0;
	}

	return ret;
}
/**
 * Async device support
 */
static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
{
	struct vfio_device *device;

	/* Do we already know about it?  We shouldn't */
	device = vfio_group_get_device(group, dev);
	if (WARN_ON_ONCE(device)) {
		vfio_device_put(device);
		return 0;
	}

	/* Nothing to do for idle groups */
	if (!atomic_read(&group->container_users))
		return 0;

	/* TODO Prevent device auto probing */
	WARN(1, "Device %s added to live group %d!\n", dev_name(dev),
	     iommu_group_id(group->iommu_group));

	return 0;
}

static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
{
	/* We don't care what happens when the group isn't in use */
	if (!atomic_read(&group->container_users))
		return 0;

	return vfio_dev_viable(dev, group);
}

static int vfio_iommu_group_notifier(struct notifier_block *nb,
				     unsigned long action, void *data)
{
	struct vfio_group *group = container_of(nb, struct vfio_group, nb);
	struct device *dev = data;
	struct vfio_unbound_dev *unbound;

	/*
	 * Need to go through a group_lock lookup to get a reference or we
	 * risk racing a group being removed.  Ignore spurious notifies.
	 */
	group = vfio_group_try_get(group);
	if (!group)
		return NOTIFY_OK;

	switch (action) {
	case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
		vfio_group_nb_add_dev(group, dev);
		break;
	case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
		/*
		 * Nothing to do here.  If the device is in use, then the
		 * vfio sub-driver should block the remove callback until
		 * it is unused.  If the device is unused or attached to a
		 * stub driver, then it should be released and we don't
		 * care that it will be going away.
		 */
		break;
	case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
		pr_debug("%s: Device %s, group %d binding to driver\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group));
		break;
	case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
		pr_debug("%s: Device %s, group %d bound to driver %s\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group), dev->driver->name);
		BUG_ON(vfio_group_nb_verify(group, dev));
		break;
	case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
		pr_debug("%s: Device %s, group %d unbinding from driver %s\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group), dev->driver->name);
		break;
	case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
		pr_debug("%s: Device %s, group %d unbound from driver\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group));
		/*
		 * XXX An unbound device in a live group is ok, but we'd
		 * really like to avoid the above BUG_ON by preventing other
		 * drivers from binding to it.  Once that occurs, we have to
		 * stop the system to maintain isolation.  At a minimum, we'd
		 * want a toggle to disable driver auto probe for this device.
		 */

		mutex_lock(&group->unbound_lock);
		list_for_each_entry(unbound,
				    &group->unbound_list, unbound_next) {
			if (dev == unbound->dev) {
				list_del(&unbound->unbound_next);
				kfree(unbound);
				break;
			}
		}
		mutex_unlock(&group->unbound_lock);
		break;
	}

	vfio_group_put(group);
	return NOTIFY_OK;
}
/**
 * VFIO driver API
 */
int vfio_add_group_dev(struct device *dev,
		       const struct vfio_device_ops *ops, void *device_data)
{
	struct iommu_group *iommu_group;
	struct vfio_group *group;
	struct vfio_device *device;

	iommu_group = iommu_group_get(dev);
	if (!iommu_group)
		return -EINVAL;

	group = vfio_group_get_from_iommu(iommu_group);
	if (!group) {
		group = vfio_create_group(iommu_group);
		if (IS_ERR(group)) {
			iommu_group_put(iommu_group);
			return PTR_ERR(group);
		}
	} else {
		/*
		 * A found vfio_group already holds a reference to the
		 * iommu_group.  A created vfio_group keeps the reference.
		 */
		iommu_group_put(iommu_group);
	}

	device = vfio_group_get_device(group, dev);
	if (device) {
		WARN(1, "Device %s already exists on group %d\n",
		     dev_name(dev), iommu_group_id(iommu_group));
		vfio_device_put(device);
		vfio_group_put(group);
		return -EBUSY;
	}

	device = vfio_group_create_device(group, dev, ops, device_data);
	if (IS_ERR(device)) {
		vfio_group_put(group);
		return PTR_ERR(device);
	}

	/*
	 * Drop all but the vfio_device reference.  The vfio_device holds
	 * a reference to the vfio_group, which holds a reference to the
	 * iommu_group.
	 */
	vfio_group_put(group);

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_add_group_dev);
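
#if 0	/* Illustrative sketch only, not part of this file. */
/*
 * Typical bus driver usage, loosely modeled on vfio-pci: take the
 * group reference in probe, register the device, and tear down in
 * reverse order on remove.  The "foo" names are hypothetical.
 */
static int foo_probe(struct device *dev)
{
	struct iommu_group *group = vfio_iommu_group_get(dev);
	int ret;

	if (!group)
		return -EINVAL;

	ret = vfio_add_group_dev(dev, &foo_vfio_dev_ops, foo_priv);
	if (ret)
		vfio_iommu_group_put(group, dev);

	return ret;
}

static void foo_remove(struct device *dev)
{
	void *priv = vfio_del_group_dev(dev);

	if (priv)
		vfio_iommu_group_put(dev->iommu_group, dev);
}
#endif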
/**
 * Get a reference to the vfio_device for a device.  Even if the
 * caller thinks they own the device, they could be racing with a
 * release call path, so we can't trust drvdata for the shortcut.
 * Go the long way around, from the iommu_group to the vfio_group
 * to the vfio_device.
 */
struct vfio_device *vfio_device_get_from_dev(struct device *dev)
{
	struct vfio_group *group;
	struct vfio_device *device;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return NULL;

	device = vfio_group_get_device(group, dev);
	vfio_group_put(group);

	return device;
}
EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);

static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
						     char *buf)
{
	struct vfio_device *it, *device = NULL;

	mutex_lock(&group->device_lock);
	list_for_each_entry(it, &group->device_list, group_next) {
		if (!strcmp(dev_name(it->dev), buf)) {
			device = it;
			vfio_device_get(device);
			break;
		}
	}
	mutex_unlock(&group->device_lock);

	return device;
}

/*
 * Caller must hold a reference to the vfio_device
 */
void *vfio_device_data(struct vfio_device *device)
{
	return device->device_data;
}
EXPORT_SYMBOL_GPL(vfio_device_data);

/* Given a referenced group, check if it contains the device */
static bool vfio_dev_present(struct vfio_group *group, struct device *dev)
{
	struct vfio_device *device;

	device = vfio_group_get_device(group, dev);
	if (!device)
		return false;

	vfio_device_put(device);
	return true;
}
/*
 * Decrement the device reference count and wait for the device to be
 * removed.  Open file descriptors for the device...
 */
void *vfio_del_group_dev(struct device *dev)
{
	struct vfio_device *device = dev_get_drvdata(dev);
	struct vfio_group *group = device->group;
	void *device_data = device->device_data;
	struct vfio_unbound_dev *unbound;
	unsigned int i = 0;
	long ret;
	bool interrupted = false;

	/*
	 * The group exists so long as we have a device reference.  Get
	 * a group reference and use it to scan for the device going away.
	 */
	vfio_group_get(group);

	/*
	 * When the device is removed from the group, the group suddenly
	 * becomes non-viable; the device has a driver (until the unbind
	 * completes), but it's not present in the group.  This is bad news
	 * for any external users that need to re-acquire a group reference
	 * in order to match and release their existing reference.  To
	 * solve this, we track such devices on the unbound_list to bridge
	 * the gap until they're fully unbound.
	 */
	unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
	if (unbound) {
		unbound->dev = dev;
		mutex_lock(&group->unbound_lock);
		list_add(&unbound->unbound_next, &group->unbound_list);
		mutex_unlock(&group->unbound_lock);
	}
	WARN_ON(!unbound);

	vfio_device_put(device);

	/*
	 * If the device is still present in the group after the above
	 * 'put', then it is in use and we need to request it from the
	 * bus driver.  The driver may in turn need to request the
	 * device from the user.  We send the request on an arbitrary
	 * interval with counter to allow the driver to take escalating
	 * measures to release the device if it has the ability to do so.
	 */
	do {
		device = vfio_group_get_device(group, dev);
		if (!device)
			break;

		if (device->ops->request)
			device->ops->request(device_data, i++);

		vfio_device_put(device);

		if (interrupted) {
			ret = wait_event_timeout(vfio.release_q,
					!vfio_dev_present(group, dev), HZ * 10);
		} else {
			ret = wait_event_interruptible_timeout(vfio.release_q,
					!vfio_dev_present(group, dev), HZ * 10);
			if (ret == -ERESTARTSYS) {
				interrupted = true;
				dev_warn(dev,
					 "Device is currently in use, task"
					 " \"%s\" (%d) "
					 "blocked until device is released",
					 current->comm, task_pid_nr(current));
			}
		}
	} while (ret <= 0);

	vfio_group_put(group);

	return device_data;
}
EXPORT_SYMBOL_GPL(vfio_del_group_dev);
/**
 * VFIO base fd, /dev/vfio/vfio
 */
static long vfio_ioctl_check_extension(struct vfio_container *container,
				       unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = 0;

	down_read(&container->group_lock);

	driver = container->iommu_driver;

	switch (arg) {
		/* No base extensions yet */
	default:
		/*
		 * If no driver is set, poll all registered drivers for
		 * extensions and return the first positive result.  If
		 * a driver is already set, further queries will be passed
		 * only to that driver.
		 */
		if (!driver) {
			mutex_lock(&vfio.iommu_drivers_lock);
			list_for_each_entry(driver, &vfio.iommu_drivers_list,
					    vfio_next) {

#ifdef CONFIG_VFIO_NOIOMMU
				if (!list_empty(&container->group_list) &&
				    (container->noiommu !=
				     (driver->ops == &vfio_noiommu_ops)))
					continue;
#endif

				if (!try_module_get(driver->ops->owner))
					continue;

				ret = driver->ops->ioctl(NULL,
							 VFIO_CHECK_EXTENSION,
							 arg);
				module_put(driver->ops->owner);
				if (ret > 0)
					break;
			}
			mutex_unlock(&vfio.iommu_drivers_lock);
		} else
			ret = driver->ops->ioctl(container->iommu_data,
						 VFIO_CHECK_EXTENSION, arg);
	}

	up_read(&container->group_lock);

	return ret;
}

/* hold write lock on container->group_lock */
static int __vfio_container_attach_groups(struct vfio_container *container,
					  struct vfio_iommu_driver *driver,
					  void *data)
{
	struct vfio_group *group;
	int ret = -ENODEV;

	list_for_each_entry(group, &container->group_list, container_next) {
		ret = driver->ops->attach_group(data, group->iommu_group);
		if (ret)
			goto unwind;
	}

	return ret;

unwind:
	list_for_each_entry_continue_reverse(group, &container->group_list,
					     container_next) {
		driver->ops->detach_group(data, group->iommu_group);
	}

	return ret;
}
static long vfio_ioctl_set_iommu(struct vfio_container *container,
				 unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = -ENODEV;

	down_write(&container->group_lock);

	/*
	 * The container is designed to be an unprivileged interface while
	 * the group can be assigned to specific users.  Therefore, only by
	 * adding a group to a container does the user get the privilege of
	 * enabling the iommu, which may allocate finite resources.  There
	 * is no unset_iommu, but by removing all the groups from a container,
	 * the container is deprivileged and returns to an unset state.
	 */
	if (list_empty(&container->group_list) || container->iommu_driver) {
		up_write(&container->group_lock);
		return -EINVAL;
	}

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		void *data;

#ifdef CONFIG_VFIO_NOIOMMU
		/*
		 * Only noiommu containers can use vfio-noiommu and noiommu
		 * containers can only use vfio-noiommu.
		 */
		if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
			continue;
#endif

		if (!try_module_get(driver->ops->owner))
			continue;

		/*
		 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
		 * so test which iommu driver reported support for this
		 * extension and call open on them.  We also pass them the
		 * magic, allowing a single driver to support multiple
		 * interfaces if they'd like.
		 */
		if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
			module_put(driver->ops->owner);
			continue;
		}

		data = driver->ops->open(arg);
		if (IS_ERR(data)) {
			ret = PTR_ERR(data);
			module_put(driver->ops->owner);
			continue;
		}

		ret = __vfio_container_attach_groups(container, driver, data);
		if (ret) {
			driver->ops->release(data);
			module_put(driver->ops->owner);
			continue;
		}

		container->iommu_driver = driver;
		container->iommu_data = data;
		break;
	}

	mutex_unlock(&vfio.iommu_drivers_lock);
	up_write(&container->group_lock);

	return ret;
}
static long vfio_fops_unl_ioctl(struct file *filep,
				unsigned int cmd, unsigned long arg)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	void *data;
	long ret = -EINVAL;

	if (!container)
		return ret;

	switch (cmd) {
	case VFIO_GET_API_VERSION:
		ret = VFIO_API_VERSION;
		break;
	case VFIO_CHECK_EXTENSION:
		ret = vfio_ioctl_check_extension(container, arg);
		break;
	case VFIO_SET_IOMMU:
		ret = vfio_ioctl_set_iommu(container, arg);
		break;
	default:
		down_read(&container->group_lock);

		driver = container->iommu_driver;
		data = container->iommu_data;

		if (driver) /* passthrough all unrecognized ioctls */
			ret = driver->ops->ioctl(data, cmd, arg);

		up_read(&container->group_lock);
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static long vfio_fops_compat_ioctl(struct file *filep,
				   unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */
static int vfio_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_container *container;

	container = kzalloc(sizeof(*container), GFP_KERNEL);
	if (!container)
		return -ENOMEM;

	INIT_LIST_HEAD(&container->group_list);
	init_rwsem(&container->group_lock);
	kref_init(&container->kref);

	filep->private_data = container;

	return 0;
}

static int vfio_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_container *container = filep->private_data;

	filep->private_data = NULL;

	vfio_container_put(container);

	return 0;
}

/*
 * Once an iommu driver is set, we optionally pass read/write/mmap
 * on to the driver, allowing management interfaces beyond ioctl.
 */
static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
			      size_t count, loff_t *ppos)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	ssize_t ret = -EINVAL;

	down_read(&container->group_lock);

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->read))
		ret = driver->ops->read(container->iommu_data,
					buf, count, ppos);

	up_read(&container->group_lock);

	return ret;
}

static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
			       size_t count, loff_t *ppos)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	ssize_t ret = -EINVAL;

	down_read(&container->group_lock);

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->write))
		ret = driver->ops->write(container->iommu_data,
					 buf, count, ppos);

	up_read(&container->group_lock);

	return ret;
}

static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	int ret = -EINVAL;

	down_read(&container->group_lock);

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->mmap))
		ret = driver->ops->mmap(container->iommu_data, vma);

	up_read(&container->group_lock);

	return ret;
}

static const struct file_operations vfio_fops = {
	.owner		= THIS_MODULE,
	.open		= vfio_fops_open,
	.release	= vfio_fops_release,
	.read		= vfio_fops_read,
	.write		= vfio_fops_write,
	.unlocked_ioctl	= vfio_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vfio_fops_compat_ioctl,
#endif
	.mmap		= vfio_fops_mmap,
};
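
#if 0	/* Illustrative sketch only: the userspace view of these fops,
	 * adapted from the example in Documentation/vfio.txt.  Error
	 * handling is elided; the group number and device name vary
	 * per system. */
int container = open("/dev/vfio/vfio", O_RDWR);

if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
	/* unknown API version */;

if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU))
	/* container doesn't support the IOMMU model we want */;

int group = open("/dev/vfio/26", O_RDWR);

struct vfio_group_status status = { .argsz = sizeof(status) };
ioctl(group, VFIO_GROUP_GET_STATUS, &status);
if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE))
	/* not all devices in the group are bound for vfio */;

ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);

int device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
#endif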
/**
 * VFIO Group fd, /dev/vfio/$GROUP
 */
static void __vfio_group_unset_container(struct vfio_group *group)
{
	struct vfio_container *container = group->container;
	struct vfio_iommu_driver *driver;

	down_write(&container->group_lock);

	driver = container->iommu_driver;
	if (driver)
		driver->ops->detach_group(container->iommu_data,
					  group->iommu_group);

	group->container = NULL;
	list_del(&group->container_next);

	/* Detaching the last group deprivileges a container, remove iommu */
	if (driver && list_empty(&container->group_list)) {
		driver->ops->release(container->iommu_data);
		module_put(driver->ops->owner);
		container->iommu_driver = NULL;
		container->iommu_data = NULL;
	}

	up_write(&container->group_lock);

	vfio_container_put(container);
}

/*
 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
 * if there was no container to unset.  Since the ioctl is called on
 * the group, we know that still exists, therefore the only valid
 * transition here is 1->0.
 */
static int vfio_group_unset_container(struct vfio_group *group)
{
	int users = atomic_cmpxchg(&group->container_users, 1, 0);

	if (!users)
		return -EINVAL;
	if (users != 1)
		return -EBUSY;

	__vfio_group_unset_container(group);

	return 0;
}

/*
 * When removing container users, anything that removes the last user
 * implicitly removes the group from the container.  That is, if the
 * group file descriptor is closed, as well as any device file descriptors,
 * the group is free.
 */
static void vfio_group_try_dissolve_container(struct vfio_group *group)
{
	if (0 == atomic_dec_if_positive(&group->container_users))
		__vfio_group_unset_container(group);
}
static int vfio_group_set_container(struct vfio_group *group, int container_fd)
{
	struct fd f;
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret = 0;

	if (atomic_read(&group->container_users))
		return -EINVAL;

	if (group->noiommu && !capable(CAP_SYS_RAWIO))
		return -EPERM;

	f = fdget(container_fd);
	if (!f.file)
		return -EBADF;

	/* Sanity check, is this really our fd? */
	if (f.file->f_op != &vfio_fops) {
		fdput(f);
		return -EINVAL;
	}

	container = f.file->private_data;
	WARN_ON(!container); /* fget ensures we don't race vfio_release */

	down_write(&container->group_lock);

	/* Real groups and fake groups cannot mix */
	if (!list_empty(&container->group_list) &&
	    container->noiommu != group->noiommu) {
		ret = -EPERM;
		goto unlock_out;
	}

	driver = container->iommu_driver;
	if (driver) {
		ret = driver->ops->attach_group(container->iommu_data,
						group->iommu_group);
		if (ret)
			goto unlock_out;
	}

	group->container = container;
	container->noiommu = group->noiommu;
	list_add(&group->container_next, &container->group_list);

	/* Get a reference on the container and mark a user within the group */
	vfio_container_get(container);
	atomic_inc(&group->container_users);

unlock_out:
	up_write(&container->group_lock);
	fdput(f);
	return ret;
}

static bool vfio_group_viable(struct vfio_group *group)
{
	return (iommu_group_for_each_dev(group->iommu_group,
					 group, vfio_dev_viable) == 0);
}

static int vfio_group_add_container_user(struct vfio_group *group)
{
	if (!atomic_inc_not_zero(&group->container_users))
		return -EINVAL;

	if (group->noiommu) {
		atomic_dec(&group->container_users);
		return -EPERM;
	}
	if (!group->container->iommu_driver || !vfio_group_viable(group)) {
		atomic_dec(&group->container_users);
		return -EINVAL;
	}

	return 0;
}
static const struct file_operations vfio_device_fops;

static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
{
	struct vfio_device *device;
	struct file *filep;
	int ret;

	if (0 == atomic_read(&group->container_users) ||
	    !group->container->iommu_driver || !vfio_group_viable(group))
		return -EINVAL;

	if (group->noiommu && !capable(CAP_SYS_RAWIO))
		return -EPERM;

	device = vfio_device_get_from_name(group, buf);
	if (!device)
		return -ENODEV;

	ret = device->ops->open(device->device_data);
	if (ret) {
		vfio_device_put(device);
		return ret;
	}

	/*
	 * We can't use anon_inode_getfd() because we need to modify
	 * the f_mode flags directly to allow more than just ioctls
	 */
	ret = get_unused_fd_flags(O_CLOEXEC);
	if (ret < 0) {
		device->ops->release(device->device_data);
		vfio_device_put(device);
		return ret;
	}

	filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
				   device, O_RDWR);
	if (IS_ERR(filep)) {
		put_unused_fd(ret);
		ret = PTR_ERR(filep);
		device->ops->release(device->device_data);
		vfio_device_put(device);
		return ret;
	}

	/*
	 * TODO: add an anon_inode interface to do this.
	 * Appears to be missing by lack of need rather than
	 * explicitly prevented.  Now there's need.
	 */
	filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);

	atomic_inc(&group->container_users);

	fd_install(ret, filep);

	if (group->noiommu)
		dev_warn(device->dev, "vfio-noiommu device opened by user "
			 "(%s:%d)\n", current->comm, task_pid_nr(current));

	return ret;
}
static long vfio_group_fops_unl_ioctl(struct file *filep,
				      unsigned int cmd, unsigned long arg)
{
	struct vfio_group *group = filep->private_data;
	long ret = -ENOTTY;

	switch (cmd) {
	case VFIO_GROUP_GET_STATUS:
	{
		struct vfio_group_status status;
		unsigned long minsz;

		minsz = offsetofend(struct vfio_group_status, flags);

		if (copy_from_user(&status, (void __user *)arg, minsz))
			return -EFAULT;

		if (status.argsz < minsz)
			return -EINVAL;

		status.flags = 0;

		if (vfio_group_viable(group))
			status.flags |= VFIO_GROUP_FLAGS_VIABLE;

		if (group->container)
			status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;

		if (copy_to_user((void __user *)arg, &status, minsz))
			return -EFAULT;

		ret = 0;
		break;
	}
	case VFIO_GROUP_SET_CONTAINER:
	{
		int fd;

		if (get_user(fd, (int __user *)arg))
			return -EFAULT;

		if (fd < 0)
			return -EINVAL;

		ret = vfio_group_set_container(group, fd);
		break;
	}
	case VFIO_GROUP_UNSET_CONTAINER:
		ret = vfio_group_unset_container(group);
		break;
	case VFIO_GROUP_GET_DEVICE_FD:
	{
		char *buf;

		buf = strndup_user((const char __user *)arg, PAGE_SIZE);
		if (IS_ERR(buf))
			return PTR_ERR(buf);

		ret = vfio_group_get_device_fd(group, buf);
		kfree(buf);
		break;
	}
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static long vfio_group_fops_compat_ioctl(struct file *filep,
					 unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_group_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */
static int vfio_group_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_group *group;
	int opened;

	group = vfio_group_get_from_minor(iminor(inode));
	if (!group)
		return -ENODEV;

	if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
		vfio_group_put(group);
		return -EPERM;
	}

	/* Do we need multiple instances of the group open?  Seems not. */
	opened = atomic_cmpxchg(&group->opened, 0, 1);
	if (opened) {
		vfio_group_put(group);
		return -EBUSY;
	}

	/* Is something still in use from a previous open? */
	if (group->container) {
		atomic_dec(&group->opened);
		vfio_group_put(group);
		return -EBUSY;
	}

	filep->private_data = group;

	return 0;
}

static int vfio_group_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_group *group = filep->private_data;

	filep->private_data = NULL;

	/* Any user didn't unregister? */
	WARN_ON(group->notifier.head);

	vfio_group_try_dissolve_container(group);

	atomic_dec(&group->opened);

	vfio_group_put(group);

	return 0;
}

static const struct file_operations vfio_group_fops = {
	.owner		= THIS_MODULE,
	.unlocked_ioctl	= vfio_group_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vfio_group_fops_compat_ioctl,
#endif
	.open		= vfio_group_fops_open,
	.release	= vfio_group_fops_release,
};
/**
 * VFIO Device fd
 */
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_device *device = filep->private_data;

	device->ops->release(device->device_data);

	vfio_group_try_dissolve_container(device->group);

	vfio_device_put(device);

	return 0;
}

static long vfio_device_fops_unl_ioctl(struct file *filep,
				       unsigned int cmd, unsigned long arg)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->ioctl))
		return -EINVAL;

	return device->ops->ioctl(device->device_data, cmd, arg);
}

static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
				     size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->read))
		return -EINVAL;

	return device->ops->read(device->device_data, buf, count, ppos);
}

static ssize_t vfio_device_fops_write(struct file *filep,
				      const char __user *buf,
				      size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->write))
		return -EINVAL;

	return device->ops->write(device->device_data, buf, count, ppos);
}

static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->mmap))
		return -EINVAL;

	return device->ops->mmap(device->device_data, vma);
}

#ifdef CONFIG_COMPAT
static long vfio_device_fops_compat_ioctl(struct file *filep,
					  unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_device_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */

static const struct file_operations vfio_device_fops = {
	.owner		= THIS_MODULE,
	.release	= vfio_device_fops_release,
	.read		= vfio_device_fops_read,
	.write		= vfio_device_fops_write,
	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vfio_device_fops_compat_ioctl,
#endif
	.mmap		= vfio_device_fops_mmap,
};
/**
 * External user API, exported by symbols to be linked dynamically.
 *
 * The protocol includes:
 *  1. do normal VFIO init operation:
 *	- opening a new container;
 *	- attaching group(s) to it;
 *	- setting an IOMMU driver for a container.
 * When IOMMU is set for a container, all groups in it are
 * considered ready to use by an external user.
 *
 * 2. User space passes a group fd to an external user.
 * The external user calls vfio_group_get_external_user()
 * to verify that:
 *	- the group is initialized;
 *	- IOMMU is set for it.
 * If both checks passed, vfio_group_get_external_user()
 * increments the container user counter to prevent
 * the VFIO group from disposal before KVM exits.
 *
 * 3. The external user calls vfio_external_user_iommu_id()
 * to know an IOMMU ID.
 *
 * 4. When the external KVM finishes, it calls
 * vfio_group_put_external_user() to release the VFIO group.
 * This call decrements the container user counter.
 */
struct vfio_group *vfio_group_get_external_user(struct file *filep)
{
	struct vfio_group *group = filep->private_data;
	int ret;

	if (filep->f_op != &vfio_group_fops)
		return ERR_PTR(-EINVAL);

	ret = vfio_group_add_container_user(group);
	if (ret)
		return ERR_PTR(ret);

	vfio_group_get(group);

	return group;
}
EXPORT_SYMBOL_GPL(vfio_group_get_external_user);

void vfio_group_put_external_user(struct vfio_group *group)
{
	vfio_group_try_dissolve_container(group);
	vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_group_put_external_user);

int vfio_external_user_iommu_id(struct vfio_group *group)
{
	return iommu_group_id(group->iommu_group);
}
EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);

long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
{
	return vfio_ioctl_check_extension(group->container, arg);
}
EXPORT_SYMBOL_GPL(vfio_external_check_extension);
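
#if 0	/* Illustrative sketch only, not part of this file. */
/*
 * How an external user such as KVM consumes a group fd handed in
 * from userspace, following the protocol above.  The "kv" names are
 * hypothetical; see virt/kvm/vfio.c for the real consumer.
 */
static int kv_add_vfio_group(int group_fd)
{
	struct fd f = fdget(group_fd);
	struct vfio_group *group;
	int iommu_id;

	if (!f.file)
		return -EBADF;

	group = vfio_group_get_external_user(f.file);
	fdput(f);
	if (IS_ERR(group))
		return PTR_ERR(group);

	iommu_id = vfio_external_user_iommu_id(group);
	/* ... associate iommu_id with the VM ... */

	/* and on teardown: */
	vfio_group_put_external_user(group);
	return 0;
}
#endif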
/**
 * Sub-module support
 */
/*
 * Helper for managing a buffer of info chain capabilities, allocate or
 * reallocate a buffer with additional @size, filling in @id and @version
 * of the capability.  A pointer to the new capability is returned.
 *
 * NB. The chain is based at the head of the buffer, so new entries are
 * added to the tail, vfio_info_cap_shift() should be called to fixup the
 * next offsets prior to copying to the user buffer.
 */
struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
					       size_t size, u16 id, u16 version)
{
	void *buf;
	struct vfio_info_cap_header *header, *tmp;

	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
	if (!buf) {
		kfree(caps->buf);
		caps->size = 0;
		return ERR_PTR(-ENOMEM);
	}

	caps->buf = buf;
	header = buf + caps->size;

	/* Eventually copied to user buffer, zero */
	memset(header, 0, size);

	header->id = id;
	header->version = version;

	/* Add to the end of the capability chain */
	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
		; /* nothing */

	tmp->next = caps->size;
	caps->size += size;

	return header;
}
EXPORT_SYMBOL_GPL(vfio_info_cap_add);

void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
{
	struct vfio_info_cap_header *tmp;
	void *buf = (void *)caps->buf;

	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
		tmp->next += offset;
}
EXPORT_SYMBOL(vfio_info_cap_shift);
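
#if 0	/* Illustrative sketch only, not part of this file. */
/*
 * Building a capability chain for a region-info reply and fixing up
 * the chained offsets before copying out, per the NB above.  The
 * ioctl plumbing, "info", "arg" and the driver-built "sparse" cap
 * are assumed to exist in the surrounding handler.
 */
struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
int ret;

ret = vfio_info_add_capability(&caps, VFIO_REGION_INFO_CAP_SPARSE_MMAP,
			       sparse);
if (ret)
	return ret;

if (caps.size) {
	info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
	info.cap_offset = sizeof(info);
	/* chain offsets are buffer-relative; shift past the header */
	vfio_info_cap_shift(&caps, sizeof(info));
	if (copy_to_user((void __user *)arg + sizeof(info),
			 caps.buf, caps.size))
		ret = -EFAULT;
	kfree(caps.buf);
}
#endif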
static int sparse_mmap_cap(struct vfio_info_cap *caps, void *cap_type)
{
	struct vfio_info_cap_header *header;
	struct vfio_region_info_cap_sparse_mmap *sparse_cap, *sparse = cap_type;
	size_t size;

	size = sizeof(*sparse) + sparse->nr_areas * sizeof(*sparse->areas);
	header = vfio_info_cap_add(caps, size,
				   VFIO_REGION_INFO_CAP_SPARSE_MMAP, 1);
	if (IS_ERR(header))
		return PTR_ERR(header);

	sparse_cap = container_of(header,
			struct vfio_region_info_cap_sparse_mmap, header);
	sparse_cap->nr_areas = sparse->nr_areas;
	memcpy(sparse_cap->areas, sparse->areas,
	       sparse->nr_areas * sizeof(*sparse->areas));
	return 0;
}

static int region_type_cap(struct vfio_info_cap *caps, void *cap_type)
{
	struct vfio_info_cap_header *header;
	struct vfio_region_info_cap_type *type_cap, *cap = cap_type;

	header = vfio_info_cap_add(caps, sizeof(*cap),
				   VFIO_REGION_INFO_CAP_TYPE, 1);
	if (IS_ERR(header))
		return PTR_ERR(header);

	type_cap = container_of(header, struct vfio_region_info_cap_type,
				header);
	type_cap->type = cap->type;
	type_cap->subtype = cap->subtype;
	return 0;
}

int vfio_info_add_capability(struct vfio_info_cap *caps, int cap_type_id,
			     void *cap_type)
{
	int ret = -EINVAL;

	if (!cap_type)
		return 0;

	switch (cap_type_id) {
	case VFIO_REGION_INFO_CAP_SPARSE_MMAP:
		ret = sparse_mmap_cap(caps, cap_type);
		break;

	case VFIO_REGION_INFO_CAP_TYPE:
		ret = region_type_cap(caps, cap_type);
		break;
	}

	return ret;
}
EXPORT_SYMBOL(vfio_info_add_capability);
int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
				       int max_irq_type, size_t *data_size)
{
	unsigned long minsz;
	size_t size;

	minsz = offsetofend(struct vfio_irq_set, count);

	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
	    (hdr->count >= (U32_MAX - hdr->start)) ||
	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
			    VFIO_IRQ_SET_ACTION_TYPE_MASK)))
		return -EINVAL;

	if (data_size)
		*data_size = 0;

	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
		return -EINVAL;

	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
	case VFIO_IRQ_SET_DATA_NONE:
		size = 0;
		break;
	case VFIO_IRQ_SET_DATA_BOOL:
		size = sizeof(uint8_t);
		break;
	case VFIO_IRQ_SET_DATA_EVENTFD:
		size = sizeof(int32_t);
		break;
	default:
		return -EINVAL;
	}

	if (size) {
		if (hdr->argsz - minsz < hdr->count * size)
			return -EINVAL;

		if (!data_size)
			return -EINVAL;

		*data_size = hdr->count * size;
	}

	return 0;
}
EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
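
#if 0	/* Illustrative sketch only, not part of this file. */
/*
 * Driver-side VFIO_DEVICE_SET_IRQS handling built on the helper
 * above, loosely modeled on vfio-pci.  "foo_*" and "vdev" are
 * hypothetical; data_size tells us whether a payload follows hdr.
 */
struct vfio_irq_set hdr;
size_t data_size = 0;
unsigned long minsz = offsetofend(struct vfio_irq_set, count);
u8 *data = NULL;
int ret;

if (copy_from_user(&hdr, (void __user *)arg, minsz))
	return -EFAULT;

ret = vfio_set_irqs_validate_and_prepare(&hdr, foo_num_irqs(vdev),
					 VFIO_PCI_NUM_IRQS, &data_size);
if (ret)
	return ret;

if (data_size) {
	data = memdup_user((void __user *)(arg + minsz), data_size);
	if (IS_ERR(data))
		return PTR_ERR(data);
}

ret = foo_set_irqs(vdev, hdr.flags, hdr.index, hdr.start,
		   hdr.count, data);
kfree(data);
return ret;
#endif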
/*
 * Pin a set of guest PFNs and return their associated host PFNs for local
 * domain only.
 * @dev [in]     : device
 * @user_pfn [in]: array of user/guest PFNs to be pinned.
 * @npage [in]   : count of elements in user_pfn array.  This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * @prot [in]    : protection flags
 * @phys_pfn[out]: array of host PFNs
 * Return error or number of pages pinned.
 */
int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
		   int prot, unsigned long *phys_pfn)
{
	struct vfio_container *container;
	struct vfio_group *group;
	struct vfio_iommu_driver *driver;
	int ret;

	if (!dev || !user_pfn || !phys_pfn || !npage)
		return -EINVAL;

	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
		return -E2BIG;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	ret = vfio_group_add_container_user(group);
	if (ret)
		goto err_pin_pages;

	container = group->container;
	down_read(&container->group_lock);

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->pin_pages))
		ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
					     npage, prot, phys_pfn);
	else
		ret = -ENOTTY;

	up_read(&container->group_lock);
	vfio_group_try_dissolve_container(group);

err_pin_pages:
	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_pin_pages);

/*
 * Unpin a set of host PFNs for local domain only.
 * @dev [in]     : device
 * @user_pfn [in]: array of user/guest PFNs to be unpinned.
 * @npage [in]   : count of elements in user_pfn array.  This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * Return error or number of pages unpinned.
 */
int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
{
	struct vfio_container *container;
	struct vfio_group *group;
	struct vfio_iommu_driver *driver;
	int ret;

	if (!dev || !user_pfn || !npage)
		return -EINVAL;

	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
		return -E2BIG;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	ret = vfio_group_add_container_user(group);
	if (ret)
		goto err_unpin_pages;

	container = group->container;
	down_read(&container->group_lock);

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->unpin_pages))
		ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
					       npage);
	else
		ret = -ENOTTY;

	up_read(&container->group_lock);
	vfio_group_try_dissolve_container(group);

err_unpin_pages:
	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_unpin_pages);
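
#if 0	/* Illustrative sketch only, not part of this file. */
/*
 * A mediated-device vendor driver pinning a guest page before
 * programming DMA, then releasing it, in the style of kvmgt.
 * "gfn" and "mdev" are assumed to come from the caller.
 */
unsigned long user_pfn[1] = { gfn };
unsigned long phys_pfn[1];
int ret;

ret = vfio_pin_pages(mdev_dev(mdev), user_pfn, 1,
		     IOMMU_READ | IOMMU_WRITE, phys_pfn);
if (ret != 1)
	return ret < 0 ? ret : -EFAULT;

/* ... program the hardware with PFN_PHYS(phys_pfn[0]) ... */

vfio_unpin_pages(mdev_dev(mdev), user_pfn, 1);
#endif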
static int vfio_register_iommu_notifier(struct vfio_group *group,
					unsigned long *events,
					struct notifier_block *nb)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	container = group->container;
	down_read(&container->group_lock);

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->register_notifier))
		ret = driver->ops->register_notifier(container->iommu_data,
						     events, nb);
	else
		ret = -ENOTTY;

	up_read(&container->group_lock);
	vfio_group_try_dissolve_container(group);

	return ret;
}

static int vfio_unregister_iommu_notifier(struct vfio_group *group,
					  struct notifier_block *nb)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	container = group->container;
	down_read(&container->group_lock);

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->unregister_notifier))
		ret = driver->ops->unregister_notifier(container->iommu_data,
						       nb);
	else
		ret = -ENOTTY;

	up_read(&container->group_lock);
	vfio_group_try_dissolve_container(group);

	return ret;
}
void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
{
	group->kvm = kvm;
	blocking_notifier_call_chain(&group->notifier,
				VFIO_GROUP_NOTIFY_SET_KVM, kvm);
}
EXPORT_SYMBOL_GPL(vfio_group_set_kvm);

static int vfio_register_group_notifier(struct vfio_group *group,
					unsigned long *events,
					struct notifier_block *nb)
{
	struct vfio_container *container;
	int ret;
	bool set_kvm = false;

	if (*events & VFIO_GROUP_NOTIFY_SET_KVM)
		set_kvm = true;

	/* clear known events */
	*events &= ~VFIO_GROUP_NOTIFY_SET_KVM;

	/* refuse to continue if still events remaining */
	if (*events)
		return -EINVAL;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	container = group->container;
	down_read(&container->group_lock);

	ret = blocking_notifier_chain_register(&group->notifier, nb);

	/*
	 * The attaching of kvm and vfio_group might already happen, so
	 * here we replay once upon registration.
	 */
	if (!ret && set_kvm && group->kvm)
		blocking_notifier_call_chain(&group->notifier,
					VFIO_GROUP_NOTIFY_SET_KVM, group->kvm);

	up_read(&container->group_lock);
	vfio_group_try_dissolve_container(group);

	return ret;
}

static int vfio_unregister_group_notifier(struct vfio_group *group,
					  struct notifier_block *nb)
{
	struct vfio_container *container;
	int ret;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	container = group->container;
	down_read(&container->group_lock);

	ret = blocking_notifier_chain_unregister(&group->notifier, nb);

	up_read(&container->group_lock);
	vfio_group_try_dissolve_container(group);

	return ret;
}
int vfio_register_notifier(struct device *dev, enum vfio_notify_type type,
			   unsigned long *events, struct notifier_block *nb)
{
	struct vfio_group *group;
	int ret;

	if (!dev || !nb || !events || (*events == 0))
		return -EINVAL;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	switch (type) {
	case VFIO_IOMMU_NOTIFY:
		ret = vfio_register_iommu_notifier(group, events, nb);
		break;
	case VFIO_GROUP_NOTIFY:
		ret = vfio_register_group_notifier(group, events, nb);
		break;
	default:
		ret = -EINVAL;
	}

	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_register_notifier);

int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type,
			     struct notifier_block *nb)
{
	struct vfio_group *group;
	int ret;

	if (!dev || !nb)
		return -EINVAL;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	switch (type) {
	case VFIO_IOMMU_NOTIFY:
		ret = vfio_unregister_iommu_notifier(group, nb);
		break;
	case VFIO_GROUP_NOTIFY:
		ret = vfio_unregister_group_notifier(group, nb);
		break;
	default:
		ret = -EINVAL;
	}

	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_unregister_notifier);
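
#if 0	/* Illustrative sketch only, not part of this file. */
/*
 * Registering for DMA unmap events so a vendor driver can drop its
 * pinned pages when the user unmaps an IOVA range, in the style of
 * kvmgt.  The "foo" names are hypothetical.
 */
static struct notifier_block foo_nb;

static int foo_iommu_notifier(struct notifier_block *nb,
			      unsigned long action, void *data)
{
	if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) {
		struct vfio_iommu_type1_dma_unmap *unmap = data;

		/* ... vfio_unpin_pages() anything within unmap->iova/size ... */
	}

	return NOTIFY_OK;
}

static int foo_register(struct device *dev)
{
	unsigned long events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;

	foo_nb.notifier_call = foo_iommu_notifier;
	return vfio_register_notifier(dev, VFIO_IOMMU_NOTIFY, &events,
				      &foo_nb);
}
#endif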
/**
 * Module/class support
 */
static char *vfio_devnode(struct device *dev, umode_t *mode)
{
	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
}

static struct miscdevice vfio_dev = {
	.minor = VFIO_MINOR,
	.name = "vfio",
	.fops = &vfio_fops,
	.nodename = "vfio/vfio",
	.mode = S_IRUGO | S_IWUGO,
};

static int __init vfio_init(void)
{
	int ret;

	idr_init(&vfio.group_idr);
	mutex_init(&vfio.group_lock);
	mutex_init(&vfio.iommu_drivers_lock);
	INIT_LIST_HEAD(&vfio.group_list);
	INIT_LIST_HEAD(&vfio.iommu_drivers_list);
	init_waitqueue_head(&vfio.release_q);

	ret = misc_register(&vfio_dev);
	if (ret) {
		pr_err("vfio: misc device register failed\n");
		return ret;
	}

	/* /dev/vfio/$GROUP */
	vfio.class = class_create(THIS_MODULE, "vfio");
	if (IS_ERR(vfio.class)) {
		ret = PTR_ERR(vfio.class);
		goto err_class;
	}

	vfio.class->devnode = vfio_devnode;

	ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK, "vfio");
	if (ret)
		goto err_alloc_chrdev;

	cdev_init(&vfio.group_cdev, &vfio_group_fops);
	ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK);
	if (ret)
		goto err_cdev_add;

	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");

	/*
	 * Attempt to load known iommu-drivers.  This gives us a working
	 * environment without the user needing to explicitly load iommu
	 * drivers.
	 */
	request_module_nowait("vfio_iommu_type1");
	request_module_nowait("vfio_iommu_spapr_tce");

#ifdef CONFIG_VFIO_NOIOMMU
	vfio_register_iommu_driver(&vfio_noiommu_ops);
#endif
	return 0;

err_cdev_add:
	unregister_chrdev_region(vfio.group_devt, MINORMASK);
err_alloc_chrdev:
	class_destroy(vfio.class);
	vfio.class = NULL;
err_class:
	misc_deregister(&vfio_dev);
	return ret;
}

static void __exit vfio_cleanup(void)
{
	WARN_ON(!list_empty(&vfio.group_list));

#ifdef CONFIG_VFIO_NOIOMMU
	vfio_unregister_iommu_driver(&vfio_noiommu_ops);
#endif
	idr_destroy(&vfio.group_idr);
	cdev_del(&vfio.group_cdev);
	unregister_chrdev_region(vfio.group_devt, MINORMASK);
	class_destroy(vfio.class);
	vfio.class = NULL;
	misc_deregister(&vfio_dev);
}

module_init(vfio_init);
module_exit(vfio_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_ALIAS_MISCDEV(VFIO_MINOR);
MODULE_ALIAS("devname:vfio/vfio");