// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/device.h>
#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/interrupt.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/vgaarb.h>
#include <linux/nospec.h>

#include "vfio_pci_private.h"

#define DRIVER_VERSION  "0.2"
#define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC     "VFIO PCI - User Level meta-driver"
static char ids[1024] __initdata;
module_param_string(ids, ids, sizeof(ids), 0);
MODULE_PARM_DESC(ids, "Initial PCI IDs to add to the vfio driver, format is \"vendor:device[:subvendor[:subdevice[:class[:class_mask]]]]\" and multiple comma separated entries can be specified");
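/*
 * Example (hypothetical device ID, adjust to the device being assigned):
 * loading with "modprobe vfio-pci ids=10de:1db4" binds vfio-pci to matching
 * devices at module load, equivalent to writing the same ID pair into
 * /sys/bus/pci/drivers/vfio-pci/new_id after the module is loaded.
 */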
static bool nointxmask;
module_param_named(nointxmask, nointxmask, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(nointxmask,
		  "Disable support for PCI 2.3 style INTx masking.  If this resolves problems for specific devices, report lspci -vvvxxx to linux-pci@vger.kernel.org so the device can be fixed automatically via the broken_intx_masking flag.");
#ifdef CONFIG_VFIO_PCI_VGA
static bool disable_vga;
module_param(disable_vga, bool, S_IRUGO);
MODULE_PARM_DESC(disable_vga, "Disable VGA resource access through vfio-pci");
#endif
static bool disable_idle_d3;
module_param(disable_idle_d3, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(disable_idle_d3,
		 "Disable using the PCI D3 low power state for idle, unused devices");
static bool enable_sriov;
#ifdef CONFIG_PCI_IOV
module_param(enable_sriov, bool, 0644);
MODULE_PARM_DESC(enable_sriov, "Enable support for SR-IOV configuration.  Enabling SR-IOV on a PF typically requires support of the userspace PF driver, enabling VFs without such support may result in non-functional VFs or PF.");
#endif
static inline bool vfio_vga_disabled(void)
{
#ifdef CONFIG_VFIO_PCI_VGA
	return disable_vga;
#else
	return true;
#endif
}

/*
 * Our VGA arbiter participation is limited since we don't know anything
 * about the device itself.  However, if the device is the only VGA device
 * downstream of a bridge and VFIO VGA support is disabled, then we can
 * safely return legacy VGA IO and memory as not decoded since the user
 * has no way to get to it and routing can be disabled externally at the
 * bridge.
 */
static unsigned int vfio_pci_set_vga_decode(void *opaque, bool single_vga)
{
	struct vfio_pci_device *vdev = opaque;
	struct pci_dev *tmp = NULL, *pdev = vdev->pdev;
	unsigned char max_busnr;
	unsigned int decodes;

	if (single_vga || !vfio_vga_disabled() || pci_is_root_bus(pdev->bus))
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
		       VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;

	max_busnr = pci_bus_max_busnr(pdev->bus);
	decodes = VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;

	while ((tmp = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, tmp)) != NULL) {
		if (tmp == pdev ||
		    pci_domain_nr(tmp->bus) != pci_domain_nr(pdev->bus) ||
		    pci_is_root_bus(tmp->bus))
			continue;

		if (tmp->bus->number >= pdev->bus->number &&
		    tmp->bus->number <= max_busnr) {
			pci_dev_put(tmp);
			decodes |= VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;
			break;
		}
	}

	return decodes;
}
static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
{
	return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA;
}
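/*
 * Determine which BARs userspace may mmap: only memory BARs of at least
 * PAGE_SIZE qualify directly; a smaller, page-aligned BAR is made mmap'able
 * by reserving the remainder of its exclusive page with a dummy resource so
 * nothing can be hot-added into the same page later.
 */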
static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
{
	struct resource *res;
	int i;
	struct vfio_pci_dummy_resource *dummy_res;

	INIT_LIST_HEAD(&vdev->dummy_resources_list);

	for (i = 0; i < PCI_STD_NUM_BARS; i++) {
		int bar = i + PCI_STD_RESOURCES;

		res = &vdev->pdev->resource[bar];

		if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
			goto no_mmap;

		if (!(res->flags & IORESOURCE_MEM))
			goto no_mmap;

		/*
		 * The PCI core shouldn't set up a resource with a
		 * type but zero size. But there may be bugs that
		 * cause us to do that.
		 */
		if (!resource_size(res))
			goto no_mmap;

		if (resource_size(res) >= PAGE_SIZE) {
			vdev->bar_mmap_supported[bar] = true;
			continue;
		}

		if (!(res->start & ~PAGE_MASK)) {
			/*
			 * Add a dummy resource to reserve the remainder
			 * of the exclusive page in case that hot-add
			 * device's bar is assigned into it.
			 */
			dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL);
			if (dummy_res == NULL)
				goto no_mmap;

			dummy_res->resource.name = "vfio sub-page reserved";
			dummy_res->resource.start = res->end + 1;
			dummy_res->resource.end = res->start + PAGE_SIZE - 1;
			dummy_res->resource.flags = res->flags;
			if (request_resource(res->parent,
						&dummy_res->resource)) {
				kfree(dummy_res);
				goto no_mmap;
			}
			dummy_res->index = bar;
			list_add(&dummy_res->res_next,
					&vdev->dummy_resources_list);
			vdev->bar_mmap_supported[bar] = true;
			continue;
		}
		/*
		 * Here we don't handle the case when the BAR is not page
		 * aligned because we can't expect the BAR will be
		 * assigned into the same location in a page in guest
		 * when we passthrough the BAR. And it's hard to access
		 * this BAR in userspace because we have no way to get
		 * the BAR's location in a page.
		 */
no_mmap:
		vdev->bar_mmap_supported[bar] = false;
	}
}
static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev);
static void vfio_pci_disable(struct vfio_pci_device *vdev);
/*
 * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND
 * _and_ the ability to detect when the device is asserting INTx via PCI_STATUS.
 * If a device implements the former but not the latter we would typically
 * expect broken_intx_masking be set and require an exclusive interrupt.
 * However since we do have control of the device's ability to assert INTx,
 * we can instead pretend that the device does not implement INTx, virtualizing
 * the pin register to report zero and maintaining DisINTx set on the host.
 */
static bool vfio_pci_nointx(struct pci_dev *pdev)
{
	switch (pdev->vendor) {
	case PCI_VENDOR_ID_INTEL:
		switch (pdev->device) {
		/* All i40e (XL710/X710/XXV710) 10/20/25/40GbE NICs */
		case 0x1572:
		case 0x1574:
		case 0x1580 ... 0x1581:
		case 0x1583 ... 0x158b:
		case 0x37d0 ... 0x37d2:
			return true;
		default:
			return false;
		}
	}

	return false;
}
static void vfio_pci_probe_power_state(struct vfio_pci_device *vdev)
{
	struct pci_dev *pdev = vdev->pdev;
	u16 pmcsr;

	if (!pdev->pm_cap)
		return;

	pci_read_config_word(pdev, pdev->pm_cap + PCI_PM_CTRL, &pmcsr);

	vdev->needs_pm_restore = !(pmcsr & PCI_PM_CTRL_NO_SOFT_RESET);
}
/*
 * pci_set_power_state() wrapper handling devices which perform a soft reset on
 * D3->D0 transition.  Save state prior to D0/1/2->D3, stash it on the vdev,
 * restore when returned to D0.  Saved separately from pci_saved_state for use
 * by PM capability emulation and separately from pci_dev internal saved state
 * to avoid it being overwritten and consumed around other resets.
 */
int vfio_pci_set_power_state(struct vfio_pci_device *vdev, pci_power_t state)
{
	struct pci_dev *pdev = vdev->pdev;
	bool needs_restore = false, needs_save = false;
	int ret;

	if (vdev->needs_pm_restore) {
		if (pdev->current_state < PCI_D3hot && state >= PCI_D3hot) {
			pci_save_state(pdev);
			needs_save = true;
		}

		if (pdev->current_state >= PCI_D3hot && state <= PCI_D0)
			needs_restore = true;
	}

	ret = pci_set_power_state(pdev, state);

	if (!ret) {
		/* D3 might be unsupported via quirk, skip unless in D3 */
		if (needs_save && pdev->current_state >= PCI_D3hot) {
			vdev->pm_save = pci_store_saved_state(pdev);
		} else if (needs_restore) {
			pci_load_and_free_saved_state(pdev, &vdev->pm_save);
			pci_restore_state(pdev);
		}
	}

	return ret;
}
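/*
 * Bring the device to a usable state on first open: wake it to D0, reset it
 * if possible, snapshot config space for later restore, set up virtualized
 * config/MSI-X metadata, and probe any vendor-specific regions.
 */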
static int vfio_pci_enable(struct vfio_pci_device *vdev)
{
	struct pci_dev *pdev = vdev->pdev;
	int ret;
	u16 cmd;
	u8 msix_pos;

	vfio_pci_set_power_state(vdev, PCI_D0);

	/* Don't allow our initial saved state to include busmaster */
	pci_clear_master(pdev);

	ret = pci_enable_device(pdev);
	if (ret)
		return ret;

	/* If reset fails because of the device lock, fail this path entirely */
	ret = pci_try_reset_function(pdev);
	if (ret == -EAGAIN) {
		pci_disable_device(pdev);
		return ret;
	}

	vdev->reset_works = !ret;
	pci_save_state(pdev);
	vdev->pci_saved_state = pci_store_saved_state(pdev);
	if (!vdev->pci_saved_state)
		pci_dbg(pdev, "%s: Couldn't store saved state\n", __func__);

	if (likely(!nointxmask)) {
		if (vfio_pci_nointx(pdev)) {
			pci_info(pdev, "Masking broken INTx support\n");
			vdev->nointx = true;
			pci_intx(pdev, 0);
		} else
			vdev->pci_2_3 = pci_intx_mask_supported(pdev);
	}

	pci_read_config_word(pdev, PCI_COMMAND, &cmd);
	if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) {
		cmd &= ~PCI_COMMAND_INTX_DISABLE;
		pci_write_config_word(pdev, PCI_COMMAND, cmd);
	}

	ret = vfio_config_init(vdev);
	if (ret) {
		kfree(vdev->pci_saved_state);
		vdev->pci_saved_state = NULL;
		pci_disable_device(pdev);
		return ret;
	}

	msix_pos = pdev->msix_cap;
	if (msix_pos) {
		u16 flags;
		u32 table;

		pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags);
		pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table);

		vdev->msix_bar = table & PCI_MSIX_TABLE_BIR;
		vdev->msix_offset = table & PCI_MSIX_TABLE_OFFSET;
		vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16;
	} else
		vdev->msix_bar = 0xFF;

	if (!vfio_vga_disabled() && vfio_pci_is_vga(pdev))
		vdev->has_vga = true;

	if (vfio_pci_is_vga(pdev) &&
	    pdev->vendor == PCI_VENDOR_ID_INTEL &&
	    IS_ENABLED(CONFIG_VFIO_PCI_IGD)) {
		ret = vfio_pci_igd_init(vdev);
		if (ret) {
			pci_warn(pdev, "Failed to setup Intel IGD regions\n");
			goto disable_exit;
		}
	}

	if (pdev->vendor == PCI_VENDOR_ID_NVIDIA &&
	    IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) {
		ret = vfio_pci_nvdia_v100_nvlink2_init(vdev);
		if (ret && ret != -ENODEV) {
			pci_warn(pdev, "Failed to setup NVIDIA NV2 RAM region\n");
			goto disable_exit;
		}
	}

	if (pdev->vendor == PCI_VENDOR_ID_IBM &&
	    IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) {
		ret = vfio_pci_ibm_npu2_init(vdev);
		if (ret && ret != -ENODEV) {
			pci_warn(pdev, "Failed to setup NVIDIA NV2 ATSD region\n");
			goto disable_exit;
		}
	}

	vfio_pci_probe_mmaps(vdev);

	return 0;

disable_exit:
	vfio_pci_disable(vdev);
	return ret;
}
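/*
 * Tear down everything vfio_pci_enable() set up: quiesce DMA and interrupts,
 * release user regions and ioeventfds, unmap BARs, restore or reset config
 * space, and drop the device back to D3hot when idle D3 is allowed.
 */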
static void vfio_pci_disable(struct vfio_pci_device *vdev)
{
	struct pci_dev *pdev = vdev->pdev;
	struct vfio_pci_dummy_resource *dummy_res, *tmp;
	struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp;
	int i, bar;

	/* Stop the device from further DMA */
	pci_clear_master(pdev);

	vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
				VFIO_IRQ_SET_ACTION_TRIGGER,
				vdev->irq_type, 0, 0, NULL);

	/* Device closed, don't need mutex here */
	list_for_each_entry_safe(ioeventfd, ioeventfd_tmp,
				 &vdev->ioeventfds_list, next) {
		vfio_virqfd_disable(&ioeventfd->virqfd);
		list_del(&ioeventfd->next);
		kfree(ioeventfd);
	}
	vdev->ioeventfds_nr = 0;

	vdev->virq_disabled = false;

	for (i = 0; i < vdev->num_regions; i++)
		vdev->region[i].ops->release(vdev, &vdev->region[i]);

	vdev->num_regions = 0;
	kfree(vdev->region);
	vdev->region = NULL; /* don't krealloc a freed pointer */

	vfio_config_free(vdev);

	for (i = 0; i < PCI_STD_NUM_BARS; i++) {
		bar = i + PCI_STD_RESOURCES;
		if (!vdev->barmap[bar])
			continue;
		pci_iounmap(pdev, vdev->barmap[bar]);
		pci_release_selected_regions(pdev, 1 << bar);
		vdev->barmap[bar] = NULL;
	}

	list_for_each_entry_safe(dummy_res, tmp,
				 &vdev->dummy_resources_list, res_next) {
		list_del(&dummy_res->res_next);
		release_resource(&dummy_res->resource);
		kfree(dummy_res);
	}

	vdev->needs_reset = true;

	/*
	 * If we have saved state, restore it.  If we can reset the device,
	 * even better.  Resetting with current state seems better than
	 * nothing, but saving and restoring current state without reset
	 * is just busy work.
	 */
	if (pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state)) {
		pci_info(pdev, "%s: Couldn't reload saved state\n", __func__);

		if (!vdev->reset_works)
			goto out;

		pci_save_state(pdev);
	}

	/*
	 * Disable INTx and MSI, presumably to avoid spurious interrupts
	 * during reset.  Stolen from pci_reset_function()
	 */
	pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE);

	/*
	 * Try to get the locks ourselves to prevent a deadlock. The
	 * success of this is dependent on being able to lock the device,
	 * which is not always possible.
	 * We can not use the "try" reset interface here, which will
	 * overwrite the previously restored configuration information.
	 */
	if (vdev->reset_works && pci_cfg_access_trylock(pdev)) {
		if (device_trylock(&pdev->dev)) {
			if (!__pci_reset_function_locked(pdev))
				vdev->needs_reset = false;
			device_unlock(&pdev->dev);
		}
		pci_cfg_access_unlock(pdev);
	}

	pci_restore_state(pdev);
out:
	pci_disable_device(pdev);

	vfio_pci_try_bus_reset(vdev);

	if (!disable_idle_d3)
		vfio_pci_set_power_state(vdev, PCI_D3hot);
}
static struct pci_driver vfio_pci_driver;

static struct vfio_pci_device *get_pf_vdev(struct vfio_pci_device *vdev,
					   struct vfio_device **pf_dev)
{
	struct pci_dev *physfn = pci_physfn(vdev->pdev);

	if (!vdev->pdev->is_virtfn)
		return NULL;

	*pf_dev = vfio_device_get_from_dev(&physfn->dev);
	if (!*pf_dev)
		return NULL;

	if (pci_dev_driver(physfn) != &vfio_pci_driver) {
		vfio_device_put(*pf_dev);
		return NULL;
	}

	return vfio_device_data(*pf_dev);
}
static void vfio_pci_vf_token_user_add(struct vfio_pci_device *vdev, int val)
{
	struct vfio_device *pf_dev;
	struct vfio_pci_device *pf_vdev = get_pf_vdev(vdev, &pf_dev);

	if (!pf_vdev)
		return;

	mutex_lock(&pf_vdev->vf_token->lock);
	pf_vdev->vf_token->users += val;
	WARN_ON(pf_vdev->vf_token->users < 0);
	mutex_unlock(&pf_vdev->vf_token->lock);

	vfio_device_put(pf_dev);
}
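/*
 * Open and release are serialized against other devices in the same bus/slot
 * reset group via vdev->reflck->lock; the device is only enabled on the
 * first open and fully torn down on the last release.
 */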
static void vfio_pci_release(void *device_data)
{
	struct vfio_pci_device *vdev = device_data;

	mutex_lock(&vdev->reflck->lock);

	if (!(--vdev->refcnt)) {
		vfio_pci_vf_token_user_add(vdev, -1);
		vfio_spapr_pci_eeh_release(vdev->pdev);
		vfio_pci_disable(vdev);
	}

	mutex_unlock(&vdev->reflck->lock);

	module_put(THIS_MODULE);
}
static int vfio_pci_open(void *device_data)
{
	struct vfio_pci_device *vdev = device_data;
	int ret = 0;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	mutex_lock(&vdev->reflck->lock);

	if (!vdev->refcnt) {
		ret = vfio_pci_enable(vdev);
		if (ret)
			goto error;

		vfio_spapr_pci_eeh_open(vdev->pdev);
		vfio_pci_vf_token_user_add(vdev, 1);
	}
	vdev->refcnt++;
error:
	mutex_unlock(&vdev->reflck->lock);
	if (ret)
		module_put(THIS_MODULE);
	return ret;
}
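/*
 * Report how many interrupt vectors a given VFIO IRQ index can provide:
 * 1 for INTx (when present and usable), the vector count read from the
 * MSI/MSI-X capability, and 1 each for the error and request IRQs.
 */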
static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
{
	if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
		u8 pin;

		if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) ||
		    vdev->nointx || vdev->pdev->is_virtfn)
			return 0;

		pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin);

		return pin ? 1 : 0;
	} else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
		u8 pos;
		u16 flags;

		pos = vdev->pdev->msi_cap;
		if (pos) {
			pci_read_config_word(vdev->pdev,
					     pos + PCI_MSI_FLAGS, &flags);
			return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1);
		}
	} else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
		u8 pos;
		u16 flags;

		pos = vdev->pdev->msix_cap;
		if (pos) {
			pci_read_config_word(vdev->pdev,
					     pos + PCI_MSIX_FLAGS, &flags);

			return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
		}
	} else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) {
		if (pci_is_pcie(vdev->pdev))
			return 1;
	} else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) {
		return 1;
	}

	return 0;
}
static int vfio_pci_count_devs(struct pci_dev *pdev, void *data)
{
	(*(int *)data)++;
	return 0;
}

struct vfio_pci_fill_info {
	int max;
	int cur;
	struct vfio_pci_dependent_device *devices;
};

static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
{
	struct vfio_pci_fill_info *fill = data;
	struct iommu_group *iommu_group;

	if (fill->cur == fill->max)
		return -EAGAIN; /* Something changed, try again */

	iommu_group = iommu_group_get(&pdev->dev);
	if (!iommu_group)
		return -EPERM; /* Cannot reset non-isolated devices */

	fill->devices[fill->cur].group_id = iommu_group_id(iommu_group);
	fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus);
	fill->devices[fill->cur].bus = pdev->bus->number;
	fill->devices[fill->cur].devfn = pdev->devfn;
	fill->cur++;
	iommu_group_put(iommu_group);
	return 0;
}
struct vfio_pci_group_entry {
	struct vfio_group *group;
	int id;
};

struct vfio_pci_group_info {
	int count;
	struct vfio_pci_group_entry *groups;
};

static int vfio_pci_validate_devs(struct pci_dev *pdev, void *data)
{
	struct vfio_pci_group_info *info = data;
	struct iommu_group *group;
	int id, i = 0;

	group = iommu_group_get(&pdev->dev);
	if (!group)
		return -EPERM;

	id = iommu_group_id(group);

	for (i = 0; i < info->count; i++)
		if (info->groups[i].id == id)
			break;

	iommu_group_put(group);

	return (i == info->count) ? -EINVAL : 0;
}
static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot)
{
	for (; pdev; pdev = pdev->bus->self)
		if (pdev->bus == slot->bus)
			return (pdev->slot == slot);
	return false;
}
struct vfio_pci_walk_info {
	int (*fn)(struct pci_dev *, void *data);
	void *data;
	struct pci_dev *pdev;
	bool slot;
	int ret;
};

static int vfio_pci_walk_wrapper(struct pci_dev *pdev, void *data)
{
	struct vfio_pci_walk_info *walk = data;

	if (!walk->slot || vfio_pci_dev_below_slot(pdev, walk->pdev->slot))
		walk->ret = walk->fn(pdev, walk->data);

	return walk->ret;
}

static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
					 int (*fn)(struct pci_dev *,
						   void *data), void *data,
					 bool slot)
{
	struct vfio_pci_walk_info walk = {
		.fn = fn, .data = data, .pdev = pdev, .slot = slot, .ret = 0,
	};

	pci_walk_bus(pdev->bus, vfio_pci_walk_wrapper, &walk);

	return walk.ret;
}
static int msix_mmappable_cap(struct vfio_pci_device *vdev,
			      struct vfio_info_cap *caps)
{
	struct vfio_info_cap_header header = {
		.id = VFIO_REGION_INFO_CAP_MSIX_MAPPABLE,
		.version = 1
	};

	return vfio_info_add_capability(caps, &header, sizeof(header));
}
int vfio_pci_register_dev_region(struct vfio_pci_device *vdev,
				 unsigned int type, unsigned int subtype,
				 const struct vfio_pci_regops *ops,
				 size_t size, u32 flags, void *data)
{
	struct vfio_pci_region *region;

	region = krealloc(vdev->region,
			  (vdev->num_regions + 1) * sizeof(*region),
			  GFP_KERNEL);
	if (!region)
		return -ENOMEM;

	vdev->region = region;
	vdev->region[vdev->num_regions].type = type;
	vdev->region[vdev->num_regions].subtype = subtype;
	vdev->region[vdev->num_regions].ops = ops;
	vdev->region[vdev->num_regions].size = size;
	vdev->region[vdev->num_regions].flags = flags;
	vdev->region[vdev->num_regions].data = data;

	vdev->num_regions++;

	return 0;
}
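/*
 * Device ioctls: report device/region/IRQ info, configure interrupts, reset
 * the function, perform hot resets across a bus or slot, manage ioeventfds,
 * and handle device features such as the SR-IOV VF token.
 */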
static long vfio_pci_ioctl(void *device_data,
			   unsigned int cmd, unsigned long arg)
{
	struct vfio_pci_device *vdev = device_data;
	unsigned long minsz;

	if (cmd == VFIO_DEVICE_GET_INFO) {
		struct vfio_device_info info;

		minsz = offsetofend(struct vfio_device_info, num_irqs);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		info.flags = VFIO_DEVICE_FLAGS_PCI;

		if (vdev->reset_works)
			info.flags |= VFIO_DEVICE_FLAGS_RESET;

		info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions;
		info.num_irqs = VFIO_PCI_NUM_IRQS;

		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;

== VFIO_DEVICE_GET_REGION_INFO
) {
768 struct pci_dev
*pdev
= vdev
->pdev
;
769 struct vfio_region_info info
;
770 struct vfio_info_cap caps
= { .buf
= NULL
, .size
= 0 };
773 minsz
= offsetofend(struct vfio_region_info
, offset
);
775 if (copy_from_user(&info
, (void __user
*)arg
, minsz
))
778 if (info
.argsz
< minsz
)
781 switch (info
.index
) {
782 case VFIO_PCI_CONFIG_REGION_INDEX
:
783 info
.offset
= VFIO_PCI_INDEX_TO_OFFSET(info
.index
);
784 info
.size
= pdev
->cfg_size
;
785 info
.flags
= VFIO_REGION_INFO_FLAG_READ
|
786 VFIO_REGION_INFO_FLAG_WRITE
;
788 case VFIO_PCI_BAR0_REGION_INDEX
... VFIO_PCI_BAR5_REGION_INDEX
:
789 info
.offset
= VFIO_PCI_INDEX_TO_OFFSET(info
.index
);
790 info
.size
= pci_resource_len(pdev
, info
.index
);
796 info
.flags
= VFIO_REGION_INFO_FLAG_READ
|
797 VFIO_REGION_INFO_FLAG_WRITE
;
798 if (vdev
->bar_mmap_supported
[info
.index
]) {
799 info
.flags
|= VFIO_REGION_INFO_FLAG_MMAP
;
800 if (info
.index
== vdev
->msix_bar
) {
801 ret
= msix_mmappable_cap(vdev
, &caps
);
808 case VFIO_PCI_ROM_REGION_INDEX
:
814 info
.offset
= VFIO_PCI_INDEX_TO_OFFSET(info
.index
);
817 /* Report the BAR size, not the ROM size */
818 info
.size
= pci_resource_len(pdev
, info
.index
);
820 /* Shadow ROMs appear as PCI option ROMs */
821 if (pdev
->resource
[PCI_ROM_RESOURCE
].flags
&
822 IORESOURCE_ROM_SHADOW
)
829 * Is it really there? Enable memory decode for
830 * implicit access in pci_map_rom().
832 pci_read_config_word(pdev
, PCI_COMMAND
, &orig_cmd
);
833 pci_write_config_word(pdev
, PCI_COMMAND
,
834 orig_cmd
| PCI_COMMAND_MEMORY
);
836 io
= pci_map_rom(pdev
, &size
);
838 info
.flags
= VFIO_REGION_INFO_FLAG_READ
;
839 pci_unmap_rom(pdev
, io
);
844 pci_write_config_word(pdev
, PCI_COMMAND
, orig_cmd
);
847 case VFIO_PCI_VGA_REGION_INDEX
:
851 info
.offset
= VFIO_PCI_INDEX_TO_OFFSET(info
.index
);
853 info
.flags
= VFIO_REGION_INFO_FLAG_READ
|
854 VFIO_REGION_INFO_FLAG_WRITE
;
859 struct vfio_region_info_cap_type cap_type
= {
860 .header
.id
= VFIO_REGION_INFO_CAP_TYPE
,
861 .header
.version
= 1 };
864 VFIO_PCI_NUM_REGIONS
+ vdev
->num_regions
)
866 info
.index
= array_index_nospec(info
.index
,
867 VFIO_PCI_NUM_REGIONS
+
870 i
= info
.index
- VFIO_PCI_NUM_REGIONS
;
872 info
.offset
= VFIO_PCI_INDEX_TO_OFFSET(info
.index
);
873 info
.size
= vdev
->region
[i
].size
;
874 info
.flags
= vdev
->region
[i
].flags
;
876 cap_type
.type
= vdev
->region
[i
].type
;
877 cap_type
.subtype
= vdev
->region
[i
].subtype
;
879 ret
= vfio_info_add_capability(&caps
, &cap_type
.header
,
884 if (vdev
->region
[i
].ops
->add_capability
) {
885 ret
= vdev
->region
[i
].ops
->add_capability(vdev
,
886 &vdev
->region
[i
], &caps
);
894 info
.flags
|= VFIO_REGION_INFO_FLAG_CAPS
;
895 if (info
.argsz
< sizeof(info
) + caps
.size
) {
896 info
.argsz
= sizeof(info
) + caps
.size
;
899 vfio_info_cap_shift(&caps
, sizeof(info
));
900 if (copy_to_user((void __user
*)arg
+
901 sizeof(info
), caps
.buf
,
906 info
.cap_offset
= sizeof(info
);
912 return copy_to_user((void __user
*)arg
, &info
, minsz
) ?
915 } else if (cmd
== VFIO_DEVICE_GET_IRQ_INFO
) {
916 struct vfio_irq_info info
;
918 minsz
= offsetofend(struct vfio_irq_info
, count
);
920 if (copy_from_user(&info
, (void __user
*)arg
, minsz
))
923 if (info
.argsz
< minsz
|| info
.index
>= VFIO_PCI_NUM_IRQS
)
926 switch (info
.index
) {
927 case VFIO_PCI_INTX_IRQ_INDEX
... VFIO_PCI_MSIX_IRQ_INDEX
:
928 case VFIO_PCI_REQ_IRQ_INDEX
:
930 case VFIO_PCI_ERR_IRQ_INDEX
:
931 if (pci_is_pcie(vdev
->pdev
))
938 info
.flags
= VFIO_IRQ_INFO_EVENTFD
;
940 info
.count
= vfio_pci_get_irq_count(vdev
, info
.index
);
942 if (info
.index
== VFIO_PCI_INTX_IRQ_INDEX
)
943 info
.flags
|= (VFIO_IRQ_INFO_MASKABLE
|
944 VFIO_IRQ_INFO_AUTOMASKED
);
946 info
.flags
|= VFIO_IRQ_INFO_NORESIZE
;
948 return copy_to_user((void __user
*)arg
, &info
, minsz
) ?
951 } else if (cmd
== VFIO_DEVICE_SET_IRQS
) {
952 struct vfio_irq_set hdr
;
955 size_t data_size
= 0;
957 minsz
= offsetofend(struct vfio_irq_set
, count
);
959 if (copy_from_user(&hdr
, (void __user
*)arg
, minsz
))
962 max
= vfio_pci_get_irq_count(vdev
, hdr
.index
);
964 ret
= vfio_set_irqs_validate_and_prepare(&hdr
, max
,
965 VFIO_PCI_NUM_IRQS
, &data_size
);
970 data
= memdup_user((void __user
*)(arg
+ minsz
),
973 return PTR_ERR(data
);
976 mutex_lock(&vdev
->igate
);
978 ret
= vfio_pci_set_irqs_ioctl(vdev
, hdr
.flags
, hdr
.index
,
979 hdr
.start
, hdr
.count
, data
);
981 mutex_unlock(&vdev
->igate
);
986 } else if (cmd
== VFIO_DEVICE_RESET
) {
987 return vdev
->reset_works
?
988 pci_try_reset_function(vdev
->pdev
) : -EINVAL
;
	} else if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) {
		struct vfio_pci_hot_reset_info hdr;
		struct vfio_pci_fill_info fill = { 0 };
		struct vfio_pci_dependent_device *devices = NULL;
		bool slot = false;
		int ret = 0;

		minsz = offsetofend(struct vfio_pci_hot_reset_info, count);

		if (copy_from_user(&hdr, (void __user *)arg, minsz))
			return -EFAULT;

		if (hdr.argsz < minsz)
			return -EINVAL;

		hdr.flags = 0;

		/* Can we do a slot or bus reset or neither? */
		if (!pci_probe_reset_slot(vdev->pdev->slot))
			slot = true;
		else if (pci_probe_reset_bus(vdev->pdev->bus))
			return -ENODEV;

		/* How many devices are affected? */
		ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
						    vfio_pci_count_devs,
						    &fill.max, slot);
		if (ret)
			return ret;

		WARN_ON(!fill.max); /* Should always be at least one */

		/*
		 * If there's enough space, fill it now, otherwise return
		 * -ENOSPC and the number of devices affected.
		 */
		if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) {
			ret = -ENOSPC;
			hdr.count = fill.max;
			goto reset_info_exit;
		}

		devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL);
		if (!devices)
			return -ENOMEM;

		fill.devices = devices;

		ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
						    vfio_pci_fill_devs,
						    &fill, slot);

		/*
		 * If a device was removed between counting and filling,
		 * we may come up short of fill.max.  If a device was
		 * added, we'll have a return of -EAGAIN above.
		 */
		if (!ret)
			hdr.count = fill.cur;

reset_info_exit:
		if (copy_to_user((void __user *)arg, &hdr, minsz))
			ret = -EFAULT;

		if (!ret) {
			if (copy_to_user((void __user *)(arg + minsz), devices,
					 hdr.count * sizeof(*devices)))
				ret = -EFAULT;
		}

		kfree(devices);
		return ret;

== VFIO_DEVICE_PCI_HOT_RESET
) {
1064 struct vfio_pci_hot_reset hdr
;
1066 struct vfio_pci_group_entry
*groups
;
1067 struct vfio_pci_group_info info
;
1069 int i
, count
= 0, ret
= 0;
1071 minsz
= offsetofend(struct vfio_pci_hot_reset
, count
);
1073 if (copy_from_user(&hdr
, (void __user
*)arg
, minsz
))
1076 if (hdr
.argsz
< minsz
|| hdr
.flags
)
1079 /* Can we do a slot or bus reset or neither? */
1080 if (!pci_probe_reset_slot(vdev
->pdev
->slot
))
1082 else if (pci_probe_reset_bus(vdev
->pdev
->bus
))
1086 * We can't let userspace give us an arbitrarily large
1087 * buffer to copy, so verify how many we think there
1088 * could be. Note groups can have multiple devices so
1089 * one group per device is the max.
1091 ret
= vfio_pci_for_each_slot_or_bus(vdev
->pdev
,
1092 vfio_pci_count_devs
,
1097 /* Somewhere between 1 and count is OK */
1098 if (!hdr
.count
|| hdr
.count
> count
)
1101 group_fds
= kcalloc(hdr
.count
, sizeof(*group_fds
), GFP_KERNEL
);
1102 groups
= kcalloc(hdr
.count
, sizeof(*groups
), GFP_KERNEL
);
1103 if (!group_fds
|| !groups
) {
1109 if (copy_from_user(group_fds
, (void __user
*)(arg
+ minsz
),
1110 hdr
.count
* sizeof(*group_fds
))) {
1117 * For each group_fd, get the group through the vfio external
1118 * user interface and store the group and iommu ID. This
1119 * ensures the group is held across the reset.
1121 for (i
= 0; i
< hdr
.count
; i
++) {
1122 struct vfio_group
*group
;
1123 struct fd f
= fdget(group_fds
[i
]);
1129 group
= vfio_group_get_external_user(f
.file
);
1131 if (IS_ERR(group
)) {
1132 ret
= PTR_ERR(group
);
1136 groups
[i
].group
= group
;
1137 groups
[i
].id
= vfio_external_user_iommu_id(group
);
1142 /* release reference to groups on error */
1144 goto hot_reset_release
;
1146 info
.count
= hdr
.count
;
1147 info
.groups
= groups
;
1150 * Test whether all the affected devices are contained
1151 * by the set of groups provided by the user.
1153 ret
= vfio_pci_for_each_slot_or_bus(vdev
->pdev
,
1154 vfio_pci_validate_devs
,
1157 /* User has access, do the reset */
1158 ret
= pci_reset_bus(vdev
->pdev
);
1161 for (i
--; i
>= 0; i
--)
1162 vfio_group_put_external_user(groups
[i
].group
);
1166 } else if (cmd
== VFIO_DEVICE_IOEVENTFD
) {
1167 struct vfio_device_ioeventfd ioeventfd
;
1170 minsz
= offsetofend(struct vfio_device_ioeventfd
, fd
);
1172 if (copy_from_user(&ioeventfd
, (void __user
*)arg
, minsz
))
1175 if (ioeventfd
.argsz
< minsz
)
1178 if (ioeventfd
.flags
& ~VFIO_DEVICE_IOEVENTFD_SIZE_MASK
)
1181 count
= ioeventfd
.flags
& VFIO_DEVICE_IOEVENTFD_SIZE_MASK
;
1183 if (hweight8(count
) != 1 || ioeventfd
.fd
< -1)
1186 return vfio_pci_ioeventfd(vdev
, ioeventfd
.offset
,
1187 ioeventfd
.data
, count
, ioeventfd
.fd
);
	} else if (cmd == VFIO_DEVICE_FEATURE) {
		struct vfio_device_feature feature;
		uuid_t uuid;

		minsz = offsetofend(struct vfio_device_feature, flags);

		if (copy_from_user(&feature, (void __user *)arg, minsz))
			return -EFAULT;

		if (feature.argsz < minsz)
			return -EINVAL;

		/* Check unknown flags */
		if (feature.flags & ~(VFIO_DEVICE_FEATURE_MASK |
				      VFIO_DEVICE_FEATURE_SET |
				      VFIO_DEVICE_FEATURE_GET |
				      VFIO_DEVICE_FEATURE_PROBE))
			return -EINVAL;

		/* GET & SET are mutually exclusive except with PROBE */
		if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
		    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
		    (feature.flags & VFIO_DEVICE_FEATURE_GET))
			return -EINVAL;

		switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
		case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
			if (!vdev->vf_token)
				return -ENOTTY;

			/*
			 * We do not support GET of the VF Token UUID as this
			 * could expose the token of the previous device user.
			 */
			if (feature.flags & VFIO_DEVICE_FEATURE_GET)
				return -EINVAL;

			if (feature.flags & VFIO_DEVICE_FEATURE_PROBE)
				return 0;

			/* Don't SET unless told to do so */
			if (!(feature.flags & VFIO_DEVICE_FEATURE_SET))
				return -EINVAL;

			if (feature.argsz < minsz + sizeof(uuid))
				return -EINVAL;

			if (copy_from_user(&uuid, (void __user *)(arg + minsz),
					   sizeof(uuid)))
				return -EFAULT;

			mutex_lock(&vdev->vf_token->lock);
			uuid_copy(&vdev->vf_token->uuid, &uuid);
			mutex_unlock(&vdev->vf_token->lock);

			return 0;
		default:
			return -ENOTTY;
		}
	}

	return -ENOTTY;
}
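/*
 * Read/write dispatch: the region index encoded in the file offset selects
 * config space, a BAR, the ROM, VGA, or a device-specific region handler.
 */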
static ssize_t vfio_pci_rw(void *device_data, char __user *buf,
			   size_t count, loff_t *ppos, bool iswrite)
{
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	struct vfio_pci_device *vdev = device_data;

	if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
		return -EINVAL;

	switch (index) {
	case VFIO_PCI_CONFIG_REGION_INDEX:
		return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite);

	case VFIO_PCI_ROM_REGION_INDEX:
		if (iswrite)
			return -EINVAL;
		return vfio_pci_bar_rw(vdev, buf, count, ppos, false);

	case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
		return vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite);

	case VFIO_PCI_VGA_REGION_INDEX:
		return vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite);
	default:
		index -= VFIO_PCI_NUM_REGIONS;
		return vdev->region[index].ops->rw(vdev, buf,
						   count, ppos, iswrite);
	}

	return -EINVAL;
}
static ssize_t vfio_pci_read(void *device_data, char __user *buf,
			     size_t count, loff_t *ppos)
{
	if (!count)
		return 0;

	return vfio_pci_rw(device_data, buf, count, ppos, false);
}
static ssize_t vfio_pci_write(void *device_data, const char __user *buf,
			      size_t count, loff_t *ppos)
{
	if (!count)
		return 0;

	return vfio_pci_rw(device_data, (char __user *)buf, count, ppos, true);
}
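/*
 * mmap a BAR into userspace.  The region index is encoded in the upper bits
 * of the mmap offset (VFIO_PCI_OFFSET_SHIFT), so the handler validates the
 * requested range against the BAR and remaps the physical pages directly.
 */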
static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
{
	struct vfio_pci_device *vdev = device_data;
	struct pci_dev *pdev = vdev->pdev;
	unsigned int index;
	u64 phys_len, req_len, pgoff, req_start;
	int ret;

	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);

	if (vma->vm_end < vma->vm_start)
		return -EINVAL;
	if ((vma->vm_flags & VM_SHARED) == 0)
		return -EINVAL;
	if (index >= VFIO_PCI_NUM_REGIONS) {
		int regnum = index - VFIO_PCI_NUM_REGIONS;
		struct vfio_pci_region *region = vdev->region + regnum;

		if (region && region->ops && region->ops->mmap &&
		    (region->flags & VFIO_REGION_INFO_FLAG_MMAP))
			return region->ops->mmap(vdev, region, vma);
		return -EINVAL;
	}
	if (index >= VFIO_PCI_ROM_REGION_INDEX)
		return -EINVAL;
	if (!vdev->bar_mmap_supported[index])
		return -EINVAL;

	phys_len = PAGE_ALIGN(pci_resource_len(pdev, index));
	req_len = vma->vm_end - vma->vm_start;
	pgoff = vma->vm_pgoff &
		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
	req_start = pgoff << PAGE_SHIFT;

	if (req_start + req_len > phys_len)
		return -EINVAL;

	/*
	 * Even though we don't make use of the barmap for the mmap,
	 * we need to request the region and the barmap tracks that.
	 */
	if (!vdev->barmap[index]) {
		ret = pci_request_selected_regions(pdev,
						   1 << index, "vfio-pci");
		if (ret)
			return ret;

		vdev->barmap[index] = pci_iomap(pdev, index, 0);
		if (!vdev->barmap[index]) {
			pci_release_selected_regions(pdev, 1 << index);
			return -ENOMEM;
		}
	}

	vma->vm_private_data = vdev;
	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
	vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;

	return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
			       req_len, vma->vm_page_prot);
}
static void vfio_pci_request(void *device_data, unsigned int count)
{
	struct vfio_pci_device *vdev = device_data;
	struct pci_dev *pdev = vdev->pdev;

	mutex_lock(&vdev->igate);

	if (vdev->req_trigger) {
		if (!(count % 10))
			pci_notice_ratelimited(pdev,
				"Relaying device request to user (#%u)\n",
				count);
		eventfd_signal(vdev->req_trigger, 1);
	} else if (count == 0) {
		pci_warn(pdev,
			"No device request channel registered, blocked until released by user\n");
	}

	mutex_unlock(&vdev->igate);
}
static int vfio_pci_validate_vf_token(struct vfio_pci_device *vdev,
				      bool vf_token, uuid_t *uuid)
{
	/*
	 * There's always some degree of trust or collaboration between SR-IOV
	 * PF and VFs, even if just that the PF hosts the SR-IOV capability and
	 * can disrupt VFs with a reset, but often the PF has more explicit
	 * access to deny service to the VF or access data passed through the
	 * VF.  We therefore require an opt-in via a shared VF token (UUID) to
	 * represent this trust.  This both prevents that a VF driver might
	 * assume the PF driver is a trusted, in-kernel driver, and also that
	 * a PF driver might be replaced with a rogue driver, unknown to in-use
	 * VF drivers.
	 *
	 * Therefore when presented with a VF, if the PF is a vfio device and
	 * it is bound to the vfio-pci driver, the user needs to provide a VF
	 * token to access the device, in the form of appending a vf_token to
	 * the device name, for example:
	 *
	 * "0000:04:10.0 vf_token=bd8d9d2b-5a5f-4f5a-a211-f591514ba1f3"
	 *
	 * When presented with a PF which has VFs in use, the user must also
	 * provide the current VF token to prove collaboration with existing
	 * VF users.  If VFs are not in use, the VF token provided for the PF
	 * device will act to set the VF token.
	 *
	 * If the VF token is provided but unused, an error is generated.
	 */
	if (!vdev->pdev->is_virtfn && !vdev->vf_token && !vf_token)
		return 0; /* No VF token provided or required */

	if (vdev->pdev->is_virtfn) {
		struct vfio_device *pf_dev;
		struct vfio_pci_device *pf_vdev = get_pf_vdev(vdev, &pf_dev);
		bool match;

		if (!pf_vdev) {
			if (!vf_token)
				return 0; /* PF is not vfio-pci, no VF token */

			pci_info_ratelimited(vdev->pdev,
				"VF token incorrectly provided, PF not bound to vfio-pci\n");
			return -EINVAL;
		}

		if (!vf_token) {
			vfio_device_put(pf_dev);
			pci_info_ratelimited(vdev->pdev,
				"VF token required to access device\n");
			return -EACCES;
		}

		mutex_lock(&pf_vdev->vf_token->lock);
		match = uuid_equal(uuid, &pf_vdev->vf_token->uuid);
		mutex_unlock(&pf_vdev->vf_token->lock);

		vfio_device_put(pf_dev);

		if (!match) {
			pci_info_ratelimited(vdev->pdev,
				"Incorrect VF token provided for device\n");
			return -EACCES;
		}
	} else if (vdev->vf_token) {
		mutex_lock(&vdev->vf_token->lock);
		if (vdev->vf_token->users) {
			if (!vf_token) {
				mutex_unlock(&vdev->vf_token->lock);
				pci_info_ratelimited(vdev->pdev,
					"VF token required to access device\n");
				return -EACCES;
			}

			if (!uuid_equal(uuid, &vdev->vf_token->uuid)) {
				mutex_unlock(&vdev->vf_token->lock);
				pci_info_ratelimited(vdev->pdev,
					"Incorrect VF token provided for device\n");
				return -EACCES;
			}
		} else if (vf_token) {
			uuid_copy(&vdev->vf_token->uuid, uuid);
		}

		mutex_unlock(&vdev->vf_token->lock);
	} else if (vf_token) {
		pci_info_ratelimited(vdev->pdev,
			"VF token incorrectly provided, not a PF or VF\n");
		return -EINVAL;
	}

	return 0;
}
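/*
 * Match strings take the form "<PCI address>[ vf_token=<UUID>]"; the
 * optional token is parsed here and validated above before the open is
 * allowed to proceed.
 */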
#define VF_TOKEN_ARG "vf_token="

static int vfio_pci_match(void *device_data, char *buf)
{
	struct vfio_pci_device *vdev = device_data;
	bool vf_token = false;
	uuid_t uuid;
	int ret;

	if (strncmp(pci_name(vdev->pdev), buf, strlen(pci_name(vdev->pdev))))
		return 0; /* No match */

	if (strlen(buf) > strlen(pci_name(vdev->pdev))) {
		buf += strlen(pci_name(vdev->pdev));

		if (*buf != ' ')
			return 0; /* No match: non-whitespace after name */

		while (*buf) {
			if (*buf == ' ') {
				buf++;
				continue;
			}

			if (!vf_token && !strncmp(buf, VF_TOKEN_ARG,
						  strlen(VF_TOKEN_ARG))) {
				buf += strlen(VF_TOKEN_ARG);

				if (strlen(buf) < UUID_STRING_LEN)
					return -EINVAL;

				ret = uuid_parse(buf, &uuid);
				if (ret)
					return ret;

				vf_token = true;
				buf += UUID_STRING_LEN;
			} else {
				/* Unknown/duplicate option */
				return -EINVAL;
			}
		}
	}

	ret = vfio_pci_validate_vf_token(vdev, vf_token, &uuid);
	if (ret)
		return ret;

	return 1; /* Match */
}
static const struct vfio_device_ops vfio_pci_ops = {
	.name		= "vfio-pci",
	.open		= vfio_pci_open,
	.release	= vfio_pci_release,
	.ioctl		= vfio_pci_ioctl,
	.read		= vfio_pci_read,
	.write		= vfio_pci_write,
	.mmap		= vfio_pci_mmap,
	.request	= vfio_pci_request,
	.match		= vfio_pci_match,
};
static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev);
static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck);
static struct pci_driver vfio_pci_driver;
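/*
 * When SR-IOV is enabled on a PF we own, newly created VFs appear on the PCI
 * bus.  This notifier sets driver_override on those VFs so they bind to
 * vfio-pci by default, and warns if a VF ends up bound to another driver
 * while its PF is user-owned.
 */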
static int vfio_pci_bus_notifier(struct notifier_block *nb,
				 unsigned long action, void *data)
{
	struct vfio_pci_device *vdev = container_of(nb,
						    struct vfio_pci_device, nb);
	struct device *dev = data;
	struct pci_dev *pdev = to_pci_dev(dev);
	struct pci_dev *physfn = pci_physfn(pdev);

	if (action == BUS_NOTIFY_ADD_DEVICE &&
	    pdev->is_virtfn && physfn == vdev->pdev) {
		pci_info(vdev->pdev, "Captured SR-IOV VF %s driver_override\n",
			 pci_name(pdev));
		pdev->driver_override = kasprintf(GFP_KERNEL, "%s",
						  vfio_pci_ops.name);
	} else if (action == BUS_NOTIFY_BOUND_DRIVER &&
		   pdev->is_virtfn && physfn == vdev->pdev) {
		struct pci_driver *drv = pci_dev_driver(pdev);

		if (drv && drv != &vfio_pci_driver)
			pci_warn(vdev->pdev,
				 "VF %s bound to driver %s while PF bound to vfio-pci\n",
				 pci_name(pdev), drv->name);
	}

	return 0;
}
static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	struct vfio_pci_device *vdev;
	struct iommu_group *group;
	int ret;

	if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
		return -EINVAL;

	/*
	 * Prevent binding to PFs with VFs enabled, the VFs might be in use
	 * by the host or other users.  We cannot capture the VFs if they
	 * already exist, nor can we track VF users.  Disabling SR-IOV here
	 * would initiate removing the VFs, which would unbind the driver,
	 * which is prone to blocking if that VF is also in use by vfio-pci.
	 * Just reject these PFs and let the user sort it out.
	 */
	if (pci_num_vf(pdev)) {
		pci_warn(pdev, "Cannot bind to PF with SR-IOV enabled\n");
		return -EBUSY;
	}

	group = vfio_iommu_group_get(&pdev->dev);
	if (!group)
		return -EINVAL;

	vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
	if (!vdev) {
		ret = -ENOMEM;
		goto out_group_put;
	}

	vdev->pdev = pdev;
	vdev->irq_type = VFIO_PCI_NUM_IRQS;
	mutex_init(&vdev->igate);
	spin_lock_init(&vdev->irqlock);
	mutex_init(&vdev->ioeventfds_lock);
	INIT_LIST_HEAD(&vdev->ioeventfds_list);

	ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
	if (ret)
		goto out_free;

	ret = vfio_pci_reflck_attach(vdev);
	if (ret)
		goto out_del_group_dev;

	if (pdev->is_physfn) {
		vdev->vf_token = kzalloc(sizeof(*vdev->vf_token), GFP_KERNEL);
		if (!vdev->vf_token) {
			ret = -ENOMEM;
			goto out_reflck;
		}

		mutex_init(&vdev->vf_token->lock);
		uuid_gen(&vdev->vf_token->uuid);

		vdev->nb.notifier_call = vfio_pci_bus_notifier;
		ret = bus_register_notifier(&pci_bus_type, &vdev->nb);
		if (ret)
			goto out_vf_token;
	}

	if (vfio_pci_is_vga(pdev)) {
		vga_client_register(pdev, vdev, NULL, vfio_pci_set_vga_decode);
		vga_set_legacy_decoding(pdev,
					vfio_pci_set_vga_decode(vdev, false));
	}

	vfio_pci_probe_power_state(vdev);

	if (!disable_idle_d3) {
		/*
		 * pci-core sets the device power state to an unknown value at
		 * bootup and after being removed from a driver.  The only
		 * transition it allows from this unknown state is to D0, which
		 * typically happens when a driver calls pci_enable_device().
		 * We're not ready to enable the device yet, but we do want to
		 * be able to get to D3.  Therefore first do a D0 transition
		 * before going to D3.
		 */
		vfio_pci_set_power_state(vdev, PCI_D0);
		vfio_pci_set_power_state(vdev, PCI_D3hot);
	}

	return 0;

out_vf_token:
	kfree(vdev->vf_token);
out_reflck:
	vfio_pci_reflck_put(vdev->reflck);
out_del_group_dev:
	vfio_del_group_dev(&pdev->dev);
out_free:
	kfree(vdev);
out_group_put:
	vfio_iommu_group_put(group, &pdev->dev);
	return ret;
}
static void vfio_pci_remove(struct pci_dev *pdev)
{
	struct vfio_pci_device *vdev;

	pci_disable_sriov(pdev);

	vdev = vfio_del_group_dev(&pdev->dev);
	if (!vdev)
		return;

	if (vdev->vf_token) {
		WARN_ON(vdev->vf_token->users);
		mutex_destroy(&vdev->vf_token->lock);
		kfree(vdev->vf_token);
	}

	if (vdev->nb.notifier_call)
		bus_unregister_notifier(&pci_bus_type, &vdev->nb);

	vfio_pci_reflck_put(vdev->reflck);

	vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev);
	kfree(vdev->region);
	mutex_destroy(&vdev->ioeventfds_lock);

	if (!disable_idle_d3)
		vfio_pci_set_power_state(vdev, PCI_D0);

	kfree(vdev->pm_save);
	kfree(vdev);

	if (vfio_pci_is_vga(pdev)) {
		vga_client_register(pdev, NULL, NULL, NULL);
		vga_set_legacy_decoding(pdev,
				VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
				VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM);
	}
}
static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
						  pci_channel_state_t state)
{
	struct vfio_pci_device *vdev;
	struct vfio_device *device;

	device = vfio_device_get_from_dev(&pdev->dev);
	if (device == NULL)
		return PCI_ERS_RESULT_DISCONNECT;

	vdev = vfio_device_data(device);
	if (vdev == NULL) {
		vfio_device_put(device);
		return PCI_ERS_RESULT_DISCONNECT;
	}

	mutex_lock(&vdev->igate);

	if (vdev->err_trigger)
		eventfd_signal(vdev->err_trigger, 1);

	mutex_unlock(&vdev->igate);

	vfio_device_put(device);

	return PCI_ERS_RESULT_CAN_RECOVER;
}
static int vfio_pci_sriov_configure(struct pci_dev *pdev, int nr_virtfn)
{
	struct vfio_pci_device *vdev;
	struct vfio_device *device;
	int ret = 0;

	if (!enable_sriov)
		return -ENOENT;

	device = vfio_device_get_from_dev(&pdev->dev);
	if (!device)
		return -ENODEV;

	vdev = vfio_device_data(device);
	if (!vdev) {
		vfio_device_put(device);
		return -ENODEV;
	}

	if (nr_virtfn == 0)
		pci_disable_sriov(pdev);
	else
		ret = pci_enable_sriov(pdev, nr_virtfn);

	vfio_device_put(device);

	return ret < 0 ? ret : nr_virtfn;
}
static const struct pci_error_handlers vfio_err_handlers = {
	.error_detected = vfio_pci_aer_err_detected,
};

static struct pci_driver vfio_pci_driver = {
	.name			= "vfio-pci",
	.id_table		= NULL, /* only dynamic ids */
	.probe			= vfio_pci_probe,
	.remove			= vfio_pci_remove,
	.sriov_configure	= vfio_pci_sriov_configure,
	.err_handler		= &vfio_err_handlers,
};
static DEFINE_MUTEX(reflck_lock);

static struct vfio_pci_reflck *vfio_pci_reflck_alloc(void)
{
	struct vfio_pci_reflck *reflck;

	reflck = kzalloc(sizeof(*reflck), GFP_KERNEL);
	if (!reflck)
		return ERR_PTR(-ENOMEM);

	kref_init(&reflck->kref);
	mutex_init(&reflck->lock);

	return reflck;
}

static void vfio_pci_reflck_get(struct vfio_pci_reflck *reflck)
{
	kref_get(&reflck->kref);
}
static int vfio_pci_reflck_find(struct pci_dev *pdev, void *data)
{
	struct vfio_pci_reflck **preflck = data;
	struct vfio_device *device;
	struct vfio_pci_device *vdev;

	device = vfio_device_get_from_dev(&pdev->dev);
	if (!device)
		return 0;

	if (pci_dev_driver(pdev) != &vfio_pci_driver) {
		vfio_device_put(device);
		return 0;
	}

	vdev = vfio_device_data(device);

	if (vdev->reflck) {
		vfio_pci_reflck_get(vdev->reflck);
		*preflck = vdev->reflck;
		vfio_device_put(device);
		return 1;
	}

	vfio_device_put(device);
	return 0;
}
static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev)
{
	bool slot = !pci_probe_reset_slot(vdev->pdev->slot);

	mutex_lock(&reflck_lock);

	if (pci_is_root_bus(vdev->pdev->bus) ||
	    vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_reflck_find,
					  &vdev->reflck, slot) <= 0)
		vdev->reflck = vfio_pci_reflck_alloc();

	mutex_unlock(&reflck_lock);

	return PTR_ERR_OR_ZERO(vdev->reflck);
}
static void vfio_pci_reflck_release(struct kref *kref)
{
	struct vfio_pci_reflck *reflck = container_of(kref,
						      struct vfio_pci_reflck,
						      kref);

	kfree(reflck);
	mutex_unlock(&reflck_lock);
}

static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck)
{
	kref_put_mutex(&reflck->kref, vfio_pci_reflck_release, &reflck_lock);
}
struct vfio_devices {
	struct vfio_device **devices;
	int cur_index;
	int max_index;
};

static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data)
{
	struct vfio_devices *devs = data;
	struct vfio_device *device;
	struct vfio_pci_device *vdev;

	if (devs->cur_index == devs->max_index)
		return -ENOSPC;

	device = vfio_device_get_from_dev(&pdev->dev);
	if (!device)
		return -EINVAL;

	if (pci_dev_driver(pdev) != &vfio_pci_driver) {
		vfio_device_put(device);
		return -EBUSY;
	}

	vdev = vfio_device_data(device);

	/* Fault if the device is not unused */
	if (vdev->refcnt) {
		vfio_device_put(device);
		return -EBUSY;
	}

	devs->devices[devs->cur_index++] = device;
	return 0;
}
/*
 * If a bus or slot reset is available for the provided device and:
 *  - All of the devices affected by that bus or slot reset are unused
 *    (!refcnt)
 *  - At least one of the affected devices is marked dirty via
 *    needs_reset (such as by lack of FLR support)
 * Then attempt to perform that bus or slot reset.  Callers are required
 * to hold vdev->reflck->lock, protecting the bus/slot reset group from
 * concurrent opens.  A vfio_device reference is acquired for each device
 * to prevent unbinds during the reset operation.
 *
 * NB: vfio-core considers a group to be viable even if some devices are
 * bound to drivers like pci-stub or pcieport.  Here we require all devices
 * to be bound to vfio_pci since that's the only way we can be sure they
 * stay put.
 */
static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev)
{
	struct vfio_devices devs = { .cur_index = 0 };
	int i = 0, ret = -EINVAL;
	bool slot = false;
	struct vfio_pci_device *tmp;

	if (!pci_probe_reset_slot(vdev->pdev->slot))
		slot = true;
	else if (pci_probe_reset_bus(vdev->pdev->bus))
		return;

	if (vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs,
					  &i, slot) || !i)
		return;

	devs.max_index = i;
	devs.devices = kcalloc(i, sizeof(struct vfio_device *), GFP_KERNEL);
	if (!devs.devices)
		return;

	if (vfio_pci_for_each_slot_or_bus(vdev->pdev,
					  vfio_pci_get_unused_devs,
					  &devs, slot))
		goto put_devs;

	/* Does at least one need a reset? */
	for (i = 0; i < devs.cur_index; i++) {
		tmp = vfio_device_data(devs.devices[i]);
		if (tmp->needs_reset) {
			ret = pci_reset_bus(vdev->pdev);
			break;
		}
	}

put_devs:
	for (i = 0; i < devs.cur_index; i++) {
		tmp = vfio_device_data(devs.devices[i]);

		/*
		 * If reset was successful, affected devices no longer need
		 * a reset and we should return all the collateral devices
		 * to low power.  If not successful, we either didn't reset
		 * the bus or timed out waiting for it, so let's not touch
		 * the devices.
		 */
		if (!ret) {
			tmp->needs_reset = false;

			if (tmp != vdev && !disable_idle_d3)
				vfio_pci_set_power_state(tmp, PCI_D3hot);
		}

		vfio_device_put(devs.devices[i]);
	}

	kfree(devs.devices);
}
static void __exit vfio_pci_cleanup(void)
{
	pci_unregister_driver(&vfio_pci_driver);
	vfio_pci_uninit_perm_bits();
}
static void __init vfio_pci_fill_ids(void)
{
	char *p, *id;
	int rc;

	/* no ids passed actually */
	if (ids[0] == '\0')
		return;

	/* add ids specified in the module parameter */
	p = ids;
	while ((id = strsep(&p, ","))) {
		unsigned int vendor, device, subvendor = PCI_ANY_ID,
			subdevice = PCI_ANY_ID, class = 0, class_mask = 0;
		int fields;

		if (!strlen(id))
			continue;

		fields = sscanf(id, "%x:%x:%x:%x:%x:%x",
				&vendor, &device, &subvendor, &subdevice,
				&class, &class_mask);

		if (fields < 2) {
			pr_warn("invalid id string \"%s\"\n", id);
			continue;
		}

		rc = pci_add_dynid(&vfio_pci_driver, vendor, device,
				   subvendor, subdevice, class, class_mask, 0);
		if (rc)
			pr_warn("failed to add dynamic id [%04x:%04x[%04x:%04x]] class %#08x/%08x (%d)\n",
				vendor, device, subvendor, subdevice,
				class, class_mask, rc);
		else
			pr_info("add [%04x:%04x[%04x:%04x]] class %#08x/%08x\n",
				vendor, device, subvendor, subdevice,
				class, class_mask);
	}
}
static int __init vfio_pci_init(void)
{
	int ret;

	/* Allocate shared config space permission data used by all devices */
	ret = vfio_pci_init_perm_bits();
	if (ret)
		return ret;

	/* Register and scan for devices */
	ret = pci_register_driver(&vfio_pci_driver);
	if (ret)
		goto out_driver;

	vfio_pci_fill_ids();

	return 0;

out_driver:
	vfio_pci_uninit_perm_bits();
	return ret;
}
module_init(vfio_pci_init);
module_exit(vfio_pci_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);