// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
 * Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/device.h>
#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/interrupt.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/vgaarb.h>
#include <linux/nospec.h>
#include <linux/sched/mm.h>

#include "vfio_pci_private.h"

#define DRIVER_VERSION	"0.2"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"VFIO PCI - User Level meta-driver"
static char ids[1024] __initdata;
module_param_string(ids, ids, sizeof(ids), 0);
MODULE_PARM_DESC(ids, "Initial PCI IDs to add to the vfio driver, format is \"vendor:device[:subvendor[:subdevice[:class[:class_mask]]]]\" and multiple comma separated entries can be specified");

static bool nointxmask;
module_param_named(nointxmask, nointxmask, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(nointxmask,
		 "Disable support for PCI 2.3 style INTx masking. If this resolves problems for specific devices, report lspci -vvvxxx to linux-pci@vger.kernel.org so the device can be fixed automatically via the broken_intx_masking flag.");

#ifdef CONFIG_VFIO_PCI_VGA
static bool disable_vga;
module_param(disable_vga, bool, S_IRUGO);
MODULE_PARM_DESC(disable_vga, "Disable VGA resource access through vfio-pci");

static bool disable_idle_d3;
module_param(disable_idle_d3, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(disable_idle_d3,
		 "Disable using the PCI D3 low power state for idle, unused devices");

static bool enable_sriov;
module_param(enable_sriov, bool, 0644);
MODULE_PARM_DESC(enable_sriov, "Enable support for SR-IOV configuration. Enabling SR-IOV on a PF typically requires support of the userspace PF driver, enabling VFs without such support may result in non-functional VFs or PF.");

static bool disable_denylist;
module_param(disable_denylist, bool, 0444);
MODULE_PARM_DESC(disable_denylist, "Disable use of device denylist. Disabling the denylist allows binding to devices with known errata that may lead to exploitable stability or security issues when accessed by untrusted users.");
static inline bool vfio_vga_disabled(void)
#ifdef CONFIG_VFIO_PCI_VGA

static bool vfio_pci_dev_in_denylist(struct pci_dev *pdev)
	switch (pdev->vendor) {
	case PCI_VENDOR_ID_INTEL:
		switch (pdev->device) {
		case PCI_DEVICE_ID_INTEL_QAT_C3XXX:
		case PCI_DEVICE_ID_INTEL_QAT_C3XXX_VF:
		case PCI_DEVICE_ID_INTEL_QAT_C62X:
		case PCI_DEVICE_ID_INTEL_QAT_C62X_VF:
		case PCI_DEVICE_ID_INTEL_QAT_DH895XCC:
		case PCI_DEVICE_ID_INTEL_QAT_DH895XCC_VF:

static bool vfio_pci_is_denylisted(struct pci_dev *pdev)
	if (!vfio_pci_dev_in_denylist(pdev))

	if (disable_denylist) {
			 "device denylist disabled - allowing device %04x:%04x.\n",
			 pdev->vendor, pdev->device);

	pci_warn(pdev, "%04x:%04x exists in vfio-pci device denylist, driver probing disallowed.\n",
		 pdev->vendor, pdev->device);
/*
 * Our VGA arbiter participation is limited since we don't know anything
 * about the device itself. However, if the device is the only VGA device
 * downstream of a bridge and VFIO VGA support is disabled, then we can
 * safely return legacy VGA IO and memory as not decoded since the user
 * has no way to get to it and routing can be disabled externally at the
 */
static unsigned int vfio_pci_set_vga_decode(void *opaque, bool single_vga)
	struct vfio_pci_device *vdev = opaque;
	struct pci_dev *tmp = NULL, *pdev = vdev->pdev;
	unsigned char max_busnr;
	unsigned int decodes;

	if (single_vga || !vfio_vga_disabled() || pci_is_root_bus(pdev->bus))
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
		       VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;

	max_busnr = pci_bus_max_busnr(pdev->bus);
	decodes = VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;

	while ((tmp = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, tmp)) != NULL) {
		    pci_domain_nr(tmp->bus) != pci_domain_nr(pdev->bus) ||
		    pci_is_root_bus(tmp->bus))

		if (tmp->bus->number >= pdev->bus->number &&
		    tmp->bus->number <= max_busnr) {
			decodes |= VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;
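/*
 * Restated for clarity (not part of the original source): the loop above
 * only keeps the legacy VGA ranges flagged as decoded when another
 * VGA-class device sits within this device's secondary bus range; when no
 * such device is found, only VGA_RSRC_NORMAL_IO/MEM are reported, letting
 * legacy routing be dropped as described in the comment above
 * vfio_pci_set_vga_decode().
 */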
static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
	return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA;

static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
	struct resource *res;
	struct vfio_pci_dummy_resource *dummy_res;

	for (i = 0; i < PCI_STD_NUM_BARS; i++) {
		int bar = i + PCI_STD_RESOURCES;

		res = &vdev->pdev->resource[bar];

		if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))

		if (!(res->flags & IORESOURCE_MEM))

		/*
		 * The PCI core shouldn't set up a resource with a
		 * type but zero size. But there may be bugs that
		 * cause us to do that.
		 */
		if (!resource_size(res))

		if (resource_size(res) >= PAGE_SIZE) {
			vdev->bar_mmap_supported[bar] = true;

		if (!(res->start & ~PAGE_MASK)) {
			/*
			 * Add a dummy resource to reserve the remainder
			 * of the exclusive page in case that hot-add
			 * device's bar is assigned into it.
			 */
			dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL);
			if (dummy_res == NULL)

			dummy_res->resource.name = "vfio sub-page reserved";
			dummy_res->resource.start = res->end + 1;
			dummy_res->resource.end = res->start + PAGE_SIZE - 1;
			dummy_res->resource.flags = res->flags;
			if (request_resource(res->parent,
					     &dummy_res->resource)) {
			dummy_res->index = bar;
			list_add(&dummy_res->res_next,
				 &vdev->dummy_resources_list);
			vdev->bar_mmap_supported[bar] = true;

		/*
		 * Here we don't handle the case when the BAR is not page
		 * aligned because we can't expect the BAR will be
		 * assigned into the same location in a page in guest
		 * when we passthrough the BAR. And it's hard to access
		 * this BAR in userspace because we have no way to get
		 * the BAR's location in a page.
		 */
		vdev->bar_mmap_supported[bar] = false;
static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev);
static void vfio_pci_disable(struct vfio_pci_device *vdev);
static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data);
/*
 * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND
 * _and_ the ability to detect when the device is asserting INTx via PCI_STATUS.
 * If a device implements the former but not the latter we would typically
 * expect broken_intx_masking to be set and require an exclusive interrupt.
 * However, since we do have control of the device's ability to assert INTx,
 * we can instead pretend that the device does not implement INTx, virtualizing
 * the pin register to report zero and maintaining DisINTx set on the host.
 */
static bool vfio_pci_nointx(struct pci_dev *pdev)
	switch (pdev->vendor) {
	case PCI_VENDOR_ID_INTEL:
		switch (pdev->device) {
		/* All i40e (XL710/X710/XXV710) 10/20/25/40GbE NICs */
		case 0x1580 ... 0x1581:
		case 0x1583 ... 0x158b:
		case 0x37d0 ... 0x37d2:

static void vfio_pci_probe_power_state(struct vfio_pci_device *vdev)
	struct pci_dev *pdev = vdev->pdev;

	pci_read_config_word(pdev, pdev->pm_cap + PCI_PM_CTRL, &pmcsr);

	vdev->needs_pm_restore = !(pmcsr & PCI_PM_CTRL_NO_SOFT_RESET);

/*
 * pci_set_power_state() wrapper handling devices which perform a soft reset on
 * D3->D0 transition. Save state prior to D0/1/2->D3, stash it on the vdev,
 * restore when returned to D0. Saved separately from pci_saved_state for use
 * by PM capability emulation and separately from pci_dev internal saved state
 * to avoid it being overwritten and consumed around other resets.
 */
int vfio_pci_set_power_state(struct vfio_pci_device *vdev, pci_power_t state)
	struct pci_dev *pdev = vdev->pdev;
	bool needs_restore = false, needs_save = false;

	if (vdev->needs_pm_restore) {
		if (pdev->current_state < PCI_D3hot && state >= PCI_D3hot) {
			pci_save_state(pdev);

		if (pdev->current_state >= PCI_D3hot && state <= PCI_D0)
			needs_restore = true;

	ret = pci_set_power_state(pdev, state);

	/* D3 might be unsupported via quirk, skip unless in D3 */
	if (needs_save && pdev->current_state >= PCI_D3hot) {
		vdev->pm_save = pci_store_saved_state(pdev);
	} else if (needs_restore) {
		pci_load_and_free_saved_state(pdev, &vdev->pm_save);
		pci_restore_state(pdev);
static int vfio_pci_enable(struct vfio_pci_device *vdev)
	struct pci_dev *pdev = vdev->pdev;

	vfio_pci_set_power_state(vdev, PCI_D0);

	/* Don't allow our initial saved state to include busmaster */
	pci_clear_master(pdev);

	ret = pci_enable_device(pdev);

	/* If reset fails because of the device lock, fail this path entirely */
	ret = pci_try_reset_function(pdev);
	if (ret == -EAGAIN) {
		pci_disable_device(pdev);

	vdev->reset_works = !ret;
	pci_save_state(pdev);
	vdev->pci_saved_state = pci_store_saved_state(pdev);
	if (!vdev->pci_saved_state)
		pci_dbg(pdev, "%s: Couldn't store saved state\n", __func__);

	if (likely(!nointxmask)) {
		if (vfio_pci_nointx(pdev)) {
			pci_info(pdev, "Masking broken INTx support\n");

		vdev->pci_2_3 = pci_intx_mask_supported(pdev);

	pci_read_config_word(pdev, PCI_COMMAND, &cmd);
	if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) {
		cmd &= ~PCI_COMMAND_INTX_DISABLE;
		pci_write_config_word(pdev, PCI_COMMAND, cmd);

	ret = vfio_config_init(vdev);
		kfree(vdev->pci_saved_state);
		vdev->pci_saved_state = NULL;
		pci_disable_device(pdev);

	msix_pos = pdev->msix_cap;
		pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags);
		pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table);

		vdev->msix_bar = table & PCI_MSIX_TABLE_BIR;
		vdev->msix_offset = table & PCI_MSIX_TABLE_OFFSET;
		vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16;
		vdev->msix_bar = 0xFF;

	if (!vfio_vga_disabled() && vfio_pci_is_vga(pdev))
		vdev->has_vga = true;

	if (vfio_pci_is_vga(pdev) &&
	    pdev->vendor == PCI_VENDOR_ID_INTEL &&
	    IS_ENABLED(CONFIG_VFIO_PCI_IGD)) {
		ret = vfio_pci_igd_init(vdev);
		if (ret && ret != -ENODEV) {
			pci_warn(pdev, "Failed to setup Intel IGD regions\n");

	if (pdev->vendor == PCI_VENDOR_ID_NVIDIA &&
	    IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) {
		ret = vfio_pci_nvdia_v100_nvlink2_init(vdev);
		if (ret && ret != -ENODEV) {
			pci_warn(pdev, "Failed to setup NVIDIA NV2 RAM region\n");

	if (pdev->vendor == PCI_VENDOR_ID_IBM &&
	    IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) {
		ret = vfio_pci_ibm_npu2_init(vdev);
		if (ret && ret != -ENODEV) {
			pci_warn(pdev, "Failed to setup NVIDIA NV2 ATSD region\n");

	vfio_pci_probe_mmaps(vdev);

	vfio_pci_disable(vdev);
static void vfio_pci_disable(struct vfio_pci_device *vdev)
	struct pci_dev *pdev = vdev->pdev;
	struct vfio_pci_dummy_resource *dummy_res, *tmp;
	struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp;

	/* Stop the device from further DMA */
	pci_clear_master(pdev);

	vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
				VFIO_IRQ_SET_ACTION_TRIGGER,
				vdev->irq_type, 0, 0, NULL);

	/* Device closed, don't need mutex here */
	list_for_each_entry_safe(ioeventfd, ioeventfd_tmp,
				 &vdev->ioeventfds_list, next) {
		vfio_virqfd_disable(&ioeventfd->virqfd);
		list_del(&ioeventfd->next);
	vdev->ioeventfds_nr = 0;

	vdev->virq_disabled = false;

	for (i = 0; i < vdev->num_regions; i++)
		vdev->region[i].ops->release(vdev, &vdev->region[i]);

	vdev->num_regions = 0;
	vdev->region = NULL; /* don't krealloc a freed pointer */

	vfio_config_free(vdev);

	for (i = 0; i < PCI_STD_NUM_BARS; i++) {
		bar = i + PCI_STD_RESOURCES;
		if (!vdev->barmap[bar])
		pci_iounmap(pdev, vdev->barmap[bar]);
		pci_release_selected_regions(pdev, 1 << bar);
		vdev->barmap[bar] = NULL;

	list_for_each_entry_safe(dummy_res, tmp,
				 &vdev->dummy_resources_list, res_next) {
		list_del(&dummy_res->res_next);
		release_resource(&dummy_res->resource);

	vdev->needs_reset = true;

	/*
	 * If we have saved state, restore it. If we can reset the device,
	 * even better. Resetting with current state seems better than
	 * nothing, but saving and restoring current state without reset
	 */
	if (pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state)) {
		pci_info(pdev, "%s: Couldn't reload saved state\n", __func__);

		if (!vdev->reset_works)

		pci_save_state(pdev);

	/*
	 * Disable INTx and MSI, presumably to avoid spurious interrupts
	 * during reset. Stolen from pci_reset_function()
	 */
	pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE);

	/*
	 * Try to get the locks ourselves to prevent a deadlock. The
	 * success of this is dependent on being able to lock the device,
	 * which is not always possible.
	 * We can not use the "try" reset interface here, which will
	 * overwrite the previously restored configuration information.
	 */
	if (vdev->reset_works && pci_cfg_access_trylock(pdev)) {
		if (device_trylock(&pdev->dev)) {
			if (!__pci_reset_function_locked(pdev))
				vdev->needs_reset = false;
			device_unlock(&pdev->dev);
		pci_cfg_access_unlock(pdev);

	pci_restore_state(pdev);

	pci_disable_device(pdev);

	vfio_pci_try_bus_reset(vdev);

	if (!disable_idle_d3)
		vfio_pci_set_power_state(vdev, PCI_D3hot);
static struct pci_driver vfio_pci_driver;

static struct vfio_pci_device *get_pf_vdev(struct vfio_pci_device *vdev,
					   struct vfio_device **pf_dev)
	struct pci_dev *physfn = pci_physfn(vdev->pdev);

	if (!vdev->pdev->is_virtfn)

	*pf_dev = vfio_device_get_from_dev(&physfn->dev);

	if (pci_dev_driver(physfn) != &vfio_pci_driver) {
		vfio_device_put(*pf_dev);

	return vfio_device_data(*pf_dev);

static void vfio_pci_vf_token_user_add(struct vfio_pci_device *vdev, int val)
	struct vfio_device *pf_dev;
	struct vfio_pci_device *pf_vdev = get_pf_vdev(vdev, &pf_dev);

	mutex_lock(&pf_vdev->vf_token->lock);
	pf_vdev->vf_token->users += val;
	WARN_ON(pf_vdev->vf_token->users < 0);
	mutex_unlock(&pf_vdev->vf_token->lock);

	vfio_device_put(pf_dev);
static void vfio_pci_release(void *device_data)
	struct vfio_pci_device *vdev = device_data;

	mutex_lock(&vdev->reflck->lock);

	if (!(--vdev->refcnt)) {
		vfio_pci_vf_token_user_add(vdev, -1);
		vfio_spapr_pci_eeh_release(vdev->pdev);
		vfio_pci_disable(vdev);

		mutex_lock(&vdev->igate);
		if (vdev->err_trigger) {
			eventfd_ctx_put(vdev->err_trigger);
			vdev->err_trigger = NULL;
		if (vdev->req_trigger) {
			eventfd_ctx_put(vdev->req_trigger);
			vdev->req_trigger = NULL;
		mutex_unlock(&vdev->igate);

	mutex_unlock(&vdev->reflck->lock);

	module_put(THIS_MODULE);

static int vfio_pci_open(void *device_data)
	struct vfio_pci_device *vdev = device_data;

	if (!try_module_get(THIS_MODULE))

	mutex_lock(&vdev->reflck->lock);

		ret = vfio_pci_enable(vdev);

		vfio_spapr_pci_eeh_open(vdev->pdev);
		vfio_pci_vf_token_user_add(vdev, 1);

	mutex_unlock(&vdev->reflck->lock);

		module_put(THIS_MODULE);
static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
	if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
		if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) ||
		    vdev->nointx || vdev->pdev->is_virtfn)

		pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin);

	} else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
		pos = vdev->pdev->msi_cap;
			pci_read_config_word(vdev->pdev,
					     pos + PCI_MSI_FLAGS, &flags);
			return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1);
	} else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
		pos = vdev->pdev->msix_cap;
			pci_read_config_word(vdev->pdev,
					     pos + PCI_MSIX_FLAGS, &flags);
			return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
	} else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) {
		if (pci_is_pcie(vdev->pdev))
	} else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) {
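/*
 * Worked example (illustrative, not part of the original source): in
 * vfio_pci_get_irq_count() above, an MSI Multiple Message Capable field of
 * 0b011 in PCI_MSI_FLAGS yields 1 << 3 = 8 vectors, while an MSI-X Table
 * Size field of 7 in PCI_MSIX_FLAGS yields 7 + 1 = 8 vectors.
 */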
static int vfio_pci_count_devs(struct pci_dev *pdev, void *data)

struct vfio_pci_fill_info {
	struct vfio_pci_dependent_device *devices;

static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
	struct vfio_pci_fill_info *fill = data;
	struct iommu_group *iommu_group;

	if (fill->cur == fill->max)
		return -EAGAIN; /* Something changed, try again */

	iommu_group = iommu_group_get(&pdev->dev);
		return -EPERM; /* Cannot reset non-isolated devices */

	fill->devices[fill->cur].group_id = iommu_group_id(iommu_group);
	fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus);
	fill->devices[fill->cur].bus = pdev->bus->number;
	fill->devices[fill->cur].devfn = pdev->devfn;

	iommu_group_put(iommu_group);

struct vfio_pci_group_entry {
	struct vfio_group *group;

struct vfio_pci_group_info {
	struct vfio_pci_group_entry *groups;

static int vfio_pci_validate_devs(struct pci_dev *pdev, void *data)
	struct vfio_pci_group_info *info = data;
	struct iommu_group *group;

	group = iommu_group_get(&pdev->dev);

	id = iommu_group_id(group);

	for (i = 0; i < info->count; i++)
		if (info->groups[i].id == id)

	iommu_group_put(group);

	return (i == info->count) ? -EINVAL : 0;
static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot)
	for (; pdev; pdev = pdev->bus->self)
		if (pdev->bus == slot->bus)
			return (pdev->slot == slot);

struct vfio_pci_walk_info {
	int (*fn)(struct pci_dev *, void *data);
	struct pci_dev *pdev;

static int vfio_pci_walk_wrapper(struct pci_dev *pdev, void *data)
	struct vfio_pci_walk_info *walk = data;

	if (!walk->slot || vfio_pci_dev_below_slot(pdev, walk->pdev->slot))
		walk->ret = walk->fn(pdev, walk->data);

static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
					 int (*fn)(struct pci_dev *,
						   void *data), void *data,
	struct vfio_pci_walk_info walk = {
		.fn = fn, .data = data, .pdev = pdev, .slot = slot, .ret = 0,

	pci_walk_bus(pdev->bus, vfio_pci_walk_wrapper, &walk);

static int msix_mmappable_cap(struct vfio_pci_device *vdev,
			      struct vfio_info_cap *caps)
	struct vfio_info_cap_header header = {
		.id = VFIO_REGION_INFO_CAP_MSIX_MAPPABLE,

	return vfio_info_add_capability(caps, &header, sizeof(header));
int vfio_pci_register_dev_region(struct vfio_pci_device *vdev,
				 unsigned int type, unsigned int subtype,
				 const struct vfio_pci_regops *ops,
				 size_t size, u32 flags, void *data)
	struct vfio_pci_region *region;

	region = krealloc(vdev->region,
			  (vdev->num_regions + 1) * sizeof(*region),

	vdev->region = region;
	vdev->region[vdev->num_regions].type = type;
	vdev->region[vdev->num_regions].subtype = subtype;
	vdev->region[vdev->num_regions].ops = ops;
	vdev->region[vdev->num_regions].size = size;
	vdev->region[vdev->num_regions].flags = flags;
	vdev->region[vdev->num_regions].data = data;

struct vfio_devices {
	struct vfio_device **devices;
static long vfio_pci_ioctl(void *device_data,
			   unsigned int cmd, unsigned long arg)
	struct vfio_pci_device *vdev = device_data;

	if (cmd == VFIO_DEVICE_GET_INFO) {
		struct vfio_device_info info;
		struct vfio_info_cap caps = { .buf = NULL, .size = 0 };

		minsz = offsetofend(struct vfio_device_info, num_irqs);

		/* For backward compatibility, cannot require this */
		capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset);

		if (copy_from_user(&info, (void __user *)arg, minsz))

		if (info.argsz < minsz)

		if (info.argsz >= capsz) {

		info.flags = VFIO_DEVICE_FLAGS_PCI;

		if (vdev->reset_works)
			info.flags |= VFIO_DEVICE_FLAGS_RESET;

		info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions;
		info.num_irqs = VFIO_PCI_NUM_IRQS;

		if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV)) {
			int ret = vfio_pci_info_zdev_add_caps(vdev, &caps);

			if (ret && ret != -ENODEV) {
				pci_warn(vdev->pdev, "Failed to setup zPCI info capabilities\n");

			info.flags |= VFIO_DEVICE_FLAGS_CAPS;
			if (info.argsz < sizeof(info) + caps.size) {
				info.argsz = sizeof(info) + caps.size;
				vfio_info_cap_shift(&caps, sizeof(info));
				if (copy_to_user((void __user *)arg +
						 sizeof(info), caps.buf,
				info.cap_offset = sizeof(info);

		return copy_to_user((void __user *)arg, &info, minsz) ?
	} else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
		struct pci_dev *pdev = vdev->pdev;
		struct vfio_region_info info;
		struct vfio_info_cap caps = { .buf = NULL, .size = 0 };

		minsz = offsetofend(struct vfio_region_info, offset);

		if (copy_from_user(&info, (void __user *)arg, minsz))

		if (info.argsz < minsz)

		switch (info.index) {
		case VFIO_PCI_CONFIG_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = pdev->cfg_size;
			info.flags = VFIO_REGION_INFO_FLAG_READ |
				     VFIO_REGION_INFO_FLAG_WRITE;
		case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = pci_resource_len(pdev, info.index);

			info.flags = VFIO_REGION_INFO_FLAG_READ |
				     VFIO_REGION_INFO_FLAG_WRITE;
			if (vdev->bar_mmap_supported[info.index]) {
				info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
				if (info.index == vdev->msix_bar) {
					ret = msix_mmappable_cap(vdev, &caps);

		case VFIO_PCI_ROM_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);

			/* Report the BAR size, not the ROM size */
			info.size = pci_resource_len(pdev, info.index);

			/* Shadow ROMs appear as PCI option ROMs */
			if (pdev->resource[PCI_ROM_RESOURCE].flags &
			    IORESOURCE_ROM_SHADOW)

			/*
			 * Is it really there? Enable memory decode for
			 * implicit access in pci_map_rom().
			 */
			cmd = vfio_pci_memory_lock_and_enable(vdev);
			io = pci_map_rom(pdev, &size);
				info.flags = VFIO_REGION_INFO_FLAG_READ;
				pci_unmap_rom(pdev, io);

			vfio_pci_memory_unlock_and_restore(vdev, cmd);

		case VFIO_PCI_VGA_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.flags = VFIO_REGION_INFO_FLAG_READ |
				     VFIO_REGION_INFO_FLAG_WRITE;
			struct vfio_region_info_cap_type cap_type = {
					.header.id = VFIO_REGION_INFO_CAP_TYPE,
					.header.version = 1 };

			    VFIO_PCI_NUM_REGIONS + vdev->num_regions)
			info.index = array_index_nospec(info.index,
							VFIO_PCI_NUM_REGIONS +

			i = info.index - VFIO_PCI_NUM_REGIONS;

			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = vdev->region[i].size;
			info.flags = vdev->region[i].flags;

			cap_type.type = vdev->region[i].type;
			cap_type.subtype = vdev->region[i].subtype;

			ret = vfio_info_add_capability(&caps, &cap_type.header,

			if (vdev->region[i].ops->add_capability) {
				ret = vdev->region[i].ops->add_capability(vdev,
						&vdev->region[i], &caps);

			info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
			if (info.argsz < sizeof(info) + caps.size) {
				info.argsz = sizeof(info) + caps.size;
				vfio_info_cap_shift(&caps, sizeof(info));
				if (copy_to_user((void __user *)arg +
						 sizeof(info), caps.buf,
				info.cap_offset = sizeof(info);

		return copy_to_user((void __user *)arg, &info, minsz) ?
	} else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
		struct vfio_irq_info info;

		minsz = offsetofend(struct vfio_irq_info, count);

		if (copy_from_user(&info, (void __user *)arg, minsz))

		if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)

		switch (info.index) {
		case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX:
		case VFIO_PCI_REQ_IRQ_INDEX:
		case VFIO_PCI_ERR_IRQ_INDEX:
			if (pci_is_pcie(vdev->pdev))

		info.flags = VFIO_IRQ_INFO_EVENTFD;

		info.count = vfio_pci_get_irq_count(vdev, info.index);

		if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
			info.flags |= (VFIO_IRQ_INFO_MASKABLE |
				       VFIO_IRQ_INFO_AUTOMASKED);
			info.flags |= VFIO_IRQ_INFO_NORESIZE;

		return copy_to_user((void __user *)arg, &info, minsz) ?
	} else if (cmd == VFIO_DEVICE_SET_IRQS) {
		struct vfio_irq_set hdr;
		size_t data_size = 0;

		minsz = offsetofend(struct vfio_irq_set, count);

		if (copy_from_user(&hdr, (void __user *)arg, minsz))

		max = vfio_pci_get_irq_count(vdev, hdr.index);

		ret = vfio_set_irqs_validate_and_prepare(&hdr, max,
							 VFIO_PCI_NUM_IRQS, &data_size);

			data = memdup_user((void __user *)(arg + minsz),
				return PTR_ERR(data);

		mutex_lock(&vdev->igate);

		ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index,
					      hdr.start, hdr.count, data);

		mutex_unlock(&vdev->igate);

	} else if (cmd == VFIO_DEVICE_RESET) {
		if (!vdev->reset_works)

		vfio_pci_zap_and_down_write_memory_lock(vdev);
		ret = pci_try_reset_function(vdev->pdev);
		up_write(&vdev->memory_lock);
	} else if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) {
		struct vfio_pci_hot_reset_info hdr;
		struct vfio_pci_fill_info fill = { 0 };
		struct vfio_pci_dependent_device *devices = NULL;

		minsz = offsetofend(struct vfio_pci_hot_reset_info, count);

		if (copy_from_user(&hdr, (void __user *)arg, minsz))

		if (hdr.argsz < minsz)

		/* Can we do a slot or bus reset or neither? */
		if (!pci_probe_reset_slot(vdev->pdev->slot))
		else if (pci_probe_reset_bus(vdev->pdev->bus))

		/* How many devices are affected? */
		ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
						    vfio_pci_count_devs,

		WARN_ON(!fill.max); /* Should always be at least one */

		/*
		 * If there's enough space, fill it now, otherwise return
		 * -ENOSPC and the number of devices affected.
		 */
		if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) {
			hdr.count = fill.max;
			goto reset_info_exit;

		devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL);

		fill.devices = devices;

		ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,

		/*
		 * If a device was removed between counting and filling,
		 * we may come up short of fill.max. If a device was
		 * added, we'll have a return of -EAGAIN above.
		 */
		hdr.count = fill.cur;

		if (copy_to_user((void __user *)arg, &hdr, minsz))

		if (copy_to_user((void __user *)(arg + minsz), devices,
				 hdr.count * sizeof(*devices)))
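		/*
		 * Usage note (illustrative, not part of the original source):
		 * callers typically issue this ioctl twice - a first call with
		 * a minimal argsz learns hdr.count from the -ENOSPC path
		 * above, and a second call supplies a buffer sized for that
		 * many struct vfio_pci_dependent_device entries.
		 */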
	} else if (cmd == VFIO_DEVICE_PCI_HOT_RESET) {
		struct vfio_pci_hot_reset hdr;
		struct vfio_pci_group_entry *groups;
		struct vfio_pci_group_info info;
		struct vfio_devices devs = { .cur_index = 0 };
		int i, group_idx, mem_idx = 0, count = 0, ret = 0;

		minsz = offsetofend(struct vfio_pci_hot_reset, count);

		if (copy_from_user(&hdr, (void __user *)arg, minsz))

		if (hdr.argsz < minsz || hdr.flags)

		/* Can we do a slot or bus reset or neither? */
		if (!pci_probe_reset_slot(vdev->pdev->slot))
		else if (pci_probe_reset_bus(vdev->pdev->bus))

		/*
		 * We can't let userspace give us an arbitrarily large
		 * buffer to copy, so verify how many we think there
		 * could be. Note groups can have multiple devices so
		 * one group per device is the max.
		 */
		ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
						    vfio_pci_count_devs,

		/* Somewhere between 1 and count is OK */
		if (!hdr.count || hdr.count > count)

		group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL);
		groups = kcalloc(hdr.count, sizeof(*groups), GFP_KERNEL);
		if (!group_fds || !groups) {

		if (copy_from_user(group_fds, (void __user *)(arg + minsz),
				   hdr.count * sizeof(*group_fds))) {

		/*
		 * For each group_fd, get the group through the vfio external
		 * user interface and store the group and iommu ID. This
		 * ensures the group is held across the reset.
		 */
		for (group_idx = 0; group_idx < hdr.count; group_idx++) {
			struct vfio_group *group;
			struct fd f = fdget(group_fds[group_idx]);

			group = vfio_group_get_external_user(f.file);

			if (IS_ERR(group)) {
				ret = PTR_ERR(group);

			groups[group_idx].group = group;
			groups[group_idx].id =
					vfio_external_user_iommu_id(group);

		/* release reference to groups on error */
			goto hot_reset_release;
		info.count = hdr.count;
		info.groups = groups;

		/*
		 * Test whether all the affected devices are contained
		 * by the set of groups provided by the user.
		 */
		ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
						    vfio_pci_validate_devs,
			goto hot_reset_release;

		devs.max_index = count;
		devs.devices = kcalloc(count, sizeof(struct vfio_device *),
		if (!devs.devices) {
			goto hot_reset_release;

		/*
		 * We need to get memory_lock for each device, but devices
		 * can share mmap_lock, therefore we need to zap and hold
		 * the vma_lock for each device, and only then get each
		 */
		ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
						    vfio_pci_try_zap_and_vma_lock_cb,
			goto hot_reset_release;

		for (; mem_idx < devs.cur_index; mem_idx++) {
			struct vfio_pci_device *tmp;

			tmp = vfio_device_data(devs.devices[mem_idx]);

			ret = down_write_trylock(&tmp->memory_lock);
				goto hot_reset_release;
			mutex_unlock(&tmp->vma_lock);

		/* User has access, do the reset */
		ret = pci_reset_bus(vdev->pdev);

		for (i = 0; i < devs.cur_index; i++) {
			struct vfio_device *device;
			struct vfio_pci_device *tmp;

			device = devs.devices[i];
			tmp = vfio_device_data(device);
				up_write(&tmp->memory_lock);
				mutex_unlock(&tmp->vma_lock);
			vfio_device_put(device);
		kfree(devs.devices);

		for (group_idx--; group_idx >= 0; group_idx--)
			vfio_group_put_external_user(groups[group_idx].group);
	} else if (cmd == VFIO_DEVICE_IOEVENTFD) {
		struct vfio_device_ioeventfd ioeventfd;

		minsz = offsetofend(struct vfio_device_ioeventfd, fd);

		if (copy_from_user(&ioeventfd, (void __user *)arg, minsz))

		if (ioeventfd.argsz < minsz)

		if (ioeventfd.flags & ~VFIO_DEVICE_IOEVENTFD_SIZE_MASK)

		count = ioeventfd.flags & VFIO_DEVICE_IOEVENTFD_SIZE_MASK;

		if (hweight8(count) != 1 || ioeventfd.fd < -1)

		return vfio_pci_ioeventfd(vdev, ioeventfd.offset,
					  ioeventfd.data, count, ioeventfd.fd);
	} else if (cmd == VFIO_DEVICE_FEATURE) {
		struct vfio_device_feature feature;

		minsz = offsetofend(struct vfio_device_feature, flags);

		if (copy_from_user(&feature, (void __user *)arg, minsz))

		if (feature.argsz < minsz)

		/* Check unknown flags */
		if (feature.flags & ~(VFIO_DEVICE_FEATURE_MASK |
				      VFIO_DEVICE_FEATURE_SET |
				      VFIO_DEVICE_FEATURE_GET |
				      VFIO_DEVICE_FEATURE_PROBE))

		/* GET & SET are mutually exclusive except with PROBE */
		if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
		    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
		    (feature.flags & VFIO_DEVICE_FEATURE_GET))

		switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
		case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
			if (!vdev->vf_token)

			/*
			 * We do not support GET of the VF Token UUID as this
			 * could expose the token of the previous device user.
			 */
			if (feature.flags & VFIO_DEVICE_FEATURE_GET)

			if (feature.flags & VFIO_DEVICE_FEATURE_PROBE)

			/* Don't SET unless told to do so */
			if (!(feature.flags & VFIO_DEVICE_FEATURE_SET))

			if (feature.argsz < minsz + sizeof(uuid))

			if (copy_from_user(&uuid, (void __user *)(arg + minsz),

			mutex_lock(&vdev->vf_token->lock);
			uuid_copy(&vdev->vf_token->uuid, &uuid);
			mutex_unlock(&vdev->vf_token->lock);
static ssize_t vfio_pci_rw(void *device_data, char __user *buf,
			   size_t count, loff_t *ppos, bool iswrite)
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	struct vfio_pci_device *vdev = device_data;

	if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)

	case VFIO_PCI_CONFIG_REGION_INDEX:
		return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite);

	case VFIO_PCI_ROM_REGION_INDEX:
			return vfio_pci_bar_rw(vdev, buf, count, ppos, false);

	case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
		return vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite);

	case VFIO_PCI_VGA_REGION_INDEX:
		return vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite);

		index -= VFIO_PCI_NUM_REGIONS;
		return vdev->region[index].ops->rw(vdev, buf,
						   count, ppos, iswrite);

static ssize_t vfio_pci_read(void *device_data, char __user *buf,
			     size_t count, loff_t *ppos)
	return vfio_pci_rw(device_data, buf, count, ppos, false);

static ssize_t vfio_pci_write(void *device_data, const char __user *buf,
			      size_t count, loff_t *ppos)
	return vfio_pci_rw(device_data, (char __user *)buf, count, ppos, true);
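/*
 * Illustrative note (not part of the original source): the file offsets
 * used by the read/write paths above encode the region index in the high
 * bits, recovered by VFIO_PCI_OFFSET_TO_INDEX() from vfio_pci_private.h.
 * For example, an access at offset
 * (VFIO_PCI_BAR2_REGION_INDEX << VFIO_PCI_OFFSET_SHIFT) is routed to BAR2
 * by the switch in vfio_pci_rw().
 */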
/* Return 1 on zap and vma_lock acquired, 0 on contention (only with @try) */
static int vfio_pci_zap_and_vma_lock(struct vfio_pci_device *vdev, bool try)
	struct vfio_pci_mmap_vma *mmap_vma, *tmp;

	/*
	 * vma_lock is nested under mmap_lock for vm_ops callback paths.
	 * The memory_lock semaphore is used by both code paths calling
	 * into this function to zap vmas and the vm_ops.fault callback
	 * to protect the memory enable state of the device.
	 *
	 * When zapping vmas we need to maintain the mmap_lock => vma_lock
	 * ordering, which requires using vma_lock to walk vma_list to
	 * acquire an mm, then dropping vma_lock to get the mmap_lock and
	 * reacquiring vma_lock. This logic is derived from similar
	 * requirements in uverbs_user_mmap_disassociate().
	 *
	 * mmap_lock must always be the top-level lock when it is taken.
	 * Therefore we can only hold the memory_lock write lock when
	 * vma_list is empty, as we'd need to take mmap_lock to clear
	 * entries. vma_list can only be guaranteed empty when holding
	 * vma_lock, thus memory_lock is nested under vma_lock.
	 *
	 * This enables the vm_ops.fault callback to acquire vma_lock,
	 * followed by memory_lock read lock, while already holding
	 * mmap_lock without risk of deadlock.
	 */
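	/*
	 * Summary of the ordering described above (restated, not part of the
	 * original source):
	 *   mmap_lock  =>  vdev->vma_lock  =>  vdev->memory_lock
	 */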
		struct mm_struct *mm = NULL;

			if (!mutex_trylock(&vdev->vma_lock))
			mutex_lock(&vdev->vma_lock);
		while (!list_empty(&vdev->vma_list)) {
			mmap_vma = list_first_entry(&vdev->vma_list,
						    struct vfio_pci_mmap_vma,
			mm = mmap_vma->vma->vm_mm;
			if (mmget_not_zero(mm))

			list_del(&mmap_vma->vma_next);
		mutex_unlock(&vdev->vma_lock);

		if (!mmap_read_trylock(mm)) {
			if (!mutex_trylock(&vdev->vma_lock)) {
				mmap_read_unlock(mm);
			mutex_lock(&vdev->vma_lock);

		list_for_each_entry_safe(mmap_vma, tmp,
					 &vdev->vma_list, vma_next) {
			struct vm_area_struct *vma = mmap_vma->vma;

			if (vma->vm_mm != mm)

			list_del(&mmap_vma->vma_next);

			zap_vma_ptes(vma, vma->vm_start,
				     vma->vm_end - vma->vm_start);
		mutex_unlock(&vdev->vma_lock);
		mmap_read_unlock(mm);

void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_device *vdev)
	vfio_pci_zap_and_vma_lock(vdev, false);
	down_write(&vdev->memory_lock);
	mutex_unlock(&vdev->vma_lock);
u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_device *vdev)
	down_write(&vdev->memory_lock);
	pci_read_config_word(vdev->pdev, PCI_COMMAND, &cmd);
	if (!(cmd & PCI_COMMAND_MEMORY))
		pci_write_config_word(vdev->pdev, PCI_COMMAND,
				      cmd | PCI_COMMAND_MEMORY);

void vfio_pci_memory_unlock_and_restore(struct vfio_pci_device *vdev, u16 cmd)
	pci_write_config_word(vdev->pdev, PCI_COMMAND, cmd);
	up_write(&vdev->memory_lock);
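/*
 * Illustrative note (not part of the original source): these two helpers
 * bracket accesses that need memory decode enabled while holding off
 * concurrent changes to the command register - e.g. the pci_map_rom()
 * probe in the VFIO_DEVICE_GET_REGION_INFO handler above uses them and
 * restores the saved command word on the way out.
 */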
/* Caller holds vma_lock */
static int __vfio_pci_add_vma(struct vfio_pci_device *vdev,
			      struct vm_area_struct *vma)
	struct vfio_pci_mmap_vma *mmap_vma;

	mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL);

	mmap_vma->vma = vma;
	list_add(&mmap_vma->vma_next, &vdev->vma_list);

/*
 * Zap mmaps on open so that we can fault them in on access and therefore
 * our vma_list only tracks mappings accessed since last zap.
 */
static void vfio_pci_mmap_open(struct vm_area_struct *vma)
	zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);

static void vfio_pci_mmap_close(struct vm_area_struct *vma)
	struct vfio_pci_device *vdev = vma->vm_private_data;
	struct vfio_pci_mmap_vma *mmap_vma;

	mutex_lock(&vdev->vma_lock);
	list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) {
		if (mmap_vma->vma == vma) {
			list_del(&mmap_vma->vma_next);
	mutex_unlock(&vdev->vma_lock);

static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf)
	struct vm_area_struct *vma = vmf->vma;
	struct vfio_pci_device *vdev = vma->vm_private_data;
	vm_fault_t ret = VM_FAULT_NOPAGE;

	mutex_lock(&vdev->vma_lock);
	down_read(&vdev->memory_lock);

	if (!__vfio_pci_memory_enabled(vdev)) {
		ret = VM_FAULT_SIGBUS;
		mutex_unlock(&vdev->vma_lock);

	if (__vfio_pci_add_vma(vdev, vma)) {
		mutex_unlock(&vdev->vma_lock);

	mutex_unlock(&vdev->vma_lock);

	if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
			       vma->vm_end - vma->vm_start, vma->vm_page_prot))
		ret = VM_FAULT_SIGBUS;

	up_read(&vdev->memory_lock);

static const struct vm_operations_struct vfio_pci_mmap_ops = {
	.open = vfio_pci_mmap_open,
	.close = vfio_pci_mmap_close,
	.fault = vfio_pci_mmap_fault,
static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
	struct vfio_pci_device *vdev = device_data;
	struct pci_dev *pdev = vdev->pdev;
	u64 phys_len, req_len, pgoff, req_start;

	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);

	if (vma->vm_end < vma->vm_start)
	if ((vma->vm_flags & VM_SHARED) == 0)
	if (index >= VFIO_PCI_NUM_REGIONS) {
		int regnum = index - VFIO_PCI_NUM_REGIONS;
		struct vfio_pci_region *region = vdev->region + regnum;

		if (region && region->ops && region->ops->mmap &&
		    (region->flags & VFIO_REGION_INFO_FLAG_MMAP))
			return region->ops->mmap(vdev, region, vma);

	if (index >= VFIO_PCI_ROM_REGION_INDEX)
	if (!vdev->bar_mmap_supported[index])

	phys_len = PAGE_ALIGN(pci_resource_len(pdev, index));
	req_len = vma->vm_end - vma->vm_start;
	pgoff = vma->vm_pgoff &
		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
	req_start = pgoff << PAGE_SHIFT;

	if (req_start + req_len > phys_len)

	/*
	 * Even though we don't make use of the barmap for the mmap,
	 * we need to request the region and the barmap tracks that.
	 */
	if (!vdev->barmap[index]) {
		ret = pci_request_selected_regions(pdev,
						   1 << index, "vfio-pci");

		vdev->barmap[index] = pci_iomap(pdev, index, 0);
		if (!vdev->barmap[index]) {
			pci_release_selected_regions(pdev, 1 << index);

	vma->vm_private_data = vdev;
	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
	vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;
	/*
	 * See remap_pfn_range(), called from vfio_pci_mmap_fault() but we
	 * can't change vm_flags within the fault handler. Set them now.
	 */
	vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
	vma->vm_ops = &vfio_pci_mmap_ops;
static void vfio_pci_request(void *device_data, unsigned int count)
	struct vfio_pci_device *vdev = device_data;
	struct pci_dev *pdev = vdev->pdev;

	mutex_lock(&vdev->igate);

	if (vdev->req_trigger) {
			pci_notice_ratelimited(pdev,
					       "Relaying device request to user (#%u)\n",
		eventfd_signal(vdev->req_trigger, 1);
	} else if (count == 0) {
			    "No device request channel registered, blocked until released by user\n");

	mutex_unlock(&vdev->igate);
static int vfio_pci_validate_vf_token(struct vfio_pci_device *vdev,
				      bool vf_token, uuid_t *uuid)
	/*
	 * There's always some degree of trust or collaboration between SR-IOV
	 * PF and VFs, even if just that the PF hosts the SR-IOV capability and
	 * can disrupt VFs with a reset, but often the PF has more explicit
	 * access to deny service to the VF or access data passed through the
	 * VF. We therefore require an opt-in via a shared VF token (UUID) to
	 * represent this trust. This both prevents that a VF driver might
	 * assume the PF driver is a trusted, in-kernel driver, and also that
	 * a PF driver might be replaced with a rogue driver, unknown to in-use
	 *
	 * Therefore when presented with a VF, if the PF is a vfio device and
	 * it is bound to the vfio-pci driver, the user needs to provide a VF
	 * token to access the device, in the form of appending a vf_token to
	 * the device name, for example:
	 *
	 * "0000:04:10.0 vf_token=bd8d9d2b-5a5f-4f5a-a211-f591514ba1f3"
	 *
	 * When presented with a PF which has VFs in use, the user must also
	 * provide the current VF token to prove collaboration with existing
	 * VF users. If VFs are not in use, the VF token provided for the PF
	 * device will act to set the VF token.
	 *
	 * If the VF token is provided but unused, an error is generated.
	 */
	if (!vdev->pdev->is_virtfn && !vdev->vf_token && !vf_token)
		return 0; /* No VF token provided or required */

	if (vdev->pdev->is_virtfn) {
		struct vfio_device *pf_dev;
		struct vfio_pci_device *pf_vdev = get_pf_vdev(vdev, &pf_dev);

				return 0; /* PF is not vfio-pci, no VF token */

			pci_info_ratelimited(vdev->pdev,
				"VF token incorrectly provided, PF not bound to vfio-pci\n");

			vfio_device_put(pf_dev);
			pci_info_ratelimited(vdev->pdev,
				"VF token required to access device\n");

		mutex_lock(&pf_vdev->vf_token->lock);
		match = uuid_equal(uuid, &pf_vdev->vf_token->uuid);
		mutex_unlock(&pf_vdev->vf_token->lock);

		vfio_device_put(pf_dev);

			pci_info_ratelimited(vdev->pdev,
				"Incorrect VF token provided for device\n");

	} else if (vdev->vf_token) {
		mutex_lock(&vdev->vf_token->lock);
		if (vdev->vf_token->users) {
				mutex_unlock(&vdev->vf_token->lock);
				pci_info_ratelimited(vdev->pdev,
					"VF token required to access device\n");

			if (!uuid_equal(uuid, &vdev->vf_token->uuid)) {
				mutex_unlock(&vdev->vf_token->lock);
				pci_info_ratelimited(vdev->pdev,
					"Incorrect VF token provided for device\n");
		} else if (vf_token) {
			uuid_copy(&vdev->vf_token->uuid, uuid);

		mutex_unlock(&vdev->vf_token->lock);
	} else if (vf_token) {
		pci_info_ratelimited(vdev->pdev,
			"VF token incorrectly provided, not a PF or VF\n");
#define VF_TOKEN_ARG "vf_token="

static int vfio_pci_match(void *device_data, char *buf)
	struct vfio_pci_device *vdev = device_data;
	bool vf_token = false;

	if (strncmp(pci_name(vdev->pdev), buf, strlen(pci_name(vdev->pdev))))
		return 0; /* No match */

	if (strlen(buf) > strlen(pci_name(vdev->pdev))) {
		buf += strlen(pci_name(vdev->pdev));

			return 0; /* No match: non-whitespace after name */

			if (!vf_token && !strncmp(buf, VF_TOKEN_ARG,
						  strlen(VF_TOKEN_ARG))) {
				buf += strlen(VF_TOKEN_ARG);

				if (strlen(buf) < UUID_STRING_LEN)

				ret = uuid_parse(buf, &uuid);

				buf += UUID_STRING_LEN;
				/* Unknown/duplicate option */

	ret = vfio_pci_validate_vf_token(vdev, vf_token, &uuid);

	return 1; /* Match */

static const struct vfio_device_ops vfio_pci_ops = {
	.open		= vfio_pci_open,
	.release	= vfio_pci_release,
	.ioctl		= vfio_pci_ioctl,
	.read		= vfio_pci_read,
	.write		= vfio_pci_write,
	.mmap		= vfio_pci_mmap,
	.request	= vfio_pci_request,
	.match		= vfio_pci_match,
static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev);
static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck);

static int vfio_pci_bus_notifier(struct notifier_block *nb,
				 unsigned long action, void *data)
	struct vfio_pci_device *vdev = container_of(nb,
						    struct vfio_pci_device, nb);
	struct device *dev = data;
	struct pci_dev *pdev = to_pci_dev(dev);
	struct pci_dev *physfn = pci_physfn(pdev);

	if (action == BUS_NOTIFY_ADD_DEVICE &&
	    pdev->is_virtfn && physfn == vdev->pdev) {
		pci_info(vdev->pdev, "Captured SR-IOV VF %s driver_override\n",
		pdev->driver_override = kasprintf(GFP_KERNEL, "%s",
	} else if (action == BUS_NOTIFY_BOUND_DRIVER &&
		   pdev->is_virtfn && physfn == vdev->pdev) {
		struct pci_driver *drv = pci_dev_driver(pdev);

		if (drv && drv != &vfio_pci_driver)
			pci_warn(vdev->pdev,
				 "VF %s bound to driver %s while PF bound to vfio-pci\n",
				 pci_name(pdev), drv->name);
static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
	struct vfio_pci_device *vdev;
	struct iommu_group *group;

	if (vfio_pci_is_denylisted(pdev))

	if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)

	/*
	 * Prevent binding to PFs with VFs enabled, the VFs might be in use
	 * by the host or other users. We cannot capture the VFs if they
	 * already exist, nor can we track VF users. Disabling SR-IOV here
	 * would initiate removing the VFs, which would unbind the driver,
	 * which is prone to blocking if that VF is also in use by vfio-pci.
	 * Just reject these PFs and let the user sort it out.
	 */
	if (pci_num_vf(pdev)) {
		pci_warn(pdev, "Cannot bind to PF with SR-IOV enabled\n");

	group = vfio_iommu_group_get(&pdev->dev);

	vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);

	vdev->irq_type = VFIO_PCI_NUM_IRQS;
	mutex_init(&vdev->igate);
	spin_lock_init(&vdev->irqlock);
	mutex_init(&vdev->ioeventfds_lock);
	INIT_LIST_HEAD(&vdev->dummy_resources_list);
	INIT_LIST_HEAD(&vdev->ioeventfds_list);
	mutex_init(&vdev->vma_lock);
	INIT_LIST_HEAD(&vdev->vma_list);
	init_rwsem(&vdev->memory_lock);

	ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);

	ret = vfio_pci_reflck_attach(vdev);
		goto out_del_group_dev;

	if (pdev->is_physfn) {
		vdev->vf_token = kzalloc(sizeof(*vdev->vf_token), GFP_KERNEL);
		if (!vdev->vf_token) {

		mutex_init(&vdev->vf_token->lock);
		uuid_gen(&vdev->vf_token->uuid);

		vdev->nb.notifier_call = vfio_pci_bus_notifier;
		ret = bus_register_notifier(&pci_bus_type, &vdev->nb);

	if (vfio_pci_is_vga(pdev)) {
		vga_client_register(pdev, vdev, NULL, vfio_pci_set_vga_decode);
		vga_set_legacy_decoding(pdev,
					vfio_pci_set_vga_decode(vdev, false));

	vfio_pci_probe_power_state(vdev);

	if (!disable_idle_d3) {
		/*
		 * pci-core sets the device power state to an unknown value at
		 * bootup and after being removed from a driver. The only
		 * transition it allows from this unknown state is to D0, which
		 * typically happens when a driver calls pci_enable_device().
		 * We're not ready to enable the device yet, but we do want to
		 * be able to get to D3. Therefore first do a D0 transition
		 * before going to D3.
		 */
		vfio_pci_set_power_state(vdev, PCI_D0);
		vfio_pci_set_power_state(vdev, PCI_D3hot);

	kfree(vdev->vf_token);
	vfio_pci_reflck_put(vdev->reflck);
	vfio_del_group_dev(&pdev->dev);
	vfio_iommu_group_put(group, &pdev->dev);
static void vfio_pci_remove(struct pci_dev *pdev)
	struct vfio_pci_device *vdev;

	pci_disable_sriov(pdev);

	vdev = vfio_del_group_dev(&pdev->dev);

	if (vdev->vf_token) {
		WARN_ON(vdev->vf_token->users);
		mutex_destroy(&vdev->vf_token->lock);
		kfree(vdev->vf_token);

	if (vdev->nb.notifier_call)
		bus_unregister_notifier(&pci_bus_type, &vdev->nb);

	vfio_pci_reflck_put(vdev->reflck);

	vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev);
	kfree(vdev->region);
	mutex_destroy(&vdev->ioeventfds_lock);

	if (!disable_idle_d3)
		vfio_pci_set_power_state(vdev, PCI_D0);

	kfree(vdev->pm_save);

	if (vfio_pci_is_vga(pdev)) {
		vga_client_register(pdev, NULL, NULL, NULL);
		vga_set_legacy_decoding(pdev,
					VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
					VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM);
static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
						  pci_channel_state_t state)
	struct vfio_pci_device *vdev;
	struct vfio_device *device;

	device = vfio_device_get_from_dev(&pdev->dev);
		return PCI_ERS_RESULT_DISCONNECT;

	vdev = vfio_device_data(device);
		vfio_device_put(device);
		return PCI_ERS_RESULT_DISCONNECT;

	mutex_lock(&vdev->igate);

	if (vdev->err_trigger)
		eventfd_signal(vdev->err_trigger, 1);

	mutex_unlock(&vdev->igate);

	vfio_device_put(device);

	return PCI_ERS_RESULT_CAN_RECOVER;

static int vfio_pci_sriov_configure(struct pci_dev *pdev, int nr_virtfn)
	struct vfio_pci_device *vdev;
	struct vfio_device *device;

	device = vfio_device_get_from_dev(&pdev->dev);

	vdev = vfio_device_data(device);
		vfio_device_put(device);

		pci_disable_sriov(pdev);

		ret = pci_enable_sriov(pdev, nr_virtfn);

	vfio_device_put(device);

	return ret < 0 ? ret : nr_virtfn;
static const struct pci_error_handlers vfio_err_handlers = {
	.error_detected = vfio_pci_aer_err_detected,

static struct pci_driver vfio_pci_driver = {
	.id_table		= NULL, /* only dynamic ids */
	.probe			= vfio_pci_probe,
	.remove			= vfio_pci_remove,
	.sriov_configure	= vfio_pci_sriov_configure,
	.err_handler		= &vfio_err_handlers,

static DEFINE_MUTEX(reflck_lock);

static struct vfio_pci_reflck *vfio_pci_reflck_alloc(void)
	struct vfio_pci_reflck *reflck;

	reflck = kzalloc(sizeof(*reflck), GFP_KERNEL);
		return ERR_PTR(-ENOMEM);

	kref_init(&reflck->kref);
	mutex_init(&reflck->lock);

static void vfio_pci_reflck_get(struct vfio_pci_reflck *reflck)
	kref_get(&reflck->kref);

static int vfio_pci_reflck_find(struct pci_dev *pdev, void *data)
	struct vfio_pci_reflck **preflck = data;
	struct vfio_device *device;
	struct vfio_pci_device *vdev;

	device = vfio_device_get_from_dev(&pdev->dev);

	if (pci_dev_driver(pdev) != &vfio_pci_driver) {
		vfio_device_put(device);

	vdev = vfio_device_data(device);

		vfio_pci_reflck_get(vdev->reflck);
		*preflck = vdev->reflck;
		vfio_device_put(device);

	vfio_device_put(device);

static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev)
	bool slot = !pci_probe_reset_slot(vdev->pdev->slot);

	mutex_lock(&reflck_lock);

	if (pci_is_root_bus(vdev->pdev->bus) ||
	    vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_reflck_find,
					  &vdev->reflck, slot) <= 0)
		vdev->reflck = vfio_pci_reflck_alloc();

	mutex_unlock(&reflck_lock);

	return PTR_ERR_OR_ZERO(vdev->reflck);

static void vfio_pci_reflck_release(struct kref *kref)
	struct vfio_pci_reflck *reflck = container_of(kref,
						      struct vfio_pci_reflck,

	mutex_unlock(&reflck_lock);

static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck)
	kref_put_mutex(&reflck->kref, vfio_pci_reflck_release, &reflck_lock);
static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data)
	struct vfio_devices *devs = data;
	struct vfio_device *device;
	struct vfio_pci_device *vdev;

	if (devs->cur_index == devs->max_index)

	device = vfio_device_get_from_dev(&pdev->dev);

	if (pci_dev_driver(pdev) != &vfio_pci_driver) {
		vfio_device_put(device);

	vdev = vfio_device_data(device);

	/* Fault if the device is not unused */
		vfio_device_put(device);

	devs->devices[devs->cur_index++] = device;

static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data)
	struct vfio_devices *devs = data;
	struct vfio_device *device;
	struct vfio_pci_device *vdev;

	if (devs->cur_index == devs->max_index)

	device = vfio_device_get_from_dev(&pdev->dev);

	if (pci_dev_driver(pdev) != &vfio_pci_driver) {
		vfio_device_put(device);

	vdev = vfio_device_data(device);

	/*
	 * Locking multiple devices is prone to deadlock, runaway and
	 * unwind if we hit contention.
	 */
	if (!vfio_pci_zap_and_vma_lock(vdev, true)) {
		vfio_device_put(device);

	devs->devices[devs->cur_index++] = device;
/*
 * If a bus or slot reset is available for the provided device and:
 *  - All of the devices affected by that bus or slot reset are unused
 *  - At least one of the affected devices is marked dirty via
 *    needs_reset (such as by lack of FLR support)
 * Then attempt to perform that bus or slot reset. Callers are required
 * to hold vdev->reflck->lock, protecting the bus/slot reset group from
 * concurrent opens. A vfio_device reference is acquired for each device
 * to prevent unbinds during the reset operation.
 *
 * NB: vfio-core considers a group to be viable even if some devices are
 * bound to drivers like pci-stub or pcieport. Here we require all devices
 * to be bound to vfio_pci since that's the only way we can be sure they
 */
static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev)
	struct vfio_devices devs = { .cur_index = 0 };
	int i = 0, ret = -EINVAL;
	struct vfio_pci_device *tmp;

	if (!pci_probe_reset_slot(vdev->pdev->slot))
	else if (pci_probe_reset_bus(vdev->pdev->bus))

	if (vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs,

	devs.devices = kcalloc(i, sizeof(struct vfio_device *), GFP_KERNEL);

	if (vfio_pci_for_each_slot_or_bus(vdev->pdev,
					  vfio_pci_get_unused_devs,

	/* Does at least one need a reset? */
	for (i = 0; i < devs.cur_index; i++) {
		tmp = vfio_device_data(devs.devices[i]);
		if (tmp->needs_reset) {
			ret = pci_reset_bus(vdev->pdev);

	for (i = 0; i < devs.cur_index; i++) {
		tmp = vfio_device_data(devs.devices[i]);

		/*
		 * If reset was successful, affected devices no longer need
		 * a reset and we should return all the collateral devices
		 * to low power. If not successful, we either didn't reset
		 * the bus or timed out waiting for it, so let's not touch
		 */
			tmp->needs_reset = false;

			if (tmp != vdev && !disable_idle_d3)
				vfio_pci_set_power_state(tmp, PCI_D3hot);

		vfio_device_put(devs.devices[i]);

	kfree(devs.devices);
static void __exit vfio_pci_cleanup(void)
	pci_unregister_driver(&vfio_pci_driver);
	vfio_pci_uninit_perm_bits();

static void __init vfio_pci_fill_ids(void)
	/* no ids passed actually */

	/* add ids specified in the module parameter */
	while ((id = strsep(&p, ","))) {
		unsigned int vendor, device, subvendor = PCI_ANY_ID,
			subdevice = PCI_ANY_ID, class = 0, class_mask = 0;

		fields = sscanf(id, "%x:%x:%x:%x:%x:%x",
				&vendor, &device, &subvendor, &subdevice,
				&class, &class_mask);

			pr_warn("invalid id string \"%s\"\n", id);

		rc = pci_add_dynid(&vfio_pci_driver, vendor, device,
				   subvendor, subdevice, class, class_mask, 0);
			pr_warn("failed to add dynamic id [%04x:%04x[%04x:%04x]] class %#08x/%08x (%d)\n",
				vendor, device, subvendor, subdevice,
				class, class_mask, rc);
			pr_info("add [%04x:%04x[%04x:%04x]] class %#08x/%08x\n",
				vendor, device, subvendor, subdevice,
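/*
 * Illustrative example (not part of the original source): the sscanf()
 * format above matches the MODULE_PARM_DESC(ids, ...) syntax, so a string
 * such as "8086:10fb" supplies only vendor:device, while the optional
 * subvendor, subdevice, class and class_mask fields keep the defaults
 * initialised above when omitted.
 */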
static int __init vfio_pci_init(void)
	/* Allocate shared config space permission data used by all devices */
	ret = vfio_pci_init_perm_bits();

	/* Register and scan for devices */
	ret = pci_register_driver(&vfio_pci_driver);

	vfio_pci_fill_ids();

	if (disable_denylist)
		pr_warn("device denylist disabled.\n");

	vfio_pci_uninit_perm_bits();

module_init(vfio_pci_init);
module_exit(vfio_pci_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);