// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/aperture.h>
#include <linux/device.h>
#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/interrupt.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/pci.h>
#include <linux/pfn_t.h>
#include <linux/pm_runtime.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vgaarb.h>
#include <linux/nospec.h>
#include <linux/sched/mm.h>
#include <linux/iommufd.h>
#if IS_ENABLED(CONFIG_EEH)
#include <asm/eeh.h>
#endif

#include "vfio_pci_priv.h"

#define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC     "core driver for VFIO based PCI devices"

static bool nointxmask;
static bool disable_vga;
static bool disable_idle_d3;

/* List of PF's that vfio_pci_core_sriov_configure() has been called on */
static DEFINE_MUTEX(vfio_pci_sriov_pfs_mutex);
static LIST_HEAD(vfio_pci_sriov_pfs);

struct vfio_pci_dummy_resource {
	struct resource		resource;
	int			index;
	struct list_head	res_next;
};

struct vfio_pci_vf_token {
	struct mutex		lock;
	uuid_t			uuid;
	int			users;
};

static inline bool vfio_vga_disabled(void)
{
#ifdef CONFIG_VFIO_PCI_VGA
	return disable_vga;
#else
	return true;
#endif
}

/*
 * Our VGA arbiter participation is limited since we don't know anything
 * about the device itself.  However, if the device is the only VGA device
 * downstream of a bridge and VFIO VGA support is disabled, then we can
 * safely return legacy VGA IO and memory as not decoded since the user
 * has no way to get to it and routing can be disabled externally at the
 * bridge.
 */
static unsigned int vfio_pci_set_decode(struct pci_dev *pdev, bool single_vga)
{
	struct pci_dev *tmp = NULL;
	unsigned char max_busnr;
	unsigned int decodes;

	if (single_vga || !vfio_vga_disabled() || pci_is_root_bus(pdev->bus))
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
		       VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;

	max_busnr = pci_bus_max_busnr(pdev->bus);
	decodes = VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;

	while ((tmp = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, tmp)) != NULL) {
		if (tmp == pdev ||
		    pci_domain_nr(tmp->bus) != pci_domain_nr(pdev->bus) ||
		    pci_is_root_bus(tmp->bus))
			continue;

		if (tmp->bus->number >= pdev->bus->number &&
		    tmp->bus->number <= max_busnr) {
			pci_dev_put(tmp);
			decodes |= VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;
			break;
		}
	}

	return decodes;
}
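/*
 * Probe each standard BAR and record in bar_mmap_supported[] whether it can
 * be mmap'd by userspace; sub-page BARs that start page-aligned are padded
 * with a dummy resource so no other device can be hot-added into that page.
 */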
static void vfio_pci_probe_mmaps(struct vfio_pci_core_device *vdev)
{
	struct resource *res;
	int i;
	struct vfio_pci_dummy_resource *dummy_res;

	for (i = 0; i < PCI_STD_NUM_BARS; i++) {
		int bar = i + PCI_STD_RESOURCES;

		res = &vdev->pdev->resource[bar];

		if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
			goto no_mmap;

		if (!(res->flags & IORESOURCE_MEM))
			goto no_mmap;

		/*
		 * The PCI core shouldn't set up a resource with a
		 * type but zero size. But there may be bugs that
		 * cause us to do that.
		 */
		if (!resource_size(res))
			goto no_mmap;

		if (resource_size(res) >= PAGE_SIZE) {
			vdev->bar_mmap_supported[bar] = true;
			continue;
		}

		if (!(res->start & ~PAGE_MASK)) {
			/*
			 * Add a dummy resource to reserve the remainder
			 * of the exclusive page in case that hot-add
			 * device's bar is assigned into it.
			 */
			dummy_res =
				kzalloc(sizeof(*dummy_res), GFP_KERNEL_ACCOUNT);
			if (dummy_res == NULL)
				goto no_mmap;

			dummy_res->resource.name = "vfio sub-page reserved";
			dummy_res->resource.start = res->end + 1;
			dummy_res->resource.end = res->start + PAGE_SIZE - 1;
			dummy_res->resource.flags = res->flags;
			if (request_resource(res->parent,
						&dummy_res->resource)) {
				kfree(dummy_res);
				goto no_mmap;
			}
			dummy_res->index = bar;
			list_add(&dummy_res->res_next,
					&vdev->dummy_resources_list);
			vdev->bar_mmap_supported[bar] = true;
			continue;
		}
		/*
		 * Here we don't handle the case when the BAR is not page
		 * aligned because we can't expect the BAR will be
		 * assigned into the same location in a page in guest
		 * when we passthrough the BAR. And it's hard to access
		 * this BAR in userspace because we have no way to get
		 * the BAR's location in a page.
		 */
no_mmap:
		vdev->bar_mmap_supported[bar] = false;
	}
}

struct vfio_pci_group_info;
static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set);
static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
				      struct vfio_pci_group_info *groups,
				      struct iommufd_ctx *iommufd_ctx);

/*
 * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND
 * _and_ the ability to detect when the device is asserting INTx via PCI_STATUS.
 * If a device implements the former but not the latter we would typically
 * expect broken_intx_masking be set and require an exclusive interrupt.
 * However since we do have control of the device's ability to assert INTx,
 * we can instead pretend that the device does not implement INTx, virtualizing
 * the pin register to report zero and maintaining DisINTx set on the host.
 */
static bool vfio_pci_nointx(struct pci_dev *pdev)
{
	switch (pdev->vendor) {
	case PCI_VENDOR_ID_INTEL:
		switch (pdev->device) {
		/* All i40e (XL710/X710/XXV710) 10/20/25/40GbE NICs */
		case 0x1572:
		case 0x1574:
		case 0x1580 ... 0x1581:
		case 0x1583 ... 0x158b:
		case 0x37d0 ... 0x37d2:
		/* X550 */
		case 0x1563:
			return true;
		default:
			return false;
		}
	}

	return false;
}

static void vfio_pci_probe_power_state(struct vfio_pci_core_device *vdev)
{
	struct pci_dev *pdev = vdev->pdev;
	u16 pmcsr;

	if (!pdev->pm_cap)
		return;

	pci_read_config_word(pdev, pdev->pm_cap + PCI_PM_CTRL, &pmcsr);

	vdev->needs_pm_restore = !(pmcsr & PCI_PM_CTRL_NO_SOFT_RESET);
}

/*
 * pci_set_power_state() wrapper handling devices which perform a soft reset on
 * D3->D0 transition.  Save state prior to D0/1/2->D3, stash it on the vdev,
 * restore when returned to D0.  Saved separately from pci_saved_state for use
 * by PM capability emulation and separately from pci_dev internal saved state
 * to avoid it being overwritten and consumed around other resets.
 */
int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t state)
{
	struct pci_dev *pdev = vdev->pdev;
	bool needs_restore = false, needs_save = false;
	int ret;

	/* Prevent changing power state for PFs with VFs enabled */
	if (pci_num_vf(pdev) && state > PCI_D0)
		return -EBUSY;

	if (vdev->needs_pm_restore) {
		if (pdev->current_state < PCI_D3hot && state >= PCI_D3hot) {
			pci_save_state(pdev);
			needs_save = true;
		}

		if (pdev->current_state >= PCI_D3hot && state <= PCI_D0)
			needs_restore = true;
	}

	ret = pci_set_power_state(pdev, state);

	if (!ret) {
		/* D3 might be unsupported via quirk, skip unless in D3 */
		if (needs_save && pdev->current_state >= PCI_D3hot) {
			/*
			 * The current PCI state will be saved locally in
			 * 'pm_save' during the D3hot transition. When the
			 * device state is changed to D0 again with the current
			 * function, then pci_store_saved_state() will restore
			 * the state and will free the memory pointed by
			 * 'pm_save'. There are few cases where the PCI power
			 * state can be changed to D0 without the involvement
			 * of the driver. For these cases, free the earlier
			 * allocated memory first before overwriting 'pm_save'
			 * to prevent the memory leak.
			 */
			kfree(vdev->pm_save);
			vdev->pm_save = pci_store_saved_state(pdev);
		} else if (needs_restore) {
			pci_load_and_free_saved_state(pdev, &vdev->pm_save);
			pci_restore_state(pdev);
		}
	}

	return ret;
}
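/*
 * Low power feature entry: mark the device as runtime-PM engaged and drop the
 * runtime PM usage count so the device may enter a runtime suspended state
 * once the ioctl returns.  An optional eventfd is stashed so a wakeup can be
 * signaled to userspace from the runtime resume path.
 */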
static int vfio_pci_runtime_pm_entry(struct vfio_pci_core_device *vdev,
				     struct eventfd_ctx *efdctx)
{
	/*
	 * The vdev power related flags are protected with 'memory_lock'
	 * semaphore.
	 */
	vfio_pci_zap_and_down_write_memory_lock(vdev);
	if (vdev->pm_runtime_engaged) {
		up_write(&vdev->memory_lock);
		return -EINVAL;
	}

	vdev->pm_runtime_engaged = true;
	vdev->pm_wake_eventfd_ctx = efdctx;
	pm_runtime_put_noidle(&vdev->pdev->dev);
	up_write(&vdev->memory_lock);

	return 0;
}

static int vfio_pci_core_pm_entry(struct vfio_device *device, u32 flags,
				  void __user *arg, size_t argsz)
{
	struct vfio_pci_core_device *vdev =
		container_of(device, struct vfio_pci_core_device, vdev);
	int ret;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0);
	if (ret != 1)
		return ret;

	/*
	 * Inside vfio_pci_runtime_pm_entry(), only the runtime PM usage count
	 * will be decremented. The pm_runtime_put() will be invoked again
	 * while returning from the ioctl and then the device can go into
	 * runtime suspended state.
	 */
	return vfio_pci_runtime_pm_entry(vdev, NULL);
}

static int vfio_pci_core_pm_entry_with_wakeup(
	struct vfio_device *device, u32 flags,
	struct vfio_device_low_power_entry_with_wakeup __user *arg,
	size_t argsz)
{
	struct vfio_pci_core_device *vdev =
		container_of(device, struct vfio_pci_core_device, vdev);
	struct vfio_device_low_power_entry_with_wakeup entry;
	struct eventfd_ctx *efdctx;
	int ret;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET,
				 sizeof(entry));
	if (ret != 1)
		return ret;

	if (copy_from_user(&entry, arg, sizeof(entry)))
		return -EFAULT;

	if (entry.wakeup_eventfd < 0)
		return -EINVAL;

	efdctx = eventfd_ctx_fdget(entry.wakeup_eventfd);
	if (IS_ERR(efdctx))
		return PTR_ERR(efdctx);

	ret = vfio_pci_runtime_pm_entry(vdev, efdctx);
	if (ret)
		eventfd_ctx_put(efdctx);

	return ret;
}

static void __vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev)
{
	if (vdev->pm_runtime_engaged) {
		vdev->pm_runtime_engaged = false;
		pm_runtime_get_noresume(&vdev->pdev->dev);

		if (vdev->pm_wake_eventfd_ctx) {
			eventfd_ctx_put(vdev->pm_wake_eventfd_ctx);
			vdev->pm_wake_eventfd_ctx = NULL;
		}
	}
}

static void vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev)
{
	/*
	 * The vdev power related flags are protected with 'memory_lock'
	 * semaphore.
	 */
	down_write(&vdev->memory_lock);
	__vfio_pci_runtime_pm_exit(vdev);
	up_write(&vdev->memory_lock);
}

static int vfio_pci_core_pm_exit(struct vfio_device *device, u32 flags,
				 void __user *arg, size_t argsz)
{
	struct vfio_pci_core_device *vdev =
		container_of(device, struct vfio_pci_core_device, vdev);
	int ret;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0);
	if (ret != 1)
		return ret;

	/*
	 * The device is always in the active state here due to pm wrappers
	 * around ioctls. If the device had entered a low power state and
	 * pm_wake_eventfd_ctx is valid, vfio_pci_core_runtime_resume() has
	 * already signaled the eventfd and exited low power mode itself.
	 * pm_runtime_engaged protects the redundant call here.
	 */
	vfio_pci_runtime_pm_exit(vdev);
	return 0;
}
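/*
 * Driver-core runtime PM callbacks, wired into vfio_pci_core_pm_ops below.
 */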
#ifdef CONFIG_PM
static int vfio_pci_core_runtime_suspend(struct device *dev)
{
	struct vfio_pci_core_device *vdev = dev_get_drvdata(dev);

	down_write(&vdev->memory_lock);
	/*
	 * The user can move the device into D3hot state before invoking
	 * power management IOCTL. Move the device into D0 state here and then
	 * the pci-driver core runtime PM suspend function will move the device
	 * into the low power state. Also, for the devices which have
	 * NoSoftRst-, it will help in restoring the original state
	 * (saved locally in 'vdev->pm_save').
	 */
	vfio_pci_set_power_state(vdev, PCI_D0);
	up_write(&vdev->memory_lock);

	/*
	 * If INTx is enabled, then mask INTx before going into the runtime
	 * suspended state and unmask the same in the runtime resume.
	 * If INTx has already been masked by the user, then
	 * vfio_pci_intx_mask() will return false and in that case, INTx
	 * should not be unmasked in the runtime resume.
	 */
	vdev->pm_intx_masked = ((vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) &&
				vfio_pci_intx_mask(vdev));

	return 0;
}

static int vfio_pci_core_runtime_resume(struct device *dev)
{
	struct vfio_pci_core_device *vdev = dev_get_drvdata(dev);

	/*
	 * Resume with a pm_wake_eventfd_ctx signals the eventfd and exit
	 * low power mode.
	 */
	down_write(&vdev->memory_lock);
	if (vdev->pm_wake_eventfd_ctx) {
		eventfd_signal(vdev->pm_wake_eventfd_ctx);
		__vfio_pci_runtime_pm_exit(vdev);
	}
	up_write(&vdev->memory_lock);

	if (vdev->pm_intx_masked)
		vfio_pci_intx_unmask(vdev);

	return 0;
}
#endif /* CONFIG_PM */

/*
 * The pci-driver core runtime PM routines always save the device state
 * before going into suspended state. If the device is going into low power
 * state only with runtime PM ops, then no explicit handling is needed
 * for the devices which have NoSoftRst-.
 */
static const struct dev_pm_ops vfio_pci_core_pm_ops = {
	SET_RUNTIME_PM_OPS(vfio_pci_core_runtime_suspend,
			   vfio_pci_core_runtime_resume,
			   NULL)
};
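/*
 * Enable the physical device for use by a vfio user: reset it if possible,
 * snapshot its config space, virtualize INTx where masking is broken, and
 * cache the MSI-X table location for later mmap/interrupt handling.
 */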
int vfio_pci_core_enable(struct vfio_pci_core_device *vdev)
{
	struct pci_dev *pdev = vdev->pdev;
	int ret;
	u16 cmd;
	u8 msix_pos;

	if (!disable_idle_d3) {
		ret = pm_runtime_resume_and_get(&pdev->dev);
		if (ret < 0)
			return ret;
	}

	/* Don't allow our initial saved state to include busmaster */
	pci_clear_master(pdev);

	ret = pci_enable_device(pdev);
	if (ret)
		goto out_power;

	/* If reset fails because of the device lock, fail this path entirely */
	ret = pci_try_reset_function(pdev);
	if (ret == -EAGAIN)
		goto out_disable_device;

	vdev->reset_works = !ret;
	pci_save_state(pdev);
	vdev->pci_saved_state = pci_store_saved_state(pdev);
	if (!vdev->pci_saved_state)
		pci_dbg(pdev, "%s: Couldn't store saved state\n", __func__);

	if (likely(!nointxmask)) {
		if (vfio_pci_nointx(pdev)) {
			pci_info(pdev, "Masking broken INTx support\n");
			vdev->nointx = true;
			pci_intx(pdev, 0);
		} else
			vdev->pci_2_3 = pci_intx_mask_supported(pdev);
	}

	pci_read_config_word(pdev, PCI_COMMAND, &cmd);
	if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) {
		cmd &= ~PCI_COMMAND_INTX_DISABLE;
		pci_write_config_word(pdev, PCI_COMMAND, cmd);
	}

	ret = vfio_pci_zdev_open_device(vdev);
	if (ret)
		goto out_free_state;

	ret = vfio_config_init(vdev);
	if (ret)
		goto out_free_zdev;

	msix_pos = pdev->msix_cap;
	if (msix_pos) {
		u16 flags;
		u32 table;

		pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags);
		pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table);

		vdev->msix_bar = table & PCI_MSIX_TABLE_BIR;
		vdev->msix_offset = table & PCI_MSIX_TABLE_OFFSET;
		vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16;
		vdev->has_dyn_msix = pci_msix_can_alloc_dyn(pdev);
	} else {
		vdev->msix_bar = 0xFF;
		vdev->has_dyn_msix = false;
	}

	if (!vfio_vga_disabled() && vfio_pci_is_vga(pdev))
		vdev->has_vga = true;

	return 0;

out_free_zdev:
	vfio_pci_zdev_close_device(vdev);
out_free_state:
	kfree(vdev->pci_saved_state);
	vdev->pci_saved_state = NULL;
out_disable_device:
	pci_disable_device(pdev);
out_power:
	if (!disable_idle_d3)
		pm_runtime_put(&pdev->dev);
	return ret;
}
EXPORT_SYMBOL_GPL(vfio_pci_core_enable);

void vfio_pci_core_disable(struct vfio_pci_core_device *vdev)
{
	struct pci_dev *pdev = vdev->pdev;
	struct vfio_pci_dummy_resource *dummy_res, *tmp;
	struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp;
	int i, bar;

	/* For needs_reset */
	lockdep_assert_held(&vdev->vdev.dev_set->lock);

	/*
	 * This function can be invoked while the power state is non-D0.
	 * This non-D0 power state can be with or without runtime PM.
	 * vfio_pci_runtime_pm_exit() will internally increment the usage
	 * count corresponding to pm_runtime_put() called during low power
	 * feature entry and then pm_runtime_resume() will wake up the device,
	 * if the device has already gone into the suspended state. Otherwise,
	 * the vfio_pci_set_power_state() will change the device power state
	 * to D0.
	 */
	vfio_pci_runtime_pm_exit(vdev);
	pm_runtime_resume(&pdev->dev);

	/*
	 * This function calls __pci_reset_function_locked() which internally
	 * can use pci_pm_reset() for the function reset. pci_pm_reset() will
	 * fail if the power state is non-D0. Also, for the devices which
	 * have NoSoftRst-, the reset function can cause the PCI config space
	 * reset without restoring the original state (saved locally in
	 * 'vdev->pm_save').
	 */
	vfio_pci_set_power_state(vdev, PCI_D0);

	/* Stop the device from further DMA */
	pci_clear_master(pdev);

	vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
				VFIO_IRQ_SET_ACTION_TRIGGER,
				vdev->irq_type, 0, 0, NULL);

	/* Device closed, don't need mutex here */
	list_for_each_entry_safe(ioeventfd, ioeventfd_tmp,
				 &vdev->ioeventfds_list, next) {
		vfio_virqfd_disable(&ioeventfd->virqfd);
		list_del(&ioeventfd->next);
		kfree(ioeventfd);
	}
	vdev->ioeventfds_nr = 0;

	vdev->virq_disabled = false;

	for (i = 0; i < vdev->num_regions; i++)
		vdev->region[i].ops->release(vdev, &vdev->region[i]);

	vdev->num_regions = 0;
	kfree(vdev->region);
	vdev->region = NULL; /* don't krealloc a freed pointer */

	vfio_config_free(vdev);

	for (i = 0; i < PCI_STD_NUM_BARS; i++) {
		bar = i + PCI_STD_RESOURCES;
		if (!vdev->barmap[bar])
			continue;
		pci_iounmap(pdev, vdev->barmap[bar]);
		pci_release_selected_regions(pdev, 1 << bar);
		vdev->barmap[bar] = NULL;
	}

	list_for_each_entry_safe(dummy_res, tmp,
				 &vdev->dummy_resources_list, res_next) {
		list_del(&dummy_res->res_next);
		release_resource(&dummy_res->resource);
		kfree(dummy_res);
	}

	vdev->needs_reset = true;

	vfio_pci_zdev_close_device(vdev);

	/*
	 * If we have saved state, restore it.  If we can reset the device,
	 * even better.  Resetting with current state seems better than
	 * nothing, but saving and restoring current state without reset
	 * is just busy work.
	 */
	if (pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state)) {
		pci_info(pdev, "%s: Couldn't reload saved state\n", __func__);

		if (!vdev->reset_works)
			goto out;

		pci_save_state(pdev);
	}

	/*
	 * Disable INTx and MSI, presumably to avoid spurious interrupts
	 * during reset.  Stolen from pci_reset_function()
	 */
	pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE);

	/*
	 * Try to get the locks ourselves to prevent a deadlock. The
	 * success of this is dependent on being able to lock the device,
	 * which is not always possible.
	 * We can not use the "try" reset interface here, which will
	 * overwrite the previously restored configuration information.
	 */
	if (vdev->reset_works && pci_dev_trylock(pdev)) {
		if (!__pci_reset_function_locked(pdev))
			vdev->needs_reset = false;
		pci_dev_unlock(pdev);
	}

	pci_restore_state(pdev);
out:
	pci_disable_device(pdev);

	vfio_pci_dev_set_try_reset(vdev->vdev.dev_set);

	/* Put the pm-runtime usage counter acquired during enable */
	if (!disable_idle_d3)
		pm_runtime_put(&pdev->dev);
}
EXPORT_SYMBOL_GPL(vfio_pci_core_disable);
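/*
 * Called when the last user closes the device: drop any SR-IOV VF token
 * reference on the PF, disable the device, and release err/req eventfds.
 */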
void vfio_pci_core_close_device(struct vfio_device *core_vdev)
{
	struct vfio_pci_core_device *vdev =
		container_of(core_vdev, struct vfio_pci_core_device, vdev);

	if (vdev->sriov_pf_core_dev) {
		mutex_lock(&vdev->sriov_pf_core_dev->vf_token->lock);
		WARN_ON(!vdev->sriov_pf_core_dev->vf_token->users);
		vdev->sriov_pf_core_dev->vf_token->users--;
		mutex_unlock(&vdev->sriov_pf_core_dev->vf_token->lock);
	}
#if IS_ENABLED(CONFIG_EEH)
	eeh_dev_release(vdev->pdev);
#endif
	vfio_pci_core_disable(vdev);

	mutex_lock(&vdev->igate);
	if (vdev->err_trigger) {
		eventfd_ctx_put(vdev->err_trigger);
		vdev->err_trigger = NULL;
	}
	if (vdev->req_trigger) {
		eventfd_ctx_put(vdev->req_trigger);
		vdev->req_trigger = NULL;
	}
	mutex_unlock(&vdev->igate);
}
EXPORT_SYMBOL_GPL(vfio_pci_core_close_device);

void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev)
{
	vfio_pci_probe_mmaps(vdev);
#if IS_ENABLED(CONFIG_EEH)
	eeh_dev_open(vdev->pdev);
#endif

	if (vdev->sriov_pf_core_dev) {
		mutex_lock(&vdev->sriov_pf_core_dev->vf_token->lock);
		vdev->sriov_pf_core_dev->vf_token->users++;
		mutex_unlock(&vdev->sriov_pf_core_dev->vf_token->lock);
	}
}
EXPORT_SYMBOL_GPL(vfio_pci_core_finish_enable);
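/* Number of interrupts available for a given VFIO IRQ index on this device. */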
static int vfio_pci_get_irq_count(struct vfio_pci_core_device *vdev, int irq_type)
{
	if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
		u8 pin;

		if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) ||
		    vdev->nointx || vdev->pdev->is_virtfn)
			return 0;

		pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin);

		return pin ? 1 : 0;
	} else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
		u8 pos;
		u16 flags;

		pos = vdev->pdev->msi_cap;
		if (pos) {
			pci_read_config_word(vdev->pdev,
					     pos + PCI_MSI_FLAGS, &flags);
			return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1);
		}
	} else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
		u8 pos;
		u16 flags;

		pos = vdev->pdev->msix_cap;
		if (pos) {
			pci_read_config_word(vdev->pdev,
					     pos + PCI_MSIX_FLAGS, &flags);

			return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
		}
	} else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) {
		if (pci_is_pcie(vdev->pdev))
			return 1;
	} else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) {
		return 1;
	}

	return 0;
}

static int vfio_pci_count_devs(struct pci_dev *pdev, void *data)
{
	(*(int *)data)++;
	return 0;
}

struct vfio_pci_fill_info {
	struct vfio_device *vdev;
	struct vfio_pci_dependent_device *devices;
	int nr_devices;
	u32 count;
	u32 flags;
};
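/*
 * Per-device callback for hot reset info: record segment/bus/devfn and, for
 * cdev (iommufd) users, a device ID or ownership marker; for group users,
 * the iommu group ID.
 */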
static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
{
	struct vfio_pci_dependent_device *info;
	struct vfio_pci_fill_info *fill = data;

	/* The topology changed since we counted devices */
	if (fill->count >= fill->nr_devices)
		return -EAGAIN;

	info = &fill->devices[fill->count++];
	info->segment = pci_domain_nr(pdev->bus);
	info->bus = pdev->bus->number;
	info->devfn = pdev->devfn;

	if (fill->flags & VFIO_PCI_HOT_RESET_FLAG_DEV_ID) {
		struct iommufd_ctx *iommufd = vfio_iommufd_device_ictx(fill->vdev);
		struct vfio_device_set *dev_set = fill->vdev->dev_set;
		struct vfio_device *vdev;

		/*
		 * hot-reset requires all affected devices be represented in
		 * the dev_set.
		 */
		vdev = vfio_find_device_in_devset(dev_set, &pdev->dev);
		if (!vdev) {
			info->devid = VFIO_PCI_DEVID_NOT_OWNED;
		} else {
			int id = vfio_iommufd_get_dev_id(vdev, iommufd);

			if (id > 0)
				info->devid = id;
			else if (id == -ENOENT)
				info->devid = VFIO_PCI_DEVID_OWNED;
			else
				info->devid = VFIO_PCI_DEVID_NOT_OWNED;
		}
		/* If devid is VFIO_PCI_DEVID_NOT_OWNED, clear owned flag. */
		if (info->devid == VFIO_PCI_DEVID_NOT_OWNED)
			fill->flags &= ~VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED;
	} else {
		struct iommu_group *iommu_group;

		iommu_group = iommu_group_get(&pdev->dev);
		if (!iommu_group)
			return -EPERM; /* Cannot reset non-isolated devices */

		info->group_id = iommu_group_id(iommu_group);
		iommu_group_put(iommu_group);
	}

	return 0;
}

struct vfio_pci_group_info {
	int count;
	struct file **files;
};

static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot)
{
	for (; pdev; pdev = pdev->bus->self)
		if (pdev->bus == slot->bus)
			return (pdev->slot == slot);
	return false;
}

struct vfio_pci_walk_info {
	int (*fn)(struct pci_dev *pdev, void *data);
	void *data;
	struct pci_dev *pdev;
	bool slot;
	int ret;
};

static int vfio_pci_walk_wrapper(struct pci_dev *pdev, void *data)
{
	struct vfio_pci_walk_info *walk = data;

	if (!walk->slot || vfio_pci_dev_below_slot(pdev, walk->pdev->slot))
		walk->ret = walk->fn(pdev, walk->data);

	return walk->ret;
}

static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
					 int (*fn)(struct pci_dev *,
						   void *data), void *data,
					 bool slot)
{
	struct vfio_pci_walk_info walk = {
		.fn = fn, .data = data, .pdev = pdev, .slot = slot, .ret = 0,
	};

	pci_walk_bus(pdev->bus, vfio_pci_walk_wrapper, &walk);

	return walk.ret;
}

static int msix_mmappable_cap(struct vfio_pci_core_device *vdev,
			      struct vfio_info_cap *caps)
{
	struct vfio_info_cap_header header = {
		.id = VFIO_REGION_INFO_CAP_MSIX_MAPPABLE,
		.version = 1
	};

	return vfio_info_add_capability(caps, &header, sizeof(header));
}
int vfio_pci_core_register_dev_region(struct vfio_pci_core_device *vdev,
				      unsigned int type, unsigned int subtype,
				      const struct vfio_pci_regops *ops,
				      size_t size, u32 flags, void *data)
{
	struct vfio_pci_region *region;

	region = krealloc(vdev->region,
			  (vdev->num_regions + 1) * sizeof(*region),
			  GFP_KERNEL_ACCOUNT);
	if (!region)
		return -ENOMEM;

	vdev->region = region;
	vdev->region[vdev->num_regions].type = type;
	vdev->region[vdev->num_regions].subtype = subtype;
	vdev->region[vdev->num_regions].ops = ops;
	vdev->region[vdev->num_regions].size = size;
	vdev->region[vdev->num_regions].flags = flags;
	vdev->region[vdev->num_regions].data = data;

	vdev->num_regions++;

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_pci_core_register_dev_region);

static int vfio_pci_info_atomic_cap(struct vfio_pci_core_device *vdev,
				    struct vfio_info_cap *caps)
{
	struct vfio_device_info_cap_pci_atomic_comp cap = {
		.header.id = VFIO_DEVICE_INFO_CAP_PCI_ATOMIC_COMP,
		.header.version = 1
	};
	struct pci_dev *pdev = pci_physfn(vdev->pdev);
	u32 devcap2;

	pcie_capability_read_dword(pdev, PCI_EXP_DEVCAP2, &devcap2);

	if ((devcap2 & PCI_EXP_DEVCAP2_ATOMIC_COMP32) &&
	    !pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP32))
		cap.flags |= VFIO_PCI_ATOMIC_COMP32;

	if ((devcap2 & PCI_EXP_DEVCAP2_ATOMIC_COMP64) &&
	    !pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP64))
		cap.flags |= VFIO_PCI_ATOMIC_COMP64;

	if ((devcap2 & PCI_EXP_DEVCAP2_ATOMIC_COMP128) &&
	    !pci_enable_atomic_ops_to_root(pdev,
					   PCI_EXP_DEVCAP2_ATOMIC_COMP128))
		cap.flags |= VFIO_PCI_ATOMIC_COMP128;

	if (!cap.flags)
		return -ENODEV;

	return vfio_info_add_capability(caps, &cap.header, sizeof(cap));
}

static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev,
				   struct vfio_device_info __user *arg)
{
	unsigned long minsz = offsetofend(struct vfio_device_info, num_irqs);
	struct vfio_device_info info = {};
	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
	int ret;

	if (copy_from_user(&info, arg, minsz))
		return -EFAULT;

	if (info.argsz < minsz)
		return -EINVAL;

	minsz = min_t(size_t, info.argsz, sizeof(info));

	info.flags = VFIO_DEVICE_FLAGS_PCI;

	if (vdev->reset_works)
		info.flags |= VFIO_DEVICE_FLAGS_RESET;

	info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions;
	info.num_irqs = VFIO_PCI_NUM_IRQS;

	ret = vfio_pci_info_zdev_add_caps(vdev, &caps);
	if (ret && ret != -ENODEV) {
		pci_warn(vdev->pdev,
			 "Failed to setup zPCI info capabilities\n");
		return ret;
	}

	ret = vfio_pci_info_atomic_cap(vdev, &caps);
	if (ret && ret != -ENODEV) {
		pci_warn(vdev->pdev,
			 "Failed to setup AtomicOps info capability\n");
		return ret;
	}

	if (caps.size) {
		info.flags |= VFIO_DEVICE_FLAGS_CAPS;
		if (info.argsz < sizeof(info) + caps.size) {
			info.argsz = sizeof(info) + caps.size;
		} else {
			vfio_info_cap_shift(&caps, sizeof(info));
			if (copy_to_user(arg + 1, caps.buf, caps.size)) {
				kfree(caps.buf);
				return -EFAULT;
			}
			info.cap_offset = sizeof(*arg);
		}

		kfree(caps.buf);
	}

	return copy_to_user(arg, &info, minsz) ? -EFAULT : 0;
}
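/*
 * VFIO_DEVICE_GET_REGION_INFO: report size, offset and capabilities for the
 * fixed PCI regions (config, BARs, ROM, VGA) and any device-specific regions.
 */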
static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev,
					  struct vfio_region_info __user *arg)
{
	unsigned long minsz = offsetofend(struct vfio_region_info, offset);
	struct pci_dev *pdev = vdev->pdev;
	struct vfio_region_info info;
	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
	int i, ret;

	if (copy_from_user(&info, arg, minsz))
		return -EFAULT;

	if (info.argsz < minsz)
		return -EINVAL;

	switch (info.index) {
	case VFIO_PCI_CONFIG_REGION_INDEX:
		info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
		info.size = pdev->cfg_size;
		info.flags = VFIO_REGION_INFO_FLAG_READ |
			     VFIO_REGION_INFO_FLAG_WRITE;
		break;
	case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
		info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
		info.size = pci_resource_len(pdev, info.index);
		if (!info.size) {
			info.flags = 0;
			break;
		}

		info.flags = VFIO_REGION_INFO_FLAG_READ |
			     VFIO_REGION_INFO_FLAG_WRITE;
		if (vdev->bar_mmap_supported[info.index]) {
			info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
			if (info.index == vdev->msix_bar) {
				ret = msix_mmappable_cap(vdev, &caps);
				if (ret)
					return ret;
			}
		}

		break;
	case VFIO_PCI_ROM_REGION_INDEX: {
		void __iomem *io;
		size_t size;
		u16 cmd;

		info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
		info.flags = 0;

		/* Report the BAR size, not the ROM size */
		info.size = pci_resource_len(pdev, info.index);
		if (!info.size) {
			/* Shadow ROMs appear as PCI option ROMs */
			if (pdev->resource[PCI_ROM_RESOURCE].flags &
						IORESOURCE_ROM_SHADOW)
				info.size = 0x20000;
			else
				break;
		}

		/*
		 * Is it really there?  Enable memory decode for implicit access
		 * in pci_map_rom().
		 */
		cmd = vfio_pci_memory_lock_and_enable(vdev);
		io = pci_map_rom(pdev, &size);
		if (io) {
			info.flags = VFIO_REGION_INFO_FLAG_READ;
			pci_unmap_rom(pdev, io);
		} else {
			info.size = 0;
		}
		vfio_pci_memory_unlock_and_restore(vdev, cmd);

		break;
	}
	case VFIO_PCI_VGA_REGION_INDEX:
		if (!vdev->has_vga)
			return -EINVAL;

		info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
		info.size = 0xc0000;
		info.flags = VFIO_REGION_INFO_FLAG_READ |
			     VFIO_REGION_INFO_FLAG_WRITE;

		break;
	default: {
		struct vfio_region_info_cap_type cap_type = {
			.header.id = VFIO_REGION_INFO_CAP_TYPE,
			.header.version = 1 };

		if (info.index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
			return -EINVAL;
		info.index = array_index_nospec(
			info.index, VFIO_PCI_NUM_REGIONS + vdev->num_regions);

		i = info.index - VFIO_PCI_NUM_REGIONS;

		info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
		info.size = vdev->region[i].size;
		info.flags = vdev->region[i].flags;

		cap_type.type = vdev->region[i].type;
		cap_type.subtype = vdev->region[i].subtype;

		ret = vfio_info_add_capability(&caps, &cap_type.header,
					       sizeof(cap_type));
		if (ret)
			return ret;

		if (vdev->region[i].ops->add_capability) {
			ret = vdev->region[i].ops->add_capability(
				vdev, &vdev->region[i], &caps);
			if (ret)
				return ret;
		}
	}
	}

	if (caps.size) {
		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
		if (info.argsz < sizeof(info) + caps.size) {
			info.argsz = sizeof(info) + caps.size;
			info.cap_offset = 0;
		} else {
			vfio_info_cap_shift(&caps, sizeof(info));
			if (copy_to_user(arg + 1, caps.buf, caps.size)) {
				kfree(caps.buf);
				return -EFAULT;
			}
			info.cap_offset = sizeof(*arg);
		}

		kfree(caps.buf);
	}

	return copy_to_user(arg, &info, minsz) ? -EFAULT : 0;
}
static int vfio_pci_ioctl_get_irq_info(struct vfio_pci_core_device *vdev,
				       struct vfio_irq_info __user *arg)
{
	unsigned long minsz = offsetofend(struct vfio_irq_info, count);
	struct vfio_irq_info info;

	if (copy_from_user(&info, arg, minsz))
		return -EFAULT;

	if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
		return -EINVAL;

	switch (info.index) {
	case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX:
	case VFIO_PCI_REQ_IRQ_INDEX:
		break;
	case VFIO_PCI_ERR_IRQ_INDEX:
		if (pci_is_pcie(vdev->pdev))
			break;
		fallthrough;
	default:
		return -EINVAL;
	}

	info.flags = VFIO_IRQ_INFO_EVENTFD;

	info.count = vfio_pci_get_irq_count(vdev, info.index);

	if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
		info.flags |= (VFIO_IRQ_INFO_MASKABLE |
			       VFIO_IRQ_INFO_AUTOMASKED);
	else if (info.index != VFIO_PCI_MSIX_IRQ_INDEX || !vdev->has_dyn_msix)
		info.flags |= VFIO_IRQ_INFO_NORESIZE;

	return copy_to_user(arg, &info, minsz) ? -EFAULT : 0;
}

static int vfio_pci_ioctl_set_irqs(struct vfio_pci_core_device *vdev,
				   struct vfio_irq_set __user *arg)
{
	unsigned long minsz = offsetofend(struct vfio_irq_set, count);
	struct vfio_irq_set hdr;
	u8 *data = NULL;
	int max, ret = 0;
	size_t data_size = 0;

	if (copy_from_user(&hdr, arg, minsz))
		return -EFAULT;

	max = vfio_pci_get_irq_count(vdev, hdr.index);

	ret = vfio_set_irqs_validate_and_prepare(&hdr, max, VFIO_PCI_NUM_IRQS,
						 &data_size);
	if (ret)
		return ret;

	if (data_size) {
		data = memdup_user(&arg->data, data_size);
		if (IS_ERR(data))
			return PTR_ERR(data);
	}

	mutex_lock(&vdev->igate);

	ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index, hdr.start,
				      hdr.count, data);

	mutex_unlock(&vdev->igate);
	kfree(data);

	return ret;
}

static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev,
				void __user *arg)
{
	int ret;

	if (!vdev->reset_works)
		return -EINVAL;

	vfio_pci_zap_and_down_write_memory_lock(vdev);

	/*
	 * This function can be invoked while the power state is non-D0. If
	 * pci_try_reset_function() has been called while the power state is
	 * non-D0, then pci_try_reset_function() will internally set the power
	 * state to D0 without vfio driver involvement. For the devices which
	 * have NoSoftRst-, the reset function can cause the PCI config space
	 * reset without restoring the original state (saved locally in
	 * 'vdev->pm_save').
	 */
	vfio_pci_set_power_state(vdev, PCI_D0);

	ret = pci_try_reset_function(vdev->pdev);
	up_write(&vdev->memory_lock);

	return ret;
}

static int vfio_pci_ioctl_get_pci_hot_reset_info(
	struct vfio_pci_core_device *vdev,
	struct vfio_pci_hot_reset_info __user *arg)
{
	unsigned long minsz =
		offsetofend(struct vfio_pci_hot_reset_info, count);
	struct vfio_pci_dependent_device *devices = NULL;
	struct vfio_pci_hot_reset_info hdr;
	struct vfio_pci_fill_info fill = {};
	bool slot = false;
	int ret, count = 0;

	if (copy_from_user(&hdr, arg, minsz))
		return -EFAULT;

	if (hdr.argsz < minsz)
		return -EINVAL;

	hdr.flags = 0;

	/* Can we do a slot or bus reset or neither? */
	if (!pci_probe_reset_slot(vdev->pdev->slot))
		slot = true;
	else if (pci_probe_reset_bus(vdev->pdev->bus))
		return -ENODEV;

	ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs,
					    &count, slot);
	if (ret)
		return ret;

	if (WARN_ON(!count)) /* Should always be at least one */
		return -ERANGE;

	if (count > (hdr.argsz - sizeof(hdr)) / sizeof(*devices)) {
		hdr.count = count;
		ret = -ENOSPC;
		goto header;
	}

	devices = kcalloc(count, sizeof(*devices), GFP_KERNEL);
	if (!devices)
		return -ENOMEM;

	fill.devices = devices;
	fill.nr_devices = count;
	fill.vdev = &vdev->vdev;

	if (vfio_device_cdev_opened(&vdev->vdev))
		fill.flags |= VFIO_PCI_HOT_RESET_FLAG_DEV_ID |
			      VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED;

	mutex_lock(&vdev->vdev.dev_set->lock);
	ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_fill_devs,
					    &fill, slot);
	mutex_unlock(&vdev->vdev.dev_set->lock);
	if (ret)
		goto out;

	if (copy_to_user(arg->devices, devices,
			 sizeof(*devices) * fill.count)) {
		ret = -EFAULT;
		goto out;
	}

	hdr.count = fill.count;
	hdr.flags = fill.flags;

header:
	if (copy_to_user(arg, &hdr, minsz))
		ret = -EFAULT;

out:
	kfree(devices);
	return ret;
}
static int
vfio_pci_ioctl_pci_hot_reset_groups(struct vfio_pci_core_device *vdev,
				    u32 array_count, bool slot,
				    struct vfio_pci_hot_reset __user *arg)
{
	int32_t *group_fds;
	struct file **files;
	struct vfio_pci_group_info info;
	int file_idx, count = 0, ret = 0;

	/*
	 * We can't let userspace give us an arbitrarily large buffer to copy,
	 * so verify how many we think there could be.  Note groups can have
	 * multiple devices so one group per device is the max.
	 */
	ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs,
					    &count, slot);
	if (ret)
		return ret;

	if (array_count > count)
		return -EINVAL;

	group_fds = kcalloc(array_count, sizeof(*group_fds), GFP_KERNEL);
	files = kcalloc(array_count, sizeof(*files), GFP_KERNEL);
	if (!group_fds || !files) {
		kfree(group_fds);
		kfree(files);
		return -ENOMEM;
	}

	if (copy_from_user(group_fds, arg->group_fds,
			   array_count * sizeof(*group_fds))) {
		kfree(group_fds);
		kfree(files);
		return -EFAULT;
	}

	/*
	 * Get the group file for each fd to ensure the group is held across
	 * the reset
	 */
	for (file_idx = 0; file_idx < array_count; file_idx++) {
		struct file *file = fget(group_fds[file_idx]);

		if (!file) {
			ret = -EBADF;
			break;
		}

		/* Ensure the FD is a vfio group FD. */
		if (!vfio_file_is_group(file)) {
			fput(file);
			ret = -EINVAL;
			break;
		}

		files[file_idx] = file;
	}

	kfree(group_fds);

	/* release reference to groups on error */
	if (ret)
		goto hot_reset_release;

	info.count = array_count;
	info.files = files;

	ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info, NULL);

hot_reset_release:
	for (file_idx--; file_idx >= 0; file_idx--)
		fput(files[file_idx]);

	kfree(files);
	return ret;
}

static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev,
					struct vfio_pci_hot_reset __user *arg)
{
	unsigned long minsz = offsetofend(struct vfio_pci_hot_reset, count);
	struct vfio_pci_hot_reset hdr;
	bool slot = false;

	if (copy_from_user(&hdr, arg, minsz))
		return -EFAULT;

	if (hdr.argsz < minsz || hdr.flags)
		return -EINVAL;

	/* zero-length array is only for cdev opened devices */
	if (!!hdr.count == vfio_device_cdev_opened(&vdev->vdev))
		return -EINVAL;

	/* Can we do a slot or bus reset or neither? */
	if (!pci_probe_reset_slot(vdev->pdev->slot))
		slot = true;
	else if (pci_probe_reset_bus(vdev->pdev->bus))
		return -ENODEV;

	if (hdr.count)
		return vfio_pci_ioctl_pci_hot_reset_groups(vdev, hdr.count, slot, arg);

	return vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, NULL,
					  vfio_iommufd_device_ictx(&vdev->vdev));
}

static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev,
				    struct vfio_device_ioeventfd __user *arg)
{
	unsigned long minsz = offsetofend(struct vfio_device_ioeventfd, fd);
	struct vfio_device_ioeventfd ioeventfd;
	int count;

	if (copy_from_user(&ioeventfd, arg, minsz))
		return -EFAULT;

	if (ioeventfd.argsz < minsz)
		return -EINVAL;

	if (ioeventfd.flags & ~VFIO_DEVICE_IOEVENTFD_SIZE_MASK)
		return -EINVAL;

	count = ioeventfd.flags & VFIO_DEVICE_IOEVENTFD_SIZE_MASK;

	if (hweight8(count) != 1 || ioeventfd.fd < -1)
		return -EINVAL;

	return vfio_pci_ioeventfd(vdev, ioeventfd.offset, ioeventfd.data, count,
				  ioeventfd.fd);
}

long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
			 unsigned long arg)
{
	struct vfio_pci_core_device *vdev =
		container_of(core_vdev, struct vfio_pci_core_device, vdev);
	void __user *uarg = (void __user *)arg;

	switch (cmd) {
	case VFIO_DEVICE_GET_INFO:
		return vfio_pci_ioctl_get_info(vdev, uarg);
	case VFIO_DEVICE_GET_IRQ_INFO:
		return vfio_pci_ioctl_get_irq_info(vdev, uarg);
	case VFIO_DEVICE_GET_PCI_HOT_RESET_INFO:
		return vfio_pci_ioctl_get_pci_hot_reset_info(vdev, uarg);
	case VFIO_DEVICE_GET_REGION_INFO:
		return vfio_pci_ioctl_get_region_info(vdev, uarg);
	case VFIO_DEVICE_IOEVENTFD:
		return vfio_pci_ioctl_ioeventfd(vdev, uarg);
	case VFIO_DEVICE_PCI_HOT_RESET:
		return vfio_pci_ioctl_pci_hot_reset(vdev, uarg);
	case VFIO_DEVICE_RESET:
		return vfio_pci_ioctl_reset(vdev, uarg);
	case VFIO_DEVICE_SET_IRQS:
		return vfio_pci_ioctl_set_irqs(vdev, uarg);
	default:
		return -ENOTTY;
	}
}
EXPORT_SYMBOL_GPL(vfio_pci_core_ioctl);
static int vfio_pci_core_feature_token(struct vfio_device *device, u32 flags,
				       uuid_t __user *arg, size_t argsz)
{
	struct vfio_pci_core_device *vdev =
		container_of(device, struct vfio_pci_core_device, vdev);
	uuid_t uuid;
	int ret;

	if (!vdev->vf_token)
		return -ENOTTY;
	/*
	 * We do not support GET of the VF Token UUID as this could
	 * expose the token of the previous device user.
	 */
	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET,
				 sizeof(uuid));
	if (ret != 1)
		return ret;

	if (copy_from_user(&uuid, arg, sizeof(uuid)))
		return -EFAULT;

	mutex_lock(&vdev->vf_token->lock);
	uuid_copy(&vdev->vf_token->uuid, &uuid);
	mutex_unlock(&vdev->vf_token->lock);
	return 0;
}

int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
				void __user *arg, size_t argsz)
{
	switch (flags & VFIO_DEVICE_FEATURE_MASK) {
	case VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY:
		return vfio_pci_core_pm_entry(device, flags, arg, argsz);
	case VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP:
		return vfio_pci_core_pm_entry_with_wakeup(device, flags,
							  arg, argsz);
	case VFIO_DEVICE_FEATURE_LOW_POWER_EXIT:
		return vfio_pci_core_pm_exit(device, flags, arg, argsz);
	case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
		return vfio_pci_core_feature_token(device, flags, arg, argsz);
	default:
		return -ENOTTY;
	}
}
EXPORT_SYMBOL_GPL(vfio_pci_core_ioctl_feature);

static ssize_t vfio_pci_rw(struct vfio_pci_core_device *vdev, char __user *buf,
			   size_t count, loff_t *ppos, bool iswrite)
{
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	int ret;

	if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
		return -EINVAL;

	ret = pm_runtime_resume_and_get(&vdev->pdev->dev);
	if (ret) {
		pci_info_ratelimited(vdev->pdev, "runtime resume failed %d\n",
				     ret);
		return -EIO;
	}

	switch (index) {
	case VFIO_PCI_CONFIG_REGION_INDEX:
		ret = vfio_pci_config_rw(vdev, buf, count, ppos, iswrite);
		break;

	case VFIO_PCI_ROM_REGION_INDEX:
		if (iswrite)
			ret = -EINVAL;
		else
			ret = vfio_pci_bar_rw(vdev, buf, count, ppos, false);
		break;

	case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
		ret = vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite);
		break;

	case VFIO_PCI_VGA_REGION_INDEX:
		ret = vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite);
		break;

	default:
		index -= VFIO_PCI_NUM_REGIONS;
		ret = vdev->region[index].ops->rw(vdev, buf,
						  count, ppos, iswrite);
		break;
	}

	pm_runtime_put(&vdev->pdev->dev);
	return ret;
}
ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf,
			   size_t count, loff_t *ppos)
{
	struct vfio_pci_core_device *vdev =
		container_of(core_vdev, struct vfio_pci_core_device, vdev);

	if (!count)
		return 0;

	return vfio_pci_rw(vdev, buf, count, ppos, false);
}
EXPORT_SYMBOL_GPL(vfio_pci_core_read);

ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf,
			    size_t count, loff_t *ppos)
{
	struct vfio_pci_core_device *vdev =
		container_of(core_vdev, struct vfio_pci_core_device, vdev);

	if (!count)
		return 0;

	return vfio_pci_rw(vdev, (char __user *)buf, count, ppos, true);
}
EXPORT_SYMBOL_GPL(vfio_pci_core_write);

static void vfio_pci_zap_bars(struct vfio_pci_core_device *vdev)
{
	struct vfio_device *core_vdev = &vdev->vdev;
	loff_t start = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_BAR0_REGION_INDEX);
	loff_t end = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_ROM_REGION_INDEX);
	loff_t len = end - start;

	unmap_mapping_range(core_vdev->inode->i_mapping, start, len, true);
}

void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev)
{
	down_write(&vdev->memory_lock);
	vfio_pci_zap_bars(vdev);
}

u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev)
{
	u16 cmd;

	down_write(&vdev->memory_lock);
	pci_read_config_word(vdev->pdev, PCI_COMMAND, &cmd);
	if (!(cmd & PCI_COMMAND_MEMORY))
		pci_write_config_word(vdev->pdev, PCI_COMMAND,
				      cmd | PCI_COMMAND_MEMORY);

	return cmd;
}

void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, u16 cmd)
{
	pci_write_config_word(vdev->pdev, PCI_COMMAND, cmd);
	up_write(&vdev->memory_lock);
}

static unsigned long vma_to_pfn(struct vm_area_struct *vma)
{
	struct vfio_pci_core_device *vdev = vma->vm_private_data;
	int index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
	u64 pgoff;

	pgoff = vma->vm_pgoff &
		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);

	return (pci_resource_start(vdev->pdev, index) >> PAGE_SHIFT) + pgoff;
}
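/*
 * Fault handler for BAR mmaps: inserts the PFN (optionally at PMD/PUD order
 * where supported) only while memory decode is enabled and the device is not
 * runtime-PM engaged; otherwise the access faults with SIGBUS.
 */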
static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf,
					   unsigned int order)
{
	struct vm_area_struct *vma = vmf->vma;
	struct vfio_pci_core_device *vdev = vma->vm_private_data;
	unsigned long pfn, pgoff = vmf->pgoff - vma->vm_pgoff;
	vm_fault_t ret = VM_FAULT_SIGBUS;

	if (order && (vmf->address & ((PAGE_SIZE << order) - 1) ||
		      vmf->address + (PAGE_SIZE << order) > vma->vm_end)) {
		ret = VM_FAULT_FALLBACK;
		goto out;
	}

	pfn = vma_to_pfn(vma);

	down_read(&vdev->memory_lock);

	if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev))
		goto out_unlock;

	switch (order) {
	case 0:
		ret = vmf_insert_pfn(vma, vmf->address, pfn + pgoff);
		break;
#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
	case PMD_ORDER:
		ret = vmf_insert_pfn_pmd(vmf, __pfn_to_pfn_t(pfn + pgoff,
							     PFN_DEV), false);
		break;
#endif
#ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP
	case PUD_ORDER:
		ret = vmf_insert_pfn_pud(vmf, __pfn_to_pfn_t(pfn + pgoff,
							     PFN_DEV), false);
		break;
#endif
	default:
		ret = VM_FAULT_FALLBACK;
	}

out_unlock:
	up_read(&vdev->memory_lock);
out:
	dev_dbg_ratelimited(&vdev->pdev->dev,
			   "%s(,order = %d) BAR %ld page offset 0x%lx: 0x%x\n",
			    __func__, order,
			    vma->vm_pgoff >>
			       (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT),
			    pgoff, (unsigned int)ret);

	return ret;
}

static vm_fault_t vfio_pci_mmap_page_fault(struct vm_fault *vmf)
{
	return vfio_pci_mmap_huge_fault(vmf, 0);
}

static const struct vm_operations_struct vfio_pci_mmap_ops = {
	.fault = vfio_pci_mmap_page_fault,
#ifdef CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP
	.huge_fault = vfio_pci_mmap_huge_fault,
#endif
};

int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma)
{
	struct vfio_pci_core_device *vdev =
		container_of(core_vdev, struct vfio_pci_core_device, vdev);
	struct pci_dev *pdev = vdev->pdev;
	unsigned int index;
	u64 phys_len, req_len, pgoff, req_start;
	int ret;

	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);

	if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
		return -EINVAL;
	if (vma->vm_end < vma->vm_start)
		return -EINVAL;
	if ((vma->vm_flags & VM_SHARED) == 0)
		return -EINVAL;
	if (index >= VFIO_PCI_NUM_REGIONS) {
		int regnum = index - VFIO_PCI_NUM_REGIONS;
		struct vfio_pci_region *region = vdev->region + regnum;

		if (region->ops && region->ops->mmap &&
		    (region->flags & VFIO_REGION_INFO_FLAG_MMAP))
			return region->ops->mmap(vdev, region, vma);
		return -EINVAL;
	}
	if (index >= VFIO_PCI_ROM_REGION_INDEX)
		return -EINVAL;
	if (!vdev->bar_mmap_supported[index])
		return -EINVAL;

	phys_len = PAGE_ALIGN(pci_resource_len(pdev, index));
	req_len = vma->vm_end - vma->vm_start;
	pgoff = vma->vm_pgoff &
		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
	req_start = pgoff << PAGE_SHIFT;

	if (req_start + req_len > phys_len)
		return -EINVAL;

	/*
	 * Even though we don't make use of the barmap for the mmap,
	 * we need to request the region and the barmap tracks that.
	 */
	if (!vdev->barmap[index]) {
		ret = pci_request_selected_regions(pdev,
						   1 << index, "vfio-pci");
		if (ret)
			return ret;

		vdev->barmap[index] = pci_iomap(pdev, index, 0);
		if (!vdev->barmap[index]) {
			pci_release_selected_regions(pdev, 1 << index);
			return -ENOMEM;
		}
	}

	vma->vm_private_data = vdev;
	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
	vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);

	/*
	 * Set vm_flags now, they should not be changed in the fault handler.
	 * We want the same flags and page protection (decrypted above) as
	 * io_remap_pfn_range() would set.
	 *
	 * VM_ALLOW_ANY_UNCACHED: The VMA flag is implemented for ARM64,
	 * allowing KVM stage 2 device mapping attributes to use Normal-NC
	 * rather than DEVICE_nGnRE, which allows guest mappings
	 * supporting write-combining attributes (WC). ARM does not
	 * architecturally guarantee this is safe, and indeed some MMIO
	 * regions like the GICv2 VCPU interface can trigger uncontained
	 * faults if Normal-NC is used.
	 *
	 * To safely use VFIO in KVM the platform must guarantee full
	 * safety in the guest where no action taken against a MMIO
	 * mapping can trigger an uncontained failure. The assumption is
	 * that most VFIO PCI platforms support this for both mapping types,
	 * at least in common flows, based on some expectations of how
	 * PCI IP is integrated. Hence VM_ALLOW_ANY_UNCACHED is set in
	 * the VMA flags.
	 */
	vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED | VM_IO | VM_PFNMAP |
			VM_DONTEXPAND | VM_DONTDUMP);
	vma->vm_ops = &vfio_pci_mmap_ops;

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_pci_core_mmap);
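/*
 * Relay a device request (e.g. a pending hot unplug) to userspace via the
 * req_trigger eventfd, warning if no channel is registered.
 */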
void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count)
{
	struct vfio_pci_core_device *vdev =
		container_of(core_vdev, struct vfio_pci_core_device, vdev);
	struct pci_dev *pdev = vdev->pdev;

	mutex_lock(&vdev->igate);

	if (vdev->req_trigger) {
		if (!(count % 10))
			pci_notice_ratelimited(pdev,
				"Relaying device request to user (#%u)\n",
				count);
		eventfd_signal(vdev->req_trigger);
	} else if (count == 0) {
		pci_warn(pdev,
			"No device request channel registered, blocked until released by user\n");
	}

	mutex_unlock(&vdev->igate);
}
EXPORT_SYMBOL_GPL(vfio_pci_core_request);

static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev,
				      bool vf_token, uuid_t *uuid)
{
	/*
	 * There's always some degree of trust or collaboration between SR-IOV
	 * PF and VFs, even if just that the PF hosts the SR-IOV capability and
	 * can disrupt VFs with a reset, but often the PF has more explicit
	 * access to deny service to the VF or access data passed through the
	 * VF.  We therefore require an opt-in via a shared VF token (UUID) to
	 * represent this trust.  This both prevents that a VF driver might
	 * assume the PF driver is a trusted, in-kernel driver, and also that
	 * a PF driver might be replaced with a rogue driver, unknown to in-use
	 * VF drivers.
	 *
	 * Therefore when presented with a VF, if the PF is a vfio device and
	 * it is bound to the vfio-pci driver, the user needs to provide a VF
	 * token to access the device, in the form of appending a vf_token to
	 * the device name, for example:
	 *
	 * "0000:04:10.0 vf_token=bd8d9d2b-5a5f-4f5a-a211-f591514ba1f3"
	 *
	 * When presented with a PF which has VFs in use, the user must also
	 * provide the current VF token to prove collaboration with existing
	 * VF users.  If VFs are not in use, the VF token provided for the PF
	 * device will act to set the VF token.
	 *
	 * If the VF token is provided but unused, an error is generated.
	 */
	if (vdev->pdev->is_virtfn) {
		struct vfio_pci_core_device *pf_vdev = vdev->sriov_pf_core_dev;
		bool match;

		if (!pf_vdev) {
			if (!vf_token)
				return 0; /* PF is not vfio-pci, no VF token */

			pci_info_ratelimited(vdev->pdev,
				"VF token incorrectly provided, PF not bound to vfio-pci\n");
			return -EINVAL;
		}

		if (!vf_token) {
			pci_info_ratelimited(vdev->pdev,
				"VF token required to access device\n");
			return -EACCES;
		}

		mutex_lock(&pf_vdev->vf_token->lock);
		match = uuid_equal(uuid, &pf_vdev->vf_token->uuid);
		mutex_unlock(&pf_vdev->vf_token->lock);

		if (!match) {
			pci_info_ratelimited(vdev->pdev,
				"Incorrect VF token provided for device\n");
			return -EACCES;
		}
	} else if (vdev->vf_token) {
		mutex_lock(&vdev->vf_token->lock);
		if (vdev->vf_token->users) {
			if (!vf_token) {
				mutex_unlock(&vdev->vf_token->lock);
				pci_info_ratelimited(vdev->pdev,
					"VF token required to access device\n");
				return -EACCES;
			}

			if (!uuid_equal(uuid, &vdev->vf_token->uuid)) {
				mutex_unlock(&vdev->vf_token->lock);
				pci_info_ratelimited(vdev->pdev,
					"Incorrect VF token provided for device\n");
				return -EACCES;
			}
		} else if (vf_token) {
			uuid_copy(&vdev->vf_token->uuid, uuid);
		}

		mutex_unlock(&vdev->vf_token->lock);
	} else if (vf_token) {
		pci_info_ratelimited(vdev->pdev,
			"VF token incorrectly provided, not a PF or VF\n");
		return -EINVAL;
	}

	return 0;
}

#define VF_TOKEN_ARG "vf_token="

int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf)
{
	struct vfio_pci_core_device *vdev =
		container_of(core_vdev, struct vfio_pci_core_device, vdev);
	bool vf_token = false;
	uuid_t uuid;
	int ret;

	if (strncmp(pci_name(vdev->pdev), buf, strlen(pci_name(vdev->pdev))))
		return 0; /* No match */

	if (strlen(buf) > strlen(pci_name(vdev->pdev))) {
		buf += strlen(pci_name(vdev->pdev));

		if (*buf != ' ')
			return 0; /* No match: non-whitespace after name */

		while (*buf) {
			if (*buf == ' ') {
				buf++;
				continue;
			}

			if (!vf_token && !strncmp(buf, VF_TOKEN_ARG,
						  strlen(VF_TOKEN_ARG))) {
				buf += strlen(VF_TOKEN_ARG);

				if (strlen(buf) < UUID_STRING_LEN)
					return -EINVAL;

				ret = uuid_parse(buf, &uuid);
				if (ret)
					return ret;

				vf_token = true;
				buf += UUID_STRING_LEN;
			} else {
				/* Unknown/duplicate option */
				return -EINVAL;
			}
		}
	}

	ret = vfio_pci_validate_vf_token(vdev, vf_token, &uuid);
	if (ret)
		return ret;

	return 1; /* Match */
}
EXPORT_SYMBOL_GPL(vfio_pci_core_match);
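/*
 * PCI bus notifier used on SR-IOV PFs: sets driver_override on newly added
 * VFs so they bind to the same vfio driver, and warns if a VF ends up bound
 * to a different driver than the PF.
 */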
static int vfio_pci_bus_notifier(struct notifier_block *nb,
				 unsigned long action, void *data)
{
	struct vfio_pci_core_device *vdev = container_of(nb,
						    struct vfio_pci_core_device, nb);
	struct device *dev = data;
	struct pci_dev *pdev = to_pci_dev(dev);
	struct pci_dev *physfn = pci_physfn(pdev);

	if (action == BUS_NOTIFY_ADD_DEVICE &&
	    pdev->is_virtfn && physfn == vdev->pdev) {
		pci_info(vdev->pdev, "Captured SR-IOV VF %s driver_override\n",
			 pci_name(pdev));
		pdev->driver_override = kasprintf(GFP_KERNEL, "%s",
						  vdev->vdev.ops->name);
		WARN_ON(!pdev->driver_override);
	} else if (action == BUS_NOTIFY_BOUND_DRIVER &&
		   pdev->is_virtfn && physfn == vdev->pdev) {
		struct pci_driver *drv = pci_dev_driver(pdev);

		if (drv && drv != pci_dev_driver(vdev->pdev))
			pci_warn(vdev->pdev,
				 "VF %s bound to driver %s while PF bound to driver %s\n",
				 pci_name(pdev), drv->name,
				 pci_dev_driver(vdev->pdev)->name);
	}

	return 0;
}

static int vfio_pci_vf_init(struct vfio_pci_core_device *vdev)
{
	struct pci_dev *pdev = vdev->pdev;
	struct vfio_pci_core_device *cur;
	struct pci_dev *physfn;
	int ret;

	if (pdev->is_virtfn) {
		/*
		 * If this VF was created by our vfio_pci_core_sriov_configure()
		 * then we can find the PF vfio_pci_core_device now, and due to
		 * the locking in pci_disable_sriov() it cannot change until
		 * this VF device driver is removed.
		 */
		physfn = pci_physfn(vdev->pdev);
		mutex_lock(&vfio_pci_sriov_pfs_mutex);
		list_for_each_entry(cur, &vfio_pci_sriov_pfs, sriov_pfs_item) {
			if (cur->pdev == physfn) {
				vdev->sriov_pf_core_dev = cur;
				break;
			}
		}
		mutex_unlock(&vfio_pci_sriov_pfs_mutex);
		return 0;
	}

	/* Not a SRIOV PF */
	if (!pdev->is_physfn)
		return 0;

	vdev->vf_token = kzalloc(sizeof(*vdev->vf_token), GFP_KERNEL);
	if (!vdev->vf_token)
		return -ENOMEM;

	mutex_init(&vdev->vf_token->lock);
	uuid_gen(&vdev->vf_token->uuid);

	vdev->nb.notifier_call = vfio_pci_bus_notifier;
	ret = bus_register_notifier(&pci_bus_type, &vdev->nb);
	if (ret) {
		kfree(vdev->vf_token);
		return ret;
	}
	return 0;
}

static void vfio_pci_vf_uninit(struct vfio_pci_core_device *vdev)
{
	if (!vdev->vf_token)
		return;

	bus_unregister_notifier(&pci_bus_type, &vdev->nb);
	WARN_ON(vdev->vf_token->users);
	mutex_destroy(&vdev->vf_token->lock);
	kfree(vdev->vf_token);
}

static int vfio_pci_vga_init(struct vfio_pci_core_device *vdev)
{
	struct pci_dev *pdev = vdev->pdev;
	int ret;

	if (!vfio_pci_is_vga(pdev))
		return 0;

	ret = aperture_remove_conflicting_pci_devices(pdev, vdev->vdev.ops->name);
	if (ret)
		return ret;

	ret = vga_client_register(pdev, vfio_pci_set_decode);
	if (ret)
		return ret;
	vga_set_legacy_decoding(pdev, vfio_pci_set_decode(pdev, false));
	return 0;
}

static void vfio_pci_vga_uninit(struct vfio_pci_core_device *vdev)
{
	struct pci_dev *pdev = vdev->pdev;

	if (!vfio_pci_is_vga(pdev))
		return;
	vga_client_unregister(pdev);
	vga_set_legacy_decoding(pdev, VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
					      VGA_RSRC_LEGACY_IO |
					      VGA_RSRC_LEGACY_MEM);
}

int vfio_pci_core_init_dev(struct vfio_device *core_vdev)
{
	struct vfio_pci_core_device *vdev =
		container_of(core_vdev, struct vfio_pci_core_device, vdev);

	vdev->pdev = to_pci_dev(core_vdev->dev);
	vdev->irq_type = VFIO_PCI_NUM_IRQS;
	mutex_init(&vdev->igate);
	spin_lock_init(&vdev->irqlock);
	mutex_init(&vdev->ioeventfds_lock);
	INIT_LIST_HEAD(&vdev->dummy_resources_list);
	INIT_LIST_HEAD(&vdev->ioeventfds_list);
	INIT_LIST_HEAD(&vdev->sriov_pfs_item);
	init_rwsem(&vdev->memory_lock);
	xa_init(&vdev->ctx);

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_pci_core_init_dev);

void vfio_pci_core_release_dev(struct vfio_device *core_vdev)
{
	struct vfio_pci_core_device *vdev =
		container_of(core_vdev, struct vfio_pci_core_device, vdev);

	mutex_destroy(&vdev->igate);
	mutex_destroy(&vdev->ioeventfds_lock);
	kfree(vdev->region);
	kfree(vdev->pm_save);
}
EXPORT_SYMBOL_GPL(vfio_pci_core_release_dev);
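/*
 * Register a vfio-pci based device with vfio core: validate migration and
 * dirty-logging ops, reject PFs with VFs already enabled, pick a device set
 * based on reset scope (slot or bus), and enable runtime PM.
 */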
2124 int vfio_pci_core_register_device(struct vfio_pci_core_device
*vdev
)
2126 struct pci_dev
*pdev
= vdev
->pdev
;
2127 struct device
*dev
= &pdev
->dev
;
2130 /* Drivers must set the vfio_pci_core_device to their drvdata */
2131 if (WARN_ON(vdev
!= dev_get_drvdata(dev
)))
2134 if (pdev
->hdr_type
!= PCI_HEADER_TYPE_NORMAL
)
2137 if (vdev
->vdev
.mig_ops
) {
2138 if (!(vdev
->vdev
.mig_ops
->migration_get_state
&&
2139 vdev
->vdev
.mig_ops
->migration_set_state
&&
2140 vdev
->vdev
.mig_ops
->migration_get_data_size
) ||
2141 !(vdev
->vdev
.migration_flags
& VFIO_MIGRATION_STOP_COPY
))
2145 if (vdev
->vdev
.log_ops
&& !(vdev
->vdev
.log_ops
->log_start
&&
2146 vdev
->vdev
.log_ops
->log_stop
&&
2147 vdev
->vdev
.log_ops
->log_read_and_clear
))
2151 * Prevent binding to PFs with VFs enabled, the VFs might be in use
2152 * by the host or other users. We cannot capture the VFs if they
2153 * already exist, nor can we track VF users. Disabling SR-IOV here
2154 * would initiate removing the VFs, which would unbind the driver,
2155 * which is prone to blocking if that VF is also in use by vfio-pci.
2156 * Just reject these PFs and let the user sort it out.
2158 if (pci_num_vf(pdev
)) {
2159 pci_warn(pdev
, "Cannot bind to PF with SR-IOV enabled\n");
2163 if (pci_is_root_bus(pdev
->bus
)) {
2164 ret
= vfio_assign_device_set(&vdev
->vdev
, vdev
);
2165 } else if (!pci_probe_reset_slot(pdev
->slot
)) {
2166 ret
= vfio_assign_device_set(&vdev
->vdev
, pdev
->slot
);
2169 * If there is no slot reset support for this device, the whole
2170 * bus needs to be grouped together to support bus-wide resets.
2172 ret
= vfio_assign_device_set(&vdev
->vdev
, pdev
->bus
);
2177 ret
= vfio_pci_vf_init(vdev
);
2180 ret
= vfio_pci_vga_init(vdev
);
2184 vfio_pci_probe_power_state(vdev
);
2187 * pci-core sets the device power state to an unknown value at
2188 * bootup and after being removed from a driver. The only
2189 * transition it allows from this unknown state is to D0, which
2190 * typically happens when a driver calls pci_enable_device().
2191 * We're not ready to enable the device yet, but we do want to
2192 * be able to get to D3. Therefore first do a D0 transition
2193 * before enabling runtime PM.
2195 vfio_pci_set_power_state(vdev
, PCI_D0
);
2197 dev
->driver
->pm
= &vfio_pci_core_pm_ops
;
2198 pm_runtime_allow(dev
);
2199 if (!disable_idle_d3
)
2200 pm_runtime_put(dev
);
2202 ret
= vfio_register_group_dev(&vdev
->vdev
);
2208 if (!disable_idle_d3
)
2209 pm_runtime_get_noresume(dev
);
2211 pm_runtime_forbid(dev
);
2213 vfio_pci_vf_uninit(vdev
);
2216 EXPORT_SYMBOL_GPL(vfio_pci_core_register_device
);

void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev)
{
	vfio_pci_core_sriov_configure(vdev, 0);

	vfio_unregister_group_dev(&vdev->vdev);

	vfio_pci_vf_uninit(vdev);
	vfio_pci_vga_uninit(vdev);

	if (!disable_idle_d3)
		pm_runtime_get_noresume(&vdev->pdev->dev);

	pm_runtime_forbid(&vdev->pdev->dev);
}
EXPORT_SYMBOL_GPL(vfio_pci_core_unregister_device);
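
/*
 * Note: the expected remove() counterpart (as in the stock vfio-pci driver)
 * is to call vfio_pci_core_unregister_device() and then drop the reference
 * taken at allocation time with vfio_put_device(&vdev->vdev).
 */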

pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev,
						pci_channel_state_t state)
{
	struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev);

	mutex_lock(&vdev->igate);

	if (vdev->err_trigger)
		eventfd_signal(vdev->err_trigger);

	mutex_unlock(&vdev->igate);

	return PCI_ERS_RESULT_CAN_RECOVER;
}
EXPORT_SYMBOL_GPL(vfio_pci_core_aer_err_detected);
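
/*
 * Note: AER recovery itself is not attempted here. The handler only signals
 * the user's error eventfd (VFIO_PCI_ERR_IRQ_INDEX) so the device user, e.g.
 * a VMM, can react, and reports PCI_ERS_RESULT_CAN_RECOVER to the PCI core.
 */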

int vfio_pci_core_sriov_configure(struct vfio_pci_core_device *vdev,
				  int nr_virtfn)
{
	struct pci_dev *pdev = vdev->pdev;
	int ret = 0;

	device_lock_assert(&pdev->dev);

	if (nr_virtfn) {
		mutex_lock(&vfio_pci_sriov_pfs_mutex);
		/*
		 * The thread that adds the vdev to the list is the only thread
		 * that gets to call pci_enable_sriov() and we will only allow
		 * it to be called once without going through
		 * pci_disable_sriov()
		 */
		if (!list_empty(&vdev->sriov_pfs_item)) {
			ret = -EINVAL;
			goto out_unlock;
		}
		list_add_tail(&vdev->sriov_pfs_item, &vfio_pci_sriov_pfs);
		mutex_unlock(&vfio_pci_sriov_pfs_mutex);

		/*
		 * The PF power state should always be higher than the VF power
		 * state. The PF can be in low power state either with runtime
		 * power management (when there is no user) or PCI_PM_CTRL
		 * register write by the user. If PF is in the low power state,
		 * then change the power state to D0 first before enabling
		 * SR-IOV. Also, this function can be called at any time, and
		 * userspace PCI_PM_CTRL write can race against this code path,
		 * so protect the same with 'memory_lock'.
		 */
		ret = pm_runtime_resume_and_get(&pdev->dev);
		if (ret)
			goto out_del;

		down_write(&vdev->memory_lock);
		vfio_pci_set_power_state(vdev, PCI_D0);
		ret = pci_enable_sriov(pdev, nr_virtfn);
		up_write(&vdev->memory_lock);
		if (ret) {
			pm_runtime_put(&pdev->dev);
			goto out_del;
		}
		return nr_virtfn;
	}

	if (pci_num_vf(pdev)) {
		pci_disable_sriov(pdev);
		pm_runtime_put(&pdev->dev);
	}

out_del:
	mutex_lock(&vfio_pci_sriov_pfs_mutex);
	list_del_init(&vdev->sriov_pfs_item);
out_unlock:
	mutex_unlock(&vfio_pci_sriov_pfs_mutex);
	return ret;
}
EXPORT_SYMBOL_GPL(vfio_pci_core_sriov_configure);
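
/*
 * Note: following the pci_driver sriov_configure() convention, the function
 * above returns nr_virtfn on a successful enable, 0 after disabling VFs, and
 * a negative errno on failure. The device_lock_assert() documents that the
 * caller must hold the device lock, which the PCI core does when invoking
 * the sriov_configure callback.
 */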

const struct pci_error_handlers vfio_pci_core_err_handlers = {
	.error_detected = vfio_pci_core_aer_err_detected,
};
EXPORT_SYMBOL_GPL(vfio_pci_core_err_handlers);
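
/*
 * Illustrative sketch (assumed, following the stock vfio-pci driver): the
 * exported error handlers and SR-IOV helper are normally wired into the
 * variant driver's pci_driver. The "my_*" names are hypothetical:
 *
 *	static struct pci_driver my_pci_driver = {
 *		.name			= "my-vfio-pci",
 *		.probe			= my_probe,
 *		.remove			= my_remove,
 *		.sriov_configure	= my_sriov_configure,
 *		.err_handler		= &vfio_pci_core_err_handlers,
 *		.driver_managed_dma	= true,
 *	};
 */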

static bool vfio_dev_in_groups(struct vfio_device *vdev,
			       struct vfio_pci_group_info *groups)
{
	unsigned int i;

	if (!groups)
		return false;

	for (i = 0; i < groups->count; i++)
		if (vfio_file_has_dev(groups->files[i], vdev))
			return true;
	return false;
}

static int vfio_pci_is_device_in_set(struct pci_dev *pdev, void *data)
{
	struct vfio_device_set *dev_set = data;

	return vfio_find_device_in_devset(dev_set, &pdev->dev) ? 0 : -ENODEV;
}

/*
 * vfio-core considers a group to be viable and will create a vfio_device even
 * if some devices are bound to drivers like pci-stub or pcieport. Here we
 * require all PCI devices to be inside our dev_set since that ensures they stay
 * put and that every driver controlling the device can co-ordinate with the
 * device reset.
 *
 * Returns the pci_dev to pass to pci_reset_bus() if every PCI device to be
 * reset is inside the dev_set, and pci_reset_bus() can succeed. NULL otherwise.
 */
static struct pci_dev *
vfio_pci_dev_set_resettable(struct vfio_device_set *dev_set)
{
	struct pci_dev *pdev;

	lockdep_assert_held(&dev_set->lock);

	/*
	 * By definition all PCI devices in the dev_set share the same PCI
	 * reset, so any pci_dev will have the same outcomes for
	 * pci_probe_reset_*() and pci_reset_bus().
	 */
	pdev = list_first_entry(&dev_set->device_list,
				struct vfio_pci_core_device,
				vdev.dev_set_list)->pdev;

	/* pci_reset_bus() is supported */
	if (pci_probe_reset_slot(pdev->slot) && pci_probe_reset_bus(pdev->bus))
		return NULL;

	if (vfio_pci_for_each_slot_or_bus(pdev, vfio_pci_is_device_in_set,
					  dev_set,
					  !pci_probe_reset_slot(pdev->slot)))
		return NULL;

	return pdev;
}

static int vfio_pci_dev_set_pm_runtime_get(struct vfio_device_set *dev_set)
{
	struct vfio_pci_core_device *cur;
	int ret;

	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
		ret = pm_runtime_resume_and_get(&cur->pdev->dev);
		if (ret)
			goto unwind;
	}

	return 0;

unwind:
	list_for_each_entry_continue_reverse(cur, &dev_set->device_list,
					     vdev.dev_set_list)
		pm_runtime_put(&cur->pdev->dev);

	return ret;
}
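
/*
 * Note: the helper above is all-or-nothing. If resuming any device in the
 * set fails, the runtime PM references already taken are dropped before
 * returning the error, so callers never see a partially resumed dev_set.
 */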

static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
				      struct vfio_pci_group_info *groups,
				      struct iommufd_ctx *iommufd_ctx)
{
	struct vfio_pci_core_device *vdev;
	struct pci_dev *pdev;
	int ret;

	mutex_lock(&dev_set->lock);

	pdev = vfio_pci_dev_set_resettable(dev_set);
	if (!pdev) {
		ret = -EINVAL;
		goto err_unlock;
	}

	/*
	 * Some of the devices in the dev_set can be in the runtime suspended
	 * state. Increment the usage count for all the devices in the dev_set
	 * before reset and decrement the same after reset.
	 */
	ret = vfio_pci_dev_set_pm_runtime_get(dev_set);
	if (ret)
		goto err_unlock;

	list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) {
		bool owned;

		/*
		 * Test whether all the affected devices can be reset by the
		 * user.
		 *
		 * If called from a group opened device and the user provides
		 * a set of groups, all the devices in the dev_set should be
		 * contained by the set of groups provided by the user.
		 *
		 * If called from a cdev opened device and the user provides
		 * a zero-length array, all the devices in the dev_set must
		 * be bound to the same iommufd_ctx as the input iommufd_ctx.
		 * If there is any device that has not been bound to any
		 * iommufd_ctx yet, check if its iommu_group has any device
		 * bound to the input iommufd_ctx. Such devices can be
		 * considered owned by the input iommufd_ctx as the device
		 * cannot be owned by another iommufd_ctx when its iommu_group
		 * is owned.
		 *
		 * Otherwise, reset is not allowed.
		 */
		if (iommufd_ctx) {
			int devid = vfio_iommufd_get_dev_id(&vdev->vdev,
							    iommufd_ctx);

			owned = (devid > 0 || devid == -ENOENT);
		} else {
			owned = vfio_dev_in_groups(&vdev->vdev, groups);
		}

		if (!owned) {
			ret = -EINVAL;
			goto err_undo;
		}

		/*
		 * Take the memory write lock for each device and zap BAR
		 * mappings to prevent the user accessing the device while in
		 * reset. Locking multiple devices is prone to deadlock,
		 * runaway and unwind if we hit contention.
		 */
		if (!down_write_trylock(&vdev->memory_lock)) {
			ret = -EBUSY;
			goto err_undo;
		}

		vfio_pci_zap_bars(vdev);
	}

	if (!list_entry_is_head(vdev,
				&dev_set->device_list, vdev.dev_set_list)) {
		vdev = list_prev_entry(vdev, vdev.dev_set_list);
		goto err_undo;
	}

	/*
	 * The pci_reset_bus() will reset all the devices in the bus.
	 * The power state can be non-D0 for some of the devices in the bus.
	 * For these devices, the pci_reset_bus() will internally set
	 * the power state to D0 without vfio driver involvement.
	 * For the devices which have NoSoftRst-, the reset function can
	 * cause the PCI config space reset without restoring the original
	 * state (saved locally in 'vdev->pm_save').
	 */
	list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list)
		vfio_pci_set_power_state(vdev, PCI_D0);

	ret = pci_reset_bus(pdev);

	vdev = list_last_entry(&dev_set->device_list,
			       struct vfio_pci_core_device, vdev.dev_set_list);

err_undo:
	list_for_each_entry_from_reverse(vdev, &dev_set->device_list,
					 vdev.dev_set_list)
		up_write(&vdev->memory_lock);

	list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list)
		pm_runtime_put(&vdev->pdev->dev);

err_unlock:
	mutex_unlock(&dev_set->lock);
	return ret;
}
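
/*
 * Note: this is the dev_set-wide implementation behind the
 * VFIO_DEVICE_PCI_HOT_RESET ioctl. Ownership of every affected device is
 * proven either through the group files supplied by the user or, for cdev
 * users, through binding to the same iommufd_ctx, before the bus/slot reset
 * is attempted.
 */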

static bool vfio_pci_dev_set_needs_reset(struct vfio_device_set *dev_set)
{
	struct vfio_pci_core_device *cur;
	bool needs_reset = false;

	/* No other VFIO device in the set can be open. */
	if (vfio_device_set_open_count(dev_set) > 1)
		return false;

	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list)
		needs_reset |= cur->needs_reset;
	return needs_reset;
}

/*
 * If a bus or slot reset is available for the provided dev_set and:
 *  - All of the devices affected by that bus or slot reset are unused
 *  - At least one of the affected devices is marked dirty via
 *    needs_reset (such as by lack of FLR support)
 * Then attempt to perform that bus or slot reset.
 */
static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set)
{
	struct vfio_pci_core_device *cur;
	struct pci_dev *pdev;
	bool reset_done = false;

	if (!vfio_pci_dev_set_needs_reset(dev_set))
		return;

	pdev = vfio_pci_dev_set_resettable(dev_set);
	if (!pdev)
		return;

	/*
	 * Some of the devices in the bus can be in the runtime suspended
	 * state. Increment the usage count for all the devices in the dev_set
	 * before reset and decrement the same after reset.
	 */
	if (!disable_idle_d3 && vfio_pci_dev_set_pm_runtime_get(dev_set))
		return;

	if (!pci_reset_bus(pdev))
		reset_done = true;

	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
		if (reset_done)
			cur->needs_reset = false;

		if (!disable_idle_d3)
			pm_runtime_put(&cur->pdev->dev);
	}
}
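
/*
 * Note: unlike the hot reset path above, this is an opportunistic reset.
 * It is only attempted when no other device in the set remains open and at
 * least one device is flagged as needing a reset, typically on the device
 * release path; failure is silent and simply leaves needs_reset set.
 */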

void vfio_pci_core_set_params(bool is_nointxmask, bool is_disable_vga,
			      bool is_disable_idle_d3)
{
	nointxmask = is_nointxmask;
	disable_vga = is_disable_vga;
	disable_idle_d3 = is_disable_idle_d3;
}
EXPORT_SYMBOL_GPL(vfio_pci_core_set_params);
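
/*
 * Illustrative sketch (assumed): the stock vfio-pci module forwards its own
 * module parameters through this hook during its init, roughly:
 *
 *	// in vfio_pci.c, not this file
 *	vfio_pci_core_set_params(nointxmask, disable_vga, disable_idle_d3);
 */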

static void vfio_pci_core_cleanup(void)
{
	vfio_pci_uninit_perm_bits();
}

static int __init vfio_pci_core_init(void)
{
	/* Allocate shared config space permission data used by all devices */
	return vfio_pci_init_perm_bits();
}

module_init(vfio_pci_core_init);
module_exit(vfio_pci_core_cleanup);

MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);