/*
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

14 #include <linux/device.h>
15 #include <linux/eventfd.h>
16 #include <linux/interrupt.h>
17 #include <linux/iommu.h>
18 #include <linux/module.h>
19 #include <linux/mutex.h>
20 #include <linux/notifier.h>
21 #include <linux/pci.h>
22 #include <linux/pm_runtime.h>
23 #include <linux/slab.h>
24 #include <linux/types.h>
25 #include <linux/uaccess.h>
26 #include <linux/vfio.h>
28 #include "vfio_pci_private.h"
30 #define DRIVER_VERSION "0.2"
31 #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
32 #define DRIVER_DESC "VFIO PCI - User Level meta-driver"
34 static bool nointxmask
;
35 module_param_named(nointxmask
, nointxmask
, bool, S_IRUGO
| S_IWUSR
);
36 MODULE_PARM_DESC(nointxmask
,
37 "Disable support for PCI 2.3 style INTx masking. If this resolves problems for specific devices, report lspci -vvvxxx to linux-pci@vger.kernel.org so the device can be fixed automatically via the broken_intx_masking flag.");
39 static int vfio_pci_enable(struct vfio_pci_device
*vdev
)
41 struct pci_dev
*pdev
= vdev
->pdev
;
46 ret
= pci_enable_device(pdev
);
50 vdev
->reset_works
= (pci_reset_function(pdev
) == 0);
52 vdev
->pci_saved_state
= pci_store_saved_state(pdev
);
53 if (!vdev
->pci_saved_state
)
54 pr_debug("%s: Couldn't store %s saved state\n",
55 __func__
, dev_name(&pdev
->dev
));
57 ret
= vfio_config_init(vdev
);
59 pci_load_and_free_saved_state(pdev
, &vdev
->pci_saved_state
);
60 pci_disable_device(pdev
);
64 if (likely(!nointxmask
))
65 vdev
->pci_2_3
= pci_intx_mask_supported(pdev
);
67 pci_read_config_word(pdev
, PCI_COMMAND
, &cmd
);
68 if (vdev
->pci_2_3
&& (cmd
& PCI_COMMAND_INTX_DISABLE
)) {
69 cmd
&= ~PCI_COMMAND_INTX_DISABLE
;
70 pci_write_config_word(pdev
, PCI_COMMAND
, cmd
);
73 msix_pos
= pci_find_capability(pdev
, PCI_CAP_ID_MSIX
);
78 pci_read_config_word(pdev
, msix_pos
+ PCI_MSIX_FLAGS
, &flags
);
79 pci_read_config_dword(pdev
, msix_pos
+ PCI_MSIX_TABLE
, &table
);
81 vdev
->msix_bar
= table
& PCI_MSIX_FLAGS_BIRMASK
;
82 vdev
->msix_offset
= table
& ~PCI_MSIX_FLAGS_BIRMASK
;
83 vdev
->msix_size
= ((flags
& PCI_MSIX_FLAGS_QSIZE
) + 1) * 16;
85 vdev
->msix_bar
= 0xFF;
87 #ifdef CONFIG_VFIO_PCI_VGA
88 if ((pdev
->class >> 8) == PCI_CLASS_DISPLAY_VGA
)
95 static void vfio_pci_disable(struct vfio_pci_device
*vdev
)
97 struct pci_dev
*pdev
= vdev
->pdev
;
100 pci_disable_device(pdev
);
102 vfio_pci_set_irqs_ioctl(vdev
, VFIO_IRQ_SET_DATA_NONE
|
103 VFIO_IRQ_SET_ACTION_TRIGGER
,
104 vdev
->irq_type
, 0, 0, NULL
);
106 vdev
->virq_disabled
= false;
108 vfio_config_free(vdev
);
110 for (bar
= PCI_STD_RESOURCES
; bar
<= PCI_STD_RESOURCE_END
; bar
++) {
111 if (!vdev
->barmap
[bar
])
113 pci_iounmap(pdev
, vdev
->barmap
[bar
]);
114 pci_release_selected_regions(pdev
, 1 << bar
);
115 vdev
->barmap
[bar
] = NULL
;
119 * If we have saved state, restore it. If we can reset the device,
120 * even better. Resetting with current state seems better than
121 * nothing, but saving and restoring current state without reset
124 if (pci_load_and_free_saved_state(pdev
, &vdev
->pci_saved_state
)) {
125 pr_info("%s: Couldn't reload %s saved state\n",
126 __func__
, dev_name(&pdev
->dev
));
128 if (!vdev
->reset_works
)
131 pci_save_state(pdev
);
135 * Disable INTx and MSI, presumably to avoid spurious interrupts
136 * during reset. Stolen from pci_reset_function()
138 pci_write_config_word(pdev
, PCI_COMMAND
, PCI_COMMAND_INTX_DISABLE
);
140 if (vdev
->reset_works
)
141 __pci_reset_function(pdev
);
143 pci_restore_state(pdev
);
146 static void vfio_pci_release(void *device_data
)
148 struct vfio_pci_device
*vdev
= device_data
;
150 if (atomic_dec_and_test(&vdev
->refcnt
))
151 vfio_pci_disable(vdev
);
153 module_put(THIS_MODULE
);
156 static int vfio_pci_open(void *device_data
)
158 struct vfio_pci_device
*vdev
= device_data
;
160 if (!try_module_get(THIS_MODULE
))
163 if (atomic_inc_return(&vdev
->refcnt
) == 1) {
164 int ret
= vfio_pci_enable(vdev
);
166 module_put(THIS_MODULE
);
174 static int vfio_pci_get_irq_count(struct vfio_pci_device
*vdev
, int irq_type
)
176 if (irq_type
== VFIO_PCI_INTX_IRQ_INDEX
) {
178 pci_read_config_byte(vdev
->pdev
, PCI_INTERRUPT_PIN
, &pin
);
182 } else if (irq_type
== VFIO_PCI_MSI_IRQ_INDEX
) {
186 pos
= pci_find_capability(vdev
->pdev
, PCI_CAP_ID_MSI
);
188 pci_read_config_word(vdev
->pdev
,
189 pos
+ PCI_MSI_FLAGS
, &flags
);
191 return 1 << (flags
& PCI_MSI_FLAGS_QMASK
);
193 } else if (irq_type
== VFIO_PCI_MSIX_IRQ_INDEX
) {
197 pos
= pci_find_capability(vdev
->pdev
, PCI_CAP_ID_MSIX
);
199 pci_read_config_word(vdev
->pdev
,
200 pos
+ PCI_MSIX_FLAGS
, &flags
);
202 return (flags
& PCI_MSIX_FLAGS_QSIZE
) + 1;
209 static long vfio_pci_ioctl(void *device_data
,
210 unsigned int cmd
, unsigned long arg
)
212 struct vfio_pci_device
*vdev
= device_data
;
215 if (cmd
== VFIO_DEVICE_GET_INFO
) {
216 struct vfio_device_info info
;
218 minsz
= offsetofend(struct vfio_device_info
, num_irqs
);
220 if (copy_from_user(&info
, (void __user
*)arg
, minsz
))
223 if (info
.argsz
< minsz
)
226 info
.flags
= VFIO_DEVICE_FLAGS_PCI
;
228 if (vdev
->reset_works
)
229 info
.flags
|= VFIO_DEVICE_FLAGS_RESET
;
231 info
.num_regions
= VFIO_PCI_NUM_REGIONS
;
232 info
.num_irqs
= VFIO_PCI_NUM_IRQS
;
234 return copy_to_user((void __user
*)arg
, &info
, minsz
);
236 } else if (cmd
== VFIO_DEVICE_GET_REGION_INFO
) {
237 struct pci_dev
*pdev
= vdev
->pdev
;
238 struct vfio_region_info info
;
240 minsz
= offsetofend(struct vfio_region_info
, offset
);
242 if (copy_from_user(&info
, (void __user
*)arg
, minsz
))
245 if (info
.argsz
< minsz
)
248 switch (info
.index
) {
249 case VFIO_PCI_CONFIG_REGION_INDEX
:
250 info
.offset
= VFIO_PCI_INDEX_TO_OFFSET(info
.index
);
251 info
.size
= pdev
->cfg_size
;
252 info
.flags
= VFIO_REGION_INFO_FLAG_READ
|
253 VFIO_REGION_INFO_FLAG_WRITE
;
255 case VFIO_PCI_BAR0_REGION_INDEX
... VFIO_PCI_BAR5_REGION_INDEX
:
256 info
.offset
= VFIO_PCI_INDEX_TO_OFFSET(info
.index
);
257 info
.size
= pci_resource_len(pdev
, info
.index
);
263 info
.flags
= VFIO_REGION_INFO_FLAG_READ
|
264 VFIO_REGION_INFO_FLAG_WRITE
;
265 if (pci_resource_flags(pdev
, info
.index
) &
266 IORESOURCE_MEM
&& info
.size
>= PAGE_SIZE
)
267 info
.flags
|= VFIO_REGION_INFO_FLAG_MMAP
;
269 case VFIO_PCI_ROM_REGION_INDEX
:
274 info
.offset
= VFIO_PCI_INDEX_TO_OFFSET(info
.index
);
277 /* Report the BAR size, not the ROM size */
278 info
.size
= pci_resource_len(pdev
, info
.index
);
282 /* Is it really there? */
283 io
= pci_map_rom(pdev
, &size
);
288 pci_unmap_rom(pdev
, io
);
290 info
.flags
= VFIO_REGION_INFO_FLAG_READ
;
293 case VFIO_PCI_VGA_REGION_INDEX
:
297 info
.offset
= VFIO_PCI_INDEX_TO_OFFSET(info
.index
);
299 info
.flags
= VFIO_REGION_INFO_FLAG_READ
|
300 VFIO_REGION_INFO_FLAG_WRITE
;
307 return copy_to_user((void __user
*)arg
, &info
, minsz
);
309 } else if (cmd
== VFIO_DEVICE_GET_IRQ_INFO
) {
310 struct vfio_irq_info info
;
312 minsz
= offsetofend(struct vfio_irq_info
, count
);
314 if (copy_from_user(&info
, (void __user
*)arg
, minsz
))
317 if (info
.argsz
< minsz
|| info
.index
>= VFIO_PCI_NUM_IRQS
)
320 info
.flags
= VFIO_IRQ_INFO_EVENTFD
;
322 info
.count
= vfio_pci_get_irq_count(vdev
, info
.index
);
324 if (info
.index
== VFIO_PCI_INTX_IRQ_INDEX
)
325 info
.flags
|= (VFIO_IRQ_INFO_MASKABLE
|
326 VFIO_IRQ_INFO_AUTOMASKED
);
328 info
.flags
|= VFIO_IRQ_INFO_NORESIZE
;
330 return copy_to_user((void __user
*)arg
, &info
, minsz
);
332 } else if (cmd
== VFIO_DEVICE_SET_IRQS
) {
333 struct vfio_irq_set hdr
;
337 minsz
= offsetofend(struct vfio_irq_set
, count
);
339 if (copy_from_user(&hdr
, (void __user
*)arg
, minsz
))
342 if (hdr
.argsz
< minsz
|| hdr
.index
>= VFIO_PCI_NUM_IRQS
||
343 hdr
.flags
& ~(VFIO_IRQ_SET_DATA_TYPE_MASK
|
344 VFIO_IRQ_SET_ACTION_TYPE_MASK
))
347 if (!(hdr
.flags
& VFIO_IRQ_SET_DATA_NONE
)) {
349 int max
= vfio_pci_get_irq_count(vdev
, hdr
.index
);
351 if (hdr
.flags
& VFIO_IRQ_SET_DATA_BOOL
)
352 size
= sizeof(uint8_t);
353 else if (hdr
.flags
& VFIO_IRQ_SET_DATA_EVENTFD
)
354 size
= sizeof(int32_t);
358 if (hdr
.argsz
- minsz
< hdr
.count
* size
||
359 hdr
.start
>= max
|| hdr
.start
+ hdr
.count
> max
)
362 data
= memdup_user((void __user
*)(arg
+ minsz
),
365 return PTR_ERR(data
);
368 mutex_lock(&vdev
->igate
);
370 ret
= vfio_pci_set_irqs_ioctl(vdev
, hdr
.flags
, hdr
.index
,
371 hdr
.start
, hdr
.count
, data
);
373 mutex_unlock(&vdev
->igate
);
378 } else if (cmd
== VFIO_DEVICE_RESET
)
379 return vdev
->reset_works
?
380 pci_reset_function(vdev
->pdev
) : -EINVAL
;
385 static ssize_t
vfio_pci_rw(void *device_data
, char __user
*buf
,
386 size_t count
, loff_t
*ppos
, bool iswrite
)
388 unsigned int index
= VFIO_PCI_OFFSET_TO_INDEX(*ppos
);
389 struct vfio_pci_device
*vdev
= device_data
;
391 if (index
>= VFIO_PCI_NUM_REGIONS
)
395 case VFIO_PCI_CONFIG_REGION_INDEX
:
396 return vfio_pci_config_rw(vdev
, buf
, count
, ppos
, iswrite
);
398 case VFIO_PCI_ROM_REGION_INDEX
:
401 return vfio_pci_bar_rw(vdev
, buf
, count
, ppos
, false);
403 case VFIO_PCI_BAR0_REGION_INDEX
... VFIO_PCI_BAR5_REGION_INDEX
:
404 return vfio_pci_bar_rw(vdev
, buf
, count
, ppos
, iswrite
);
406 case VFIO_PCI_VGA_REGION_INDEX
:
407 return vfio_pci_vga_rw(vdev
, buf
, count
, ppos
, iswrite
);
413 static ssize_t
vfio_pci_read(void *device_data
, char __user
*buf
,
414 size_t count
, loff_t
*ppos
)
419 return vfio_pci_rw(device_data
, buf
, count
, ppos
, false);
422 static ssize_t
vfio_pci_write(void *device_data
, const char __user
*buf
,
423 size_t count
, loff_t
*ppos
)
428 return vfio_pci_rw(device_data
, (char __user
*)buf
, count
, ppos
, true);
431 static int vfio_pci_mmap(void *device_data
, struct vm_area_struct
*vma
)
433 struct vfio_pci_device
*vdev
= device_data
;
434 struct pci_dev
*pdev
= vdev
->pdev
;
436 u64 phys_len
, req_len
, pgoff
, req_start
;
439 index
= vma
->vm_pgoff
>> (VFIO_PCI_OFFSET_SHIFT
- PAGE_SHIFT
);
441 if (vma
->vm_end
< vma
->vm_start
)
443 if ((vma
->vm_flags
& VM_SHARED
) == 0)
445 if (index
>= VFIO_PCI_ROM_REGION_INDEX
)
447 if (!(pci_resource_flags(pdev
, index
) & IORESOURCE_MEM
))
450 phys_len
= pci_resource_len(pdev
, index
);
451 req_len
= vma
->vm_end
- vma
->vm_start
;
452 pgoff
= vma
->vm_pgoff
&
453 ((1U << (VFIO_PCI_OFFSET_SHIFT
- PAGE_SHIFT
)) - 1);
454 req_start
= pgoff
<< PAGE_SHIFT
;
456 if (phys_len
< PAGE_SIZE
|| req_start
+ req_len
> phys_len
)
459 if (index
== vdev
->msix_bar
) {
461 * Disallow mmaps overlapping the MSI-X table; users don't
462 * get to touch this directly. We could find somewhere
463 * else to map the overlap, but page granularity is only
464 * a recommendation, not a requirement, so the user needs
465 * to know which bits are real. Requiring them to mmap
466 * around the table makes that clear.
469 /* If neither entirely above nor below, then it overlaps */
470 if (!(req_start
>= vdev
->msix_offset
+ vdev
->msix_size
||
471 req_start
+ req_len
<= vdev
->msix_offset
))
476 * Even though we don't make use of the barmap for the mmap,
477 * we need to request the region and the barmap tracks that.
479 if (!vdev
->barmap
[index
]) {
480 ret
= pci_request_selected_regions(pdev
,
481 1 << index
, "vfio-pci");
485 vdev
->barmap
[index
] = pci_iomap(pdev
, index
, 0);
488 vma
->vm_private_data
= vdev
;
489 vma
->vm_flags
|= VM_IO
| VM_DONTEXPAND
| VM_DONTDUMP
;
490 vma
->vm_page_prot
= pgprot_noncached(vma
->vm_page_prot
);
491 vma
->vm_pgoff
= (pci_resource_start(pdev
, index
) >> PAGE_SHIFT
) + pgoff
;
493 return remap_pfn_range(vma
, vma
->vm_start
, vma
->vm_pgoff
,
494 req_len
, vma
->vm_page_prot
);
497 static const struct vfio_device_ops vfio_pci_ops
= {
499 .open
= vfio_pci_open
,
500 .release
= vfio_pci_release
,
501 .ioctl
= vfio_pci_ioctl
,
502 .read
= vfio_pci_read
,
503 .write
= vfio_pci_write
,
504 .mmap
= vfio_pci_mmap
,
507 static int vfio_pci_probe(struct pci_dev
*pdev
, const struct pci_device_id
*id
)
510 struct vfio_pci_device
*vdev
;
511 struct iommu_group
*group
;
514 pci_read_config_byte(pdev
, PCI_HEADER_TYPE
, &type
);
515 if ((type
& PCI_HEADER_TYPE
) != PCI_HEADER_TYPE_NORMAL
)
518 group
= iommu_group_get(&pdev
->dev
);
522 vdev
= kzalloc(sizeof(*vdev
), GFP_KERNEL
);
524 iommu_group_put(group
);
529 vdev
->irq_type
= VFIO_PCI_NUM_IRQS
;
530 mutex_init(&vdev
->igate
);
531 spin_lock_init(&vdev
->irqlock
);
532 atomic_set(&vdev
->refcnt
, 0);
534 ret
= vfio_add_group_dev(&pdev
->dev
, &vfio_pci_ops
, vdev
);
536 iommu_group_put(group
);
543 static void vfio_pci_remove(struct pci_dev
*pdev
)
545 struct vfio_pci_device
*vdev
;
547 vdev
= vfio_del_group_dev(&pdev
->dev
);
551 iommu_group_put(pdev
->dev
.iommu_group
);
555 static struct pci_driver vfio_pci_driver
= {
557 .id_table
= NULL
, /* only dynamic ids */
558 .probe
= vfio_pci_probe
,
559 .remove
= vfio_pci_remove
,
562 static void __exit
vfio_pci_cleanup(void)
564 pci_unregister_driver(&vfio_pci_driver
);
565 vfio_pci_virqfd_exit();
566 vfio_pci_uninit_perm_bits();
569 static int __init
vfio_pci_init(void)
573 /* Allocate shared config space permision data used by all devices */
574 ret
= vfio_pci_init_perm_bits();
578 /* Start the virqfd cleanup handler */
579 ret
= vfio_pci_virqfd_init();
583 /* Register and scan for devices */
584 ret
= pci_register_driver(&vfio_pci_driver
);
591 vfio_pci_virqfd_exit();
593 vfio_pci_uninit_perm_bits();
597 module_init(vfio_pci_init
);
598 module_exit(vfio_pci_cleanup
);
600 MODULE_VERSION(DRIVER_VERSION
);
601 MODULE_LICENSE("GPL v2");
602 MODULE_AUTHOR(DRIVER_AUTHOR
);
603 MODULE_DESCRIPTION(DRIVER_DESC
);