1 // SPDX-License-Identifier: GPL-2.0
3 * Volume Management Device driver
4 * Copyright (c) 2015, Intel Corporation.
7 #include <linux/device.h>
8 #include <linux/interrupt.h>
10 #include <linux/kernel.h>
11 #include <linux/module.h>
12 #include <linux/msi.h>
13 #include <linux/pci.h>
14 #include <linux/srcu.h>
15 #include <linux/rculist.h>
16 #include <linux/rcupdate.h>
18 #include <asm/irqdomain.h>
19 #include <asm/device.h>
21 #include <asm/msidef.h>
/*
 * VMD PCI config-space register offsets and the helpers that decode fields
 * from the values read out of them.  Macro arguments are parenthesized so
 * that expression arguments (e.g. "a | b") are decoded as a whole.
 */
#define PCI_REG_VMCAP		0x40
#define BUS_RESTRICT_CAP(vmcap)	((vmcap) & 0x1)
#define PCI_REG_VMCONFIG	0x44
#define BUS_RESTRICT_CFG(vmcfg)	(((vmcfg) >> 8) & 0x3)
#define PCI_REG_VMLOCK		0x70
#define MB2_SHADOW_EN(vmlock)	((vmlock) & 0x2)

/* Offset and size of the shadow registers inside MEMBAR2. */
#define MB2_SHADOW_OFFSET	0x2000
#define MB2_SHADOW_SIZE		16
39 * Device may contain registers which hint the physical location of the
40 * membars, in order to allow proper address translation during
41 * resource assignment to enable guest virtualization
43 VMD_FEAT_HAS_MEMBAR_SHADOW
= (1 << 0),
46 * Device may provide root port configuration information which limits
49 VMD_FEAT_HAS_BUS_RESTRICTIONS
= (1 << 1),
/*
 * Lock for manipulating VMD IRQ lists.  Taken around every add/remove on a
 * vector's irq_list and around updates of a vector's child-IRQ count.
 */
static DEFINE_RAW_SPINLOCK(list_lock);
58 * struct vmd_irq - private data to map driver IRQ to the VMD shared vector
59 * @node: list item for parent traversal.
60 * @irq: back pointer to parent.
61 * @enabled: true if driver enabled IRQ
62 * @virq: the virtual IRQ value provided to the requesting driver.
64 * Every MSI/MSI-X IRQ requested for a device in a VMD domain will be mapped to
65 * a VMD IRQ using this structure.
68 struct list_head node
;
69 struct vmd_irq_list
*irq
;
75 * struct vmd_irq_list - list of driver requested IRQs mapping to a VMD vector
76 * @irq_list: the list of IRQs this VMD vector demuxes to.
77 * @srcu: SRCU struct for local synchronization.
78 * @count: number of child IRQs assigned to this vector; used to track
82 struct list_head irq_list
;
83 struct srcu_struct srcu
;
94 struct vmd_irq_list
*irqs
;
96 struct pci_sysdata sysdata
;
97 struct resource resources
[3];
98 struct irq_domain
*irq_domain
;
103 static inline struct vmd_dev
*vmd_from_bus(struct pci_bus
*bus
)
105 return container_of(bus
->sysdata
, struct vmd_dev
, sysdata
);
108 static inline unsigned int index_from_irqs(struct vmd_dev
*vmd
,
109 struct vmd_irq_list
*irqs
)
111 return irqs
- vmd
->irqs
;
115 * Drivers managing a device in a VMD domain allocate their own IRQs as before,
116 * but the MSI entry for the hardware it's driving will be programmed with a
117 * destination ID for the VMD MSI-X table. The VMD muxes interrupts in its
118 * domain into one of its own, and the VMD driver de-muxes these for the
119 * handlers sharing that VMD IRQ. The vmd irq_domain provides the operations
120 * and irq_chip to set this up.
122 static void vmd_compose_msi_msg(struct irq_data
*data
, struct msi_msg
*msg
)
124 struct vmd_irq
*vmdirq
= data
->chip_data
;
125 struct vmd_irq_list
*irq
= vmdirq
->irq
;
126 struct vmd_dev
*vmd
= irq_data_get_irq_handler_data(data
);
128 msg
->address_hi
= MSI_ADDR_BASE_HI
;
129 msg
->address_lo
= MSI_ADDR_BASE_LO
|
130 MSI_ADDR_DEST_ID(index_from_irqs(vmd
, irq
));
135 * We rely on MSI_FLAG_USE_DEF_CHIP_OPS to set the IRQ mask/unmask ops.
137 static void vmd_irq_enable(struct irq_data
*data
)
139 struct vmd_irq
*vmdirq
= data
->chip_data
;
142 raw_spin_lock_irqsave(&list_lock
, flags
);
143 WARN_ON(vmdirq
->enabled
);
144 list_add_tail_rcu(&vmdirq
->node
, &vmdirq
->irq
->irq_list
);
145 vmdirq
->enabled
= true;
146 raw_spin_unlock_irqrestore(&list_lock
, flags
);
148 data
->chip
->irq_unmask(data
);
151 static void vmd_irq_disable(struct irq_data
*data
)
153 struct vmd_irq
*vmdirq
= data
->chip_data
;
156 data
->chip
->irq_mask(data
);
158 raw_spin_lock_irqsave(&list_lock
, flags
);
159 if (vmdirq
->enabled
) {
160 list_del_rcu(&vmdirq
->node
);
161 vmdirq
->enabled
= false;
163 raw_spin_unlock_irqrestore(&list_lock
, flags
);
167 * XXX: Stubbed until we develop an acceptable way to avoid conflicts with
168 * other devices sharing the same vector.
170 static int vmd_irq_set_affinity(struct irq_data
*data
,
171 const struct cpumask
*dest
, bool force
)
176 static struct irq_chip vmd_msi_controller
= {
178 .irq_enable
= vmd_irq_enable
,
179 .irq_disable
= vmd_irq_disable
,
180 .irq_compose_msi_msg
= vmd_compose_msi_msg
,
181 .irq_set_affinity
= vmd_irq_set_affinity
,
184 static irq_hw_number_t
vmd_get_hwirq(struct msi_domain_info
*info
,
185 msi_alloc_info_t
*arg
)
191 * XXX: We can be even smarter selecting the best IRQ once we solve the
194 static struct vmd_irq_list
*vmd_next_irq(struct vmd_dev
*vmd
, struct msi_desc
*desc
)
199 if (vmd
->msix_count
== 1)
200 return &vmd
->irqs
[0];
203 * White list for fast-interrupt handlers. All others will share the
204 * "slow" interrupt vector.
206 switch (msi_desc_to_pci_dev(desc
)->class) {
207 case PCI_CLASS_STORAGE_EXPRESS
:
210 return &vmd
->irqs
[0];
213 raw_spin_lock_irqsave(&list_lock
, flags
);
214 for (i
= 1; i
< vmd
->msix_count
; i
++)
215 if (vmd
->irqs
[i
].count
< vmd
->irqs
[best
].count
)
217 vmd
->irqs
[best
].count
++;
218 raw_spin_unlock_irqrestore(&list_lock
, flags
);
220 return &vmd
->irqs
[best
];
223 static int vmd_msi_init(struct irq_domain
*domain
, struct msi_domain_info
*info
,
224 unsigned int virq
, irq_hw_number_t hwirq
,
225 msi_alloc_info_t
*arg
)
227 struct msi_desc
*desc
= arg
->desc
;
228 struct vmd_dev
*vmd
= vmd_from_bus(msi_desc_to_pci_dev(desc
)->bus
);
229 struct vmd_irq
*vmdirq
= kzalloc(sizeof(*vmdirq
), GFP_KERNEL
);
230 unsigned int index
, vector
;
235 INIT_LIST_HEAD(&vmdirq
->node
);
236 vmdirq
->irq
= vmd_next_irq(vmd
, desc
);
238 index
= index_from_irqs(vmd
, vmdirq
->irq
);
239 vector
= pci_irq_vector(vmd
->dev
, index
);
241 irq_domain_set_info(domain
, virq
, vector
, info
->chip
, vmdirq
,
242 handle_untracked_irq
, vmd
, NULL
);
246 static void vmd_msi_free(struct irq_domain
*domain
,
247 struct msi_domain_info
*info
, unsigned int virq
)
249 struct vmd_irq
*vmdirq
= irq_get_chip_data(virq
);
252 synchronize_srcu(&vmdirq
->irq
->srcu
);
254 /* XXX: Potential optimization to rebalance */
255 raw_spin_lock_irqsave(&list_lock
, flags
);
256 vmdirq
->irq
->count
--;
257 raw_spin_unlock_irqrestore(&list_lock
, flags
);
262 static int vmd_msi_prepare(struct irq_domain
*domain
, struct device
*dev
,
263 int nvec
, msi_alloc_info_t
*arg
)
265 struct pci_dev
*pdev
= to_pci_dev(dev
);
266 struct vmd_dev
*vmd
= vmd_from_bus(pdev
->bus
);
268 if (nvec
> vmd
->msix_count
)
269 return vmd
->msix_count
;
271 memset(arg
, 0, sizeof(*arg
));
275 static void vmd_set_desc(msi_alloc_info_t
*arg
, struct msi_desc
*desc
)
280 static struct msi_domain_ops vmd_msi_domain_ops
= {
281 .get_hwirq
= vmd_get_hwirq
,
282 .msi_init
= vmd_msi_init
,
283 .msi_free
= vmd_msi_free
,
284 .msi_prepare
= vmd_msi_prepare
,
285 .set_desc
= vmd_set_desc
,
288 static struct msi_domain_info vmd_msi_domain_info
= {
289 .flags
= MSI_FLAG_USE_DEF_DOM_OPS
| MSI_FLAG_USE_DEF_CHIP_OPS
|
291 .ops
= &vmd_msi_domain_ops
,
292 .chip
= &vmd_msi_controller
,
295 static char __iomem
*vmd_cfg_addr(struct vmd_dev
*vmd
, struct pci_bus
*bus
,
296 unsigned int devfn
, int reg
, int len
)
298 char __iomem
*addr
= vmd
->cfgbar
+
299 ((bus
->number
- vmd
->busn_start
) << 20) +
302 if ((addr
- vmd
->cfgbar
) + len
>=
303 resource_size(&vmd
->dev
->resource
[VMD_CFGBAR
]))
310 * CPU may deadlock if config space is not serialized on some versions of this
311 * hardware, so all config space access is done under a spinlock.
313 static int vmd_pci_read(struct pci_bus
*bus
, unsigned int devfn
, int reg
,
316 struct vmd_dev
*vmd
= vmd_from_bus(bus
);
317 char __iomem
*addr
= vmd_cfg_addr(vmd
, bus
, devfn
, reg
, len
);
324 spin_lock_irqsave(&vmd
->cfg_lock
, flags
);
327 *value
= readb(addr
);
330 *value
= readw(addr
);
333 *value
= readl(addr
);
339 spin_unlock_irqrestore(&vmd
->cfg_lock
, flags
);
344 * VMD h/w converts non-posted config writes to posted memory writes. The
345 * read-back in this function forces the completion so it returns only after
346 * the config space was written, as expected.
348 static int vmd_pci_write(struct pci_bus
*bus
, unsigned int devfn
, int reg
,
351 struct vmd_dev
*vmd
= vmd_from_bus(bus
);
352 char __iomem
*addr
= vmd_cfg_addr(vmd
, bus
, devfn
, reg
, len
);
359 spin_lock_irqsave(&vmd
->cfg_lock
, flags
);
377 spin_unlock_irqrestore(&vmd
->cfg_lock
, flags
);
381 static struct pci_ops vmd_ops
= {
382 .read
= vmd_pci_read
,
383 .write
= vmd_pci_write
,
386 static void vmd_attach_resources(struct vmd_dev
*vmd
)
388 vmd
->dev
->resource
[VMD_MEMBAR1
].child
= &vmd
->resources
[1];
389 vmd
->dev
->resource
[VMD_MEMBAR2
].child
= &vmd
->resources
[2];
392 static void vmd_detach_resources(struct vmd_dev
*vmd
)
394 vmd
->dev
->resource
[VMD_MEMBAR1
].child
= NULL
;
395 vmd
->dev
->resource
[VMD_MEMBAR2
].child
= NULL
;
399 * VMD domains start at 0x10000 to not clash with ACPI _SEG domains.
400 * Per ACPI r6.0, sec 6.5.6, _SEG returns an integer, of which the lower
401 * 16 bits are the PCI Segment Group (domain) number. Other bits are
402 * currently reserved.
404 static int vmd_find_free_domain(void)
407 struct pci_bus
*bus
= NULL
;
409 while ((bus
= pci_find_next_bus(bus
)) != NULL
)
410 domain
= max_t(int, domain
, pci_domain_nr(bus
));
414 static int vmd_enable_domain(struct vmd_dev
*vmd
, unsigned long features
)
416 struct pci_sysdata
*sd
= &vmd
->sysdata
;
417 struct fwnode_handle
*fn
;
418 struct resource
*res
;
421 LIST_HEAD(resources
);
422 resource_size_t offset
[2] = {0};
423 resource_size_t membar2_offset
= 0x2000;
424 struct pci_bus
*child
;
427 * Shadow registers may exist in certain VMD device ids which allow
428 * guests to correctly assign host physical addresses to the root ports
429 * and child devices. These registers will either return the host value
430 * or 0, depending on an enable bit in the VMD device.
432 if (features
& VMD_FEAT_HAS_MEMBAR_SHADOW
) {
436 membar2_offset
= MB2_SHADOW_OFFSET
+ MB2_SHADOW_SIZE
;
437 ret
= pci_read_config_dword(vmd
->dev
, PCI_REG_VMLOCK
, &vmlock
);
438 if (ret
|| vmlock
== ~0)
441 if (MB2_SHADOW_EN(vmlock
)) {
442 void __iomem
*membar2
;
444 membar2
= pci_iomap(vmd
->dev
, VMD_MEMBAR2
, 0);
447 offset
[0] = vmd
->dev
->resource
[VMD_MEMBAR1
].start
-
448 readq(membar2
+ MB2_SHADOW_OFFSET
);
449 offset
[1] = vmd
->dev
->resource
[VMD_MEMBAR2
].start
-
450 readq(membar2
+ MB2_SHADOW_OFFSET
+ 8);
451 pci_iounmap(vmd
->dev
, membar2
);
456 * Certain VMD devices may have a root port configuration option which
457 * limits the bus range to between 0-127, 128-255, or 224-255
459 if (features
& VMD_FEAT_HAS_BUS_RESTRICTIONS
) {
462 pci_read_config_word(vmd
->dev
, PCI_REG_VMCAP
, ®16
);
463 if (BUS_RESTRICT_CAP(reg16
)) {
464 pci_read_config_word(vmd
->dev
, PCI_REG_VMCONFIG
,
467 switch (BUS_RESTRICT_CFG(reg16
)) {
469 vmd
->busn_start
= 128;
472 vmd
->busn_start
= 224;
475 pci_err(vmd
->dev
, "Unknown Bus Offset Setting\n");
483 res
= &vmd
->dev
->resource
[VMD_CFGBAR
];
484 vmd
->resources
[0] = (struct resource
) {
485 .name
= "VMD CFGBAR",
486 .start
= vmd
->busn_start
,
487 .end
= vmd
->busn_start
+ (resource_size(res
) >> 20) - 1,
488 .flags
= IORESOURCE_BUS
| IORESOURCE_PCI_FIXED
,
492 * If the window is below 4GB, clear IORESOURCE_MEM_64 so we can
493 * put 32-bit resources in the window.
495 * There's no hardware reason why a 64-bit window *couldn't*
496 * contain a 32-bit resource, but pbus_size_mem() computes the
497 * bridge window size assuming a 64-bit window will contain no
498 * 32-bit resources. __pci_assign_resource() enforces that
499 * artificial restriction to make sure everything will fit.
501 * The only way we could use a 64-bit non-prefetchable MEMBAR is
502 * if its address is <4GB so that we can convert it to a 32-bit
503 * resource. To be visible to the host OS, all VMD endpoints must
504 * be initially configured by platform BIOS, which includes setting
505 * up these resources. We can assume the device is configured
506 * according to the platform needs.
508 res
= &vmd
->dev
->resource
[VMD_MEMBAR1
];
509 upper_bits
= upper_32_bits(res
->end
);
510 flags
= res
->flags
& ~IORESOURCE_SIZEALIGN
;
512 flags
&= ~IORESOURCE_MEM_64
;
513 vmd
->resources
[1] = (struct resource
) {
514 .name
= "VMD MEMBAR1",
521 res
= &vmd
->dev
->resource
[VMD_MEMBAR2
];
522 upper_bits
= upper_32_bits(res
->end
);
523 flags
= res
->flags
& ~IORESOURCE_SIZEALIGN
;
525 flags
&= ~IORESOURCE_MEM_64
;
526 vmd
->resources
[2] = (struct resource
) {
527 .name
= "VMD MEMBAR2",
528 .start
= res
->start
+ membar2_offset
,
534 sd
->vmd_dev
= vmd
->dev
;
535 sd
->domain
= vmd_find_free_domain();
539 sd
->node
= pcibus_to_node(vmd
->dev
->bus
);
541 fn
= irq_domain_alloc_named_id_fwnode("VMD-MSI", vmd
->sysdata
.domain
);
545 vmd
->irq_domain
= pci_msi_create_irq_domain(fn
, &vmd_msi_domain_info
,
547 irq_domain_free_fwnode(fn
);
548 if (!vmd
->irq_domain
)
551 pci_add_resource(&resources
, &vmd
->resources
[0]);
552 pci_add_resource_offset(&resources
, &vmd
->resources
[1], offset
[0]);
553 pci_add_resource_offset(&resources
, &vmd
->resources
[2], offset
[1]);
555 vmd
->bus
= pci_create_root_bus(&vmd
->dev
->dev
, vmd
->busn_start
,
556 &vmd_ops
, sd
, &resources
);
558 pci_free_resource_list(&resources
);
559 irq_domain_remove(vmd
->irq_domain
);
563 vmd_attach_resources(vmd
);
564 dev_set_msi_domain(&vmd
->bus
->dev
, vmd
->irq_domain
);
566 pci_scan_child_bus(vmd
->bus
);
567 pci_assign_unassigned_bus_resources(vmd
->bus
);
570 * VMD root buses are virtual and don't return true on pci_is_pcie()
571 * and will fail pcie_bus_configure_settings() early. It can instead be
572 * run on each of the real root ports.
574 list_for_each_entry(child
, &vmd
->bus
->children
, node
)
575 pcie_bus_configure_settings(child
);
577 pci_bus_add_devices(vmd
->bus
);
579 WARN(sysfs_create_link(&vmd
->dev
->dev
.kobj
, &vmd
->bus
->dev
.kobj
,
580 "domain"), "Can't create symlink to domain\n");
584 static irqreturn_t
vmd_irq(int irq
, void *data
)
586 struct vmd_irq_list
*irqs
= data
;
587 struct vmd_irq
*vmdirq
;
590 idx
= srcu_read_lock(&irqs
->srcu
);
591 list_for_each_entry_rcu(vmdirq
, &irqs
->irq_list
, node
)
592 generic_handle_irq(vmdirq
->virq
);
593 srcu_read_unlock(&irqs
->srcu
, idx
);
598 static int vmd_probe(struct pci_dev
*dev
, const struct pci_device_id
*id
)
603 if (resource_size(&dev
->resource
[VMD_CFGBAR
]) < (1 << 20))
606 vmd
= devm_kzalloc(&dev
->dev
, sizeof(*vmd
), GFP_KERNEL
);
611 err
= pcim_enable_device(dev
);
615 vmd
->cfgbar
= pcim_iomap(dev
, VMD_CFGBAR
, 0);
620 if (dma_set_mask_and_coherent(&dev
->dev
, DMA_BIT_MASK(64)) &&
621 dma_set_mask_and_coherent(&dev
->dev
, DMA_BIT_MASK(32)))
624 vmd
->msix_count
= pci_msix_vec_count(dev
);
625 if (vmd
->msix_count
< 0)
628 vmd
->msix_count
= pci_alloc_irq_vectors(dev
, 1, vmd
->msix_count
,
630 if (vmd
->msix_count
< 0)
631 return vmd
->msix_count
;
633 vmd
->irqs
= devm_kcalloc(&dev
->dev
, vmd
->msix_count
, sizeof(*vmd
->irqs
),
638 for (i
= 0; i
< vmd
->msix_count
; i
++) {
639 err
= init_srcu_struct(&vmd
->irqs
[i
].srcu
);
643 INIT_LIST_HEAD(&vmd
->irqs
[i
].irq_list
);
644 err
= devm_request_irq(&dev
->dev
, pci_irq_vector(dev
, i
),
645 vmd_irq
, IRQF_NO_THREAD
,
646 "vmd", &vmd
->irqs
[i
]);
651 spin_lock_init(&vmd
->cfg_lock
);
652 pci_set_drvdata(dev
, vmd
);
653 err
= vmd_enable_domain(vmd
, (unsigned long) id
->driver_data
);
657 dev_info(&vmd
->dev
->dev
, "Bound to PCI domain %04x\n",
658 vmd
->sysdata
.domain
);
662 static void vmd_cleanup_srcu(struct vmd_dev
*vmd
)
666 for (i
= 0; i
< vmd
->msix_count
; i
++)
667 cleanup_srcu_struct(&vmd
->irqs
[i
].srcu
);
670 static void vmd_remove(struct pci_dev
*dev
)
672 struct vmd_dev
*vmd
= pci_get_drvdata(dev
);
674 sysfs_remove_link(&vmd
->dev
->dev
.kobj
, "domain");
675 pci_stop_root_bus(vmd
->bus
);
676 pci_remove_root_bus(vmd
->bus
);
677 vmd_cleanup_srcu(vmd
);
678 vmd_detach_resources(vmd
);
679 irq_domain_remove(vmd
->irq_domain
);
682 #ifdef CONFIG_PM_SLEEP
683 static int vmd_suspend(struct device
*dev
)
685 struct pci_dev
*pdev
= to_pci_dev(dev
);
686 struct vmd_dev
*vmd
= pci_get_drvdata(pdev
);
689 for (i
= 0; i
< vmd
->msix_count
; i
++)
690 devm_free_irq(dev
, pci_irq_vector(pdev
, i
), &vmd
->irqs
[i
]);
692 pci_save_state(pdev
);
696 static int vmd_resume(struct device
*dev
)
698 struct pci_dev
*pdev
= to_pci_dev(dev
);
699 struct vmd_dev
*vmd
= pci_get_drvdata(pdev
);
702 for (i
= 0; i
< vmd
->msix_count
; i
++) {
703 err
= devm_request_irq(dev
, pci_irq_vector(pdev
, i
),
704 vmd_irq
, IRQF_NO_THREAD
,
705 "vmd", &vmd
->irqs
[i
]);
710 pci_restore_state(pdev
);
714 static SIMPLE_DEV_PM_OPS(vmd_dev_pm_ops
, vmd_suspend
, vmd_resume
);
716 static const struct pci_device_id vmd_ids
[] = {
717 {PCI_DEVICE(PCI_VENDOR_ID_INTEL
, PCI_DEVICE_ID_INTEL_VMD_201D
),},
718 {PCI_DEVICE(PCI_VENDOR_ID_INTEL
, PCI_DEVICE_ID_INTEL_VMD_28C0
),
719 .driver_data
= VMD_FEAT_HAS_MEMBAR_SHADOW
|
720 VMD_FEAT_HAS_BUS_RESTRICTIONS
,},
721 {PCI_DEVICE(PCI_VENDOR_ID_INTEL
, 0x467f),
722 .driver_data
= VMD_FEAT_HAS_BUS_RESTRICTIONS
,},
723 {PCI_DEVICE(PCI_VENDOR_ID_INTEL
, 0x4c3d),
724 .driver_data
= VMD_FEAT_HAS_BUS_RESTRICTIONS
,},
725 {PCI_DEVICE(PCI_VENDOR_ID_INTEL
, PCI_DEVICE_ID_INTEL_VMD_9A0B
),
726 .driver_data
= VMD_FEAT_HAS_BUS_RESTRICTIONS
,},
729 MODULE_DEVICE_TABLE(pci
, vmd_ids
);
731 static struct pci_driver vmd_drv
= {
735 .remove
= vmd_remove
,
737 .pm
= &vmd_dev_pm_ops
,
740 module_pci_driver(vmd_drv
);
742 MODULE_AUTHOR("Intel Corporation");
743 MODULE_LICENSE("GPL v2");
744 MODULE_VERSION("0.6");