// SPDX-License-Identifier: GPL-2.0
/*
 * Volume Management Device driver
 * Copyright (c) 2015, Intel Corporation.
 */

#include <linux/device.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/msi.h>
#include <linux/pci.h>
#include <linux/pci-acpi.h>
#include <linux/pci-ecam.h>
#include <linux/srcu.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>

#include <asm/irqdomain.h>

#define VMD_CFGBAR	0
#define VMD_MEMBAR1	2
#define VMD_MEMBAR2	4

#define PCI_REG_VMCAP		0x40
#define BUS_RESTRICT_CAP(vmcap)	(vmcap & 0x1)
#define PCI_REG_VMCONFIG	0x44
#define BUS_RESTRICT_CFG(vmcfg)	((vmcfg >> 8) & 0x3)
#define VMCONFIG_MSI_REMAP	0x2
#define PCI_REG_VMLOCK		0x70
#define MB2_SHADOW_EN(vmlock)	(vmlock & 0x2)

#define MB2_SHADOW_OFFSET	0x2000
#define MB2_SHADOW_SIZE		16

enum vmd_features {
	/*
	 * Device may contain registers which hint the physical location of the
	 * membars, in order to allow proper address translation during
	 * resource assignment to enable guest virtualization
	 */
	VMD_FEAT_HAS_MEMBAR_SHADOW		= (1 << 0),

	/*
	 * Device may provide root port configuration information which limits
	 * bus numbering
	 */
	VMD_FEAT_HAS_BUS_RESTRICTIONS		= (1 << 1),

	/*
	 * Device contains physical location shadow registers in
	 * vendor-specific capability space
	 */
	VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP	= (1 << 2),

	/*
	 * Device may use MSI-X vector 0 for software triggering and will not
	 * be used for MSI remapping
	 */
	VMD_FEAT_OFFSET_FIRST_VECTOR		= (1 << 3),

	/*
	 * Device can bypass remapping MSI-X transactions into its MSI-X table,
	 * avoiding the requirement of a VMD MSI domain for child device
	 * interrupt handling.
	 */
	VMD_FEAT_CAN_BYPASS_MSI_REMAP		= (1 << 4),

	/*
	 * Enable ASPM on the PCIE root ports and set the default LTR of the
	 * storage devices on platforms where these values are not configured by
	 * BIOS. This is needed for laptops, which require these settings for
	 * proper power management of the SoC.
	 */
	VMD_FEAT_BIOS_PM_QUIRK			= (1 << 5),
};
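
/*
 * 0x1003 decodes, per the PCIe LTR register format, as value 3 with scale 4
 * (multiplier 1,048,576 ns), i.e. 3 * 1,048,576 ns = 3,145,728 ns.
 */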
#define VMD_BIOS_PM_QUIRK_LTR	0x1003	/* 3145728 ns */

#define VMD_FEATS_CLIENT	(VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP |	\
				 VMD_FEAT_HAS_BUS_RESTRICTIONS |	\
				 VMD_FEAT_OFFSET_FIRST_VECTOR |		\
				 VMD_FEAT_BIOS_PM_QUIRK)

static DEFINE_IDA(vmd_instance_ida);

/*
 * Lock for manipulating VMD IRQ lists.
 */
static DEFINE_RAW_SPINLOCK(list_lock);

/**
 * struct vmd_irq - private data to map driver IRQ to the VMD shared vector
 * @node:	list item for parent traversal.
 * @irq:	back pointer to parent.
 * @enabled:	true if driver enabled IRQ
 * @virq:	the virtual IRQ value provided to the requesting driver.
 *
 * Every MSI/MSI-X IRQ requested for a device in a VMD domain will be mapped to
 * a VMD IRQ using this structure.
 */
struct vmd_irq {
	struct list_head	node;
	struct vmd_irq_list	*irq;
	bool			enabled;
	unsigned int		virq;
};

/**
 * struct vmd_irq_list - list of driver requested IRQs mapping to a VMD vector
 * @irq_list:	the list of irq's the VMD one demuxes to.
 * @srcu:	SRCU struct for local synchronization.
 * @count:	number of child IRQs assigned to this vector; used to track
 *		sharing.
 * @virq:	The underlying VMD Linux interrupt number
 */
struct vmd_irq_list {
	struct list_head	irq_list;
	struct srcu_struct	srcu;
	unsigned int		count;
	unsigned int		virq;
};

struct vmd_dev {
	struct pci_dev		*dev;

	spinlock_t		cfg_lock;
	void __iomem		*cfgbar;

	int			msix_count;
	struct vmd_irq_list	*irqs;

	struct pci_sysdata	sysdata;
	struct resource		resources[3];
	struct irq_domain	*irq_domain;
	struct pci_bus		*bus;
	u8			busn_start;
	u8			first_vec;
	char			*name;
	int			instance;
};

static inline struct vmd_dev *vmd_from_bus(struct pci_bus *bus)
{
	return container_of(bus->sysdata, struct vmd_dev, sysdata);
}

static inline unsigned int index_from_irqs(struct vmd_dev *vmd,
					   struct vmd_irq_list *irqs)
{
	return irqs - vmd->irqs;
}
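
/*
 * The index returned here is simply the position of the vmd_irq_list entry
 * in vmd->irqs; vmd_compose_msi_msg() below uses it as the destination ID
 * that ends up programmed into the child device's MSI/MSI-X entry.
 */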

/*
 * Drivers managing a device in a VMD domain allocate their own IRQs as before,
 * but the MSI entry for the hardware it's driving will be programmed with a
 * destination ID for the VMD MSI-X table.  The VMD muxes interrupts in its
 * domain into one of its own, and the VMD driver de-muxes these for the
 * handlers sharing that VMD IRQ.  The vmd irq_domain provides the operations
 * and irq_chip to set this up.
 */
static void vmd_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
{
	struct vmd_irq *vmdirq = data->chip_data;
	struct vmd_irq_list *irq = vmdirq->irq;
	struct vmd_dev *vmd = irq_data_get_irq_handler_data(data);

	memset(msg, 0, sizeof(*msg));
	msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH;
	msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW;
	msg->arch_addr_lo.destid_0_7 = index_from_irqs(vmd, irq);
}

/*
 * We rely on MSI_FLAG_USE_DEF_CHIP_OPS to set the IRQ mask/unmask ops.
 */
static void vmd_irq_enable(struct irq_data *data)
{
	struct vmd_irq *vmdirq = data->chip_data;
	unsigned long flags;

	raw_spin_lock_irqsave(&list_lock, flags);
	WARN_ON(vmdirq->enabled);
	list_add_tail_rcu(&vmdirq->node, &vmdirq->irq->irq_list);
	vmdirq->enabled = true;
	raw_spin_unlock_irqrestore(&list_lock, flags);

	data->chip->irq_unmask(data);
}

static void vmd_irq_disable(struct irq_data *data)
{
	struct vmd_irq *vmdirq = data->chip_data;
	unsigned long flags;

	data->chip->irq_mask(data);

	raw_spin_lock_irqsave(&list_lock, flags);
	if (vmdirq->enabled) {
		list_del_rcu(&vmdirq->node);
		vmdirq->enabled = false;
	}
	raw_spin_unlock_irqrestore(&list_lock, flags);
}

static struct irq_chip vmd_msi_controller = {
	.name			= "VMD-MSI",
	.irq_enable		= vmd_irq_enable,
	.irq_disable		= vmd_irq_disable,
	.irq_compose_msi_msg	= vmd_compose_msi_msg,
};

static irq_hw_number_t vmd_get_hwirq(struct msi_domain_info *info,
				     msi_alloc_info_t *arg)
{
	return 0;
}

/*
 * XXX: We can be even smarter selecting the best IRQ once we solve the
 * affinity problem.
 */
static struct vmd_irq_list *vmd_next_irq(struct vmd_dev *vmd, struct msi_desc *desc)
{
	unsigned long flags;
	int i, best;

	if (vmd->msix_count == 1 + vmd->first_vec)
		return &vmd->irqs[vmd->first_vec];

	/*
	 * White list for fast-interrupt handlers. All others will share the
	 * "slow" interrupt vector.
	 */
	switch (msi_desc_to_pci_dev(desc)->class) {
	case PCI_CLASS_STORAGE_EXPRESS:
		break;
	default:
		return &vmd->irqs[vmd->first_vec];
	}

	raw_spin_lock_irqsave(&list_lock, flags);
	best = vmd->first_vec + 1;
	for (i = best; i < vmd->msix_count; i++)
		if (vmd->irqs[i].count < vmd->irqs[best].count)
			best = i;
	vmd->irqs[best].count++;
	raw_spin_unlock_irqrestore(&list_lock, flags);

	return &vmd->irqs[best];
}
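
/*
 * Summary of the policy above: NVMe-class devices (PCI_CLASS_STORAGE_EXPRESS)
 * are spread across the remaining vectors with a simple least-loaded search,
 * while all other device classes share the first usable ("slow") vector.
 */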

static int vmd_msi_init(struct irq_domain *domain, struct msi_domain_info *info,
			unsigned int virq, irq_hw_number_t hwirq,
			msi_alloc_info_t *arg)
{
	struct msi_desc *desc = arg->desc;
	struct vmd_dev *vmd = vmd_from_bus(msi_desc_to_pci_dev(desc)->bus);
	struct vmd_irq *vmdirq = kzalloc(sizeof(*vmdirq), GFP_KERNEL);

	if (!vmdirq)
		return -ENOMEM;

	INIT_LIST_HEAD(&vmdirq->node);
	vmdirq->irq = vmd_next_irq(vmd, desc);
	vmdirq->virq = virq;

	irq_domain_set_info(domain, virq, vmdirq->irq->virq, info->chip, vmdirq,
			    handle_untracked_irq, vmd, NULL);
	return 0;
}

static void vmd_msi_free(struct irq_domain *domain,
			 struct msi_domain_info *info, unsigned int virq)
{
	struct vmd_irq *vmdirq = irq_get_chip_data(virq);
	unsigned long flags;

	synchronize_srcu(&vmdirq->irq->srcu);

	/* XXX: Potential optimization to rebalance */
	raw_spin_lock_irqsave(&list_lock, flags);
	vmdirq->irq->count--;
	raw_spin_unlock_irqrestore(&list_lock, flags);

	kfree(vmdirq);
}

static int vmd_msi_prepare(struct irq_domain *domain, struct device *dev,
			   int nvec, msi_alloc_info_t *arg)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct vmd_dev *vmd = vmd_from_bus(pdev->bus);

	if (nvec > vmd->msix_count)
		return vmd->msix_count;

	memset(arg, 0, sizeof(*arg));
	return 0;
}

static void vmd_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc)
{
	arg->desc = desc;
}

static struct msi_domain_ops vmd_msi_domain_ops = {
	.get_hwirq	= vmd_get_hwirq,
	.msi_init	= vmd_msi_init,
	.msi_free	= vmd_msi_free,
	.msi_prepare	= vmd_msi_prepare,
	.set_desc	= vmd_set_desc,
};

static struct msi_domain_info vmd_msi_domain_info = {
	.flags		= MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
			  MSI_FLAG_NO_AFFINITY | MSI_FLAG_PCI_MSIX,
	.ops		= &vmd_msi_domain_ops,
	.chip		= &vmd_msi_controller,
};

static void vmd_set_msi_remapping(struct vmd_dev *vmd, bool enable)
{
	u16 reg;

	pci_read_config_word(vmd->dev, PCI_REG_VMCONFIG, &reg);
	reg = enable ? (reg & ~VMCONFIG_MSI_REMAP) :
		       (reg | VMCONFIG_MSI_REMAP);
	pci_write_config_word(vmd->dev, PCI_REG_VMCONFIG, reg);
}
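
/*
 * Note the inverted sense above: a set VMCONFIG_MSI_REMAP bit disables
 * remapping, so enabling remapping means clearing the bit and vice versa.
 */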

static int vmd_create_irq_domain(struct vmd_dev *vmd)
{
	struct fwnode_handle *fn;

	fn = irq_domain_alloc_named_id_fwnode("VMD-MSI", vmd->sysdata.domain);
	if (!fn)
		return -ENODEV;

	vmd->irq_domain = pci_msi_create_irq_domain(fn, &vmd_msi_domain_info, NULL);
	if (!vmd->irq_domain) {
		irq_domain_free_fwnode(fn);
		return -ENODEV;
	}

	return 0;
}

static void vmd_remove_irq_domain(struct vmd_dev *vmd)
{
	/*
	 * Some production BIOS won't enable remapping between soft reboots.
	 * Ensure remapping is restored before unloading the driver.
	 */
	if (!vmd->msix_count)
		vmd_set_msi_remapping(vmd, true);

	if (vmd->irq_domain) {
		struct fwnode_handle *fn = vmd->irq_domain->fwnode;

		irq_domain_remove(vmd->irq_domain);
		irq_domain_free_fwnode(fn);
	}
}

static void __iomem *vmd_cfg_addr(struct vmd_dev *vmd, struct pci_bus *bus,
				  unsigned int devfn, int reg, int len)
{
	unsigned int busnr_ecam = bus->number - vmd->busn_start;
	u32 offset = PCIE_ECAM_OFFSET(busnr_ecam, devfn, reg);

	if (offset + len >= resource_size(&vmd->dev->resource[VMD_CFGBAR]))
		return NULL;

	return vmd->cfgbar + offset;
}
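
/*
 * PCIE_ECAM_OFFSET() from <linux/pci-ecam.h> follows the standard ECAM
 * layout, roughly (bus << 20) | (devfn << 12) | reg, so every function gets
 * a 4K config window inside CFGBAR, relative to vmd->busn_start.
 */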

/*
 * CPU may deadlock if config space is not serialized on some versions of this
 * hardware, so all config space access is done under a spinlock.
 */
static int vmd_pci_read(struct pci_bus *bus, unsigned int devfn, int reg,
			int len, u32 *value)
{
	struct vmd_dev *vmd = vmd_from_bus(bus);
	void __iomem *addr = vmd_cfg_addr(vmd, bus, devfn, reg, len);
	unsigned long flags;
	int ret = 0;

	if (!addr)
		return -EFAULT;

	spin_lock_irqsave(&vmd->cfg_lock, flags);
	switch (len) {
	case 1:
		*value = readb(addr);
		break;
	case 2:
		*value = readw(addr);
		break;
	case 4:
		*value = readl(addr);
		break;
	default:
		ret = -EINVAL;
		break;
	}
	spin_unlock_irqrestore(&vmd->cfg_lock, flags);
	return ret;
}

/*
 * VMD h/w converts non-posted config writes to posted memory writes. The
 * read-back in this function forces the completion so it returns only after
 * the config space was written, as expected.
 */
static int vmd_pci_write(struct pci_bus *bus, unsigned int devfn, int reg,
			 int len, u32 value)
{
	struct vmd_dev *vmd = vmd_from_bus(bus);
	void __iomem *addr = vmd_cfg_addr(vmd, bus, devfn, reg, len);
	unsigned long flags;
	int ret = 0;

	if (!addr)
		return -EFAULT;

	spin_lock_irqsave(&vmd->cfg_lock, flags);
	switch (len) {
	case 1:
		writeb(value, addr);
		readb(addr);
		break;
	case 2:
		writew(value, addr);
		readw(addr);
		break;
	case 4:
		writel(value, addr);
		readl(addr);
		break;
	default:
		ret = -EINVAL;
		break;
	}
	spin_unlock_irqrestore(&vmd->cfg_lock, flags);
	return ret;
}

static struct pci_ops vmd_ops = {
	.read		= vmd_pci_read,
	.write		= vmd_pci_write,
};

#ifdef CONFIG_ACPI
static struct acpi_device *vmd_acpi_find_companion(struct pci_dev *pci_dev)
{
	struct pci_host_bridge *bridge;
	u32 busnr, addr;

	if (pci_dev->bus->ops != &vmd_ops)
		return NULL;

	bridge = pci_find_host_bridge(pci_dev->bus);
	busnr = pci_dev->bus->number - bridge->bus->number;

	/*
	 * The address computation below is only applicable to relative bus
	 * numbers below 32.
	 */
	if (busnr > 31)
		return NULL;

	addr = (busnr << 24) | ((u32)pci_dev->devfn << 16) | 0x8000FFFFU;

	dev_dbg(&pci_dev->dev, "Looking for ACPI companion (address 0x%x)\n",
		addr);

	return acpi_find_child_device(ACPI_COMPANION(bridge->dev.parent), addr,
				      false);
}

static bool hook_installed;

static void vmd_acpi_begin(void)
{
	if (pci_acpi_set_companion_lookup_hook(vmd_acpi_find_companion))
		return;

	hook_installed = true;
}

static void vmd_acpi_end(void)
{
	if (!hook_installed)
		return;

	pci_acpi_clear_companion_lookup_hook();
	hook_installed = false;
}
#else
static inline void vmd_acpi_begin(void) { }
static inline void vmd_acpi_end(void) { }
#endif /* CONFIG_ACPI */

static void vmd_domain_reset(struct vmd_dev *vmd)
{
	u16 bus, max_buses = resource_size(&vmd->resources[0]);
	u8 dev, functions, fn, hdr_type;
	char __iomem *base;

	for (bus = 0; bus < max_buses; bus++) {
		for (dev = 0; dev < 32; dev++) {
			base = vmd->cfgbar + PCIE_ECAM_OFFSET(bus,
						PCI_DEVFN(dev, 0), 0);

			hdr_type = readb(base + PCI_HEADER_TYPE);

			functions = (hdr_type & PCI_HEADER_TYPE_MFD) ? 8 : 1;
			for (fn = 0; fn < functions; fn++) {
				base = vmd->cfgbar + PCIE_ECAM_OFFSET(bus,
						PCI_DEVFN(dev, fn), 0);

				hdr_type = readb(base + PCI_HEADER_TYPE) &
						PCI_HEADER_TYPE_MASK;

				if (hdr_type != PCI_HEADER_TYPE_BRIDGE ||
				    (readw(base + PCI_CLASS_DEVICE) !=
				     PCI_CLASS_BRIDGE_PCI))
					continue;

				/*
				 * Temporarily disable the I/O range before updating
				 * PCI_IO_BASE.
				 */
				writel(0x0000ffff, base + PCI_IO_BASE_UPPER16);
				/* Update lower 16 bits of I/O base/limit */
				writew(0x00f0, base + PCI_IO_BASE);
				/* Update upper 16 bits of I/O base/limit */
				writel(0, base + PCI_IO_BASE_UPPER16);

				/* MMIO Base/Limit */
				writel(0x0000fff0, base + PCI_MEMORY_BASE);

				/* Prefetchable MMIO Base/Limit */
				writel(0, base + PCI_PREF_LIMIT_UPPER32);
				writel(0x0000fff0, base + PCI_PREF_MEMORY_BASE);
				writel(0xffffffff, base + PCI_PREF_BASE_UPPER32);
			}
		}
	}
}
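
/*
 * The writes above leave each bridge window with base above limit (e.g.
 * memory base 0xfff0 against limit 0x0000), which effectively closes the
 * I/O, MMIO and prefetchable windows so they can be reassigned from a
 * clean state once the domain is re-enumerated.
 */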

static void vmd_attach_resources(struct vmd_dev *vmd)
{
	vmd->dev->resource[VMD_MEMBAR1].child = &vmd->resources[1];
	vmd->dev->resource[VMD_MEMBAR2].child = &vmd->resources[2];
}

static void vmd_detach_resources(struct vmd_dev *vmd)
{
	vmd->dev->resource[VMD_MEMBAR1].child = NULL;
	vmd->dev->resource[VMD_MEMBAR2].child = NULL;
}

/*
 * VMD domains start at 0x10000 to not clash with ACPI _SEG domains.
 * Per ACPI r6.0, sec 6.5.6, _SEG returns an integer, of which the lower
 * 16 bits are the PCI Segment Group (domain) number. Other bits are
 * currently reserved.
 */
static int vmd_find_free_domain(void)
{
	int domain = 0xffff;
	struct pci_bus *bus = NULL;

	while ((bus = pci_find_next_bus(bus)) != NULL)
		domain = max_t(int, domain, pci_domain_nr(bus));
	return domain + 1;
}
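
/*
 * Example: on a host whose only existing segment is domain 0, the scan above
 * leaves domain at 0xffff and the first VMD instance is assigned 0x10000.
 */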

static int vmd_get_phys_offsets(struct vmd_dev *vmd, bool native_hint,
				resource_size_t *offset1,
				resource_size_t *offset2)
{
	struct pci_dev *dev = vmd->dev;
	u64 phys1, phys2;

	if (native_hint) {
		u32 vmlock;
		int ret;

		ret = pci_read_config_dword(dev, PCI_REG_VMLOCK, &vmlock);
		if (ret || PCI_POSSIBLE_ERROR(vmlock))
			return -ENODEV;

		if (MB2_SHADOW_EN(vmlock)) {
			void __iomem *membar2;

			membar2 = pci_iomap(dev, VMD_MEMBAR2, 0);
			if (!membar2)
				return -ENOMEM;
			phys1 = readq(membar2 + MB2_SHADOW_OFFSET);
			phys2 = readq(membar2 + MB2_SHADOW_OFFSET + 8);
			pci_iounmap(dev, membar2);
		} else
			return 0;
	} else {
		/* Hypervisor-Emulated Vendor-Specific Capability */
		int pos = pci_find_capability(dev, PCI_CAP_ID_VNDR);
		u32 reg, regu;

		pci_read_config_dword(dev, pos + 4, &reg);

		/* 0x53484457 is ASCII for "SHDW" */
		if (pos && reg == 0x53484457) {
			pci_read_config_dword(dev, pos + 8, &reg);
			pci_read_config_dword(dev, pos + 12, &regu);
			phys1 = (u64) regu << 32 | reg;

			pci_read_config_dword(dev, pos + 16, &reg);
			pci_read_config_dword(dev, pos + 20, &regu);
			phys2 = (u64) regu << 32 | reg;
		} else
			return 0;
	}

	*offset1 = dev->resource[VMD_MEMBAR1].start -
			(phys1 & PCI_BASE_ADDRESS_MEM_MASK);
	*offset2 = dev->resource[VMD_MEMBAR2].start -
			(phys2 & PCI_BASE_ADDRESS_MEM_MASK);

	return 0;
}
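
/*
 * The resulting offsets are the difference between the membars as seen
 * through VMD and the host physical addresses reported by the shadow
 * registers (or the "SHDW" vendor capability). vmd_enable_domain() passes
 * them to pci_add_resource_offset() so child BARs get assigned host-valid
 * addresses when running as a guest.
 */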

static int vmd_get_bus_number_start(struct vmd_dev *vmd)
{
	struct pci_dev *dev = vmd->dev;
	u16 reg;

	pci_read_config_word(dev, PCI_REG_VMCAP, &reg);
	if (BUS_RESTRICT_CAP(reg)) {
		pci_read_config_word(dev, PCI_REG_VMCONFIG, &reg);

		switch (BUS_RESTRICT_CFG(reg)) {
		case 0:
			vmd->busn_start = 0;
			break;
		case 1:
			vmd->busn_start = 128;
			break;
		case 2:
			vmd->busn_start = 224;
			break;
		default:
			pci_err(dev, "Unknown Bus Offset Setting (%d)\n",
				BUS_RESTRICT_CFG(reg));
			return -ENODEV;
		}
	}

	return 0;
}

static irqreturn_t vmd_irq(int irq, void *data)
{
	struct vmd_irq_list *irqs = data;
	struct vmd_irq *vmdirq;
	int idx;

	idx = srcu_read_lock(&irqs->srcu);
	list_for_each_entry_rcu(vmdirq, &irqs->irq_list, node)
		generic_handle_irq(vmdirq->virq);
	srcu_read_unlock(&irqs->srcu, idx);

	return IRQ_HANDLED;
}

static int vmd_alloc_irqs(struct vmd_dev *vmd)
{
	struct pci_dev *dev = vmd->dev;
	int i, err;

	vmd->msix_count = pci_msix_vec_count(dev);
	if (vmd->msix_count < 0)
		return -ENODEV;

	vmd->msix_count = pci_alloc_irq_vectors(dev, vmd->first_vec + 1,
						vmd->msix_count, PCI_IRQ_MSIX);
	if (vmd->msix_count < 0)
		return vmd->msix_count;

	vmd->irqs = devm_kcalloc(&dev->dev, vmd->msix_count, sizeof(*vmd->irqs),
				 GFP_KERNEL);
	if (!vmd->irqs)
		return -ENOMEM;

	for (i = 0; i < vmd->msix_count; i++) {
		err = init_srcu_struct(&vmd->irqs[i].srcu);
		if (err)
			return err;

		INIT_LIST_HEAD(&vmd->irqs[i].irq_list);
		vmd->irqs[i].virq = pci_irq_vector(dev, i);
		err = devm_request_irq(&dev->dev, vmd->irqs[i].virq,
				       vmd_irq, IRQF_NO_THREAD,
				       vmd->name, &vmd->irqs[i]);
		if (err)
			return err;
	}

	return 0;
}
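
/*
 * Note: vectors below vmd->first_vec are never handed to child devices. On
 * parts with VMD_FEAT_OFFSET_FIRST_VECTOR, vmd_probe() sets first_vec to 1
 * so MSI-X vector 0 stays reserved for the device's software triggering.
 */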

/*
 * Since VMD is an aperture to regular PCIe root ports, only allow it to
 * control features that the OS is allowed to control on the physical PCI bus.
 */
static void vmd_copy_host_bridge_flags(struct pci_host_bridge *root_bridge,
				       struct pci_host_bridge *vmd_bridge)
{
	vmd_bridge->native_pcie_hotplug = root_bridge->native_pcie_hotplug;
	vmd_bridge->native_shpc_hotplug = root_bridge->native_shpc_hotplug;
	vmd_bridge->native_aer = root_bridge->native_aer;
	vmd_bridge->native_pme = root_bridge->native_pme;
	vmd_bridge->native_ltr = root_bridge->native_ltr;
	vmd_bridge->native_dpc = root_bridge->native_dpc;
}

/*
 * Enable ASPM and LTR settings on devices that aren't configured by BIOS.
 */
static int vmd_pm_enable_quirk(struct pci_dev *pdev, void *userdata)
{
	unsigned long features = *(unsigned long *)userdata;
	u16 ltr = VMD_BIOS_PM_QUIRK_LTR;
	u32 ltr_reg;
	int pos;

	if (!(features & VMD_FEAT_BIOS_PM_QUIRK))
		return 0;

	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_LTR);
	if (!pos)
		goto out_state_change;

	/*
	 * Skip if the max snoop LTR is non-zero, indicating BIOS has set it
	 * so the LTR quirk is not needed.
	 */
	pci_read_config_dword(pdev, pos + PCI_LTR_MAX_SNOOP_LAT, &ltr_reg);
	if (!!(ltr_reg & (PCI_LTR_VALUE_MASK | PCI_LTR_SCALE_MASK)))
		goto out_state_change;

	/*
	 * Set the default values to the maximum required by the platform to
	 * allow the deepest power management savings. Write as a DWORD where
	 * the lower word is the max snoop latency and the upper word is the
	 * max non-snoop latency.
	 */
	ltr_reg = (ltr << 16) | ltr;
	pci_write_config_dword(pdev, pos + PCI_LTR_MAX_SNOOP_LAT, ltr_reg);
	pci_info(pdev, "VMD: Default LTR value set by driver\n");

out_state_change:
	/*
	 * Ensure devices are in D0 before enabling PCI-PM L1 PM Substates, per
	 * PCIe r6.0, sec 5.5.4.
	 */
	pci_set_power_state_locked(pdev, PCI_D0);
	pci_enable_link_state_locked(pdev, PCIE_LINK_STATE_ALL);
	return 0;
}
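
/*
 * PCIE_LINK_STATE_ALL covers L0s, L1, the L1 substates and clock PM;
 * pci_enable_link_state_locked() is expected to enable only the subset the
 * link actually supports and leave the rest disabled.
 */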

static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features)
{
	struct pci_sysdata *sd = &vmd->sysdata;
	struct resource *res;
	u32 upper_bits;
	unsigned long flags;
	LIST_HEAD(resources);
	resource_size_t offset[2] = {0};
	resource_size_t membar2_offset = 0x2000;
	struct pci_bus *child;
	struct pci_dev *dev;
	int ret;

	/*
	 * Shadow registers may exist in certain VMD device ids which allow
	 * guests to correctly assign host physical addresses to the root ports
	 * and child devices. These registers will either return the host value
	 * or 0, depending on an enable bit in the VMD device.
	 */
	if (features & VMD_FEAT_HAS_MEMBAR_SHADOW) {
		membar2_offset = MB2_SHADOW_OFFSET + MB2_SHADOW_SIZE;
		ret = vmd_get_phys_offsets(vmd, true, &offset[0], &offset[1]);
		if (ret)
			return ret;
	} else if (features & VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP) {
		ret = vmd_get_phys_offsets(vmd, false, &offset[0], &offset[1]);
		if (ret)
			return ret;
	}

	/*
	 * Certain VMD devices may have a root port configuration option which
	 * limits the bus range to between 0-127, 128-255, or 224-255
	 */
	if (features & VMD_FEAT_HAS_BUS_RESTRICTIONS) {
		ret = vmd_get_bus_number_start(vmd);
		if (ret)
			return ret;
	}

	res = &vmd->dev->resource[VMD_CFGBAR];
	vmd->resources[0] = (struct resource) {
		.name  = "VMD CFGBAR",
		.start = vmd->busn_start,
		.end   = vmd->busn_start + (resource_size(res) >> 20) - 1,
		.flags = IORESOURCE_BUS | IORESOURCE_PCI_FIXED,
	};

	/*
	 * If the window is below 4GB, clear IORESOURCE_MEM_64 so we can
	 * put 32-bit resources in the window.
	 *
	 * There's no hardware reason why a 64-bit window *couldn't*
	 * contain a 32-bit resource, but pbus_size_mem() computes the
	 * bridge window size assuming a 64-bit window will contain no
	 * 32-bit resources. __pci_assign_resource() enforces that
	 * artificial restriction to make sure everything will fit.
	 *
	 * The only way we could use a 64-bit non-prefetchable MEMBAR is
	 * if its address is <4GB so that we can convert it to a 32-bit
	 * resource. To be visible to the host OS, all VMD endpoints must
	 * be initially configured by platform BIOS, which includes setting
	 * up these resources. We can assume the device is configured
	 * according to the platform needs.
	 */
	res = &vmd->dev->resource[VMD_MEMBAR1];
	upper_bits = upper_32_bits(res->end);
	flags = res->flags & ~IORESOURCE_SIZEALIGN;
	if (!upper_bits)
		flags &= ~IORESOURCE_MEM_64;
	vmd->resources[1] = (struct resource) {
		.name   = "VMD MEMBAR1",
		.start  = res->start,
		.end    = res->end,
		.flags  = flags,
		.parent = res,
	};

	res = &vmd->dev->resource[VMD_MEMBAR2];
	upper_bits = upper_32_bits(res->end);
	flags = res->flags & ~IORESOURCE_SIZEALIGN;
	if (!upper_bits)
		flags &= ~IORESOURCE_MEM_64;
	vmd->resources[2] = (struct resource) {
		.name   = "VMD MEMBAR2",
		.start  = res->start + membar2_offset,
		.end    = res->end,
		.flags  = flags,
		.parent = res,
	};

	sd->vmd_dev = vmd->dev;
	sd->domain = vmd_find_free_domain();
	if (sd->domain < 0)
		return sd->domain;

	sd->node = pcibus_to_node(vmd->dev->bus);

	/*
	 * Currently MSI remapping must be enabled in guest passthrough mode
	 * due to some missing interrupt remapping plumbing. This is probably
	 * acceptable because the guest is usually CPU-limited and MSI
	 * remapping doesn't become a performance bottleneck.
	 */
	if (!(features & VMD_FEAT_CAN_BYPASS_MSI_REMAP) ||
	    offset[0] || offset[1]) {
		ret = vmd_alloc_irqs(vmd);
		if (ret)
			return ret;

		vmd_set_msi_remapping(vmd, true);

		ret = vmd_create_irq_domain(vmd);
		if (ret)
			return ret;

		/*
		 * Override the IRQ domain bus token so the domain can be
		 * distinguished from a regular PCI/MSI domain.
		 */
		irq_domain_update_bus_token(vmd->irq_domain, DOMAIN_BUS_VMD_MSI);
	} else {
		vmd_set_msi_remapping(vmd, false);
	}

	pci_add_resource(&resources, &vmd->resources[0]);
	pci_add_resource_offset(&resources, &vmd->resources[1], offset[0]);
	pci_add_resource_offset(&resources, &vmd->resources[2], offset[1]);

	vmd->bus = pci_create_root_bus(&vmd->dev->dev, vmd->busn_start,
				       &vmd_ops, sd, &resources);
	if (!vmd->bus) {
		pci_free_resource_list(&resources);
		vmd_remove_irq_domain(vmd);
		return -ENODEV;
	}

	vmd_copy_host_bridge_flags(pci_find_host_bridge(vmd->dev->bus),
				   to_pci_host_bridge(vmd->bus->bridge));

	vmd_attach_resources(vmd);
	if (vmd->irq_domain)
		dev_set_msi_domain(&vmd->bus->dev, vmd->irq_domain);
	else
		dev_set_msi_domain(&vmd->bus->dev,
				   dev_get_msi_domain(&vmd->dev->dev));

	WARN(sysfs_create_link(&vmd->dev->dev.kobj, &vmd->bus->dev.kobj,
			       "domain"), "Can't create symlink to domain\n");

	vmd_acpi_begin();

	pci_scan_child_bus(vmd->bus);
	vmd_domain_reset(vmd);

	/*
	 * When Intel VMD is enabled, the OS does not discover the Root Ports
	 * owned by Intel VMD within the MMCFG space. pci_reset_bus() applies
	 * a reset to the parent of the PCI device supplied as argument. This
	 * is why we pass a child device, so the reset can be triggered at
	 * the Intel bridge level and propagated to all the children in the
	 * hierarchy.
	 */
	list_for_each_entry(child, &vmd->bus->children, node) {
		if (!list_empty(&child->devices)) {
			dev = list_first_entry(&child->devices,
					       struct pci_dev, bus_list);
			ret = pci_reset_bus(dev);
			if (ret)
				pci_warn(dev, "can't reset device: %d\n", ret);

			break;
		}
	}

	pci_assign_unassigned_bus_resources(vmd->bus);

	pci_walk_bus(vmd->bus, vmd_pm_enable_quirk, &features);

	/*
	 * VMD root buses are virtual and don't return true on pci_is_pcie()
	 * and will fail pcie_bus_configure_settings() early. It can instead be
	 * run on each of the real root ports.
	 */
	list_for_each_entry(child, &vmd->bus->children, node)
		pcie_bus_configure_settings(child);

	pci_bus_add_devices(vmd->bus);

	vmd_acpi_end();
	return 0;
}

static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id)
{
	unsigned long features = (unsigned long) id->driver_data;
	struct vmd_dev *vmd;
	int err;

	if (resource_size(&dev->resource[VMD_CFGBAR]) < (1 << 20))
		return -ENOMEM;

	vmd = devm_kzalloc(&dev->dev, sizeof(*vmd), GFP_KERNEL);
	if (!vmd)
		return -ENOMEM;

	vmd->dev = dev;
	vmd->instance = ida_alloc(&vmd_instance_ida, GFP_KERNEL);
	if (vmd->instance < 0)
		return vmd->instance;

	vmd->name = devm_kasprintf(&dev->dev, GFP_KERNEL, "vmd%d",
				   vmd->instance);
	if (!vmd->name) {
		err = -ENOMEM;
		goto out_release_instance;
	}

	err = pcim_enable_device(dev);
	if (err < 0)
		goto out_release_instance;

	vmd->cfgbar = pcim_iomap(dev, VMD_CFGBAR, 0);
	if (!vmd->cfgbar) {
		err = -ENOMEM;
		goto out_release_instance;
	}

	pci_set_master(dev);
	if (dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(64)) &&
	    dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(32))) {
		err = -ENODEV;
		goto out_release_instance;
	}

	if (features & VMD_FEAT_OFFSET_FIRST_VECTOR)
		vmd->first_vec = 1;

	spin_lock_init(&vmd->cfg_lock);
	pci_set_drvdata(dev, vmd);
	err = vmd_enable_domain(vmd, features);
	if (err)
		goto out_release_instance;

	dev_info(&vmd->dev->dev, "Bound to PCI domain %04x\n",
		 vmd->sysdata.domain);
	return 0;

 out_release_instance:
	ida_free(&vmd_instance_ida, vmd->instance);
	return err;
}

static void vmd_cleanup_srcu(struct vmd_dev *vmd)
{
	int i;

	for (i = 0; i < vmd->msix_count; i++)
		cleanup_srcu_struct(&vmd->irqs[i].srcu);
}

static void vmd_remove(struct pci_dev *dev)
{
	struct vmd_dev *vmd = pci_get_drvdata(dev);

	pci_stop_root_bus(vmd->bus);
	sysfs_remove_link(&vmd->dev->dev.kobj, "domain");
	pci_remove_root_bus(vmd->bus);
	vmd_cleanup_srcu(vmd);
	vmd_detach_resources(vmd);
	vmd_remove_irq_domain(vmd);
	ida_free(&vmd_instance_ida, vmd->instance);
}

static void vmd_shutdown(struct pci_dev *dev)
{
	struct vmd_dev *vmd = pci_get_drvdata(dev);

	vmd_remove_irq_domain(vmd);
}

#ifdef CONFIG_PM_SLEEP
static int vmd_suspend(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct vmd_dev *vmd = pci_get_drvdata(pdev);
	int i;

	for (i = 0; i < vmd->msix_count; i++)
		devm_free_irq(dev, vmd->irqs[i].virq, &vmd->irqs[i]);

	return 0;
}

static int vmd_resume(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct vmd_dev *vmd = pci_get_drvdata(pdev);
	int err, i;

	vmd_set_msi_remapping(vmd, !!vmd->irq_domain);

	for (i = 0; i < vmd->msix_count; i++) {
		err = devm_request_irq(dev, vmd->irqs[i].virq,
				       vmd_irq, IRQF_NO_THREAD,
				       vmd->name, &vmd->irqs[i]);
		if (err)
			return err;
	}

	return 0;
}
#endif

static SIMPLE_DEV_PM_OPS(vmd_dev_pm_ops, vmd_suspend, vmd_resume);

static const struct pci_device_id vmd_ids[] = {
	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_VMD_201D),
		.driver_data = VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP,},
	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_VMD_28C0),
		.driver_data = VMD_FEAT_HAS_MEMBAR_SHADOW |
				VMD_FEAT_HAS_BUS_RESTRICTIONS |
				VMD_FEAT_CAN_BYPASS_MSI_REMAP,},
	{PCI_VDEVICE(INTEL, 0x467f),
		.driver_data = VMD_FEATS_CLIENT,},
	{PCI_VDEVICE(INTEL, 0x4c3d),
		.driver_data = VMD_FEATS_CLIENT,},
	{PCI_VDEVICE(INTEL, 0xa77f),
		.driver_data = VMD_FEATS_CLIENT,},
	{PCI_VDEVICE(INTEL, 0x7d0b),
		.driver_data = VMD_FEATS_CLIENT,},
	{PCI_VDEVICE(INTEL, 0xad0b),
		.driver_data = VMD_FEATS_CLIENT,},
	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_VMD_9A0B),
		.driver_data = VMD_FEATS_CLIENT,},
	{PCI_VDEVICE(INTEL, 0xb60b),
		.driver_data = VMD_FEATS_CLIENT,},
	{PCI_VDEVICE(INTEL, 0xb06f),
		.driver_data = VMD_FEATS_CLIENT,},
	{0,}
};
MODULE_DEVICE_TABLE(pci, vmd_ids);

static struct pci_driver vmd_drv = {
	.name		= "vmd",
	.id_table	= vmd_ids,
	.probe		= vmd_probe,
	.remove		= vmd_remove,
	.shutdown	= vmd_shutdown,
	.driver		= {
		.pm	= &vmd_dev_pm_ops,
	},
};
module_pci_driver(vmd_drv);

MODULE_AUTHOR("Intel Corporation");
MODULE_DESCRIPTION("Volume Management Device driver");
MODULE_LICENSE("GPL v2");
MODULE_VERSION("0.6");