1 From a2e548aaac239c9c3e79d61f5386856efaa98c4c Mon Sep 17 00:00:00 2001
2 From: Daniel Drake <drake@endlessm.com>
3 Date: Tue, 4 Jun 2019 14:51:21 +0800
4 Subject: [PATCH] PCI: Add Intel remapped NVMe device support
6 Consumer products that are configured by default to run the Intel SATA AHCI
7 controller in "RAID" or "Intel RST Premium With Intel Optane System
8 Acceleration" mode are becoming increasingly prevalent.
10 Under this mode, NVMe devices are remapped into the SATA device and become
11 hidden from the PCI bus, which means that Linux users cannot access their
12 storage devices unless they go into the firmware setup menu to revert back
13 to AHCI mode - assuming such an option is available. Lack of support for this
14 mode is also causing complications for vendors who distribute Linux.
16 Add support for the remapped NVMe mode by creating a virtual PCI bus,
17 where the AHCI and NVMe devices are presented separately, allowing the
18 ahci and nvme drivers to bind in the normal way.
20 Unfortunately the NVMe device configuration space is inaccessible under
21 this scheme, so we provide a fake one, and hope that no DeviceID-based
22 quirks are needed. The interrupt is shared between the AHCI and NVMe devices.
25 Allow pci_real_dma_dev() to traverse back to the real DMA device from
26 the PCI devices created on our virtual bus, in case the iommu driver
27 will be involved with data transfers here.
29 The existing ahci driver is modified to not claim devices where remapped
30 NVMe devices are present, allowing this new driver to step in.
32 The details of the remapping scheme came from patches previously
33 posted by Dan Williams and the resulting discussion.
35 https://phabricator.endlessm.com/T24358
36 https://phabricator.endlessm.com/T29119
38 Signed-off-by: Daniel Drake <drake@endlessm.com>
40 arch/x86/include/asm/pci.h | 6 +
41 arch/x86/pci/common.c | 7 +-
42 drivers/ata/ahci.c | 23 +-
43 drivers/pci/controller/Makefile | 6 +
44 drivers/pci/controller/intel-nvme-remap.c | 462 ++++++++++++++++++++++
45 5 files changed, 488 insertions(+), 16 deletions(-)
46 create mode 100644 drivers/pci/controller/intel-nvme-remap.c
48 diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
49 index f3fd5928bcbb5..f7488c2b203cf 100644
50 --- a/arch/x86/include/asm/pci.h
51 +++ b/arch/x86/include/asm/pci.h
52 @@ -27,6 +27,7 @@ struct pci_sysdata {
53 #if IS_ENABLED(CONFIG_VMD)
54 struct pci_dev *vmd_dev; /* VMD Device if in Intel VMD domain */
56 + struct pci_dev *nvme_remap_dev; /* AHCI Device if NVME remapped bus */
59 extern int pci_routeirq;
60 @@ -70,6 +71,11 @@ static inline bool is_vmd(struct pci_bus *bus)
61 #define is_vmd(bus) false
62 #endif /* CONFIG_VMD */
64 +static inline bool is_nvme_remap(struct pci_bus *bus)
66 + return to_pci_sysdata(bus)->nvme_remap_dev != NULL;
69 /* Can be used to override the logic in pci_scan_bus for skipping
70 already-configured bus numbers - to be used for buggy BIOSes
71 or architectures with incomplete PCI setup by the loader */
72 diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
73 index 618b01d60af2c..8da4703d7d02c 100644
74 --- a/arch/x86/pci/common.c
75 +++ b/arch/x86/pci/common.c
76 @@ -727,12 +727,15 @@ int pci_ext_cfg_avail(void)
80 -#if IS_ENABLED(CONFIG_VMD)
81 struct pci_dev *pci_real_dma_dev(struct pci_dev *dev)
83 +#if IS_ENABLED(CONFIG_VMD)
85 return to_pci_sysdata(dev->bus)->vmd_dev;
88 + if (is_nvme_remap(dev->bus))
89 + return to_pci_sysdata(dev->bus)->nvme_remap_dev;
94 diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
95 index c1eca72b4575d..974d877659113 100644
96 --- a/drivers/ata/ahci.c
97 +++ b/drivers/ata/ahci.c
98 @@ -1503,7 +1503,7 @@ static irqreturn_t ahci_thunderx_irq_handler(int irq, void *dev_instance)
102 -static void ahci_remap_check(struct pci_dev *pdev, int bar,
103 +static int ahci_remap_check(struct pci_dev *pdev, int bar,
104 struct ahci_host_priv *hpriv)
107 @@ -1516,7 +1516,7 @@ static void ahci_remap_check(struct pci_dev *pdev, int bar,
108 pci_resource_len(pdev, bar) < SZ_512K ||
109 bar != AHCI_PCI_BAR_STANDARD ||
110 !(readl(hpriv->mmio + AHCI_VSCAP) & 1))
114 cap = readq(hpriv->mmio + AHCI_REMAP_CAP);
115 for (i = 0; i < AHCI_MAX_REMAP; i++) {
116 @@ -1531,18 +1531,11 @@ static void ahci_remap_check(struct pci_dev *pdev, int bar,
119 if (!hpriv->remapped_nvme)
122 - dev_warn(&pdev->dev, "Found %u remapped NVMe devices.\n",
123 - hpriv->remapped_nvme);
124 - dev_warn(&pdev->dev,
125 - "Switch your BIOS from RAID to AHCI mode to use them.\n");
129 - * Don't rely on the msi-x capability in the remap case,
130 - * share the legacy interrupt across ahci and remapped devices.
132 - hpriv->flags |= AHCI_HFLAG_NO_MSI;
133 + /* Abort probe, allowing intel-nvme-remap to step in when available */
134 + dev_info(&pdev->dev, "Device will be handled by intel-nvme-remap.\n");
138 static int ahci_get_irq_vector(struct ata_host *host, int port)
139 @@ -1765,7 +1758,9 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
140 hpriv->mmio = pcim_iomap_table(pdev)[ahci_pci_bar];
142 /* detect remapped nvme devices */
143 - ahci_remap_check(pdev, ahci_pci_bar, hpriv);
144 + rc = ahci_remap_check(pdev, ahci_pci_bar, hpriv);
148 sysfs_add_file_to_group(&pdev->dev.kobj,
149 &dev_attr_remapped_nvme.attr,
150 diff --git a/drivers/pci/controller/Makefile b/drivers/pci/controller/Makefile
151 index 37c8663de7fe1..897d19f92edeb 100644
152 --- a/drivers/pci/controller/Makefile
153 +++ b/drivers/pci/controller/Makefile
155 # SPDX-License-Identifier: GPL-2.0
157 +ifdef CONFIG_SATA_AHCI
158 +obj-y += intel-nvme-remap.o
162 obj-$(CONFIG_PCIE_CADENCE) += cadence/
163 obj-$(CONFIG_PCI_FTPCI100) += pci-ftpci100.o
164 obj-$(CONFIG_PCI_IXP4XX) += pci-ixp4xx.o
165 diff --git a/drivers/pci/controller/intel-nvme-remap.c b/drivers/pci/controller/intel-nvme-remap.c
167 index 0000000000000..296fc44d395de
169 +++ b/drivers/pci/controller/intel-nvme-remap.c
171 +// SPDX-License-Identifier: GPL-2.0
173 + * Intel remapped NVMe device support.
175 + * Copyright (c) 2019 Endless Mobile, Inc.
176 + * Author: Daniel Drake <drake@endlessm.com>
178 + * Some products ship by default with the SATA controller in "RAID" or
179 + * "Intel RST Premium With Intel Optane System Acceleration" mode. Under this
180 + * mode, which we refer to as "remapped NVMe" mode, any installed NVMe
181 + * devices disappear from the PCI bus, and instead their I/O memory becomes
182 + * available within the AHCI device BARs.
184 + * This scheme is understood to be a way of avoiding usage of the standard
185 + * Windows NVMe driver under that OS, instead mandating usage of Intel's
186 + * driver, which has better power management, and presumably offers
187 + * some RAID/disk-caching solutions too.
189 + * Here in this driver, we support the remapped NVMe mode by claiming the
190 + * AHCI device and creating a fake PCIe root port. On the new bus, the
191 + * original AHCI device is exposed with only minor tweaks. Then, fake PCI
192 + * devices corresponding to the remapped NVMe devices are created. The usual
193 + * ahci and nvme drivers are then expected to bind to these devices and
194 + * operate as normal.
196 + * The PCI configuration space for the NVMe devices is completely
197 + * unavailable, so we fake a minimal one and hope for the best.
199 + * Interrupts are shared between the AHCI and NVMe devices. For simplicity,
200 + * we only support the legacy interrupt here, although MSI support
201 + * could potentially be added later.
204 +#define MODULE_NAME "intel-nvme-remap"
206 +#include <linux/ahci-remap.h>
207 +#include <linux/irq.h>
208 +#include <linux/kernel.h>
209 +#include <linux/module.h>
210 +#include <linux/pci.h>
212 +#define AHCI_PCI_BAR_STANDARD 5
214 +struct nvme_remap_dev {
215 + struct pci_dev *dev; /* AHCI device */
216 + struct pci_bus *bus; /* our fake PCI bus */
217 + struct pci_sysdata sysdata;
218 + int irq_base; /* our fake interrupts */
221 + * When we detect an all-ones write to a BAR register, this flag
222 + * is set, so that we return the BAR size on the next read (a
223 + * standard PCI behaviour).
224 + * This includes the assumption that an all-ones BAR write is
225 + * immediately followed by a read of the same register.
230 + * Resources copied from the AHCI device, to be regarded as
231 + * resources on our fake bus.
233 + struct resource ahci_resources[PCI_NUM_RESOURCES];
235 + /* Resources corresponding to the NVMe devices. */
236 + struct resource remapped_dev_mem[AHCI_MAX_REMAP];
238 + /* Number of remapped NVMe devices found. */
239 + int num_remapped_devices;
242 +static inline struct nvme_remap_dev *nrdev_from_bus(struct pci_bus *bus)
244 + return container_of(bus->sysdata, struct nvme_remap_dev, sysdata);
248 +/******** PCI configuration space **********/
251 + * Helper macros for tweaking returned contents of PCI configuration space.
253 + * value contains len bytes of data read from reg.
254 + * If fixup_reg is included in that range, fix up the contents of that
255 + * register to fixed_value.
257 +#define NR_FIX8(fixup_reg, fixed_value) do { \
258 + if (reg <= fixup_reg && fixup_reg < reg + len) \
259 + ((u8 *) value)[fixup_reg - reg] = (u8) (fixed_value); \
262 +#define NR_FIX16(fixup_reg, fixed_value) do { \
263 + NR_FIX8(fixup_reg, fixed_value); \
264 + NR_FIX8(fixup_reg + 1, fixed_value >> 8); \
267 +#define NR_FIX24(fixup_reg, fixed_value) do { \
268 + NR_FIX8(fixup_reg, fixed_value); \
269 + NR_FIX8(fixup_reg + 1, fixed_value >> 8); \
270 + NR_FIX8(fixup_reg + 2, fixed_value >> 16); \
273 +#define NR_FIX32(fixup_reg, fixed_value) do { \
274 + NR_FIX16(fixup_reg, (u16) fixed_value); \
275 + NR_FIX16(fixup_reg + 2, fixed_value >> 16); \
279 + * Read PCI config space of the slot 0 (AHCI) device.
280 + * We pass through the read request to the underlying device, but
281 + * tweak the results in some cases.
283 +static int nvme_remap_pci_read_slot0(struct pci_bus *bus, int reg,
284 + int len, u32 *value)
286 + struct nvme_remap_dev *nrdev = nrdev_from_bus(bus);
287 + struct pci_bus *ahci_dev_bus = nrdev->dev->bus;
290 + ret = ahci_dev_bus->ops->read(ahci_dev_bus, nrdev->dev->devfn,
296 + * Adjust the device class, to prevent this driver from attempting to
297 + * additionally probe the device we're simulating here.
299 + NR_FIX24(PCI_CLASS_PROG, PCI_CLASS_STORAGE_SATA_AHCI);
302 + * Unset interrupt pin, otherwise ACPI tries to find routing
303 + * info for our virtual IRQ, fails, and complains.
305 + NR_FIX8(PCI_INTERRUPT_PIN, 0);
308 + * Truncate the AHCI BAR to not include the region that covers the
309 + * hidden devices. This will cause the ahci driver to successfully
310 + * probe the new device (instead of handing it over to this driver).
312 + if (nrdev->bar_sizing) {
313 + NR_FIX32(PCI_BASE_ADDRESS_5, ~(SZ_16K - 1));
314 + nrdev->bar_sizing = false;
317 + return PCIBIOS_SUCCESSFUL;
321 + * Read PCI config space of a remapped device.
322 + * Since the original PCI config space is inaccessible, we provide a minimal,
323 + * fake config space instead.
325 +static int nvme_remap_pci_read_remapped(struct pci_bus *bus, unsigned int port,
326 + int reg, int len, u32 *value)
328 + struct nvme_remap_dev *nrdev = nrdev_from_bus(bus);
329 + struct resource *remapped_mem;
331 + if (port > nrdev->num_remapped_devices)
332 + return PCIBIOS_DEVICE_NOT_FOUND;
335 + remapped_mem = &nrdev->remapped_dev_mem[port - 1];
337 + /* Set a Vendor ID, otherwise Linux assumes no device is present */
338 + NR_FIX16(PCI_VENDOR_ID, PCI_VENDOR_ID_INTEL);
340 + /* Always report memory decoding and bus mastering as enabled */
341 + NR_FIX16(PCI_COMMAND, PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER);
343 + /* Set class so that nvme driver probes us */
344 + NR_FIX24(PCI_CLASS_PROG, PCI_CLASS_STORAGE_EXPRESS);
346 + if (nrdev->bar_sizing) {
347 + NR_FIX32(PCI_BASE_ADDRESS_0,
348 + ~(resource_size(remapped_mem) - 1));
349 + nrdev->bar_sizing = false;
351 + resource_size_t mem_start = remapped_mem->start;
353 + mem_start |= PCI_BASE_ADDRESS_MEM_TYPE_64;
354 + NR_FIX32(PCI_BASE_ADDRESS_0, mem_start);
356 + NR_FIX32(PCI_BASE_ADDRESS_1, mem_start);
359 + return PCIBIOS_SUCCESSFUL;
362 +/* Read PCI configuration space. */
363 +static int nvme_remap_pci_read(struct pci_bus *bus, unsigned int devfn,
364 + int reg, int len, u32 *value)
366 + if (PCI_SLOT(devfn) == 0)
367 + return nvme_remap_pci_read_slot0(bus, reg, len, value);
369 + return nvme_remap_pci_read_remapped(bus, PCI_SLOT(devfn),
374 + * Write PCI config space of the slot 0 (AHCI) device.
375 + * Apart from the special case of BAR sizing, we disable all writes.
376 + * Otherwise, the ahci driver could make changes (e.g. unset PCI bus master)
377 + * that would affect the operation of the NVMe devices.
379 +static int nvme_remap_pci_write_slot0(struct pci_bus *bus, int reg,
380 + int len, u32 value)
382 + struct nvme_remap_dev *nrdev = nrdev_from_bus(bus);
383 + struct pci_bus *ahci_dev_bus = nrdev->dev->bus;
385 + if (reg >= PCI_BASE_ADDRESS_0 && reg <= PCI_BASE_ADDRESS_5) {
387 + * Writing all-ones to a BAR means that the size of the
388 + * memory region is being checked. Flag this so that we can
389 + * reply with an appropriate size on the next read.
392 + nrdev->bar_sizing = true;
394 + return ahci_dev_bus->ops->write(ahci_dev_bus,
399 + return PCIBIOS_SET_FAILED;
403 + * Write PCI config space of a remapped device.
404 + * Since the original PCI config space is inaccessible, we reject all
405 + * writes, except for the special case of BAR probing.
407 +static int nvme_remap_pci_write_remapped(struct pci_bus *bus,
409 + int reg, int len, u32 value)
411 + struct nvme_remap_dev *nrdev = nrdev_from_bus(bus);
413 + if (port > nrdev->num_remapped_devices)
414 + return PCIBIOS_DEVICE_NOT_FOUND;
417 + * Writing all-ones to a BAR means that the size of the memory
418 + * region is being checked. Flag this so that we can reply with
419 + * an appropriate size on the next read.
421 + if (value == ~0 && reg >= PCI_BASE_ADDRESS_0
422 + && reg <= PCI_BASE_ADDRESS_5) {
423 + nrdev->bar_sizing = true;
424 + return PCIBIOS_SUCCESSFUL;
427 + return PCIBIOS_SET_FAILED;
430 +/* Write PCI configuration space. */
431 +static int nvme_remap_pci_write(struct pci_bus *bus, unsigned int devfn,
432 + int reg, int len, u32 value)
434 + if (PCI_SLOT(devfn) == 0)
435 + return nvme_remap_pci_write_slot0(bus, reg, len, value);
437 + return nvme_remap_pci_write_remapped(bus, PCI_SLOT(devfn),
441 +static struct pci_ops nvme_remap_pci_ops = {
442 + .read = nvme_remap_pci_read,
443 + .write = nvme_remap_pci_write,
447 +/******** Initialization & exit **********/
450 + * Find a PCI domain ID to use for our fake bus.
451 + * Start at 0x10000 to not clash with ACPI _SEG domains (16 bits).
453 +static int find_free_domain(void)
455 + int domain = 0xffff;
456 + struct pci_bus *bus = NULL;
458 + while ((bus = pci_find_next_bus(bus)) != NULL)
459 + domain = max_t(int, domain, pci_domain_nr(bus));
464 +static int find_remapped_devices(struct nvme_remap_dev *nrdev,
465 + struct list_head *resources)
467 + void __iomem *mmio;
471 + mmio = pcim_iomap(nrdev->dev, AHCI_PCI_BAR_STANDARD,
472 + pci_resource_len(nrdev->dev,
473 + AHCI_PCI_BAR_STANDARD));
477 + /* Check if this device might have remapped nvme devices. */
478 + if (pci_resource_len(nrdev->dev, AHCI_PCI_BAR_STANDARD) < SZ_512K ||
479 + !(readl(mmio + AHCI_VSCAP) & 1))
482 + cap = readq(mmio + AHCI_REMAP_CAP);
483 + for (i = 0; i < AHCI_MAX_REMAP; i++) {
484 + struct resource *remapped_mem;
486 + if ((cap & (1 << i)) == 0)
488 + if (readl(mmio + ahci_remap_dcc(i))
489 + != PCI_CLASS_STORAGE_EXPRESS)
492 + /* We've found a remapped device */
493 + remapped_mem = &nrdev->remapped_dev_mem[count++];
494 + remapped_mem->start =
495 + pci_resource_start(nrdev->dev, AHCI_PCI_BAR_STANDARD)
496 + + ahci_remap_base(i);
497 + remapped_mem->end = remapped_mem->start
498 + + AHCI_REMAP_N_SIZE - 1;
499 + remapped_mem->flags = IORESOURCE_MEM | IORESOURCE_PCI_FIXED;
500 + pci_add_resource(resources, remapped_mem);
503 + pcim_iounmap(nrdev->dev, mmio);
508 + nrdev->num_remapped_devices = count;
509 + dev_info(&nrdev->dev->dev, "Found %d remapped NVMe devices\n",
510 + nrdev->num_remapped_devices);
514 +static void nvme_remap_remove_root_bus(void *data)
516 + struct pci_bus *bus = data;
518 + pci_stop_root_bus(bus);
519 + pci_remove_root_bus(bus);
522 +static int nvme_remap_probe(struct pci_dev *dev,
523 + const struct pci_device_id *id)
525 + struct nvme_remap_dev *nrdev;
526 + LIST_HEAD(resources);
529 + struct pci_dev *child;
531 + nrdev = devm_kzalloc(&dev->dev, sizeof(*nrdev), GFP_KERNEL);
532 + nrdev->sysdata.domain = find_free_domain();
533 + nrdev->sysdata.nvme_remap_dev = dev;
535 + pci_set_drvdata(dev, nrdev);
537 + ret = pcim_enable_device(dev);
541 + pci_set_master(dev);
543 + ret = find_remapped_devices(nrdev, &resources);
547 + /* Add resources from the original AHCI device */
548 + for (i = 0; i < PCI_NUM_RESOURCES; i++) {
549 + struct resource *res = &dev->resource[i];
552 + struct resource *nr_res = &nrdev->ahci_resources[i];
554 + nr_res->start = res->start;
555 + nr_res->end = res->end;
556 + nr_res->flags = res->flags;
557 + pci_add_resource(&resources, nr_res);
561 + /* Create virtual interrupts */
562 + nrdev->irq_base = devm_irq_alloc_descs(&dev->dev, -1, 0,
563 + nrdev->num_remapped_devices + 1,
565 + if (nrdev->irq_base < 0)
566 + return nrdev->irq_base;
568 + /* Create and populate PCI bus */
569 + nrdev->bus = pci_create_root_bus(&dev->dev, 0, &nvme_remap_pci_ops,
570 + &nrdev->sysdata, &resources);
574 + if (devm_add_action_or_reset(&dev->dev, nvme_remap_remove_root_bus,
578 + /* We don't support sharing MSI interrupts between these devices */
579 + nrdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_MSI;
581 + pci_scan_child_bus(nrdev->bus);
583 + list_for_each_entry(child, &nrdev->bus->devices, bus_list) {
585 + * Prevent PCI core from trying to move memory BARs around.
586 + * The hidden NVMe devices are at fixed locations.
588 + for (i = 0; i < PCI_NUM_RESOURCES; i++) {
589 + struct resource *res = &child->resource[i];
591 + if (res->flags & IORESOURCE_MEM)
592 + res->flags |= IORESOURCE_PCI_FIXED;
595 + /* Share the legacy IRQ between all devices */
596 + child->irq = dev->irq;
599 + pci_assign_unassigned_bus_resources(nrdev->bus);
600 + pci_bus_add_devices(nrdev->bus);
605 +static const struct pci_device_id nvme_remap_ids[] = {
607 + * Match all Intel RAID controllers.
609 + * There's overlap here with the set of devices detected by the ahci
610 + * driver, but ahci will only successfully probe when there
611 + * *aren't* any remapped NVMe devices, and this driver will only
612 + * successfully probe when there *are* remapped NVMe devices that
616 + PCI_VDEVICE(INTEL, PCI_ANY_ID),
617 + .class = PCI_CLASS_STORAGE_RAID << 8,
618 + .class_mask = 0xffffff00,
622 +MODULE_DEVICE_TABLE(pci, nvme_remap_ids);
624 +static struct pci_driver nvme_remap_drv = {
625 + .name = MODULE_NAME,
626 + .id_table = nvme_remap_ids,
627 + .probe = nvme_remap_probe,
629 +module_pci_driver(nvme_remap_drv);
631 +MODULE_AUTHOR("Daniel Drake <drake@endlessm.com>");
632 +MODULE_LICENSE("GPL v2");