/*
 * QEMU paravirtual RDMA
 *
 * Copyright (C) 2018 Oracle
 * Copyright (C) 2018 Red Hat Inc
 *
 * Authors:
 *     Yuval Shaia <yuval.shaia@oracle.com>
 *     Marcel Apfelbaum <marcel@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */
#include "qemu/osdep.h"
#include "qapi/error.h"
#include "hw/pci/pci.h"
#include "hw/pci/pci_ids.h"
#include "hw/pci/msi.h"
#include "hw/pci/msix.h"
#include "hw/qdev-core.h"
#include "hw/qdev-properties.h"
#include "trace.h"

#include "../rdma_rm.h"
#include "../rdma_backend.h"
#include "../rdma_utils.h"

#include <infiniband/verbs.h>
#include "pvrdma.h"
#include "standard-headers/rdma/vmw_pvrdma-abi.h"
#include "standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h"
#include "pvrdma_qp_ops.h"
static Property pvrdma_dev_properties[] = {
    DEFINE_PROP_STRING("backend-dev", PVRDMADev, backend_device_name),
    DEFINE_PROP_UINT8("backend-port", PVRDMADev, backend_port_num, 1),
    DEFINE_PROP_UINT8("backend-gid-idx", PVRDMADev, backend_gid_idx, 0),
    DEFINE_PROP_UINT64("dev-caps-max-mr-size", PVRDMADev, dev_attr.max_mr_size,
                       MAX_MR_SIZE),
    DEFINE_PROP_INT32("dev-caps-max-qp", PVRDMADev, dev_attr.max_qp, MAX_QP),
    DEFINE_PROP_INT32("dev-caps-max-sge", PVRDMADev, dev_attr.max_sge, MAX_SGE),
    DEFINE_PROP_INT32("dev-caps-max-cq", PVRDMADev, dev_attr.max_cq, MAX_CQ),
    DEFINE_PROP_INT32("dev-caps-max-mr", PVRDMADev, dev_attr.max_mr, MAX_MR),
    DEFINE_PROP_INT32("dev-caps-max-pd", PVRDMADev, dev_attr.max_pd, MAX_PD),
    DEFINE_PROP_INT32("dev-caps-qp-rd-atom", PVRDMADev, dev_attr.max_qp_rd_atom,
                      MAX_QP_RD_ATOM),
    DEFINE_PROP_INT32("dev-caps-max-qp-init-rd-atom", PVRDMADev,
                      dev_attr.max_qp_init_rd_atom, MAX_QP_INIT_RD_ATOM),
    DEFINE_PROP_INT32("dev-caps-max-ah", PVRDMADev, dev_attr.max_ah, MAX_AH),
    DEFINE_PROP_END_OF_LIST(),
};
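/*
 * Example instantiation (illustrative only; the backend device name below is
 * an assumption, not something defined in this file, and the device type name
 * comes from PVRDMA_HW_NAME):
 *
 *   -device pvrdma,backend-dev=mlx5_0,backend-port=1,backend-gid-idx=0
 */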
static void free_dev_ring(PCIDevice *pci_dev, PvrdmaRing *ring,
                          void *ring_state)
{
    pvrdma_ring_free(ring);
    rdma_pci_dma_unmap(pci_dev, ring_state, TARGET_PAGE_SIZE);
}
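/*
 * init_dev_ring() below walks the one-level page directory handed over by the
 * guest: dir[0] points to the first page table, tbl[0] maps the ring state
 * page and the remaining entries (&tbl[1]) map the ring data pages.
 */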
static int init_dev_ring(PvrdmaRing *ring, struct pvrdma_ring **ring_state,
                         const char *name, PCIDevice *pci_dev,
                         dma_addr_t dir_addr, uint32_t num_pages)
{
    uint64_t *dir, *tbl;
    int rc = 0;

    pr_dbg("Initializing device ring %s\n", name);
    pr_dbg("pdir_dma=0x%llx\n", (long long unsigned int)dir_addr);
    pr_dbg("num_pages=%d\n", num_pages);

    dir = rdma_pci_dma_map(pci_dev, dir_addr, TARGET_PAGE_SIZE);
    if (!dir) {
        pr_err("Failed to map to page directory\n");
        rc = -ENOMEM;
        goto out;
    }

    tbl = rdma_pci_dma_map(pci_dev, dir[0], TARGET_PAGE_SIZE);
    if (!tbl) {
        pr_err("Failed to map to page table\n");
        rc = -ENOMEM;
        goto out_free_dir;
    }

    *ring_state = rdma_pci_dma_map(pci_dev, tbl[0], TARGET_PAGE_SIZE);
    if (!*ring_state) {
        pr_err("Failed to map to ring state\n");
        rc = -ENOMEM;
        goto out_free_tbl;
    }
    /* RX ring is the second */
    (*ring_state)++;

    rc = pvrdma_ring_init(ring, name, pci_dev,
                          (struct pvrdma_ring *)*ring_state,
                          (num_pages - 1) * TARGET_PAGE_SIZE /
                          sizeof(struct pvrdma_cqne),
                          sizeof(struct pvrdma_cqne),
                          (dma_addr_t *)&tbl[1], (dma_addr_t)num_pages - 1);
    if (rc) {
        pr_err("Failed to initialize ring\n");
        rc = -ENOMEM;
        goto out_free_ring_state;
    }

    goto out;

out_free_ring_state:
    rdma_pci_dma_unmap(pci_dev, *ring_state, TARGET_PAGE_SIZE);

out_free_tbl:
    rdma_pci_dma_unmap(pci_dev, tbl, TARGET_PAGE_SIZE);

out_free_dir:
    rdma_pci_dma_unmap(pci_dev, dir, TARGET_PAGE_SIZE);

out:
    return rc;
}
static void free_dsr(PVRDMADev *dev)
{
    PCIDevice *pci_dev = PCI_DEVICE(dev);

    if (!dev->dsr_info.dsr) {
        return;
    }

    free_dev_ring(pci_dev, &dev->dsr_info.async,
                  dev->dsr_info.async_ring_state);

    free_dev_ring(pci_dev, &dev->dsr_info.cq, dev->dsr_info.cq_ring_state);

    rdma_pci_dma_unmap(pci_dev, dev->dsr_info.req,
                       sizeof(union pvrdma_cmd_req));

    rdma_pci_dma_unmap(pci_dev, dev->dsr_info.rsp,
                       sizeof(union pvrdma_cmd_resp));

    rdma_pci_dma_unmap(pci_dev, dev->dsr_info.dsr,
                       sizeof(struct pvrdma_device_shared_region));

    dev->dsr_info.dsr = NULL;
}
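/*
 * Map the Device Shared Region (DSR) advertised by the guest driver and,
 * through it, the command slot, the response slot and the two device rings
 * (CQ notifications and asynchronous events).
 */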
static int load_dsr(PVRDMADev *dev)
{
    int rc = 0;
    PCIDevice *pci_dev = PCI_DEVICE(dev);
    DSRInfo *dsr_info;
    struct pvrdma_device_shared_region *dsr;

    /* Map to DSR */
    pr_dbg("dsr_dma=0x%llx\n", (long long unsigned int)dev->dsr_info.dma);
    dev->dsr_info.dsr = rdma_pci_dma_map(pci_dev, dev->dsr_info.dma,
                                sizeof(struct pvrdma_device_shared_region));
    if (!dev->dsr_info.dsr) {
        pr_err("Failed to map to DSR\n");
        rc = -ENOMEM;
        goto out;
    }

    /* Shortcuts */
    dsr_info = &dev->dsr_info;
    dsr = dsr_info->dsr;

    /* Map to command slot */
    pr_dbg("cmd_dma=0x%llx\n", (long long unsigned int)dsr->cmd_slot_dma);
    dsr_info->req = rdma_pci_dma_map(pci_dev, dsr->cmd_slot_dma,
                                     sizeof(union pvrdma_cmd_req));
    if (!dsr_info->req) {
        pr_err("Failed to map to command slot address\n");
        rc = -ENOMEM;
        goto out_free_dsr;
    }

    /* Map to response slot */
    pr_dbg("rsp_dma=0x%llx\n", (long long unsigned int)dsr->resp_slot_dma);
    dsr_info->rsp = rdma_pci_dma_map(pci_dev, dsr->resp_slot_dma,
                                     sizeof(union pvrdma_cmd_resp));
    if (!dsr_info->rsp) {
        pr_err("Failed to map to response slot address\n");
        rc = -ENOMEM;
        goto out_free_req;
    }

    /* Map to CQ notification ring */
    rc = init_dev_ring(&dsr_info->cq, &dsr_info->cq_ring_state, "dev_cq",
                       pci_dev, dsr->cq_ring_pages.pdir_dma,
                       dsr->cq_ring_pages.num_pages);
    if (rc) {
        pr_err("Failed to initialize CQ ring\n");
        rc = -ENOMEM;
        goto out_free_rsp;
    }

    /* Map to event notification ring */
    rc = init_dev_ring(&dsr_info->async, &dsr_info->async_ring_state,
                       "dev_async", pci_dev, dsr->async_ring_pages.pdir_dma,
                       dsr->async_ring_pages.num_pages);
    if (rc) {
        pr_err("Failed to initialize event ring\n");
        rc = -ENOMEM;
        goto out_free_rsp;
    }

    goto out;

out_free_rsp:
    rdma_pci_dma_unmap(pci_dev, dsr_info->rsp, sizeof(union pvrdma_cmd_resp));

out_free_req:
    rdma_pci_dma_unmap(pci_dev, dsr_info->req, sizeof(union pvrdma_cmd_req));

out_free_dsr:
    rdma_pci_dma_unmap(pci_dev, dsr_info->dsr,
                       sizeof(struct pvrdma_device_shared_region));
    dsr_info->dsr = NULL;

out:
    return rc;
}
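/*
 * Publish the device capabilities to the guest through the shared region;
 * most limits are copied from dev_attr, which is filled from the dev-caps-*
 * properties above and from init_dev_caps().
 */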
static void init_dsr_dev_caps(PVRDMADev *dev)
{
    struct pvrdma_device_shared_region *dsr;

    if (dev->dsr_info.dsr == NULL) {
        pr_err("Can't initialize DSR\n");
        return;
    }

    dsr = dev->dsr_info.dsr;

    dsr->caps.fw_ver = PVRDMA_FW_VERSION;
    pr_dbg("fw_ver=0x%" PRIx64 "\n", dsr->caps.fw_ver);

    dsr->caps.mode = PVRDMA_DEVICE_MODE_ROCE;
    pr_dbg("mode=%d\n", dsr->caps.mode);

    dsr->caps.gid_types |= PVRDMA_GID_TYPE_FLAG_ROCE_V1;
    pr_dbg("gid_types=0x%x\n", dsr->caps.gid_types);

    dsr->caps.max_uar = RDMA_BAR2_UAR_SIZE;
    pr_dbg("max_uar=%d\n", dsr->caps.max_uar);

    dsr->caps.max_mr_size = dev->dev_attr.max_mr_size;
    dsr->caps.max_qp = dev->dev_attr.max_qp;
    dsr->caps.max_qp_wr = dev->dev_attr.max_qp_wr;
    dsr->caps.max_sge = dev->dev_attr.max_sge;
    dsr->caps.max_cq = dev->dev_attr.max_cq;
    dsr->caps.max_cqe = dev->dev_attr.max_cqe;
    dsr->caps.max_mr = dev->dev_attr.max_mr;
    dsr->caps.max_pd = dev->dev_attr.max_pd;
    dsr->caps.max_ah = dev->dev_attr.max_ah;

    dsr->caps.gid_tbl_len = MAX_GIDS;
    pr_dbg("gid_tbl_len=%d\n", dsr->caps.gid_tbl_len);

    dsr->caps.sys_image_guid = 0;
    pr_dbg("sys_image_guid=%" PRIx64 "\n", dsr->caps.sys_image_guid);

    dsr->caps.node_guid = cpu_to_be64(dev->node_guid);
    pr_dbg("node_guid=%" PRIx64 "\n", be64_to_cpu(dsr->caps.node_guid));

    dsr->caps.phys_port_cnt = MAX_PORTS;
    pr_dbg("phys_port_cnt=%d\n", dsr->caps.phys_port_cnt);

    dsr->caps.max_pkeys = MAX_PKEYS;
    pr_dbg("max_pkeys=%d\n", dsr->caps.max_pkeys);

    pr_dbg("Initialized\n");
}
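/* Device ports start out in the DOWN state. */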
static void init_ports(PVRDMADev *dev, Error **errp)
{
    int i;

    memset(dev->rdma_dev_res.ports, 0, sizeof(dev->rdma_dev_res.ports));

    for (i = 0; i < MAX_PORTS; i++) {
        dev->rdma_dev_res.ports[i].state = IBV_PORT_DOWN;
    }
}
static void uninit_msix(PCIDevice *pdev, int used_vectors)
{
    PVRDMADev *dev = PVRDMA_DEV(pdev);
    int i;

    for (i = 0; i < used_vectors; i++) {
        msix_vector_unuse(pdev, i);
    }

    msix_uninit(pdev, &dev->msix, &dev->msix);
}
static int init_msix(PCIDevice *pdev, Error **errp)
{
    PVRDMADev *dev = PVRDMA_DEV(pdev);
    int i;
    int rc;

    rc = msix_init(pdev, RDMA_MAX_INTRS, &dev->msix, RDMA_MSIX_BAR_IDX,
                   RDMA_MSIX_TABLE, &dev->msix, RDMA_MSIX_BAR_IDX,
                   RDMA_MSIX_PBA, 0, NULL);
    if (rc < 0) {
        error_setg(errp, "Failed to initialize MSI-X");
        return rc;
    }

    for (i = 0; i < RDMA_MAX_INTRS; i++) {
        rc = msix_vector_use(PCI_DEVICE(dev), i);
        if (rc < 0) {
            error_setg(errp, "Failed to mark MSI-X vector %d in use", i);
            uninit_msix(pdev, i);
            return rc;
        }
    }

    return 0;
}
static void pvrdma_fini(PCIDevice *pdev)
{
    PVRDMADev *dev = PVRDMA_DEV(pdev);

    pr_dbg("Closing device %s %x.%x\n", pdev->name, PCI_SLOT(pdev->devfn),
           PCI_FUNC(pdev->devfn));

    pvrdma_qp_ops_fini();

    rdma_rm_fini(&dev->rdma_dev_res);

    rdma_backend_fini(&dev->backend_dev);

    if (msix_enabled(pdev)) {
        uninit_msix(pdev, RDMA_MAX_INTRS);
    }
}
static void pvrdma_stop(PVRDMADev *dev)
{
    rdma_backend_stop(&dev->backend_dev);
}

static void pvrdma_start(PVRDMADev *dev)
{
    rdma_backend_start(&dev->backend_dev);
}

static void activate_device(PVRDMADev *dev)
{
    pvrdma_start(dev);
    set_reg_val(dev, PVRDMA_REG_ERR, 0);
    pr_dbg("Device activated\n");
}

static int unquiesce_device(PVRDMADev *dev)
{
    pr_dbg("Device unquiesced\n");

    return 0;
}

static int reset_device(PVRDMADev *dev)
{
    pvrdma_stop(dev);

    pr_dbg("Device reset complete\n");

    return 0;
}
static uint64_t regs_read(void *opaque, hwaddr addr, unsigned size)
{
    PVRDMADev *dev = opaque;
    uint32_t val;

    /* pr_dbg("addr=0x%lx, size=%d\n", addr, size); */

    if (get_reg_val(dev, addr, &val)) {
        pr_dbg("Error trying to read REG value from address 0x%x\n",
               (uint32_t)addr);
        return -EINVAL;
    }

    trace_pvrdma_regs_read(addr, val);

    return val;
}
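/*
 * BAR 1 register writes. The DSR address arrives as two 32-bit writes
 * (DSRLOW then DSRHIGH); completing the high half triggers mapping of the
 * shared region and publication of the device caps.
 */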
static void regs_write(void *opaque, hwaddr addr, uint64_t val, unsigned size)
{
    PVRDMADev *dev = opaque;

    /* pr_dbg("addr=0x%lx, val=0x%x, size=%d\n", addr, (uint32_t)val, size); */

    if (set_reg_val(dev, addr, val)) {
        pr_err("Failed to set REG value, addr=0x%" PRIx64 ", val=0x%" PRIx64 "\n",
               addr, val);
        return;
    }

    trace_pvrdma_regs_write(addr, val);

    switch (addr) {
    case PVRDMA_REG_DSRLOW:
        dev->dsr_info.dma = val;
        break;
    case PVRDMA_REG_DSRHIGH:
        dev->dsr_info.dma |= val << 32;
        load_dsr(dev);
        init_dsr_dev_caps(dev);
        break;
    case PVRDMA_REG_CTL:
        switch (val) {
        case PVRDMA_DEVICE_CTL_ACTIVATE:
            activate_device(dev);
            break;
        case PVRDMA_DEVICE_CTL_UNQUIESCE:
            unquiesce_device(dev);
            break;
        case PVRDMA_DEVICE_CTL_RESET:
            reset_device(dev);
            break;
        }
        break;
    case PVRDMA_REG_IMR:
        pr_dbg("Interrupt mask=0x%" PRIx64 "\n", val);
        dev->interrupt_mask = val;
        break;
    case PVRDMA_REG_REQUEST:
        if (val == 0) {
            execute_command(dev);
        }
        break;
    default:
        break;
    }
}
static const MemoryRegionOps regs_ops = {
    .read = regs_read,
    .write = regs_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
    .impl = {
        .min_access_size = sizeof(uint32_t),
        .max_access_size = sizeof(uint32_t),
    },
};
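/*
 * BAR 2 doorbell (UAR) writes. Each user context owns one page, so only the
 * low 12 bits of the offset select the doorbell type (QP or CQ); the written
 * value carries the object handle in PVRDMA_UAR_HANDLE_MASK plus flag bits.
 */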
static void uar_write(void *opaque, hwaddr addr, uint64_t val, unsigned size)
{
    PVRDMADev *dev = opaque;

    /* pr_dbg("addr=0x%lx, val=0x%x, size=%d\n", addr, (uint32_t)val, size); */

    switch (addr & 0xFFF) { /* Mask with 0xFFF as each user context gets a page */
    case PVRDMA_UAR_QP_OFFSET:
        pr_dbg("UAR QP command, addr=0x%" PRIx64 ", val=0x%" PRIx64 "\n",
               (uint64_t)addr, val);
        if (val & PVRDMA_UAR_QP_SEND) {
            pvrdma_qp_send(dev, val & PVRDMA_UAR_HANDLE_MASK);
        }
        if (val & PVRDMA_UAR_QP_RECV) {
            pvrdma_qp_recv(dev, val & PVRDMA_UAR_HANDLE_MASK);
        }
        break;
    case PVRDMA_UAR_CQ_OFFSET:
        /* pr_dbg("UAR CQ cmd, addr=0x%x, val=0x%lx\n", (uint32_t)addr, val); */
        if (val & PVRDMA_UAR_CQ_ARM) {
            rdma_rm_req_notify_cq(&dev->rdma_dev_res,
                                  val & PVRDMA_UAR_HANDLE_MASK,
                                  !!(val & PVRDMA_UAR_CQ_ARM_SOL));
        }
        if (val & PVRDMA_UAR_CQ_ARM_SOL) {
            pr_dbg("UAR_CQ_ARM_SOL (%" PRIx64 ")\n",
                   val & PVRDMA_UAR_HANDLE_MASK);
        }
        if (val & PVRDMA_UAR_CQ_POLL) {
            pr_dbg("UAR_CQ_POLL (%" PRIx64 ")\n", val & PVRDMA_UAR_HANDLE_MASK);
            pvrdma_cq_poll(&dev->rdma_dev_res, val & PVRDMA_UAR_HANDLE_MASK);
        }
        break;
    default:
        pr_err("Unsupported command, addr=0x%" PRIx64 ", val=0x%" PRIx64 "\n",
               addr, val);
        break;
    }
}
static const MemoryRegionOps uar_ops = {
    .write = uar_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
    .impl = {
        .min_access_size = sizeof(uint32_t),
        .max_access_size = sizeof(uint32_t),
    },
};
static void init_pci_config(PCIDevice *pdev)
{
    pdev->config[PCI_INTERRUPT_PIN] = 1;
}
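/*
 * Three BARs are exposed: BAR 0 for the MSI-X table/PBA, BAR 1 for the
 * device registers and BAR 2 for the per-context UAR doorbell pages.
 */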
static void init_bars(PCIDevice *pdev)
{
    PVRDMADev *dev = PVRDMA_DEV(pdev);

    /* BAR 0 - MSI-X */
    memory_region_init(&dev->msix, OBJECT(dev), "pvrdma-msix",
                       RDMA_BAR0_MSIX_SIZE);
    pci_register_bar(pdev, RDMA_MSIX_BAR_IDX, PCI_BASE_ADDRESS_SPACE_MEMORY,
                     &dev->msix);

    /* BAR 1 - Registers */
    memset(&dev->regs_data, 0, sizeof(dev->regs_data));
    memory_region_init_io(&dev->regs, OBJECT(dev), &regs_ops, dev,
                          "pvrdma-regs", sizeof(dev->regs_data));
    pci_register_bar(pdev, RDMA_REG_BAR_IDX, PCI_BASE_ADDRESS_SPACE_MEMORY,
                     &dev->regs);

    /* BAR 2 - UAR */
    memset(&dev->uar_data, 0, sizeof(dev->uar_data));
    memory_region_init_io(&dev->uar, OBJECT(dev), &uar_ops, dev, "rdma-uar",
                          sizeof(dev->uar_data));
    pci_register_bar(pdev, RDMA_UAR_BAR_IDX, PCI_BASE_ADDRESS_SPACE_MEMORY,
                     &dev->uar);
}
static void init_regs(PCIDevice *pdev)
{
    PVRDMADev *dev = PVRDMA_DEV(pdev);

    set_reg_val(dev, PVRDMA_REG_VERSION, PVRDMA_HW_VERSION);
    set_reg_val(dev, PVRDMA_REG_ERR, 0xFFFF);
}
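/*
 * Derive the ring-size limits from the capacity of a one-level page directory
 * (TARGET_PAGE_SIZE / sizeof(uint64_t) entries, one page each), with the
 * first page reserved for the ring state.
 */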
static void init_dev_caps(PVRDMADev *dev)
{
    size_t pg_tbl_bytes = TARGET_PAGE_SIZE *
                          (TARGET_PAGE_SIZE / sizeof(uint64_t));
    size_t wr_sz = MAX(sizeof(struct pvrdma_sq_wqe_hdr),
                       sizeof(struct pvrdma_rq_wqe_hdr));

    dev->dev_attr.max_qp_wr = pg_tbl_bytes /
                              (wr_sz + sizeof(struct pvrdma_sge) * MAX_SGE) -
                              TARGET_PAGE_SIZE; /* First page is ring state */
    pr_dbg("max_qp_wr=%d\n", dev->dev_attr.max_qp_wr);

    dev->dev_attr.max_cqe = pg_tbl_bytes / sizeof(struct pvrdma_cqe) -
                            TARGET_PAGE_SIZE; /* First page is ring state */
    pr_dbg("max_cqe=%d\n", dev->dev_attr.max_cqe);
}
static int pvrdma_check_ram_shared(Object *obj, void *opaque)
{
    bool *shared = opaque;

    if (object_dynamic_cast(obj, "memory-backend-ram")) {
        *shared = object_property_get_bool(obj, "share", NULL);
    }

    return 0;
}
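/*
 * The device only supports guest RAM that is backed by shared memory, e.g. a
 * memory-backend-ram object created with share=on (an assumption about a
 * typical configuration); realize() walks /objects to verify this.
 */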
static void pvrdma_realize(PCIDevice *pdev, Error **errp)
{
    int rc;
    PVRDMADev *dev = PVRDMA_DEV(pdev);
    Object *memdev_root;
    bool ram_shared = false;

    pr_dbg("Initializing device %s %x.%x\n", pdev->name,
           PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));

    if (TARGET_PAGE_SIZE != getpagesize()) {
        error_setg(errp, "Target page size must be the same as host page size");
        return;
    }

    memdev_root = object_resolve_path("/objects", NULL);
    if (memdev_root) {
        object_child_foreach(memdev_root, pvrdma_check_ram_shared, &ram_shared);
    }
    if (!ram_shared) {
        error_setg(errp, "Only shared memory backed ram is supported");
        return;
    }

    dev->dsr_info.dsr = NULL;

    init_pci_config(pdev);

    init_bars(pdev);

    init_regs(pdev);

    init_dev_caps(dev);

    rc = init_msix(pdev, errp);
    if (rc) {
        goto out;
    }

    rc = rdma_backend_init(&dev->backend_dev, pdev, &dev->rdma_dev_res,
                           dev->backend_device_name, dev->backend_port_num,
                           dev->backend_gid_idx, &dev->dev_attr, errp);
    if (rc) {
        goto out;
    }

    rc = rdma_rm_init(&dev->rdma_dev_res, &dev->dev_attr, errp);
    if (rc) {
        goto out;
    }

    init_ports(dev, errp);

    rc = pvrdma_qp_ops_init();
    if (rc) {
        goto out;
    }

out:
    if (rc) {
        error_append_hint(errp, "Device failed to load\n");
    }
}
static void pvrdma_exit(PCIDevice *pdev)
{
    pvrdma_fini(pdev);
}
static void pvrdma_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(klass);
    PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);

    k->realize = pvrdma_realize;
    k->exit = pvrdma_exit;
    k->vendor_id = PCI_VENDOR_ID_VMWARE;
    k->device_id = PCI_DEVICE_ID_VMWARE_PVRDMA;
    k->class_id = PCI_CLASS_NETWORK_OTHER;

    dc->desc = "RDMA Device";
    dc->props = pvrdma_dev_properties;
    set_bit(DEVICE_CATEGORY_NETWORK, dc->categories);
}
static const TypeInfo pvrdma_info = {
    .name = PVRDMA_HW_NAME,
    .parent = TYPE_PCI_DEVICE,
    .instance_size = sizeof(PVRDMADev),
    .class_init = pvrdma_class_init,
    .interfaces = (InterfaceInfo[]) {
        { INTERFACE_CONVENTIONAL_PCI_DEVICE },
        { },
    },
};

static void register_types(void)
{
    type_register_static(&pvrdma_info);
}

type_init(register_types)