2 * Copyright (c) 2007, Neocleus Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
18 * Assign a PCI device from the host to a guest VM.
20 * Adapted for KVM by Qumranet.
22 * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
23 * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
24 * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
25 * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
26 * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
35 #include "device-assignment.h"
37 /* From linux/ioport.h */
38 #define IORESOURCE_IO 0x00000100 /* Resource type */
39 #define IORESOURCE_MEM 0x00000200
40 #define IORESOURCE_IRQ 0x00000400
41 #define IORESOURCE_DMA 0x00000800
42 #define IORESOURCE_PREFETCH 0x00001000 /* No side effects */
/*
 * Debug tracing for this file. When DEVICE_ASSIGNMENT_DEBUG is defined,
 * DEBUG() prints the message to stderr prefixed with the calling function's
 * name (__func__); otherwise it expands to an empty statement. The define
 * below is commented out, so tracing is off unless the macro is defined.
 * NOTE(review): the enabled form places __VA_ARGS__ after a comma, so every
 * DEBUG() call needs at least one argument after the format string.
 */
44 /* #define DEVICE_ASSIGNMENT_DEBUG 1 */
46 #ifdef DEVICE_ASSIGNMENT_DEBUG
47 #define DEBUG(fmt, ...) \
49 fprintf(stderr, "%s: " fmt, __func__ , __VA_ARGS__); \
52 #define DEBUG(fmt, ...) do { } while(0)
55 static uint32_t guest_to_host_ioport(AssignedDevRegion
*region
, uint32_t addr
)
57 return region
->u
.r_baseport
+ (addr
- region
->e_physbase
);
/*
 * Port-I/O write handler for an assigned-device region; registered by
 * assigned_dev_ioport_map() for access size 1.
 *
 * opaque is the AssignedDevRegion the handler was registered with and addr
 * is the guest port; addr is translated to the host port r_pio with
 * guest_to_host_ioport() and the access is traced with DEBUG().
 * NOTE(review): the remainder of the parameter list (the traced `value`)
 * and the statement that performs the host port write are not visible in
 * this listing.
 */
60 static void assigned_dev_ioport_writeb(void *opaque
, uint32_t addr
,
63 AssignedDevRegion
*r_access
= opaque
;
64 uint32_t r_pio
= guest_to_host_ioport(r_access
, addr
);
66 DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
67 r_pio
, (int)r_access
->e_physbase
,
68 (unsigned long)r_access
->u
.r_baseport
, value
);
/*
 * Port-I/O write handler for an assigned-device region; registered by
 * assigned_dev_ioport_map() for access size 2.
 *
 * opaque is the AssignedDevRegion the handler was registered with and addr
 * is the guest port; addr is translated to the host port r_pio with
 * guest_to_host_ioport() and the access is traced with DEBUG().
 * NOTE(review): the remainder of the parameter list (the traced `value`)
 * and the statement that performs the host port write are not visible in
 * this listing.
 */
73 static void assigned_dev_ioport_writew(void *opaque
, uint32_t addr
,
76 AssignedDevRegion
*r_access
= opaque
;
77 uint32_t r_pio
= guest_to_host_ioport(r_access
, addr
);
79 DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
80 r_pio
, (int)r_access
->e_physbase
,
81 (unsigned long)r_access
->u
.r_baseport
, value
);
/*
 * Port-I/O write handler for an assigned-device region; registered by
 * assigned_dev_ioport_map() for access size 4.
 *
 * opaque is the AssignedDevRegion the handler was registered with and addr
 * is the guest port; addr is translated to the host port r_pio with
 * guest_to_host_ioport() and the access is traced with DEBUG().
 * NOTE(review): the remainder of the parameter list (the traced `value`)
 * and the statement that performs the host port write are not visible in
 * this listing.
 */
86 static void assigned_dev_ioport_writel(void *opaque
, uint32_t addr
,
89 AssignedDevRegion
*r_access
= opaque
;
90 uint32_t r_pio
= guest_to_host_ioport(r_access
, addr
);
92 DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
93 r_pio
, (int)r_access
->e_physbase
,
94 (unsigned long)r_access
->u
.r_baseport
, value
);
/*
 * Port-I/O read handler for an assigned-device region; registered by
 * assigned_dev_ioport_map() for access size 1.
 *
 * opaque is the AssignedDevRegion the handler was registered with and addr
 * is the guest port; addr is translated to the host port r_pio with
 * guest_to_host_ioport() and the access is traced with DEBUG().
 * NOTE(review): the declaration of `value`, the host port read and the
 * return statement are not visible in this listing.
 * NOTE(review): the trace label `r_=` differs from the `r_baseport=` label
 * used by the sibling handlers; the argument printed is still u.r_baseport.
 */
99 static uint32_t assigned_dev_ioport_readb(void *opaque
, uint32_t addr
)
101 AssignedDevRegion
*r_access
= opaque
;
102 uint32_t r_pio
= guest_to_host_ioport(r_access
, addr
);
107 DEBUG("r_pio=%08x e_physbase=%08x r_=%08lx value=%08x\n",
108 r_pio
, (int)r_access
->e_physbase
,
109 (unsigned long)r_access
->u
.r_baseport
, value
);
/*
 * Port-I/O read handler for an assigned-device region; registered by
 * assigned_dev_ioport_map() for access size 2.
 *
 * opaque is the AssignedDevRegion the handler was registered with and addr
 * is the guest port; addr is translated to the host port r_pio with
 * guest_to_host_ioport() and the access is traced with DEBUG().
 * NOTE(review): the declaration of `value`, the host port read and the
 * return statement are not visible in this listing.
 */
114 static uint32_t assigned_dev_ioport_readw(void *opaque
, uint32_t addr
)
116 AssignedDevRegion
*r_access
= opaque
;
117 uint32_t r_pio
= guest_to_host_ioport(r_access
, addr
);
122 DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
123 r_pio
, (int)r_access
->e_physbase
,
124 (unsigned long)r_access
->u
.r_baseport
, value
);
/*
 * Port-I/O read handler for an assigned-device region; registered by
 * assigned_dev_ioport_map() for access size 4.
 *
 * opaque is the AssignedDevRegion the handler was registered with and addr
 * is the guest port; addr is translated to the host port r_pio with
 * guest_to_host_ioport() and the access is traced with DEBUG().
 * NOTE(review): the declaration of `value`, the host port read and the
 * return statement are not visible in this listing.
 */
129 static uint32_t assigned_dev_ioport_readl(void *opaque
, uint32_t addr
)
131 AssignedDevRegion
*r_access
= opaque
;
132 uint32_t r_pio
= guest_to_host_ioport(r_access
, addr
);
137 DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
138 r_pio
, (int)r_access
->e_physbase
,
139 (unsigned long)r_access
->u
.r_baseport
, value
);
/*
 * Mapping callback for a memory region of an assigned device; passed to
 * pci_register_io_region() by assigned_dev_register_regions().
 *
 * pci_dev is treated as an AssignedDevice and region_num indexes its
 * v_addrs[] array. The callback stores the new guest-physical base (e_phys)
 * and size (e_size) in the region, hands the previous range
 * (old_ephys/old_esize) to kvm_destroy_phys_mem() and registers the new
 * range, backed by the host mapping region->u.r_virtbase, with
 * kvm_register_phys_mem(). first_map is true when the region had no guest
 * size recorded yet (e_size == 0).
 * NOTE(review): the conditions guarding the destroy/register calls and the
 * error message are not visible in this listing — presumably the destroy is
 * skipped on the first mapping; confirm against the full source.
 */
144 static void assigned_dev_iomem_map(PCIDevice
*pci_dev
, int region_num
,
145 uint32_t e_phys
, uint32_t e_size
, int type
)
147 AssignedDevice
*r_dev
= (AssignedDevice
*) pci_dev
;
148 AssignedDevRegion
*region
= &r_dev
->v_addrs
[region_num
];
/* Remember the previous guest range before it is overwritten below. */
149 uint32_t old_ephys
= region
->e_physbase
;
150 uint32_t old_esize
= region
->e_size
;
151 int first_map
= (region
->e_size
== 0);
154 DEBUG("e_phys=%08x r_virt=%p type=%d len=%08x region_num=%d \n",
155 e_phys
, region
->u
.r_virtbase
, type
, e_size
, region_num
);
157 region
->e_physbase
= e_phys
;
158 region
->e_size
= e_size
;
161 kvm_destroy_phys_mem(kvm_context
, old_ephys
, old_esize
);
164 ret
= kvm_register_phys_mem(kvm_context
, e_phys
,
165 region
->u
.r_virtbase
, e_size
, 0);
167 fprintf(stderr
, "%s: Error: create new mapping failed\n", __func__
);
/*
 * Mapping callback for a port-I/O region of an assigned device; passed to
 * pci_register_io_region() by assigned_dev_register_regions().
 *
 * Records the guest base port (addr) and size in v_addrs[region_num],
 * builds an ioperm_data describing the host port range (start_port =
 * u.r_baseport, num = r_size), hands it to kvm_add_ioperm_data() and
 * applies it to every CPU via kvm_ioperm(). It then registers the
 * byte/word/long read and write handlers for the guest port range, each
 * receiving the region (r_dev->v_addrs + region_num) as its opaque pointer.
 * NOTE(review): the conditions around the ioperm setup (first_map is
 * computed but its use is not visible) and the out-of-memory handling are
 * missing from this listing.
 */
172 static void assigned_dev_ioport_map(PCIDevice
*pci_dev
, int region_num
,
173 uint32_t addr
, uint32_t size
, int type
)
175 AssignedDevice
*r_dev
= (AssignedDevice
*) pci_dev
;
176 AssignedDevRegion
*region
= &r_dev
->v_addrs
[region_num
];
177 int first_map
= (region
->e_size
== 0);
180 region
->e_physbase
= addr
;
181 region
->e_size
= size
;
183 DEBUG("e_phys=0x%x r_baseport=%x type=0x%x len=%d region_num=%d \n",
184 addr
, region
->u
.r_baseport
, type
, size
, region_num
);
187 struct ioperm_data
*data
;
189 data
= qemu_mallocz(sizeof(struct ioperm_data
));
191 fprintf(stderr
, "%s: Out of memory\n", __func__
);
/* The permission range covers the host ports, not the guest ports. */
195 data
->start_port
= region
->u
.r_baseport
;
196 data
->num
= region
->r_size
;
199 kvm_add_ioperm_data(data
);
201 for (env
= first_cpu
; env
; env
= env
->next_cpu
)
202 kvm_ioperm(env
, data
);
/* One handler per access width (1, 2, 4 bytes), for reads then writes. */
205 register_ioport_read(addr
, size
, 1, assigned_dev_ioport_readb
,
206 (r_dev
->v_addrs
+ region_num
));
207 register_ioport_read(addr
, size
, 2, assigned_dev_ioport_readw
,
208 (r_dev
->v_addrs
+ region_num
));
209 register_ioport_read(addr
, size
, 4, assigned_dev_ioport_readl
,
210 (r_dev
->v_addrs
+ region_num
));
211 register_ioport_write(addr
, size
, 1, assigned_dev_ioport_writeb
,
212 (r_dev
->v_addrs
+ region_num
));
213 register_ioport_write(addr
, size
, 2, assigned_dev_ioport_writew
,
214 (r_dev
->v_addrs
+ region_num
));
215 register_ioport_write(addr
, size
, 4, assigned_dev_ioport_writel
,
216 (r_dev
->v_addrs
+ region_num
));
/*
 * PCI config-space write hook for an assigned device; passed to
 * pci_register_device() by init_assigned_device().
 *
 * d:       the emulated PCI device (actually an AssignedDevice)
 * address: config-space offset being written
 * val:     value to write
 * len:     access length in bytes
 *
 * A write to offset 0x4 is applied to the emulated config space and then
 * also continues on to the real device. Writes to 0x10..0x24, 0x34, 0x3c
 * and 0x3d are handled by pci_default_write_config() (BAR emulation).
 * Other writes are sent to the real device by pwrite() on its config_fd at
 * offset `address`; a negative return with errno EINTR or EAGAIN is tested
 * separately from the failure message.
 * NOTE(review): the returns/retry jumps that separate these paths are not
 * visible in this listing — presumably the emulated-register branch returns
 * early and EINTR/EAGAIN retries the pwrite; confirm against the full
 * source.
 */
219 static void assigned_dev_pci_write_config(PCIDevice
*d
, uint32_t address
,
220 uint32_t val
, int len
)
225 DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
226 ((d
->devfn
>> 3) & 0x1F), (d
->devfn
& 0x7),
227 (uint16_t) address
, val
, len
);
229 if (address
== 0x4) {
230 pci_default_write_config(d
, address
, val
, len
);
231 /* Continue to program the card */
234 if ((address
>= 0x10 && address
<= 0x24) || address
== 0x34 ||
235 address
== 0x3c || address
== 0x3d) {
236 /* used for update-mappings (BAR emulation) */
237 pci_default_write_config(d
, address
, val
, len
);
241 DEBUG("NON BAR (%x.%x): address=%04x val=0x%08x len=%d\n",
242 ((d
->devfn
>> 3) & 0x1F), (d
->devfn
& 0x7),
243 (uint16_t) address
, val
, len
);
/* Forward the write to the host device's config file. */
245 fd
= ((AssignedDevice
*)d
)->real_device
.config_fd
;
248 ret
= pwrite(fd
, &val
, len
, address
);
250 if ((ret
< 0) && (errno
== EINTR
|| errno
== EAGAIN
))
253 fprintf(stderr
, "%s: pwrite failed, ret = %zd errno = %d\n",
254 __func__
, ret
, errno
);
/*
 * PCI config-space read hook for an assigned device; passed to
 * pci_register_device() by init_assigned_device().
 *
 * Reads of offsets 0x10..0x24, 0x34, 0x3c and 0x3d are answered from the
 * emulated config space via pci_default_read_config(). Other reads are
 * taken from the real device by pread() on its config_fd at offset
 * `address`; a negative return with errno EINTR or EAGAIN is tested
 * separately from the failure message. After a real read, the cases
 * (address == 4 && len == 4) and (address == 6) are singled out under the
 * comment "kill the special capabilities".
 * NOTE(review): the rest of the parameter list, the returns, the
 * "vga specific" code and the statements that adjust `val` in the
 * capability cases are not visible in this listing.
 */
260 static uint32_t assigned_dev_pci_read_config(PCIDevice
*d
, uint32_t address
,
267 if ((address
>= 0x10 && address
<= 0x24) || address
== 0x34 ||
268 address
== 0x3c || address
== 0x3d) {
269 val
= pci_default_read_config(d
, address
, len
);
270 DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
271 (d
->devfn
>> 3) & 0x1F, (d
->devfn
& 0x7), address
, val
, len
);
275 /* vga specific, remove later */
/* Everything else is read from the host device's config file. */
279 fd
= ((AssignedDevice
*)d
)->real_device
.config_fd
;
282 ret
= pread(fd
, &val
, len
, address
);
284 if ((ret
< 0) && (errno
== EINTR
|| errno
== EAGAIN
))
287 fprintf(stderr
, "%s: pread failed, ret = %zd errno = %d\n",
288 __func__
, ret
, errno
);
294 DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
295 (d
->devfn
>> 3) & 0x1F, (d
->devfn
& 0x7), address
, val
, len
);
297 /* kill the special capabilities */
298 if (address
== 4 && len
== 4)
300 else if (address
== 6)
/*
 * Register the real device's regions with the emulated PCI device.
 *
 * io_regions:  array of PCIRegion entries describing the host device
 * regions_num: number of entries to walk
 * pci_dev:     assigned device whose v_addrs[] is filled in
 *
 * Entries whose `valid` flag is clear are tested first (the action taken is
 * not visible here). For a memory region (type & IORESOURCE_MEM) the
 * region's resource_fd is mapped read/write and shared, with the length
 * rounded up to a 4 KiB multiple; on MAP_FAILED r_virtbase is reset to NULL
 * and an error is printed. Otherwise r_size is recorded, e_size is cleared,
 * the sub-page offset of base_addr is added to r_virtbase and the region is
 * registered with assigned_dev_iomem_map as its mapping callback. Port
 * regions record base_addr as both e_physbase and u.r_baseport and are
 * registered as PCI_ADDRESS_SPACE_IO with assigned_dev_ioport_map.
 * NOTE(review): the mmap call line, the return statements and the
 * else/continue structure are missing from this listing.
 * NOTE(review): the length mask 0xFFFFF000 keeps only the low 32 bits —
 * confirm region sizes cannot exceed that.
 */
306 static int assigned_dev_register_regions(PCIRegion
*io_regions
,
307 unsigned long regions_num
,
308 AssignedDevice
*pci_dev
)
311 PCIRegion
*cur_region
= io_regions
;
313 for (i
= 0; i
< regions_num
; i
++, cur_region
++) {
314 if (!cur_region
->valid
)
316 pci_dev
->v_addrs
[i
].num
= i
;
318 /* handle memory io regions */
319 if (cur_region
->type
& IORESOURCE_MEM
) {
320 int t
= cur_region
->type
& IORESOURCE_PREFETCH
321 ? PCI_ADDRESS_SPACE_MEM_PREFETCH
322 : PCI_ADDRESS_SPACE_MEM
;
324 /* map physical memory */
325 pci_dev
->v_addrs
[i
].e_physbase
= cur_region
->base_addr
;
326 pci_dev
->v_addrs
[i
].u
.r_virtbase
=
328 (cur_region
->size
+ 0xFFF) & 0xFFFFF000,
329 PROT_WRITE
| PROT_READ
, MAP_SHARED
,
330 cur_region
->resource_fd
, (off_t
) 0);
332 if (pci_dev
->v_addrs
[i
].u
.r_virtbase
== MAP_FAILED
) {
333 pci_dev
->v_addrs
[i
].u
.r_virtbase
= NULL
;
334 fprintf(stderr
, "%s: Error: Couldn't mmap 0x%x!"
336 (uint32_t) (cur_region
->base_addr
));
339 pci_dev
->v_addrs
[i
].r_size
= cur_region
->size
;
340 pci_dev
->v_addrs
[i
].e_size
= 0;
/* Keep the region's offset within its first page in the host pointer. */
343 pci_dev
->v_addrs
[i
].u
.r_virtbase
+=
344 (cur_region
->base_addr
& 0xFFF);
346 pci_register_io_region((PCIDevice
*) pci_dev
, i
,
348 assigned_dev_iomem_map
);
351 /* handle port io regions */
352 pci_dev
->v_addrs
[i
].e_physbase
= cur_region
->base_addr
;
353 pci_dev
->v_addrs
[i
].u
.r_baseport
= cur_region
->base_addr
;
354 pci_dev
->v_addrs
[i
].r_size
= cur_region
->size
;
355 pci_dev
->v_addrs
[i
].e_size
= 0;
357 pci_register_io_region((PCIDevice
*) pci_dev
, i
,
358 cur_region
->size
, PCI_ADDRESS_SPACE_IO
,
359 assigned_dev_ioport_map
);
361 /* not relevant for port io */
362 pci_dev
->v_addrs
[i
].memory_index
= 0;
/*
 * Open the host PCI device through sysfs and collect its regions.
 *
 * pci_dev: assigned device to fill in (real_device and dev.config)
 * r_bus, r_dev, r_func: host bus/device/function; the sysfs directory is
 *     /sys/bus/pci/devices/0000:BB:DD.F/ (domain fixed at 0000)
 *
 * Steps visible here: open "<dir>config" read/write and read it into
 * pci_dev->dev.config; open "<dir>resource" and, for up to MAX_IO_REGIONS
 * lines, parse "start end flags", compute size = end - start + 1, keep only
 * the IO/MEM/PREFETCH flag bits and test for entries with size 0 or with
 * neither IO nor MEM set. For memory regions "<dir>resourceN" is opened
 * read/write and its descriptor stored in resource_fd; a failed open is
 * treated as "probably ROM" and the entry is passed over. On the other
 * visible path the PREFETCH bit is cleared. base_addr is set from start,
 * and region_number is set from r after the loop body.
 * NOTE(review): error returns, the EINTR/EAGAIN retry, and the assignments
 * of config_fd, valid, type and size are not visible in this listing.
 * NOTE(review): fscanf uses %lli with unsigned long long variables — the
 * conversion specifier is the signed form.
 */
369 static int get_real_device(AssignedDevice
*pci_dev
, uint8_t r_bus
,
370 uint8_t r_dev
, uint8_t r_func
)
372 char dir
[128], name
[128];
375 unsigned long long start
, end
, size
, flags
;
377 PCIDevRegions
*dev
= &pci_dev
->real_device
;
379 dev
->region_number
= 0;
381 snprintf(dir
, sizeof(dir
), "/sys/bus/pci/devices/0000:%02x:%02x.%x/",
382 r_bus
, r_dev
, r_func
);
384 snprintf(name
, sizeof(name
), "%sconfig", dir
);
386 fd
= open(name
, O_RDWR
);
388 fprintf(stderr
, "%s: %s: %m\n", __func__
, name
);
/* Seed the emulated config space with the real device's contents. */
393 r
= read(fd
, pci_dev
->dev
.config
, sizeof(pci_dev
->dev
.config
));
395 if (errno
== EINTR
|| errno
== EAGAIN
)
397 fprintf(stderr
, "%s: read failed, errno = %d\n", __func__
, errno
);
400 snprintf(name
, sizeof(name
), "%sresource", dir
);
402 f
= fopen(name
, "r");
404 fprintf(stderr
, "%s: %s: %m\n", __func__
, name
);
/* One "start end flags" line per region in the sysfs resource file. */
408 for (r
= 0; r
< MAX_IO_REGIONS
; r
++) {
409 if (fscanf(f
, "%lli %lli %lli\n", &start
, &end
, &flags
) != 3)
412 rp
= dev
->regions
+ r
;
414 size
= end
- start
+ 1;
415 flags
&= IORESOURCE_IO
| IORESOURCE_MEM
| IORESOURCE_PREFETCH
;
416 if (size
== 0 || (flags
& ~IORESOURCE_PREFETCH
) == 0)
418 if (flags
& IORESOURCE_MEM
) {
419 flags
&= ~IORESOURCE_IO
;
420 snprintf(name
, sizeof(name
), "%sresource%d", dir
, r
);
421 fd
= open(name
, O_RDWR
);
423 continue; /* probably ROM */
424 rp
->resource_fd
= fd
;
426 flags
&= ~IORESOURCE_PREFETCH
;
430 rp
->base_addr
= start
;
432 DEBUG("region %d size %d start 0x%llx type %d resource_fd %d\n",
433 r
, rp
->size
, start
, rp
->type
, rp
->resource_fd
);
437 dev
->region_number
= r
;
/*
 * File-local list of AssignedDevInfo entries. add_assigned_device() inserts
 * at the head, free_assigned_device() removes entries and
 * assigned_dev_update_irq() walks the list.
 */
441 static LIST_HEAD(, AssignedDevInfo
) adev_head
;
/*
 * Release the resources held for an assigned device.
 *
 * For each of the first real_device.region_number regions that is valid and
 * of memory type, a non-NULL host mapping (u.r_virtbase) is unmapped with
 * the region size rounded up to a 4 KiB multiple. A non-zero config_fd is
 * closed and reset to 0, the PCI device is unregistered, adev->assigned_dev
 * is cleared and adev is removed from the list.
 * NOTE(review): the NULL check on `dev`, the munmap error reporting and any
 * freeing of adev itself are not visible in this listing.
 * NOTE(review): assigned_dev_register_regions() adds (base_addr & 0xFFF) to
 * r_virtbase after mmap, so the address passed to munmap here may differ
 * from the one mmap returned when base_addr is not page aligned — confirm.
 * NOTE(review): `if (config_fd)` treats descriptor 0 as "not open".
 */
443 void free_assigned_device(AssignedDevInfo
*adev
)
445 AssignedDevice
*dev
= adev
->assigned_dev
;
450 for (i
= 0; i
< dev
->real_device
.region_number
; i
++) {
451 PCIRegion
*pci_region
= &dev
->real_device
.regions
[i
];
452 AssignedDevRegion
*region
= &dev
->v_addrs
[i
];
/* Only valid memory regions have a host mapping to undo. */
454 if (!pci_region
->valid
|| !(pci_region
->type
& IORESOURCE_MEM
))
457 if (region
->u
.r_virtbase
) {
458 int ret
= munmap(region
->u
.r_virtbase
,
459 (pci_region
->size
+ 0xFFF) & 0xFFFFF000);
462 "Failed to unmap assigned device region: %s\n",
467 if (dev
->real_device
.config_fd
) {
468 close(dev
->real_device
.config_fd
);
469 dev
->real_device
.config_fd
= 0;
472 pci_unregister_device(&dev
->dev
);
473 adev
->assigned_dev
= dev
= NULL
;
476 LIST_REMOVE(adev
, next
);
/*
 * Build the identifier used for an assigned device in the KVM assignment
 * calls in this file: the host bus number in bits 15..8 and the host devfn
 * in bits 7..0.
 *
 * bus:   host PCI bus number
 * devfn: host PCI device/function byte
 *
 * Returns (bus << 8) | devfn as a 32-bit value.
 */
static uint32_t calc_assigned_dev_id(uint8_t bus, uint8_t devfn)
{
    return ((uint32_t)bus << 8) | (uint32_t)devfn;
}
485 /* The pci config space got updated. Check if irq numbers have changed
/*
 * Re-evaluate the guest IRQ of every assigned device on adev_head.
 *
 * For each entry the guest IRQ is derived with pci_map_irq() on the
 * device's intpin followed by piix_get_irq(); an ipf_map_irq() step is also
 * present. When the result differs from the cached girq, a zeroed
 * kvm_assigned_irq is filled with the device id (calc_assigned_dev_id of
 * h_busnr/h_devfn), the new guest_irq and the real device's host irq, and
 * passed to kvm_assign_irq(). On the failure path an error is printed and
 * the entry is removed and freed; otherwise girq is updated. The next
 * element is fetched before the current one is processed, so removing the
 * current entry does not break the walk.
 * NOTE(review): the loop header, the preprocessor guards around the
 * ipf_map_irq() call and the `r < 0` test are not visible in this listing.
 * NOTE(review): the failure path calls LIST_REMOVE(adev, next) and then
 * free_assigned_device(adev), which also calls LIST_REMOVE(adev, next) —
 * confirm the second removal is intended.
 */
488 void assigned_dev_update_irq(PCIDevice
*d
)
490 AssignedDevInfo
*adev
;
492 adev
= LIST_FIRST(&adev_head
);
494 AssignedDevInfo
*next
= LIST_NEXT(adev
, next
);
495 AssignedDevice
*assigned_dev
= adev
->assigned_dev
;
498 irq
= pci_map_irq(&assigned_dev
->dev
, assigned_dev
->intpin
);
499 irq
= piix_get_irq(irq
);
502 irq
= ipf_map_irq(d
, irq
);
/* Only talk to KVM when the routing actually changed. */
505 if (irq
!= assigned_dev
->girq
) {
506 struct kvm_assigned_irq assigned_irq_data
;
508 memset(&assigned_irq_data
, 0, sizeof(assigned_irq_data
));
509 assigned_irq_data
.assigned_dev_id
=
510 calc_assigned_dev_id(assigned_dev
->h_busnr
,
511 (uint8_t) assigned_dev
->h_devfn
);
512 assigned_irq_data
.guest_irq
= irq
;
513 assigned_irq_data
.host_irq
= assigned_dev
->real_device
.irq
;
514 r
= kvm_assign_irq(kvm_context
, &assigned_irq_data
);
516 fprintf(stderr
, "Failed to assign irq for \"%s\": %s\n",
517 adev
->name
, strerror(-r
));
518 fprintf(stderr
, "Perhaps you are assigning a device "
519 "that shares an IRQ with another device?\n");
520 LIST_REMOVE(adev
, next
);
521 free_assigned_device(adev
);
525 assigned_dev
->girq
= irq
;
/*
 * Create the emulated PCI device for an assigned host device and hand the
 * host device to KVM.
 *
 * adev: parsed -pcidevice description (name, host bus/dev/func, IOMMU flag)
 * bus:  emulated PCI bus to register the device on
 *
 * Steps visible here: register an AssignedDevice-sized PCI device (devfn
 * argument -1) with this file's config read/write hooks; open the host
 * device with get_real_device(); register its MMIO/PIO regions; take the
 * interrupt pin from config byte 0x3d minus one; record the host bus and
 * devfn; fill a zeroed kvm_assigned_pci_dev; set
 * KVM_DEV_ASSIGN_ENABLE_IOMMU when KVM reports KVM_CAP_IOMMU and the IOMMU
 * was not disabled; call kvm_assign_pci_device(); finally store the device
 * in adev->assigned_dev. A pci_unregister_device() call follows, presumably
 * as the error-exit cleanup.
 * NOTE(review): the return statements, error jumps and the third argument
 * of assigned_dev_register_regions() are not visible in this listing.
 * NOTE(review): e_intx is a uint8_t computed as config[0x3d] - 1, so a pin
 * value of 0 would wrap to 255 — confirm that case cannot occur.
 */
532 struct PCIDevice
*init_assigned_device(AssignedDevInfo
*adev
, PCIBus
*bus
)
536 uint8_t e_device
, e_intx
;
537 struct kvm_assigned_pci_dev assigned_dev_data
;
539 DEBUG("Registering real physical device %s (bus=%x dev=%x func=%x)\n",
540 adev
->name
, adev
->bus
, adev
->dev
, adev
->func
);
542 dev
= (AssignedDevice
*)
543 pci_register_device(bus
, adev
->name
, sizeof(AssignedDevice
),
544 -1, assigned_dev_pci_read_config
,
545 assigned_dev_pci_write_config
);
547 fprintf(stderr
, "%s: Error: Couldn't register real device %s\n",
548 __func__
, adev
->name
);
552 if (get_real_device(dev
, adev
->bus
, adev
->dev
, adev
->func
)) {
553 fprintf(stderr
, "%s: Error: Couldn't get real device (%s)!\n",
554 __func__
, adev
->name
);
558 /* handle real device's MMIO/PIO BARs */
559 if (assigned_dev_register_regions(dev
->real_device
.regions
,
560 dev
->real_device
.region_number
,
564 /* handle interrupt routing */
565 e_device
= (dev
->dev
.devfn
>> 3) & 0x1f;
566 e_intx
= dev
->dev
.config
[0x3d] - 1;
567 dev
->intpin
= e_intx
;
570 dev
->h_busnr
= adev
->bus
;
571 dev
->h_devfn
= PCI_DEVFN(adev
->dev
, adev
->func
);
573 memset(&assigned_dev_data
, 0, sizeof(assigned_dev_data
));
574 assigned_dev_data
.assigned_dev_id
=
575 calc_assigned_dev_id(dev
->h_busnr
, (uint32_t)dev
->h_devfn
);
576 assigned_dev_data
.busnr
= dev
->h_busnr
;
577 assigned_dev_data
.devfn
= dev
->h_devfn
;
580 /* We always enable the IOMMU if present
581 * (or when not disabled on the command line)
583 r
= kvm_check_extension(kvm_context
, KVM_CAP_IOMMU
);
584 if (r
&& !adev
->disable_iommu
)
585 assigned_dev_data
.flags
|= KVM_DEV_ASSIGN_ENABLE_IOMMU
;
588 r
= kvm_assign_pci_device(kvm_context
, &assigned_dev_data
);
590 fprintf(stderr
, "Failed to assign device \"%s\" : %s\n",
591 adev
->name
, strerror(-r
));
595 adev
->assigned_dev
= dev
;
599 pci_unregister_device(&dev
->dev
);
604 * Syntax to assign device:
606 * -pcidevice host=bus:dev.func[,dma=none][,name=Foo]
609 * -pcidevice host=00:13.0,dma=none
611 * dma can currently only be 'none' to disable iommu support.
/*
 * Parse one -pcidevice argument into a new AssignedDevInfo and add it to
 * adev_head.
 *
 * arg: option string containing host=bus:dev.func and optional name=/dma=
 *
 * The entry is allocated zeroed. "host" is copied into a local buffer and
 * "name" into adev->name; adev->name is also filled from the host string
 * (presumably when no name was given). If "dma" is present and begins with
 * "none", disable_iommu is set. bus, dev and func are parsed as hexadecimal
 * with strtoul(), and the entry is inserted at the head of the list. A
 * parse-error message is printed on the failure path.
 * NOTE(review): the local declarations, the separator checks between the
 * strtoul() calls, the return statements and the cleanup on error are not
 * visible in this listing.
 * NOTE(review): strncmp(dma, "none", 4) also accepts longer values that
 * merely start with "none".
 */
613 AssignedDevInfo
*add_assigned_device(const char *arg
)
619 AssignedDevInfo
*adev
;
621 adev
= qemu_mallocz(sizeof(AssignedDevInfo
));
623 fprintf(stderr
, "%s: Out of memory\n", __func__
);
626 r
= get_param_value(device
, sizeof(device
), "host", arg
);
627 r
= get_param_value(adev
->name
, sizeof(adev
->name
), "name", arg
);
629 snprintf(adev
->name
, sizeof(adev
->name
), "%s", device
);
632 r
= get_param_value(dma
, sizeof(dma
), "dma", arg
);
633 if (r
&& !strncmp(dma
, "none", 4))
634 adev
->disable_iommu
= 1;
/* bus, dev and func are all read as base-16 numbers. */
637 adev
->bus
= strtoul(cp
, &cp1
, 16);
642 adev
->dev
= strtoul(cp
, &cp1
, 16);
647 adev
->func
= strtoul(cp
, &cp1
, 16);
649 LIST_INSERT_HEAD(&adev_head
, adev
, next
);
652 fprintf(stderr
, "pcidevice argument parse error; "
653 "please check the help text for usage\n");
658 void add_assigned_devices(PCIBus
*bus
, const char **devices
, int n_devices
)
662 for (i
= 0; i
< n_devices
; i
++) {
663 struct AssignedDevInfo
*adev
;
665 adev
= add_assigned_device(devices
[i
]);
667 fprintf(stderr
, "Could not add assigned device %s\n", devices
[i
]);
672 if (!init_assigned_device(adev
, bus
)) {
673 fprintf(stderr
, "Failed to initialize assigned device %s\n",