kvm: qemu: device-assignment: munmap() mmio regions
[kvm-userspace.git] / qemu / hw / device-assignment.c
bloba4f784eb7b104942b640829064f8a8ff9bda7ce8
1 /*
2 * Copyright (c) 2007, Neocleus Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
18 * Assign a PCI device from the host to a guest VM.
20 * Adapted for KVM by Qumranet.
22 * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
23 * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
24 * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
25 * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
26 * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
28 #include <stdio.h>
29 #include <sys/io.h>
30 #include "qemu-kvm.h"
31 #include "hw.h"
32 #include "pc.h"
33 #include "sysemu.h"
34 #include "console.h"
35 #include "device-assignment.h"
/*
 * Resource flag bits, copied from linux/ioport.h.  They are matched against
 * the flags column read from the device's sysfs "resource" file.
 */
#define IORESOURCE_IO       0x00000100  /* region is port I/O */
#define IORESOURCE_MEM      0x00000200  /* region is memory-mapped I/O */
#define IORESOURCE_IRQ      0x00000400
#define IORESOURCE_DMA      0x00000800
#define IORESOURCE_PREFETCH 0x00001000  /* no side effects */
/* Uncomment to get verbose tracing on stderr. */
/* #define DEVICE_ASSIGNMENT_DEBUG 1 */

/*
 * DEBUG(fmt, ...): printf-style trace prefixed with the calling function's
 * name.  At least one argument after fmt is required.  When tracing is
 * disabled the macro expands to an empty statement and its arguments are
 * not evaluated.
 */
#ifdef DEVICE_ASSIGNMENT_DEBUG
#define DEBUG(fmt, ...)                                         \
    do {                                                        \
        fprintf(stderr, "%s: " fmt, __func__ , __VA_ARGS__);    \
    } while (0)
#else
#define DEBUG(fmt, ...) do { } while(0)
#endif
55 static uint32_t guest_to_host_ioport(AssignedDevRegion *region, uint32_t addr)
57 return region->u.r_baseport + (addr - region->e_physbase);
60 static void assigned_dev_ioport_writeb(void *opaque, uint32_t addr,
61 uint32_t value)
63 AssignedDevRegion *r_access = opaque;
64 uint32_t r_pio = guest_to_host_ioport(r_access, addr);
66 DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
67 r_pio, (int)r_access->e_physbase,
68 (unsigned long)r_access->u.r_baseport, value);
70 outb(value, r_pio);
73 static void assigned_dev_ioport_writew(void *opaque, uint32_t addr,
74 uint32_t value)
76 AssignedDevRegion *r_access = opaque;
77 uint32_t r_pio = guest_to_host_ioport(r_access, addr);
79 DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
80 r_pio, (int)r_access->e_physbase,
81 (unsigned long)r_access->u.r_baseport, value);
83 outw(value, r_pio);
86 static void assigned_dev_ioport_writel(void *opaque, uint32_t addr,
87 uint32_t value)
89 AssignedDevRegion *r_access = opaque;
90 uint32_t r_pio = guest_to_host_ioport(r_access, addr);
92 DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
93 r_pio, (int)r_access->e_physbase,
94 (unsigned long)r_access->u.r_baseport, value);
96 outl(value, r_pio);
99 static uint32_t assigned_dev_ioport_readb(void *opaque, uint32_t addr)
101 AssignedDevRegion *r_access = opaque;
102 uint32_t r_pio = guest_to_host_ioport(r_access, addr);
103 uint32_t value;
105 value = inb(r_pio);
107 DEBUG("r_pio=%08x e_physbase=%08x r_=%08lx value=%08x\n",
108 r_pio, (int)r_access->e_physbase,
109 (unsigned long)r_access->u.r_baseport, value);
111 return value;
114 static uint32_t assigned_dev_ioport_readw(void *opaque, uint32_t addr)
116 AssignedDevRegion *r_access = opaque;
117 uint32_t r_pio = guest_to_host_ioport(r_access, addr);
118 uint32_t value;
120 value = inw(r_pio);
122 DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
123 r_pio, (int)r_access->e_physbase,
124 (unsigned long)r_access->u.r_baseport, value);
126 return value;
129 static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr)
131 AssignedDevRegion *r_access = opaque;
132 uint32_t r_pio = guest_to_host_ioport(r_access, addr);
133 uint32_t value;
135 value = inl(r_pio);
137 DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
138 r_pio, (int)r_access->e_physbase,
139 (unsigned long)r_access->u.r_baseport, value);
141 return value;
144 static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
145 uint32_t e_phys, uint32_t e_size, int type)
147 AssignedDevice *r_dev = (AssignedDevice *) pci_dev;
148 AssignedDevRegion *region = &r_dev->v_addrs[region_num];
149 uint32_t old_ephys = region->e_physbase;
150 uint32_t old_esize = region->e_size;
151 int first_map = (region->e_size == 0);
152 int ret = 0;
154 DEBUG("e_phys=%08x r_virt=%p type=%d len=%08x region_num=%d \n",
155 e_phys, region->u.r_virtbase, type, e_size, region_num);
157 region->e_physbase = e_phys;
158 region->e_size = e_size;
160 if (!first_map)
161 kvm_destroy_phys_mem(kvm_context, old_ephys, old_esize);
163 if (e_size > 0)
164 ret = kvm_register_phys_mem(kvm_context, e_phys,
165 region->u.r_virtbase, e_size, 0);
166 if (ret != 0) {
167 fprintf(stderr, "%s: Error: create new mapping failed\n", __func__);
168 exit(1);
172 static void assigned_dev_ioport_map(PCIDevice *pci_dev, int region_num,
173 uint32_t addr, uint32_t size, int type)
175 AssignedDevice *r_dev = (AssignedDevice *) pci_dev;
176 AssignedDevRegion *region = &r_dev->v_addrs[region_num];
177 int first_map = (region->e_size == 0);
178 CPUState *env;
180 region->e_physbase = addr;
181 region->e_size = size;
183 DEBUG("e_phys=0x%x r_baseport=%x type=0x%x len=%d region_num=%d \n",
184 addr, region->u.r_baseport, type, size, region_num);
186 if (first_map) {
187 struct ioperm_data *data;
189 data = qemu_mallocz(sizeof(struct ioperm_data));
190 if (data == NULL) {
191 fprintf(stderr, "%s: Out of memory\n", __func__);
192 exit(1);
195 data->start_port = region->u.r_baseport;
196 data->num = region->r_size;
197 data->turn_on = 1;
199 kvm_add_ioperm_data(data);
201 for (env = first_cpu; env; env = env->next_cpu)
202 kvm_ioperm(env, data);
205 register_ioport_read(addr, size, 1, assigned_dev_ioport_readb,
206 (r_dev->v_addrs + region_num));
207 register_ioport_read(addr, size, 2, assigned_dev_ioport_readw,
208 (r_dev->v_addrs + region_num));
209 register_ioport_read(addr, size, 4, assigned_dev_ioport_readl,
210 (r_dev->v_addrs + region_num));
211 register_ioport_write(addr, size, 1, assigned_dev_ioport_writeb,
212 (r_dev->v_addrs + region_num));
213 register_ioport_write(addr, size, 2, assigned_dev_ioport_writew,
214 (r_dev->v_addrs + region_num));
215 register_ioport_write(addr, size, 4, assigned_dev_ioport_writel,
216 (r_dev->v_addrs + region_num));
219 static void assigned_dev_pci_write_config(PCIDevice *d, uint32_t address,
220 uint32_t val, int len)
222 int fd;
223 ssize_t ret;
225 DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
226 ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7),
227 (uint16_t) address, val, len);
229 if (address == 0x4) {
230 pci_default_write_config(d, address, val, len);
231 /* Continue to program the card */
234 if ((address >= 0x10 && address <= 0x24) || address == 0x34 ||
235 address == 0x3c || address == 0x3d) {
236 /* used for update-mappings (BAR emulation) */
237 pci_default_write_config(d, address, val, len);
238 return;
241 DEBUG("NON BAR (%x.%x): address=%04x val=0x%08x len=%d\n",
242 ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7),
243 (uint16_t) address, val, len);
245 fd = ((AssignedDevice *)d)->real_device.config_fd;
247 again:
248 ret = pwrite(fd, &val, len, address);
249 if (ret != len) {
250 if ((ret < 0) && (errno == EINTR || errno == EAGAIN))
251 goto again;
253 fprintf(stderr, "%s: pwrite failed, ret = %zd errno = %d\n",
254 __func__, ret, errno);
256 exit(1);
260 static uint32_t assigned_dev_pci_read_config(PCIDevice *d, uint32_t address,
261 int len)
263 uint32_t val = 0;
264 int fd;
265 ssize_t ret;
267 if ((address >= 0x10 && address <= 0x24) || address == 0x34 ||
268 address == 0x3c || address == 0x3d) {
269 val = pci_default_read_config(d, address, len);
270 DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
271 (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len);
272 return val;
275 /* vga specific, remove later */
276 if (address == 0xFC)
277 goto do_log;
279 fd = ((AssignedDevice *)d)->real_device.config_fd;
281 again:
282 ret = pread(fd, &val, len, address);
283 if (ret != len) {
284 if ((ret < 0) && (errno == EINTR || errno == EAGAIN))
285 goto again;
287 fprintf(stderr, "%s: pread failed, ret = %zd errno = %d\n",
288 __func__, ret, errno);
290 exit(1);
293 do_log:
294 DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
295 (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len);
297 /* kill the special capabilities */
298 if (address == 4 && len == 4)
299 val &= ~0x100000;
300 else if (address == 6)
301 val &= ~0x10;
303 return val;
306 static int assigned_dev_register_regions(PCIRegion *io_regions,
307 unsigned long regions_num,
308 AssignedDevice *pci_dev)
310 uint32_t i;
311 PCIRegion *cur_region = io_regions;
313 for (i = 0; i < regions_num; i++, cur_region++) {
314 if (!cur_region->valid)
315 continue;
316 pci_dev->v_addrs[i].num = i;
318 /* handle memory io regions */
319 if (cur_region->type & IORESOURCE_MEM) {
320 int t = cur_region->type & IORESOURCE_PREFETCH
321 ? PCI_ADDRESS_SPACE_MEM_PREFETCH
322 : PCI_ADDRESS_SPACE_MEM;
324 /* map physical memory */
325 pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
326 pci_dev->v_addrs[i].u.r_virtbase =
327 mmap(NULL,
328 (cur_region->size + 0xFFF) & 0xFFFFF000,
329 PROT_WRITE | PROT_READ, MAP_SHARED,
330 cur_region->resource_fd, (off_t) 0);
332 if (pci_dev->v_addrs[i].u.r_virtbase == MAP_FAILED) {
333 pci_dev->v_addrs[i].u.r_virtbase = NULL;
334 fprintf(stderr, "%s: Error: Couldn't mmap 0x%x!"
335 "\n", __func__,
336 (uint32_t) (cur_region->base_addr));
337 return -1;
339 pci_dev->v_addrs[i].r_size = cur_region->size;
340 pci_dev->v_addrs[i].e_size = 0;
342 /* add offset */
343 pci_dev->v_addrs[i].u.r_virtbase +=
344 (cur_region->base_addr & 0xFFF);
346 pci_register_io_region((PCIDevice *) pci_dev, i,
347 cur_region->size, t,
348 assigned_dev_iomem_map);
349 continue;
351 /* handle port io regions */
352 pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
353 pci_dev->v_addrs[i].u.r_baseport = cur_region->base_addr;
354 pci_dev->v_addrs[i].r_size = cur_region->size;
355 pci_dev->v_addrs[i].e_size = 0;
357 pci_register_io_region((PCIDevice *) pci_dev, i,
358 cur_region->size, PCI_ADDRESS_SPACE_IO,
359 assigned_dev_ioport_map);
361 /* not relevant for port io */
362 pci_dev->v_addrs[i].memory_index = 0;
365 /* success */
366 return 0;
369 static int get_real_device(AssignedDevice *pci_dev, uint8_t r_bus,
370 uint8_t r_dev, uint8_t r_func)
372 char dir[128], name[128];
373 int fd, r = 0;
374 FILE *f;
375 unsigned long long start, end, size, flags;
376 PCIRegion *rp;
377 PCIDevRegions *dev = &pci_dev->real_device;
379 dev->region_number = 0;
381 snprintf(dir, sizeof(dir), "/sys/bus/pci/devices/0000:%02x:%02x.%x/",
382 r_bus, r_dev, r_func);
384 snprintf(name, sizeof(name), "%sconfig", dir);
386 fd = open(name, O_RDWR);
387 if (fd == -1) {
388 fprintf(stderr, "%s: %s: %m\n", __func__, name);
389 return 1;
391 dev->config_fd = fd;
392 again:
393 r = read(fd, pci_dev->dev.config, sizeof(pci_dev->dev.config));
394 if (r < 0) {
395 if (errno == EINTR || errno == EAGAIN)
396 goto again;
397 fprintf(stderr, "%s: read failed, errno = %d\n", __func__, errno);
400 snprintf(name, sizeof(name), "%sresource", dir);
402 f = fopen(name, "r");
403 if (f == NULL) {
404 fprintf(stderr, "%s: %s: %m\n", __func__, name);
405 return 1;
408 for (r = 0; r < MAX_IO_REGIONS; r++) {
409 if (fscanf(f, "%lli %lli %lli\n", &start, &end, &flags) != 3)
410 break;
412 rp = dev->regions + r;
413 rp->valid = 0;
414 size = end - start + 1;
415 flags &= IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH;
416 if (size == 0 || (flags & ~IORESOURCE_PREFETCH) == 0)
417 continue;
418 if (flags & IORESOURCE_MEM) {
419 flags &= ~IORESOURCE_IO;
420 snprintf(name, sizeof(name), "%sresource%d", dir, r);
421 fd = open(name, O_RDWR);
422 if (fd == -1)
423 continue; /* probably ROM */
424 rp->resource_fd = fd;
425 } else
426 flags &= ~IORESOURCE_PREFETCH;
428 rp->type = flags;
429 rp->valid = 1;
430 rp->base_addr = start;
431 rp->size = size;
432 DEBUG("region %d size %d start 0x%llx type %d resource_fd %d\n",
433 r, rp->size, start, rp->type, rp->resource_fd);
435 fclose(f);
437 dev->region_number = r;
438 return 0;
441 static LIST_HEAD(, AssignedDevInfo) adev_head;
443 void free_assigned_device(AssignedDevInfo *adev)
445 AssignedDevice *dev = adev->assigned_dev;
447 if (dev) {
448 int i;
450 for (i = 0; i < dev->real_device.region_number; i++) {
451 PCIRegion *pci_region = &dev->real_device.regions[i];
452 AssignedDevRegion *region = &dev->v_addrs[i];
454 if (!pci_region->valid || !(pci_region->type & IORESOURCE_MEM))
455 continue;
457 if (region->u.r_virtbase) {
458 int ret = munmap(region->u.r_virtbase,
459 (pci_region->size + 0xFFF) & 0xFFFFF000);
460 if (ret != 0)
461 fprintf(stderr,
462 "Failed to unmap assigned device region: %s\n",
463 strerror(errno));
467 if (dev->real_device.config_fd) {
468 close(dev->real_device.config_fd);
469 dev->real_device.config_fd = 0;
472 pci_unregister_device(&dev->dev);
473 adev->assigned_dev = dev = NULL;
476 LIST_REMOVE(adev, next);
477 qemu_free(adev);
/*
 * Build the identifier used for an assigned device in the KVM assignment
 * calls: the bus number in bits 15..8 and devfn in bits 7..0.
 */
static uint32_t calc_assigned_dev_id(uint8_t bus, uint8_t devfn)
{
    uint32_t id = bus;

    id <<= 8;
    id |= devfn;
    return id;
}
485 /* The pci config space got updated. Check if irq numbers have changed
486 * for our devices
488 void assigned_dev_update_irq(PCIDevice *d)
490 AssignedDevInfo *adev;
492 adev = LIST_FIRST(&adev_head);
493 while (adev) {
494 AssignedDevInfo *next = LIST_NEXT(adev, next);
495 AssignedDevice *assigned_dev = adev->assigned_dev;
496 int irq, r;
498 irq = pci_map_irq(&assigned_dev->dev, assigned_dev->intpin);
499 irq = piix_get_irq(irq);
501 #ifdef TARGET_IA64
502 irq = ipf_map_irq(d, irq);
503 #endif
505 if (irq != assigned_dev->girq) {
506 struct kvm_assigned_irq assigned_irq_data;
508 memset(&assigned_irq_data, 0, sizeof(assigned_irq_data));
509 assigned_irq_data.assigned_dev_id =
510 calc_assigned_dev_id(assigned_dev->h_busnr,
511 (uint8_t) assigned_dev->h_devfn);
512 assigned_irq_data.guest_irq = irq;
513 assigned_irq_data.host_irq = assigned_dev->real_device.irq;
514 r = kvm_assign_irq(kvm_context, &assigned_irq_data);
515 if (r < 0) {
516 fprintf(stderr, "Failed to assign irq for \"%s\": %s\n",
517 adev->name, strerror(-r));
518 fprintf(stderr, "Perhaps you are assigning a device "
519 "that shares an IRQ with another device?\n");
520 LIST_REMOVE(adev, next);
521 free_assigned_device(adev);
522 adev = next;
523 continue;
525 assigned_dev->girq = irq;
528 adev = next;
532 struct PCIDevice *init_assigned_device(AssignedDevInfo *adev, PCIBus *bus)
534 int r;
535 AssignedDevice *dev;
536 uint8_t e_device, e_intx;
537 struct kvm_assigned_pci_dev assigned_dev_data;
539 DEBUG("Registering real physical device %s (bus=%x dev=%x func=%x)\n",
540 adev->name, adev->bus, adev->dev, adev->func);
542 dev = (AssignedDevice *)
543 pci_register_device(bus, adev->name, sizeof(AssignedDevice),
544 -1, assigned_dev_pci_read_config,
545 assigned_dev_pci_write_config);
546 if (NULL == dev) {
547 fprintf(stderr, "%s: Error: Couldn't register real device %s\n",
548 __func__, adev->name);
549 return NULL;
552 if (get_real_device(dev, adev->bus, adev->dev, adev->func)) {
553 fprintf(stderr, "%s: Error: Couldn't get real device (%s)!\n",
554 __func__, adev->name);
555 goto out;
558 /* handle real device's MMIO/PIO BARs */
559 if (assigned_dev_register_regions(dev->real_device.regions,
560 dev->real_device.region_number,
561 dev))
562 goto out;
564 /* handle interrupt routing */
565 e_device = (dev->dev.devfn >> 3) & 0x1f;
566 e_intx = dev->dev.config[0x3d] - 1;
567 dev->intpin = e_intx;
568 dev->run = 0;
569 dev->girq = 0;
570 dev->h_busnr = adev->bus;
571 dev->h_devfn = PCI_DEVFN(adev->dev, adev->func);
573 memset(&assigned_dev_data, 0, sizeof(assigned_dev_data));
574 assigned_dev_data.assigned_dev_id =
575 calc_assigned_dev_id(dev->h_busnr, (uint32_t)dev->h_devfn);
576 assigned_dev_data.busnr = dev->h_busnr;
577 assigned_dev_data.devfn = dev->h_devfn;
579 #ifdef KVM_CAP_IOMMU
580 /* We always enable the IOMMU if present
581 * (or when not disabled on the command line)
583 r = kvm_check_extension(kvm_context, KVM_CAP_IOMMU);
584 if (r && !adev->disable_iommu)
585 assigned_dev_data.flags |= KVM_DEV_ASSIGN_ENABLE_IOMMU;
586 #endif
588 r = kvm_assign_pci_device(kvm_context, &assigned_dev_data);
589 if (r < 0) {
590 fprintf(stderr, "Failed to assign device \"%s\" : %s\n",
591 adev->name, strerror(-r));
592 goto out;
595 adev->assigned_dev = dev;
596 return &dev->dev;
598 out:
599 pci_unregister_device(&dev->dev);
600 return NULL;
604 * Syntax to assign device:
606 * -pcidevice host=bus:dev.func[,dma=none][,name=Foo]
608 * Example:
609 * -pcidevice host=00:13.0,dma=pvdma
611 * dma can currently only be 'none' to disable iommu support.
613 AssignedDevInfo *add_assigned_device(const char *arg)
615 char *cp, *cp1;
616 char device[8];
617 char dma[6];
618 int r;
619 AssignedDevInfo *adev;
621 adev = qemu_mallocz(sizeof(AssignedDevInfo));
622 if (adev == NULL) {
623 fprintf(stderr, "%s: Out of memory\n", __func__);
624 return NULL;
626 r = get_param_value(device, sizeof(device), "host", arg);
627 r = get_param_value(adev->name, sizeof(adev->name), "name", arg);
628 if (!r)
629 snprintf(adev->name, sizeof(adev->name), "%s", device);
631 #ifdef KVM_CAP_IOMMU
632 r = get_param_value(dma, sizeof(dma), "dma", arg);
633 if (r && !strncmp(dma, "none", 4))
634 adev->disable_iommu = 1;
635 #endif
636 cp = device;
637 adev->bus = strtoul(cp, &cp1, 16);
638 if (*cp1 != ':')
639 goto bad;
640 cp = cp1 + 1;
642 adev->dev = strtoul(cp, &cp1, 16);
643 if (*cp1 != '.')
644 goto bad;
645 cp = cp1 + 1;
647 adev->func = strtoul(cp, &cp1, 16);
649 LIST_INSERT_HEAD(&adev_head, adev, next);
650 return adev;
651 bad:
652 fprintf(stderr, "pcidevice argument parse error; "
653 "please check the help text for usage\n");
654 qemu_free(adev);
655 return NULL;
658 void add_assigned_devices(PCIBus *bus, const char **devices, int n_devices)
660 int i;
662 for (i = 0; i < n_devices; i++) {
663 struct AssignedDevInfo *adev;
665 adev = add_assigned_device(devices[i]);
666 if (!adev) {
667 fprintf(stderr, "Could not add assigned device %s\n", devices[i]);
668 continue;
669 exit(1);
672 if (!init_assigned_device(adev, bus)) {
673 fprintf(stderr, "Failed to initialize assigned device %s\n",
674 devices[i]);
675 exit(1);