/*
 * VFIO: IOMMU DMA mapping support for TCE on POWER
 *
 * Copyright (C) 2013 IBM Corp.  All rights reserved.
 *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio_iommu_type1.c:
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 */
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/err.h>
#include <linux/vfio.h>
#include <asm/iommu.h>
#include <asm/tce.h>
#define DRIVER_VERSION	"0.1"
#define DRIVER_AUTHOR	"aik@ozlabs.ru"
#define DRIVER_DESC	"VFIO IOMMU SPAPR TCE"
static void tce_iommu_detach_group(void *iommu_data,
		struct iommu_group *iommu_group);
static long try_increment_locked_vm(long npages)
{
	long ret = 0, locked, lock_limit;

	if (!current || !current->mm)
		return -ESRCH; /* process exited */

	down_write(&current->mm->mmap_sem);
	locked = current->mm->locked_vm + npages;
	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	if (locked > lock_limit && !capable(CAP_IPC_LOCK))
		ret = -ENOMEM;
	else
		current->mm->locked_vm += npages;

	pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
			npages << PAGE_SHIFT,
			current->mm->locked_vm << PAGE_SHIFT,
			rlimit(RLIMIT_MEMLOCK),
			ret ? " - exceeded" : "");

	up_write(&current->mm->mmap_sem);

	return ret;
}
static void decrement_locked_vm(long npages)
{
	if (!current || !current->mm || !npages)
		return; /* process exited */

	down_write(&current->mm->mmap_sem);
	if (WARN_ON_ONCE(npages > current->mm->locked_vm))
		npages = current->mm->locked_vm;
	current->mm->locked_vm -= npages;
	pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
			npages << PAGE_SHIFT,
			current->mm->locked_vm << PAGE_SHIFT,
			rlimit(RLIMIT_MEMLOCK));
	up_write(&current->mm->mmap_sem);
}
/*
 * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
 *
 * This code handles mapping and unmapping of user data buffers
 * into DMA'ble space using the IOMMU
 */
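/*
 * Illustrative userspace sketch (not part of this driver): the typical call
 * sequence against this IOMMU backend.  The group number, buffer address and
 * sizes below are placeholders and error handling is omitted.
 *
 *	int container = open("/dev/vfio/vfio", O_RDWR);
 *	int group = open("/dev/vfio/26", O_RDWR);
 *	struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info) };
 *	struct vfio_iommu_type1_dma_map map = { .argsz = sizeof(map) };
 *
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU);
 *	ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
 *	ioctl(container, VFIO_IOMMU_ENABLE);
 *
 *	map.vaddr = (__u64)(unsigned long)buffer;
 *	map.size = 1024 * 1024;
 *	map.iova = info.dma32_window_start;
 *	map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
 *	ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
 */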
/*
 * The container descriptor supports only a single group per container.
 * Required by the API as the container is not supplied with the IOMMU group
 * at the moment of initialization.
 */
struct tce_container {
	struct mutex lock;
	struct iommu_group *grp;
	bool enabled;
	unsigned long locked_pages;
};
static bool tce_page_is_contained(struct page *page, unsigned page_shift)
{
	/*
	 * Check that the TCE table granularity is not bigger than the size of
	 * a page we just found. Otherwise the hardware can get access to
	 * a bigger memory chunk than it should.
	 */
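	/*
	 * For example, a 64K TCE page (page_shift == 16) backed by an
	 * ordinary 4K system page fails this check; it only passes when the
	 * pinned page belongs to a compound page (e.g. a hugepage) at least
	 * as large as the TCE page.
	 */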
	return (PAGE_SHIFT + compound_order(compound_head(page))) >=
			page_shift;
}
static long tce_iommu_find_table(struct tce_container *container,
		phys_addr_t ioba, struct iommu_table **ptbl)
{
	long i;
	struct iommu_table_group *table_group;

	table_group = iommu_group_get_iommudata(container->grp);
	if (!table_group)
		return -1;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = table_group->tables[i];

		if (tbl) {
			unsigned long entry = ioba >> tbl->it_page_shift;
			unsigned long start = tbl->it_offset;
			unsigned long end = start + tbl->it_size;

			if ((start <= entry) && (entry < end)) {
				*ptbl = tbl;
				return i;
			}
		}
	}

	return -1;
}
static int tce_iommu_enable(struct tce_container *container)
{
	int ret = 0;
	unsigned long locked;
	struct iommu_table *tbl;
	struct iommu_table_group *table_group;

	if (!current->mm)
		return -ESRCH; /* process exited */

	if (container->enabled)
		return -EBUSY;

	/*
	 * When userspace pages are mapped into the IOMMU, they are effectively
	 * locked memory, so, theoretically, we need to update the accounting
	 * of locked pages on each map and unmap. For powerpc, the map unmap
	 * paths can be very hot, though, and the accounting would kill
	 * performance, especially since it would be difficult to impossible
	 * to handle the accounting in real mode only.
	 *
	 * To address that, rather than precisely accounting every page, we
	 * instead account for a worst case on locked memory when the iommu is
	 * enabled and disabled. The worst case upper bound on locked memory
	 * is the size of the whole iommu window, which is usually relatively
	 * small (compared to total memory sizes) on POWER hardware.
	 *
	 * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits,
	 * that would effectively kill the guest at random points, much better
	 * enforcing the limit based on the max that the guest can map.
	 *
	 * Unfortunately at the moment it counts whole tables, no matter how
	 * much memory the guest has. I.e. for 4GB guest and 4 IOMMU groups
	 * each with 2GB DMA window, 8GB will be counted here. The reason for
	 * this is that we cannot tell here the amount of RAM used by the guest
	 * as this information is only available from KVM and VFIO is
	 * KVM agnostic.
	 */
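	/*
	 * Illustrative arithmetic: with 4K system pages (PAGE_SHIFT == 12),
	 * a single 2GB DMA window is charged as
	 * (it_size << it_page_shift) >> PAGE_SHIFT = 2GB / 4K = 524288 pages
	 * against RLIMIT_MEMLOCK, regardless of how much of the window the
	 * guest actually maps.
	 */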
	table_group = iommu_group_get_iommudata(container->grp);
	if (!table_group)
		return -ENODEV;

	tbl = table_group->tables[0];
	locked = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
	ret = try_increment_locked_vm(locked);
	if (ret)
		return ret;

	container->locked_pages = locked;

	container->enabled = true;

	return ret;
}
static void tce_iommu_disable(struct tce_container *container)
{
	if (!container->enabled)
		return;

	container->enabled = false;

	decrement_locked_vm(container->locked_pages);
}
static void *tce_iommu_open(unsigned long arg)
{
	struct tce_container *container;

	if (arg != VFIO_SPAPR_TCE_IOMMU) {
		pr_err("tce_vfio: Wrong IOMMU type\n");
		return ERR_PTR(-EINVAL);
	}

	container = kzalloc(sizeof(*container), GFP_KERNEL);
	if (!container)
		return ERR_PTR(-ENOMEM);

	mutex_init(&container->lock);

	return container;
}
static void tce_iommu_release(void *iommu_data)
{
	struct tce_container *container = iommu_data;

	WARN_ON(container->grp);

	if (container->grp)
		tce_iommu_detach_group(iommu_data, container->grp);

	tce_iommu_disable(container);
	mutex_destroy(&container->lock);

	kfree(container);
}
static void tce_iommu_unuse_page(struct tce_container *container,
		unsigned long oldtce)
{
	struct page *page;

	if (!(oldtce & (TCE_PCI_READ | TCE_PCI_WRITE)))
		return;

	page = pfn_to_page(oldtce >> PAGE_SHIFT);

	if (oldtce & TCE_PCI_WRITE)
		SetPageDirty(page);

	put_page(page);
}
static int tce_iommu_clear(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long pages)
{
	unsigned long oldtce;

	for ( ; pages; --pages, ++entry) {
		oldtce = iommu_clear_tce(tbl, entry);
		if (!oldtce)
			continue;

		tce_iommu_unuse_page(container, oldtce);
	}

	return 0;
}
static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
{
	struct page *page = NULL;
	enum dma_data_direction direction = iommu_tce_direction(tce);

	if (get_user_pages_fast(tce & PAGE_MASK, 1,
			direction != DMA_TO_DEVICE, &page) != 1)
		return -EFAULT;

	*hpa = __pa((unsigned long) page_address(page));

	return 0;
}
static long tce_iommu_build(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long tce, unsigned long pages)
{
	long i, ret = 0;
	struct page *page;
	unsigned long hpa;
	enum dma_data_direction direction = iommu_tce_direction(tce);

	for (i = 0; i < pages; ++i) {
		unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;

		ret = tce_iommu_use_page(tce, &hpa);
		if (ret)
			break;

		page = pfn_to_page(hpa >> PAGE_SHIFT);
		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
			ret = -EPERM;
			break;
		}

		hpa |= offset;
		ret = iommu_tce_build(tbl, entry + i, (unsigned long) __va(hpa),
				direction);
		if (ret) {
			tce_iommu_unuse_page(container, hpa);
			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
					__func__, entry << tbl->it_page_shift,
					tce, ret);
			break;
		}

		tce += IOMMU_PAGE_SIZE(tbl);
	}

	if (ret)
		tce_iommu_clear(container, tbl, entry, i);

	return ret;
}
static long tce_iommu_ioctl(void *iommu_data,
				 unsigned int cmd, unsigned long arg)
{
	struct tce_container *container = iommu_data;
	unsigned long minsz;
	long ret;

	switch (cmd) {
	case VFIO_CHECK_EXTENSION:
		switch (arg) {
		case VFIO_SPAPR_TCE_IOMMU:
			ret = 1;
			break;
		default:
			ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg);
			break;
		}

		return (ret < 0) ? 0 : ret;

	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
		struct vfio_iommu_spapr_tce_info info;
		struct iommu_table *tbl;
		struct iommu_table_group *table_group;

		if (WARN_ON(!container->grp))
			return -ENXIO;

		table_group = iommu_group_get_iommudata(container->grp);

		tbl = table_group->tables[0];
		if (WARN_ON_ONCE(!tbl))
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
				dma32_window_size);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		info.dma32_window_start = tbl->it_offset << tbl->it_page_shift;
		info.dma32_window_size = tbl->it_size << tbl->it_page_shift;
		info.flags = 0;

		if (copy_to_user((void __user *)arg, &info, minsz))
			return -EFAULT;

		return 0;
	}
	case VFIO_IOMMU_MAP_DMA: {
		struct vfio_iommu_type1_dma_map param;
		struct iommu_table *tbl = NULL;
		unsigned long tce;
		long num;

		if (!container->enabled)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
				VFIO_DMA_MAP_FLAG_WRITE))
			return -EINVAL;

		num = tce_iommu_find_table(container, param.iova, &tbl);
		if (num < 0)
			return -ENXIO;

		if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
				(param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
			return -EINVAL;

		/* iova is checked by the IOMMU API */
		tce = param.vaddr;
		if (param.flags & VFIO_DMA_MAP_FLAG_READ)
			tce |= TCE_PCI_READ;
		if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
			tce |= TCE_PCI_WRITE;

		ret = iommu_tce_put_param_check(tbl, param.iova, tce);
		if (ret)
			return ret;

		ret = tce_iommu_build(container, tbl,
				param.iova >> tbl->it_page_shift,
				tce, param.size >> tbl->it_page_shift);

		iommu_flush_tce(tbl);

		return ret;
	}
	case VFIO_IOMMU_UNMAP_DMA: {
		struct vfio_iommu_type1_dma_unmap param;
		struct iommu_table *tbl = NULL;
		long num;

		if (!container->enabled)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
				size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		num = tce_iommu_find_table(container, param.iova, &tbl);
		if (num < 0)
			return -ENXIO;

		if (param.size & ~IOMMU_PAGE_MASK(tbl))
			return -EINVAL;

		ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
				param.size >> tbl->it_page_shift);
		if (ret)
			return ret;

		ret = tce_iommu_clear(container, tbl,
				param.iova >> tbl->it_page_shift,
				param.size >> tbl->it_page_shift);
		iommu_flush_tce(tbl);

		return ret;
	}
	case VFIO_IOMMU_ENABLE:
		mutex_lock(&container->lock);
		ret = tce_iommu_enable(container);
		mutex_unlock(&container->lock);
		return ret;

	case VFIO_IOMMU_DISABLE:
		mutex_lock(&container->lock);
		tce_iommu_disable(container);
		mutex_unlock(&container->lock);
		return 0;

	case VFIO_EEH_PE_OP:
		return vfio_spapr_iommu_eeh_ioctl(container->grp,
						  cmd, arg);
	}

	return -ENOTTY;
}
static void tce_iommu_release_ownership(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	int i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = table_group->tables[i];

		if (!tbl)
			continue;

		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
		if (tbl->it_map)
			iommu_release_ownership(tbl);
	}
}
static int tce_iommu_take_ownership(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	int i, j, rc = 0;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = table_group->tables[i];

		if (!tbl || !tbl->it_map)
			continue;

		rc = iommu_take_ownership(tbl);
		if (rc) {
			for (j = 0; j < i; ++j)
				iommu_release_ownership(
						table_group->tables[j]);

			return rc;
		}
	}

	return 0;
}
static void tce_iommu_release_ownership_ddw(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	table_group->ops->release_ownership(table_group);
}

static long tce_iommu_take_ownership_ddw(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	table_group->ops->take_ownership(table_group);

	return 0;
}
static int tce_iommu_attach_group(void *iommu_data,
		struct iommu_group *iommu_group)
{
	int ret;
	struct tce_container *container = iommu_data;
	struct iommu_table_group *table_group;

	mutex_lock(&container->lock);

	/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
			iommu_group_id(iommu_group), iommu_group); */
	if (container->grp) {
		pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
				iommu_group_id(container->grp),
				iommu_group_id(iommu_group));
		ret = -EBUSY;
		goto unlock_exit;
	}

	if (container->enabled) {
		pr_err("tce_vfio: attaching group #%u to enabled container\n",
				iommu_group_id(iommu_group));
		ret = -EBUSY;
		goto unlock_exit;
	}

	table_group = iommu_group_get_iommudata(iommu_group);
	if (!table_group) {
		ret = -ENXIO;
		goto unlock_exit;
	}

	if (!table_group->ops || !table_group->ops->take_ownership ||
			!table_group->ops->release_ownership)
		ret = tce_iommu_take_ownership(container, table_group);
	else
		ret = tce_iommu_take_ownership_ddw(container, table_group);

	if (!ret)
		container->grp = iommu_group;

unlock_exit:
	mutex_unlock(&container->lock);

	return ret;
}
static void tce_iommu_detach_group(void *iommu_data,
		struct iommu_group *iommu_group)
{
	struct tce_container *container = iommu_data;
	struct iommu_table_group *table_group;

	mutex_lock(&container->lock);
	if (iommu_group != container->grp) {
		pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
				iommu_group_id(iommu_group),
				iommu_group_id(container->grp));
		goto unlock_exit;
	}

	if (container->enabled) {
		pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
				iommu_group_id(container->grp));
		tce_iommu_disable(container);
	}

	/* pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
	   iommu_group_id(iommu_group), iommu_group); */
	container->grp = NULL;

	table_group = iommu_group_get_iommudata(iommu_group);
	BUG_ON(!table_group);

	if (!table_group->ops || !table_group->ops->release_ownership)
		tce_iommu_release_ownership(container, table_group);
	else
		tce_iommu_release_ownership_ddw(container, table_group);

unlock_exit:
	mutex_unlock(&container->lock);
}
const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
	.name		= "iommu-vfio-powerpc",
	.owner		= THIS_MODULE,
	.open		= tce_iommu_open,
	.release	= tce_iommu_release,
	.ioctl		= tce_iommu_ioctl,
	.attach_group	= tce_iommu_attach_group,
	.detach_group	= tce_iommu_detach_group,
};
static int __init tce_iommu_init(void)
{
	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
}

static void __exit tce_iommu_cleanup(void)
{
	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
}

module_init(tce_iommu_init);
module_exit(tce_iommu_cleanup);
MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);