drivers/vfio/vfio_iommu_spapr_tce.c
/*
 * VFIO: IOMMU DMA mapping support for TCE on POWER
 *
 * Copyright (C) 2013 IBM Corp.  All rights reserved.
 *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio_iommu_type1.c:
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 */

#include <linux/module.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/err.h>
#include <linux/vfio.h>
#include <linux/vmalloc.h>
#include <asm/iommu.h>
#include <asm/tce.h>
#include <asm/mmu_context.h>

#define DRIVER_VERSION	"0.1"
#define DRIVER_AUTHOR	"aik@ozlabs.ru"
#define DRIVER_DESC	"VFIO IOMMU SPAPR TCE"
static void tce_iommu_detach_group(void *iommu_data,
		struct iommu_group *iommu_group);
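
/*
 * Helpers accounting pinned pages against the RLIMIT_MEMLOCK limit
 * of the mm that owns the container.
 */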
static long try_increment_locked_vm(struct mm_struct *mm, long npages)
{
	long ret = 0, locked, lock_limit;

	if (WARN_ON_ONCE(!mm))
		return -EPERM;

	if (!npages)
		return 0;

	down_write(&mm->mmap_sem);
	locked = mm->locked_vm + npages;
	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	if (locked > lock_limit && !capable(CAP_IPC_LOCK))
		ret = -ENOMEM;
	else
		mm->locked_vm += npages;

	pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
			npages << PAGE_SHIFT,
			mm->locked_vm << PAGE_SHIFT,
			rlimit(RLIMIT_MEMLOCK),
			ret ? " - exceeded" : "");

	up_write(&mm->mmap_sem);

	return ret;
}
static void decrement_locked_vm(struct mm_struct *mm, long npages)
{
	if (!mm || !npages)
		return;

	down_write(&mm->mmap_sem);
	if (WARN_ON_ONCE(npages > mm->locked_vm))
		npages = mm->locked_vm;
	mm->locked_vm -= npages;
	pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
			npages << PAGE_SHIFT,
			mm->locked_vm << PAGE_SHIFT,
			rlimit(RLIMIT_MEMLOCK));
	up_write(&mm->mmap_sem);
}
/*
 * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
 *
 * This code handles mapping and unmapping of user data buffers
 * into DMA'ble space using the IOMMU
 */
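
/*
 * A rough sketch of the expected userspace flow against this driver
 * (v1 container; fds, structure setup and error handling omitted,
 * names taken from the VFIO uAPI):
 *
 *	ioctl(container_fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_IOMMU);
 *	ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container_fd);
 *	ioctl(container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
 *	ioctl(container_fd, VFIO_IOMMU_ENABLE);
 *	ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map);
 *	...
 *	ioctl(container_fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
 *	ioctl(container_fd, VFIO_IOMMU_DISABLE);
 */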
struct tce_iommu_group {
	struct list_head next;
	struct iommu_group *grp;
};

/*
 * A container needs to remember which preregistered region it has
 * referenced to do proper cleanup at the userspace process exit.
 */
struct tce_iommu_prereg {
	struct list_head next;
	struct mm_iommu_table_group_mem_t *mem;
};

/*
 * The container descriptor supports only a single group per container.
 * Required by the API as the container is not supplied with the IOMMU group
 * at the moment of initialization.
 */
struct tce_container {
	struct mutex lock;
	bool enabled;
	bool v2;
	bool def_window_pending;
	unsigned long locked_pages;
	struct mm_struct *mm;
	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
	struct list_head group_list;
	struct list_head prereg_list;
};
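
/*
 * Latch the calling process's mm on first use; all later ioctls which
 * need an mm must come from the same process.
 */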
static long tce_iommu_mm_set(struct tce_container *container)
{
	if (container->mm) {
		if (container->mm == current->mm)
			return 0;
		return -EPERM;
	}
	BUG_ON(!current->mm);
	container->mm = current->mm;
	atomic_inc(&container->mm->mm_count);

	return 0;
}
static long tce_iommu_prereg_free(struct tce_container *container,
		struct tce_iommu_prereg *tcemem)
{
	long ret;

	ret = mm_iommu_put(container->mm, tcemem->mem);
	if (ret)
		return ret;

	list_del(&tcemem->next);
	kfree(tcemem);

	return 0;
}
static long tce_iommu_unregister_pages(struct tce_container *container,
		__u64 vaddr, __u64 size)
{
	struct mm_iommu_table_group_mem_t *mem;
	struct tce_iommu_prereg *tcemem;
	bool found = false;

	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
		return -EINVAL;

	mem = mm_iommu_find(container->mm, vaddr, size >> PAGE_SHIFT);
	if (!mem)
		return -ENOENT;

	list_for_each_entry(tcemem, &container->prereg_list, next) {
		if (tcemem->mem == mem) {
			found = true;
			break;
		}
	}

	if (!found)
		return -ENOENT;

	return tce_iommu_prereg_free(container, tcemem);
}
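
/*
 * Preregister a chunk of userspace memory with the mm_iommu API and
 * remember the reference on prereg_list so it can be dropped when the
 * region is unregistered or the container is torn down.
 */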
static long tce_iommu_register_pages(struct tce_container *container,
		__u64 vaddr, __u64 size)
{
	long ret = 0;
	struct mm_iommu_table_group_mem_t *mem = NULL;
	struct tce_iommu_prereg *tcemem;
	unsigned long entries = size >> PAGE_SHIFT;

	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
			((vaddr + size) < vaddr))
		return -EINVAL;

	mem = mm_iommu_find(container->mm, vaddr, entries);
	if (mem) {
		list_for_each_entry(tcemem, &container->prereg_list, next) {
			if (tcemem->mem == mem)
				return -EBUSY;
		}
	}

	ret = mm_iommu_get(container->mm, vaddr, entries, &mem);
	if (ret)
		return ret;

	tcemem = kzalloc(sizeof(*tcemem), GFP_KERNEL);
	if (!tcemem) {
		mm_iommu_put(container->mm, mem);
		return -ENOMEM;
	}

	tcemem->mem = mem;
	list_add(&tcemem->next, &container->prereg_list);

	container->enabled = true;

	return 0;
}
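
/*
 * The userspace view is a per-TCE cache of the userspace addresses that
 * were mapped; the v2 path uses it to find the preregistered region again
 * on unmap. Its size is charged to locked_vm as well.
 */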
static long tce_iommu_userspace_view_alloc(struct iommu_table *tbl,
		struct mm_struct *mm)
{
	unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) *
			tbl->it_size, PAGE_SIZE);
	unsigned long *uas;
	long ret;

	BUG_ON(tbl->it_userspace);

	ret = try_increment_locked_vm(mm, cb >> PAGE_SHIFT);
	if (ret)
		return ret;

	uas = vzalloc(cb);
	if (!uas) {
		decrement_locked_vm(mm, cb >> PAGE_SHIFT);
		return -ENOMEM;
	}
	tbl->it_userspace = uas;

	return 0;
}
static void tce_iommu_userspace_view_free(struct iommu_table *tbl,
		struct mm_struct *mm)
{
	unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) *
			tbl->it_size, PAGE_SIZE);

	if (!tbl->it_userspace)
		return;

	vfree(tbl->it_userspace);
	tbl->it_userspace = NULL;
	decrement_locked_vm(mm, cb >> PAGE_SHIFT);
}
static bool tce_page_is_contained(struct page *page, unsigned page_shift)
{
	/*
	 * Check that the TCE table granularity is not bigger than the size of
	 * a page we just found. Otherwise the hardware can get access to
	 * a bigger memory chunk than it should.
	 */
	return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
}

static inline bool tce_groups_attached(struct tce_container *container)
{
	return !list_empty(&container->group_list);
}
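
/* Find the DMA window (if any) which contains the given bus address */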
static long tce_iommu_find_table(struct tce_container *container,
		phys_addr_t ioba, struct iommu_table **ptbl)
{
	long i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (tbl) {
			unsigned long entry = ioba >> tbl->it_page_shift;
			unsigned long start = tbl->it_offset;
			unsigned long end = start + tbl->it_size;

			if ((start <= entry) && (entry < end)) {
				*ptbl = tbl;
				return i;
			}
		}
	}

	return -1;
}
static int tce_iommu_find_free_table(struct tce_container *container)
{
	int i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		if (!container->tables[i])
			return i;
	}

	return -ENOSPC;
}
static int tce_iommu_enable(struct tce_container *container)
{
	int ret = 0;
	unsigned long locked;
	struct iommu_table_group *table_group;
	struct tce_iommu_group *tcegrp;

	if (container->enabled)
		return -EBUSY;

	/*
	 * When userspace pages are mapped into the IOMMU, they are effectively
	 * locked memory, so, theoretically, we need to update the accounting
	 * of locked pages on each map and unmap.  For powerpc, the map/unmap
	 * paths can be very hot, though, and the accounting would kill
	 * performance, especially since it would be difficult, if not
	 * impossible, to handle the accounting in real mode only.
	 *
	 * To address that, rather than precisely accounting every page, we
	 * instead account for a worst case on locked memory when the iommu is
	 * enabled and disabled.  The worst case upper bound on locked memory
	 * is the size of the whole iommu window, which is usually relatively
	 * small (compared to total memory sizes) on POWER hardware.
	 *
	 * Also, we don't have a nice way to fail on H_PUT_TCE due to ulimits;
	 * that would effectively kill the guest at random points, so it is
	 * much better to enforce the limit based on the maximum that the
	 * guest can map.
	 *
	 * Unfortunately, at the moment this counts whole tables, no matter how
	 * much memory the guest has. I.e. for a 4GB guest and 4 IOMMU groups,
	 * each with a 2GB DMA window, 8GB will be counted here. The reason for
	 * this is that we cannot tell here the amount of RAM used by the guest
	 * as this information is only available from KVM and VFIO is
	 * KVM agnostic.
	 *
	 * So we do not allow enabling a container without a group attached
	 * as there is no way to know how much we should increment
	 * the locked_vm counter.
	 */
	if (!tce_groups_attached(container))
		return -ENODEV;

	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -ENODEV;

	if (!table_group->tce32_size)
		return -EPERM;

	ret = tce_iommu_mm_set(container);
	if (ret)
		return ret;

	locked = table_group->tce32_size >> PAGE_SHIFT;
	ret = try_increment_locked_vm(container->mm, locked);
	if (ret)
		return ret;

	container->locked_pages = locked;

	container->enabled = true;

	return ret;
}
static void tce_iommu_disable(struct tce_container *container)
{
	if (!container->enabled)
		return;

	container->enabled = false;

	BUG_ON(!container->mm);
	decrement_locked_vm(container->mm, container->locked_pages);
}
static void *tce_iommu_open(unsigned long arg)
{
	struct tce_container *container;

	if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) {
		pr_err("tce_vfio: Wrong IOMMU type\n");
		return ERR_PTR(-EINVAL);
	}

	container = kzalloc(sizeof(*container), GFP_KERNEL);
	if (!container)
		return ERR_PTR(-ENOMEM);

	mutex_init(&container->lock);
	INIT_LIST_HEAD_RCU(&container->group_list);
	INIT_LIST_HEAD_RCU(&container->prereg_list);

	container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;

	return container;
}
static int tce_iommu_clear(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long pages);
static void tce_iommu_free_table(struct tce_container *container,
		struct iommu_table *tbl);
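
/*
 * Container teardown: detach all groups, free any tables created by VFIO,
 * drop preregistered memory references and release the mm.
 */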
static void tce_iommu_release(void *iommu_data)
{
	struct tce_container *container = iommu_data;
	struct tce_iommu_group *tcegrp;
	long i;

	while (tce_groups_attached(container)) {
		tcegrp = list_first_entry(&container->group_list,
				struct tce_iommu_group, next);
		tce_iommu_detach_group(iommu_data, tcegrp->grp);
	}

	/*
	 * If VFIO created a table, it was not disposed
	 * by tce_iommu_detach_group() so do it now.
	 */
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (!tbl)
			continue;

		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
		tce_iommu_free_table(container, tbl);
	}

	while (!list_empty(&container->prereg_list)) {
		struct tce_iommu_prereg *tcemem;

		tcemem = list_first_entry(&container->prereg_list,
				struct tce_iommu_prereg, next);
		WARN_ON_ONCE(tce_iommu_prereg_free(container, tcemem));
	}

	tce_iommu_disable(container);
	if (container->mm)
		mmdrop(container->mm);
	mutex_destroy(&container->lock);

	kfree(container);
}
static void tce_iommu_unuse_page(struct tce_container *container,
		unsigned long hpa)
{
	struct page *page;

	page = pfn_to_page(hpa >> PAGE_SHIFT);
	put_page(page);
}
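
/*
 * Translate a userspace address into a host physical address via the
 * preregistered (v2) memory regions of the container's mm.
 */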
static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container,
		unsigned long tce, unsigned long size,
		unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem)
{
	long ret = 0;
	struct mm_iommu_table_group_mem_t *mem;

	mem = mm_iommu_lookup(container->mm, tce, size);
	if (!mem)
		return -EINVAL;

	ret = mm_iommu_ua_to_hpa(mem, tce, phpa);
	if (ret)
		return -EINVAL;

	*pmem = mem;

	return 0;
}
static void tce_iommu_unuse_page_v2(struct tce_container *container,
		struct iommu_table *tbl, unsigned long entry)
{
	struct mm_iommu_table_group_mem_t *mem = NULL;
	int ret;
	unsigned long hpa = 0;
	unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);

	if (!pua)
		return;

	ret = tce_iommu_prereg_ua_to_hpa(container, *pua, IOMMU_PAGE_SIZE(tbl),
			&hpa, &mem);
	if (ret)
		pr_debug("%s: tce %lx at #%lx was not cached, ret=%d\n",
				__func__, *pua, entry, ret);
	if (mem)
		mm_iommu_mapped_dec(mem);

	*pua = 0;
}
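
/*
 * Clear a range of TCE entries and drop the references taken when the
 * pages were mapped (v1 page pins, v2 preregistered region counters).
 */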
static int tce_iommu_clear(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long pages)
{
	unsigned long oldhpa;
	long ret;
	enum dma_data_direction direction;

	for ( ; pages; --pages, ++entry) {
		direction = DMA_NONE;
		oldhpa = 0;
		ret = iommu_tce_xchg(tbl, entry, &oldhpa, &direction);
		if (ret)
			continue;

		if (direction == DMA_NONE)
			continue;

		if (container->v2) {
			tce_iommu_unuse_page_v2(container, tbl, entry);
			continue;
		}

		tce_iommu_unuse_page(container, oldhpa);
	}

	return 0;
}
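
/* Pin a single userspace page (v1 path) and return its host physical address */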
static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
{
	struct page *page = NULL;
	enum dma_data_direction direction = iommu_tce_direction(tce);

	if (get_user_pages_fast(tce & PAGE_MASK, 1,
			direction != DMA_TO_DEVICE, &page) != 1)
		return -EFAULT;

	*hpa = __pa((unsigned long) page_address(page));

	return 0;
}
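
/*
 * v1 map path: pin every userspace page and program it into the TCE table,
 * rolling back the already programmed entries on failure.
 */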
static long tce_iommu_build(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long tce, unsigned long pages,
		enum dma_data_direction direction)
{
	long i, ret = 0;
	struct page *page;
	unsigned long hpa;
	enum dma_data_direction dirtmp;

	for (i = 0; i < pages; ++i) {
		unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;

		ret = tce_iommu_use_page(tce, &hpa);
		if (ret)
			break;

		page = pfn_to_page(hpa >> PAGE_SHIFT);
		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
			ret = -EPERM;
			break;
		}

		hpa |= offset;
		dirtmp = direction;
		ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
		if (ret) {
			tce_iommu_unuse_page(container, hpa);
			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
					__func__, entry << tbl->it_page_shift,
					tce, ret);
			break;
		}

		if (dirtmp != DMA_NONE)
			tce_iommu_unuse_page(container, hpa);

		tce += IOMMU_PAGE_SIZE(tbl);
	}

	if (ret)
		tce_iommu_clear(container, tbl, entry, i);

	return ret;
}
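
/*
 * v2 map path: pages must have been preregistered, so only translate the
 * userspace addresses and program TCEs; the userspace address is recorded
 * in the userspace view for later unmapping.
 */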
static long tce_iommu_build_v2(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long tce, unsigned long pages,
		enum dma_data_direction direction)
{
	long i, ret = 0;
	struct page *page;
	unsigned long hpa;
	enum dma_data_direction dirtmp;

	if (!tbl->it_userspace) {
		ret = tce_iommu_userspace_view_alloc(tbl, container->mm);
		if (ret)
			return ret;
	}

	for (i = 0; i < pages; ++i) {
		struct mm_iommu_table_group_mem_t *mem = NULL;
		unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl,
				entry + i);

		ret = tce_iommu_prereg_ua_to_hpa(container,
				tce, IOMMU_PAGE_SIZE(tbl), &hpa, &mem);
		if (ret)
			break;

		page = pfn_to_page(hpa >> PAGE_SHIFT);
		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
			ret = -EPERM;
			break;
		}

		/* Preserve offset within IOMMU page */
		hpa |= tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
		dirtmp = direction;

		/* The registered region is being unregistered */
		if (mm_iommu_mapped_inc(mem))
			break;

		ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
		if (ret) {
			/* dirtmp cannot be DMA_NONE here */
			tce_iommu_unuse_page_v2(container, tbl, entry + i);
			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
					__func__, entry << tbl->it_page_shift,
					tce, ret);
			break;
		}

		if (dirtmp != DMA_NONE)
			tce_iommu_unuse_page_v2(container, tbl, entry + i);

		*pua = tce;

		tce += IOMMU_PAGE_SIZE(tbl);
	}

	if (ret)
		tce_iommu_clear(container, tbl, entry, i);

	return ret;
}
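
/*
 * Ask the platform to allocate a TCE table of the requested geometry and
 * charge its size to the container's locked_vm.
 */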
static long tce_iommu_create_table(struct tce_container *container,
			struct iommu_table_group *table_group,
			int num,
			__u32 page_shift,
			__u64 window_size,
			__u32 levels,
			struct iommu_table **ptbl)
{
	long ret, table_size;

	table_size = table_group->ops->get_table_size(page_shift, window_size,
			levels);
	if (!table_size)
		return -EINVAL;

	ret = try_increment_locked_vm(container->mm, table_size >> PAGE_SHIFT);
	if (ret)
		return ret;

	ret = table_group->ops->create_table(table_group, num,
			page_shift, window_size, levels, ptbl);

	WARN_ON(!ret && !(*ptbl)->it_ops->free);
	WARN_ON(!ret && ((*ptbl)->it_allocated_size != table_size));

	return ret;
}
static void tce_iommu_free_table(struct tce_container *container,
		struct iommu_table *tbl)
{
	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;

	tce_iommu_userspace_view_free(tbl, container->mm);
	tbl->it_ops->free(tbl);
	decrement_locked_vm(container->mm, pages);
}
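
/*
 * Create a DMA window and program it into every attached group;
 * the groups were checked for compatibility at attach time.
 */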
static long tce_iommu_create_window(struct tce_container *container,
		__u32 page_shift, __u64 window_size, __u32 levels,
		__u64 *start_addr)
{
	struct tce_iommu_group *tcegrp;
	struct iommu_table_group *table_group;
	struct iommu_table *tbl = NULL;
	long ret, num;

	num = tce_iommu_find_free_table(container);
	if (num < 0)
		return num;

	/* Get the first group for ops::create_table */
	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -EFAULT;

	if (!(table_group->pgsizes & (1ULL << page_shift)))
		return -EINVAL;

	if (!table_group->ops->set_window || !table_group->ops->unset_window ||
			!table_group->ops->get_table_size ||
			!table_group->ops->create_table)
		return -EPERM;

	/* Create TCE table */
	ret = tce_iommu_create_table(container, table_group, num,
			page_shift, window_size, levels, &tbl);
	if (ret)
		return ret;

	BUG_ON(!tbl->it_ops->free);

	/*
	 * Program the table to every group.
	 * Groups have been tested for compatibility at the attach time.
	 */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		ret = table_group->ops->set_window(table_group, num, tbl);
		if (ret)
			goto unset_exit;
	}

	container->tables[num] = tbl;

	/* Return start address assigned by platform in create_table() */
	*start_addr = tbl->it_offset << tbl->it_page_shift;

	return 0;

unset_exit:
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);
		table_group->ops->unset_window(table_group, num);
	}
	tce_iommu_free_table(container, tbl);

	return ret;
}
static long tce_iommu_remove_window(struct tce_container *container,
		__u64 start_addr)
{
	struct iommu_table_group *table_group = NULL;
	struct iommu_table *tbl;
	struct tce_iommu_group *tcegrp;
	int num;

	num = tce_iommu_find_table(container, start_addr, &tbl);
	if (num < 0)
		return -EINVAL;

	BUG_ON(!tbl->it_size);

	/* Detach groups from IOMMUs */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		/*
		 * SPAPR TCE IOMMU exposes the default DMA window to
		 * the guest via dma32_window_start/size of
		 * VFIO_IOMMU_SPAPR_TCE_GET_INFO. Some platforms allow
		 * userspace to remove this window, some do not, so here
		 * we check for the platform capability.
		 */
		if (!table_group->ops || !table_group->ops->unset_window)
			return -EPERM;

		table_group->ops->unset_window(table_group, num);
	}

	/* Free table */
	tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
	tce_iommu_free_table(container, tbl);
	container->tables[num] = NULL;

	return 0;
}
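
/*
 * Create the default 32-bit DMA window whose creation was deferred at
 * group attach time (def_window_pending).
 */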
static long tce_iommu_create_default_window(struct tce_container *container)
{
	long ret;
	__u64 start_addr = 0;
	struct tce_iommu_group *tcegrp;
	struct iommu_table_group *table_group;

	if (!container->def_window_pending)
		return 0;

	if (!tce_groups_attached(container))
		return -ENODEV;

	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -ENODEV;

	ret = tce_iommu_create_window(container, IOMMU_PAGE_SHIFT_4K,
			table_group->tce32_size, 1, &start_addr);
	WARN_ON_ONCE(!ret && start_addr);

	if (!ret)
		container->def_window_pending = false;

	return ret;
}
static long tce_iommu_ioctl(void *iommu_data,
				 unsigned int cmd, unsigned long arg)
{
	struct tce_container *container = iommu_data;
	unsigned long minsz, ddwsz;
	long ret;

	switch (cmd) {
	case VFIO_CHECK_EXTENSION:
		switch (arg) {
		case VFIO_SPAPR_TCE_IOMMU:
		case VFIO_SPAPR_TCE_v2_IOMMU:
			ret = 1;
			break;
		default:
			ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg);
			break;
		}

		return (ret < 0) ? 0 : ret;
	}

	/*
	 * Sanity check to prevent one userspace from manipulating
	 * another userspace mm.
	 */
	BUG_ON(!container);
	if (container->mm && container->mm != current->mm)
		return -EPERM;

	switch (cmd) {
	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
		struct vfio_iommu_spapr_tce_info info;
		struct tce_iommu_group *tcegrp;
		struct iommu_table_group *table_group;

		if (!tce_groups_attached(container))
			return -ENXIO;

		tcegrp = list_first_entry(&container->group_list,
				struct tce_iommu_group, next);
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		if (!table_group)
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
				dma32_window_size);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		info.dma32_window_start = table_group->tce32_start;
		info.dma32_window_size = table_group->tce32_size;
		info.flags = 0;
		memset(&info.ddw, 0, sizeof(info.ddw));

		if (table_group->max_dynamic_windows_supported &&
				container->v2) {
			info.flags |= VFIO_IOMMU_SPAPR_INFO_DDW;
			info.ddw.pgsizes = table_group->pgsizes;
			info.ddw.max_dynamic_windows_supported =
				table_group->max_dynamic_windows_supported;
			info.ddw.levels = table_group->max_levels;
		}

		ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, ddw);

		if (info.argsz >= ddwsz)
			minsz = ddwsz;

		if (copy_to_user((void __user *)arg, &info, minsz))
			return -EFAULT;

		return 0;
	}
	case VFIO_IOMMU_MAP_DMA: {
		struct vfio_iommu_type1_dma_map param;
		struct iommu_table *tbl = NULL;
		long num;
		enum dma_data_direction direction;

		if (!container->enabled)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
				VFIO_DMA_MAP_FLAG_WRITE))
			return -EINVAL;

		ret = tce_iommu_create_default_window(container);
		if (ret)
			return ret;

		num = tce_iommu_find_table(container, param.iova, &tbl);
		if (num < 0)
			return -ENXIO;

		if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
				(param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
			return -EINVAL;

		/* iova is checked by the IOMMU API */
		if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
				direction = DMA_BIDIRECTIONAL;
			else
				direction = DMA_TO_DEVICE;
		} else {
			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
				direction = DMA_FROM_DEVICE;
			else
				return -EINVAL;
		}

		ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr);
		if (ret)
			return ret;

		if (container->v2)
			ret = tce_iommu_build_v2(container, tbl,
					param.iova >> tbl->it_page_shift,
					param.vaddr,
					param.size >> tbl->it_page_shift,
					direction);
		else
			ret = tce_iommu_build(container, tbl,
					param.iova >> tbl->it_page_shift,
					param.vaddr,
					param.size >> tbl->it_page_shift,
					direction);

		iommu_flush_tce(tbl);

		return ret;
	}
	case VFIO_IOMMU_UNMAP_DMA: {
		struct vfio_iommu_type1_dma_unmap param;
		struct iommu_table *tbl = NULL;
		long num;

		if (!container->enabled)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
				size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		ret = tce_iommu_create_default_window(container);
		if (ret)
			return ret;

		num = tce_iommu_find_table(container, param.iova, &tbl);
		if (num < 0)
			return -ENXIO;

		if (param.size & ~IOMMU_PAGE_MASK(tbl))
			return -EINVAL;

		ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
				param.size >> tbl->it_page_shift);
		if (ret)
			return ret;

		ret = tce_iommu_clear(container, tbl,
				param.iova >> tbl->it_page_shift,
				param.size >> tbl->it_page_shift);
		iommu_flush_tce(tbl);

		return ret;
	}
	case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: {
		struct vfio_iommu_spapr_register_memory param;

		if (!container->v2)
			break;

		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
				size);

		ret = tce_iommu_mm_set(container);
		if (ret)
			return ret;

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		mutex_lock(&container->lock);
		ret = tce_iommu_register_pages(container, param.vaddr,
				param.size);
		mutex_unlock(&container->lock);

		return ret;
	}
	case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: {
		struct vfio_iommu_spapr_register_memory param;

		if (!container->v2)
			break;

		if (!container->mm)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
				size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		mutex_lock(&container->lock);
		ret = tce_iommu_unregister_pages(container, param.vaddr,
				param.size);
		mutex_unlock(&container->lock);

		return ret;
	}
	case VFIO_IOMMU_ENABLE:
		if (container->v2)
			break;

		mutex_lock(&container->lock);
		ret = tce_iommu_enable(container);
		mutex_unlock(&container->lock);
		return ret;

	case VFIO_IOMMU_DISABLE:
		if (container->v2)
			break;

		mutex_lock(&container->lock);
		tce_iommu_disable(container);
		mutex_unlock(&container->lock);
		return 0;

	case VFIO_EEH_PE_OP: {
		struct tce_iommu_group *tcegrp;

		ret = 0;
		list_for_each_entry(tcegrp, &container->group_list, next) {
			ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp,
					cmd, arg);
			if (ret)
				return ret;
		}
		return ret;
	}
	case VFIO_IOMMU_SPAPR_TCE_CREATE: {
		struct vfio_iommu_spapr_tce_create create;

		if (!container->v2)
			break;

		ret = tce_iommu_mm_set(container);
		if (ret)
			return ret;

		if (!tce_groups_attached(container))
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_create,
				start_addr);

		if (copy_from_user(&create, (void __user *)arg, minsz))
			return -EFAULT;

		if (create.argsz < minsz)
			return -EINVAL;

		if (create.flags)
			return -EINVAL;

		mutex_lock(&container->lock);

		ret = tce_iommu_create_default_window(container);
		if (!ret)
			ret = tce_iommu_create_window(container,
					create.page_shift,
					create.window_size, create.levels,
					&create.start_addr);

		mutex_unlock(&container->lock);

		if (!ret && copy_to_user((void __user *)arg, &create, minsz))
			ret = -EFAULT;

		return ret;
	}
	case VFIO_IOMMU_SPAPR_TCE_REMOVE: {
		struct vfio_iommu_spapr_tce_remove remove;

		if (!container->v2)
			break;

		ret = tce_iommu_mm_set(container);
		if (ret)
			return ret;

		if (!tce_groups_attached(container))
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_remove,
				start_addr);

		if (copy_from_user(&remove, (void __user *)arg, minsz))
			return -EFAULT;

		if (remove.argsz < minsz)
			return -EINVAL;

		if (remove.flags)
			return -EINVAL;

		if (container->def_window_pending && !remove.start_addr) {
			container->def_window_pending = false;
			return 0;
		}

		mutex_lock(&container->lock);

		ret = tce_iommu_remove_window(container, remove.start_addr);

		mutex_unlock(&container->lock);

		return ret;
	}
	}

	return -ENOTTY;
}
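
/*
 * Ownership helpers for platforms without dynamic DMA windows: the
 * container takes over (and later returns) the tables owned by the
 * platform.
 */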
static void tce_iommu_release_ownership(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	int i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (!tbl)
			continue;

		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
		tce_iommu_userspace_view_free(tbl, container->mm);
		if (tbl->it_map)
			iommu_release_ownership(tbl);

		container->tables[i] = NULL;
	}
}
static int tce_iommu_take_ownership(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	int i, j, rc = 0;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = table_group->tables[i];

		if (!tbl || !tbl->it_map)
			continue;

		rc = iommu_take_ownership(tbl);
		if (rc) {
			for (j = 0; j < i; ++j)
				iommu_release_ownership(
						table_group->tables[j]);

			return rc;
		}
	}

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		container->tables[i] = table_group->tables[i];

	return 0;
}
static void tce_iommu_release_ownership_ddw(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	long i;

	if (!table_group->ops->unset_window) {
		WARN_ON_ONCE(1);
		return;
	}

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		table_group->ops->unset_window(table_group, i);

	table_group->ops->release_ownership(table_group);
}
static long tce_iommu_take_ownership_ddw(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	long i, ret = 0;

	if (!table_group->ops->create_table || !table_group->ops->set_window ||
			!table_group->ops->release_ownership) {
		WARN_ON_ONCE(1);
		return -EFAULT;
	}

	table_group->ops->take_ownership(table_group);

	/* Set all windows to the new group */
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (!tbl)
			continue;

		ret = table_group->ops->set_window(table_group, i, tbl);
		if (ret)
			goto release_exit;
	}

	return 0;

release_exit:
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		table_group->ops->unset_window(table_group, i);

	table_group->ops->release_ownership(table_group);

	return ret;
}
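
/*
 * Attach an IOMMU group to the container. The first attached group
 * defines the compatibility requirements (create_table ops) that any
 * further group must satisfy.
 */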
static int tce_iommu_attach_group(void *iommu_data,
		struct iommu_group *iommu_group)
{
	int ret;
	struct tce_container *container = iommu_data;
	struct iommu_table_group *table_group;
	struct tce_iommu_group *tcegrp = NULL;

	mutex_lock(&container->lock);

	/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
			iommu_group_id(iommu_group), iommu_group); */
	table_group = iommu_group_get_iommudata(iommu_group);
	if (!table_group) {
		ret = -ENODEV;
		goto unlock_exit;
	}

	if (tce_groups_attached(container) && (!table_group->ops ||
			!table_group->ops->take_ownership ||
			!table_group->ops->release_ownership)) {
		ret = -EBUSY;
		goto unlock_exit;
	}

	/* Check if new group has the same iommu_ops (i.e. compatible) */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		struct iommu_table_group *table_group_tmp;

		if (tcegrp->grp == iommu_group) {
			pr_warn("tce_vfio: Group %d is already attached\n",
					iommu_group_id(iommu_group));
			ret = -EBUSY;
			goto unlock_exit;
		}
		table_group_tmp = iommu_group_get_iommudata(tcegrp->grp);
		if (table_group_tmp->ops->create_table !=
				table_group->ops->create_table) {
			pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
					iommu_group_id(iommu_group),
					iommu_group_id(tcegrp->grp));
			ret = -EPERM;
			goto unlock_exit;
		}
	}

	tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL);
	if (!tcegrp) {
		ret = -ENOMEM;
		goto unlock_exit;
	}

	if (!table_group->ops || !table_group->ops->take_ownership ||
			!table_group->ops->release_ownership) {
		if (container->v2) {
			ret = -EPERM;
			goto unlock_exit;
		}
		ret = tce_iommu_take_ownership(container, table_group);
	} else {
		if (!container->v2) {
			ret = -EPERM;
			goto unlock_exit;
		}
		ret = tce_iommu_take_ownership_ddw(container, table_group);
		if (!tce_groups_attached(container) && !container->tables[0])
			container->def_window_pending = true;
	}

	if (!ret) {
		tcegrp->grp = iommu_group;
		list_add(&tcegrp->next, &container->group_list);
	}

unlock_exit:
	if (ret && tcegrp)
		kfree(tcegrp);

	mutex_unlock(&container->lock);

	return ret;
}
static void tce_iommu_detach_group(void *iommu_data,
		struct iommu_group *iommu_group)
{
	struct tce_container *container = iommu_data;
	struct iommu_table_group *table_group;
	bool found = false;
	struct tce_iommu_group *tcegrp;

	mutex_lock(&container->lock);

	list_for_each_entry(tcegrp, &container->group_list, next) {
		if (tcegrp->grp == iommu_group) {
			found = true;
			break;
		}
	}

	if (!found) {
		pr_warn("tce_vfio: detaching unattached group #%u\n",
				iommu_group_id(iommu_group));
		goto unlock_exit;
	}

	list_del(&tcegrp->next);
	kfree(tcegrp);

	table_group = iommu_group_get_iommudata(iommu_group);
	BUG_ON(!table_group);

	if (!table_group->ops || !table_group->ops->release_ownership)
		tce_iommu_release_ownership(container, table_group);
	else
		tce_iommu_release_ownership_ddw(container, table_group);

unlock_exit:
	mutex_unlock(&container->lock);
}
const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
	.name		= "iommu-vfio-powerpc",
	.owner		= THIS_MODULE,
	.open		= tce_iommu_open,
	.release	= tce_iommu_release,
	.ioctl		= tce_iommu_ioctl,
	.attach_group	= tce_iommu_attach_group,
	.detach_group	= tce_iommu_detach_group,
};

static int __init tce_iommu_init(void)
{
	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
}

static void __exit tce_iommu_cleanup(void)
{
	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
}

module_init(tce_iommu_init);
module_exit(tce_iommu_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);