// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
 *
 * The io_pagetable is the top of the data structure that maps IOVAs to PFNs.
 * The PFNs can be placed into an iommu_domain, or returned to the caller as a
 * page list for access by an in-kernel user.
 *
 * The data structure uses the iopt_pages to optimize the storage of the PFNs
 * between the domains and the xarray.
 */
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/iommu.h>
#include <linux/iommufd.h>
#include <linux/lockdep.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <uapi/linux/iommufd.h>

#include "double_span.h"
#include "io_pagetable.h"

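/*
 * Carries one slice of an iopt_pages while a mapping is being built or pages
 * are being collected for an in-kernel access. start_byte/length select the
 * portion of the pages the area covers, and next threads it into a pages_list.
 */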
struct iopt_pages_list {
        struct iopt_pages *pages;
        struct iopt_area *area;
        struct list_head next;
        unsigned long start_byte;
        unsigned long length;
};

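/*
 * Iterator over the areas covering [iova, last_iova]. Callers walk it with
 * iopt_for_each_contig_area() and then use iopt_area_contig_done() to learn
 * whether the whole range was covered without holes.
 */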
struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter,
                                        struct io_pagetable *iopt,
                                        unsigned long iova,
                                        unsigned long last_iova)
{
        lockdep_assert_held(&iopt->iova_rwsem);

        iter->cur_iova = iova;
        iter->last_iova = last_iova;
        iter->area = iopt_area_iter_first(iopt, iova, iova);
        if (!iter->area)
                return NULL;
        if (!iter->area->pages) {
                iter->area = NULL;
                return NULL;
        }
        return iter->area;
}

struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter)
{
        unsigned long last_iova;

        if (!iter->area)
                return NULL;
        last_iova = iopt_area_last_iova(iter->area);
        if (iter->last_iova <= last_iova)
                return NULL;

        iter->cur_iova = last_iova + 1;
        iter->area = iopt_area_iter_next(iter->area, iter->cur_iova,
                                         iter->last_iova);
        if (!iter->area)
                return NULL;
        if (iter->cur_iova != iopt_area_iova(iter->area) ||
            !iter->area->pages) {
                iter->area = NULL;
                return NULL;
        }
        return iter->area;
}

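/*
 * Helpers for iopt_alloc_iova(): test whether a span produced by the interval
 * tree iteration can hold an allocation of the requested length once its start
 * is rounded up to the required alignment and page offset.
 */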
static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span,
                                    unsigned long length,
                                    unsigned long iova_alignment,
                                    unsigned long page_offset)
{
        if (span->is_used || span->last_hole - span->start_hole < length - 1)
                return false;

        span->start_hole = ALIGN(span->start_hole, iova_alignment) |
                           page_offset;
        if (span->start_hole > span->last_hole ||
            span->last_hole - span->start_hole < length - 1)
                return false;
        return true;
}

static bool __alloc_iova_check_used(struct interval_tree_span_iter *span,
                                    unsigned long length,
                                    unsigned long iova_alignment,
                                    unsigned long page_offset)
{
        if (span->is_hole || span->last_used - span->start_used < length - 1)
                return false;

        span->start_used = ALIGN(span->start_used, iova_alignment) |
                           page_offset;
        if (span->start_used > span->last_used ||
            span->last_used - span->start_used < length - 1)
                return false;
        return true;
}

/*
 * Automatically find a block of IOVA that is not being used and not reserved.
 * Does not return a 0 IOVA even if it is valid.
 */
static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
                           unsigned long uptr, unsigned long length)
{
        unsigned long page_offset = uptr % PAGE_SIZE;
        struct interval_tree_double_span_iter used_span;
        struct interval_tree_span_iter allowed_span;
        unsigned long max_alignment = PAGE_SIZE;
        unsigned long iova_alignment;

        lockdep_assert_held(&iopt->iova_rwsem);

        /* Protect roundup_pow_of_two() from overflow */
        if (length == 0 || length >= ULONG_MAX / 2)
                return -EOVERFLOW;

        /*
         * Keep alignment present in the uptr when building the IOVA, this
         * increases the chance we can map a THP.
         */
        if (!uptr)
                iova_alignment = roundup_pow_of_two(length);
        else
                iova_alignment = min_t(unsigned long,
                                       roundup_pow_of_two(length),
                                       1UL << __ffs64(uptr));

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        max_alignment = HPAGE_SIZE;
#endif
        /* Protect against ALIGN() overflow */
        if (iova_alignment >= max_alignment)
                iova_alignment = max_alignment;

        if (iova_alignment < iopt->iova_alignment)
                return -EINVAL;

        interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree,
                                    PAGE_SIZE, ULONG_MAX - PAGE_SIZE) {
                if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) {
                        allowed_span.start_used = PAGE_SIZE;
                        allowed_span.last_used = ULONG_MAX - PAGE_SIZE;
                        allowed_span.is_hole = false;
                }

                if (!__alloc_iova_check_used(&allowed_span, length,
                                             iova_alignment, page_offset))
                        continue;

                interval_tree_for_each_double_span(
                        &used_span, &iopt->reserved_itree, &iopt->area_itree,
                        allowed_span.start_used, allowed_span.last_used) {
                        if (!__alloc_iova_check_hole(&used_span, length,
                                                     iova_alignment,
                                                     page_offset))
                                continue;

                        *iova = used_span.start_hole;
                        return 0;
                }
        }
        return -ENOSPC;
}

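/*
 * Validate a caller-chosen IOVA range: it must respect the current
 * iova_alignment, must not wrap, and must not intersect a reserved range or
 * an existing area.
 */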
static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova,
                           unsigned long length)
{
        unsigned long last;

        lockdep_assert_held(&iopt->iova_rwsem);

        if ((iova & (iopt->iova_alignment - 1)))
                return -EINVAL;

        if (check_add_overflow(iova, length - 1, &last))
                return -EOVERFLOW;

        /* No reserved IOVA intersects the range */
        if (iopt_reserved_iter_first(iopt, iova, last))
                return -EINVAL;

        /* Check that there is not already a mapping in the range */
        if (iopt_area_iter_first(iopt, iova, last))
                return -EEXIST;
        return 0;
}

/*
 * The area takes a slice of the pages from start_byte to start_byte + length
 */
static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area,
                            struct iopt_pages *pages, unsigned long iova,
                            unsigned long start_byte, unsigned long length,
                            int iommu_prot)
{
        lockdep_assert_held_write(&iopt->iova_rwsem);

        if ((iommu_prot & IOMMU_WRITE) && !pages->writable)
                return -EPERM;

        area->iommu_prot = iommu_prot;
        area->page_offset = start_byte % PAGE_SIZE;
        if (area->page_offset & (iopt->iova_alignment - 1))
                return -EINVAL;

        area->node.start = iova;
        if (check_add_overflow(iova, length - 1, &area->node.last))
                return -EOVERFLOW;

        area->pages_node.start = start_byte / PAGE_SIZE;
        if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
                return -EOVERFLOW;
        area->pages_node.last = area->pages_node.last / PAGE_SIZE;
        if (WARN_ON(area->pages_node.last >= pages->npages))
                return -EOVERFLOW;

        /*
         * The area is inserted with a NULL pages indicating it is not fully
         * initialized yet.
         */
        area->iopt = iopt;
        interval_tree_insert(&area->node, &iopt->area_itree);
        return 0;
}

static struct iopt_area *iopt_area_alloc(void)
{
        struct iopt_area *area;

        area = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
        if (!area)
                return NULL;
        RB_CLEAR_NODE(&area->node.rb);
        RB_CLEAR_NODE(&area->pages_node.rb);
        return area;
}

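/*
 * Allocate an area for every entry on pages_list and insert them into the
 * area_itree with NULL pages, reserving the IOVA range. If IOPT_ALLOC_IOVA is
 * set the IOVA is chosen automatically, otherwise *dst_iova is validated.
 */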
static int iopt_alloc_area_pages(struct io_pagetable *iopt,
                                 struct list_head *pages_list,
                                 unsigned long length, unsigned long *dst_iova,
                                 int iommu_prot, unsigned int flags)
{
        struct iopt_pages_list *elm;
        unsigned long iova;
        int rc = 0;

        list_for_each_entry(elm, pages_list, next) {
                elm->area = iopt_area_alloc();
                if (!elm->area)
                        return -ENOMEM;
        }

        down_write(&iopt->iova_rwsem);
        if ((length & (iopt->iova_alignment - 1)) || !length) {
                rc = -EINVAL;
                goto out_unlock;
        }

        if (flags & IOPT_ALLOC_IOVA) {
                /* Use the first entry to guess the ideal IOVA alignment */
                elm = list_first_entry(pages_list, struct iopt_pages_list,
                                       next);
                rc = iopt_alloc_iova(
                        iopt, dst_iova,
                        (uintptr_t)elm->pages->uptr + elm->start_byte, length);
                if (rc)
                        goto out_unlock;
                if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
                    WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) {
                        rc = -EINVAL;
                        goto out_unlock;
                }
        } else {
                rc = iopt_check_iova(iopt, *dst_iova, length);
                if (rc)
                        goto out_unlock;
        }

        /*
         * Areas are created with a NULL pages so that the IOVA space is
         * reserved and we can unlock the iova_rwsem.
         */
        iova = *dst_iova;
        list_for_each_entry(elm, pages_list, next) {
                rc = iopt_insert_area(iopt, elm->area, elm->pages, iova,
                                      elm->start_byte, elm->length, iommu_prot);
                if (rc)
                        goto out_unlock;
                iova += elm->length;
        }

out_unlock:
        up_write(&iopt->iova_rwsem);
        return rc;
}

static void iopt_abort_area(struct iopt_area *area)
{
        if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
                WARN_ON(area->pages);
        if (area->iopt) {
                down_write(&area->iopt->iova_rwsem);
                interval_tree_remove(&area->node, &area->iopt->area_itree);
                up_write(&area->iopt->iova_rwsem);
        }
        kfree(area);
}

void iopt_free_pages_list(struct list_head *pages_list)
{
        struct iopt_pages_list *elm;

        while ((elm = list_first_entry_or_null(pages_list,
                                               struct iopt_pages_list, next))) {
                if (elm->area)
                        iopt_abort_area(elm->area);
                if (elm->pages)
                        iopt_put_pages(elm->pages);
                list_del(&elm->next);
                kfree(elm);
        }
}

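/*
 * Map every area on pages_list into all attached domains, unwinding the
 * already-filled entries if any fill fails.
 */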
static int iopt_fill_domains_pages(struct list_head *pages_list)
{
        struct iopt_pages_list *undo_elm;
        struct iopt_pages_list *elm;
        int rc;

        list_for_each_entry(elm, pages_list, next) {
                rc = iopt_area_fill_domains(elm->area, elm->pages);
                if (rc)
                        goto err_undo;
        }
        return 0;

err_undo:
        list_for_each_entry(undo_elm, pages_list, next) {
                if (undo_elm == elm)
                        break;
                iopt_area_unfill_domains(undo_elm->area, undo_elm->pages);
        }
        return rc;
}

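/*
 * Commit a prepared pages_list into the io_pagetable: allocate and insert the
 * areas, fill every attached domain, and then publish area->pages under the
 * iova_rwsem while the domains_rwsem is still held.
 */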
int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
                   unsigned long length, unsigned long *dst_iova,
                   int iommu_prot, unsigned int flags)
{
        struct iopt_pages_list *elm;
        int rc;

        rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova,
                                   iommu_prot, flags);
        if (rc)
                return rc;

        down_read(&iopt->domains_rwsem);
        rc = iopt_fill_domains_pages(pages_list);
        if (rc)
                goto out_unlock_domains;

        down_write(&iopt->iova_rwsem);
        list_for_each_entry(elm, pages_list, next) {
                /*
                 * area->pages must be set inside the domains_rwsem to ensure
                 * any newly added domains will get filled. Moves the reference
                 * in from the list.
                 */
                elm->area->pages = elm->pages;
                elm->pages = NULL;
                elm->area = NULL;
        }
        up_write(&iopt->iova_rwsem);
out_unlock_domains:
        up_read(&iopt->domains_rwsem);
        return rc;
}

/**
 * iopt_map_user_pages() - Map a user VA to an iova in the io page table
 * @ictx: iommufd_ctx the iopt is part of
 * @iopt: io_pagetable to act on
 * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
 *        the chosen iova on output. Otherwise it is the iova to map to on input
 * @uptr: User VA to map
 * @length: Number of bytes to map
 * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
 * @flags: IOPT_ALLOC_IOVA or zero
 *
 * iova, uptr, and length must be aligned to iova_alignment. For domain backed
 * page tables this will pin the pages and load them into the domain at iova.
 * For non-domain page tables this will only set up a lazy reference and the
 * caller must use iopt_access_pages() to touch them.
 *
 * iopt_unmap_iova() must be called to undo this before the io_pagetable can be
 * destroyed.
 */
int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
                        unsigned long *iova, void __user *uptr,
                        unsigned long length, int iommu_prot,
                        unsigned int flags)
{
        struct iopt_pages_list elm = {};
        LIST_HEAD(pages_list);
        int rc;

        elm.pages = iopt_alloc_pages(uptr, length, iommu_prot & IOMMU_WRITE);
        if (IS_ERR(elm.pages))
                return PTR_ERR(elm.pages);
        if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
            elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
                elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
        elm.start_byte = uptr - elm.pages->uptr;
        elm.length = length;
        list_add(&elm.next, &pages_list);

        rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
        if (rc) {
                if (elm.area)
                        iopt_abort_area(elm.area);
                if (elm.pages)
                        iopt_put_pages(elm.pages);
                return rc;
        }
        return 0;
}

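/*
 * Illustrative caller sketch (not part of this file; the surrounding context
 * and error handling are hypothetical). Letting the kernel choose the IOVA and
 * later undoing the mapping would look roughly like:
 *
 *	unsigned long iova = 0, unmapped = 0;
 *	int rc;
 *
 *	rc = iopt_map_user_pages(ictx, iopt, &iova, uptr, length,
 *				 IOMMU_READ | IOMMU_WRITE, IOPT_ALLOC_IOVA);
 *	if (rc)
 *		return rc;
 *	...
 *	rc = iopt_unmap_iova(iopt, iova, length, &unmapped);
 */
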
struct iova_bitmap_fn_arg {
        unsigned long flags;
        struct io_pagetable *iopt;
        struct iommu_domain *domain;
        struct iommu_dirty_bitmap *dirty;
};

static int __iommu_read_and_clear_dirty(struct iova_bitmap *bitmap,
                                        unsigned long iova, size_t length,
                                        void *opaque)
{
        struct iopt_area *area;
        struct iopt_area_contig_iter iter;
        struct iova_bitmap_fn_arg *arg = opaque;
        struct iommu_domain *domain = arg->domain;
        struct iommu_dirty_bitmap *dirty = arg->dirty;
        const struct iommu_dirty_ops *ops = domain->dirty_ops;
        unsigned long last_iova = iova + length - 1;
        unsigned long flags = arg->flags;
        int ret;

        iopt_for_each_contig_area(&iter, area, arg->iopt, iova, last_iova) {
                unsigned long last = min(last_iova, iopt_area_last_iova(area));

                ret = ops->read_and_clear_dirty(domain, iter.cur_iova,
                                                last - iter.cur_iova + 1, flags,
                                                dirty);
                if (ret)
                        return ret;
        }

        if (!iopt_area_contig_done(&iter))
                return -EINVAL;
        return 0;
}

static int
iommu_read_and_clear_dirty(struct iommu_domain *domain,
                           struct io_pagetable *iopt, unsigned long flags,
                           struct iommu_hwpt_get_dirty_bitmap *bitmap)
{
        const struct iommu_dirty_ops *ops = domain->dirty_ops;
        struct iommu_iotlb_gather gather;
        struct iommu_dirty_bitmap dirty;
        struct iova_bitmap_fn_arg arg;
        struct iova_bitmap *iter;
        int ret = 0;

        if (!ops || !ops->read_and_clear_dirty)
                return -EOPNOTSUPP;

        iter = iova_bitmap_alloc(bitmap->iova, bitmap->length,
                                 bitmap->page_size,
                                 u64_to_user_ptr(bitmap->data));
        if (IS_ERR(iter))
                return -ENOMEM;

        iommu_dirty_bitmap_init(&dirty, iter, &gather);

        arg.flags = flags;
        arg.iopt = iopt;
        arg.domain = domain;
        arg.dirty = &dirty;
        iova_bitmap_for_each(iter, &arg, __iommu_read_and_clear_dirty);

        if (!(flags & IOMMU_DIRTY_NO_CLEAR))
                iommu_iotlb_sync(domain, &gather);

        iova_bitmap_free(iter);

        return ret;
}

int iommufd_check_iova_range(struct io_pagetable *iopt,
                             struct iommu_hwpt_get_dirty_bitmap *bitmap)
{
        size_t iommu_pgsize = iopt->iova_alignment;
        u64 last_iova;

        if (check_add_overflow(bitmap->iova, bitmap->length - 1, &last_iova))
                return -EOVERFLOW;

        if (bitmap->iova > ULONG_MAX || last_iova > ULONG_MAX)
                return -EOVERFLOW;

        if ((bitmap->iova & (iommu_pgsize - 1)) ||
            ((last_iova + 1) & (iommu_pgsize - 1)))
                return -EINVAL;

        if (!bitmap->page_size)
                return -EINVAL;

        if ((bitmap->iova & (bitmap->page_size - 1)) ||
            ((last_iova + 1) & (bitmap->page_size - 1)))
                return -EINVAL;

        return 0;
}

int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
                                   struct iommu_domain *domain,
                                   unsigned long flags,
                                   struct iommu_hwpt_get_dirty_bitmap *bitmap)
{
        int ret;

        ret = iommufd_check_iova_range(iopt, bitmap);
        if (ret)
                return ret;

        down_read(&iopt->iova_rwsem);
        ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap);
        up_read(&iopt->iova_rwsem);

        return ret;
}

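/*
 * Walk every mapped area and clear its dirty bits without reporting them (the
 * dirty bitmap is initialized with a NULL iova_bitmap), so that dirty tracking
 * starts from a clean snapshot.
 */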
static int iopt_clear_dirty_data(struct io_pagetable *iopt,
                                 struct iommu_domain *domain)
{
        const struct iommu_dirty_ops *ops = domain->dirty_ops;
        struct iommu_iotlb_gather gather;
        struct iommu_dirty_bitmap dirty;
        struct iopt_area *area;
        int ret = 0;

        lockdep_assert_held_read(&iopt->iova_rwsem);

        iommu_dirty_bitmap_init(&dirty, NULL, &gather);

        for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
             area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
                if (!area->pages)
                        continue;

                ret = ops->read_and_clear_dirty(domain, iopt_area_iova(area),
                                                iopt_area_length(area), 0,
                                                &dirty);
                if (ret)
                        break;
        }

        iommu_iotlb_sync(domain, &gather);
        return ret;
}

int iopt_set_dirty_tracking(struct io_pagetable *iopt,
                            struct iommu_domain *domain, bool enable)
{
        const struct iommu_dirty_ops *ops = domain->dirty_ops;
        int ret = 0;

        if (!ops)
                return -EOPNOTSUPP;

        down_read(&iopt->iova_rwsem);

        /* Clear dirty bits from PTEs to ensure a clean snapshot */
        if (enable) {
                ret = iopt_clear_dirty_data(iopt, domain);
                if (ret)
                        goto out_unlock;
        }

        ret = ops->set_dirty_tracking(domain, enable);

out_unlock:
        up_read(&iopt->iova_rwsem);
        return ret;
}

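/*
 * Build a pages_list describing every area covering [iova, iova + length),
 * taking a reference on each iopt_pages. The range must be fully mapped with
 * no holes; on failure the partially built list is freed.
 */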
int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
                   unsigned long length, struct list_head *pages_list)
{
        struct iopt_area_contig_iter iter;
        unsigned long last_iova;
        struct iopt_area *area;
        int rc;

        if (!length)
                return -EINVAL;
        if (check_add_overflow(iova, length - 1, &last_iova))
                return -EOVERFLOW;

        down_read(&iopt->iova_rwsem);
        iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
                struct iopt_pages_list *elm;
                unsigned long last = min(last_iova, iopt_area_last_iova(area));

                elm = kzalloc(sizeof(*elm), GFP_KERNEL_ACCOUNT);
                if (!elm) {
                        rc = -ENOMEM;
                        goto err_free;
                }
                elm->start_byte = iopt_area_start_byte(area, iter.cur_iova);
                elm->pages = area->pages;
                elm->length = (last - iter.cur_iova) + 1;
                kref_get(&elm->pages->kref);
                list_add_tail(&elm->next, pages_list);
        }
        if (!iopt_area_contig_done(&iter)) {
                rc = -ENOENT;
                goto err_free;
        }
        up_read(&iopt->iova_rwsem);
        return 0;
err_free:
        up_read(&iopt->iova_rwsem);
        iopt_free_pages_list(pages_list);
        return rc;
}

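/*
 * Unmap every area fully contained in [start, last]. Areas with outstanding
 * in-kernel accesses are asked to drop them via iommufd_access_notify_unmap()
 * and the walk is retried; the number of unmapped bytes is returned through
 * *unmapped.
 */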
static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
                                 unsigned long last, unsigned long *unmapped)
{
        struct iopt_area *area;
        unsigned long unmapped_bytes = 0;
        unsigned int tries = 0;
        int rc = -ENOENT;

        /*
         * The domains_rwsem must be held in read mode any time any area->pages
         * is NULL. This prevents domain attach/detach from running
         * concurrently with cleaning up the area.
         */
again:
        down_read(&iopt->domains_rwsem);
        down_write(&iopt->iova_rwsem);
        while ((area = iopt_area_iter_first(iopt, start, last))) {
                unsigned long area_last = iopt_area_last_iova(area);
                unsigned long area_first = iopt_area_iova(area);
                struct iopt_pages *pages;

                /* Userspace should not race map/unmap's of the same area */
                if (!area->pages) {
                        rc = -EBUSY;
                        goto out_unlock_iova;
                }

                if (area_first < start || area_last > last) {
                        rc = -ENOENT;
                        goto out_unlock_iova;
                }

                if (area_first != start)
                        tries = 0;

                /*
                 * num_accesses writers must hold the iova_rwsem too, so we can
                 * safely read it under the write side of the iova_rwsem
                 * without the pages->mutex.
                 */
                if (area->num_accesses) {
                        size_t length = iopt_area_length(area);

                        start = area_first;
                        area->prevent_access = true;
                        up_write(&iopt->iova_rwsem);
                        up_read(&iopt->domains_rwsem);

                        iommufd_access_notify_unmap(iopt, area_first, length);
                        /* Something is not responding to unmap requests. */
                        tries++;
                        if (WARN_ON(tries > 100))
                                return -EDEADLOCK;
                        goto again;
                }

                pages = area->pages;
                area->pages = NULL;
                up_write(&iopt->iova_rwsem);

                iopt_area_unfill_domains(area, pages);
                iopt_abort_area(area);
                iopt_put_pages(pages);

                unmapped_bytes += area_last - area_first + 1;

                down_write(&iopt->iova_rwsem);
        }
        if (unmapped_bytes)
                rc = 0;

out_unlock_iova:
        up_write(&iopt->iova_rwsem);
        up_read(&iopt->domains_rwsem);
        if (unmapped)
                *unmapped = unmapped_bytes;
        return rc;
}

/**
 * iopt_unmap_iova() - Remove a range of iova
 * @iopt: io_pagetable to act on
 * @iova: Starting iova to unmap
 * @length: Number of bytes to unmap
 * @unmapped: Return number of bytes unmapped
 *
 * The requested range must be a superset of existing ranges.
 * Splitting/truncating IOVA mappings is not allowed.
 */
int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
                    unsigned long length, unsigned long *unmapped)
{
        unsigned long iova_last;

        if (!length)
                return -EINVAL;

        if (check_add_overflow(iova, length - 1, &iova_last))
                return -EOVERFLOW;

        return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped);
}

int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
{
        int rc;

        rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
        /* If the IOVAs are empty then unmap all succeeds */
        if (rc == -ENOENT)
                return 0;
        return rc;
}

/* The caller must always free all the nodes in the allowed_iova rb_root. */
int iopt_set_allow_iova(struct io_pagetable *iopt,
                        struct rb_root_cached *allowed_iova)
{
        struct iopt_allowed *allowed;

        down_write(&iopt->iova_rwsem);
        swap(*allowed_iova, iopt->allowed_itree);

        for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed;
             allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) {
                if (iopt_reserved_iter_first(iopt, allowed->node.start,
                                             allowed->node.last)) {
                        swap(*allowed_iova, iopt->allowed_itree);
                        up_write(&iopt->iova_rwsem);
                        return -EADDRINUSE;
                }
        }

        up_write(&iopt->iova_rwsem);
        return 0;
}

int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
                      unsigned long last, void *owner)
{
        struct iopt_reserved *reserved;

        lockdep_assert_held_write(&iopt->iova_rwsem);

        if (iopt_area_iter_first(iopt, start, last) ||
            iopt_allowed_iter_first(iopt, start, last))
                return -EADDRINUSE;

        reserved = kzalloc(sizeof(*reserved), GFP_KERNEL_ACCOUNT);
        if (!reserved)
                return -ENOMEM;
        reserved->node.start = start;
        reserved->node.last = last;
        reserved->owner = owner;
        interval_tree_insert(&reserved->node, &iopt->reserved_itree);
        return 0;
}

static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
{
        struct iopt_reserved *reserved, *next;

        lockdep_assert_held_write(&iopt->iova_rwsem);

        for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved;
             reserved = next) {
                next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX);

                if (reserved->owner == owner) {
                        interval_tree_remove(&reserved->node,
                                             &iopt->reserved_itree);
                        kfree(reserved);
                }
        }
}

void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
{
        down_write(&iopt->iova_rwsem);
        __iopt_remove_reserved_iova(iopt, owner);
        up_write(&iopt->iova_rwsem);
}

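/*
 * Initialize an empty io_pagetable: no domains, no accesses, no reserved or
 * allowed ranges, and a byte-granular iova_alignment until the first domain or
 * access tightens it.
 */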
void iopt_init_table(struct io_pagetable *iopt)
{
        init_rwsem(&iopt->iova_rwsem);
        init_rwsem(&iopt->domains_rwsem);
        iopt->area_itree = RB_ROOT_CACHED;
        iopt->allowed_itree = RB_ROOT_CACHED;
        iopt->reserved_itree = RB_ROOT_CACHED;
        xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT);
        xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC);

        /*
         * iopt's start as SW tables that can use the entire size_t IOVA space
         * due to the use of size_t in the APIs. They have no alignment
         * restriction.
         */
        iopt->iova_alignment = 1;
}

void iopt_destroy_table(struct io_pagetable *iopt)
{
        struct interval_tree_node *node;

        if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
                iopt_remove_reserved_iova(iopt, NULL);

        while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0,
                                                ULONG_MAX))) {
                interval_tree_remove(node, &iopt->allowed_itree);
                kfree(container_of(node, struct iopt_allowed, node));
        }

        WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root));
        WARN_ON(!xa_empty(&iopt->domains));
        WARN_ON(!xa_empty(&iopt->access_list));
        WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root));
}

/**
 * iopt_unfill_domain() - Unfill a domain with PFNs
 * @iopt: io_pagetable to act on
 * @domain: domain to unfill
 *
 * This is used when removing a domain from the iopt. Every area in the iopt
 * will be unmapped from the domain. The domain must already be removed from
 * the domains xarray.
 */
static void iopt_unfill_domain(struct io_pagetable *iopt,
                               struct iommu_domain *domain)
{
        struct iopt_area *area;

        lockdep_assert_held(&iopt->iova_rwsem);
        lockdep_assert_held_write(&iopt->domains_rwsem);

        /*
         * Some other domain is holding all the pfns still, rapidly unmap this
         * domain.
         */
        if (iopt->next_domain_id != 0) {
                /* Pick an arbitrary remaining domain to act as storage */
                struct iommu_domain *storage_domain =
                        xa_load(&iopt->domains, 0);

                for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
                     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
                        struct iopt_pages *pages = area->pages;

                        if (!pages)
                                continue;

                        mutex_lock(&pages->mutex);
                        if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
                                WARN_ON(!area->storage_domain);
                        if (area->storage_domain == domain)
                                area->storage_domain = storage_domain;
                        mutex_unlock(&pages->mutex);

                        iopt_area_unmap_domain(area, domain);
                }
                return;
        }

        for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
             area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
                struct iopt_pages *pages = area->pages;

                if (!pages)
                        continue;

                mutex_lock(&pages->mutex);
                interval_tree_remove(&area->pages_node, &pages->domains_itree);
                WARN_ON(area->storage_domain != domain);
                area->storage_domain = NULL;
                iopt_area_unfill_domain(area, pages, domain);
                mutex_unlock(&pages->mutex);
        }
}

/**
 * iopt_fill_domain() - Fill a domain with PFNs
 * @iopt: io_pagetable to act on
 * @domain: domain to fill
 *
 * Fill the domain with PFNs from every area in the iopt. On failure the domain
 * is left unchanged.
 */
static int iopt_fill_domain(struct io_pagetable *iopt,
                            struct iommu_domain *domain)
{
        struct iopt_area *end_area;
        struct iopt_area *area;
        int rc;

        lockdep_assert_held(&iopt->iova_rwsem);
        lockdep_assert_held_write(&iopt->domains_rwsem);

        for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
             area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
                struct iopt_pages *pages = area->pages;

                if (!pages)
                        continue;

                mutex_lock(&pages->mutex);
                rc = iopt_area_fill_domain(area, domain);
                if (rc) {
                        mutex_unlock(&pages->mutex);
                        goto out_unfill;
                }
                if (!area->storage_domain) {
                        WARN_ON(iopt->next_domain_id != 0);
                        area->storage_domain = domain;
                        interval_tree_insert(&area->pages_node,
                                             &pages->domains_itree);
                }
                mutex_unlock(&pages->mutex);
        }
        return 0;

out_unfill:
        end_area = area;
        for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
             area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
                struct iopt_pages *pages = area->pages;

                if (area == end_area)
                        break;
                if (!pages)
                        continue;
                mutex_lock(&pages->mutex);
                if (iopt->next_domain_id == 0) {
                        interval_tree_remove(&area->pages_node,
                                             &pages->domains_itree);
                        area->storage_domain = NULL;
                }
                iopt_area_unfill_domain(area, pages, domain);
                mutex_unlock(&pages->mutex);
        }
        return rc;
}

/* All existing areas must conform to an increased page size */
static int iopt_check_iova_alignment(struct io_pagetable *iopt,
                                     unsigned long new_iova_alignment)
{
        unsigned long align_mask = new_iova_alignment - 1;
        struct iopt_area *area;

        lockdep_assert_held(&iopt->iova_rwsem);
        lockdep_assert_held(&iopt->domains_rwsem);

        for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
             area = iopt_area_iter_next(area, 0, ULONG_MAX))
                if ((iopt_area_iova(area) & align_mask) ||
                    (iopt_area_length(area) & align_mask) ||
                    (area->page_offset & align_mask))
                        return -EADDRINUSE;

        if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) {
                struct iommufd_access *access;
                unsigned long index;

                xa_for_each(&iopt->access_list, index, access)
                        if (WARN_ON(access->iova_alignment >
                                    new_iova_alignment))
                                return -EADDRINUSE;
        }
        return 0;
}

int iopt_table_add_domain(struct io_pagetable *iopt,
                          struct iommu_domain *domain)
{
        const struct iommu_domain_geometry *geometry = &domain->geometry;
        struct iommu_domain *iter_domain;
        unsigned int new_iova_alignment;
        unsigned long index;
        int rc;

        down_write(&iopt->domains_rwsem);
        down_write(&iopt->iova_rwsem);

        xa_for_each(&iopt->domains, index, iter_domain) {
                if (WARN_ON(iter_domain == domain)) {
                        rc = -EEXIST;
                        goto out_unlock;
                }
        }

        /*
         * The io page size drives the iova_alignment. Internally the iopt_pages
         * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE
         * objects into the iommu_domain.
         *
         * An iommu_domain must always be able to accept PAGE_SIZE to be
         * compatible as we can't guarantee higher contiguity.
         */
        new_iova_alignment = max_t(unsigned long,
                                   1UL << __ffs(domain->pgsize_bitmap),
                                   iopt->iova_alignment);
        if (new_iova_alignment > PAGE_SIZE) {
                rc = -EINVAL;
                goto out_unlock;
        }
        if (new_iova_alignment != iopt->iova_alignment) {
                rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
                if (rc)
                        goto out_unlock;
        }

        /* No area exists that is outside the allowed domain aperture */
        if (geometry->aperture_start != 0) {
                rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1,
                                       domain);
                if (rc)
                        goto out_reserved;
        }
        if (geometry->aperture_end != ULONG_MAX) {
                rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1,
                                       ULONG_MAX, domain);
                if (rc)
                        goto out_reserved;
        }

        rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL);
        if (rc)
                goto out_reserved;

        rc = iopt_fill_domain(iopt, domain);
        if (rc)
                goto out_release;

        iopt->iova_alignment = new_iova_alignment;
        xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL);
        iopt->next_domain_id++;
        up_write(&iopt->iova_rwsem);
        up_write(&iopt->domains_rwsem);
        return 0;
out_release:
        xa_release(&iopt->domains, iopt->next_domain_id);
out_reserved:
        __iopt_remove_reserved_iova(iopt, domain);
out_unlock:
        up_write(&iopt->iova_rwsem);
        up_write(&iopt->domains_rwsem);
        return rc;
}

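/*
 * Recompute iova_alignment from the remaining domains and accesses. The
 * alignment may only grow if every existing area still satisfies it.
 */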
static int iopt_calculate_iova_alignment(struct io_pagetable *iopt)
{
        unsigned long new_iova_alignment;
        struct iommufd_access *access;
        struct iommu_domain *domain;
        unsigned long index;

        lockdep_assert_held_write(&iopt->iova_rwsem);
        lockdep_assert_held(&iopt->domains_rwsem);

        /* See batch_iommu_map_small() */
        if (iopt->disable_large_pages)
                new_iova_alignment = PAGE_SIZE;
        else
                new_iova_alignment = 1;

        xa_for_each(&iopt->domains, index, domain)
                new_iova_alignment = max_t(unsigned long,
                                           1UL << __ffs(domain->pgsize_bitmap),
                                           new_iova_alignment);
        xa_for_each(&iopt->access_list, index, access)
                new_iova_alignment = max_t(unsigned long,
                                           access->iova_alignment,
                                           new_iova_alignment);

        if (new_iova_alignment > iopt->iova_alignment) {
                int rc;

                rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
                if (rc)
                        return rc;
        }
        iopt->iova_alignment = new_iova_alignment;
        return 0;
}

void iopt_table_remove_domain(struct io_pagetable *iopt,
                              struct iommu_domain *domain)
{
        struct iommu_domain *iter_domain = NULL;
        unsigned long index;

        down_write(&iopt->domains_rwsem);
        down_write(&iopt->iova_rwsem);

        xa_for_each(&iopt->domains, index, iter_domain)
                if (iter_domain == domain)
                        break;
        if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id)
                goto out_unlock;

        /*
         * Compress the xarray to keep it linear by swapping the entry to erase
         * with the tail entry and shrinking the tail.
         */
        iopt->next_domain_id--;
        iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id);
        if (index != iopt->next_domain_id)
                xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL);

        iopt_unfill_domain(iopt, domain);
        __iopt_remove_reserved_iova(iopt, domain);

        WARN_ON(iopt_calculate_iova_alignment(iopt));
out_unlock:
        up_write(&iopt->iova_rwsem);
        up_write(&iopt->domains_rwsem);
}

/**
 * iopt_area_split - Split an area into two parts at iova
 * @area: The area to split
 * @iova: Becomes the last of a new area
 *
 * This splits an area into two. It is part of the VFIO compatibility to allow
 * poking a hole in the mapping. The two areas continue to point at the same
 * iopt_pages, just with different starting bytes.
 */
static int iopt_area_split(struct iopt_area *area, unsigned long iova)
{
        unsigned long alignment = area->iopt->iova_alignment;
        unsigned long last_iova = iopt_area_last_iova(area);
        unsigned long start_iova = iopt_area_iova(area);
        unsigned long new_start = iova + 1;
        struct io_pagetable *iopt = area->iopt;
        struct iopt_pages *pages = area->pages;
        struct iopt_area *lhs;
        struct iopt_area *rhs;
        int rc;

        lockdep_assert_held_write(&iopt->iova_rwsem);

        if (iova == start_iova || iova == last_iova)
                return 0;

        if (!pages || area->prevent_access)
                return -EBUSY;

        if (new_start & (alignment - 1) ||
            iopt_area_start_byte(area, new_start) & (alignment - 1))
                return -EINVAL;

        lhs = iopt_area_alloc();
        if (!lhs)
                return -ENOMEM;

        rhs = iopt_area_alloc();
        if (!rhs) {
                rc = -ENOMEM;
                goto err_free_lhs;
        }

        mutex_lock(&pages->mutex);
        /*
         * Splitting is not permitted if an access exists, we don't track enough
         * information to split existing accesses.
         */
        if (area->num_accesses) {
                rc = -EINVAL;
                goto err_unlock;
        }

        /*
         * Splitting is not permitted if a domain could have been mapped with
         * huge pages.
         */
        if (area->storage_domain && !iopt->disable_large_pages) {
                rc = -EINVAL;
                goto err_unlock;
        }

        interval_tree_remove(&area->node, &iopt->area_itree);
        rc = iopt_insert_area(iopt, lhs, area->pages, start_iova,
                              iopt_area_start_byte(area, start_iova),
                              (new_start - 1) - start_iova + 1,
                              area->iommu_prot);
        if (WARN_ON(rc))
                goto err_insert;

        rc = iopt_insert_area(iopt, rhs, area->pages, new_start,
                              iopt_area_start_byte(area, new_start),
                              last_iova - new_start + 1, area->iommu_prot);
        if (WARN_ON(rc))
                goto err_remove_lhs;

        /*
         * If the original area has filled a domain, domains_itree has to be
         * updated with the new pages_nodes.
         */
        if (area->storage_domain) {
                interval_tree_remove(&area->pages_node, &pages->domains_itree);
                interval_tree_insert(&lhs->pages_node, &pages->domains_itree);
                interval_tree_insert(&rhs->pages_node, &pages->domains_itree);
        }

        lhs->storage_domain = area->storage_domain;
        lhs->pages = area->pages;
        rhs->storage_domain = area->storage_domain;
        rhs->pages = area->pages;
        kref_get(&rhs->pages->kref);
        kfree(area);
        mutex_unlock(&pages->mutex);

        /*
         * No change to domains or accesses because the pages hasn't been
         * unmapped.
         */
        return 0;

err_remove_lhs:
        interval_tree_remove(&lhs->node, &iopt->area_itree);
err_insert:
        interval_tree_insert(&area->node, &iopt->area_itree);
err_unlock:
        mutex_unlock(&pages->mutex);
        kfree(rhs);
err_free_lhs:
        kfree(lhs);
        return rc;
}

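/*
 * VFIO compatibility helper: split the area containing each IOVA in the iovas
 * array so that a later unmap can punch a hole at those boundaries.
 */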
int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas,
                  size_t num_iovas)
{
        int rc = 0;
        int i;

        down_write(&iopt->iova_rwsem);
        for (i = 0; i < num_iovas; i++) {
                struct iopt_area *area;

                area = iopt_area_iter_first(iopt, iovas[i], iovas[i]);
                if (!area)
                        continue;
                rc = iopt_area_split(area, iovas[i]);
                if (rc)
                        break;
        }
        up_write(&iopt->iova_rwsem);
        return rc;
}

void iopt_enable_large_pages(struct io_pagetable *iopt)
{
        int rc;

        down_write(&iopt->domains_rwsem);
        down_write(&iopt->iova_rwsem);
        WRITE_ONCE(iopt->disable_large_pages, false);
        rc = iopt_calculate_iova_alignment(iopt);
        WARN_ON(rc);
        up_write(&iopt->iova_rwsem);
        up_write(&iopt->domains_rwsem);
}

int iopt_disable_large_pages(struct io_pagetable *iopt)
{
        int rc = 0;

        down_write(&iopt->domains_rwsem);
        down_write(&iopt->iova_rwsem);
        if (iopt->disable_large_pages)
                goto out_unlock;

        /* Won't do it if domains already have pages mapped in them */
        if (!xa_empty(&iopt->domains) &&
            !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) {
                rc = -EINVAL;
                goto out_unlock;
        }

        WRITE_ONCE(iopt->disable_large_pages, true);
        rc = iopt_calculate_iova_alignment(iopt);
        if (rc)
                WRITE_ONCE(iopt->disable_large_pages, false);
out_unlock:
        up_write(&iopt->iova_rwsem);
        up_write(&iopt->domains_rwsem);
        return rc;
}

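/*
 * Register an in-kernel access with the io_pagetable. The access gets an id in
 * access_list and may raise the iova_alignment; registration fails if existing
 * areas cannot satisfy the new alignment.
 */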
int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access)
{
        u32 new_id;
        int rc;

        down_write(&iopt->domains_rwsem);
        down_write(&iopt->iova_rwsem);
        rc = xa_alloc(&iopt->access_list, &new_id, access, xa_limit_16b,
                      GFP_KERNEL_ACCOUNT);

        if (rc)
                goto out_unlock;

        rc = iopt_calculate_iova_alignment(iopt);
        if (rc) {
                xa_erase(&iopt->access_list, new_id);
                goto out_unlock;
        }
        access->iopt_access_list_id = new_id;

out_unlock:
        up_write(&iopt->iova_rwsem);
        up_write(&iopt->domains_rwsem);
        return rc;
}

void iopt_remove_access(struct io_pagetable *iopt,
                        struct iommufd_access *access,
                        u32 iopt_access_list_id)
{
        down_write(&iopt->domains_rwsem);
        down_write(&iopt->iova_rwsem);
        WARN_ON(xa_erase(&iopt->access_list, iopt_access_list_id) != access);
        WARN_ON(iopt_calculate_iova_alignment(iopt));
        up_write(&iopt->iova_rwsem);
        up_write(&iopt->domains_rwsem);
}

/* Narrow the valid_iova_itree to include reserved ranges from a device. */
int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt,
                                        struct device *dev,
                                        phys_addr_t *sw_msi_start)
{
        struct iommu_resv_region *resv;
        LIST_HEAD(resv_regions);
        unsigned int num_hw_msi = 0;
        unsigned int num_sw_msi = 0;
        int rc;

        if (iommufd_should_fail())
                return -EINVAL;

        down_write(&iopt->iova_rwsem);
        /* FIXME: drivers allocate memory but there is no failure propagated */
        iommu_get_resv_regions(dev, &resv_regions);

        list_for_each_entry(resv, &resv_regions, list) {
                if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
                        continue;

                if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
                        num_hw_msi++;
                if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
                        *sw_msi_start = resv->start;
                        num_sw_msi++;
                }

                rc = iopt_reserve_iova(iopt, resv->start,
                                       resv->length - 1 + resv->start, dev);
                if (rc)
                        goto out_reserved;
        }

        /* Drivers must offer sane combinations of regions */
        if (WARN_ON(num_sw_msi && num_hw_msi) || WARN_ON(num_sw_msi > 1)) {
                rc = -EINVAL;
                goto out_reserved;
        }

        rc = 0;
        goto out_free_resv;

out_reserved:
        __iopt_remove_reserved_iova(iopt, dev);
out_free_resv:
        iommu_put_resv_regions(dev, &resv_regions);
        up_write(&iopt->iova_rwsem);
        return rc;
}