// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
 *
 * The io_pagetable is the top of the datastructure that maps IOVAs to PFNs.
 * The PFNs can be placed into an iommu_domain, or returned to the caller as a
 * page list for access by an in-kernel user.
 *
 * The datastructure uses the iopt_pages to optimize the storage of the PFNs
 * between the domains and xarray.
 */
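/*
 * Rough picture of how the pieces used below fit together (informal sketch
 * for orientation only; io_pagetable.h holds the authoritative definitions):
 *
 *	io_pagetable
 *	    area_itree  --> iopt_area (one mapped IOVA range) --> iopt_pages
 *	    domains     --> iommu_domain(s), filled from every area's pages
 *	    reserved_itree / allowed_itree constrain where new areas may land
 */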
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/iommu.h>
#include <linux/iommufd.h>
#include <linux/lockdep.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <uapi/linux/iommufd.h>

#include "double_span.h"
#include "io_pagetable.h"
struct iopt_pages_list {
	struct iopt_pages *pages;
	struct iopt_area *area;
	struct list_head next;
	unsigned long start_byte;
	unsigned long length;
};
struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter,
					struct io_pagetable *iopt,
					unsigned long iova,
					unsigned long last_iova)
{
	lockdep_assert_held(&iopt->iova_rwsem);

	iter->cur_iova = iova;
	iter->last_iova = last_iova;
	iter->area = iopt_area_iter_first(iopt, iova, iova);
	if (!iter->area)
		return NULL;
	if (!iter->area->pages) {
		iter->area = NULL;
		return NULL;
	}
	return iter->area;
}
struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter)
{
	unsigned long last_iova;

	if (!iter->area)
		return NULL;
	last_iova = iopt_area_last_iova(iter->area);
	if (iter->last_iova <= last_iova)
		return NULL;

	iter->cur_iova = last_iova + 1;
	iter->area = iopt_area_iter_next(iter->area, iter->cur_iova,
					 iter->last_iova);
	if (!iter->area)
		return NULL;
	if (iter->cur_iova != iopt_area_iova(iter->area) ||
	    !iter->area->pages) {
		iter->area = NULL;
		return NULL;
	}
	return iter->area;
}
static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span,
				    unsigned long length,
				    unsigned long iova_alignment,
				    unsigned long page_offset)
{
	if (span->is_used || span->last_hole - span->start_hole < length - 1)
		return false;

	span->start_hole = ALIGN(span->start_hole, iova_alignment) |
			   page_offset;
	if (span->start_hole > span->last_hole ||
	    span->last_hole - span->start_hole < length - 1)
		return false;
	return true;
}
static bool __alloc_iova_check_used(struct interval_tree_span_iter *span,
				    unsigned long length,
				    unsigned long iova_alignment,
				    unsigned long page_offset)
{
	if (span->is_hole || span->last_used - span->start_used < length - 1)
		return false;

	span->start_used = ALIGN(span->start_used, iova_alignment) |
			   page_offset;
	if (span->start_used > span->last_used ||
	    span->last_used - span->start_used < length - 1)
		return false;
	return true;
}
/*
 * Automatically find a block of IOVA that is not being used and not reserved.
 * Does not return a 0 IOVA even if it is valid.
 */
static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
			   unsigned long addr, unsigned long length)
{
	unsigned long page_offset = addr % PAGE_SIZE;
	struct interval_tree_double_span_iter used_span;
	struct interval_tree_span_iter allowed_span;
	unsigned long max_alignment = PAGE_SIZE;
	unsigned long iova_alignment;

	lockdep_assert_held(&iopt->iova_rwsem);

	/* Protect roundup_pow_of_two() from overflow */
	if (length == 0 || length >= ULONG_MAX / 2)
		return -EOVERFLOW;

	/*
	 * Keep alignment present in addr when building the IOVA, which
	 * increases the chance we can map a THP.
	 */
	if (!addr)
		iova_alignment = roundup_pow_of_two(length);
	else
		iova_alignment = min_t(unsigned long,
				       roundup_pow_of_two(length),
				       1UL << __ffs64(addr));

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	max_alignment = HPAGE_SIZE;
#endif
	/* Protect against ALIGN() overflow */
	if (iova_alignment >= max_alignment)
		iova_alignment = max_alignment;

	if (iova_alignment < iopt->iova_alignment)
		return -EINVAL;

	interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree,
				    PAGE_SIZE, ULONG_MAX - PAGE_SIZE) {
		if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) {
			allowed_span.start_used = PAGE_SIZE;
			allowed_span.last_used = ULONG_MAX - PAGE_SIZE;
			allowed_span.is_hole = false;
		}

		if (!__alloc_iova_check_used(&allowed_span, length,
					     iova_alignment, page_offset))
			continue;

		interval_tree_for_each_double_span(
			&used_span, &iopt->reserved_itree, &iopt->area_itree,
			allowed_span.start_used, allowed_span.last_used) {
			if (!__alloc_iova_check_hole(&used_span, length,
						     iova_alignment,
						     page_offset))
				continue;

			*iova = used_span.start_hole;
			return 0;
		}
	}
	return -ENOSPC;
}
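/*
 * Worked example of the alignment logic above (illustrative numbers only):
 * mapping length = 2MiB at addr = 0x7f1234600000 gives
 * roundup_pow_of_two(length) == 2MiB and 1UL << __ffs64(addr) == 2MiB, so
 * iova_alignment is 2MiB (capped at HPAGE_SIZE). page_offset is 0, so the
 * chosen IOVA keeps the alignment a THP mapping needs.
 */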
static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova,
			   unsigned long length)
{
	unsigned long last;

	lockdep_assert_held(&iopt->iova_rwsem);

	if ((iova & (iopt->iova_alignment - 1)))
		return -EINVAL;

	if (check_add_overflow(iova, length - 1, &last))
		return -EOVERFLOW;

	/* No reserved IOVA intersects the range */
	if (iopt_reserved_iter_first(iopt, iova, last))
		return -EINVAL;

	/* Check that there is not already a mapping in the range */
	if (iopt_area_iter_first(iopt, iova, last))
		return -EEXIST;
	return 0;
}
/*
 * The area takes a slice of the pages from start_byte to start_byte + length
 */
static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area,
			    struct iopt_pages *pages, unsigned long iova,
			    unsigned long start_byte, unsigned long length,
			    int iommu_prot)
{
	lockdep_assert_held_write(&iopt->iova_rwsem);

	if ((iommu_prot & IOMMU_WRITE) && !pages->writable)
		return -EPERM;

	area->iommu_prot = iommu_prot;
	area->page_offset = start_byte % PAGE_SIZE;
	if (area->page_offset & (iopt->iova_alignment - 1))
		return -EINVAL;

	area->node.start = iova;
	if (check_add_overflow(iova, length - 1, &area->node.last))
		return -EOVERFLOW;

	area->pages_node.start = start_byte / PAGE_SIZE;
	if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
		return -EOVERFLOW;
	area->pages_node.last = area->pages_node.last / PAGE_SIZE;
	if (WARN_ON(area->pages_node.last >= pages->npages))
		return -EOVERFLOW;

	/*
	 * The area is inserted with a NULL pages indicating it is not fully
	 * initialized yet.
	 */
	area->iopt = iopt;
	interval_tree_insert(&area->node, &iopt->area_itree);
	return 0;
}
static struct iopt_area *iopt_area_alloc(void)
{
	struct iopt_area *area;

	area = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
	if (!area)
		return NULL;
	RB_CLEAR_NODE(&area->node.rb);
	RB_CLEAR_NODE(&area->pages_node.rb);
	return area;
}
static int iopt_alloc_area_pages(struct io_pagetable *iopt,
				 struct list_head *pages_list,
				 unsigned long length, unsigned long *dst_iova,
				 int iommu_prot, unsigned int flags)
{
	struct iopt_pages_list *elm;
	unsigned long start;
	unsigned long iova;
	int rc = 0;

	list_for_each_entry(elm, pages_list, next) {
		elm->area = iopt_area_alloc();
		if (!elm->area)
			return -ENOMEM;
	}

	down_write(&iopt->iova_rwsem);
	if ((length & (iopt->iova_alignment - 1)) || !length) {
		rc = -EINVAL;
		goto out_unlock;
	}

	if (flags & IOPT_ALLOC_IOVA) {
		/* Use the first entry to guess the ideal IOVA alignment */
		elm = list_first_entry(pages_list, struct iopt_pages_list,
				       next);
		switch (elm->pages->type) {
		case IOPT_ADDRESS_USER:
			start = elm->start_byte + (uintptr_t)elm->pages->uptr;
			break;
		case IOPT_ADDRESS_FILE:
			start = elm->start_byte + elm->pages->start;
			break;
		}
		rc = iopt_alloc_iova(iopt, dst_iova, start, length);
		if (rc)
			goto out_unlock;
		if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
		    WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) {
			rc = -EINVAL;
			goto out_unlock;
		}
	} else {
		rc = iopt_check_iova(iopt, *dst_iova, length);
		if (rc)
			goto out_unlock;
	}

	/*
	 * Areas are created with a NULL pages so that the IOVA space is
	 * reserved and we can unlock the iova_rwsem.
	 */
	iova = *dst_iova;
	list_for_each_entry(elm, pages_list, next) {
		rc = iopt_insert_area(iopt, elm->area, elm->pages, iova,
				      elm->start_byte, elm->length, iommu_prot);
		if (rc)
			goto out_unlock;
		iova += elm->length;
	}

out_unlock:
	up_write(&iopt->iova_rwsem);
	return rc;
}
static void iopt_abort_area(struct iopt_area *area)
{
	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
		WARN_ON(area->pages);
	if (area->iopt) {
		down_write(&area->iopt->iova_rwsem);
		interval_tree_remove(&area->node, &area->iopt->area_itree);
		up_write(&area->iopt->iova_rwsem);
	}
	kfree(area);
}
void iopt_free_pages_list(struct list_head *pages_list)
{
	struct iopt_pages_list *elm;

	while ((elm = list_first_entry_or_null(pages_list,
					       struct iopt_pages_list, next))) {
		if (elm->area)
			iopt_abort_area(elm->area);
		if (elm->pages)
			iopt_put_pages(elm->pages);
		list_del(&elm->next);
		kfree(elm);
	}
}
static int iopt_fill_domains_pages(struct list_head *pages_list)
{
	struct iopt_pages_list *undo_elm;
	struct iopt_pages_list *elm;
	int rc;

	list_for_each_entry(elm, pages_list, next) {
		rc = iopt_area_fill_domains(elm->area, elm->pages);
		if (rc)
			goto err_undo;
	}
	return 0;

err_undo:
	list_for_each_entry(undo_elm, pages_list, next) {
		if (undo_elm == elm)
			break;
		iopt_area_unfill_domains(undo_elm->area, undo_elm->pages);
	}
	return rc;
}
int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
		   unsigned long length, unsigned long *dst_iova,
		   int iommu_prot, unsigned int flags)
{
	struct iopt_pages_list *elm;
	int rc;

	rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova,
				   iommu_prot, flags);
	if (rc)
		return rc;

	down_read(&iopt->domains_rwsem);
	rc = iopt_fill_domains_pages(pages_list);
	if (rc)
		goto out_unlock_domains;

	down_write(&iopt->iova_rwsem);
	list_for_each_entry(elm, pages_list, next) {
		/*
		 * area->pages must be set inside the domains_rwsem to ensure
		 * any newly added domains will get filled. Moves the reference
		 * in from the list.
		 */
		elm->area->pages = elm->pages;
		elm->pages = NULL;
		elm->area = NULL;
	}
	up_write(&iopt->iova_rwsem);
out_unlock_domains:
	up_read(&iopt->domains_rwsem);
	return rc;
}
static int iopt_map_common(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
			   struct iopt_pages *pages, unsigned long *iova,
			   unsigned long length, unsigned long start_byte,
			   int iommu_prot, unsigned int flags)
{
	struct iopt_pages_list elm = {};
	LIST_HEAD(pages_list);
	int rc;

	elm.pages = pages;
	elm.start_byte = start_byte;
	if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
	    elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
		elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
	elm.length = length;
	list_add(&elm.next, &pages_list);

	rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
	if (rc) {
		if (elm.area)
			iopt_abort_area(elm.area);
		if (elm.pages)
			iopt_put_pages(elm.pages);
		return rc;
	}
	return 0;
}
/**
 * iopt_map_user_pages() - Map a user VA to an iova in the io page table
 * @ictx: iommufd_ctx the iopt is part of
 * @iopt: io_pagetable to act on
 * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
 *        the chosen iova on output. Otherwise is the iova to map to on input
 * @uptr: User VA to map
 * @length: Number of bytes to map
 * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
 * @flags: IOPT_ALLOC_IOVA or zero
 *
 * iova, uptr, and length must be aligned to iova_alignment. For domain backed
 * page tables this will pin the pages and load them into the domain at iova.
 * For non-domain page tables this will only setup a lazy reference and the
 * caller must use iopt_access_pages() to touch them.
 *
 * iopt_unmap_iova() must be called to undo this before the io_pagetable can be
 * destroyed.
 */
int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
			unsigned long *iova, void __user *uptr,
			unsigned long length, int iommu_prot,
			unsigned int flags)
{
	struct iopt_pages *pages;

	pages = iopt_alloc_user_pages(uptr, length, iommu_prot & IOMMU_WRITE);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	return iopt_map_common(ictx, iopt, pages, iova, length,
			       uptr - pages->uptr, iommu_prot, flags);
}
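/*
 * Typical call sequence (illustrative sketch only; "ictx", "iopt", "uptr" and
 * "length" are assumed to come from the caller, they are not defined here):
 *
 *	unsigned long iova;
 *	int rc;
 *
 *	rc = iopt_map_user_pages(ictx, iopt, &iova, uptr, length,
 *				 IOMMU_READ | IOMMU_WRITE, IOPT_ALLOC_IOVA);
 *	if (rc)
 *		return rc;
 *	... use the mapping, later tear it down with iopt_unmap_iova() ...
 */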
/**
 * iopt_map_file_pages() - Like iopt_map_user_pages, but map a file.
 * @ictx: iommufd_ctx the iopt is part of
 * @iopt: io_pagetable to act on
 * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
 *        the chosen iova on output. Otherwise is the iova to map to on input
 * @file: file to map
 * @start: map file starting at this byte offset
 * @length: Number of bytes to map
 * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
 * @flags: IOPT_ALLOC_IOVA or zero
 */
int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
			unsigned long *iova, struct file *file,
			unsigned long start, unsigned long length,
			int iommu_prot, unsigned int flags)
{
	struct iopt_pages *pages;

	pages = iopt_alloc_file_pages(file, start, length,
				      iommu_prot & IOMMU_WRITE);
	if (IS_ERR(pages))
		return PTR_ERR(pages);
	return iopt_map_common(ictx, iopt, pages, iova, length,
			       start - pages->start, iommu_prot, flags);
}
struct iova_bitmap_fn_arg {
	unsigned long flags;
	struct io_pagetable *iopt;
	struct iommu_domain *domain;
	struct iommu_dirty_bitmap *dirty;
};
static int __iommu_read_and_clear_dirty(struct iova_bitmap *bitmap,
					unsigned long iova, size_t length,
					void *opaque)
{
	struct iopt_area *area;
	struct iopt_area_contig_iter iter;
	struct iova_bitmap_fn_arg *arg = opaque;
	struct iommu_domain *domain = arg->domain;
	struct iommu_dirty_bitmap *dirty = arg->dirty;
	const struct iommu_dirty_ops *ops = domain->dirty_ops;
	unsigned long last_iova = iova + length - 1;
	unsigned long flags = arg->flags;
	int ret;

	iopt_for_each_contig_area(&iter, area, arg->iopt, iova, last_iova) {
		unsigned long last = min(last_iova, iopt_area_last_iova(area));

		ret = ops->read_and_clear_dirty(domain, iter.cur_iova,
						last - iter.cur_iova + 1, flags,
						dirty);
		if (ret)
			return ret;
	}

	if (!iopt_area_contig_done(&iter))
		return -EINVAL;
	return 0;
}
static int
iommu_read_and_clear_dirty(struct iommu_domain *domain,
			   struct io_pagetable *iopt, unsigned long flags,
			   struct iommu_hwpt_get_dirty_bitmap *bitmap)
{
	const struct iommu_dirty_ops *ops = domain->dirty_ops;
	struct iommu_iotlb_gather gather;
	struct iommu_dirty_bitmap dirty;
	struct iova_bitmap_fn_arg arg;
	struct iova_bitmap *iter;
	int ret = 0;

	if (!ops || !ops->read_and_clear_dirty)
		return -EOPNOTSUPP;

	iter = iova_bitmap_alloc(bitmap->iova, bitmap->length,
				 bitmap->page_size,
				 u64_to_user_ptr(bitmap->data));
	if (IS_ERR(iter))
		return -ENOMEM;

	iommu_dirty_bitmap_init(&dirty, iter, &gather);

	arg.flags = flags;
	arg.iopt = iopt;
	arg.domain = domain;
	arg.dirty = &dirty;
	iova_bitmap_for_each(iter, &arg, __iommu_read_and_clear_dirty);

	if (!(flags & IOMMU_DIRTY_NO_CLEAR))
		iommu_iotlb_sync(domain, &gather);

	iova_bitmap_free(iter);

	return ret;
}
int iommufd_check_iova_range(struct io_pagetable *iopt,
			     struct iommu_hwpt_get_dirty_bitmap *bitmap)
{
	size_t iommu_pgsize = iopt->iova_alignment;
	u64 last_iova;

	if (check_add_overflow(bitmap->iova, bitmap->length - 1, &last_iova))
		return -EOVERFLOW;

	if (bitmap->iova > ULONG_MAX || last_iova > ULONG_MAX)
		return -EOVERFLOW;

	if ((bitmap->iova & (iommu_pgsize - 1)) ||
	    ((last_iova + 1) & (iommu_pgsize - 1)))
		return -EINVAL;

	if (!bitmap->page_size)
		return -EINVAL;

	if ((bitmap->iova & (bitmap->page_size - 1)) ||
	    ((last_iova + 1) & (bitmap->page_size - 1)))
		return -EINVAL;

	return 0;
}
int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
				   struct iommu_domain *domain,
				   unsigned long flags,
				   struct iommu_hwpt_get_dirty_bitmap *bitmap)
{
	int ret;

	ret = iommufd_check_iova_range(iopt, bitmap);
	if (ret)
		return ret;

	down_read(&iopt->iova_rwsem);
	ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap);
	up_read(&iopt->iova_rwsem);

	return ret;
}
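/*
 * Hedged sketch of the dirty tracking flow implemented here (the "bitmap"
 * variable is assumed to be a caller-filled struct iommu_hwpt_get_dirty_bitmap
 * that passes iommufd_check_iova_range()): enable tracking once, then poll:
 *
 *	rc = iopt_set_dirty_tracking(iopt, domain, true);
 *	...
 *	rc = iopt_read_and_clear_dirty_data(iopt, domain, 0, &bitmap);
 */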
static int iopt_clear_dirty_data(struct io_pagetable *iopt,
				 struct iommu_domain *domain)
{
	const struct iommu_dirty_ops *ops = domain->dirty_ops;
	struct iommu_iotlb_gather gather;
	struct iommu_dirty_bitmap dirty;
	struct iopt_area *area;
	int ret = 0;

	lockdep_assert_held_read(&iopt->iova_rwsem);

	iommu_dirty_bitmap_init(&dirty, NULL, &gather);

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		if (!area->pages)
			continue;

		ret = ops->read_and_clear_dirty(domain, iopt_area_iova(area),
						iopt_area_length(area), 0,
						&dirty);
		if (ret)
			break;
	}

	iommu_iotlb_sync(domain, &gather);
	return ret;
}
int iopt_set_dirty_tracking(struct io_pagetable *iopt,
			    struct iommu_domain *domain, bool enable)
{
	const struct iommu_dirty_ops *ops = domain->dirty_ops;
	int ret = 0;

	if (!ops)
		return -EOPNOTSUPP;

	down_read(&iopt->iova_rwsem);

	/* Clear dirty bits from PTEs to ensure a clean snapshot */
	if (enable) {
		ret = iopt_clear_dirty_data(iopt, domain);
		if (ret)
			goto out_unlock;
	}

	ret = ops->set_dirty_tracking(domain, enable);

out_unlock:
	up_read(&iopt->iova_rwsem);
	return ret;
}
int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
		   unsigned long length, struct list_head *pages_list)
{
	struct iopt_area_contig_iter iter;
	unsigned long last_iova;
	struct iopt_area *area;
	int rc;

	if (!length)
		return -EINVAL;
	if (check_add_overflow(iova, length - 1, &last_iova))
		return -EOVERFLOW;

	down_read(&iopt->iova_rwsem);
	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
		struct iopt_pages_list *elm;
		unsigned long last = min(last_iova, iopt_area_last_iova(area));

		elm = kzalloc(sizeof(*elm), GFP_KERNEL_ACCOUNT);
		if (!elm) {
			rc = -ENOMEM;
			goto err_free;
		}
		elm->start_byte = iopt_area_start_byte(area, iter.cur_iova);
		elm->pages = area->pages;
		elm->length = (last - iter.cur_iova) + 1;
		kref_get(&elm->pages->kref);
		list_add_tail(&elm->next, pages_list);
	}
	if (!iopt_area_contig_done(&iter)) {
		rc = -ENOENT;
		goto err_free;
	}
	up_read(&iopt->iova_rwsem);
	return 0;
err_free:
	up_read(&iopt->iova_rwsem);
	iopt_free_pages_list(pages_list);
	return rc;
}
static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
				 unsigned long last, unsigned long *unmapped)
{
	struct iopt_area *area;
	unsigned long unmapped_bytes = 0;
	unsigned int tries = 0;
	int rc = -ENOENT;

	/*
	 * The domains_rwsem must be held in read mode any time any area->pages
	 * is NULL. This prevents domain attach/detach from running
	 * concurrently with cleaning up the area.
	 */
again:
	down_read(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	while ((area = iopt_area_iter_first(iopt, start, last))) {
		unsigned long area_last = iopt_area_last_iova(area);
		unsigned long area_first = iopt_area_iova(area);
		struct iopt_pages *pages;

		/* Userspace should not race map/unmap's of the same area */
		if (!area->pages) {
			rc = -EBUSY;
			goto out_unlock_iova;
		}

		if (area_first < start || area_last > last) {
			rc = -ENOENT;
			goto out_unlock_iova;
		}

		if (area_first != start)
			tries = 0;

		/*
		 * num_accesses writers must hold the iova_rwsem too, so we can
		 * safely read it under the write side of the iova_rwsem
		 * without the pages->mutex.
		 */
		if (area->num_accesses) {
			size_t length = iopt_area_length(area);

			start = area_first;
			area->prevent_access = true;
			up_write(&iopt->iova_rwsem);
			up_read(&iopt->domains_rwsem);

			iommufd_access_notify_unmap(iopt, area_first, length);
			/* Something is not responding to unmap requests. */
			tries++;
			if (WARN_ON(tries > 100))
				return -EDEADLOCK;
			goto again;
		}

		pages = area->pages;
		area->pages = NULL;
		up_write(&iopt->iova_rwsem);

		iopt_area_unfill_domains(area, pages);
		iopt_abort_area(area);
		iopt_put_pages(pages);

		unmapped_bytes += area_last - area_first + 1;

		down_write(&iopt->iova_rwsem);
	}
	if (unmapped_bytes)
		rc = 0;

out_unlock_iova:
	up_write(&iopt->iova_rwsem);
	up_read(&iopt->domains_rwsem);
	if (unmapped)
		*unmapped = unmapped_bytes;
	return rc;
}
/**
 * iopt_unmap_iova() - Remove a range of iova
 * @iopt: io_pagetable to act on
 * @iova: Starting iova to unmap
 * @length: Number of bytes to unmap
 * @unmapped: Return number of bytes unmapped
 *
 * The requested range must be a superset of existing ranges.
 * Splitting/truncating IOVA mappings is not allowed.
 */
int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
		    unsigned long length, unsigned long *unmapped)
{
	unsigned long iova_last;

	if (!length)
		return -EINVAL;

	if (check_add_overflow(iova, length - 1, &iova_last))
		return -EOVERFLOW;

	return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped);
}
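/*
 * Example of the superset rule above (illustrative numbers): if a single area
 * covers [0x1000, 0x4fff], then iopt_unmap_iova(iopt, 0x1000, 0x4000, &n)
 * removes it and returns n == 0x4000, while asking for only [0x2000, 0x2fff]
 * fails with -ENOENT because areas cannot be truncated here (see
 * iopt_cut_iova() for the VFIO-compat split path).
 */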
int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
{
	int rc;

	rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
	/* If the IOVAs are empty then unmap all succeeds */
	if (rc == -ENOENT)
		return 0;
	return rc;
}
/* The caller must always free all the nodes in the allowed_iova rb_root. */
int iopt_set_allow_iova(struct io_pagetable *iopt,
			struct rb_root_cached *allowed_iova)
{
	struct iopt_allowed *allowed;

	down_write(&iopt->iova_rwsem);
	swap(*allowed_iova, iopt->allowed_itree);

	for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed;
	     allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) {
		if (iopt_reserved_iter_first(iopt, allowed->node.start,
					     allowed->node.last)) {
			swap(*allowed_iova, iopt->allowed_itree);
			up_write(&iopt->iova_rwsem);
			return -EADDRINUSE;
		}
	}

	up_write(&iopt->iova_rwsem);
	return 0;
}
int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
		      unsigned long last, void *owner)
{
	struct iopt_reserved *reserved;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	if (iopt_area_iter_first(iopt, start, last) ||
	    iopt_allowed_iter_first(iopt, start, last))
		return -EADDRINUSE;

	reserved = kzalloc(sizeof(*reserved), GFP_KERNEL_ACCOUNT);
	if (!reserved)
		return -ENOMEM;
	reserved->node.start = start;
	reserved->node.last = last;
	reserved->owner = owner;
	interval_tree_insert(&reserved->node, &iopt->reserved_itree);
	return 0;
}
static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
{
	struct iopt_reserved *reserved, *next;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved;
	     reserved = next) {
		next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX);

		if (reserved->owner == owner) {
			interval_tree_remove(&reserved->node,
					     &iopt->reserved_itree);
			kfree(reserved);
		}
	}
}
void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
{
	down_write(&iopt->iova_rwsem);
	__iopt_remove_reserved_iova(iopt, owner);
	up_write(&iopt->iova_rwsem);
}
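/*
 * Illustrative use of the owner tag (sketch only; "my_owner" is any pointer
 * unique to the caller, e.g. a device or a domain, and is an assumption of
 * this example): reserve ranges under the iova_rwsem, and later drop just the
 * entries that were tagged with that owner:
 *
 *	down_write(&iopt->iova_rwsem);
 *	rc = iopt_reserve_iova(iopt, 0, SZ_1M - 1, my_owner);
 *	up_write(&iopt->iova_rwsem);
 *	...
 *	iopt_remove_reserved_iova(iopt, my_owner);
 */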
void iopt_init_table(struct io_pagetable *iopt)
{
	init_rwsem(&iopt->iova_rwsem);
	init_rwsem(&iopt->domains_rwsem);
	iopt->area_itree = RB_ROOT_CACHED;
	iopt->allowed_itree = RB_ROOT_CACHED;
	iopt->reserved_itree = RB_ROOT_CACHED;
	xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT);
	xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC);

	/*
	 * iopts start as SW tables that can use the entire size_t IOVA space
	 * due to the use of size_t in the APIs. They have no alignment
	 * restriction.
	 */
	iopt->iova_alignment = 1;
}
void iopt_destroy_table(struct io_pagetable *iopt)
{
	struct interval_tree_node *node;

	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
		iopt_remove_reserved_iova(iopt, NULL);

	while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0,
						ULONG_MAX))) {
		interval_tree_remove(node, &iopt->allowed_itree);
		kfree(container_of(node, struct iopt_allowed, node));
	}

	WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root));
	WARN_ON(!xa_empty(&iopt->domains));
	WARN_ON(!xa_empty(&iopt->access_list));
	WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root));
}
/**
 * iopt_unfill_domain() - Unfill a domain with PFNs
 * @iopt: io_pagetable to act on
 * @domain: domain to unfill
 *
 * This is used when removing a domain from the iopt. Every area in the iopt
 * will be unmapped from the domain. The domain must already be removed from
 * the domains xarray.
 */
static void iopt_unfill_domain(struct io_pagetable *iopt,
			       struct iommu_domain *domain)
{
	struct iopt_area *area;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held_write(&iopt->domains_rwsem);

	/*
	 * Some other domain is holding all the pfns still, rapidly unmap this
	 * domain.
	 */
	if (iopt->next_domain_id != 0) {
		/* Pick an arbitrary remaining domain to act as storage */
		struct iommu_domain *storage_domain =
			xa_load(&iopt->domains, 0);

		for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
		     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
			struct iopt_pages *pages = area->pages;

			if (!pages)
				continue;

			mutex_lock(&pages->mutex);
			if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
				WARN_ON(!area->storage_domain);
			if (area->storage_domain == domain)
				area->storage_domain = storage_domain;
			mutex_unlock(&pages->mutex);

			iopt_area_unmap_domain(area, domain);
		}
		return;
	}

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (!pages)
			continue;

		mutex_lock(&pages->mutex);
		interval_tree_remove(&area->pages_node, &pages->domains_itree);
		WARN_ON(area->storage_domain != domain);
		area->storage_domain = NULL;
		iopt_area_unfill_domain(area, pages, domain);
		mutex_unlock(&pages->mutex);
	}
}
/**
 * iopt_fill_domain() - Fill a domain with PFNs
 * @iopt: io_pagetable to act on
 * @domain: domain to fill
 *
 * Fill the domain with PFNs from every area in the iopt. On failure the domain
 * is left unchanged.
 */
static int iopt_fill_domain(struct io_pagetable *iopt,
			    struct iommu_domain *domain)
{
	struct iopt_area *end_area;
	struct iopt_area *area;
	int rc;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held_write(&iopt->domains_rwsem);

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (!pages)
			continue;

		mutex_lock(&pages->mutex);
		rc = iopt_area_fill_domain(area, domain);
		if (rc) {
			mutex_unlock(&pages->mutex);
			goto out_unfill;
		}
		if (!area->storage_domain) {
			WARN_ON(iopt->next_domain_id != 0);
			area->storage_domain = domain;
			interval_tree_insert(&area->pages_node,
					     &pages->domains_itree);
		}
		mutex_unlock(&pages->mutex);
	}
	return 0;

out_unfill:
	end_area = area;
	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (area == end_area)
			break;
		if (!pages)
			continue;
		mutex_lock(&pages->mutex);
		if (iopt->next_domain_id == 0) {
			interval_tree_remove(&area->pages_node,
					     &pages->domains_itree);
			area->storage_domain = NULL;
		}
		iopt_area_unfill_domain(area, pages, domain);
		mutex_unlock(&pages->mutex);
	}
	return rc;
}
/* All existing areas must conform to an increased page size */
static int iopt_check_iova_alignment(struct io_pagetable *iopt,
				     unsigned long new_iova_alignment)
{
	unsigned long align_mask = new_iova_alignment - 1;
	struct iopt_area *area;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held(&iopt->domains_rwsem);

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX))
		if ((iopt_area_iova(area) & align_mask) ||
		    (iopt_area_length(area) & align_mask) ||
		    (area->page_offset & align_mask))
			return -EADDRINUSE;

	if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) {
		struct iommufd_access *access;
		unsigned long index;

		xa_for_each(&iopt->access_list, index, access)
			if (WARN_ON(access->iova_alignment >
				    new_iova_alignment))
				return -EADDRINUSE;
	}
	return 0;
}
int iopt_table_add_domain(struct io_pagetable *iopt,
			  struct iommu_domain *domain)
{
	const struct iommu_domain_geometry *geometry = &domain->geometry;
	struct iommu_domain *iter_domain;
	unsigned int new_iova_alignment;
	unsigned long index;
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);

	xa_for_each(&iopt->domains, index, iter_domain) {
		if (WARN_ON(iter_domain == domain)) {
			rc = -EEXIST;
			goto out_unlock;
		}
	}

	/*
	 * The io page size drives the iova_alignment. Internally the iopt_pages
	 * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE
	 * objects into the iommu_domain.
	 *
	 * An iommu_domain must always be able to accept PAGE_SIZE to be
	 * compatible as we can't guarantee higher contiguity.
	 */
	new_iova_alignment = max_t(unsigned long,
				   1UL << __ffs(domain->pgsize_bitmap),
				   iopt->iova_alignment);
	if (new_iova_alignment > PAGE_SIZE) {
		rc = -EINVAL;
		goto out_unlock;
	}
	if (new_iova_alignment != iopt->iova_alignment) {
		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
		if (rc)
			goto out_unlock;
	}

	/* No area exists that is outside the allowed domain aperture */
	if (geometry->aperture_start != 0) {
		rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1,
				       domain);
		if (rc)
			goto out_reserved;
	}
	if (geometry->aperture_end != ULONG_MAX) {
		rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1,
				       ULONG_MAX, domain);
		if (rc)
			goto out_reserved;
	}

	rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL);
	if (rc)
		goto out_reserved;

	rc = iopt_fill_domain(iopt, domain);
	if (rc)
		goto out_release;

	iopt->iova_alignment = new_iova_alignment;
	xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL);
	iopt->next_domain_id++;
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return 0;
out_release:
	xa_release(&iopt->domains, iopt->next_domain_id);
out_reserved:
	__iopt_remove_reserved_iova(iopt, domain);
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return rc;
}
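/*
 * Worked example of the aperture handling above (illustrative numbers): a
 * domain with geometry [0x10000, 0xffffffff] causes [0, 0xffff] and
 * [0x100000000, ULONG_MAX] to be reserved with the domain as owner, so
 * iopt_alloc_iova() can never place an area outside the aperture; the
 * reservations are dropped by __iopt_remove_reserved_iova() when the domain
 * is removed.
 */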
static int iopt_calculate_iova_alignment(struct io_pagetable *iopt)
{
	unsigned long new_iova_alignment;
	struct iommufd_access *access;
	struct iommu_domain *domain;
	unsigned long index;

	lockdep_assert_held_write(&iopt->iova_rwsem);
	lockdep_assert_held(&iopt->domains_rwsem);

	/* See batch_iommu_map_small() */
	if (iopt->disable_large_pages)
		new_iova_alignment = PAGE_SIZE;
	else
		new_iova_alignment = 1;

	xa_for_each(&iopt->domains, index, domain)
		new_iova_alignment = max_t(unsigned long,
					   1UL << __ffs(domain->pgsize_bitmap),
					   new_iova_alignment);
	xa_for_each(&iopt->access_list, index, access)
		new_iova_alignment = max_t(unsigned long,
					   access->iova_alignment,
					   new_iova_alignment);

	if (new_iova_alignment > iopt->iova_alignment) {
		int rc;

		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
		if (rc)
			return rc;
	}
	iopt->iova_alignment = new_iova_alignment;
	return 0;
}
void iopt_table_remove_domain(struct io_pagetable *iopt,
			      struct iommu_domain *domain)
{
	struct iommu_domain *iter_domain = NULL;
	unsigned long index;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);

	xa_for_each(&iopt->domains, index, iter_domain)
		if (iter_domain == domain)
			break;
	if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id)
		goto out_unlock;

	/*
	 * Compress the xarray to keep it linear by swapping the entry to erase
	 * with the tail entry and shrinking the tail.
	 */
	iopt->next_domain_id--;
	iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id);
	if (index != iopt->next_domain_id)
		xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL);

	iopt_unfill_domain(iopt, domain);
	__iopt_remove_reserved_iova(iopt, domain);

	WARN_ON(iopt_calculate_iova_alignment(iopt));
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}
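/*
 * Worked example of the xarray compression above (illustrative): with domains
 * at indexes {0:A, 1:B, 2:C} and next_domain_id == 3, removing B decrements
 * next_domain_id to 2, erases index 2 (C) and stores C at index 1, leaving
 * the linear layout {0:A, 1:C}.
 */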
/**
 * iopt_area_split - Split an area into two parts at iova
 * @area: The area to split
 * @iova: Becomes the last of a new area
 *
 * This splits an area into two. It is part of the VFIO compatibility to allow
 * poking a hole in the mapping. The two areas continue to point at the same
 * iopt_pages, just with different starting bytes.
 */
static int iopt_area_split(struct iopt_area *area, unsigned long iova)
{
	unsigned long alignment = area->iopt->iova_alignment;
	unsigned long last_iova = iopt_area_last_iova(area);
	unsigned long start_iova = iopt_area_iova(area);
	unsigned long new_start = iova + 1;
	struct io_pagetable *iopt = area->iopt;
	struct iopt_pages *pages = area->pages;
	struct iopt_area *lhs;
	struct iopt_area *rhs;
	int rc;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	if (iova == start_iova || iova == last_iova)
		return 0;

	if (!pages || area->prevent_access)
		return -EBUSY;

	if (new_start & (alignment - 1) ||
	    iopt_area_start_byte(area, new_start) & (alignment - 1))
		return -EINVAL;

	lhs = iopt_area_alloc();
	if (!lhs)
		return -ENOMEM;

	rhs = iopt_area_alloc();
	if (!rhs) {
		rc = -ENOMEM;
		goto err_free_lhs;
	}

	mutex_lock(&pages->mutex);
	/*
	 * Splitting is not permitted if an access exists, we don't track enough
	 * information to split existing accesses.
	 */
	if (area->num_accesses) {
		rc = -EINVAL;
		goto err_unlock;
	}

	/*
	 * Splitting is not permitted if a domain could have been mapped with
	 * huge pages.
	 */
	if (area->storage_domain && !iopt->disable_large_pages) {
		rc = -EINVAL;
		goto err_unlock;
	}

	interval_tree_remove(&area->node, &iopt->area_itree);
	rc = iopt_insert_area(iopt, lhs, area->pages, start_iova,
			      iopt_area_start_byte(area, start_iova),
			      (new_start - 1) - start_iova + 1,
			      area->iommu_prot);
	if (WARN_ON(rc))
		goto err_insert;

	rc = iopt_insert_area(iopt, rhs, area->pages, new_start,
			      iopt_area_start_byte(area, new_start),
			      last_iova - new_start + 1, area->iommu_prot);
	if (WARN_ON(rc))
		goto err_remove_lhs;

	/*
	 * If the original area has filled a domain, domains_itree has to be
	 * updated.
	 */
	if (area->storage_domain) {
		interval_tree_remove(&area->pages_node, &pages->domains_itree);
		interval_tree_insert(&lhs->pages_node, &pages->domains_itree);
		interval_tree_insert(&rhs->pages_node, &pages->domains_itree);
	}

	lhs->storage_domain = area->storage_domain;
	lhs->pages = area->pages;
	rhs->storage_domain = area->storage_domain;
	rhs->pages = area->pages;
	kref_get(&rhs->pages->kref);
	kfree(area);
	mutex_unlock(&pages->mutex);

	/*
	 * No change to domains or accesses because the pages hasn't been
	 * unpinned or split.
	 */
	return 0;

err_remove_lhs:
	interval_tree_remove(&lhs->node, &iopt->area_itree);
err_insert:
	interval_tree_insert(&area->node, &iopt->area_itree);
err_unlock:
	mutex_unlock(&pages->mutex);
	kfree(rhs);
err_free_lhs:
	kfree(lhs);
	return rc;
}
int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas,
		  size_t num_iovas)
{
	int rc = 0;
	int i;

	down_write(&iopt->iova_rwsem);
	for (i = 0; i < num_iovas; i++) {
		struct iopt_area *area;

		area = iopt_area_iter_first(iopt, iovas[i], iovas[i]);
		if (!area)
			continue;
		rc = iopt_area_split(area, iovas[i]);
		if (rc)
			break;
	}
	up_write(&iopt->iova_rwsem);
	return rc;
}
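/*
 * Hedged sketch of a hole-punching caller (illustrative only; "hole_start"
 * and "hole_last" are assumptions of this example): split at both edges of
 * the hole, then unmap only the middle piece:
 *
 *	unsigned long iovas[] = { hole_start - 1, hole_last };
 *
 *	rc = iopt_cut_iova(iopt, iovas, ARRAY_SIZE(iovas));
 *	if (!rc)
 *		rc = iopt_unmap_iova(iopt, hole_start,
 *				     hole_last - hole_start + 1, NULL);
 */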
void iopt_enable_large_pages(struct io_pagetable *iopt)
{
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	WRITE_ONCE(iopt->disable_large_pages, false);
	rc = iopt_calculate_iova_alignment(iopt);
	WARN_ON(rc);
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}
int iopt_disable_large_pages(struct io_pagetable *iopt)
{
	int rc = 0;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	if (iopt->disable_large_pages)
		goto out_unlock;

	/* Won't do it if domains already have pages mapped in them */
	if (!xa_empty(&iopt->domains) &&
	    !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) {
		rc = -EINVAL;
		goto out_unlock;
	}

	WRITE_ONCE(iopt->disable_large_pages, true);
	rc = iopt_calculate_iova_alignment(iopt);
	if (rc)
		WRITE_ONCE(iopt->disable_large_pages, false);
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return rc;
}
int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access)
{
	u32 new_id;
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	rc = xa_alloc(&iopt->access_list, &new_id, access, xa_limit_16b,
		      GFP_KERNEL_ACCOUNT);
	if (rc)
		goto out_unlock;

	rc = iopt_calculate_iova_alignment(iopt);
	if (rc) {
		xa_erase(&iopt->access_list, new_id);
		goto out_unlock;
	}
	access->iopt_access_list_id = new_id;

out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return rc;
}
void iopt_remove_access(struct io_pagetable *iopt,
			struct iommufd_access *access,
			u32 iopt_access_list_id)
{
	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	WARN_ON(xa_erase(&iopt->access_list, iopt_access_list_id) != access);
	WARN_ON(iopt_calculate_iova_alignment(iopt));
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}
/* Narrow the valid_iova_itree to include reserved ranges from a device. */
int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt,
					struct device *dev,
					phys_addr_t *sw_msi_start)
{
	struct iommu_resv_region *resv;
	LIST_HEAD(resv_regions);
	unsigned int num_hw_msi = 0;
	unsigned int num_sw_msi = 0;
	int rc;

	if (iommufd_should_fail())
		return -EINVAL;

	down_write(&iopt->iova_rwsem);
	/* FIXME: drivers allocate memory but there is no failure propagated */
	iommu_get_resv_regions(dev, &resv_regions);

	list_for_each_entry(resv, &resv_regions, list) {
		if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
			continue;

		if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
			num_hw_msi++;
		if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
			*sw_msi_start = resv->start;
			num_sw_msi++;
		}

		rc = iopt_reserve_iova(iopt, resv->start,
				       resv->length - 1 + resv->start, dev);
		if (rc)
			goto out_reserved;
	}

	/* Drivers must offer sane combinations of regions */
	if (WARN_ON(num_sw_msi && num_hw_msi) || WARN_ON(num_sw_msi > 1)) {
		rc = -EINVAL;
		goto out_reserved;
	}

	rc = 0;
	goto out_free_resv;

out_reserved:
	__iopt_remove_reserved_iova(iopt, dev);
out_free_resv:
	iommu_put_resv_regions(dev, &resv_regions);
	up_write(&iopt->iova_rwsem);
	return rc;
}