// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
 */
#include <linux/file.h>
#include <linux/interval_tree.h>
#include <linux/iommu.h>
#include <linux/iommufd.h>
#include <uapi/linux/iommufd.h>

#include "io_pagetable.h"
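
/*
 * Object destructor for an IOAS: tear down every mapping and the io_pagetable
 * itself. A lingering -ENOENT from iopt_unmap_all() is tolerated since the
 * table may already be empty.
 */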
void iommufd_ioas_destroy(struct iommufd_object *obj)
{
        struct iommufd_ioas *ioas = container_of(obj, struct iommufd_ioas, obj);
        int rc;

        rc = iopt_unmap_all(&ioas->iopt, NULL);
        WARN_ON(rc && rc != -ENOENT);
        iopt_destroy_table(&ioas->iopt);
        mutex_destroy(&ioas->mutex);
}

struct iommufd_ioas *iommufd_ioas_alloc(struct iommufd_ctx *ictx)
{
        struct iommufd_ioas *ioas;

        ioas = iommufd_object_alloc(ictx, ioas, IOMMUFD_OBJ_IOAS);
        if (IS_ERR(ioas))
                return ioas;

        iopt_init_table(&ioas->iopt);
        INIT_LIST_HEAD(&ioas->hwpt_list);
        mutex_init(&ioas->mutex);
        return ioas;
}
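
/*
 * Handler for the IOMMU_IOAS_ALLOC ioctl: create an IOAS, report its ID back
 * to userspace, and only then finalize the object. ioas_creation_lock is held
 * for read so finalization cannot race with a holder of the write side (see
 * iommufd_take_all_iova_rwsem() below).
 */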
int iommufd_ioas_alloc_ioctl(struct iommufd_ucmd *ucmd)
{
        struct iommu_ioas_alloc *cmd = ucmd->cmd;
        struct iommufd_ioas *ioas;
        int rc;

        if (cmd->flags)
                return -EOPNOTSUPP;

        ioas = iommufd_ioas_alloc(ucmd->ictx);
        if (IS_ERR(ioas))
                return PTR_ERR(ioas);

        cmd->out_ioas_id = ioas->obj.id;
        rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
        if (rc)
                goto out_table;

        down_read(&ucmd->ictx->ioas_creation_lock);
        iommufd_object_finalize(ucmd->ictx, &ioas->obj);
        up_read(&ucmd->ictx->ioas_creation_lock);
        return 0;

out_table:
        iommufd_object_abort_and_destroy(ucmd->ictx, &ioas->obj);
        return rc;
}
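
/*
 * Handler for IOMMU_IOAS_IOVA_RANGES: walk the holes in the reserved interval
 * tree and copy up to cmd->num_iovas usable IOVA ranges to userspace. The
 * total count is always reported, so a too-small array results in -EMSGSIZE
 * and userspace can retry with a larger buffer.
 */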
int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd)
{
        struct iommu_iova_range __user *ranges;
        struct iommu_ioas_iova_ranges *cmd = ucmd->cmd;
        struct iommufd_ioas *ioas;
        struct interval_tree_span_iter span;
        u32 max_iovas;
        int rc;

        if (cmd->__reserved)
                return -EOPNOTSUPP;

        ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id);
        if (IS_ERR(ioas))
                return PTR_ERR(ioas);

        down_read(&ioas->iopt.iova_rwsem);
        max_iovas = cmd->num_iovas;
        ranges = u64_to_user_ptr(cmd->allowed_iovas);
        cmd->num_iovas = 0;
        cmd->out_iova_alignment = ioas->iopt.iova_alignment;
        interval_tree_for_each_span(&span, &ioas->iopt.reserved_itree, 0,
                                    ULONG_MAX) {
                if (!span.is_hole)
                        continue;
                if (cmd->num_iovas < max_iovas) {
                        struct iommu_iova_range elm = {
                                .start = span.start_hole,
                                .last = span.last_hole,
                        };

                        if (copy_to_user(&ranges[cmd->num_iovas], &elm,
                                         sizeof(elm))) {
                                rc = -EFAULT;
                                goto out_put;
                        }
                }
                cmd->num_iovas++;
        }
        rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
        if (rc)
                goto out_put;
        if (cmd->num_iovas > max_iovas)
                rc = -EMSGSIZE;
out_put:
        up_read(&ioas->iopt.iova_rwsem);
        iommufd_put_object(ucmd->ictx, &ioas->obj);
        return rc;
}
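
/*
 * Copy an array of iommu_iova_range from userspace into an interval tree,
 * rejecting malformed or overlapping ranges. Used by
 * iommufd_ioas_allow_iovas() to stage the candidate allowed tree.
 */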
static int iommufd_ioas_load_iovas(struct rb_root_cached *itree,
                                   struct iommu_iova_range __user *ranges,
                                   u32 num)
{
        u32 i;

        for (i = 0; i != num; i++) {
                struct iommu_iova_range range;
                struct iopt_allowed *allowed;

                if (copy_from_user(&range, ranges + i, sizeof(range)))
                        return -EFAULT;

                if (range.start >= range.last)
                        return -EINVAL;

                if (interval_tree_iter_first(itree, range.start, range.last))
                        return -EINVAL;

                allowed = kzalloc(sizeof(*allowed), GFP_KERNEL_ACCOUNT);
                if (!allowed)
                        return -ENOMEM;
                allowed->node.start = range.start;
                allowed->node.last = range.last;

                interval_tree_insert(&allowed->node, itree);
        }
        return 0;
}
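
/*
 * Handler for IOMMU_IOAS_ALLOW_IOVAS: restrict future IOVA allocations to the
 * given ranges. The candidate tree is built first and handed to
 * iopt_set_allow_iova() so the switch-over is atomic; whichever tree loses is
 * freed below.
 */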
int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd)
{
        struct iommu_ioas_allow_iovas *cmd = ucmd->cmd;
        struct rb_root_cached allowed_iova = RB_ROOT_CACHED;
        struct interval_tree_node *node;
        struct iommufd_ioas *ioas;
        struct io_pagetable *iopt;
        int rc = 0;

        if (cmd->__reserved)
                return -EOPNOTSUPP;

        ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id);
        if (IS_ERR(ioas))
                return PTR_ERR(ioas);
        iopt = &ioas->iopt;

        rc = iommufd_ioas_load_iovas(&allowed_iova,
                                     u64_to_user_ptr(cmd->allowed_iovas),
                                     cmd->num_iovas);
        if (rc)
                goto out_free;

        /*
         * We want the allowed tree update to be atomic, so we have to keep the
         * original nodes around, and keep track of the new nodes as we allocate
         * memory for them. The simplest solution is to have a new/old tree and
         * then swap new for old. On success we free the old tree, on failure we
         * free the new tree.
         */
        rc = iopt_set_allow_iova(iopt, &allowed_iova);
out_free:
        while ((node = interval_tree_iter_first(&allowed_iova, 0, ULONG_MAX))) {
                interval_tree_remove(node, &allowed_iova);
                kfree(container_of(node, struct iopt_allowed, node));
        }
        iommufd_put_object(ucmd->ictx, &ioas->obj);
        return rc;
}
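
/* Translate IOMMU_IOAS_MAP_* flags into IOMMU_READ/IOMMU_WRITE protections. */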
static int conv_iommu_prot(u32 map_flags)
{
        /*
         * We provide no manual cache coherency ioctls to userspace and most
         * architectures make the CPU ops for cache flushing privileged.
         * Therefore we require the underlying IOMMU to support CPU coherent
         * operation. Support for IOMMU_CACHE is enforced by the
         * IOMMU_CAP_CACHE_COHERENCY test during bind.
         */
        int iommu_prot = IOMMU_CACHE;

        if (map_flags & IOMMU_IOAS_MAP_WRITEABLE)
                iommu_prot |= IOMMU_WRITE;
        if (map_flags & IOMMU_IOAS_MAP_READABLE)
                iommu_prot |= IOMMU_READ;
        return iommu_prot;
}
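
/*
 * Handler for IOMMU_IOAS_MAP_FILE: map a range of a file (by fd) into the
 * IOAS, either at a fixed IOVA or at an allocated one, and return the
 * resulting IOVA to userspace.
 */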
int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd)
{
        struct iommu_ioas_map_file *cmd = ucmd->cmd;
        unsigned long iova = cmd->iova;
        struct iommufd_ioas *ioas;
        unsigned int flags = 0;
        struct file *file;
        int rc;

        if (cmd->flags &
            ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE |
              IOMMU_IOAS_MAP_READABLE))
                return -EOPNOTSUPP;

        if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX)
                return -EOVERFLOW;

        if (!(cmd->flags &
              (IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE)))
                return -EINVAL;

        ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id);
        if (IS_ERR(ioas))
                return PTR_ERR(ioas);

        if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA))
                flags = IOPT_ALLOC_IOVA;

        file = fget(cmd->fd);
        if (!file) {
                rc = -EBADF;
                goto out_put;
        }

        rc = iopt_map_file_pages(ucmd->ictx, &ioas->iopt, &iova, file,
                                 cmd->start, cmd->length,
                                 conv_iommu_prot(cmd->flags), flags);
        if (rc)
                goto out_fput;

        cmd->iova = iova;
        rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
out_fput:
        fput(file);
out_put:
        iommufd_put_object(ucmd->ictx, &ioas->obj);
        return rc;
}
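
/*
 * Handler for IOMMU_IOAS_MAP: pin the user VA range and map it into the IOAS,
 * honouring IOMMU_IOAS_MAP_FIXED_IOVA or allocating an IOVA when it is absent.
 */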
int iommufd_ioas_map(struct iommufd_ucmd *ucmd)
{
        struct iommu_ioas_map *cmd = ucmd->cmd;
        unsigned long iova = cmd->iova;
        struct iommufd_ioas *ioas;
        unsigned int flags = 0;
        int rc;

        if ((cmd->flags &
             ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE |
               IOMMU_IOAS_MAP_READABLE)) ||
            cmd->__reserved)
                return -EOPNOTSUPP;
        if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX)
                return -EOVERFLOW;

        if (!(cmd->flags &
              (IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE)))
                return -EINVAL;

        ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id);
        if (IS_ERR(ioas))
                return PTR_ERR(ioas);

        if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA))
                flags = IOPT_ALLOC_IOVA;
        rc = iopt_map_user_pages(ucmd->ictx, &ioas->iopt, &iova,
                                 u64_to_user_ptr(cmd->user_va), cmd->length,
                                 conv_iommu_prot(cmd->flags), flags);
        if (rc)
                goto out_put;

        cmd->iova = iova;
        rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
out_put:
        iommufd_put_object(ucmd->ictx, &ioas->obj);
        return rc;
}
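
/*
 * Handler for IOMMU_IOAS_COPY: reuse the already-pinned pages backing a range
 * of the source IOAS to create an equivalent mapping in the destination IOAS,
 * avoiding a second pin of the same memory.
 */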
int iommufd_ioas_copy(struct iommufd_ucmd *ucmd)
{
        struct iommu_ioas_copy *cmd = ucmd->cmd;
        struct iommufd_ioas *src_ioas;
        struct iommufd_ioas *dst_ioas;
        unsigned int flags = 0;
        LIST_HEAD(pages_list);
        unsigned long iova;
        int rc;

        iommufd_test_syz_conv_iova_id(ucmd, cmd->src_ioas_id, &cmd->src_iova,
                                      &cmd->flags);

        if ((cmd->flags &
             ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE |
               IOMMU_IOAS_MAP_READABLE)))
                return -EOPNOTSUPP;
        if (cmd->length >= ULONG_MAX || cmd->src_iova >= ULONG_MAX ||
            cmd->dst_iova >= ULONG_MAX)
                return -EOVERFLOW;

        if (!(cmd->flags &
              (IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE)))
                return -EINVAL;

        src_ioas = iommufd_get_ioas(ucmd->ictx, cmd->src_ioas_id);
        if (IS_ERR(src_ioas))
                return PTR_ERR(src_ioas);
        rc = iopt_get_pages(&src_ioas->iopt, cmd->src_iova, cmd->length,
                            &pages_list);
        iommufd_put_object(ucmd->ictx, &src_ioas->obj);
        if (rc)
                return rc;

        dst_ioas = iommufd_get_ioas(ucmd->ictx, cmd->dst_ioas_id);
        if (IS_ERR(dst_ioas)) {
                rc = PTR_ERR(dst_ioas);
                goto out_pages;
        }

        if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA))
                flags = IOPT_ALLOC_IOVA;
        iova = cmd->dst_iova;
        rc = iopt_map_pages(&dst_ioas->iopt, &pages_list, cmd->length, &iova,
                            conv_iommu_prot(cmd->flags), flags);
        if (rc)
                goto out_put_dst;

        cmd->dst_iova = iova;
        rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
out_put_dst:
        iommufd_put_object(ucmd->ictx, &dst_ioas->obj);
out_pages:
        iopt_free_pages_list(&pages_list);
        return rc;
}
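
/*
 * Handler for IOMMU_IOAS_UNMAP: iova == 0 with length == U64_MAX means unmap
 * everything, otherwise unmap exactly the given range. The number of bytes
 * unmapped is returned in cmd->length.
 */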
int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd)
{
        struct iommu_ioas_unmap *cmd = ucmd->cmd;
        struct iommufd_ioas *ioas;
        unsigned long unmapped = 0;
        int rc;

        ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id);
        if (IS_ERR(ioas))
                return PTR_ERR(ioas);

        if (cmd->iova == 0 && cmd->length == U64_MAX) {
                rc = iopt_unmap_all(&ioas->iopt, &unmapped);
                if (rc)
                        goto out_put;
        } else {
                if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX) {
                        rc = -EOVERFLOW;
                        goto out_put;
                }
                rc = iopt_unmap_iova(&ioas->iopt, cmd->iova, cmd->length,
                                     &unmapped);
                if (rc)
                        goto out_put;
        }

        cmd->length = unmapped;
        rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));

out_put:
        iommufd_put_object(ucmd->ictx, &ioas->obj);
        return rc;
}
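
/*
 * Helpers for IOMMU_IOAS_CHANGE_PROCESS: take (and later release) the write
 * side of every IOAS's iova_rwsem so pages->source_* can be changed safely.
 */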
static void iommufd_release_all_iova_rwsem(struct iommufd_ctx *ictx,
                                           struct xarray *ioas_list)
{
        struct iommufd_ioas *ioas;
        unsigned long index;

        xa_for_each(ioas_list, index, ioas) {
                up_write(&ioas->iopt.iova_rwsem);
                refcount_dec(&ioas->obj.users);
        }
        up_write(&ictx->ioas_creation_lock);
        xa_destroy(ioas_list);
}

static int iommufd_take_all_iova_rwsem(struct iommufd_ctx *ictx,
                                       struct xarray *ioas_list)
{
        struct iommufd_object *obj;
        unsigned long index;
        int rc;

        /*
         * This is very ugly, it is done instead of adding a lock around
         * pages->source_mm, which is a performance path for mdev, we just
         * obtain the write side of all the iova_rwsems which also protects the
         * pages->source_*. Due to copies we can't know which IOAS could read
         * from the pages, so we just lock everything. This is the only place
         * locks are nested and they are uniformly taken in ID order.
         *
         * ioas_creation_lock prevents new IOAS from being installed in the
         * xarray while we do this, and also prevents more than one thread from
         * holding nested locks.
         */
        down_write(&ictx->ioas_creation_lock);
        xa_lock(&ictx->objects);
        xa_for_each(&ictx->objects, index, obj) {
                struct iommufd_ioas *ioas;

                if (!obj || obj->type != IOMMUFD_OBJ_IOAS)
                        continue;

                if (!refcount_inc_not_zero(&obj->users))
                        continue;

                xa_unlock(&ictx->objects);

                ioas = container_of(obj, struct iommufd_ioas, obj);
                down_write_nest_lock(&ioas->iopt.iova_rwsem,
                                     &ictx->ioas_creation_lock);

                rc = xa_err(xa_store(ioas_list, index, ioas, GFP_KERNEL));
                if (rc) {
                        iommufd_release_all_iova_rwsem(ictx, ioas_list);
                        return rc;
                }

                xa_lock(&ictx->objects);
        }
        xa_unlock(&ictx->objects);
        return 0;
}
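
/*
 * Pinned-page accounting helpers for IOMMU_IOAS_CHANGE_PROCESS: decide whether
 * a charge must move to the current process, charge the current mm/user, and
 * retarget an iopt_pages at the current task.
 */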
static bool need_charge_update(struct iopt_pages *pages)
{
        switch (pages->account_mode) {
        case IOPT_PAGES_ACCOUNT_NONE:
                return false;
        case IOPT_PAGES_ACCOUNT_MM:
                return pages->source_mm != current->mm;
        case IOPT_PAGES_ACCOUNT_USER:
                /*
                 * Update when mm changes because it also accounts
                 * in mm->pinned_vm.
                 */
                return (pages->source_user != current_user()) ||
                       (pages->source_mm != current->mm);
        }
        return true;
}

static int charge_current(unsigned long *npinned)
{
        struct iopt_pages tmp = {
                .source_mm = current->mm,
                .source_task = current->group_leader,
                .source_user = current_user(),
        };
        unsigned int account_mode;
        int rc;

        for (account_mode = 0; account_mode != IOPT_PAGES_ACCOUNT_MODE_NUM;
             account_mode++) {
                if (!npinned[account_mode])
                        continue;

                tmp.account_mode = account_mode;
                rc = iopt_pages_update_pinned(&tmp, npinned[account_mode], true,
                                              NULL);
                if (rc)
                        goto err_undo;
        }
        return 0;

err_undo:
        while (account_mode != 0) {
                account_mode--;
                if (!npinned[account_mode])
                        continue;
                tmp.account_mode = account_mode;
                iopt_pages_update_pinned(&tmp, npinned[account_mode], false,
                                         NULL);
        }
        return rc;
}

static void change_mm(struct iopt_pages *pages)
{
        struct task_struct *old_task = pages->source_task;
        struct user_struct *old_user = pages->source_user;
        struct mm_struct *old_mm = pages->source_mm;

        pages->source_mm = current->mm;
        mmgrab(pages->source_mm);
        mmdrop(old_mm);

        pages->source_task = current->group_leader;
        get_task_struct(pages->source_task);
        put_task_struct(old_task);

        pages->source_user = get_uid(current_user());
        free_uid(old_user);
}

#define for_each_ioas_area(_xa, _index, _ioas, _area) \
        xa_for_each((_xa), (_index), (_ioas)) \
                for (_area = iopt_area_iter_first(&_ioas->iopt, 0, ULONG_MAX); \
                     _area; \
                     _area = iopt_area_iter_next(_area, 0, ULONG_MAX))
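
/*
 * Handler for IOMMU_IOAS_CHANGE_PROCESS: move the pinned-page accounting of
 * every file-backed mapping from the process that created it to the current
 * process. Only IOPT_ADDRESS_FILE areas are supported.
 */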
int iommufd_ioas_change_process(struct iommufd_ucmd *ucmd)
{
        struct iommu_ioas_change_process *cmd = ucmd->cmd;
        struct iommufd_ctx *ictx = ucmd->ictx;
        unsigned long all_npinned[IOPT_PAGES_ACCOUNT_MODE_NUM] = {};
        struct iommufd_ioas *ioas;
        struct iopt_area *area;
        struct iopt_pages *pages;
        struct xarray ioas_list;
        unsigned long index;
        int rc;

        if (cmd->__reserved)
                return -EOPNOTSUPP;

        xa_init(&ioas_list);
        rc = iommufd_take_all_iova_rwsem(ictx, &ioas_list);
        if (rc)
                return rc;

        for_each_ioas_area(&ioas_list, index, ioas, area) {
                if (area->pages->type != IOPT_ADDRESS_FILE) {
                        rc = -EINVAL;
                        goto out;
                }
        }

        /*
         * Count last_pinned pages, then clear it to avoid double counting
         * if the same iopt_pages is visited multiple times in this loop.
         * Since we are under all the locks, npinned == last_npinned, so we
         * can easily restore last_npinned before we return.
         */
        for_each_ioas_area(&ioas_list, index, ioas, area) {
                pages = area->pages;

                if (need_charge_update(pages)) {
                        all_npinned[pages->account_mode] += pages->last_npinned;
                        pages->last_npinned = 0;
                }
        }

        rc = charge_current(all_npinned);
        if (rc) {
                /* Charge failed. Fix last_npinned and bail. */
                for_each_ioas_area(&ioas_list, index, ioas, area)
                        area->pages->last_npinned = area->pages->npinned;
                goto out;
        }

        for_each_ioas_area(&ioas_list, index, ioas, area) {
                pages = area->pages;

                /* Uncharge the old one (which also restores last_npinned) */
                if (need_charge_update(pages)) {
                        int r = iopt_pages_update_pinned(pages, pages->npinned,
                                                         false, NULL);

                        WARN_ON(r);
                        change_mm(pages);
                }
        }

out:
        iommufd_release_all_iova_rwsem(ictx, &ioas_list);
        return rc;
}
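
/*
 * IOMMU_OPTION_RLIMIT_MODE handler: select whether pinned pages are accounted
 * against the user (val64 == 0) or the mm (val64 == 1). Requires
 * CAP_SYS_RESOURCE and is only permitted before any objects exist.
 */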
int iommufd_option_rlimit_mode(struct iommu_option *cmd,
                               struct iommufd_ctx *ictx)
{
        if (cmd->object_id)
                return -EOPNOTSUPP;

        if (cmd->op == IOMMU_OPTION_OP_GET) {
                cmd->val64 = ictx->account_mode == IOPT_PAGES_ACCOUNT_MM;
                return 0;
        }
        if (cmd->op == IOMMU_OPTION_OP_SET) {
                int rc = 0;

                if (!capable(CAP_SYS_RESOURCE))
                        return -EPERM;

                xa_lock(&ictx->objects);
                if (!xa_empty(&ictx->objects)) {
                        rc = -EBUSY;
                } else {
                        if (cmd->val64 == 0)
                                ictx->account_mode = IOPT_PAGES_ACCOUNT_USER;
                        else if (cmd->val64 == 1)
                                ictx->account_mode = IOPT_PAGES_ACCOUNT_MM;
                        else
                                rc = -EINVAL;
                }
                xa_unlock(&ictx->objects);

                return rc;
        }
        return -EOPNOTSUPP;
}
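
/* IOMMU_OPTION_HUGE_PAGES handler: get or set large-page use for one IOAS. */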
static int iommufd_ioas_option_huge_pages(struct iommu_option *cmd,
                                          struct iommufd_ioas *ioas)
{
        if (cmd->op == IOMMU_OPTION_OP_GET) {
                cmd->val64 = !ioas->iopt.disable_large_pages;
                return 0;
        }
        if (cmd->op == IOMMU_OPTION_OP_SET) {
                if (cmd->val64 == 0)
                        return iopt_disable_large_pages(&ioas->iopt);
                if (cmd->val64 == 1) {
                        iopt_enable_large_pages(&ioas->iopt);
                        return 0;
                }
                return -EINVAL;
        }
        return -EOPNOTSUPP;
}
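
/*
 * Handler for IOMMU_OPTION against an IOAS object: currently only
 * IOMMU_OPTION_HUGE_PAGES is dispatched here.
 */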
int iommufd_ioas_option(struct iommufd_ucmd *ucmd)
{
        struct iommu_option *cmd = ucmd->cmd;
        struct iommufd_ioas *ioas;
        int rc = 0;

        if (cmd->__reserved)
                return -EOPNOTSUPP;

        ioas = iommufd_get_ioas(ucmd->ictx, cmd->object_id);
        if (IS_ERR(ioas))
                return PTR_ERR(ioas);

        switch (cmd->option_id) {
        case IOMMU_OPTION_HUGE_PAGES:
                rc = iommufd_ioas_option_huge_pages(cmd, ioas);
                break;
        default:
                rc = -EOPNOTSUPP;
        }

        iommufd_put_object(ucmd->ictx, &ioas->obj);
        return rc;
}
);