// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
 */
4 #include <linux/file.h>
5 #include <linux/interval_tree.h>
6 #include <linux/iommu.h>
7 #include <linux/iommufd.h>
8 #include <linux/slab.h>
9 #include <linux/vfio.h>
10 #include <uapi/linux/vfio.h>
11 #include <uapi/linux/iommufd.h>
13 #include "iommufd_private.h"
15 static struct iommufd_ioas
*get_compat_ioas(struct iommufd_ctx
*ictx
)
17 struct iommufd_ioas
*ioas
= ERR_PTR(-ENODEV
);
19 xa_lock(&ictx
->objects
);
20 if (!ictx
->vfio_ioas
|| !iommufd_lock_obj(&ictx
->vfio_ioas
->obj
))
22 ioas
= ictx
->vfio_ioas
;
24 xa_unlock(&ictx
->objects
);
29 * iommufd_vfio_compat_ioas_get_id - Ensure a compat IOAS exists
30 * @ictx: Context to operate on
31 * @out_ioas_id: The IOAS ID of the compatibility IOAS
33 * Return the ID of the current compatibility IOAS. The ID can be passed into
34 * other functions that take an ioas_id.
36 int iommufd_vfio_compat_ioas_get_id(struct iommufd_ctx
*ictx
, u32
*out_ioas_id
)
38 struct iommufd_ioas
*ioas
;
40 ioas
= get_compat_ioas(ictx
);
43 *out_ioas_id
= ioas
->obj
.id
;
44 iommufd_put_object(ictx
, &ioas
->obj
);
47 EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_get_id
, IOMMUFD_VFIO
);
50 * iommufd_vfio_compat_set_no_iommu - Called when a no-iommu device is attached
51 * @ictx: Context to operate on
53 * This allows selecting the VFIO_NOIOMMU_IOMMU and blocks normal types.
55 int iommufd_vfio_compat_set_no_iommu(struct iommufd_ctx
*ictx
)
59 xa_lock(&ictx
->objects
);
60 if (!ictx
->vfio_ioas
) {
61 ictx
->no_iommu_mode
= 1;
66 xa_unlock(&ictx
->objects
);
69 EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_set_no_iommu
, IOMMUFD_VFIO
);
72 * iommufd_vfio_compat_ioas_create - Ensure the compat IOAS is created
73 * @ictx: Context to operate on
75 * The compatibility IOAS is the IOAS that the vfio compatibility ioctls operate
76 * on since they do not have an IOAS ID input in their ABI. Only attaching a
77 * group should cause a default creation of the internal ioas, this does nothing
78 * if an existing ioas has already been assigned somehow.
80 int iommufd_vfio_compat_ioas_create(struct iommufd_ctx
*ictx
)
82 struct iommufd_ioas
*ioas
= NULL
;
85 ioas
= iommufd_ioas_alloc(ictx
);
89 xa_lock(&ictx
->objects
);
91 * VFIO won't allow attaching a container to both iommu and no iommu
94 if (ictx
->no_iommu_mode
) {
99 if (ictx
->vfio_ioas
&& iommufd_lock_obj(&ictx
->vfio_ioas
->obj
)) {
101 iommufd_put_object(ictx
, &ictx
->vfio_ioas
->obj
);
104 ictx
->vfio_ioas
= ioas
;
105 xa_unlock(&ictx
->objects
);
108 * An automatically created compat IOAS is treated as a userspace
109 * created object. Userspace can learn the ID via IOMMU_VFIO_IOAS_GET,
110 * and if not manually destroyed it will be destroyed automatically
111 * at iommufd release.
113 iommufd_object_finalize(ictx
, &ioas
->obj
);
117 xa_unlock(&ictx
->objects
);
118 iommufd_object_abort(ictx
, &ioas
->obj
);
121 EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_create
, IOMMUFD_VFIO
);
123 int iommufd_vfio_ioas(struct iommufd_ucmd
*ucmd
)
125 struct iommu_vfio_ioas
*cmd
= ucmd
->cmd
;
126 struct iommufd_ioas
*ioas
;
131 case IOMMU_VFIO_IOAS_GET
:
132 ioas
= get_compat_ioas(ucmd
->ictx
);
134 return PTR_ERR(ioas
);
135 cmd
->ioas_id
= ioas
->obj
.id
;
136 iommufd_put_object(ucmd
->ictx
, &ioas
->obj
);
137 return iommufd_ucmd_respond(ucmd
, sizeof(*cmd
));
139 case IOMMU_VFIO_IOAS_SET
:
140 ioas
= iommufd_get_ioas(ucmd
->ictx
, cmd
->ioas_id
);
142 return PTR_ERR(ioas
);
143 xa_lock(&ucmd
->ictx
->objects
);
144 ucmd
->ictx
->vfio_ioas
= ioas
;
145 xa_unlock(&ucmd
->ictx
->objects
);
146 iommufd_put_object(ucmd
->ictx
, &ioas
->obj
);
149 case IOMMU_VFIO_IOAS_CLEAR
:
150 xa_lock(&ucmd
->ictx
->objects
);
151 ucmd
->ictx
->vfio_ioas
= NULL
;
152 xa_unlock(&ucmd
->ictx
->objects
);
159 static int iommufd_vfio_map_dma(struct iommufd_ctx
*ictx
, unsigned int cmd
,
162 u32 supported_flags
= VFIO_DMA_MAP_FLAG_READ
| VFIO_DMA_MAP_FLAG_WRITE
;
163 size_t minsz
= offsetofend(struct vfio_iommu_type1_dma_map
, size
);
164 struct vfio_iommu_type1_dma_map map
;
165 int iommu_prot
= IOMMU_CACHE
;
166 struct iommufd_ioas
*ioas
;
170 if (copy_from_user(&map
, arg
, minsz
))
173 if (map
.argsz
< minsz
|| map
.flags
& ~supported_flags
)
176 if (map
.flags
& VFIO_DMA_MAP_FLAG_READ
)
177 iommu_prot
|= IOMMU_READ
;
178 if (map
.flags
& VFIO_DMA_MAP_FLAG_WRITE
)
179 iommu_prot
|= IOMMU_WRITE
;
181 ioas
= get_compat_ioas(ictx
);
183 return PTR_ERR(ioas
);
186 * Maps created through the legacy interface always use VFIO compatible
187 * rlimit accounting. If the user wishes to use the faster user based
188 * rlimit accounting then they must use the new interface.
191 rc
= iopt_map_user_pages(ictx
, &ioas
->iopt
, &iova
, u64_to_user_ptr(map
.vaddr
),
192 map
.size
, iommu_prot
, 0);
193 iommufd_put_object(ictx
, &ioas
->obj
);
197 static int iommufd_vfio_unmap_dma(struct iommufd_ctx
*ictx
, unsigned int cmd
,
200 size_t minsz
= offsetofend(struct vfio_iommu_type1_dma_unmap
, size
);
202 * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP is obsoleted by the new
203 * dirty tracking direction:
204 * https://lore.kernel.org/kvm/20220731125503.142683-1-yishaih@nvidia.com/
205 * https://lore.kernel.org/kvm/20220428210933.3583-1-joao.m.martins@oracle.com/
207 u32 supported_flags
= VFIO_DMA_UNMAP_FLAG_ALL
;
208 struct vfio_iommu_type1_dma_unmap unmap
;
209 unsigned long unmapped
= 0;
210 struct iommufd_ioas
*ioas
;
213 if (copy_from_user(&unmap
, arg
, minsz
))
216 if (unmap
.argsz
< minsz
|| unmap
.flags
& ~supported_flags
)
219 ioas
= get_compat_ioas(ictx
);
221 return PTR_ERR(ioas
);
223 if (unmap
.flags
& VFIO_DMA_UNMAP_FLAG_ALL
) {
224 if (unmap
.iova
!= 0 || unmap
.size
!= 0) {
228 rc
= iopt_unmap_all(&ioas
->iopt
, &unmapped
);
230 if (READ_ONCE(ioas
->iopt
.disable_large_pages
)) {
232 * Create cuts at the start and last of the requested
233 * range. If the start IOVA is 0 then it doesn't need to
236 unsigned long iovas
[] = { unmap
.iova
+ unmap
.size
- 1,
239 rc
= iopt_cut_iova(&ioas
->iopt
, iovas
,
244 rc
= iopt_unmap_iova(&ioas
->iopt
, unmap
.iova
, unmap
.size
,
247 unmap
.size
= unmapped
;
248 if (copy_to_user(arg
, &unmap
, minsz
))
252 iommufd_put_object(ictx
, &ioas
->obj
);
256 static int iommufd_vfio_cc_iommu(struct iommufd_ctx
*ictx
)
258 struct iommufd_hwpt_paging
*hwpt_paging
;
259 struct iommufd_ioas
*ioas
;
262 ioas
= get_compat_ioas(ictx
);
264 return PTR_ERR(ioas
);
266 mutex_lock(&ioas
->mutex
);
267 list_for_each_entry(hwpt_paging
, &ioas
->hwpt_list
, hwpt_item
) {
268 if (!hwpt_paging
->enforce_cache_coherency
) {
273 mutex_unlock(&ioas
->mutex
);
275 iommufd_put_object(ictx
, &ioas
->obj
);
279 static int iommufd_vfio_check_extension(struct iommufd_ctx
*ictx
,
283 case VFIO_TYPE1_IOMMU
:
284 case VFIO_TYPE1v2_IOMMU
:
288 case VFIO_NOIOMMU_IOMMU
:
289 return IS_ENABLED(CONFIG_VFIO_NOIOMMU
);
291 case VFIO_DMA_CC_IOMMU
:
292 return iommufd_vfio_cc_iommu(ictx
);
294 case __VFIO_RESERVED_TYPE1_NESTING_IOMMU
:
298 * VFIO_DMA_MAP_FLAG_VADDR
299 * https://lore.kernel.org/kvm/1611939252-7240-1-git-send-email-steven.sistare@oracle.com/
300 * https://lore.kernel.org/all/Yz777bJZjTyLrHEQ@nvidia.com/
302 * It is hard to see how this could be implemented safely.
304 case VFIO_UPDATE_VADDR
:
310 static int iommufd_vfio_set_iommu(struct iommufd_ctx
*ictx
, unsigned long type
)
312 bool no_iommu_mode
= READ_ONCE(ictx
->no_iommu_mode
);
313 struct iommufd_ioas
*ioas
= NULL
;
317 * Emulation for NOIOMMU is imperfect in that VFIO blocks almost all
318 * other ioctls. We let them keep working but they mostly fail since no
321 if (IS_ENABLED(CONFIG_VFIO_NOIOMMU
) && type
== VFIO_NOIOMMU_IOMMU
&&
323 if (!capable(CAP_SYS_RAWIO
))
328 if ((type
!= VFIO_TYPE1_IOMMU
&& type
!= VFIO_TYPE1v2_IOMMU
) ||
332 /* VFIO fails the set_iommu if there is no group */
333 ioas
= get_compat_ioas(ictx
);
335 return PTR_ERR(ioas
);
338 * The difference between TYPE1 and TYPE1v2 is the ability to unmap in
339 * the middle of mapped ranges. This is complicated by huge page support
340 * which creates single large IOPTEs that cannot be split by the iommu
341 * driver. TYPE1 is very old at this point and likely nothing uses it,
342 * however it is simple enough to emulate by simply disabling the
343 * problematic large IOPTEs. Then we can safely unmap within any range.
345 if (type
== VFIO_TYPE1_IOMMU
)
346 rc
= iopt_disable_large_pages(&ioas
->iopt
);
347 iommufd_put_object(ictx
, &ioas
->obj
);
351 static unsigned long iommufd_get_pagesizes(struct iommufd_ioas
*ioas
)
353 struct io_pagetable
*iopt
= &ioas
->iopt
;
354 unsigned long pgsize_bitmap
= ULONG_MAX
;
355 struct iommu_domain
*domain
;
358 down_read(&iopt
->domains_rwsem
);
359 xa_for_each(&iopt
->domains
, index
, domain
)
360 pgsize_bitmap
&= domain
->pgsize_bitmap
;
362 /* See vfio_update_pgsize_bitmap() */
363 if (pgsize_bitmap
& ~PAGE_MASK
) {
364 pgsize_bitmap
&= PAGE_MASK
;
365 pgsize_bitmap
|= PAGE_SIZE
;
367 pgsize_bitmap
= max(pgsize_bitmap
, ioas
->iopt
.iova_alignment
);
368 up_read(&iopt
->domains_rwsem
);
369 return pgsize_bitmap
;
372 static int iommufd_fill_cap_iova(struct iommufd_ioas
*ioas
,
373 struct vfio_info_cap_header __user
*cur
,
376 struct vfio_iommu_type1_info_cap_iova_range __user
*ucap_iovas
=
378 struct vfio_iommu_type1_info_cap_iova_range __user
,
380 struct vfio_iommu_type1_info_cap_iova_range cap_iovas
= {
382 .id
= VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE
,
386 struct interval_tree_span_iter span
;
388 interval_tree_for_each_span(&span
, &ioas
->iopt
.reserved_itree
, 0,
390 struct vfio_iova_range range
;
394 range
.start
= span
.start_hole
;
395 range
.end
= span
.last_hole
;
396 if (avail
>= struct_size(&cap_iovas
, iova_ranges
,
397 cap_iovas
.nr_iovas
+ 1) &&
398 copy_to_user(&ucap_iovas
->iova_ranges
[cap_iovas
.nr_iovas
],
399 &range
, sizeof(range
)))
401 cap_iovas
.nr_iovas
++;
403 if (avail
>= struct_size(&cap_iovas
, iova_ranges
, cap_iovas
.nr_iovas
) &&
404 copy_to_user(ucap_iovas
, &cap_iovas
, sizeof(cap_iovas
)))
406 return struct_size(&cap_iovas
, iova_ranges
, cap_iovas
.nr_iovas
);
409 static int iommufd_fill_cap_dma_avail(struct iommufd_ioas
*ioas
,
410 struct vfio_info_cap_header __user
*cur
,
413 struct vfio_iommu_type1_info_dma_avail cap_dma
= {
415 .id
= VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL
,
419 * iommufd's limit is based on the cgroup's memory limit.
420 * Normally vfio would return U16_MAX here, and provide a module
421 * parameter to adjust it. Since S390 qemu userspace actually
422 * pays attention and needs a value bigger than U16_MAX return
428 if (avail
>= sizeof(cap_dma
) &&
429 copy_to_user(cur
, &cap_dma
, sizeof(cap_dma
)))
431 return sizeof(cap_dma
);
434 static int iommufd_vfio_iommu_get_info(struct iommufd_ctx
*ictx
,
437 typedef int (*fill_cap_fn
)(struct iommufd_ioas
*ioas
,
438 struct vfio_info_cap_header __user
*cur
,
440 static const fill_cap_fn fill_fns
[] = {
441 iommufd_fill_cap_dma_avail
,
442 iommufd_fill_cap_iova
,
444 size_t minsz
= offsetofend(struct vfio_iommu_type1_info
, iova_pgsizes
);
445 struct vfio_info_cap_header __user
*last_cap
= NULL
;
446 struct vfio_iommu_type1_info info
= {};
447 struct iommufd_ioas
*ioas
;
448 size_t total_cap_size
;
452 if (copy_from_user(&info
, arg
, minsz
))
455 if (info
.argsz
< minsz
)
457 minsz
= min_t(size_t, info
.argsz
, sizeof(info
));
459 ioas
= get_compat_ioas(ictx
);
461 return PTR_ERR(ioas
);
463 info
.flags
= VFIO_IOMMU_INFO_PGSIZES
;
464 info
.iova_pgsizes
= iommufd_get_pagesizes(ioas
);
467 down_read(&ioas
->iopt
.iova_rwsem
);
468 total_cap_size
= sizeof(info
);
469 for (i
= 0; i
!= ARRAY_SIZE(fill_fns
); i
++) {
472 if (info
.argsz
> total_cap_size
)
473 cap_size
= fill_fns
[i
](ioas
, arg
+ total_cap_size
,
474 info
.argsz
- total_cap_size
);
476 cap_size
= fill_fns
[i
](ioas
, NULL
, 0);
481 cap_size
= ALIGN(cap_size
, sizeof(u64
));
483 if (last_cap
&& info
.argsz
>= total_cap_size
&&
484 put_user(total_cap_size
, &last_cap
->next
)) {
488 last_cap
= arg
+ total_cap_size
;
489 total_cap_size
+= cap_size
;
493 * If the user did not provide enough space then only some caps are
494 * returned and the argsz will be updated to the correct amount to get
497 if (info
.argsz
>= total_cap_size
)
498 info
.cap_offset
= sizeof(info
);
499 info
.argsz
= total_cap_size
;
500 info
.flags
|= VFIO_IOMMU_INFO_CAPS
;
501 if (copy_to_user(arg
, &info
, minsz
)) {
508 up_read(&ioas
->iopt
.iova_rwsem
);
509 iommufd_put_object(ictx
, &ioas
->obj
);
513 int iommufd_vfio_ioctl(struct iommufd_ctx
*ictx
, unsigned int cmd
,
516 void __user
*uarg
= (void __user
*)arg
;
519 case VFIO_GET_API_VERSION
:
520 return VFIO_API_VERSION
;
522 return iommufd_vfio_set_iommu(ictx
, arg
);
523 case VFIO_CHECK_EXTENSION
:
524 return iommufd_vfio_check_extension(ictx
, arg
);
525 case VFIO_IOMMU_GET_INFO
:
526 return iommufd_vfio_iommu_get_info(ictx
, uarg
);
527 case VFIO_IOMMU_MAP_DMA
:
528 return iommufd_vfio_map_dma(ictx
, cmd
, uarg
);
529 case VFIO_IOMMU_UNMAP_DMA
:
530 return iommufd_vfio_unmap_dma(ictx
, cmd
, uarg
);
531 case VFIO_IOMMU_DIRTY_PAGES
: