// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
 */
#include <linux/iommu.h>
#include <linux/iommufd.h>
#include <linux/slab.h>
#include <uapi/linux/iommufd.h>

#include "../iommu-priv.h"
#include "io_pagetable.h"
#include "iommufd_private.h"

static bool allow_unsafe_interrupts;
module_param(allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(
	allow_unsafe_interrupts,
	"Allow IOMMUFD to bind to devices even if the platform cannot isolate "
	"the MSI interrupt window. Enabling this is a security weakness.");

static void iommufd_group_release(struct kref *kref)
{
	struct iommufd_group *igroup =
		container_of(kref, struct iommufd_group, ref);

	WARN_ON(igroup->hwpt || !list_empty(&igroup->device_list));

	xa_cmpxchg(&igroup->ictx->groups, iommu_group_id(igroup->group), igroup,
		   NULL, GFP_KERNEL);
	iommu_group_put(igroup->group);
	mutex_destroy(&igroup->lock);
	kfree(igroup);
}

static void iommufd_put_group(struct iommufd_group *group)
{
	kref_put(&group->ref, iommufd_group_release);
}

static bool iommufd_group_try_get(struct iommufd_group *igroup,
				  struct iommu_group *group)
{
	if (!igroup)
		return false;
	/*
	 * group IDs cannot be re-used until the group is put back which does
	 * not happen if we could get an igroup pointer under the xa_lock.
	 */
	if (WARN_ON(igroup->group != group))
		return false;
	return kref_get_unless_zero(&igroup->ref);
}

/*
 * iommufd needs to store some more data for each iommu_group, we keep a
 * parallel xarray indexed by iommu_group id to hold this instead of putting it
 * in the core structure. To keep things simple the iommufd_group memory is
 * unique within the iommufd_ctx. This makes it easy to check there are no
 * memory leaks.
 */
static struct iommufd_group *iommufd_get_group(struct iommufd_ctx *ictx,
					       struct device *dev)
{
	struct iommufd_group *new_igroup;
	struct iommufd_group *cur_igroup;
	struct iommufd_group *igroup;
	struct iommu_group *group;
	unsigned int id;

	group = iommu_group_get(dev);
	if (!group)
		return ERR_PTR(-ENODEV);

	id = iommu_group_id(group);

	xa_lock(&ictx->groups);
	igroup = xa_load(&ictx->groups, id);
	if (iommufd_group_try_get(igroup, group)) {
		xa_unlock(&ictx->groups);
		iommu_group_put(group);
		return igroup;
	}
	xa_unlock(&ictx->groups);

	new_igroup = kzalloc(sizeof(*new_igroup), GFP_KERNEL);
	if (!new_igroup) {
		iommu_group_put(group);
		return ERR_PTR(-ENOMEM);
	}

	kref_init(&new_igroup->ref);
	mutex_init(&new_igroup->lock);
	INIT_LIST_HEAD(&new_igroup->device_list);
	new_igroup->sw_msi_start = PHYS_ADDR_MAX;
	/* group reference moves into new_igroup */
	new_igroup->group = group;

	/*
	 * The ictx is not additionally refcounted here because all objects
	 * using an igroup must put it before their destroy completes.
	 */
	new_igroup->ictx = ictx;

	/*
	 * We dropped the lock so igroup is invalid. NULL is a safe and likely
	 * value to assume for the xa_cmpxchg algorithm.
	 */
	cur_igroup = NULL;
	xa_lock(&ictx->groups);
	while (true) {
		igroup = __xa_cmpxchg(&ictx->groups, id, cur_igroup, new_igroup,
				      GFP_KERNEL);
		if (xa_is_err(igroup)) {
			xa_unlock(&ictx->groups);
			iommufd_put_group(new_igroup);
			return ERR_PTR(xa_err(igroup));
		}

		/* new_igroup was successfully installed */
		if (cur_igroup == igroup) {
			xa_unlock(&ictx->groups);
			return new_igroup;
		}

		/* Check again if the current group is any good */
		if (iommufd_group_try_get(igroup, group)) {
			xa_unlock(&ictx->groups);
			iommufd_put_group(new_igroup);
			return igroup;
		}
		cur_igroup = igroup;
	}
}

void iommufd_device_destroy(struct iommufd_object *obj)
{
	struct iommufd_device *idev =
		container_of(obj, struct iommufd_device, obj);

	iommu_device_release_dma_owner(idev->dev);
	iommufd_put_group(idev->igroup);
	if (!iommufd_selftest_is_mock_dev(idev->dev))
		iommufd_ctx_put(idev->ictx);
}

/**
 * iommufd_device_bind - Bind a physical device to an iommu fd
 * @ictx: iommufd file descriptor
 * @dev: Pointer to a physical device struct
 * @id: Output ID number to return to userspace for this device
 *
 * A successful bind establishes ownership over the device and returns a
 * struct iommufd_device pointer, otherwise it returns an error pointer.
 *
 * A driver using this API must set driver_managed_dma and must not touch
 * the device until this routine succeeds and establishes ownership.
 *
 * Binding a PCI device places the entire RID under iommufd control.
 *
 * The caller must undo this with iommufd_device_unbind().
 */
struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx,
					   struct device *dev, u32 *id)
{
	struct iommufd_device *idev;
	struct iommufd_group *igroup;
	int rc;

	/*
	 * iommufd always sets IOMMU_CACHE because we offer no way for userspace
	 * to restore cache coherency.
	 */
	if (!device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY))
		return ERR_PTR(-EINVAL);

	igroup = iommufd_get_group(ictx, dev);
	if (IS_ERR(igroup))
		return ERR_CAST(igroup);

	/*
	 * For historical compat with VFIO the insecure interrupt path is
	 * allowed if the module parameter is set. Secure/Isolated means that a
	 * MemWr operation from the device (e.g. a simple DMA) cannot trigger an
	 * interrupt outside this iommufd context.
	 */
	if (!iommufd_selftest_is_mock_dev(dev) &&
	    !iommu_group_has_isolated_msi(igroup->group)) {
		if (!allow_unsafe_interrupts) {
			rc = -EPERM;
			goto out_group_put;
		}

		dev_warn(
			dev,
			"MSI interrupts are not secure, they cannot be isolated by the platform. "
			"Check that platform features like interrupt remapping are enabled. "
			"Use the \"allow_unsafe_interrupts\" module parameter to override\n");
	}

	rc = iommu_device_claim_dma_owner(dev, ictx);
	if (rc)
		goto out_group_put;

	idev = iommufd_object_alloc(ictx, idev, IOMMUFD_OBJ_DEVICE);
	if (IS_ERR(idev)) {
		rc = PTR_ERR(idev);
		goto out_release_owner;
	}
	idev->ictx = ictx;
	if (!iommufd_selftest_is_mock_dev(dev))
		iommufd_ctx_get(ictx);
	idev->dev = dev;
	idev->enforce_cache_coherency =
		device_iommu_capable(dev, IOMMU_CAP_ENFORCE_CACHE_COHERENCY);
	/* The calling driver is a user until iommufd_device_unbind() */
	refcount_inc(&idev->obj.users);
	/* igroup refcount moves into iommufd_device */
	idev->igroup = igroup;
	mutex_init(&idev->iopf_lock);

	/*
	 * If the caller fails after this success it must call
	 * iommufd_device_unbind() which is safe since we hold this refcount.
	 * This also means the device is a leaf in the graph and no other object
	 * can take a reference on it.
	 */
	iommufd_object_finalize(ictx, &idev->obj);
	*id = idev->obj.id;
	return idev;

out_release_owner:
	iommu_device_release_dma_owner(dev);
out_group_put:
	iommufd_put_group(igroup);
	return ERR_PTR(rc);
}
EXPORT_SYMBOL_NS_GPL(iommufd_device_bind, IOMMUFD);
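
/*
 * Illustrative only, not part of the original file: a minimal sketch of the
 * bind/unbind lifecycle as a caller might use it. How the iommufd_ctx was
 * obtained and where dev_id is reported back to userspace are assumed to be
 * handled elsewhere in the calling driver.
 *
 *	struct iommufd_device *idev;
 *	u32 dev_id;
 *
 *	idev = iommufd_device_bind(ictx, dev, &dev_id);
 *	if (IS_ERR(idev))
 *		return PTR_ERR(idev);
 *	// ... report dev_id, attach the device to a page table, do work ...
 *	iommufd_device_unbind(idev);
 */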

/**
 * iommufd_ctx_has_group - True if any device within the group is bound
 *                         to the ictx
 * @ictx: iommufd file descriptor
 * @group: Pointer to a physical iommu_group struct
 *
 * True if any device within the group has been bound to this ictx, e.g. via
 * iommufd_device_bind(), therefore implying ictx ownership of the group.
 */
bool iommufd_ctx_has_group(struct iommufd_ctx *ictx, struct iommu_group *group)
{
	struct iommufd_object *obj;
	unsigned long index;

	if (!ictx || !group)
		return false;

	xa_lock(&ictx->objects);
	xa_for_each(&ictx->objects, index, obj) {
		if (obj->type == IOMMUFD_OBJ_DEVICE &&
		    container_of(obj, struct iommufd_device, obj)
				    ->igroup->group == group) {
			xa_unlock(&ictx->objects);
			return true;
		}
	}
	xa_unlock(&ictx->objects);
	return false;
}
EXPORT_SYMBOL_NS_GPL(iommufd_ctx_has_group, IOMMUFD);

/**
 * iommufd_device_unbind - Undo iommufd_device_bind()
 * @idev: Device returned by iommufd_device_bind()
 *
 * Release the device from iommufd control. The DMA ownership returns to
 * unowned with DMA controlled by the DMA API. This invalidates the
 * iommufd_device pointer, other APIs that consume it must not be called
 * concurrently.
 */
void iommufd_device_unbind(struct iommufd_device *idev)
{
	iommufd_object_destroy_user(idev->ictx, &idev->obj);
}
EXPORT_SYMBOL_NS_GPL(iommufd_device_unbind, IOMMUFD);

struct iommufd_ctx *iommufd_device_to_ictx(struct iommufd_device *idev)
{
	return idev->ictx;
}
EXPORT_SYMBOL_NS_GPL(iommufd_device_to_ictx, IOMMUFD);

u32 iommufd_device_to_id(struct iommufd_device *idev)
{
	return idev->obj.id;
}
EXPORT_SYMBOL_NS_GPL(iommufd_device_to_id, IOMMUFD);

static int iommufd_group_setup_msi(struct iommufd_group *igroup,
				   struct iommufd_hwpt_paging *hwpt_paging)
{
	phys_addr_t sw_msi_start = igroup->sw_msi_start;
	int rc;

	/*
	 * If the IOMMU driver gives a IOMMU_RESV_SW_MSI then it is asking us to
	 * call iommu_get_msi_cookie() on its behalf. This is necessary to setup
	 * the MSI window so iommu_dma_prepare_msi() can install pages into our
	 * domain after request_irq(). If it is not done interrupts will not
	 * work on this domain.
	 *
	 * FIXME: This is conceptually broken for iommufd since we want to allow
	 * userspace to change the domains, eg switch from an identity IOAS to a
	 * DMA IOAS. There is currently no way to create a MSI window that
	 * matches what the IRQ layer actually expects in a newly created
	 * domain.
	 */
	if (sw_msi_start != PHYS_ADDR_MAX && !hwpt_paging->msi_cookie) {
		rc = iommu_get_msi_cookie(hwpt_paging->common.domain,
					  sw_msi_start);
		if (rc)
			return rc;

		/*
		 * iommu_get_msi_cookie() can only be called once per domain,
		 * it returns -EBUSY on later calls.
		 */
		hwpt_paging->msi_cookie = true;
	}
	return 0;
}

static int
iommufd_device_attach_reserved_iova(struct iommufd_device *idev,
				    struct iommufd_hwpt_paging *hwpt_paging)
{
	int rc;

	lockdep_assert_held(&idev->igroup->lock);

	rc = iopt_table_enforce_dev_resv_regions(&hwpt_paging->ioas->iopt,
						 idev->dev,
						 &idev->igroup->sw_msi_start);
	if (rc)
		return rc;

	if (list_empty(&idev->igroup->device_list)) {
		rc = iommufd_group_setup_msi(idev->igroup, hwpt_paging);
		if (rc) {
			iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt,
						  idev->dev);
			return rc;
		}
	}
	return 0;
}

int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
				struct iommufd_device *idev)
{
	struct iommufd_hwpt_paging *hwpt_paging = find_hwpt_paging(hwpt);
	int rc;

	mutex_lock(&idev->igroup->lock);

	if (idev->igroup->hwpt != NULL && idev->igroup->hwpt != hwpt) {
		rc = -EINVAL;
		goto err_unlock;
	}

	if (hwpt_paging) {
		rc = iommufd_device_attach_reserved_iova(idev, hwpt_paging);
		if (rc)
			goto err_unlock;
	}

	/*
	 * Only attach to the group once for the first device that is in the
	 * group. All the other devices will follow this attachment. The user
	 * should attach every device individually to the hwpt as the per-device
	 * reserved regions are only updated during individual device
	 * attachment.
	 */
	if (list_empty(&idev->igroup->device_list)) {
		rc = iommufd_hwpt_attach_device(hwpt, idev);
		if (rc)
			goto err_unresv;
		idev->igroup->hwpt = hwpt;
	}
	refcount_inc(&hwpt->obj.users);
	list_add_tail(&idev->group_item, &idev->igroup->device_list);
	mutex_unlock(&idev->igroup->lock);
	return 0;
err_unresv:
	if (hwpt_paging)
		iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, idev->dev);
err_unlock:
	mutex_unlock(&idev->igroup->lock);
	return rc;
}

struct iommufd_hw_pagetable *
iommufd_hw_pagetable_detach(struct iommufd_device *idev)
{
	struct iommufd_hw_pagetable *hwpt = idev->igroup->hwpt;
	struct iommufd_hwpt_paging *hwpt_paging = find_hwpt_paging(hwpt);

	mutex_lock(&idev->igroup->lock);
	list_del(&idev->group_item);
	if (list_empty(&idev->igroup->device_list)) {
		iommufd_hwpt_detach_device(hwpt, idev);
		idev->igroup->hwpt = NULL;
	}
	if (hwpt_paging)
		iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, idev->dev);
	mutex_unlock(&idev->igroup->lock);

	/* Caller must destroy hwpt */
	return hwpt;
}

static struct iommufd_hw_pagetable *
iommufd_device_do_attach(struct iommufd_device *idev,
			 struct iommufd_hw_pagetable *hwpt)
{
	int rc;

	rc = iommufd_hw_pagetable_attach(hwpt, idev);
	if (rc)
		return ERR_PTR(rc);
	return NULL;
}

static void
iommufd_group_remove_reserved_iova(struct iommufd_group *igroup,
				   struct iommufd_hwpt_paging *hwpt_paging)
{
	struct iommufd_device *cur;

	lockdep_assert_held(&igroup->lock);

	list_for_each_entry(cur, &igroup->device_list, group_item)
		iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, cur->dev);
}

static int
iommufd_group_do_replace_reserved_iova(struct iommufd_group *igroup,
				       struct iommufd_hwpt_paging *hwpt_paging)
{
	struct iommufd_hwpt_paging *old_hwpt_paging;
	struct iommufd_device *cur;
	int rc;

	lockdep_assert_held(&igroup->lock);

	old_hwpt_paging = find_hwpt_paging(igroup->hwpt);
	if (!old_hwpt_paging || hwpt_paging->ioas != old_hwpt_paging->ioas) {
		list_for_each_entry(cur, &igroup->device_list, group_item) {
			rc = iopt_table_enforce_dev_resv_regions(
				&hwpt_paging->ioas->iopt, cur->dev, NULL);
			if (rc)
				goto err_unresv;
		}
	}

	rc = iommufd_group_setup_msi(igroup, hwpt_paging);
	if (rc)
		goto err_unresv;
	return 0;

err_unresv:
	iommufd_group_remove_reserved_iova(igroup, hwpt_paging);
	return rc;
}

static struct iommufd_hw_pagetable *
iommufd_device_do_replace(struct iommufd_device *idev,
			  struct iommufd_hw_pagetable *hwpt)
{
	struct iommufd_hwpt_paging *hwpt_paging = find_hwpt_paging(hwpt);
	struct iommufd_hwpt_paging *old_hwpt_paging;
	struct iommufd_group *igroup = idev->igroup;
	struct iommufd_hw_pagetable *old_hwpt;
	unsigned int num_devices;
	int rc;

	mutex_lock(&idev->igroup->lock);

	if (igroup->hwpt == NULL) {
		rc = -EINVAL;
		goto err_unlock;
	}

	if (hwpt == igroup->hwpt) {
		mutex_unlock(&idev->igroup->lock);
		return NULL;
	}

	old_hwpt = igroup->hwpt;
	if (hwpt_paging) {
		rc = iommufd_group_do_replace_reserved_iova(igroup, hwpt_paging);
		if (rc)
			goto err_unlock;
	}

	rc = iommufd_hwpt_replace_device(idev, hwpt, old_hwpt);
	if (rc)
		goto err_unresv;

	old_hwpt_paging = find_hwpt_paging(old_hwpt);
	if (old_hwpt_paging &&
	    (!hwpt_paging || hwpt_paging->ioas != old_hwpt_paging->ioas))
		iommufd_group_remove_reserved_iova(igroup, old_hwpt_paging);

	igroup->hwpt = hwpt;

	num_devices = list_count_nodes(&igroup->device_list);
	/*
	 * Move the refcounts held by the device_list to the new hwpt. Retain a
	 * refcount for this thread as the caller will free it.
	 */
	refcount_add(num_devices, &hwpt->obj.users);
	if (num_devices > 1)
		WARN_ON(refcount_sub_and_test(num_devices - 1,
					      &old_hwpt->obj.users));
	mutex_unlock(&idev->igroup->lock);

	/* Caller must destroy old_hwpt */
	return old_hwpt;
err_unresv:
	if (hwpt_paging)
		iommufd_group_remove_reserved_iova(igroup, hwpt_paging);
err_unlock:
	mutex_unlock(&idev->igroup->lock);
	return ERR_PTR(rc);
}

typedef struct iommufd_hw_pagetable *(*attach_fn)(
	struct iommufd_device *idev, struct iommufd_hw_pagetable *hwpt);

/*
 * When automatically managing the domains we search for a compatible domain in
 * the iopt and if one is found use it, otherwise create a new domain.
 * Automatic domain selection will never pick a manually created domain.
 */
static struct iommufd_hw_pagetable *
iommufd_device_auto_get_domain(struct iommufd_device *idev,
			       struct iommufd_ioas *ioas, u32 *pt_id,
			       attach_fn do_attach)
{
	/*
	 * iommufd_hw_pagetable_attach() is called by
	 * iommufd_hw_pagetable_alloc() in immediate attachment mode, same as
	 * iommufd_device_do_attach(). So if we are in this mode then we prefer
	 * to use the immediate_attach path as it supports drivers that can't
	 * directly allocate a domain.
	 */
	bool immediate_attach = do_attach == iommufd_device_do_attach;
	struct iommufd_hw_pagetable *destroy_hwpt;
	struct iommufd_hwpt_paging *hwpt_paging;
	struct iommufd_hw_pagetable *hwpt;

	/*
	 * There is no differentiation when domains are allocated, so any domain
	 * that is willing to attach to the device is interchangeable with any
	 * other.
	 */
	mutex_lock(&ioas->mutex);
	list_for_each_entry(hwpt_paging, &ioas->hwpt_list, hwpt_item) {
		if (!hwpt_paging->auto_domain)
			continue;

		hwpt = &hwpt_paging->common;
		if (!iommufd_lock_obj(&hwpt->obj))
			continue;
		destroy_hwpt = (*do_attach)(idev, hwpt);
		if (IS_ERR(destroy_hwpt)) {
			iommufd_put_object(idev->ictx, &hwpt->obj);
			/*
			 * -EINVAL means the domain is incompatible with the
			 * device. Other error codes should propagate to
			 * userspace as failure. Success means the domain is
			 * being used.
			 */
			if (PTR_ERR(destroy_hwpt) == -EINVAL)
				continue;
			goto out_unlock;
		}
		*pt_id = hwpt->obj.id;
		iommufd_put_object(idev->ictx, &hwpt->obj);
		goto out_unlock;
	}

	hwpt_paging = iommufd_hwpt_paging_alloc(idev->ictx, ioas, idev, 0,
						immediate_attach, NULL);
	if (IS_ERR(hwpt_paging)) {
		destroy_hwpt = ERR_CAST(hwpt_paging);
		goto out_unlock;
	}
	hwpt = &hwpt_paging->common;

	if (!immediate_attach) {
		destroy_hwpt = (*do_attach)(idev, hwpt);
		if (IS_ERR(destroy_hwpt))
			goto out_abort;
	} else {
		destroy_hwpt = NULL;
	}

	hwpt_paging->auto_domain = true;
	*pt_id = hwpt->obj.id;

	iommufd_object_finalize(idev->ictx, &hwpt->obj);
	mutex_unlock(&ioas->mutex);
	return destroy_hwpt;

out_abort:
	iommufd_object_abort_and_destroy(idev->ictx, &hwpt->obj);
out_unlock:
	mutex_unlock(&ioas->mutex);
	return destroy_hwpt;
}

static int iommufd_device_change_pt(struct iommufd_device *idev, u32 *pt_id,
				    attach_fn do_attach)
{
	struct iommufd_hw_pagetable *destroy_hwpt;
	struct iommufd_object *pt_obj;

	pt_obj = iommufd_get_object(idev->ictx, *pt_id, IOMMUFD_OBJ_ANY);
	if (IS_ERR(pt_obj))
		return PTR_ERR(pt_obj);

	switch (pt_obj->type) {
	case IOMMUFD_OBJ_HWPT_NESTED:
	case IOMMUFD_OBJ_HWPT_PAGING: {
		struct iommufd_hw_pagetable *hwpt =
			container_of(pt_obj, struct iommufd_hw_pagetable, obj);

		destroy_hwpt = (*do_attach)(idev, hwpt);
		if (IS_ERR(destroy_hwpt))
			goto out_put_pt_obj;
		break;
	}
	case IOMMUFD_OBJ_IOAS: {
		struct iommufd_ioas *ioas =
			container_of(pt_obj, struct iommufd_ioas, obj);

		destroy_hwpt = iommufd_device_auto_get_domain(idev, ioas, pt_id,
							      do_attach);
		if (IS_ERR(destroy_hwpt))
			goto out_put_pt_obj;
		break;
	}
	default:
		destroy_hwpt = ERR_PTR(-EINVAL);
		goto out_put_pt_obj;
	}
	iommufd_put_object(idev->ictx, pt_obj);

	/* This destruction has to be after we unlock everything */
	if (destroy_hwpt)
		iommufd_hw_pagetable_put(idev->ictx, destroy_hwpt);
	return 0;

out_put_pt_obj:
	iommufd_put_object(idev->ictx, pt_obj);
	return PTR_ERR(destroy_hwpt);
}

/**
 * iommufd_device_attach - Connect a device to an iommu_domain
 * @idev: device to attach
 * @pt_id: Input an IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HWPT_PAGING
 *         Output the IOMMUFD_OBJ_HWPT_PAGING ID
 *
 * This connects the device to an iommu_domain, either automatically or manually
 * selected. Once this completes the device could do DMA.
 *
 * The caller should return the resulting pt_id back to userspace.
 * This function is undone by calling iommufd_device_detach().
 */
int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id)
{
	int rc;

	rc = iommufd_device_change_pt(idev, pt_id, &iommufd_device_do_attach);
	if (rc)
		return rc;

	/*
	 * Pairs with iommufd_device_detach() - catches caller bugs attempting
	 * to destroy a device with an attachment.
	 */
	refcount_inc(&idev->obj.users);
	return 0;
}
EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, IOMMUFD);
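
/*
 * Illustrative only, not part of the original file: a minimal sketch of
 * attaching a bound device to a page table and undoing it later. The ioas_id
 * is assumed to have been created by userspace and handed to the driver.
 *
 *	u32 pt_id = ioas_id;
 *	int rc;
 *
 *	rc = iommufd_device_attach(idev, &pt_id);
 *	if (rc)
 *		return rc;
 *	// pt_id now holds the IOMMUFD_OBJ_HWPT_PAGING ID to report back
 *	// ... DMA is possible from here on ...
 *	iommufd_device_detach(idev);
 */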

/**
 * iommufd_device_replace - Change the device's iommu_domain
 * @idev: device to change
 * @pt_id: Input an IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HWPT_PAGING
 *         Output the IOMMUFD_OBJ_HWPT_PAGING ID
 *
 * This is the same as::
 *
 *   iommufd_device_detach();
 *   iommufd_device_attach();
 *
 * If it fails then no change is made to the attachment. The iommu driver may
 * implement this so there is no disruption in translation. This can only be
 * called if iommufd_device_attach() has already succeeded.
 */
int iommufd_device_replace(struct iommufd_device *idev, u32 *pt_id)
{
	return iommufd_device_change_pt(idev, pt_id,
					&iommufd_device_do_replace);
}
EXPORT_SYMBOL_NS_GPL(iommufd_device_replace, IOMMUFD);

/**
 * iommufd_device_detach - Disconnect a device from an iommu_domain
 * @idev: device to detach
 *
 * Undo iommufd_device_attach(). This disconnects the idev from the previously
 * attached pt_id. The device returns to a blocked DMA translation.
 */
void iommufd_device_detach(struct iommufd_device *idev)
{
	struct iommufd_hw_pagetable *hwpt;

	hwpt = iommufd_hw_pagetable_detach(idev);
	iommufd_hw_pagetable_put(idev->ictx, hwpt);
	refcount_dec(&idev->obj.users);
}
EXPORT_SYMBOL_NS_GPL(iommufd_device_detach, IOMMUFD);

/*
 * On success, it will refcount_inc() at a valid new_ioas and refcount_dec() at
 * a valid cur_ioas (access->ioas). A caller passing in a valid new_ioas should
 * call iommufd_put_object() if it does an iommufd_get_object() for a new_ioas.
 */
static int iommufd_access_change_ioas(struct iommufd_access *access,
				      struct iommufd_ioas *new_ioas)
{
	u32 iopt_access_list_id = access->iopt_access_list_id;
	struct iommufd_ioas *cur_ioas = access->ioas;
	int rc;

	lockdep_assert_held(&access->ioas_lock);

	/* We are racing with a concurrent detach, bail */
	if (cur_ioas != access->ioas_unpin)
		return -EBUSY;

	if (cur_ioas == new_ioas)
		return 0;

	/*
	 * Set ioas to NULL to block any further iommufd_access_pin_pages().
	 * iommufd_access_unpin_pages() can continue using access->ioas_unpin.
	 */
	access->ioas = NULL;

	if (new_ioas) {
		rc = iopt_add_access(&new_ioas->iopt, access);
		if (rc) {
			access->ioas = cur_ioas;
			return rc;
		}
		refcount_inc(&new_ioas->obj.users);
	}

	if (cur_ioas) {
		if (access->ops->unmap) {
			mutex_unlock(&access->ioas_lock);
			access->ops->unmap(access->data, 0, ULONG_MAX);
			mutex_lock(&access->ioas_lock);
		}
		iopt_remove_access(&cur_ioas->iopt, access, iopt_access_list_id);
		refcount_dec(&cur_ioas->obj.users);
	}

	access->ioas = new_ioas;
	access->ioas_unpin = new_ioas;

	return 0;
}

static int iommufd_access_change_ioas_id(struct iommufd_access *access, u32 id)
{
	struct iommufd_ioas *ioas = iommufd_get_ioas(access->ictx, id);
	int rc;

	if (IS_ERR(ioas))
		return PTR_ERR(ioas);
	rc = iommufd_access_change_ioas(access, ioas);
	iommufd_put_object(access->ictx, &ioas->obj);
	return rc;
}

void iommufd_access_destroy_object(struct iommufd_object *obj)
{
	struct iommufd_access *access =
		container_of(obj, struct iommufd_access, obj);

	mutex_lock(&access->ioas_lock);
	if (access->ioas)
		WARN_ON(iommufd_access_change_ioas(access, NULL));
	mutex_unlock(&access->ioas_lock);
	iommufd_ctx_put(access->ictx);
}

/**
 * iommufd_access_create - Create an iommufd_access
 * @ictx: iommufd file descriptor
 * @ops: Driver's ops to associate with the access
 * @data: Opaque data to pass into ops functions
 * @id: Output ID number to return to userspace for this access
 *
 * An iommufd_access allows a driver to read/write to the IOAS without using
 * DMA. The underlying CPU memory can be accessed using the
 * iommufd_access_pin_pages() or iommufd_access_rw() functions.
 *
 * The provided ops are required to use iommufd_access_pin_pages().
 */
struct iommufd_access *
iommufd_access_create(struct iommufd_ctx *ictx,
		      const struct iommufd_access_ops *ops, void *data, u32 *id)
{
	struct iommufd_access *access;

	/*
	 * There is no uAPI for the access object, but to keep things symmetric
	 * use the object infrastructure anyhow.
	 */
	access = iommufd_object_alloc(ictx, access, IOMMUFD_OBJ_ACCESS);
	if (IS_ERR(access))
		return access;

	access->data = data;
	access->ops = ops;

	if (ops->needs_pin_pages)
		access->iova_alignment = PAGE_SIZE;
	else
		access->iova_alignment = 1;

	/* The calling driver is a user until iommufd_access_destroy() */
	refcount_inc(&access->obj.users);
	access->ictx = ictx;
	iommufd_ctx_get(ictx);
	iommufd_object_finalize(ictx, &access->obj);
	*id = access->obj.id;
	mutex_init(&access->ioas_lock);
	return access;
}
EXPORT_SYMBOL_NS_GPL(iommufd_access_create, IOMMUFD);
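
/*
 * Illustrative only, not part of the original file: a minimal sketch of how
 * an emulated-device driver might create an access and connect it to an IOAS.
 * The my_access_ops instance, drv_data pointer and ioas_id are assumptions
 * standing in for the caller's own state.
 *
 *	struct iommufd_access *access;
 *	u32 access_id;
 *	int rc;
 *
 *	access = iommufd_access_create(ictx, &my_access_ops, drv_data,
 *				       &access_id);
 *	if (IS_ERR(access))
 *		return PTR_ERR(access);
 *	rc = iommufd_access_attach(access, ioas_id);
 *	if (rc) {
 *		iommufd_access_destroy(access);
 *		return rc;
 *	}
 */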

/**
 * iommufd_access_destroy - Destroy an iommufd_access
 * @access: The access to destroy
 *
 * The caller must stop using the access before destroying it.
 */
void iommufd_access_destroy(struct iommufd_access *access)
{
	iommufd_object_destroy_user(access->ictx, &access->obj);
}
EXPORT_SYMBOL_NS_GPL(iommufd_access_destroy, IOMMUFD);

void iommufd_access_detach(struct iommufd_access *access)
{
	mutex_lock(&access->ioas_lock);
	if (WARN_ON(!access->ioas)) {
		mutex_unlock(&access->ioas_lock);
		return;
	}
	WARN_ON(iommufd_access_change_ioas(access, NULL));
	mutex_unlock(&access->ioas_lock);
}
EXPORT_SYMBOL_NS_GPL(iommufd_access_detach, IOMMUFD);

int iommufd_access_attach(struct iommufd_access *access, u32 ioas_id)
{
	int rc;

	mutex_lock(&access->ioas_lock);
	if (WARN_ON(access->ioas)) {
		mutex_unlock(&access->ioas_lock);
		return -EINVAL;
	}

	rc = iommufd_access_change_ioas_id(access, ioas_id);
	mutex_unlock(&access->ioas_lock);
	return rc;
}
EXPORT_SYMBOL_NS_GPL(iommufd_access_attach, IOMMUFD);

int iommufd_access_replace(struct iommufd_access *access, u32 ioas_id)
{
	int rc;

	mutex_lock(&access->ioas_lock);
	if (!access->ioas) {
		mutex_unlock(&access->ioas_lock);
		return -ENOENT;
	}
	rc = iommufd_access_change_ioas_id(access, ioas_id);
	mutex_unlock(&access->ioas_lock);
	return rc;
}
EXPORT_SYMBOL_NS_GPL(iommufd_access_replace, IOMMUFD);

/**
 * iommufd_access_notify_unmap - Notify users of an iopt to stop using it
 * @iopt: iopt to work on
 * @iova: Starting iova in the iopt
 * @length: Number of bytes
 *
 * After this function returns there should be no users attached to the pages
 * linked to this iopt that intersect with iova,length. Anyone that has attached
 * a user through iopt_access_pages() needs to detach it through
 * iommufd_access_unpin_pages() before this function returns.
 *
 * iommufd_access_destroy() will wait for any outstanding unmap callback to
 * complete. Once iommufd_access_destroy() returns, no unmap ops are running or
 * will run in the future. Due to this a driver must not create locking that
 * prevents unmap from completing while iommufd_access_destroy() is running.
 */
void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova,
				 unsigned long length)
{
	struct iommufd_ioas *ioas =
		container_of(iopt, struct iommufd_ioas, iopt);
	struct iommufd_access *access;
	unsigned long index;

	xa_lock(&ioas->iopt.access_list);
	xa_for_each(&ioas->iopt.access_list, index, access) {
		if (!iommufd_lock_obj(&access->obj))
			continue;
		xa_unlock(&ioas->iopt.access_list);

		access->ops->unmap(access->data, iova, length);

		iommufd_put_object(access->ictx, &access->obj);
		xa_lock(&ioas->iopt.access_list);
	}
	xa_unlock(&ioas->iopt.access_list);
}

/**
 * iommufd_access_unpin_pages() - Undo iommufd_access_pin_pages()
 * @access: IOAS access to act on
 * @iova: Starting IOVA
 * @length: Number of bytes to access
 *
 * The caller must stop accessing the pages before calling this. The
 * iova/length must exactly match the one provided to
 * iommufd_access_pin_pages().
 */
void iommufd_access_unpin_pages(struct iommufd_access *access,
				unsigned long iova, unsigned long length)
{
	struct iopt_area_contig_iter iter;
	struct io_pagetable *iopt;
	unsigned long last_iova;
	struct iopt_area *area;

	if (WARN_ON(!length) ||
	    WARN_ON(check_add_overflow(iova, length - 1, &last_iova)))
		return;

	mutex_lock(&access->ioas_lock);
	/*
	 * The driver must be doing something wrong if it calls this before an
	 * iommufd_access_attach() or after an iommufd_access_detach().
	 */
	if (WARN_ON(!access->ioas_unpin)) {
		mutex_unlock(&access->ioas_lock);
		return;
	}
	iopt = &access->ioas_unpin->iopt;

	down_read(&iopt->iova_rwsem);
	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova)
		iopt_area_remove_access(
			area, iopt_area_iova_to_index(area, iter.cur_iova),
			iopt_area_iova_to_index(
				area,
				min(last_iova, iopt_area_last_iova(area))));
	WARN_ON(!iopt_area_contig_done(&iter));
	up_read(&iopt->iova_rwsem);
	mutex_unlock(&access->ioas_lock);
}
EXPORT_SYMBOL_NS_GPL(iommufd_access_unpin_pages, IOMMUFD);

static bool iopt_area_contig_is_aligned(struct iopt_area_contig_iter *iter)
{
	if (iopt_area_start_byte(iter->area, iter->cur_iova) % PAGE_SIZE)
		return false;

	if (!iopt_area_contig_done(iter) &&
	    (iopt_area_start_byte(iter->area, iopt_area_last_iova(iter->area)) %
	     PAGE_SIZE) != (PAGE_SIZE - 1))
		return false;
	return true;
}

static bool check_area_prot(struct iopt_area *area, unsigned int flags)
{
	if (flags & IOMMUFD_ACCESS_RW_WRITE)
		return area->iommu_prot & IOMMU_WRITE;
	return area->iommu_prot & IOMMU_READ;
}

/**
 * iommufd_access_pin_pages() - Return a list of pages under the iova
 * @access: IOAS access to act on
 * @iova: Starting IOVA
 * @length: Number of bytes to access
 * @out_pages: Output page list
 * @flags: IOMMUFD_ACCESS_RW_* flags
 *
 * Reads @length bytes starting at iova and returns the struct page * pointers.
 * These can be kmap'd by the caller for CPU access.
 *
 * The caller must perform iommufd_access_unpin_pages() when done to balance
 * this.
 *
 * This API always requires a page aligned iova. This happens naturally if the
 * ioas alignment is >= PAGE_SIZE and the iova is PAGE_SIZE aligned. However
 * smaller alignments have corner cases where this API can fail on otherwise
 * aligned inputs.
 */
int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova,
			     unsigned long length, struct page **out_pages,
			     unsigned int flags)
{
	struct iopt_area_contig_iter iter;
	struct io_pagetable *iopt;
	unsigned long last_iova;
	struct iopt_area *area;
	int rc;

	/* Driver's ops don't support pin_pages */
	if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
	    WARN_ON(access->iova_alignment != PAGE_SIZE || !access->ops->unmap))
		return -EINVAL;

	if (!length)
		return -EINVAL;
	if (check_add_overflow(iova, length - 1, &last_iova))
		return -EOVERFLOW;

	mutex_lock(&access->ioas_lock);
	if (!access->ioas) {
		mutex_unlock(&access->ioas_lock);
		return -ENOENT;
	}
	iopt = &access->ioas->iopt;

	down_read(&iopt->iova_rwsem);
	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
		unsigned long last = min(last_iova, iopt_area_last_iova(area));
		unsigned long last_index = iopt_area_iova_to_index(area, last);
		unsigned long index =
			iopt_area_iova_to_index(area, iter.cur_iova);

		if (area->prevent_access ||
		    !iopt_area_contig_is_aligned(&iter)) {
			rc = -EINVAL;
			goto err_remove;
		}

		if (!check_area_prot(area, flags)) {
			rc = -EPERM;
			goto err_remove;
		}

		rc = iopt_area_add_access(area, index, last_index, out_pages,
					  flags);
		if (rc)
			goto err_remove;
		out_pages += last_index - index + 1;
	}
	if (!iopt_area_contig_done(&iter)) {
		rc = -ENOENT;
		goto err_remove;
	}

	up_read(&iopt->iova_rwsem);
	mutex_unlock(&access->ioas_lock);
	return 0;

err_remove:
	if (iova < iter.cur_iova) {
		last_iova = iter.cur_iova - 1;
		iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova)
			iopt_area_remove_access(
				area,
				iopt_area_iova_to_index(area, iter.cur_iova),
				iopt_area_iova_to_index(
					area, min(last_iova,
						  iopt_area_last_iova(area))));
	}
	up_read(&iopt->iova_rwsem);
	mutex_unlock(&access->ioas_lock);
	return rc;
}
EXPORT_SYMBOL_NS_GPL(iommufd_access_pin_pages, IOMMUFD);
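
/*
 * Illustrative only, not part of the original file: a minimal sketch of a
 * pin/use/unpin sequence. The array size and the page-aligned iova are
 * assumptions made for the example; a real caller derives them from its own
 * request.
 *
 *	struct page *pages[16];
 *	int rc;
 *
 *	rc = iommufd_access_pin_pages(access, iova, 16 * PAGE_SIZE, pages,
 *				      IOMMUFD_ACCESS_RW_WRITE);
 *	if (rc)
 *		return rc;
 *	// ... kmap_local_page() the entries of pages[] and write to them ...
 *	iommufd_access_unpin_pages(access, iova, 16 * PAGE_SIZE);
 */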

/**
 * iommufd_access_rw - Read or write data under the iova
 * @access: IOAS access to act on
 * @iova: Starting IOVA
 * @data: Kernel buffer to copy to/from
 * @length: Number of bytes to access
 * @flags: IOMMUFD_ACCESS_RW_* flags
 *
 * Copy kernel to/from data into the range given by IOVA/length. If flags
 * indicates IOMMUFD_ACCESS_RW_KTHREAD then a large copy can be optimized
 * by changing it into copy_to/from_user().
 */
int iommufd_access_rw(struct iommufd_access *access, unsigned long iova,
		      void *data, size_t length, unsigned int flags)
{
	struct iopt_area_contig_iter iter;
	struct io_pagetable *iopt;
	struct iopt_area *area;
	unsigned long last_iova;
	int rc = -EINVAL;

	if (!length)
		return -EINVAL;
	if (check_add_overflow(iova, length - 1, &last_iova))
		return -EOVERFLOW;

	mutex_lock(&access->ioas_lock);
	if (!access->ioas) {
		mutex_unlock(&access->ioas_lock);
		return -ENOENT;
	}
	iopt = &access->ioas->iopt;

	down_read(&iopt->iova_rwsem);
	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
		unsigned long last = min(last_iova, iopt_area_last_iova(area));
		unsigned long bytes = (last - iter.cur_iova) + 1;

		if (area->prevent_access) {
			rc = -EINVAL;
			goto err_out;
		}

		if (!check_area_prot(area, flags)) {
			rc = -EPERM;
			goto err_out;
		}

		rc = iopt_pages_rw_access(
			area->pages, iopt_area_start_byte(area, iter.cur_iova),
			data, bytes, flags);
		if (rc)
			goto err_out;
		data += bytes;
	}
	if (!iopt_area_contig_done(&iter))
		rc = -ENOENT;
err_out:
	up_read(&iopt->iova_rwsem);
	mutex_unlock(&access->ioas_lock);
	return rc;
}
EXPORT_SYMBOL_NS_GPL(iommufd_access_rw, IOMMUFD);
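
/*
 * Illustrative only, not part of the original file: a minimal sketch of
 * reading IOAS memory into a kernel buffer without pinning. The iova and
 * buffer size are assumptions made for the example.
 *
 *	u8 buf[64];
 *	int rc;
 *
 *	rc = iommufd_access_rw(access, iova, buf, sizeof(buf),
 *			       IOMMUFD_ACCESS_RW_READ);
 *	if (rc)
 *		return rc;
 */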

int iommufd_get_hw_info(struct iommufd_ucmd *ucmd)
{
	struct iommu_hw_info *cmd = ucmd->cmd;
	void __user *user_ptr = u64_to_user_ptr(cmd->data_uptr);
	const struct iommu_ops *ops;
	struct iommufd_device *idev;
	unsigned int data_len;
	unsigned int copy_len;
	void *data;
	int rc;

	if (cmd->flags || cmd->__reserved)
		return -EOPNOTSUPP;

	idev = iommufd_get_device(ucmd, cmd->dev_id);
	if (IS_ERR(idev))
		return PTR_ERR(idev);

	ops = dev_iommu_ops(idev->dev);
	if (ops->hw_info) {
		data = ops->hw_info(idev->dev, &data_len, &cmd->out_data_type);
		if (IS_ERR(data)) {
			rc = PTR_ERR(data);
			goto out_put;
		}

		/*
		 * drivers that have hw_info callback should have a unique
		 * iommu_hw_info_type.
		 */
		if (WARN_ON_ONCE(cmd->out_data_type ==
				 IOMMU_HW_INFO_TYPE_NONE)) {
			rc = -ENODEV;
			goto out_free;
		}
	} else {
		cmd->out_data_type = IOMMU_HW_INFO_TYPE_NONE;
		data_len = 0;
		data = NULL;
	}

	copy_len = min(cmd->data_len, data_len);
	if (copy_to_user(user_ptr, data, copy_len)) {
		rc = -EFAULT;
		goto out_free;
	}

	/*
	 * Zero the trailing bytes if the user buffer is bigger than the
	 * data size kernel actually has.
	 */
	if (copy_len < cmd->data_len) {
		if (clear_user(user_ptr + copy_len, cmd->data_len - copy_len)) {
			rc = -EFAULT;
			goto out_free;
		}
	}

	/*
	 * We return the length the kernel supports so userspace may know what
	 * the kernel capability is. It could be larger than the input buffer.
	 */
	cmd->data_len = data_len;

	cmd->out_capabilities = 0;
	if (device_iommu_capable(idev->dev, IOMMU_CAP_DIRTY_TRACKING))
		cmd->out_capabilities |= IOMMU_HW_CAP_DIRTY_TRACKING;

	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
out_free:
	kfree(data);
out_put:
	iommufd_put_object(ucmd->ictx, &idev->obj);
	return rc;
}