// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2010-2012 Advanced Micro Devices, Inc.
 * Author: Joerg Roedel <jroedel@suse.de>
 */

#define pr_fmt(fmt)	"AMD-Vi: " fmt

#include <linux/mmu_notifier.h>
#include <linux/amd-iommu.h>
#include <linux/mm_types.h>
#include <linux/profile.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/wait.h>
#include <linux/pci.h>
#include <linux/gfp.h>

#include "amd_iommu.h"

MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Joerg Roedel <jroedel@suse.de>");

#define MAX_DEVICES		0x10000
#define PRI_QUEUE_SIZE		512

struct pasid_state {
	struct list_head list;			/* For global state-list */
	atomic_t count;				/* Reference count */
	unsigned mmu_notifier_count;		/* Counting nested mmu_notifier
						   calls */
	struct mm_struct *mm;			/* mm_struct for the faults */
	struct mmu_notifier mn;			/* mmu_notifier handle */
	struct pri_queue pri[PRI_QUEUE_SIZE];	/* PRI tag states */
	struct device_state *device_state;	/* Link to our device_state */
	u32 pasid;				/* PASID index */
	bool invalid;				/* Used during setup and
						   teardown of the pasid */
	spinlock_t lock;			/* Protect pri_queues and
						   mmu_notifier_count */
	wait_queue_head_t wq;			/* To wait for count == 0 */
};

struct device_state {
	struct list_head list;
	u16 devid;
	atomic_t count;
	struct pci_dev *pdev;
	struct pasid_state **states;
	struct iommu_domain *domain;
	int pasid_levels;
	int max_pasids;
	amd_iommu_invalid_ppr_cb inv_ppr_cb;
	amd_iommu_invalidate_ctx inv_ctx_cb;
	spinlock_t lock;
	wait_queue_head_t wq;
};

struct fault {
	struct work_struct work;
	struct device_state *dev_state;
	struct pasid_state *state;
	struct mm_struct *mm;
	u64 address;
	u32 pasid;
	u16 tag;
	u16 finish;
	u16 flags;
};

static LIST_HEAD(state_list);
static spinlock_t state_lock;

static struct workqueue_struct *iommu_wq;

static void free_pasid_states(struct device_state *dev_state);

static u16 device_id(struct pci_dev *pdev)
{
	u16 devid;

	devid = pdev->bus->number;
	devid = (devid << 8) | pdev->devfn;

	return devid;
}
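
/*
 * Example (illustrative): for a device at PCI address 01:02.0,
 * pdev->bus->number is 0x01 and pdev->devfn is PCI_DEVFN(2, 0) == 0x10,
 * so device_id() returns (0x01 << 8) | 0x10 == 0x0110, the 16-bit
 * requester ID used below to look up the device state.
 */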

static struct device_state *__get_device_state(u16 devid)
{
	struct device_state *dev_state;

	list_for_each_entry(dev_state, &state_list, list) {
		if (dev_state->devid == devid)
			return dev_state;
	}

	return NULL;
}

static struct device_state *get_device_state(u16 devid)
{
	struct device_state *dev_state;
	unsigned long flags;

	spin_lock_irqsave(&state_lock, flags);
	dev_state = __get_device_state(devid);
	if (dev_state != NULL)
		atomic_inc(&dev_state->count);
	spin_unlock_irqrestore(&state_lock, flags);

	return dev_state;
}

static void free_device_state(struct device_state *dev_state)
{
	struct iommu_group *group;

	/*
	 * First detach device from domain - No more PRI requests will arrive
	 * from that device after it is unbound from the IOMMUv2 domain.
	 */
	group = iommu_group_get(&dev_state->pdev->dev);
	if (WARN_ON(!group))
		return;

	iommu_detach_group(dev_state->domain, group);

	iommu_group_put(group);

	/* Everything is down now, free the IOMMUv2 domain */
	iommu_domain_free(dev_state->domain);

	/* Finally get rid of the device-state */
	kfree(dev_state);
}

static void put_device_state(struct device_state *dev_state)
{
	if (atomic_dec_and_test(&dev_state->count))
		wake_up(&dev_state->wq);
}

/* Must be called under dev_state->lock */
static struct pasid_state **__get_pasid_state_ptr(struct device_state *dev_state,
						  u32 pasid, bool alloc)
{
	struct pasid_state **root, **ptr;
	int level, index;

	level = dev_state->pasid_levels;
	root  = dev_state->states;

	while (true) {

		index = (pasid >> (9 * level)) & 0x1ff;
		ptr   = &root[index];

		if (level == 0)
			break;

		if (*ptr == NULL) {
			if (!alloc)
				return NULL;

			*ptr = (void *)get_zeroed_page(GFP_ATOMIC);
			if (*ptr == NULL)
				return NULL;
		}

		root   = (struct pasid_state **)*ptr;
		level -= 1;
	}

	return ptr;
}
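
/*
 * Example walk (illustrative): with pasid_levels == 1 and pasid 0x12345,
 * the first iteration uses index (0x12345 >> 9) & 0x1ff == 145 to find
 * the intermediate table and the second uses 0x12345 & 0x1ff == 325 to
 * reach the leaf slot holding the struct pasid_state pointer.
 */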

static int set_pasid_state(struct device_state *dev_state,
			   struct pasid_state *pasid_state,
			   u32 pasid)
{
	struct pasid_state **ptr;
	unsigned long flags;
	int ret;

	spin_lock_irqsave(&dev_state->lock, flags);
	ptr = __get_pasid_state_ptr(dev_state, pasid, true);

	ret = -ENOMEM;
	if (ptr == NULL)
		goto out_unlock;

	ret = -ENOMEM;
	if (*ptr != NULL)
		goto out_unlock;

	*ptr = pasid_state;

	ret = 0;

out_unlock:
	spin_unlock_irqrestore(&dev_state->lock, flags);

	return ret;
}

static void clear_pasid_state(struct device_state *dev_state, u32 pasid)
{
	struct pasid_state **ptr;
	unsigned long flags;

	spin_lock_irqsave(&dev_state->lock, flags);
	ptr = __get_pasid_state_ptr(dev_state, pasid, true);

	if (ptr == NULL)
		goto out_unlock;

	*ptr = NULL;

out_unlock:
	spin_unlock_irqrestore(&dev_state->lock, flags);
}

static struct pasid_state *get_pasid_state(struct device_state *dev_state,
					   u32 pasid)
{
	struct pasid_state **ptr, *ret = NULL;
	unsigned long flags;

	spin_lock_irqsave(&dev_state->lock, flags);
	ptr = __get_pasid_state_ptr(dev_state, pasid, false);

	if (ptr == NULL)
		goto out_unlock;

	ret = *ptr;
	if (ret)
		atomic_inc(&ret->count);

out_unlock:
	spin_unlock_irqrestore(&dev_state->lock, flags);

	return ret;
}

static void free_pasid_state(struct pasid_state *pasid_state)
{
	kfree(pasid_state);
}

static void put_pasid_state(struct pasid_state *pasid_state)
{
	if (atomic_dec_and_test(&pasid_state->count))
		wake_up(&pasid_state->wq);
}

static void put_pasid_state_wait(struct pasid_state *pasid_state)
{
	atomic_dec(&pasid_state->count);
	wait_event(pasid_state->wq, !atomic_read(&pasid_state->count));
	free_pasid_state(pasid_state);
}

static void unbind_pasid(struct pasid_state *pasid_state)
{
	struct iommu_domain *domain;

	domain = pasid_state->device_state->domain;

	/*
	 * Mark pasid_state as invalid; no more faults will be added to the
	 * work queue after this is visible everywhere.
	 */
	pasid_state->invalid = true;

	/* Make sure this is visible */
	smp_wmb();

	/* After this the device/pasid can't access the mm anymore */
	amd_iommu_domain_clear_gcr3(domain, pasid_state->pasid);

	/* Make sure no more pending faults are in the queue */
	flush_workqueue(iommu_wq);
}

static void free_pasid_states_level1(struct pasid_state **tbl)
{
	int i;

	for (i = 0; i < 512; ++i) {
		if (tbl[i] == NULL)
			continue;

		free_page((unsigned long)tbl[i]);
	}
}

static void free_pasid_states_level2(struct pasid_state **tbl)
{
	struct pasid_state **ptr;
	int i;

	for (i = 0; i < 512; ++i) {
		if (tbl[i] == NULL)
			continue;

		ptr = (struct pasid_state **)tbl[i];
		free_pasid_states_level1(ptr);
	}
}

static void free_pasid_states(struct device_state *dev_state)
{
	struct pasid_state *pasid_state;
	int i;

	for (i = 0; i < dev_state->max_pasids; ++i) {
		pasid_state = get_pasid_state(dev_state, i);
		if (pasid_state == NULL)
			continue;

		put_pasid_state(pasid_state);

		/*
		 * This will call the mn_release function and
		 * unbind the PASID
		 */
		mmu_notifier_unregister(&pasid_state->mn, pasid_state->mm);

		put_pasid_state_wait(pasid_state); /* Reference taken in
						      amd_iommu_bind_pasid */

		/* Drop reference taken in amd_iommu_bind_pasid */
		put_device_state(dev_state);
	}

	if (dev_state->pasid_levels == 2)
		free_pasid_states_level2(dev_state->states);
	else if (dev_state->pasid_levels == 1)
		free_pasid_states_level1(dev_state->states);
	else
		BUG_ON(dev_state->pasid_levels != 0);

	free_page((unsigned long)dev_state->states);
}

static struct pasid_state *mn_to_state(struct mmu_notifier *mn)
{
	return container_of(mn, struct pasid_state, mn);
}

static void mn_invalidate_range(struct mmu_notifier *mn,
				struct mm_struct *mm,
				unsigned long start, unsigned long end)
{
	struct pasid_state *pasid_state;
	struct device_state *dev_state;

	pasid_state = mn_to_state(mn);
	dev_state   = pasid_state->device_state;

	if ((start ^ (end - 1)) < PAGE_SIZE)
		amd_iommu_flush_page(dev_state->domain, pasid_state->pasid,
				     start);
	else
		amd_iommu_flush_tlb(dev_state->domain, pasid_state->pasid);
}
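
/*
 * Note on the range check above: start and (end - 1) differ only in the
 * page-offset bits when the invalidated range lies within a single page,
 * so (start ^ (end - 1)) < PAGE_SIZE selects the cheaper per-page flush;
 * anything larger flushes the whole TLB for that PASID.
 */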

static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	struct pasid_state *pasid_state;
	struct device_state *dev_state;
	bool run_inv_ctx_cb;

	might_sleep();

	pasid_state    = mn_to_state(mn);
	dev_state      = pasid_state->device_state;
	run_inv_ctx_cb = !pasid_state->invalid;

	if (run_inv_ctx_cb && dev_state->inv_ctx_cb)
		dev_state->inv_ctx_cb(dev_state->pdev, pasid_state->pasid);

	unbind_pasid(pasid_state);
}

static const struct mmu_notifier_ops iommu_mn = {
	.release		= mn_release,
	.invalidate_range	= mn_invalidate_range,
};

static void set_pri_tag_status(struct pasid_state *pasid_state,
			       u16 tag, int status)
{
	unsigned long flags;

	spin_lock_irqsave(&pasid_state->lock, flags);
	pasid_state->pri[tag].status = status;
	spin_unlock_irqrestore(&pasid_state->lock, flags);
}

static void finish_pri_tag(struct device_state *dev_state,
			   struct pasid_state *pasid_state,
			   u16 tag)
{
	unsigned long flags;

	spin_lock_irqsave(&pasid_state->lock, flags);
	if (atomic_dec_and_test(&pasid_state->pri[tag].inflight) &&
	    pasid_state->pri[tag].finish) {
		amd_iommu_complete_ppr(dev_state->pdev, pasid_state->pasid,
				       pasid_state->pri[tag].status, tag);
		pasid_state->pri[tag].finish = false;
		pasid_state->pri[tag].status = PPR_SUCCESS;
	}
	spin_unlock_irqrestore(&pasid_state->lock, flags);
}
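
/*
 * Several page requests from a device may share one PRI tag (a page
 * request group): ppr_notifier() raises pri[tag].inflight for each of
 * them and records whether the group wants a response (the finish bit).
 * finish_pri_tag() above sends the PPR completion only when the last
 * in-flight request of the group is done, so the device gets one answer
 * per group and never before all of its faults were handled.
 */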

static void handle_fault_error(struct fault *fault)
{
	int status;

	if (!fault->dev_state->inv_ppr_cb) {
		set_pri_tag_status(fault->state, fault->tag, PPR_INVALID);
		return;
	}

	status = fault->dev_state->inv_ppr_cb(fault->dev_state->pdev,
					      fault->pasid,
					      fault->address,
					      fault->flags);
	switch (status) {
	case AMD_IOMMU_INV_PRI_RSP_SUCCESS:
		set_pri_tag_status(fault->state, fault->tag, PPR_SUCCESS);
		break;
	case AMD_IOMMU_INV_PRI_RSP_INVALID:
		set_pri_tag_status(fault->state, fault->tag, PPR_INVALID);
		break;
	case AMD_IOMMU_INV_PRI_RSP_FAIL:
		set_pri_tag_status(fault->state, fault->tag, PPR_FAILURE);
		break;
	default:
		BUG();
	}
}

static bool access_error(struct vm_area_struct *vma, struct fault *fault)
{
	unsigned long requested = 0;

	if (fault->flags & PPR_FAULT_EXEC)
		requested |= VM_EXEC;

	if (fault->flags & PPR_FAULT_READ)
		requested |= VM_READ;

	if (fault->flags & PPR_FAULT_WRITE)
		requested |= VM_WRITE;

	return (requested & ~vma->vm_flags) != 0;
}
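
/*
 * In other words, the PPR flags name the permissions the device asked for,
 * and the request is rejected if any of them is missing from the VMA, e.g.
 * a PPR_FAULT_WRITE request against a read-only mapping.
 */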

static void do_fault(struct work_struct *work)
{
	struct fault *fault = container_of(work, struct fault, work);
	struct vm_area_struct *vma;
	vm_fault_t ret = VM_FAULT_ERROR;
	unsigned int flags = 0;
	struct mm_struct *mm;
	u64 address;

	mm = fault->state->mm;
	address = fault->address;

	if (fault->flags & PPR_FAULT_USER)
		flags |= FAULT_FLAG_USER;
	if (fault->flags & PPR_FAULT_WRITE)
		flags |= FAULT_FLAG_WRITE;
	flags |= FAULT_FLAG_REMOTE;

	mmap_read_lock(mm);
	vma = find_extend_vma(mm, address);
	if (!vma || address < vma->vm_start)
		/* failed to get a vma in the right range */
		goto out;

	/* Check if we have the right permissions on the vma */
	if (access_error(vma, fault))
		goto out;

	ret = handle_mm_fault(vma, address, flags, NULL);
out:
	mmap_read_unlock(mm);

	if (ret & VM_FAULT_ERROR)
		/* failed to service fault */
		handle_fault_error(fault);

	finish_pri_tag(fault->dev_state, fault->state, fault->tag);

	put_pasid_state(fault->state);

	kfree(fault);
}

static int ppr_notifier(struct notifier_block *nb, unsigned long e, void *data)
{
	struct amd_iommu_fault *iommu_fault;
	struct pasid_state *pasid_state;
	struct device_state *dev_state;
	struct pci_dev *pdev = NULL;
	unsigned long flags;
	struct fault *fault;
	bool finish;
	u16 tag, devid;
	int ret;

	iommu_fault = data;
	tag         = iommu_fault->tag & 0x1ff;
	finish      = (iommu_fault->tag >> 9) & 1;

	devid = iommu_fault->device_id;
	pdev = pci_get_domain_bus_and_slot(0, PCI_BUS_NUM(devid),
					   devid & 0xff);
	if (!pdev)
		return -ENODEV;

	ret = NOTIFY_DONE;

	/* In kdump kernel pci dev is not initialized yet -> send INVALID */
	if (amd_iommu_is_attach_deferred(NULL, &pdev->dev)) {
		amd_iommu_complete_ppr(pdev, iommu_fault->pasid,
				       PPR_INVALID, tag);
		goto out;
	}

	dev_state = get_device_state(iommu_fault->device_id);
	if (dev_state == NULL)
		goto out;

	pasid_state = get_pasid_state(dev_state, iommu_fault->pasid);
	if (pasid_state == NULL || pasid_state->invalid) {
		/* We know the device but not the PASID -> send INVALID */
		amd_iommu_complete_ppr(dev_state->pdev, iommu_fault->pasid,
				       PPR_INVALID, tag);
		goto out_drop_state;
	}

	spin_lock_irqsave(&pasid_state->lock, flags);
	atomic_inc(&pasid_state->pri[tag].inflight);
	if (finish)
		pasid_state->pri[tag].finish = true;
	spin_unlock_irqrestore(&pasid_state->lock, flags);

	fault = kzalloc(sizeof(*fault), GFP_ATOMIC);
	if (fault == NULL) {
		/* We are OOM - send success and let the device re-fault */
		finish_pri_tag(dev_state, pasid_state, tag);
		goto out_drop_state;
	}

	fault->dev_state = dev_state;
	fault->address   = iommu_fault->address;
	fault->state     = pasid_state;
	fault->tag       = tag;
	fault->finish    = finish;
	fault->pasid     = iommu_fault->pasid;
	fault->flags     = iommu_fault->flags;
	INIT_WORK(&fault->work, do_fault);

	queue_work(iommu_wq, &fault->work);

	ret = NOTIFY_OK;

out_drop_state:

	if (ret != NOTIFY_OK && pasid_state)
		put_pasid_state(pasid_state);

	put_device_state(dev_state);

out:
	return ret;
}

static struct notifier_block ppr_nb = {
	.notifier_call		= ppr_notifier,
};

int amd_iommu_bind_pasid(struct pci_dev *pdev, u32 pasid,
			 struct task_struct *task)
{
	struct pasid_state *pasid_state;
	struct device_state *dev_state;
	struct mm_struct *mm;
	u16 devid;
	int ret;

	might_sleep();

	if (!amd_iommu_v2_supported())
		return -ENODEV;

	devid     = device_id(pdev);
	dev_state = get_device_state(devid);

	if (dev_state == NULL)
		return -EINVAL;

	ret = -EINVAL;
	if (pasid >= dev_state->max_pasids)
		goto out;

	ret = -ENOMEM;
	pasid_state = kzalloc(sizeof(*pasid_state), GFP_KERNEL);
	if (pasid_state == NULL)
		goto out;


	atomic_set(&pasid_state->count, 1);
	init_waitqueue_head(&pasid_state->wq);
	spin_lock_init(&pasid_state->lock);

	mm                        = get_task_mm(task);
	pasid_state->mm           = mm;
	pasid_state->device_state = dev_state;
	pasid_state->pasid        = pasid;
	pasid_state->invalid      = true; /* Mark as valid only if we are
					     done with setting up the pasid */
	pasid_state->mn.ops       = &iommu_mn;

	if (pasid_state->mm == NULL)
		goto out_free;

	mmu_notifier_register(&pasid_state->mn, mm);

	ret = set_pasid_state(dev_state, pasid_state, pasid);
	if (ret)
		goto out_unregister;

	ret = amd_iommu_domain_set_gcr3(dev_state->domain, pasid,
					__pa(pasid_state->mm->pgd));
	if (ret)
		goto out_clear_state;

	/* Now we are ready to handle faults */
	pasid_state->invalid = false;

	/*
	 * Drop the reference to the mm_struct here. We rely on the
	 * mmu_notifier release call-back to inform us when the mm
	 * is going away.
	 */
	mmput(mm);

	return 0;

out_clear_state:
	clear_pasid_state(dev_state, pasid);

out_unregister:
	mmu_notifier_unregister(&pasid_state->mn, mm);
	mmput(mm);

out_free:
	free_pasid_state(pasid_state);

out:
	put_device_state(dev_state);

	return ret;
}
EXPORT_SYMBOL(amd_iommu_bind_pasid);
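
/*
 * Lifetime sketch (illustrative): a successful bind leaves one reference on
 * the device_state (taken by get_device_state() above) and keeps the new
 * pasid_state alive with its initial reference.  Both are dropped again in
 * amd_iommu_unbind_pasid(), or in free_pasid_states() when the driver tears
 * everything down with amd_iommu_free_device() without unbinding first.
 */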

void amd_iommu_unbind_pasid(struct pci_dev *pdev, u32 pasid)
{
	struct pasid_state *pasid_state;
	struct device_state *dev_state;
	u16 devid;

	might_sleep();

	if (!amd_iommu_v2_supported())
		return;

	devid = device_id(pdev);
	dev_state = get_device_state(devid);
	if (dev_state == NULL)
		return;

	if (pasid >= dev_state->max_pasids)
		goto out;

	pasid_state = get_pasid_state(dev_state, pasid);
	if (pasid_state == NULL)
		goto out;
	/*
	 * Drop reference taken here. We are safe because we still hold
	 * the reference taken in the amd_iommu_bind_pasid function.
	 */
	put_pasid_state(pasid_state);

	/* Clear the pasid state so that the pasid can be re-used */
	clear_pasid_state(dev_state, pasid_state->pasid);

	/*
	 * Call mmu_notifier_unregister to drop our reference
	 * to pasid_state->mm
	 */
	mmu_notifier_unregister(&pasid_state->mn, pasid_state->mm);

	put_pasid_state_wait(pasid_state); /* Reference taken in
					      amd_iommu_bind_pasid */
out:
	/* Drop reference taken in this function */
	put_device_state(dev_state);

	/* Drop reference taken in amd_iommu_bind_pasid */
	put_device_state(dev_state);
}
EXPORT_SYMBOL(amd_iommu_unbind_pasid);

int amd_iommu_init_device(struct pci_dev *pdev, int pasids)
{
	struct device_state *dev_state;
	struct iommu_group *group;
	unsigned long flags;
	int ret, tmp;
	u16 devid;

	might_sleep();

	/*
	 * When memory encryption is active the device is likely not in a
	 * direct-mapped domain. Forbid using IOMMUv2 functionality for now.
	 */
	if (mem_encrypt_active())
		return -ENODEV;

	if (!amd_iommu_v2_supported())
		return -ENODEV;

	if (pasids <= 0 || pasids > (PASID_MASK + 1))
		return -EINVAL;

	devid = device_id(pdev);

	dev_state = kzalloc(sizeof(*dev_state), GFP_KERNEL);
	if (dev_state == NULL)
		return -ENOMEM;

	spin_lock_init(&dev_state->lock);
	init_waitqueue_head(&dev_state->wq);
	dev_state->pdev  = pdev;
	dev_state->devid = devid;

	tmp = pasids;
	for (dev_state->pasid_levels = 0; (tmp - 1) & ~0x1ff; tmp >>= 9)
		dev_state->pasid_levels += 1;
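
	/*
	 * Example (illustrative): pasids == 0x10000 ends up with
	 * pasid_levels == 1, i.e. one intermediate level of 512-entry
	 * tables below dev_state->states, while pasids <= 512 fit into
	 * the root page directly and leave pasid_levels at 0.
	 */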

	atomic_set(&dev_state->count, 1);
	dev_state->max_pasids = pasids;

	ret = -ENOMEM;
	dev_state->states = (void *)get_zeroed_page(GFP_KERNEL);
	if (dev_state->states == NULL)
		goto out_free_dev_state;

	dev_state->domain = iommu_domain_alloc(&pci_bus_type);
	if (dev_state->domain == NULL)
		goto out_free_states;

	amd_iommu_domain_direct_map(dev_state->domain);

	ret = amd_iommu_domain_enable_v2(dev_state->domain, pasids);
	if (ret)
		goto out_free_domain;

	group = iommu_group_get(&pdev->dev);
	if (!group) {
		ret = -EINVAL;
		goto out_free_domain;
	}

	ret = iommu_attach_group(dev_state->domain, group);
	if (ret != 0)
		goto out_drop_group;

	iommu_group_put(group);

	spin_lock_irqsave(&state_lock, flags);

	if (__get_device_state(devid) != NULL) {
		spin_unlock_irqrestore(&state_lock, flags);
		ret = -EBUSY;
		goto out_free_domain;
	}

	list_add_tail(&dev_state->list, &state_list);

	spin_unlock_irqrestore(&state_lock, flags);

	return 0;

out_drop_group:
	iommu_group_put(group);

out_free_domain:
	iommu_domain_free(dev_state->domain);

out_free_states:
	free_page((unsigned long)dev_state->states);

out_free_dev_state:
	kfree(dev_state);

	return ret;
}
EXPORT_SYMBOL(amd_iommu_init_device);

void amd_iommu_free_device(struct pci_dev *pdev)
{
	struct device_state *dev_state;
	unsigned long flags;
	u16 devid;

	if (!amd_iommu_v2_supported())
		return;

	devid = device_id(pdev);

	spin_lock_irqsave(&state_lock, flags);

	dev_state = __get_device_state(devid);
	if (dev_state == NULL) {
		spin_unlock_irqrestore(&state_lock, flags);
		return;
	}

	list_del(&dev_state->list);

	spin_unlock_irqrestore(&state_lock, flags);

	/* Get rid of any remaining pasid states */
	free_pasid_states(dev_state);

	put_device_state(dev_state);
	/*
	 * Wait until the last reference is dropped before freeing
	 * the device state.
	 */
	wait_event(dev_state->wq, !atomic_read(&dev_state->count));
	free_device_state(dev_state);
}
EXPORT_SYMBOL(amd_iommu_free_device);
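
/*
 * Usage sketch (illustrative, hypothetical client driver, error handling
 * trimmed): a driver that wants its device's PPR faults serviced by this
 * module would pair the exported calls roughly like this:
 *
 *	err = amd_iommu_init_device(pdev, 16);		// up to 16 PASIDs
 *	if (err)
 *		return err;
 *
 *	err = amd_iommu_bind_pasid(pdev, pasid, current);
 *	if (err) {
 *		amd_iommu_free_device(pdev);
 *		return err;
 *	}
 *
 *	// ... program the device to issue requests tagged with 'pasid' ...
 *
 *	amd_iommu_unbind_pasid(pdev, pasid);
 *	amd_iommu_free_device(pdev);
 */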

int amd_iommu_set_invalid_ppr_cb(struct pci_dev *pdev,
				 amd_iommu_invalid_ppr_cb cb)
{
	struct device_state *dev_state;
	unsigned long flags;
	u16 devid;
	int ret;

	if (!amd_iommu_v2_supported())
		return -ENODEV;

	devid = device_id(pdev);

	spin_lock_irqsave(&state_lock, flags);

	ret = -EINVAL;
	dev_state = __get_device_state(devid);
	if (dev_state == NULL)
		goto out_unlock;

	dev_state->inv_ppr_cb = cb;

	ret = 0;

out_unlock:
	spin_unlock_irqrestore(&state_lock, flags);

	return ret;
}
EXPORT_SYMBOL(amd_iommu_set_invalid_ppr_cb);

int amd_iommu_set_invalidate_ctx_cb(struct pci_dev *pdev,
				    amd_iommu_invalidate_ctx cb)
{
	struct device_state *dev_state;
	unsigned long flags;
	u16 devid;
	int ret;

	if (!amd_iommu_v2_supported())
		return -ENODEV;

	devid = device_id(pdev);

	spin_lock_irqsave(&state_lock, flags);

	ret = -EINVAL;
	dev_state = __get_device_state(devid);
	if (dev_state == NULL)
		goto out_unlock;

	dev_state->inv_ctx_cb = cb;

	ret = 0;

out_unlock:
	spin_unlock_irqrestore(&state_lock, flags);

	return ret;
}
EXPORT_SYMBOL(amd_iommu_set_invalidate_ctx_cb);

static int __init amd_iommu_v2_init(void)
{
	int ret;

	pr_info("AMD IOMMUv2 driver by Joerg Roedel <jroedel@suse.de>\n");

	if (!amd_iommu_v2_supported()) {
		pr_info("AMD IOMMUv2 functionality not available on this system\n");
		/*
		 * Load anyway to provide the symbols to other modules
		 * which may use AMD IOMMUv2 optionally.
		 */
		return 0;
	}

	spin_lock_init(&state_lock);

	ret = -ENOMEM;
	iommu_wq = alloc_workqueue("amd_iommu_v2", WQ_MEM_RECLAIM, 0);
	if (iommu_wq == NULL)
		goto out;

	amd_iommu_register_ppr_notifier(&ppr_nb);

	return 0;

out:
	return ret;
}

static void __exit amd_iommu_v2_exit(void)
{
	struct device_state *dev_state;
	int i;

	if (!amd_iommu_v2_supported())
		return;

	amd_iommu_unregister_ppr_notifier(&ppr_nb);

	flush_workqueue(iommu_wq);

	/*
	 * The loop below might call flush_workqueue(), so call
	 * destroy_workqueue() after it
	 */
	for (i = 0; i < MAX_DEVICES; ++i) {
		dev_state = get_device_state(i);

		if (dev_state == NULL)
			continue;

		WARN_ON_ONCE(1);

		put_device_state(dev_state);
		amd_iommu_free_device(dev_state->pdev);
	}

	destroy_workqueue(iommu_wq);
}

module_init(amd_iommu_v2_init);
module_exit(amd_iommu_v2_exit);