1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright (C) 2010-2012 Advanced Micro Devices, Inc.
4 * Author: Joerg Roedel <jroedel@suse.de>
7 #define pr_fmt(fmt) "AMD-Vi: " fmt
9 #include <linux/mmu_notifier.h>
10 #include <linux/amd-iommu.h>
11 #include <linux/mm_types.h>
12 #include <linux/profile.h>
13 #include <linux/module.h>
14 #include <linux/sched.h>
15 #include <linux/sched/mm.h>
16 #include <linux/iommu.h>
17 #include <linux/wait.h>
18 #include <linux/pci.h>
19 #include <linux/gfp.h>
21 #include "amd_iommu_types.h"
22 #include "amd_iommu_proto.h"
24 MODULE_LICENSE("GPL v2");
25 MODULE_AUTHOR("Joerg Roedel <jroedel@suse.de>");
/* Exclusive upper bound of the device-id scan done in the module exit path. */
27 #define MAX_DEVICES 0x10000
/* Number of entries in each pasid_state's pri[] array. */
28 #define PRI_QUEUE_SIZE 512
/*
 * Members of the per-PASID state record; these are the fields reached
 * through "struct pasid_state *" pointers in the functions below.
 * NOTE(review): the opening "struct ... {" line and the closing brace are
 * not present in this excerpt, and two of the member comments are cut off.
 */
37 struct list_head list
; /* For global state-list */
38 atomic_t count
; /* Reference count */
39 unsigned mmu_notifier_count
; /* Counting nested mmu_notifier
41 struct mm_struct
*mm
; /* mm_struct for the faults */
42 struct mmu_notifier mn
; /* mmu_notifier handle */
43 struct pri_queue pri
[PRI_QUEUE_SIZE
]; /* PRI tag states */
44 struct device_state
*device_state
; /* Link to our device_state */
45 int pasid
; /* PASID index */
46 bool invalid
; /* Used during setup and
47 teardown of the pasid */
48 spinlock_t lock
; /* Protect pri_queues and
50 wait_queue_head_t wq
; /* To wait for count == 0 */
/*
 * Members of the per-device state record (struct device_state).
 * NOTE(review): only some members are visible here; the code below also
 * uses devid, count, pdev, pasid_levels, max_pasids, lock and wq, whose
 * declarations are not part of this excerpt.
 */
/* Linkage on the global state_list. */
54 struct list_head list
;
/* Root page of the PASID lookup table walked by __get_pasid_state_ptr(). */
58 struct pasid_state
**states
;
/* IOMMU domain the device's group is attached to in amd_iommu_init_device(). */
59 struct iommu_domain
*domain
;
/* Optional callback, installed by amd_iommu_set_invalid_ppr_cb(). */
62 amd_iommu_invalid_ppr_cb inv_ppr_cb
;
/* Optional callback, installed by amd_iommu_set_invalidate_ctx_cb(). */
63 amd_iommu_invalidate_ctx inv_ctx_cb
;
/*
 * Members of struct fault, the work item allocated per PPR request in
 * ppr_notifier() and processed by do_fault().
 * NOTE(review): address, tag, finish, pasid and flags are used below but
 * their declarations are not visible in this excerpt.
 */
/* Work item queued on iommu_wq; do_fault() recovers the fault from it. */
69 struct work_struct work
;
/* Device the request came from. */
70 struct device_state
*dev_state
;
/* PASID state the request belongs to (a reference is held while queued). */
71 struct pasid_state
*state
;
/* Global list of device_state entries, linked through device_state.list. */
81 static LIST_HEAD(state_list
);
/* Taken (irqsave) around every lookup in, and modification of, state_list. */
82 static spinlock_t state_lock
;
/* Workqueue on which PPR fault work items (do_fault) are queued. */
84 static struct workqueue_struct
*iommu_wq
;
/* Forward declaration; the definition follows further down in the file. */
86 static void free_pasid_states(struct device_state
*dev_state
);
/*
 * Build the 16-bit id used as the lookup key for device_state entries:
 * the PCI bus number shifted left by 8, OR-ed with pdev->devfn.
 */
88 static u16
device_id(struct pci_dev
*pdev
)
92 devid
= pdev
->bus
->number
;
93 devid
= (devid
<< 8) | pdev
->devfn
;
/*
 * Walk state_list looking for the entry whose devid equals the argument.
 * No locking or reference counting happens here; every visible caller
 * holds state_lock around the call.
 * NOTE(review): the return statements are not visible in this excerpt.
 */
98 static struct device_state
*__get_device_state(u16 devid
)
100 struct device_state
*dev_state
;
102 list_for_each_entry(dev_state
, &state_list
, list
) {
103 if (dev_state
->devid
== devid
)
/*
 * Locked, reference-taking lookup: find the device_state for devid under
 * state_lock and, when one exists, increment its count before the lock is
 * released. Callers balance this with put_device_state().
 */
110 static struct device_state
*get_device_state(u16 devid
)
112 struct device_state
*dev_state
;
115 spin_lock_irqsave(&state_lock
, flags
);
116 dev_state
= __get_device_state(devid
);
117 if (dev_state
!= NULL
)
118 atomic_inc(&dev_state
->count
);
119 spin_unlock_irqrestore(&state_lock
, flags
);
/*
 * Tear down a device_state: detach the device's IOMMU group from
 * dev_state->domain, drop the group reference and free the domain.
 * NOTE(review): the statement that releases dev_state itself (announced
 * by the last comment) is not visible in this excerpt.
 */
124 static void free_device_state(struct device_state
*dev_state
)
126 struct iommu_group
*group
;
129 * First detach device from domain - No more PRI requests will arrive
130 * from that device after it is unbound from the IOMMUv2 domain.
132 group
= iommu_group_get(&dev_state
->pdev
->dev
);
136 iommu_detach_group(dev_state
->domain
, group
);
138 iommu_group_put(group
);
140 /* Everything is down now, free the IOMMUv2 domain */
141 iommu_domain_free(dev_state
->domain
);
143 /* Finally get rid of the device-state */
/*
 * Drop one reference on dev_state. When the count reaches zero, wake
 * anyone sleeping on dev_state->wq (amd_iommu_free_device() waits there
 * for the count to become zero).
 */
147 static void put_device_state(struct device_state
*dev_state
)
149 if (atomic_dec_and_test(&dev_state
->count
))
150 wake_up(&dev_state
->wq
);
/*
 * Locate the table slot for a PASID in the multi-level table rooted at
 * dev_state->states. Each level is indexed by 9 bits of the pasid
 * ((pasid >> (9 * level)) & 0x1ff), starting at dev_state->pasid_levels.
 * A zeroed page obtained with GFP_ATOMIC is stored in an intermediate slot
 * on the way down -- presumably only when the slot is empty and @alloc is
 * true; those conditions and the loop/return lines are not visible here.
 */
153 /* Must be called under dev_state->lock */
154 static struct pasid_state
**__get_pasid_state_ptr(struct device_state
*dev_state
,
155 int pasid
, bool alloc
)
157 struct pasid_state
**root
, **ptr
;
160 level
= dev_state
->pasid_levels
;
161 root
= dev_state
->states
;
165 index
= (pasid
>> (9 * level
)) & 0x1ff;
175 *ptr
= (void *)get_zeroed_page(GFP_ATOMIC
);
180 root
= (struct pasid_state
**)*ptr
;
/*
 * Install a pasid_state in the device's PASID table. Takes
 * dev_state->lock and asks __get_pasid_state_ptr() for the slot with
 * alloc=true, so missing table levels may be created.
 * NOTE(review): the remainder of the parameter list, the store into the
 * slot and the return value are not visible in this excerpt.
 */
187 static int set_pasid_state(struct device_state
*dev_state
,
188 struct pasid_state
*pasid_state
,
191 struct pasid_state
**ptr
;
195 spin_lock_irqsave(&dev_state
->lock
, flags
);
196 ptr
= __get_pasid_state_ptr(dev_state
, pasid
, true);
211 spin_unlock_irqrestore(&dev_state
->lock
, flags
);
/*
 * Remove the table entry for @pasid so the PASID can be bound again.
 * The slot is looked up under dev_state->lock.
 * NOTE(review): the lookup passes alloc=true, whereas get_pasid_state()
 * passes false; on a clear path this looks like it could allocate table
 * pages only to clear an entry -- confirm whether that is intended.
 * The statement that clears the slot is not visible in this excerpt.
 */
216 static void clear_pasid_state(struct device_state
*dev_state
, int pasid
)
218 struct pasid_state
**ptr
;
221 spin_lock_irqsave(&dev_state
->lock
, flags
);
222 ptr
= __get_pasid_state_ptr(dev_state
, pasid
, true);
230 spin_unlock_irqrestore(&dev_state
->lock
, flags
);
/*
 * Reference-taking lookup of the pasid_state for a PASID. The slot is
 * looked up under dev_state->lock with alloc=false (never allocates), and
 * the count of the state that is found is incremented. The result
 * defaults to NULL. Callers drop the reference with put_pasid_state().
 * NOTE(review): the checks between the lookup and atomic_inc() are not
 * visible in this excerpt.
 */
233 static struct pasid_state
*get_pasid_state(struct device_state
*dev_state
,
236 struct pasid_state
**ptr
, *ret
= NULL
;
239 spin_lock_irqsave(&dev_state
->lock
, flags
);
240 ptr
= __get_pasid_state_ptr(dev_state
, pasid
, false);
247 atomic_inc(&ret
->count
);
250 spin_unlock_irqrestore(&dev_state
->lock
, flags
);
/*
 * Release a pasid_state. Called from put_pasid_state_wait() and from the
 * error path of amd_iommu_bind_pasid().
 * NOTE(review): the function body is not visible in this excerpt.
 */
255 static void free_pasid_state(struct pasid_state
*pasid_state
)
/*
 * Drop one reference on pasid_state and, if that was the last one, wake
 * waiters on pasid_state->wq (see put_pasid_state_wait()).
 */
260 static void put_pasid_state(struct pasid_state
*pasid_state
)
262 if (atomic_dec_and_test(&pasid_state
->count
))
263 wake_up(&pasid_state
->wq
);
/*
 * Drop the caller's reference, sleep on pasid_state->wq until the count
 * has reached zero, then free the state. May sleep.
 */
266 static void put_pasid_state_wait(struct pasid_state
*pasid_state
)
268 atomic_dec(&pasid_state
->count
);
269 wait_event(pasid_state
->wq
, !atomic_read(&pasid_state
->count
));
270 free_pasid_state(pasid_state
);
/*
 * Stop fault handling for one PASID: mark the state invalid, clear the
 * PASID's GCR3 entry in the device's domain so the device can no longer
 * reach the mm, then flush iommu_wq so already queued faults have run.
 */
273 static void unbind_pasid(struct pasid_state
*pasid_state
)
275 struct iommu_domain
*domain
;
277 domain
= pasid_state
->device_state
->domain
;
280 * Mark pasid_state as invalid, no more faults will be added to the
281 * work queue after this is visible everywhere.
283 pasid_state
->invalid
= true;
285 /* Make sure this is visible */
288 /* After this the device/pasid can't access the mm anymore */
289 amd_iommu_domain_clear_gcr3(domain
, pasid_state
->pasid
);
291 /* Make sure no more pending faults are in the queue */
292 flush_workqueue(iommu_wq
);
/*
 * Free the pages referenced by a level-1 table: iterate over all 512
 * slots of @tbl and free_page() the page each one points to.
 * NOTE(review): any empty-slot check inside the loop is not visible here.
 */
295 static void free_pasid_states_level1(struct pasid_state
**tbl
)
299 for (i
= 0; i
< 512; ++i
) {
303 free_page((unsigned long)tbl
[i
]);
/*
 * Free a level-2 table: each of the 512 slots of @tbl is treated as a
 * pointer to a level-1 table and handed to free_pasid_states_level1().
 * NOTE(review): any empty-slot check inside the loop is not visible here.
 */
307 static void free_pasid_states_level2(struct pasid_state
**tbl
)
309 struct pasid_state
**ptr
;
312 for (i
= 0; i
< 512; ++i
) {
316 ptr
= (struct pasid_state
**)tbl
[i
];
317 free_pasid_states_level1(ptr
);
/*
 * Tear down every PASID still bound on a device, then free the PASID
 * table itself.
 *
 * For each pasid below dev_state->max_pasids that has a state: drop the
 * lookup reference, unregister the mmu notifier, wait for and free the
 * state, and drop the device reference taken at bind time. Afterwards
 * the table pages are freed according to pasid_levels (2 or 1; any other
 * value must be 0, enforced with BUG_ON), followed by the root page.
 */
321 static void free_pasid_states(struct device_state
*dev_state
)
323 struct pasid_state
*pasid_state
;
326 for (i
= 0; i
< dev_state
->max_pasids
; ++i
) {
327 pasid_state
= get_pasid_state(dev_state
, i
);
328 if (pasid_state
== NULL
)
/* Drop the reference just taken by get_pasid_state() above. */
331 put_pasid_state(pasid_state
);
334 * This will call the mn_release function and
337 mmu_notifier_unregister(&pasid_state
->mn
, pasid_state
->mm
);
339 put_pasid_state_wait(pasid_state
); /* Reference taken in
340 amd_iommu_bind_pasid */
342 /* Drop reference taken in amd_iommu_bind_pasid */
343 put_device_state(dev_state
);
/* Free the intermediate table levels, depending on the table depth. */
346 if (dev_state
->pasid_levels
== 2)
347 free_pasid_states_level2(dev_state
->states
);
348 else if (dev_state
->pasid_levels
== 1)
349 free_pasid_states_level1(dev_state
->states
);
351 BUG_ON(dev_state
->pasid_levels
!= 0);
/* Finally release the root page of the table. */
353 free_page((unsigned long)dev_state
->states
);
/* Map an embedded mmu_notifier back to the pasid_state that contains it. */
356 static struct pasid_state
*mn_to_state(struct mmu_notifier
*mn
)
358 return container_of(mn
, struct pasid_state
, mn
);
/*
 * mmu_notifier .invalidate_range callback. When start and end - 1 differ
 * only in bits below PAGE_SIZE (i.e. the range lies within one page),
 * flush that single page for this PASID; the other visible call flushes
 * the whole TLB for the PASID.
 * NOTE(review): the "else" joining the two calls and the last argument of
 * amd_iommu_flush_page() are not visible in this excerpt.
 */
361 static void mn_invalidate_range(struct mmu_notifier
*mn
,
362 struct mm_struct
*mm
,
363 unsigned long start
, unsigned long end
)
365 struct pasid_state
*pasid_state
;
366 struct device_state
*dev_state
;
368 pasid_state
= mn_to_state(mn
);
369 dev_state
= pasid_state
->device_state
;
371 if ((start
^ (end
- 1)) < PAGE_SIZE
)
372 amd_iommu_flush_page(dev_state
->domain
, pasid_state
->pasid
,
375 amd_iommu_flush_tlb(dev_state
->domain
, pasid_state
->pasid
);
/*
 * mmu_notifier .release callback. If the PASID state was not already
 * marked invalid and the device registered an inv_ctx_cb, call that
 * callback with the device and PASID; then unbind the PASID.
 */
378 static void mn_release(struct mmu_notifier
*mn
, struct mm_struct
*mm
)
380 struct pasid_state
*pasid_state
;
381 struct device_state
*dev_state
;
386 pasid_state
= mn_to_state(mn
);
387 dev_state
= pasid_state
->device_state
;
/* Sample the flag before unbind_pasid() below sets it to true. */
388 run_inv_ctx_cb
= !pasid_state
->invalid
;
390 if (run_inv_ctx_cb
&& dev_state
->inv_ctx_cb
)
391 dev_state
->inv_ctx_cb(dev_state
->pdev
, pasid_state
->pasid
);
393 unbind_pasid(pasid_state
);
/* mmu_notifier callbacks installed on each pasid_state in amd_iommu_bind_pasid(). */
396 static const struct mmu_notifier_ops iommu_mn
= {
397 .release
= mn_release
,
398 .invalidate_range
= mn_invalidate_range
,
/*
 * Record the response status for one PRI tag of a PASID, under
 * pasid_state->lock. The stored value is what finish_pri_tag() later
 * passes to amd_iommu_complete_ppr().
 * NOTE(review): the tag and status parameter declarations are not
 * visible in this excerpt.
 */
401 static void set_pri_tag_status(struct pasid_state
*pasid_state
,
406 spin_lock_irqsave(&pasid_state
->lock
, flags
);
407 pasid_state
->pri
[tag
].status
= status
;
408 spin_unlock_irqrestore(&pasid_state
->lock
, flags
);
/*
 * Account for one completed request on a PRI tag. Under
 * pasid_state->lock the tag's inflight counter is decremented; when it
 * drops to zero and the tag's finish flag is set, the stored status is
 * sent to the device with amd_iommu_complete_ppr() and the slot is reset
 * (finish = false, status = PPR_SUCCESS) for reuse.
 */
411 static void finish_pri_tag(struct device_state
*dev_state
,
412 struct pasid_state
*pasid_state
,
417 spin_lock_irqsave(&pasid_state
->lock
, flags
);
418 if (atomic_dec_and_test(&pasid_state
->pri
[tag
].inflight
) &&
419 pasid_state
->pri
[tag
].finish
) {
420 amd_iommu_complete_ppr(dev_state
->pdev
, pasid_state
->pasid
,
421 pasid_state
->pri
[tag
].status
, tag
);
422 pasid_state
->pri
[tag
].finish
= false;
423 pasid_state
->pri
[tag
].status
= PPR_SUCCESS
;
425 spin_unlock_irqrestore(&pasid_state
->lock
, flags
);
/*
 * Choose the PRI response status for a fault that could not be serviced.
 * Without a device inv_ppr_cb the tag is marked PPR_INVALID. Otherwise
 * the callback is consulted and its answer is mapped:
 *   AMD_IOMMU_INV_PRI_RSP_SUCCESS -> PPR_SUCCESS
 *   AMD_IOMMU_INV_PRI_RSP_INVALID -> PPR_INVALID
 *   AMD_IOMMU_INV_PRI_RSP_FAIL    -> PPR_FAILURE
 * NOTE(review): the switch header, break statements, any default case and
 * the remaining callback arguments are not visible in this excerpt.
 */
428 static void handle_fault_error(struct fault
*fault
)
432 if (!fault
->dev_state
->inv_ppr_cb
) {
433 set_pri_tag_status(fault
->state
, fault
->tag
, PPR_INVALID
);
437 status
= fault
->dev_state
->inv_ppr_cb(fault
->dev_state
->pdev
,
442 case AMD_IOMMU_INV_PRI_RSP_SUCCESS
:
443 set_pri_tag_status(fault
->state
, fault
->tag
, PPR_SUCCESS
);
445 case AMD_IOMMU_INV_PRI_RSP_INVALID
:
446 set_pri_tag_status(fault
->state
, fault
->tag
, PPR_INVALID
);
448 case AMD_IOMMU_INV_PRI_RSP_FAIL
:
449 set_pri_tag_status(fault
->state
, fault
->tag
, PPR_FAILURE
);
/*
 * Check the access requested by a fault against the permissions of @vma.
 * The PPR_FAULT_EXEC/READ/WRITE bits of fault->flags are translated to
 * VM_EXEC/VM_READ/VM_WRITE; returns true when at least one requested
 * permission is missing from vma->vm_flags.
 */
456 static bool access_error(struct vm_area_struct
*vma
, struct fault
*fault
)
458 unsigned long requested
= 0;
460 if (fault
->flags
& PPR_FAULT_EXEC
)
461 requested
|= VM_EXEC
;
463 if (fault
->flags
& PPR_FAULT_READ
)
464 requested
|= VM_READ
;
466 if (fault
->flags
& PPR_FAULT_WRITE
)
467 requested
|= VM_WRITE
;
469 return (requested
& ~vma
->vm_flags
) != 0;
/*
 * Workqueue handler for one queued PPR fault.
 *
 * Recovers the struct fault from the work item, builds the fault flags
 * (FAULT_FLAG_USER / FAULT_FLAG_WRITE from the PPR flags, FAULT_FLAG_REMOTE
 * always), and with mm->mmap_sem held for read looks up the vma, checks
 * permissions and calls handle_mm_fault(). If the result carries a
 * VM_FAULT_ERROR bit, handle_fault_error() picks the response status.
 * Finally the PRI tag is finished and the pasid_state reference that
 * travelled with the fault is dropped.
 *
 * NOTE(review): the goto/label lines are not visible; presumably the
 * failed-lookup and access-error cases skip handle_mm_fault(), leaving
 * ret at its initial VM_FAULT_ERROR value. Freeing of the fault object is
 * also not visible in this excerpt.
 */
472 static void do_fault(struct work_struct
*work
)
474 struct fault
*fault
= container_of(work
, struct fault
, work
);
475 struct vm_area_struct
*vma
;
476 vm_fault_t ret
= VM_FAULT_ERROR
;
477 unsigned int flags
= 0;
478 struct mm_struct
*mm
;
481 mm
= fault
->state
->mm
;
482 address
= fault
->address
;
484 if (fault
->flags
& PPR_FAULT_USER
)
485 flags
|= FAULT_FLAG_USER
;
486 if (fault
->flags
& PPR_FAULT_WRITE
)
487 flags
|= FAULT_FLAG_WRITE
;
488 flags
|= FAULT_FLAG_REMOTE
;
490 down_read(&mm
->mmap_sem
);
491 vma
= find_extend_vma(mm
, address
);
492 if (!vma
|| address
< vma
->vm_start
)
493 /* failed to get a vma in the right range */
496 /* Check if we have the right permissions on the vma */
497 if (access_error(vma
, fault
))
500 ret
= handle_mm_fault(vma
, address
, flags
);
502 up_read(&mm
->mmap_sem
);
504 if (ret
& VM_FAULT_ERROR
)
505 /* failed to service fault */
506 handle_fault_error(fault
);
508 finish_pri_tag(fault
->dev_state
, fault
->state
, fault
->tag
);
510 put_pasid_state(fault
->state
);
/*
 * Notifier callback for PPR (peripheral page request) events.
 *
 * Splits the reported tag into the 9-bit tag index and the "finish" bit
 * (bit 9), resolves the PCI device and the device/PASID state, and either
 * answers the request immediately via amd_iommu_complete_ppr() (unknown or
 * invalid PASID, kdump defer-attach case) or queues a struct fault on
 * iommu_wq for do_fault(). Allocation uses GFP_ATOMIC; on allocation
 * failure the tag is finished right away so the device can re-fault.
 *
 * On exit the pasid_state reference is dropped unless ret is NOTIFY_OK
 * (in which case it stays with the queued fault), and the device_state
 * reference is always dropped.
 *
 * NOTE(review): many lines are missing from this excerpt, including the
 * assignment of iommu_fault from @data, all return/goto/label lines, the
 * status arguments to amd_iommu_complete_ppr(), and whatever condition
 * may guard the "finish = true" store.
 */
515 static int ppr_notifier(struct notifier_block
*nb
, unsigned long e
, void *data
)
517 struct amd_iommu_fault
*iommu_fault
;
518 struct pasid_state
*pasid_state
;
519 struct device_state
*dev_state
;
525 struct iommu_dev_data
*dev_data
;
526 struct pci_dev
*pdev
= NULL
;
529 tag
= iommu_fault
->tag
& 0x1ff;
530 finish
= (iommu_fault
->tag
>> 9) & 1;
532 devid
= iommu_fault
->device_id
;
533 pdev
= pci_get_domain_bus_and_slot(0, PCI_BUS_NUM(devid
),
537 dev_data
= get_dev_data(&pdev
->dev
);
539 /* In kdump kernel pci dev is not initialized yet -> send INVALID */
541 if (translation_pre_enabled(amd_iommu_rlookup_table
[devid
])
542 && dev_data
->defer_attach
) {
543 amd_iommu_complete_ppr(pdev
, iommu_fault
->pasid
,
548 dev_state
= get_device_state(iommu_fault
->device_id
);
549 if (dev_state
== NULL
)
552 pasid_state
= get_pasid_state(dev_state
, iommu_fault
->pasid
);
553 if (pasid_state
== NULL
|| pasid_state
->invalid
) {
554 /* We know the device but not the PASID -> send INVALID */
555 amd_iommu_complete_ppr(dev_state
->pdev
, iommu_fault
->pasid
,
/* Account for the new request on this tag; finish_pri_tag() undoes it. */
560 spin_lock_irqsave(&pasid_state
->lock
, flags
);
561 atomic_inc(&pasid_state
->pri
[tag
].inflight
);
563 pasid_state
->pri
[tag
].finish
= true;
564 spin_unlock_irqrestore(&pasid_state
->lock
, flags
);
566 fault
= kzalloc(sizeof(*fault
), GFP_ATOMIC
);
568 /* We are OOM - send success and let the device re-fault */
569 finish_pri_tag(dev_state
, pasid_state
, tag
);
573 fault
->dev_state
= dev_state
;
574 fault
->address
= iommu_fault
->address
;
575 fault
->state
= pasid_state
;
577 fault
->finish
= finish
;
578 fault
->pasid
= iommu_fault
->pasid
;
579 fault
->flags
= iommu_fault
->flags
;
580 INIT_WORK(&fault
->work
, do_fault
);
582 queue_work(iommu_wq
, &fault
->work
);
588 if (ret
!= NOTIFY_OK
&& pasid_state
)
589 put_pasid_state(pasid_state
);
591 put_device_state(dev_state
);
/* Notifier block registered in amd_iommu_v2_init() to receive PPR events. */
597 static struct notifier_block ppr_nb
= {
598 .notifier_call
= ppr_notifier
,
/*
 * amd_iommu_bind_pasid - bind a task's address space to a PASID of @pdev.
 * @pdev:  device previously set up with amd_iommu_init_device()
 * @pasid: PASID to bind; must satisfy 0 <= pasid < dev_state->max_pasids
 * @task:  task whose mm is bound
 *
 * Allocates a pasid_state (GFP_KERNEL, so the caller must be able to
 * sleep) with an initial reference count of 1, registers an mmu notifier
 * on the task's mm, installs the state in the device's PASID table and
 * programs the mm's page-table root (__pa(mm->pgd)) as GCR3 entry for the
 * PASID. The state is created with invalid = true and only flipped to
 * false once setup succeeded. The device_state reference taken here is
 * kept for the lifetime of the binding and dropped on the error path.
 *
 * NOTE(review): return statements, error codes, most error checks and the
 * cleanup labels are not visible in this excerpt.
 * NOTE(review): on the visible line the return value of
 * mmu_notifier_register() is not captured -- confirm whether a
 * registration failure is handled.
 */
601 int amd_iommu_bind_pasid(struct pci_dev
*pdev
, int pasid
,
602 struct task_struct
*task
)
604 struct pasid_state
*pasid_state
;
605 struct device_state
*dev_state
;
606 struct mm_struct
*mm
;
612 if (!amd_iommu_v2_supported())
615 devid
= device_id(pdev
);
616 dev_state
= get_device_state(devid
);
618 if (dev_state
== NULL
)
622 if (pasid
< 0 || pasid
>= dev_state
->max_pasids
)
626 pasid_state
= kzalloc(sizeof(*pasid_state
), GFP_KERNEL
);
627 if (pasid_state
== NULL
)
631 atomic_set(&pasid_state
->count
, 1);
632 init_waitqueue_head(&pasid_state
->wq
);
633 spin_lock_init(&pasid_state
->lock
);
635 mm
= get_task_mm(task
);
636 pasid_state
->mm
= mm
;
637 pasid_state
->device_state
= dev_state
;
638 pasid_state
->pasid
= pasid
;
639 pasid_state
->invalid
= true; /* Mark as valid only if we are
640 done with setting up the pasid */
641 pasid_state
->mn
.ops
= &iommu_mn
;
643 if (pasid_state
->mm
== NULL
)
646 mmu_notifier_register(&pasid_state
->mn
, mm
);
648 ret
= set_pasid_state(dev_state
, pasid_state
, pasid
);
652 ret
= amd_iommu_domain_set_gcr3(dev_state
->domain
, pasid
,
653 __pa(pasid_state
->mm
->pgd
));
655 goto out_clear_state
;
657 /* Now we are ready to handle faults */
658 pasid_state
->invalid
= false;
661 * Drop the reference to the mm_struct here. We rely on the
662 * mmu_notifier release call-back to inform us when the mm
670 clear_pasid_state(dev_state
, pasid
);
673 mmu_notifier_unregister(&pasid_state
->mn
, mm
);
677 free_pasid_state(pasid_state
);
680 put_device_state(dev_state
);
684 EXPORT_SYMBOL(amd_iommu_bind_pasid
);
/*
 * amd_iommu_unbind_pasid - undo amd_iommu_bind_pasid() for one PASID.
 * @pdev:  device the PASID was bound on
 * @pasid: PASID to unbind; must satisfy 0 <= pasid < dev_state->max_pasids
 *
 * Looks up the device and PASID state, removes the state from the PASID
 * table so the PASID can be reused, unregisters the mmu notifier, then
 * waits for outstanding references and frees the state (may sleep).
 * Two device_state references are dropped: the one taken by the lookup
 * in this function and the one held since bind time.
 *
 * NOTE(review): the early-exit statements after the checks are not
 * visible in this excerpt.
 */
686 void amd_iommu_unbind_pasid(struct pci_dev
*pdev
, int pasid
)
688 struct pasid_state
*pasid_state
;
689 struct device_state
*dev_state
;
694 if (!amd_iommu_v2_supported())
697 devid
= device_id(pdev
);
698 dev_state
= get_device_state(devid
);
699 if (dev_state
== NULL
)
702 if (pasid
< 0 || pasid
>= dev_state
->max_pasids
)
705 pasid_state
= get_pasid_state(dev_state
, pasid
);
706 if (pasid_state
== NULL
)
709 * Drop reference taken here. We are safe because we still hold
710 * the reference taken in the amd_iommu_bind_pasid function.
712 put_pasid_state(pasid_state
);
714 /* Clear the pasid state so that the pasid can be re-used */
715 clear_pasid_state(dev_state
, pasid_state
->pasid
);
718 * Call mmu_notifier_unregister to drop our reference
721 mmu_notifier_unregister(&pasid_state
->mn
, pasid_state
->mm
);
723 put_pasid_state_wait(pasid_state
); /* Reference taken in
724 amd_iommu_bind_pasid */
726 /* Drop reference taken in this function */
727 put_device_state(dev_state
);
729 /* Drop reference taken in amd_iommu_bind_pasid */
730 put_device_state(dev_state
);
732 EXPORT_SYMBOL(amd_iommu_unbind_pasid
);
/*
 * amd_iommu_init_device - prepare @pdev for PASID binding.
 * @pdev:   PCI device to set up
 * @pasids: number of PASIDs to support; must satisfy
 *          0 < pasids <= PASID_MASK + 1
 *
 * Allocates a device_state (GFP_KERNEL) with an initial reference count
 * of 1, works out how many 9-bit table levels the PASID range needs,
 * allocates the root table page, allocates an IOMMU domain, switches it
 * to direct-map and v2 mode, and attaches the device's IOMMU group to it.
 * The new state is then added to state_list under state_lock, unless an
 * entry for the same device id already exists.
 *
 * NOTE(review): return statements, error codes, several error checks and
 * the cleanup labels are not visible in this excerpt; the three calls at
 * the end are presumably the bodies of those labels.
 */
734 int amd_iommu_init_device(struct pci_dev
*pdev
, int pasids
)
736 struct device_state
*dev_state
;
737 struct iommu_group
*group
;
744 if (!amd_iommu_v2_supported())
747 if (pasids
<= 0 || pasids
> (PASID_MASK
+ 1))
750 devid
= device_id(pdev
);
752 dev_state
= kzalloc(sizeof(*dev_state
), GFP_KERNEL
);
753 if (dev_state
== NULL
)
756 spin_lock_init(&dev_state
->lock
);
757 init_waitqueue_head(&dev_state
->wq
);
758 dev_state
->pdev
= pdev
;
759 dev_state
->devid
= devid
;
/*
 * One extra table level per 9 bits needed beyond the first 512 entries;
 * tmp is presumably initialised from pasids (assignment not visible).
 */
762 for (dev_state
->pasid_levels
= 0; (tmp
- 1) & ~0x1ff; tmp
>>= 9)
763 dev_state
->pasid_levels
+= 1;
765 atomic_set(&dev_state
->count
, 1);
766 dev_state
->max_pasids
= pasids
;
769 dev_state
->states
= (void *)get_zeroed_page(GFP_KERNEL
);
770 if (dev_state
->states
== NULL
)
771 goto out_free_dev_state
;
773 dev_state
->domain
= iommu_domain_alloc(&pci_bus_type
);
774 if (dev_state
->domain
== NULL
)
775 goto out_free_states
;
777 amd_iommu_domain_direct_map(dev_state
->domain
);
779 ret
= amd_iommu_domain_enable_v2(dev_state
->domain
, pasids
);
781 goto out_free_domain
;
783 group
= iommu_group_get(&pdev
->dev
);
786 goto out_free_domain
;
789 ret
= iommu_attach_group(dev_state
->domain
, group
);
793 iommu_group_put(group
);
/* Publish the new state; refuse if this device id is already registered. */
795 spin_lock_irqsave(&state_lock
, flags
);
797 if (__get_device_state(devid
) != NULL
) {
798 spin_unlock_irqrestore(&state_lock
, flags
);
800 goto out_free_domain
;
803 list_add_tail(&dev_state
->list
, &state_list
);
805 spin_unlock_irqrestore(&state_lock
, flags
);
810 iommu_group_put(group
);
813 iommu_domain_free(dev_state
->domain
);
816 free_page((unsigned long)dev_state
->states
);
823 EXPORT_SYMBOL(amd_iommu_init_device
);
/*
 * amd_iommu_free_device - undo amd_iommu_init_device() for @pdev.
 *
 * Removes the device_state from state_list under state_lock (so no new
 * lookups can find it), tears down all remaining PASID states, drops the
 * initial reference and then sleeps until the reference count has
 * reached zero before freeing the device state. May sleep.
 *
 * NOTE(review): the early-return statements are not visible in this
 * excerpt.
 */
825 void amd_iommu_free_device(struct pci_dev
*pdev
)
827 struct device_state
*dev_state
;
831 if (!amd_iommu_v2_supported())
834 devid
= device_id(pdev
);
836 spin_lock_irqsave(&state_lock
, flags
);
838 dev_state
= __get_device_state(devid
);
839 if (dev_state
== NULL
) {
840 spin_unlock_irqrestore(&state_lock
, flags
);
844 list_del(&dev_state
->list
);
846 spin_unlock_irqrestore(&state_lock
, flags
);
848 /* Get rid of any remaining pasid states */
849 free_pasid_states(dev_state
);
851 put_device_state(dev_state
);
853 * Wait until the last reference is dropped before freeing
856 wait_event(dev_state
->wq
, !atomic_read(&dev_state
->count
));
857 free_device_state(dev_state
);
859 EXPORT_SYMBOL(amd_iommu_free_device
);
/*
 * amd_iommu_set_invalid_ppr_cb - install the device's invalid-PPR callback.
 * @pdev: device previously set up with amd_iommu_init_device()
 * @cb:   callback stored in dev_state->inv_ppr_cb; handle_fault_error()
 *        consults it when a fault cannot be serviced
 *
 * The lookup and the store both happen under state_lock.
 * NOTE(review): the return values and goto/label lines are not visible
 * in this excerpt.
 */
861 int amd_iommu_set_invalid_ppr_cb(struct pci_dev
*pdev
,
862 amd_iommu_invalid_ppr_cb cb
)
864 struct device_state
*dev_state
;
869 if (!amd_iommu_v2_supported())
872 devid
= device_id(pdev
);
874 spin_lock_irqsave(&state_lock
, flags
);
877 dev_state
= __get_device_state(devid
);
878 if (dev_state
== NULL
)
881 dev_state
->inv_ppr_cb
= cb
;
886 spin_unlock_irqrestore(&state_lock
, flags
);
890 EXPORT_SYMBOL(amd_iommu_set_invalid_ppr_cb
);
/*
 * amd_iommu_set_invalidate_ctx_cb - install the device's context
 * invalidation callback.
 * @pdev: device previously set up with amd_iommu_init_device()
 * @cb:   callback stored in dev_state->inv_ctx_cb; mn_release() calls it
 *        with the device and PASID when a still-valid binding's mm goes away
 *
 * The lookup and the store both happen under state_lock.
 * NOTE(review): the return values and goto/label lines are not visible
 * in this excerpt.
 */
892 int amd_iommu_set_invalidate_ctx_cb(struct pci_dev
*pdev
,
893 amd_iommu_invalidate_ctx cb
)
895 struct device_state
*dev_state
;
900 if (!amd_iommu_v2_supported())
903 devid
= device_id(pdev
);
905 spin_lock_irqsave(&state_lock
, flags
);
908 dev_state
= __get_device_state(devid
);
909 if (dev_state
== NULL
)
912 dev_state
->inv_ctx_cb
= cb
;
917 spin_unlock_irqrestore(&state_lock
, flags
);
921 EXPORT_SYMBOL(amd_iommu_set_invalidate_ctx_cb
);
/*
 * Module init. Prints a banner; when IOMMUv2 is not supported it logs
 * that fact and, per the comment below, still loads so that dependent
 * modules can resolve the exported symbols. Otherwise it initialises
 * state_lock, allocates the "amd_iommu_v2" workqueue (WQ_MEM_RECLAIM)
 * and registers the PPR notifier.
 * NOTE(review): return statements and the error path for a failed
 * workqueue allocation are not visible in this excerpt.
 */
923 static int __init
amd_iommu_v2_init(void)
927 pr_info("AMD IOMMUv2 driver by Joerg Roedel <jroedel@suse.de>\n");
929 if (!amd_iommu_v2_supported()) {
930 pr_info("AMD IOMMUv2 functionality not available on this system\n");
932 * Load anyway to provide the symbols to other modules
933 * which may use AMD IOMMUv2 optionally.
938 spin_lock_init(&state_lock
);
941 iommu_wq
= alloc_workqueue("amd_iommu_v2", WQ_MEM_RECLAIM
, 0);
942 if (iommu_wq
== NULL
)
945 amd_iommu_register_ppr_notifier(&ppr_nb
);
/*
 * Module exit. Unregisters the PPR notifier, flushes pending fault work,
 * then scans every possible device id (0 .. MAX_DEVICES - 1) and frees
 * any device state that is still registered. The workqueue is destroyed
 * last because freeing a device may flush it.
 *
 * In the loop the reference taken by get_device_state() is dropped before
 * amd_iommu_free_device() is called; dev_state->pdev is still read after
 * that put, which presumably relies on the initial reference from
 * amd_iommu_init_device() keeping the state alive -- TODO confirm.
 */
953 static void __exit
amd_iommu_v2_exit(void)
955 struct device_state
*dev_state
;
958 if (!amd_iommu_v2_supported())
961 amd_iommu_unregister_ppr_notifier(&ppr_nb
);
963 flush_workqueue(iommu_wq
);
966 * The loop below might call flush_workqueue(), so call
967 * destroy_workqueue() after it
969 for (i
= 0; i
< MAX_DEVICES
; ++i
) {
970 dev_state
= get_device_state(i
);
972 if (dev_state
== NULL
)
977 put_device_state(dev_state
);
978 amd_iommu_free_device(dev_state
->pdev
);
981 destroy_workqueue(iommu_wq
);
/* Register the module's load and unload entry points. */
984 module_init(amd_iommu_v2_init
);
985 module_exit(amd_iommu_v2_exit
);