/*
 * Copyright (C) 2010-2012 Advanced Micro Devices, Inc.
 * Author: Joerg Roedel <jroedel@suse.de>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
 */
#include <linux/mmu_notifier.h>
#include <linux/amd-iommu.h>
#include <linux/mm_types.h>
#include <linux/profile.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/iommu.h>
#include <linux/wait.h>
#include <linux/pci.h>
#include <linux/gfp.h>

#include "amd_iommu_types.h"
#include "amd_iommu_proto.h"
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Joerg Roedel <jroedel@suse.de>");
#define MAX_DEVICES		0x10000
#define PRI_QUEUE_SIZE		512
struct pri_queue {
	atomic_t inflight;
	bool finish;
	int status;
};

struct pasid_state {
	struct list_head list;			/* For global state-list */
	atomic_t count;				/* Reference count */
	unsigned mmu_notifier_count;		/* Counting nested mmu_notifier
						   calls */
	struct mm_struct *mm;			/* mm_struct for the faults */
	struct mmu_notifier mn;			/* mmu_notifier handle */
	struct pri_queue pri[PRI_QUEUE_SIZE];	/* PRI tag states */
	struct device_state *device_state;	/* Link to our device_state */
	int pasid;				/* PASID index */
	bool invalid;				/* Used during setup and
						   teardown of the pasid */
	spinlock_t lock;			/* Protect pri_queues and
						   mmu_notifier_count */
	wait_queue_head_t wq;			/* To wait for count == 0 */
};
struct device_state {
	struct list_head list;
	u16 devid;
	atomic_t count;
	struct pci_dev *pdev;
	struct pasid_state **states;
	struct iommu_domain *domain;
	int pasid_levels;
	int max_pasids;
	amd_iommu_invalid_ppr_cb inv_ppr_cb;
	amd_iommu_invalidate_ctx inv_ctx_cb;
	spinlock_t lock;
	wait_queue_head_t wq;
};
struct fault {
	struct work_struct work;
	struct device_state *dev_state;

	struct pasid_state *state;
	u64 address;
	u16 pasid;
	u16 tag;
	u16 finish;
	u16 flags;
};
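
/*
 * How these structures relate (summary of the code below): each device
 * registered with amd_iommu_init_device() gets a device_state, each bound
 * PASID a pasid_state, and every incoming peripheral page request (PPR)
 * is turned into a struct fault that is handled from the iommu_wq
 * workqueue.
 */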
static LIST_HEAD(state_list);
static spinlock_t state_lock;

static struct workqueue_struct *iommu_wq;
static void free_pasid_states(struct device_state *dev_state);
static u16 device_id(struct pci_dev *pdev)
{
	u16 devid;

	devid = pdev->bus->number;
	devid = (devid << 8) | pdev->devfn;

	return devid;
}
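
/*
 * Example of the computation above: a device at bus 0x3a, devfn 0x08
 * yields devid (0x3a << 8) | 0x08 == 0x3a08, i.e. the 16-bit PCI
 * requester ID used as the lookup key into state_list.
 */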
static struct device_state *__get_device_state(u16 devid)
{
	struct device_state *dev_state;

	list_for_each_entry(dev_state, &state_list, list) {
		if (dev_state->devid == devid)
			return dev_state;
	}

	return NULL;
}
static struct device_state *get_device_state(u16 devid)
{
	struct device_state *dev_state;
	unsigned long flags;

	spin_lock_irqsave(&state_lock, flags);
	dev_state = __get_device_state(devid);
	if (dev_state != NULL)
		atomic_inc(&dev_state->count);
	spin_unlock_irqrestore(&state_lock, flags);

	return dev_state;
}
static void free_device_state(struct device_state *dev_state)
{
	/*
	 * First detach device from domain - No more PRI requests will arrive
	 * from that device after it is unbound from the IOMMUv2 domain.
	 */
	iommu_detach_device(dev_state->domain, &dev_state->pdev->dev);

	/* Everything is down now, free the IOMMUv2 domain */
	iommu_domain_free(dev_state->domain);

	/* Finally get rid of the device-state */
	kfree(dev_state);
}
static void put_device_state(struct device_state *dev_state)
{
	if (atomic_dec_and_test(&dev_state->count))
		wake_up(&dev_state->wq);
}
/* Must be called under dev_state->lock */
static struct pasid_state **__get_pasid_state_ptr(struct device_state *dev_state,
						  int pasid, bool alloc)
{
	struct pasid_state **root, **ptr;
	int level, index;

	level = dev_state->pasid_levels;
	root  = dev_state->states;

	while (true) {

		index = (pasid >> (9 * level)) & 0x1ff;
		ptr   = &root[index];

		if (level == 0)
			break;

		if (*ptr == NULL) {
			if (!alloc)
				return NULL;

			*ptr = (void *)get_zeroed_page(GFP_ATOMIC);
			if (*ptr == NULL)
				return NULL;
		}

		root   = (struct pasid_state **)*ptr;
		level -= 1;
	}

	return ptr;
}
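
/*
 * Worked example for the table walk above (illustration only): with
 * pasid_levels == 1 and pasid == 0x12345, the first iteration uses index
 * (0x12345 >> 9) & 0x1ff == 0x91 in the root page, and the final (level 0)
 * step uses 0x12345 & 0x1ff == 0x145 in the leaf page that holds the
 * struct pasid_state pointer itself.
 */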
static int set_pasid_state(struct device_state *dev_state,
			   struct pasid_state *pasid_state,
			   int pasid)
{
	struct pasid_state **ptr;
	unsigned long flags;
	int ret;

	spin_lock_irqsave(&dev_state->lock, flags);
	ptr = __get_pasid_state_ptr(dev_state, pasid, true);

	ret = -ENOMEM;
	if (ptr == NULL)
		goto out_unlock;

	ret = -ENOMEM;
	if (*ptr != NULL)
		goto out_unlock;

	*ptr = pasid_state;

	ret = 0;

out_unlock:
	spin_unlock_irqrestore(&dev_state->lock, flags);

	return ret;
}
static void clear_pasid_state(struct device_state *dev_state, int pasid)
{
	struct pasid_state **ptr;
	unsigned long flags;

	spin_lock_irqsave(&dev_state->lock, flags);
	ptr = __get_pasid_state_ptr(dev_state, pasid, true);

	if (ptr == NULL)
		goto out_unlock;

	*ptr = NULL;

out_unlock:
	spin_unlock_irqrestore(&dev_state->lock, flags);
}
static struct pasid_state *get_pasid_state(struct device_state *dev_state,
					   int pasid)
{
	struct pasid_state **ptr, *ret = NULL;
	unsigned long flags;

	spin_lock_irqsave(&dev_state->lock, flags);
	ptr = __get_pasid_state_ptr(dev_state, pasid, false);

	if (ptr == NULL)
		goto out_unlock;

	ret = *ptr;
	if (ret)
		atomic_inc(&ret->count);

out_unlock:
	spin_unlock_irqrestore(&dev_state->lock, flags);

	return ret;
}
static void free_pasid_state(struct pasid_state *pasid_state)
{
	kfree(pasid_state);
}
static void put_pasid_state(struct pasid_state *pasid_state)
{
	if (atomic_dec_and_test(&pasid_state->count))
		wake_up(&pasid_state->wq);
}
static void put_pasid_state_wait(struct pasid_state *pasid_state)
{
	atomic_dec(&pasid_state->count);
	wait_event(pasid_state->wq, !atomic_read(&pasid_state->count));
	free_pasid_state(pasid_state);
}
static void unbind_pasid(struct pasid_state *pasid_state)
{
	struct iommu_domain *domain;

	domain = pasid_state->device_state->domain;

	/*
	 * Mark pasid_state as invalid, no more faults will be added to the
	 * work queue after this is visible everywhere.
	 */
	pasid_state->invalid = true;

	/* Make sure this is visible */
	smp_wmb();

	/* After this the device/pasid can't access the mm anymore */
	amd_iommu_domain_clear_gcr3(domain, pasid_state->pasid);

	/* Make sure no more pending faults are in the queue */
	flush_workqueue(iommu_wq);
}
static void free_pasid_states_level1(struct pasid_state **tbl)
{
	int i;

	for (i = 0; i < 512; ++i) {
		if (tbl[i] == NULL)
			continue;

		free_page((unsigned long)tbl[i]);
	}
}
static void free_pasid_states_level2(struct pasid_state **tbl)
{
	struct pasid_state **ptr;
	int i;

	for (i = 0; i < 512; ++i) {
		if (tbl[i] == NULL)
			continue;

		ptr = (struct pasid_state **)tbl[i];
		free_pasid_states_level1(ptr);
	}
}
static void free_pasid_states(struct device_state *dev_state)
{
	struct pasid_state *pasid_state;
	int i;

	for (i = 0; i < dev_state->max_pasids; ++i) {
		pasid_state = get_pasid_state(dev_state, i);
		if (pasid_state == NULL)
			continue;

		put_pasid_state(pasid_state);

		/*
		 * This will call the mn_release function and
		 * unbind the PASID
		 */
		mmu_notifier_unregister(&pasid_state->mn, pasid_state->mm);

		put_pasid_state_wait(pasid_state); /* Reference taken in
						      amd_iommu_bind_pasid */

		/* Drop reference taken in amd_iommu_bind_pasid */
		put_device_state(dev_state);
	}

	if (dev_state->pasid_levels == 2)
		free_pasid_states_level2(dev_state->states);
	else if (dev_state->pasid_levels == 1)
		free_pasid_states_level1(dev_state->states);
	else if (dev_state->pasid_levels != 0)
		BUG();

	free_page((unsigned long)dev_state->states);
}
static struct pasid_state *mn_to_state(struct mmu_notifier *mn)
{
	return container_of(mn, struct pasid_state, mn);
}
static void __mn_flush_page(struct mmu_notifier *mn,
			    unsigned long address)
{
	struct pasid_state *pasid_state;
	struct device_state *dev_state;

	pasid_state = mn_to_state(mn);
	dev_state   = pasid_state->device_state;

	amd_iommu_flush_page(dev_state->domain, pasid_state->pasid, address);
}
static int mn_clear_flush_young(struct mmu_notifier *mn,
				struct mm_struct *mm,
				unsigned long start,
				unsigned long end)
{
	for (; start < end; start += PAGE_SIZE)
		__mn_flush_page(mn, start);

	return 0;
}
static void mn_invalidate_page(struct mmu_notifier *mn,
			       struct mm_struct *mm,
			       unsigned long address)
{
	__mn_flush_page(mn, address);
}
static void mn_invalidate_range(struct mmu_notifier *mn,
				struct mm_struct *mm,
				unsigned long start, unsigned long end)
{
	struct pasid_state *pasid_state;
	struct device_state *dev_state;

	pasid_state = mn_to_state(mn);
	dev_state   = pasid_state->device_state;

	if ((start ^ (end - 1)) < PAGE_SIZE)
		amd_iommu_flush_page(dev_state->domain, pasid_state->pasid,
				     start);
	else
		amd_iommu_flush_tlb(dev_state->domain, pasid_state->pasid);
}
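
/*
 * The check above relies on (start ^ (end - 1)) < PAGE_SIZE being true
 * exactly when start and end - 1 lie in the same page: only then do all
 * bits at or above the page-offset bits agree, so a single-page IOTLB
 * flush suffices; otherwise the whole TLB for this PASID is flushed.
 */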
static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	struct pasid_state *pasid_state;
	struct device_state *dev_state;
	bool run_inv_ctx_cb;

	pasid_state    = mn_to_state(mn);
	dev_state      = pasid_state->device_state;
	run_inv_ctx_cb = !pasid_state->invalid;

	if (run_inv_ctx_cb && pasid_state->device_state->inv_ctx_cb)
		dev_state->inv_ctx_cb(dev_state->pdev, pasid_state->pasid);

	unbind_pasid(pasid_state);
}
static struct mmu_notifier_ops iommu_mn = {
	.release		= mn_release,
	.clear_flush_young	= mn_clear_flush_young,
	.invalidate_page	= mn_invalidate_page,
	.invalidate_range	= mn_invalidate_range,
};
static void set_pri_tag_status(struct pasid_state *pasid_state,
			       u16 tag, int status)
{
	unsigned long flags;

	spin_lock_irqsave(&pasid_state->lock, flags);
	pasid_state->pri[tag].status = status;
	spin_unlock_irqrestore(&pasid_state->lock, flags);
}
static void finish_pri_tag(struct device_state *dev_state,
			   struct pasid_state *pasid_state,
			   u16 tag)
{
	unsigned long flags;

	spin_lock_irqsave(&pasid_state->lock, flags);
	if (atomic_dec_and_test(&pasid_state->pri[tag].inflight) &&
	    pasid_state->pri[tag].finish) {
		amd_iommu_complete_ppr(dev_state->pdev, pasid_state->pasid,
				       pasid_state->pri[tag].status, tag);
		pasid_state->pri[tag].finish = false;
		pasid_state->pri[tag].status = PPR_SUCCESS;
	}
	spin_unlock_irqrestore(&pasid_state->lock, flags);
}
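
/*
 * Summary of the pri[tag] handshake (drawn from the code around it):
 * ppr_notifier() bumps pri[tag].inflight and records whether the device
 * asked for a completion ("finish"); finish_pri_tag() sends the
 * amd_iommu_complete_ppr() response only once the last in-flight fault
 * for that tag has been handled.
 */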
static void handle_fault_error(struct fault *fault)
{
	int status;

	if (!fault->dev_state->inv_ppr_cb) {
		set_pri_tag_status(fault->state, fault->tag, PPR_INVALID);
		return;
	}

	status = fault->dev_state->inv_ppr_cb(fault->dev_state->pdev,
					      fault->pasid,
					      fault->address,
					      fault->flags);
	switch (status) {
	case AMD_IOMMU_INV_PRI_RSP_SUCCESS:
		set_pri_tag_status(fault->state, fault->tag, PPR_SUCCESS);
		break;
	case AMD_IOMMU_INV_PRI_RSP_INVALID:
		set_pri_tag_status(fault->state, fault->tag, PPR_INVALID);
		break;
	case AMD_IOMMU_INV_PRI_RSP_FAIL:
		set_pri_tag_status(fault->state, fault->tag, PPR_FAILURE);
		break;
	default:
		BUG();
	}
}
static void do_fault(struct work_struct *work)
{
	struct fault *fault = container_of(work, struct fault, work);
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	u64 address;
	int ret, write;

	write = !!(fault->flags & PPR_FAULT_WRITE);

	mm      = fault->state->mm;
	address = fault->address;

	down_read(&mm->mmap_sem);
	vma = find_extend_vma(mm, address);
	if (!vma || address < vma->vm_start) {
		/* failed to get a vma in the right range */
		up_read(&mm->mmap_sem);
		handle_fault_error(fault);
		goto out;
	}

	ret = handle_mm_fault(mm, vma, address, write);
	if (ret & VM_FAULT_ERROR) {
		/* failed to service fault */
		up_read(&mm->mmap_sem);
		handle_fault_error(fault);
		goto out;
	}

	up_read(&mm->mmap_sem);

out:
	finish_pri_tag(fault->dev_state, fault->state, fault->tag);

	put_pasid_state(fault->state);

	kfree(fault);
}
static int ppr_notifier(struct notifier_block *nb, unsigned long e, void *data)
{
	struct amd_iommu_fault *iommu_fault;
	struct pasid_state *pasid_state;
	struct device_state *dev_state;
	unsigned long flags;
	struct fault *fault;
	bool finish;
	u16 tag;
	int ret;

	iommu_fault = data;
	tag         = iommu_fault->tag & 0x1ff;
	finish      = (iommu_fault->tag >> 9) & 1;

	ret = NOTIFY_DONE;
	dev_state = get_device_state(iommu_fault->device_id);
	if (dev_state == NULL)
		goto out;

	pasid_state = get_pasid_state(dev_state, iommu_fault->pasid);
	if (pasid_state == NULL || pasid_state->invalid) {
		/* We know the device but not the PASID -> send INVALID */
		amd_iommu_complete_ppr(dev_state->pdev, iommu_fault->pasid,
				       PPR_INVALID, tag);
		goto out_drop_state;
	}

	spin_lock_irqsave(&pasid_state->lock, flags);
	atomic_inc(&pasid_state->pri[tag].inflight);
	if (finish)
		pasid_state->pri[tag].finish = true;
	spin_unlock_irqrestore(&pasid_state->lock, flags);

	fault = kzalloc(sizeof(*fault), GFP_ATOMIC);
	if (fault == NULL) {
		/* We are OOM - send success and let the device re-fault */
		finish_pri_tag(dev_state, pasid_state, tag);
		goto out_drop_state;
	}

	fault->dev_state = dev_state;
	fault->address   = iommu_fault->address;
	fault->state     = pasid_state;
	fault->tag       = tag;
	fault->finish    = finish;
	fault->pasid     = iommu_fault->pasid;
	fault->flags     = iommu_fault->flags;
	INIT_WORK(&fault->work, do_fault);

	queue_work(iommu_wq, &fault->work);

	ret = NOTIFY_OK;

out_drop_state:

	if (ret != NOTIFY_OK && pasid_state)
		put_pasid_state(pasid_state);

	put_device_state(dev_state);

out:
	return ret;
}
static struct notifier_block ppr_nb = {
	.notifier_call = ppr_notifier,
};
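
/*
 * PPR handling path (as implemented above): the PPR notifier does no
 * sleeping work itself - it validates device and PASID, allocates a
 * struct fault with GFP_ATOMIC and queues it on iommu_wq; do_fault()
 * then resolves the address via handle_mm_fault() in workqueue context,
 * and finish_pri_tag() reports the outcome back to the device through
 * amd_iommu_complete_ppr().
 */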
int amd_iommu_bind_pasid(struct pci_dev *pdev, int pasid,
			 struct task_struct *task)
{
	struct pasid_state *pasid_state;
	struct device_state *dev_state;
	struct mm_struct *mm;
	u16 devid;
	int ret;

	if (!amd_iommu_v2_supported())
		return -ENODEV;

	devid     = device_id(pdev);
	dev_state = get_device_state(devid);

	if (dev_state == NULL)
		return -EINVAL;

	ret = -EINVAL;
	if (pasid < 0 || pasid >= dev_state->max_pasids)
		goto out;

	ret = -ENOMEM;
	pasid_state = kzalloc(sizeof(*pasid_state), GFP_KERNEL);
	if (pasid_state == NULL)
		goto out;

	atomic_set(&pasid_state->count, 1);
	init_waitqueue_head(&pasid_state->wq);
	spin_lock_init(&pasid_state->lock);

	mm                        = get_task_mm(task);
	pasid_state->mm           = mm;
	pasid_state->device_state = dev_state;
	pasid_state->pasid        = pasid;
	pasid_state->invalid      = true; /* Mark as valid only if we are
					     done with setting up the pasid */
	pasid_state->mn.ops       = &iommu_mn;

	if (pasid_state->mm == NULL)
		goto out_free;

	mmu_notifier_register(&pasid_state->mn, mm);

	ret = set_pasid_state(dev_state, pasid_state, pasid);
	if (ret)
		goto out_unregister;

	ret = amd_iommu_domain_set_gcr3(dev_state->domain, pasid,
					__pa(pasid_state->mm->pgd));
	if (ret)
		goto out_clear_state;

	/* Now we are ready to handle faults */
	pasid_state->invalid = false;

	/*
	 * Drop the reference to the mm_struct here. We rely on the
	 * mmu_notifier release call-back to inform us when the mm
	 * is going away.
	 */
	mmput(mm);

	return 0;

out_clear_state:
	clear_pasid_state(dev_state, pasid);

out_unregister:
	mmu_notifier_unregister(&pasid_state->mn, mm);

out_free:
	free_pasid_state(pasid_state);

out:
	put_device_state(dev_state);

	return ret;
}
EXPORT_SYMBOL(amd_iommu_bind_pasid);
void amd_iommu_unbind_pasid(struct pci_dev *pdev, int pasid)
{
	struct pasid_state *pasid_state;
	struct device_state *dev_state;
	u16 devid;

	if (!amd_iommu_v2_supported())
		return;

	devid = device_id(pdev);
	dev_state = get_device_state(devid);
	if (dev_state == NULL)
		return;

	if (pasid < 0 || pasid >= dev_state->max_pasids)
		goto out;

	pasid_state = get_pasid_state(dev_state, pasid);
	if (pasid_state == NULL)
		goto out;
	/*
	 * Drop reference taken here. We are safe because we still hold
	 * the reference taken in the amd_iommu_bind_pasid function.
	 */
	put_pasid_state(pasid_state);

	/* Clear the pasid state so that the pasid can be re-used */
	clear_pasid_state(dev_state, pasid_state->pasid);

	/*
	 * Call mmu_notifier_unregister to drop our reference
	 * to pasid_state->mm
	 */
	mmu_notifier_unregister(&pasid_state->mn, pasid_state->mm);

	put_pasid_state_wait(pasid_state); /* Reference taken in
					      amd_iommu_bind_pasid */
out:
	/* Drop reference taken in this function */
	put_device_state(dev_state);

	/* Drop reference taken in amd_iommu_bind_pasid */
	put_device_state(dev_state);
}
EXPORT_SYMBOL(amd_iommu_unbind_pasid);
int amd_iommu_init_device(struct pci_dev *pdev, int pasids)
{
	struct device_state *dev_state;
	unsigned long flags;
	int ret, tmp;
	u16 devid;

	if (!amd_iommu_v2_supported())
		return -ENODEV;

	if (pasids <= 0 || pasids > (PASID_MASK + 1))
		return -EINVAL;

	devid = device_id(pdev);

	dev_state = kzalloc(sizeof(*dev_state), GFP_KERNEL);
	if (dev_state == NULL)
		return -ENOMEM;

	spin_lock_init(&dev_state->lock);
	init_waitqueue_head(&dev_state->wq);
	dev_state->pdev  = pdev;
	dev_state->devid = devid;

	tmp = pasids;
	for (dev_state->pasid_levels = 0; (tmp - 1) & ~0x1ff; tmp >>= 9)
		dev_state->pasid_levels += 1;

	atomic_set(&dev_state->count, 1);
	dev_state->max_pasids = pasids;

	ret = -ENOMEM;
	dev_state->states = (void *)get_zeroed_page(GFP_KERNEL);
	if (dev_state->states == NULL)
		goto out_free_dev_state;

	dev_state->domain = iommu_domain_alloc(&pci_bus_type);
	if (dev_state->domain == NULL)
		goto out_free_states;

	amd_iommu_domain_direct_map(dev_state->domain);

	ret = amd_iommu_domain_enable_v2(dev_state->domain, pasids);
	if (ret)
		goto out_free_domain;

	ret = iommu_attach_device(dev_state->domain, &pdev->dev);
	if (ret != 0)
		goto out_free_domain;

	spin_lock_irqsave(&state_lock, flags);

	if (__get_device_state(devid) != NULL) {
		spin_unlock_irqrestore(&state_lock, flags);
		ret = -EBUSY;
		goto out_free_domain;
	}

	list_add_tail(&dev_state->list, &state_list);

	spin_unlock_irqrestore(&state_lock, flags);

	return 0;

out_free_domain:
	iommu_domain_free(dev_state->domain);

out_free_states:
	free_page((unsigned long)dev_state->states);

out_free_dev_state:
	kfree(dev_state);

	return ret;
}
EXPORT_SYMBOL(amd_iommu_init_device);
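
/*
 * Example for the pasid_levels computation above (illustration only):
 * up to 512 PASIDs fit into the single root page (pasid_levels == 0);
 * each additional factor of 512 adds one level of indirection, so e.g.
 * 65536 PASIDs give pasid_levels == 1.
 */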
void amd_iommu_free_device(struct pci_dev *pdev)
{
	struct device_state *dev_state;
	unsigned long flags;
	u16 devid;

	if (!amd_iommu_v2_supported())
		return;

	devid = device_id(pdev);

	spin_lock_irqsave(&state_lock, flags);

	dev_state = __get_device_state(devid);
	if (dev_state == NULL) {
		spin_unlock_irqrestore(&state_lock, flags);
		return;
	}

	list_del(&dev_state->list);

	spin_unlock_irqrestore(&state_lock, flags);

	/* Get rid of any remaining pasid states */
	free_pasid_states(dev_state);

	put_device_state(dev_state);
	/*
	 * Wait until the last reference is dropped before freeing
	 * the device state.
	 */
	wait_event(dev_state->wq, !atomic_read(&dev_state->count));
	free_device_state(dev_state);
}
EXPORT_SYMBOL(amd_iommu_free_device);
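
/*
 * Minimal usage sketch of the exported API (illustration only, not taken
 * from an in-tree caller; "my_pdev", the PASID count and the PASID value
 * are made up, and error handling is trimmed):
 *
 *	ret = amd_iommu_init_device(my_pdev, 16);	// up to 16 PASIDs
 *	if (!ret)
 *		ret = amd_iommu_bind_pasid(my_pdev, 1, current);
 *	...
 *	amd_iommu_unbind_pasid(my_pdev, 1);
 *	amd_iommu_free_device(my_pdev);
 */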
int amd_iommu_set_invalid_ppr_cb(struct pci_dev *pdev,
				 amd_iommu_invalid_ppr_cb cb)
{
	struct device_state *dev_state;
	unsigned long flags;
	u16 devid;
	int ret;

	if (!amd_iommu_v2_supported())
		return -ENODEV;

	devid = device_id(pdev);

	spin_lock_irqsave(&state_lock, flags);

	ret = -EINVAL;
	dev_state = __get_device_state(devid);
	if (dev_state == NULL)
		goto out_unlock;

	dev_state->inv_ppr_cb = cb;

	ret = 0;

out_unlock:
	spin_unlock_irqrestore(&state_lock, flags);

	return ret;
}
EXPORT_SYMBOL(amd_iommu_set_invalid_ppr_cb);
int amd_iommu_set_invalidate_ctx_cb(struct pci_dev *pdev,
				    amd_iommu_invalidate_ctx cb)
{
	struct device_state *dev_state;
	unsigned long flags;
	u16 devid;
	int ret;

	if (!amd_iommu_v2_supported())
		return -ENODEV;

	devid = device_id(pdev);

	spin_lock_irqsave(&state_lock, flags);

	ret = -EINVAL;
	dev_state = __get_device_state(devid);
	if (dev_state == NULL)
		goto out_unlock;

	dev_state->inv_ctx_cb = cb;

	ret = 0;

out_unlock:
	spin_unlock_irqrestore(&state_lock, flags);

	return ret;
}
EXPORT_SYMBOL(amd_iommu_set_invalidate_ctx_cb);
static int __init amd_iommu_v2_init(void)
{
	int ret;

	pr_info("AMD IOMMUv2 driver by Joerg Roedel <jroedel@suse.de>\n");

	if (!amd_iommu_v2_supported()) {
		pr_info("AMD IOMMUv2 functionality not available on this system\n");

		/*
		 * Load anyway to provide the symbols to other modules
		 * which may use AMD IOMMUv2 optionally.
		 */
		return 0;
	}

	spin_lock_init(&state_lock);

	ret = -ENOMEM;
	iommu_wq = create_workqueue("amd_iommu_v2");
	if (iommu_wq == NULL)
		goto out;

	amd_iommu_register_ppr_notifier(&ppr_nb);

	return 0;

out:
	return ret;
}
static void __exit amd_iommu_v2_exit(void)
{
	struct device_state *dev_state;
	int i;

	if (!amd_iommu_v2_supported())
		return;

	amd_iommu_unregister_ppr_notifier(&ppr_nb);

	flush_workqueue(iommu_wq);

	/*
	 * The loop below might call flush_workqueue(), so call
	 * destroy_workqueue() after it
	 */
	for (i = 0; i < MAX_DEVICES; ++i) {
		dev_state = get_device_state(i);

		if (dev_state == NULL)
			continue;

		put_device_state(dev_state);
		amd_iommu_free_device(dev_state->pdev);
	}

	destroy_workqueue(iommu_wq);
}
module_init(amd_iommu_v2_init);
module_exit(amd_iommu_v2_exit);