// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com>
 * Copyright 2016 Alexey Kardashevskiy, IBM Corporation <aik@au1.ibm.com>
 */
#include <linux/types.h>
#include <linux/string.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/highmem.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include <linux/hugetlb.h>
#include <linux/list.h>
#include <linux/anon_inodes.h>
#include <linux/iommu.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/book3s/64/mmu-hash.h>
#include <asm/hvcall.h>
#include <asm/synch.h>
#include <asm/ppc-opcode.h>
#include <asm/udbg.h>
#include <asm/iommu.h>
#include <asm/tce.h>
#include <asm/mmu_context.h>
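
/*
 * kvmppc_tce_pages() returns the number of host pages needed to back a
 * guest-visible TCE table of @iommu_pages 64-bit entries.
 *
 * Illustrative sizing (not from the original source): a 1GiB DMA window
 * with 4KiB IOMMU pages has 262144 TCEs, i.e. 2MiB of table, which is
 * 32 backing pages on a 64KiB PAGE_SIZE host.
 */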
static unsigned long kvmppc_tce_pages(unsigned long iommu_pages)
{
	return ALIGN(iommu_pages * sizeof(u64), PAGE_SIZE) / PAGE_SIZE;
}
static unsigned long kvmppc_stt_pages(unsigned long tce_pages)
{
	unsigned long stt_bytes = sizeof(struct kvmppc_spapr_tce_table) +
			(tce_pages * sizeof(struct page *));

	return tce_pages + ALIGN(stt_bytes, PAGE_SIZE) / PAGE_SIZE;
}
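
/*
 * Each hardware iommu_table attached to a TCE table is tracked by a
 * kvmppc_spapr_tce_iommu_table (stit). Its kref counts how many times
 * the table has been attached; dropping the last reference unlinks the
 * entry and frees it after an RCU grace period so that lockless walkers
 * of stt->iommu_tables can still safely dereference stit->tbl.
 */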
static void kvm_spapr_tce_iommu_table_free(struct rcu_head *head)
{
	struct kvmppc_spapr_tce_iommu_table *stit = container_of(head,
			struct kvmppc_spapr_tce_iommu_table, rcu);

	iommu_tce_table_put(stit->tbl);

	kfree(stit);
}
static void kvm_spapr_tce_liobn_put(struct kref *kref)
{
	struct kvmppc_spapr_tce_iommu_table *stit = container_of(kref,
			struct kvmppc_spapr_tce_iommu_table, kref);

	list_del_rcu(&stit->next);

	call_rcu(&stit->rcu, kvm_spapr_tce_iommu_table_free);
}
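
/*
 * Called when an IOMMU group is detached from the VM: drop the references
 * taken at attach time for every registered TCE table that uses one of
 * this group's hardware tables.
 */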
extern void kvm_spapr_tce_release_iommu_group(struct kvm *kvm,
		struct iommu_group *grp)
{
	int i;
	struct kvmppc_spapr_tce_table *stt;
	struct kvmppc_spapr_tce_iommu_table *stit, *tmp;
	struct iommu_table_group *table_group = NULL;

	rcu_read_lock();
	list_for_each_entry_rcu(stt, &kvm->arch.spapr_tce_tables, list) {

		table_group = iommu_group_get_iommudata(grp);
		if (WARN_ON(!table_group))
			continue;

		list_for_each_entry_safe(stit, tmp, &stt->iommu_tables, next) {
			for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
				if (table_group->tables[i] != stit->tbl)
					continue;

				kref_put(&stit->kref, kvm_spapr_tce_liobn_put);
			}
		}
	}
	rcu_read_unlock();
}
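
/*
 * Associates an IOMMU group's hardware table with a TCE table created by
 * KVM_CREATE_SPAPR_TCE_64: @tablefd must refer to such a table, and the
 * group must expose an iommu_table whose page size, offset and size are
 * compatible with the guest view. A matching hardware table is either
 * reference-counted again (if already attached) or linked via a new stit.
 */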
extern long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd,
		struct iommu_group *grp)
{
	struct kvmppc_spapr_tce_table *stt = NULL;
	bool found = false;
	struct iommu_table *tbl = NULL;
	struct iommu_table_group *table_group;
	long i;
	struct kvmppc_spapr_tce_iommu_table *stit;
	struct fd f;

	f = fdget(tablefd);
	if (!f.file)
		return -EBADF;

	rcu_read_lock();
	list_for_each_entry_rcu(stt, &kvm->arch.spapr_tce_tables, list) {
		if (stt == f.file->private_data) {
			found = true;
			break;
		}
	}
	rcu_read_unlock();

	fdput(f);

	if (!found)
		return -EINVAL;

	table_group = iommu_group_get_iommudata(grp);
	if (WARN_ON(!table_group))
		return -EFAULT;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbltmp = table_group->tables[i];

		if (!tbltmp)
			continue;
		/* Make sure hardware table parameters are compatible */
		if ((tbltmp->it_page_shift <= stt->page_shift) &&
				(tbltmp->it_offset << tbltmp->it_page_shift ==
				 stt->offset << stt->page_shift) &&
				(tbltmp->it_size << tbltmp->it_page_shift >=
				 stt->size << stt->page_shift)) {
			/*
			 * Reference the table to avoid races with
			 * add/remove DMA windows.
			 */
			tbl = iommu_tce_table_get(tbltmp);
			break;
		}
	}
	if (!tbl)
		return -EINVAL;

	rcu_read_lock();
	list_for_each_entry_rcu(stit, &stt->iommu_tables, next) {
		if (tbl != stit->tbl)
			continue;

		if (!kref_get_unless_zero(&stit->kref)) {
			/* stit is being destroyed */
			iommu_tce_table_put(tbl);
			rcu_read_unlock();
			return -ENOTTY;
		}
		/*
		 * The table is already known to this KVM, we just increased
		 * its KVM reference counter and can return.
		 */
		rcu_read_unlock();
		return 0;
	}
	rcu_read_unlock();

	stit = kzalloc(sizeof(*stit), GFP_KERNEL);
	if (!stit) {
		iommu_tce_table_put(tbl);
		return -ENOMEM;
	}

	stit->tbl = tbl;
	kref_init(&stit->kref);

	list_add_rcu(&stit->next, &stt->iommu_tables);

	return 0;
}
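
/* RCU callback freeing the guest-visible TCE table pages and the stt itself. */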
static void release_spapr_tce_table(struct rcu_head *head)
{
	struct kvmppc_spapr_tce_table *stt = container_of(head,
			struct kvmppc_spapr_tce_table, rcu);
	unsigned long i, npages = kvmppc_tce_pages(stt->size);

	for (i = 0; i < npages; i++)
		if (stt->pages[i])
			__free_page(stt->pages[i]);

	kfree(stt);
}
static struct page *kvm_spapr_get_tce_page(struct kvmppc_spapr_tce_table *stt,
		unsigned long sttpage)
{
	struct page *page = stt->pages[sttpage];

	if (page)
		return page;

	/* Take the lock and recheck, another CPU may have allocated it */
	mutex_lock(&stt->alloc_lock);
	page = stt->pages[sttpage];
	if (!page) {
		page = alloc_page(GFP_KERNEL | __GFP_ZERO);
		WARN_ON_ONCE(!page);
		if (page)
			stt->pages[sttpage] = page;
	}
	mutex_unlock(&stt->alloc_lock);

	return page;
}
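
/*
 * Backing pages for the guest-visible table are allocated lazily, either
 * here when userspace mmap()s the TCE table fd and touches a page, or in
 * kvmppc_tce_put() when the guest writes a non-zero TCE.
 */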
static vm_fault_t kvm_spapr_tce_fault(struct vm_fault *vmf)
{
	struct kvmppc_spapr_tce_table *stt = vmf->vma->vm_file->private_data;
	struct page *page;

	if (vmf->pgoff >= kvmppc_tce_pages(stt->size))
		return VM_FAULT_SIGBUS;

	page = kvm_spapr_get_tce_page(stt, vmf->pgoff);
	if (!page)
		return VM_FAULT_OOM;

	get_page(page);
	vmf->page = page;
	return 0;
}
static const struct vm_operations_struct kvm_spapr_tce_vm_ops = {
	.fault = kvm_spapr_tce_fault,
};
static int kvm_spapr_tce_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &kvm_spapr_tce_vm_ops;
	return 0;
}
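
/*
 * fd release: unlink the table from the VM, drop all hardware table
 * references, return the locked_vm accounting and free everything after
 * an RCU grace period.
 */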
static int kvm_spapr_tce_release(struct inode *inode, struct file *filp)
{
	struct kvmppc_spapr_tce_table *stt = filp->private_data;
	struct kvmppc_spapr_tce_iommu_table *stit, *tmp;
	struct kvm *kvm = stt->kvm;

	mutex_lock(&kvm->lock);
	list_del_rcu(&stt->list);
	mutex_unlock(&kvm->lock);

	list_for_each_entry_safe(stit, tmp, &stt->iommu_tables, next) {
		WARN_ON(!kref_read(&stit->kref));
		while (1) {
			if (kref_put(&stit->kref, kvm_spapr_tce_liobn_put))
				break;
		}
	}

	account_locked_vm(kvm->mm,
		kvmppc_stt_pages(kvmppc_tce_pages(stt->size)), false);

	kvm_put_kvm(stt->kvm);

	call_rcu(&stt->rcu, release_spapr_tce_table);

	return 0;
}
static const struct file_operations kvm_spapr_tce_fops = {
	.mmap		= kvm_spapr_tce_mmap,
	.release	= kvm_spapr_tce_release,
};
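
/*
 * Handler for KVM_CREATE_SPAPR_TCE_64: accounts the table against the
 * process locked_vm limit, allocates the descriptor, rejects duplicate
 * LIOBNs and returns an anonymous fd which userspace can mmap() to read
 * the guest TCEs.
 */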
long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
				   struct kvm_create_spapr_tce_64 *args)
{
	struct kvmppc_spapr_tce_table *stt = NULL;
	struct kvmppc_spapr_tce_table *siter;
	struct mm_struct *mm = kvm->mm;
	unsigned long npages, size = args->size;
	int ret;

	if (!args->size || args->page_shift < 12 || args->page_shift > 34 ||
		(args->offset + args->size > (ULLONG_MAX >> args->page_shift)))
		return -EINVAL;

	npages = kvmppc_tce_pages(size);
	ret = account_locked_vm(mm, kvmppc_stt_pages(npages), true);
	if (ret)
		return ret;

	ret = -ENOMEM;
	stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *),
		      GFP_KERNEL);
	if (!stt)
		goto fail_acct;

	stt->liobn = args->liobn;
	stt->page_shift = args->page_shift;
	stt->offset = args->offset;
	stt->size = size;
	stt->kvm = kvm;
	mutex_init(&stt->alloc_lock);
	INIT_LIST_HEAD_RCU(&stt->iommu_tables);

	mutex_lock(&kvm->lock);

	/* Check this LIOBN hasn't been previously allocated */
	ret = 0;
	list_for_each_entry(siter, &kvm->arch.spapr_tce_tables, list) {
		if (siter->liobn == args->liobn) {
			ret = -EBUSY;
			break;
		}
	}

	kvm_get_kvm(kvm);
	if (!ret)
		ret = anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
				       stt, O_RDWR | O_CLOEXEC);

	if (ret >= 0)
		list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables);
	else
		kvm_put_kvm_no_destroy(kvm);

	mutex_unlock(&kvm->lock);

	if (ret >= 0)
		return ret;

	kfree(stt);
fail_acct:
	account_locked_vm(mm, kvmppc_stt_pages(npages), false);
	return ret;
}
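
/*
 * Translates the guest physical address encoded in a TCE into a host
 * userspace address using the memslots, preserving the offset within the
 * page and stripping the TCE_PCI_READ/WRITE permission bits.
 */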
static long kvmppc_tce_to_ua(struct kvm *kvm, unsigned long tce,
		unsigned long *ua)
{
	unsigned long gfn = tce >> PAGE_SHIFT;
	struct kvm_memory_slot *memslot;

	memslot = search_memslots(kvm_memslots(kvm), gfn);
	if (!memslot)
		return -EINVAL;

	*ua = __gfn_to_hva_memslot(memslot, gfn) |
		(tce & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE));

	return 0;
}
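
/*
 * Validates a TCE before it is put into the table: the guest physical
 * address must be aligned to the table's page size and, for every attached
 * hardware table, the backing memory must have been preregistered with
 * the mm_iommu_* API. Must be called before kvmppc_tce_put().
 */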
static long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt,
		unsigned long tce)
{
	unsigned long gpa = tce & ~(TCE_PCI_READ | TCE_PCI_WRITE);
	enum dma_data_direction dir = iommu_tce_direction(tce);
	struct kvmppc_spapr_tce_iommu_table *stit;
	unsigned long ua = 0;

	/* Allow userspace to poison TCE table */
	if (dir == DMA_NONE)
		return H_SUCCESS;

	if (iommu_tce_check_gpa(stt->page_shift, gpa))
		return H_TOO_HARD;

	if (kvmppc_tce_to_ua(stt->kvm, tce, &ua))
		return H_TOO_HARD;

	rcu_read_lock();
	list_for_each_entry_rcu(stit, &stt->iommu_tables, next) {
		unsigned long hpa = 0;
		struct mm_iommu_table_group_mem_t *mem;
		long shift = stit->tbl->it_page_shift;

		mem = mm_iommu_lookup(stt->kvm->mm, ua, 1ULL << shift);
		if (!mem || mm_iommu_ua_to_hpa(mem, ua, shift, &hpa)) {
			rcu_read_unlock();
			return H_TOO_HARD;
		}
	}
	rcu_read_unlock();

	return H_SUCCESS;
}
/*
 * Handles TCE requests for emulated devices.
 * Puts guest TCE values to the table and expects user space to convert them.
 * Cannot fail so kvmppc_tce_validate must be called before it.
 */
static void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt,
		unsigned long idx, unsigned long tce)
{
	struct page *page;
	u64 *tbl;
	unsigned long sttpage;

	idx -= stt->offset;
	sttpage = idx / TCES_PER_PAGE;
	page = stt->pages[sttpage];

	if (!page) {
		/* We allow any TCE, not just with read|write permissions */
		if (!tce)
			return;

		page = kvm_spapr_get_tce_page(stt, sttpage);
		if (!page)
			return;
	}
	tbl = page_to_virt(page);

	tbl[idx % TCES_PER_PAGE] = tce;
}
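
/*
 * Resets one hardware TCE to no-access (DMA_NONE); used on error paths
 * to make sure a failed update does not leave a stale mapping behind.
 */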
static void kvmppc_clear_tce(struct mm_struct *mm, struct iommu_table *tbl,
		unsigned long entry)
{
	unsigned long hpa = 0;
	enum dma_data_direction dir = DMA_NONE;

	iommu_tce_xchg_no_kill(mm, tbl, entry, &hpa, &dir);
}
static long kvmppc_tce_iommu_mapped_dec(struct kvm *kvm,
		struct iommu_table *tbl, unsigned long entry)
{
	struct mm_iommu_table_group_mem_t *mem = NULL;
	const unsigned long pgsize = 1ULL << tbl->it_page_shift;
	__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);

	if (!pua)
		return H_SUCCESS;

	mem = mm_iommu_lookup(kvm->mm, be64_to_cpu(*pua), pgsize);
	if (!mem)
		return H_TOO_HARD;

	mm_iommu_mapped_dec(mem);

	*pua = cpu_to_be64(0);

	return H_SUCCESS;
}
static long kvmppc_tce_iommu_do_unmap(struct kvm *kvm,
		struct iommu_table *tbl, unsigned long entry)
{
	enum dma_data_direction dir = DMA_NONE;
	unsigned long hpa = 0;
	long ret;

	if (WARN_ON_ONCE(iommu_tce_xchg_no_kill(kvm->mm, tbl, entry, &hpa,
					&dir)))
		return H_TOO_HARD;

	if (dir == DMA_NONE)
		return H_SUCCESS;

	ret = kvmppc_tce_iommu_mapped_dec(kvm, tbl, entry);
	if (ret != H_SUCCESS)
		iommu_tce_xchg_no_kill(kvm->mm, tbl, entry, &hpa, &dir);

	return ret;
}
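
/*
 * One guest TCE may cover several hardware TCEs: the attach-time check
 * guarantees stt->page_shift >= tbl->it_page_shift, so each guest entry
 * expands to 1 << (stt->page_shift - tbl->it_page_shift) hardware entries.
 * For example (illustrative only), a 64KiB guest IOMMU page over a 4KiB
 * hardware table means 16 subpages per guest TCE.
 */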
static long kvmppc_tce_iommu_unmap(struct kvm *kvm,
		struct kvmppc_spapr_tce_table *stt, struct iommu_table *tbl,
		unsigned long entry)
{
	unsigned long i, ret = H_SUCCESS;
	unsigned long subpages = 1ULL << (stt->page_shift - tbl->it_page_shift);
	unsigned long io_entry = entry * subpages;

	for (i = 0; i < subpages; ++i) {
		ret = kvmppc_tce_iommu_do_unmap(kvm, tbl, io_entry + i);
		if (ret != H_SUCCESS)
			break;
	}

	return ret;
}
static long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
		unsigned long entry, unsigned long ua,
		enum dma_data_direction dir)
{
	long ret;
	unsigned long hpa;
	__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
	struct mm_iommu_table_group_mem_t *mem;

	if (!pua)
		/* it_userspace allocation might be delayed */
		return H_TOO_HARD;

	mem = mm_iommu_lookup(kvm->mm, ua, 1ULL << tbl->it_page_shift);
	if (!mem)
		/* This only handles v2 IOMMU type, v1 is handled via ioctl() */
		return H_TOO_HARD;

	if (WARN_ON_ONCE(mm_iommu_ua_to_hpa(mem, ua, tbl->it_page_shift, &hpa)))
		return H_TOO_HARD;

	if (mm_iommu_mapped_inc(mem))
		return H_TOO_HARD;

	ret = iommu_tce_xchg_no_kill(kvm->mm, tbl, entry, &hpa, &dir);
	if (WARN_ON_ONCE(ret)) {
		mm_iommu_mapped_dec(mem);
		return H_TOO_HARD;
	}

	if (dir != DMA_NONE)
		kvmppc_tce_iommu_mapped_dec(kvm, tbl, entry);

	*pua = cpu_to_be64(ua);

	return 0;
}
static long kvmppc_tce_iommu_map(struct kvm *kvm,
		struct kvmppc_spapr_tce_table *stt, struct iommu_table *tbl,
		unsigned long entry, unsigned long ua,
		enum dma_data_direction dir)
{
	unsigned long i, pgoff, ret = H_SUCCESS;
	unsigned long subpages = 1ULL << (stt->page_shift - tbl->it_page_shift);
	unsigned long io_entry = entry * subpages;

	for (i = 0, pgoff = 0; i < subpages;
			++i, pgoff += IOMMU_PAGE_SIZE(tbl)) {

		ret = kvmppc_tce_iommu_do_map(kvm, tbl,
				io_entry + i, ua + pgoff, dir);
		if (ret != H_SUCCESS)
			break;
	}

	return ret;
}
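
/*
 * Virtual-mode handler for the H_PUT_TCE hcall: validates the request,
 * updates every attached hardware table and then the guest-visible copy.
 */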
long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
		      unsigned long ioba, unsigned long tce)
{
	struct kvmppc_spapr_tce_table *stt;
	long ret, idx;
	struct kvmppc_spapr_tce_iommu_table *stit;
	unsigned long entry, ua = 0;
	enum dma_data_direction dir;

	/* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */
	/* 	    liobn, ioba, tce); */

	stt = kvmppc_find_table(vcpu->kvm, liobn);
	if (!stt)
		return H_TOO_HARD;

	ret = kvmppc_ioba_validate(stt, ioba, 1);
	if (ret != H_SUCCESS)
		return ret;

	idx = srcu_read_lock(&vcpu->kvm->srcu);

	ret = kvmppc_tce_validate(stt, tce);
	if (ret != H_SUCCESS)
		goto unlock_exit;

	dir = iommu_tce_direction(tce);

	if ((dir != DMA_NONE) && kvmppc_tce_to_ua(vcpu->kvm, tce, &ua)) {
		ret = H_PARAMETER;
		goto unlock_exit;
	}

	entry = ioba >> stt->page_shift;

	list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
		if (dir == DMA_NONE)
			ret = kvmppc_tce_iommu_unmap(vcpu->kvm, stt,
					stit->tbl, entry);
		else
			ret = kvmppc_tce_iommu_map(vcpu->kvm, stt, stit->tbl,
					entry, ua, dir);

		iommu_tce_kill(stit->tbl, entry, 1);

		if (ret != H_SUCCESS) {
			kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, entry);
			goto unlock_exit;
		}
	}

	kvmppc_tce_put(stt, entry, tce);

unlock_exit:
	srcu_read_unlock(&vcpu->kvm->srcu, idx);

	return ret;
}
EXPORT_SYMBOL_GPL(kvmppc_h_put_tce);
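
/*
 * Virtual-mode handler for H_PUT_TCE_INDIRECT: reads up to 512 TCEs from
 * a 4KiB-aligned list in guest memory and applies them to consecutive
 * entries starting at @ioba.
 */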
long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
		unsigned long liobn, unsigned long ioba,
		unsigned long tce_list, unsigned long npages)
{
	struct kvmppc_spapr_tce_table *stt;
	long i, ret = H_SUCCESS, idx;
	unsigned long entry, ua = 0;
	u64 __user *tces;
	u64 tce;
	struct kvmppc_spapr_tce_iommu_table *stit;

	stt = kvmppc_find_table(vcpu->kvm, liobn);
	if (!stt)
		return H_TOO_HARD;

	entry = ioba >> stt->page_shift;
	/*
	 * SPAPR spec says that the maximum size of the list is 512 TCEs
	 * so the whole table fits in 4K page
	 */
	if (npages > 512)
		return H_PARAMETER;

	if (tce_list & (SZ_4K - 1))
		return H_PARAMETER;

	ret = kvmppc_ioba_validate(stt, ioba, npages);
	if (ret != H_SUCCESS)
		return ret;

	idx = srcu_read_lock(&vcpu->kvm->srcu);
	if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua)) {
		ret = H_TOO_HARD;
		goto unlock_exit;
	}
	tces = (u64 __user *) ua;

	for (i = 0; i < npages; ++i) {
		if (get_user(tce, tces + i)) {
			ret = H_TOO_HARD;
			goto unlock_exit;
		}
		tce = be64_to_cpu(tce);

		ret = kvmppc_tce_validate(stt, tce);
		if (ret != H_SUCCESS)
			goto unlock_exit;
	}

	for (i = 0; i < npages; ++i) {
		/*
		 * This looks unsafe, because we validate, then regrab
		 * the TCE from userspace which could have been changed.
		 *
		 * But it actually is safe, because the relevant checks will be
		 * re-executed in the following code. If userspace tries to
		 * change this dodgily it will result in a messier failure mode
		 * but won't threaten the host.
		 */
		if (get_user(tce, tces + i)) {
			ret = H_TOO_HARD;
			goto invalidate_exit;
		}
		tce = be64_to_cpu(tce);

		if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua)) {
			ret = H_PARAMETER;
			goto invalidate_exit;
		}

		list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
			ret = kvmppc_tce_iommu_map(vcpu->kvm, stt,
					stit->tbl, entry + i, ua,
					iommu_tce_direction(tce));

			if (ret != H_SUCCESS) {
				kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl,
						entry + i);
				goto invalidate_exit;
			}
		}

		kvmppc_tce_put(stt, entry + i, tce);
	}

invalidate_exit:
	list_for_each_entry_lockless(stit, &stt->iommu_tables, next)
		iommu_tce_kill(stit->tbl, entry, npages);

unlock_exit:
	srcu_read_unlock(&vcpu->kvm->srcu, idx);

	return ret;
}
EXPORT_SYMBOL_GPL(kvmppc_h_put_tce_indirect);
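
/*
 * Virtual-mode handler for H_STUFF_TCE: writes the same @tce_value
 * (which must carry no permission bits) into @npages consecutive entries,
 * typically used by the guest to clear a range of the table.
 */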
long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
		unsigned long liobn, unsigned long ioba,
		unsigned long tce_value, unsigned long npages)
{
	struct kvmppc_spapr_tce_table *stt;
	long i, ret;
	struct kvmppc_spapr_tce_iommu_table *stit;

	stt = kvmppc_find_table(vcpu->kvm, liobn);
	if (!stt)
		return H_TOO_HARD;

	ret = kvmppc_ioba_validate(stt, ioba, npages);
	if (ret != H_SUCCESS)
		return ret;

	/* Check permission bits only to allow userspace poison TCE for debug */
	if (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ))
		return H_PARAMETER;

	list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
		unsigned long entry = ioba >> stt->page_shift;

		for (i = 0; i < npages; ++i) {
			ret = kvmppc_tce_iommu_unmap(vcpu->kvm, stt,
					stit->tbl, entry + i);

			if (ret == H_SUCCESS)
				continue;

			if (ret == H_TOO_HARD)
				goto invalidate_exit;

			WARN_ON_ONCE(1);
			kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, entry);
		}
	}

	for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift))
		kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value);

invalidate_exit:
	list_for_each_entry_lockless(stit, &stt->iommu_tables, next)
		iommu_tce_kill(stit->tbl, ioba >> stt->page_shift, npages);

	return ret;
}
EXPORT_SYMBOL_GPL(kvmppc_h_stuff_tce);