// SPDX-License-Identifier: GPL-2.0-only
/*
 *
 * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com>
 * Copyright 2016 Alexey Kardashevskiy, IBM Corporation <aik@au1.ibm.com>
 */

#include <linux/types.h>
#include <linux/string.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/highmem.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include <linux/hugetlb.h>
#include <linux/list.h>
#include <linux/anon_inodes.h>
#include <linux/iommu.h>
#include <linux/file.h>
#include <linux/mm.h>

#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/book3s/64/mmu-hash.h>
#include <asm/hvcall.h>
#include <asm/synch.h>
#include <asm/ppc-opcode.h>
#include <asm/kvm_host.h>
#include <asm/udbg.h>
#include <asm/iommu.h>
#include <asm/tce.h>
#include <asm/mmu_context.h>

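/*
 * Number of host pages needed to hold @iommu_pages 64-bit TCE entries,
 * i.e. the backing store for the guest's view of the TCE table.
 */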
static unsigned long kvmppc_tce_pages(unsigned long iommu_pages)
{
        return ALIGN(iommu_pages * sizeof(u64), PAGE_SIZE) / PAGE_SIZE;
}

static unsigned long kvmppc_stt_pages(unsigned long tce_pages)
{
        unsigned long stt_bytes = sizeof(struct kvmppc_spapr_tce_table) +
                        (tce_pages * sizeof(struct page *));

        return tce_pages + ALIGN(stt_bytes, PAGE_SIZE) / PAGE_SIZE;
}

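/*
 * RCU callback: drops the iommu_table reference taken at attach time and
 * frees the kvmppc_spapr_tce_iommu_table descriptor.
 */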
static void kvm_spapr_tce_iommu_table_free(struct rcu_head *head)
{
        struct kvmppc_spapr_tce_iommu_table *stit = container_of(head,
                        struct kvmppc_spapr_tce_iommu_table, rcu);

        iommu_tce_table_put(stit->tbl);

        kfree(stit);
}

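/*
 * kref release callback: unlinks the hardware table from the LIOBN's list
 * and defers the actual freeing to an RCU grace period so that lockless
 * list walkers remain safe.
 */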
static void kvm_spapr_tce_liobn_put(struct kref *kref)
{
        struct kvmppc_spapr_tce_iommu_table *stit = container_of(kref,
                        struct kvmppc_spapr_tce_iommu_table, kref);

        list_del_rcu(&stit->next);

        call_rcu(&stit->rcu, kvm_spapr_tce_iommu_table_free);
}

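/*
 * Drops every reference this VM holds on the iommu_tables of @grp, e.g.
 * when the IOMMU group is released from the VM.
 */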
extern void kvm_spapr_tce_release_iommu_group(struct kvm *kvm,
                struct iommu_group *grp)
{
        int i;
        struct kvmppc_spapr_tce_table *stt;
        struct kvmppc_spapr_tce_iommu_table *stit, *tmp;
        struct iommu_table_group *table_group = NULL;

        list_for_each_entry_rcu(stt, &kvm->arch.spapr_tce_tables, list) {

                table_group = iommu_group_get_iommudata(grp);
                if (WARN_ON(!table_group))
                        continue;

                list_for_each_entry_safe(stit, tmp, &stt->iommu_tables, next) {
                        for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
                                if (table_group->tables[i] != stit->tbl)
                                        continue;

                                kref_put(&stit->kref, kvm_spapr_tce_liobn_put);
                        }
                }
        }
}

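/*
 * Associates the LIOBN identified by the TCE table fd @tablefd with a
 * hardware iommu_table of @grp whose window geometry is compatible with
 * the guest-visible table, taking an extra reference on that table.
 */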
extern long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd,
                struct iommu_group *grp)
{
        struct kvmppc_spapr_tce_table *stt = NULL;
        bool found = false;
        struct iommu_table *tbl = NULL;
        struct iommu_table_group *table_group;
        long i;
        struct kvmppc_spapr_tce_iommu_table *stit;
        struct fd f;

        f = fdget(tablefd);
        if (!f.file)
                return -EBADF;

        list_for_each_entry_rcu(stt, &kvm->arch.spapr_tce_tables, list) {
                if (stt == f.file->private_data) {
                        found = true;
                        break;
                }
        }

        fdput(f);

        if (!found)
                return -EINVAL;

        table_group = iommu_group_get_iommudata(grp);
        if (WARN_ON(!table_group))
                return -EFAULT;

        for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
                struct iommu_table *tbltmp = table_group->tables[i];

                if (!tbltmp)
                        continue;
                /* Make sure hardware table parameters are compatible */
                if ((tbltmp->it_page_shift <= stt->page_shift) &&
                                (tbltmp->it_offset << tbltmp->it_page_shift ==
                                 stt->offset << stt->page_shift) &&
                                (tbltmp->it_size << tbltmp->it_page_shift >=
                                 stt->size << stt->page_shift)) {
                        /*
                         * Reference the table to avoid races with
                         * add/remove DMA windows.
                         */
                        tbl = iommu_tce_table_get(tbltmp);
                        break;
                }
        }
        if (!tbl)
                return -EINVAL;

        list_for_each_entry_rcu(stit, &stt->iommu_tables, next) {
                if (tbl != stit->tbl)
                        continue;

                if (!kref_get_unless_zero(&stit->kref)) {
                        /* stit is being destroyed */
                        iommu_tce_table_put(tbl);
                        return -ENOTTY;
                }
                /*
                 * The table is already known to this KVM, we just increased
                 * its KVM reference counter and can return.
                 */
                return 0;
        }

        stit = kzalloc(sizeof(*stit), GFP_KERNEL);
        if (!stit) {
                iommu_tce_table_put(tbl);
                return -ENOMEM;
        }

        stit->tbl = tbl;
        kref_init(&stit->kref);

        list_add_rcu(&stit->next, &stt->iommu_tables);

        return 0;
}

static void release_spapr_tce_table(struct rcu_head *head)
{
        struct kvmppc_spapr_tce_table *stt = container_of(head,
                        struct kvmppc_spapr_tce_table, rcu);
        unsigned long i, npages = kvmppc_tce_pages(stt->size);

        for (i = 0; i < npages; i++)
                if (stt->pages[i])
                        __free_page(stt->pages[i]);

        kfree(stt);
}

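/*
 * Returns the page backing TCE table page @sttpage, allocating it on first
 * use; alloc_lock serialises racing allocations.
 */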
static struct page *kvm_spapr_get_tce_page(struct kvmppc_spapr_tce_table *stt,
                unsigned long sttpage)
{
        struct page *page = stt->pages[sttpage];

        if (page)
                return page;

        mutex_lock(&stt->alloc_lock);
        page = stt->pages[sttpage];
        if (!page) {
                page = alloc_page(GFP_KERNEL | __GFP_ZERO);
                WARN_ON_ONCE(!page);
                if (page)
                        stt->pages[sttpage] = page;
        }
        mutex_unlock(&stt->alloc_lock);

        return page;
}

static vm_fault_t kvm_spapr_tce_fault(struct vm_fault *vmf)
{
        struct kvmppc_spapr_tce_table *stt = vmf->vma->vm_file->private_data;
        struct page *page;

        if (vmf->pgoff >= kvmppc_tce_pages(stt->size))
                return VM_FAULT_SIGBUS;

        page = kvm_spapr_get_tce_page(stt, vmf->pgoff);
        if (!page)
                return VM_FAULT_OOM;

        get_page(page);
        vmf->page = page;
        return 0;
}

static const struct vm_operations_struct kvm_spapr_tce_vm_ops = {
        .fault = kvm_spapr_tce_fault,
};

static int kvm_spapr_tce_mmap(struct file *file, struct vm_area_struct *vma)
{
        vma->vm_ops = &kvm_spapr_tce_vm_ops;
        return 0;
}

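/*
 * fd release: unlinks the table from the VM, drops all hardware table
 * references, undoes the locked_vm accounting and frees the table after
 * an RCU grace period.
 */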
static int kvm_spapr_tce_release(struct inode *inode, struct file *filp)
{
        struct kvmppc_spapr_tce_table *stt = filp->private_data;
        struct kvmppc_spapr_tce_iommu_table *stit, *tmp;
        struct kvm *kvm = stt->kvm;

        mutex_lock(&kvm->lock);
        list_del_rcu(&stt->list);
        mutex_unlock(&kvm->lock);

        list_for_each_entry_safe(stit, tmp, &stt->iommu_tables, next) {
                WARN_ON(!kref_read(&stit->kref));
                while (1) {
                        if (kref_put(&stit->kref, kvm_spapr_tce_liobn_put))
                                break;
                }
        }

        account_locked_vm(kvm->mm,
                kvmppc_stt_pages(kvmppc_tce_pages(stt->size)), false);

        kvm_put_kvm(stt->kvm);

        call_rcu(&stt->rcu, release_spapr_tce_table);

        return 0;
}

static const struct file_operations kvm_spapr_tce_fops = {
        .mmap           = kvm_spapr_tce_mmap,
        .release        = kvm_spapr_tce_release,
};

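/*
 * Handler for the KVM_CREATE_SPAPR_TCE_64 ioctl: validates the requested
 * window geometry, accounts the memory against the locked_vm limit,
 * allocates the table and returns an anonymous fd which userspace can
 * mmap() to read the guest-written TCEs.
 */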
long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
                                   struct kvm_create_spapr_tce_64 *args)
{
        struct kvmppc_spapr_tce_table *stt = NULL;
        struct kvmppc_spapr_tce_table *siter;
        struct mm_struct *mm = kvm->mm;
        unsigned long npages, size = args->size;
        int ret;

        if (!args->size || args->page_shift < 12 || args->page_shift > 34 ||
                (args->offset + args->size > (ULLONG_MAX >> args->page_shift)))
                return -EINVAL;

        npages = kvmppc_tce_pages(size);
        ret = account_locked_vm(mm, kvmppc_stt_pages(npages), true);
        if (ret)
                return ret;

        ret = -ENOMEM;
        stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *),
                      GFP_KERNEL);
        if (!stt)
                goto fail_acct;

        stt->liobn = args->liobn;
        stt->page_shift = args->page_shift;
        stt->offset = args->offset;
        stt->size = size;
        stt->kvm = kvm;
        mutex_init(&stt->alloc_lock);
        INIT_LIST_HEAD_RCU(&stt->iommu_tables);

        mutex_lock(&kvm->lock);

        /* Check this LIOBN hasn't been previously allocated */
        ret = 0;
        list_for_each_entry(siter, &kvm->arch.spapr_tce_tables, list) {
                if (siter->liobn == args->liobn) {
                        ret = -EBUSY;
                        break;
                }
        }

        kvm_get_kvm(kvm);
        if (!ret)
                ret = anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
                                       stt, O_RDWR | O_CLOEXEC);

        if (ret >= 0)
                list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables);
        else
                kvm_put_kvm_no_destroy(kvm);

        mutex_unlock(&kvm->lock);

        if (ret >= 0)
                return ret;

        kfree(stt);
fail_acct:
        account_locked_vm(mm, kvmppc_stt_pages(npages), false);

        return ret;
}

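/*
 * Converts a guest TCE (guest physical address plus permission bits) into
 * the corresponding host userspace address.
 */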
static long kvmppc_tce_to_ua(struct kvm *kvm, unsigned long tce,
                unsigned long *ua)
{
        unsigned long gfn = tce >> PAGE_SHIFT;
        struct kvm_memory_slot *memslot;

        memslot = search_memslots(kvm_memslots(kvm), gfn);
        if (!memslot)
                return -EINVAL;

        *ua = __gfn_to_hva_memslot(memslot, gfn) |
                (tce & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE));

        return 0;
}

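/*
 * Validates a guest TCE before anything is changed: checks the permission
 * bits, the GPA against the table's page size, and that the page is backed
 * by preregistered (mm_iommu) memory for every attached hardware table.
 */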
static long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt,
                unsigned long tce)
{
        unsigned long gpa = tce & ~(TCE_PCI_READ | TCE_PCI_WRITE);
        enum dma_data_direction dir = iommu_tce_direction(tce);
        struct kvmppc_spapr_tce_iommu_table *stit;
        unsigned long ua = 0;

        /* Allow userspace to poison TCE table */
        if (dir == DMA_NONE)
                return H_SUCCESS;

        if (iommu_tce_check_gpa(stt->page_shift, gpa))
                return H_TOO_HARD;

        if (kvmppc_tce_to_ua(stt->kvm, tce, &ua))
                return H_TOO_HARD;

        list_for_each_entry_rcu(stit, &stt->iommu_tables, next) {
                unsigned long hpa = 0;
                struct mm_iommu_table_group_mem_t *mem;
                long shift = stit->tbl->it_page_shift;

                mem = mm_iommu_lookup(stt->kvm->mm, ua, 1ULL << shift);
                if (!mem)
                        return H_TOO_HARD;

                if (mm_iommu_ua_to_hpa(mem, ua, shift, &hpa))
                        return H_TOO_HARD;
        }

        return H_SUCCESS;
}

/*
 * Handles TCE requests for emulated devices.
 * Puts guest TCE values to the table and expects user space to convert them.
 * Cannot fail so kvmppc_tce_validate must be called before it.
 */
static void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt,
                unsigned long idx, unsigned long tce)
{
        struct page *page;
        u64 *tbl;
        unsigned long sttpage;

        idx -= stt->offset;
        sttpage = idx / TCES_PER_PAGE;
        page = stt->pages[sttpage];

        if (!page) {
                /* We allow any TCE, not just with read|write permissions */
                if (!tce)
                        return;

                page = kvm_spapr_get_tce_page(stt, sttpage);
                if (!page)
                        return;
        }
        tbl = page_to_virt(page);

        tbl[idx % TCES_PER_PAGE] = tce;
}

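/*
 * Clears one hardware TCE entry (resets it to DMA_NONE) using the
 * no-kill exchange helper.
 */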
static void kvmppc_clear_tce(struct mm_struct *mm, struct iommu_table *tbl,
                unsigned long entry)
{
        unsigned long hpa = 0;
        enum dma_data_direction dir = DMA_NONE;

        iommu_tce_xchg_no_kill(mm, tbl, entry, &hpa, &dir);
}

static long kvmppc_tce_iommu_mapped_dec(struct kvm *kvm,
                struct iommu_table *tbl, unsigned long entry)
{
        struct mm_iommu_table_group_mem_t *mem = NULL;
        const unsigned long pgsize = 1ULL << tbl->it_page_shift;
        __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);

        if (!pua)
                return H_SUCCESS;

        mem = mm_iommu_lookup(kvm->mm, be64_to_cpu(*pua), pgsize);
        if (!mem)
                return H_TOO_HARD;

        mm_iommu_mapped_dec(mem);

        *pua = cpu_to_be64(0);

        return H_SUCCESS;
}

static long kvmppc_tce_iommu_do_unmap(struct kvm *kvm,
                struct iommu_table *tbl, unsigned long entry)
{
        enum dma_data_direction dir = DMA_NONE;
        unsigned long hpa = 0;
        long ret;

        if (WARN_ON_ONCE(iommu_tce_xchg_no_kill(kvm->mm, tbl, entry, &hpa,
                                        &dir)))
                return H_TOO_HARD;

        if (dir == DMA_NONE)
                return H_SUCCESS;

        ret = kvmppc_tce_iommu_mapped_dec(kvm, tbl, entry);
        if (ret != H_SUCCESS)
                iommu_tce_xchg_no_kill(kvm->mm, tbl, entry, &hpa, &dir);

        return ret;
}

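/*
 * Unmaps all hardware (IOMMU-page-sized) entries covered by a single
 * guest-visible TCE entry; the guest page may span several IOMMU pages.
 */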
static long kvmppc_tce_iommu_unmap(struct kvm *kvm,
                struct kvmppc_spapr_tce_table *stt, struct iommu_table *tbl,
                unsigned long entry)
{
        unsigned long i, ret = H_SUCCESS;
        unsigned long subpages = 1ULL << (stt->page_shift - tbl->it_page_shift);
        unsigned long io_entry = entry * subpages;

        for (i = 0; i < subpages; ++i) {
                ret = kvmppc_tce_iommu_do_unmap(kvm, tbl, io_entry + i);
                if (ret != H_SUCCESS)
                        break;
        }

        return ret;
}

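/*
 * Maps one hardware TCE: looks @ua up in the preregistered (mm_iommu)
 * memory, takes a mapped reference, installs the host physical address and
 * records @ua in the userspace view of the table.
 */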
long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
                unsigned long entry, unsigned long ua,
                enum dma_data_direction dir)
{
        long ret;
        unsigned long hpa;
        __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
        struct mm_iommu_table_group_mem_t *mem;

        if (!pua)
                /* it_userspace allocation might be delayed */
                return H_TOO_HARD;

        mem = mm_iommu_lookup(kvm->mm, ua, 1ULL << tbl->it_page_shift);
        if (!mem)
                /* This only handles v2 IOMMU type, v1 is handled via ioctl() */
                return H_TOO_HARD;

        if (WARN_ON_ONCE(mm_iommu_ua_to_hpa(mem, ua, tbl->it_page_shift, &hpa)))
                return H_TOO_HARD;

        if (mm_iommu_mapped_inc(mem))
                return H_TOO_HARD;

        ret = iommu_tce_xchg_no_kill(kvm->mm, tbl, entry, &hpa, &dir);
        if (WARN_ON_ONCE(ret)) {
                mm_iommu_mapped_dec(mem);
                return H_TOO_HARD;
        }

        if (dir != DMA_NONE)
                kvmppc_tce_iommu_mapped_dec(kvm, tbl, entry);

        *pua = cpu_to_be64(ua);

        return 0;
}

static long kvmppc_tce_iommu_map(struct kvm *kvm,
                struct kvmppc_spapr_tce_table *stt, struct iommu_table *tbl,
                unsigned long entry, unsigned long ua,
                enum dma_data_direction dir)
{
        unsigned long i, pgoff, ret = H_SUCCESS;
        unsigned long subpages = 1ULL << (stt->page_shift - tbl->it_page_shift);
        unsigned long io_entry = entry * subpages;

        for (i = 0, pgoff = 0; i < subpages;
                        ++i, pgoff += IOMMU_PAGE_SIZE(tbl)) {

                ret = kvmppc_tce_iommu_do_map(kvm, tbl,
                                io_entry + i, ua + pgoff, dir);
                if (ret != H_SUCCESS)
                        break;
        }

        return ret;
}

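/*
 * H_PUT_TCE hypercall: validates and writes a single TCE into the emulated
 * table and into every hardware table attached to this LIOBN.
 */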
long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
                      unsigned long ioba, unsigned long tce)
{
        struct kvmppc_spapr_tce_table *stt;
        long ret, idx;
        struct kvmppc_spapr_tce_iommu_table *stit;
        unsigned long entry, ua = 0;
        enum dma_data_direction dir;

        /* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */
        /*             liobn, ioba, tce); */

        stt = kvmppc_find_table(vcpu->kvm, liobn);
        if (!stt)
                return H_TOO_HARD;

        ret = kvmppc_ioba_validate(stt, ioba, 1);
        if (ret != H_SUCCESS)
                return ret;

        idx = srcu_read_lock(&vcpu->kvm->srcu);

        ret = kvmppc_tce_validate(stt, tce);
        if (ret != H_SUCCESS)
                goto unlock_exit;

        dir = iommu_tce_direction(tce);

        if ((dir != DMA_NONE) && kvmppc_tce_to_ua(vcpu->kvm, tce, &ua)) {
                ret = H_PARAMETER;
                goto unlock_exit;
        }

        entry = ioba >> stt->page_shift;

        list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
                if (dir == DMA_NONE)
                        ret = kvmppc_tce_iommu_unmap(vcpu->kvm, stt,
                                        stit->tbl, entry);
                else
                        ret = kvmppc_tce_iommu_map(vcpu->kvm, stt, stit->tbl,
                                        entry, ua, dir);

                iommu_tce_kill(stit->tbl, entry, 1);

                if (ret != H_SUCCESS) {
                        kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, entry);
                        goto unlock_exit;
                }
        }

        kvmppc_tce_put(stt, entry, tce);

unlock_exit:
        srcu_read_unlock(&vcpu->kvm->srcu, idx);

        return ret;
}
EXPORT_SYMBOL_GPL(kvmppc_h_put_tce);

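/*
 * H_PUT_TCE_INDIRECT hypercall: reads a list of up to 512 TCEs from guest
 * memory and applies each of them as H_PUT_TCE would.
 */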
long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
                unsigned long liobn, unsigned long ioba,
                unsigned long tce_list, unsigned long npages)
{
        struct kvmppc_spapr_tce_table *stt;
        long i, ret = H_SUCCESS, idx;
        unsigned long entry, ua = 0;
        u64 __user *tces;
        u64 tce;
        struct kvmppc_spapr_tce_iommu_table *stit;

        stt = kvmppc_find_table(vcpu->kvm, liobn);
        if (!stt)
                return H_TOO_HARD;

        entry = ioba >> stt->page_shift;
        /*
         * SPAPR spec says that the maximum size of the list is 512 TCEs
         * so the whole table fits in 4K page
         */
        if (npages > 512)
                return H_PARAMETER;

        if (tce_list & (SZ_4K - 1))
                return H_PARAMETER;

        ret = kvmppc_ioba_validate(stt, ioba, npages);
        if (ret != H_SUCCESS)
                return ret;

        idx = srcu_read_lock(&vcpu->kvm->srcu);
        if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua)) {
                ret = H_TOO_HARD;
                goto unlock_exit;
        }
        tces = (u64 __user *) ua;

        for (i = 0; i < npages; ++i) {
                if (get_user(tce, tces + i)) {
                        ret = H_TOO_HARD;
                        goto unlock_exit;
                }
                tce = be64_to_cpu(tce);

                ret = kvmppc_tce_validate(stt, tce);
                if (ret != H_SUCCESS)
                        goto unlock_exit;
        }

        for (i = 0; i < npages; ++i) {
                /*
                 * This looks unsafe, because we validate, then regrab
                 * the TCE from userspace which could have been changed by
                 * an external thread.
                 *
                 * But it actually is safe, because the relevant checks will be
                 * re-executed in the following code.  If userspace tries to
                 * change this dodgily it will result in a messier failure mode
                 * but won't threaten the host.
                 */
                if (get_user(tce, tces + i)) {
                        ret = H_TOO_HARD;
                        goto invalidate_exit;
                }
                tce = be64_to_cpu(tce);

                if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua)) {
                        ret = H_PARAMETER;
                        goto invalidate_exit;
                }

                list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
                        ret = kvmppc_tce_iommu_map(vcpu->kvm, stt,
                                        stit->tbl, entry + i, ua,
                                        iommu_tce_direction(tce));

                        if (ret != H_SUCCESS) {
                                kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl,
                                                entry + i);
                                goto invalidate_exit;
                        }
                }

                kvmppc_tce_put(stt, entry + i, tce);
        }

invalidate_exit:
        list_for_each_entry_lockless(stit, &stt->iommu_tables, next)
                iommu_tce_kill(stit->tbl, entry, npages);

unlock_exit:
        srcu_read_unlock(&vcpu->kvm->srcu, idx);

        return ret;
}
EXPORT_SYMBOL_GPL(kvmppc_h_put_tce_indirect);

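/*
 * H_STUFF_TCE hypercall: writes the same @tce_value into @npages
 * consecutive entries; with no permission bits set this clears the range.
 */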
long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
                unsigned long liobn, unsigned long ioba,
                unsigned long tce_value, unsigned long npages)
{
        struct kvmppc_spapr_tce_table *stt;
        long i, ret;
        struct kvmppc_spapr_tce_iommu_table *stit;

        stt = kvmppc_find_table(vcpu->kvm, liobn);
        if (!stt)
                return H_TOO_HARD;

        ret = kvmppc_ioba_validate(stt, ioba, npages);
        if (ret != H_SUCCESS)
                return ret;

        /* Check permission bits only to allow userspace poison TCE for debug */
        if (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ))
                return H_PARAMETER;

        list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
                unsigned long entry = ioba >> stt->page_shift;

                for (i = 0; i < npages; ++i) {
                        ret = kvmppc_tce_iommu_unmap(vcpu->kvm, stt,
                                        stit->tbl, entry + i);

                        if (ret == H_SUCCESS)
                                continue;

                        if (ret == H_TOO_HARD)
                                goto invalidate_exit;

                        WARN_ON_ONCE(1);
                        kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, entry);
                }
        }

        for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift))
                kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value);

invalidate_exit:
        list_for_each_entry_lockless(stit, &stt->iommu_tables, next)
                iommu_tce_kill(stit->tbl, ioba >> stt->page_shift, npages);

        return ret;
}
EXPORT_SYMBOL_GPL(kvmppc_h_stuff_tce);