// SPDX-License-Identifier: GPL-2.0-only
/******************************************************************************
 * privcmd.c
 *
 * Interface to privileged domain-0 commands.
 *
 * Copyright (c) 2002-2004, K A Fraser, B Dragovic
 */

#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt

#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/poll.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/srcu.h>
#include <linux/string.h>
#include <linux/workqueue.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/uaccess.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/seq_file.h>
#include <linux/miscdevice.h>
#include <linux/moduleparam.h>
#include <linux/virtio_mmio.h>

#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include <xen/xen.h>
#include <xen/events.h>
#include <xen/privcmd.h>
#include <xen/interface/xen.h>
#include <xen/interface/memory.h>
#include <xen/interface/hvm/dm_op.h>
#include <xen/interface/hvm/ioreq.h>
#include <xen/features.h>
#include <xen/page.h>
#include <xen/xen-ops.h>
#include <xen/balloon.h>
#ifdef CONFIG_XEN_ACPI
#include <xen/acpi.h>
#endif

#include "privcmd.h"

MODULE_DESCRIPTION("Xen hypercall passthrough driver");
MODULE_LICENSE("GPL");

#define PRIV_VMA_LOCKED ((void *)1)
static unsigned int privcmd_dm_op_max_num = 16;
module_param_named(dm_op_max_nr_bufs, privcmd_dm_op_max_num, uint, 0644);
MODULE_PARM_DESC(dm_op_max_nr_bufs,
		 "Maximum number of buffers per dm_op hypercall");

static unsigned int privcmd_dm_op_buf_max_size = 4096;
module_param_named(dm_op_buf_max_size, privcmd_dm_op_buf_max_size, uint,
		   0644);
MODULE_PARM_DESC(dm_op_buf_max_size,
		 "Maximum size of a dm_op hypercall buffer");
static int privcmd_vma_range_is_mapped(
	       struct vm_area_struct *vma,
	       unsigned long addr,
	       unsigned long nr_pages);
static long privcmd_ioctl_hypercall(struct file *file, void __user *udata)
{
	struct privcmd_data *data = file->private_data;
	struct privcmd_hypercall hypercall;
	long ret;

	/* Disallow arbitrary hypercalls if restricted */
	if (data->domid != DOMID_INVALID)
		return -EPERM;

	if (copy_from_user(&hypercall, udata, sizeof(hypercall)))
		return -EFAULT;

	xen_preemptible_hcall_begin();
	ret = privcmd_call(hypercall.op,
			   hypercall.arg[0], hypercall.arg[1],
			   hypercall.arg[2], hypercall.arg[3],
			   hypercall.arg[4]);
	xen_preemptible_hcall_end();

	return ret;
}
static void free_page_list(struct list_head *pages)
{
	struct page *p, *n;

	list_for_each_entry_safe(p, n, pages, lru)
		__free_page(p);

	INIT_LIST_HEAD(pages);
}
/*
 * Given an array of items in userspace, return a list of pages
 * containing the data.  If copying fails, either because of memory
 * allocation failure or a problem reading user memory, return an
 * error code; it's up to the caller to dispose of any partial list.
 */
static int gather_array(struct list_head *pagelist,
			unsigned nelem, size_t size,
			const void __user *data)
{
	unsigned pageidx;
	void *pagedata;
	int ret;

	if (size > PAGE_SIZE)
		return 0;

	pageidx = PAGE_SIZE;
	pagedata = NULL;	/* quiet, gcc */
	while (nelem--) {
		if (pageidx > PAGE_SIZE-size) {
			struct page *page = alloc_page(GFP_KERNEL);

			ret = -ENOMEM;
			if (page == NULL)
				goto fail;

			pagedata = page_address(page);

			list_add_tail(&page->lru, pagelist);
			pageidx = 0;
		}

		if (copy_from_user(pagedata + pageidx, data, size))
			goto fail;

		pageidx += size;
		data += size;
	}

	ret = 0;

fail:
	return ret;
}
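/*
 * Note that gather_array() never lets an element straddle a page
 * boundary: once fewer than "size" bytes remain in the current page it
 * starts a fresh one, so each page in the list holds a whole number of
 * elements.  traverse_pages() and traverse_pages_block() below rely on
 * that layout when they walk the list.
 */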
/*
 * Call function "fn" on each element of the array fragmented
 * over a list of pages.
 */
static int traverse_pages(unsigned nelem, size_t size,
			  struct list_head *pos,
			  int (*fn)(void *data, void *state),
			  void *state)
{
	void *pagedata;
	unsigned pageidx;
	int ret = 0;

	BUG_ON(size > PAGE_SIZE);

	pageidx = PAGE_SIZE;
	pagedata = NULL;	/* hush, gcc */

	while (nelem--) {
		if (pageidx > PAGE_SIZE-size) {
			struct page *page;

			pos = pos->next;
			page = list_entry(pos, struct page, lru);
			pagedata = page_address(page);
			pageidx = 0;
		}

		ret = (*fn)(pagedata + pageidx, state);
		if (ret)
			break;
		pageidx += size;
	}

	return ret;
}
/*
 * Similar to traverse_pages, but use each page as a "block" of
 * data to be processed as one unit.
 */
static int traverse_pages_block(unsigned nelem, size_t size,
				struct list_head *pos,
				int (*fn)(void *data, int nr, void *state),
				void *state)
{
	void *pagedata;
	int ret = 0;

	BUG_ON(size > PAGE_SIZE);

	while (nelem) {
		int nr = (PAGE_SIZE/size);
		struct page *page;

		if (nr > nelem)
			nr = nelem;
		pos = pos->next;
		page = list_entry(pos, struct page, lru);
		pagedata = page_address(page);
		ret = (*fn)(pagedata, nr, state);
		if (ret)
			break;
		nelem -= nr;
	}

	return ret;
}
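/*
 * The mmap-batch path feeds xen_pfn_t arrays through
 * traverse_pages_block(); the BUILD_BUG_ON() there checks that
 * PAGE_SIZE / sizeof(xen_pfn_t) is a multiple of XEN_PFN_PER_PAGE, so a
 * full block always covers a whole number of guest pages.
 */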
struct mmap_gfn_state {
	unsigned long va;
	struct vm_area_struct *vma;
	domid_t domain;
};

static int mmap_gfn_range(void *data, void *state)
{
	struct privcmd_mmap_entry *msg = data;
	struct mmap_gfn_state *st = state;
	struct vm_area_struct *vma = st->vma;
	int rc;

	/* Do not allow range to wrap the address space. */
	if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) ||
	    ((unsigned long)(msg->npages << PAGE_SHIFT) >= -st->va))
		return -EINVAL;

	/* Range chunks must be contiguous in va space. */
	if ((msg->va != st->va) ||
	    ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end))
		return -EINVAL;

	rc = xen_remap_domain_gfn_range(vma,
					msg->va & PAGE_MASK,
					msg->mfn, msg->npages,
					vma->vm_page_prot,
					st->domain, NULL);
	if (rc < 0)
		return rc;

	st->va += msg->npages << PAGE_SHIFT;

	return 0;
}
static long privcmd_ioctl_mmap(struct file *file, void __user *udata)
{
	struct privcmd_data *data = file->private_data;
	struct privcmd_mmap mmapcmd;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	int rc;
	LIST_HEAD(pagelist);
	struct mmap_gfn_state state;

	/* We only support privcmd_ioctl_mmap_batch for non-auto-translated. */
	if (xen_feature(XENFEAT_auto_translated_physmap))
		return -ENOSYS;

	if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
		return -EFAULT;

	/* If restriction is in place, check the domid matches */
	if (data->domid != DOMID_INVALID && data->domid != mmapcmd.dom)
		return -EPERM;

	rc = gather_array(&pagelist,
			  mmapcmd.num, sizeof(struct privcmd_mmap_entry),
			  mmapcmd.entry);

	if (rc || list_empty(&pagelist))
		goto out;

	mmap_write_lock(mm);

	{
		struct page *page = list_first_entry(&pagelist,
						     struct page, lru);
		struct privcmd_mmap_entry *msg = page_address(page);

		vma = vma_lookup(mm, msg->va);
		rc = -EINVAL;

		if (!vma || (msg->va != vma->vm_start) || vma->vm_private_data)
			goto out_up;
		vma->vm_private_data = PRIV_VMA_LOCKED;
	}

	state.va = vma->vm_start;
	state.vma = vma;
	state.domain = mmapcmd.dom;

	rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry),
			    &pagelist,
			    mmap_gfn_range, &state);

out_up:
	mmap_write_unlock(mm);

out:
	free_page_list(&pagelist);

	return rc;
}
struct mmap_batch_state {
	domid_t domain;
	unsigned long va;
	struct vm_area_struct *vma;
	int index;
	/* A tristate:
	 *      0 for no errors
	 *      1 if at least one error has happened (and no
	 *          -ENOENT errors have happened)
	 *      -ENOENT if at least 1 -ENOENT has happened.
	 */
	int global_error;
	int version;

	/* User-space gfn array to store errors in the second pass for V1. */
	xen_pfn_t __user *user_gfn;
	/* User-space int array to store errors in the second pass for V2. */
	int __user *user_err;
};
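/*
 * Errors are reported in two passes: mmap_batch_fn() only records the
 * per-frame results and folds them into global_error, and only if that
 * is non-zero does the caller walk the array again with
 * mmap_return_errors() to copy the individual codes back to user space.
 */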
/* auto translated dom0 note: if domU being created is PV, then gfn is
 * mfn(addr on bus). If it's auto xlated, then gfn is pfn (input to HAP).
 */
static int mmap_batch_fn(void *data, int nr, void *state)
{
	xen_pfn_t *gfnp = data;
	struct mmap_batch_state *st = state;
	struct vm_area_struct *vma = st->vma;
	struct page **pages = vma->vm_private_data;
	struct page **cur_pages = NULL;
	int ret;

	if (xen_feature(XENFEAT_auto_translated_physmap))
		cur_pages = &pages[st->index];

	BUG_ON(nr < 0);
	ret = xen_remap_domain_gfn_array(st->vma, st->va & PAGE_MASK, gfnp, nr,
					 (int *)gfnp, st->vma->vm_page_prot,
					 st->domain, cur_pages);

	/* Adjust the global_error? */
	if (ret != nr) {
		if (ret == -ENOENT)
			st->global_error = -ENOENT;
		else {
			/* Record that at least one error has happened. */
			if (st->global_error == 0)
				st->global_error = 1;
		}
	}
	st->va += XEN_PAGE_SIZE * nr;
	st->index += nr / XEN_PFN_PER_PAGE;

	return 0;
}
static int mmap_return_error(int err, struct mmap_batch_state *st)
{
	int ret;

	if (st->version == 1) {
		if (err) {
			xen_pfn_t gfn;

			ret = get_user(gfn, st->user_gfn);
			if (ret < 0)
				return ret;
			/*
			 * V1 encodes the error codes in the 32bit top
			 * nibble of the gfn (with its known
			 * limitations vis-a-vis 64 bit callers).
			 */
			gfn |= (err == -ENOENT) ?
				PRIVCMD_MMAPBATCH_PAGED_ERROR :
				PRIVCMD_MMAPBATCH_MFN_ERROR;
			return __put_user(gfn, st->user_gfn++);
		} else
			st->user_gfn++;
	} else { /* st->version == 2 */
		if (err)
			return __put_user(err, st->user_err++);
		else
			st->user_err++;
	}

	return 0;
}
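/*
 * For reference, the V1 error bits used above come from the privcmd
 * uapi header: PRIVCMD_MMAPBATCH_PAGED_ERROR marks a frame that was
 * merely paged out (the caller may retry), while
 * PRIVCMD_MMAPBATCH_MFN_ERROR marks a genuine mapping failure.  Both
 * live in the top nibble of the returned gfn.
 */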
static int mmap_return_errors(void *data, int nr, void *state)
{
	struct mmap_batch_state *st = state;
	int *errs = data;
	int i;
	int ret;

	for (i = 0; i < nr; i++) {
		ret = mmap_return_error(errs[i], st);
		if (ret < 0)
			return ret;
	}

	return 0;
}
/* Allocate pfns that are then mapped with gfns from foreign domid. Update
 * the vma with the page info to use later.
 * Returns: 0 if success, otherwise -errno
 */
static int alloc_empty_pages(struct vm_area_struct *vma, int numpgs)
{
	int rc;
	struct page **pages;

	pages = kvcalloc(numpgs, sizeof(pages[0]), GFP_KERNEL);
	if (pages == NULL)
		return -ENOMEM;

	rc = xen_alloc_unpopulated_pages(numpgs, pages);
	if (rc != 0) {
		pr_warn("%s Could not alloc %d pfns rc:%d\n", __func__,
			numpgs, rc);
		kvfree(pages);
		return -ENOMEM;
	}
	BUG_ON(vma->vm_private_data != NULL);
	vma->vm_private_data = pages;

	return 0;
}

static const struct vm_operations_struct privcmd_vm_ops;
static long privcmd_ioctl_mmap_batch(
	struct file *file, void __user *udata, int version)
{
	struct privcmd_data *data = file->private_data;
	int ret;
	struct privcmd_mmapbatch_v2 m;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long nr_pages;
	LIST_HEAD(pagelist);
	struct mmap_batch_state state;

	switch (version) {
	case 1:
		if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch)))
			return -EFAULT;
		/* Returns per-frame error in m.arr. */
		m.err = NULL;
		if (!access_ok(m.arr, m.num * sizeof(*m.arr)))
			return -EFAULT;
		break;
	case 2:
		if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch_v2)))
			return -EFAULT;
		/* Returns per-frame error code in m.err. */
		if (!access_ok(m.err, m.num * (sizeof(*m.err))))
			return -EFAULT;
		break;
	default:
		return -EINVAL;
	}

	/* If restriction is in place, check the domid matches */
	if (data->domid != DOMID_INVALID && data->domid != m.dom)
		return -EPERM;

	nr_pages = DIV_ROUND_UP(m.num, XEN_PFN_PER_PAGE);
	if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT)))
		return -EINVAL;

	ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), m.arr);

	if (ret)
		goto out;
	if (list_empty(&pagelist)) {
		ret = -EINVAL;
		goto out;
	}

	if (version == 2) {
		/* Zero error array now to only copy back actual errors. */
		if (clear_user(m.err, sizeof(int) * m.num)) {
			ret = -EFAULT;
			goto out;
		}
	}

	mmap_write_lock(mm);

	vma = find_vma(mm, m.addr);
	if (!vma ||
	    vma->vm_ops != &privcmd_vm_ops) {
		ret = -EINVAL;
		goto out_unlock;
	}

	/*
	 * Caller must either:
	 *
	 * Map the whole VMA range, which will also allocate all the
	 * pages required for the auto_translated_physmap case.
	 *
	 * Or
	 *
	 * Map unmapped holes left from a previous map attempt (e.g.,
	 * because those foreign frames were previously paged out).
	 */
	if (vma->vm_private_data == NULL) {
		if (m.addr != vma->vm_start ||
		    m.addr + (nr_pages << PAGE_SHIFT) != vma->vm_end) {
			ret = -EINVAL;
			goto out_unlock;
		}
		if (xen_feature(XENFEAT_auto_translated_physmap)) {
			ret = alloc_empty_pages(vma, nr_pages);
			if (ret < 0)
				goto out_unlock;
		} else
			vma->vm_private_data = PRIV_VMA_LOCKED;
	} else {
		if (m.addr < vma->vm_start ||
		    m.addr + (nr_pages << PAGE_SHIFT) > vma->vm_end) {
			ret = -EINVAL;
			goto out_unlock;
		}
		if (privcmd_vma_range_is_mapped(vma, m.addr, nr_pages)) {
			ret = -EINVAL;
			goto out_unlock;
		}
	}

	state.domain        = m.dom;
	state.vma           = vma;
	state.va            = m.addr;
	state.index         = 0;
	state.global_error  = 0;
	state.version       = version;

	BUILD_BUG_ON(((PAGE_SIZE / sizeof(xen_pfn_t)) % XEN_PFN_PER_PAGE) != 0);
	/* mmap_batch_fn guarantees ret == 0 */
	BUG_ON(traverse_pages_block(m.num, sizeof(xen_pfn_t),
				    &pagelist, mmap_batch_fn, &state));

	mmap_write_unlock(mm);

	if (state.global_error) {
		/* Write back errors in second pass. */
		state.user_gfn = (xen_pfn_t *)m.arr;
		state.user_err = m.err;
		ret = traverse_pages_block(m.num, sizeof(xen_pfn_t),
					   &pagelist, mmap_return_errors, &state);
	} else
		ret = 0;

	/* If we have not had any EFAULT-like global errors then set the global
	 * error to -ENOENT if necessary. */
	if ((ret == 0) && (state.global_error == -ENOENT))
		ret = -ENOENT;

out:
	free_page_list(&pagelist);
	return ret;

out_unlock:
	mmap_write_unlock(mm);
	goto out;
}
static int lock_pages(
	struct privcmd_dm_op_buf kbufs[], unsigned int num,
	struct page *pages[], unsigned int nr_pages, unsigned int *pinned)
{
	unsigned int i, off = 0;

	for (i = 0; i < num; ) {
		unsigned int requested;
		int page_count;

		requested = DIV_ROUND_UP(
			offset_in_page(kbufs[i].uptr) + kbufs[i].size,
			PAGE_SIZE) - off;
		if (requested > nr_pages)
			return -ENOSPC;

		page_count = pin_user_pages_fast(
			(unsigned long)kbufs[i].uptr + off * PAGE_SIZE,
			requested, FOLL_WRITE, pages);
		if (page_count <= 0)
			return page_count ? : -EFAULT;

		*pinned += page_count;
		nr_pages -= page_count;
		pages += page_count;

		off = (requested == page_count) ? 0 : off + page_count;
		i += !off;
	}

	return 0;
}

static void unlock_pages(struct page *pages[], unsigned int nr_pages)
{
	unpin_user_pages_dirty_lock(pages, nr_pages, true);
}
static long privcmd_ioctl_dm_op(struct file *file, void __user *udata)
{
	struct privcmd_data *data = file->private_data;
	struct privcmd_dm_op kdata;
	struct privcmd_dm_op_buf *kbufs;
	unsigned int nr_pages = 0;
	struct page **pages = NULL;
	struct xen_dm_op_buf *xbufs = NULL;
	unsigned int i;
	long rc;
	unsigned int pinned = 0;

	if (copy_from_user(&kdata, udata, sizeof(kdata)))
		return -EFAULT;

	/* If restriction is in place, check the domid matches */
	if (data->domid != DOMID_INVALID && data->domid != kdata.dom)
		return -EPERM;

	if (kdata.num == 0)
		return 0;

	if (kdata.num > privcmd_dm_op_max_num)
		return -E2BIG;

	kbufs = kcalloc(kdata.num, sizeof(*kbufs), GFP_KERNEL);
	if (!kbufs)
		return -ENOMEM;

	if (copy_from_user(kbufs, kdata.ubufs,
			   sizeof(*kbufs) * kdata.num)) {
		rc = -EFAULT;
		goto out;
	}

	for (i = 0; i < kdata.num; i++) {
		if (kbufs[i].size > privcmd_dm_op_buf_max_size) {
			rc = -E2BIG;
			goto out;
		}

		if (!access_ok(kbufs[i].uptr,
			       kbufs[i].size)) {
			rc = -EFAULT;
			goto out;
		}

		nr_pages += DIV_ROUND_UP(
			offset_in_page(kbufs[i].uptr) + kbufs[i].size,
			PAGE_SIZE);
	}

	pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
	if (!pages) {
		rc = -ENOMEM;
		goto out;
	}

	xbufs = kcalloc(kdata.num, sizeof(*xbufs), GFP_KERNEL);
	if (!xbufs) {
		rc = -ENOMEM;
		goto out;
	}

	rc = lock_pages(kbufs, kdata.num, pages, nr_pages, &pinned);
	if (rc < 0)
		goto out;

	for (i = 0; i < kdata.num; i++) {
		set_xen_guest_handle(xbufs[i].h, kbufs[i].uptr);
		xbufs[i].size = kbufs[i].size;
	}

	xen_preemptible_hcall_begin();
	rc = HYPERVISOR_dm_op(kdata.dom, kdata.num, xbufs);
	xen_preemptible_hcall_end();

out:
	unlock_pages(pages, pinned);
	kfree(xbufs);
	kfree(pages);
	kfree(kbufs);

	return rc;
}
static long privcmd_ioctl_restrict(struct file *file, void __user *udata)
{
	struct privcmd_data *data = file->private_data;
	domid_t dom;

	if (copy_from_user(&dom, udata, sizeof(dom)))
		return -EFAULT;

	/* Set restriction to the specified domain, or check it matches */
	if (data->domid == DOMID_INVALID)
		data->domid = dom;
	else if (data->domid != dom)
		return -EINVAL;

	return 0;
}
static long privcmd_ioctl_mmap_resource(struct file *file,
				struct privcmd_mmap_resource __user *udata)
{
	struct privcmd_data *data = file->private_data;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	struct privcmd_mmap_resource kdata;
	xen_pfn_t *pfns = NULL;
	struct xen_mem_acquire_resource xdata = { };
	int rc;

	if (copy_from_user(&kdata, udata, sizeof(kdata)))
		return -EFAULT;

	/* If restriction is in place, check the domid matches */
	if (data->domid != DOMID_INVALID && data->domid != kdata.dom)
		return -EPERM;

	/* Both fields must be set or unset */
	if (!!kdata.addr != !!kdata.num)
		return -EINVAL;

	xdata.domid = kdata.dom;
	xdata.type = kdata.type;
	xdata.id = kdata.id;

	if (!kdata.addr && !kdata.num) {
		/* Query the size of the resource. */
		rc = HYPERVISOR_memory_op(XENMEM_acquire_resource, &xdata);
		if (rc)
			return rc;
		return __put_user(xdata.nr_frames, &udata->num);
	}

	mmap_write_lock(mm);

	vma = find_vma(mm, kdata.addr);
	if (!vma || vma->vm_ops != &privcmd_vm_ops) {
		rc = -EINVAL;
		goto out;
	}

	pfns = kcalloc(kdata.num, sizeof(*pfns), GFP_KERNEL | __GFP_NOWARN);
	if (!pfns) {
		rc = -ENOMEM;
		goto out;
	}

	if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) &&
	    xen_feature(XENFEAT_auto_translated_physmap)) {
		unsigned int nr = DIV_ROUND_UP(kdata.num, XEN_PFN_PER_PAGE);
		struct page **pages;
		unsigned int i;

		rc = alloc_empty_pages(vma, nr);
		if (rc < 0)
			goto out;

		pages = vma->vm_private_data;

		for (i = 0; i < kdata.num; i++) {
			xen_pfn_t pfn =
				page_to_xen_pfn(pages[i / XEN_PFN_PER_PAGE]);

			pfns[i] = pfn + (i % XEN_PFN_PER_PAGE);
		}
	} else
		vma->vm_private_data = PRIV_VMA_LOCKED;

	xdata.frame = kdata.idx;
	xdata.nr_frames = kdata.num;
	set_xen_guest_handle(xdata.frame_list, pfns);

	xen_preemptible_hcall_begin();
	rc = HYPERVISOR_memory_op(XENMEM_acquire_resource, &xdata);
	xen_preemptible_hcall_end();

	if (rc)
		goto out;

	if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) &&
	    xen_feature(XENFEAT_auto_translated_physmap)) {
		rc = xen_remap_vma_range(vma, kdata.addr, kdata.num << PAGE_SHIFT);
	} else {
		unsigned int domid =
			(xdata.flags & XENMEM_rsrc_acq_caller_owned) ?
			DOMID_SELF : kdata.dom;
		int num, *errs = (int *)pfns;

		BUILD_BUG_ON(sizeof(*errs) > sizeof(*pfns));
		num = xen_remap_domain_mfn_array(vma,
						 kdata.addr & PAGE_MASK,
						 pfns, kdata.num, errs,
						 vma->vm_page_prot,
						 domid);
		if (num < 0)
			rc = num;
		else if (num != kdata.num) {
			unsigned int i;

			for (i = 0; i < num; i++) {
				rc = errs[i];
				if (rc < 0)
					break;
			}
		} else
			rc = 0;
	}

out:
	mmap_write_unlock(mm);
	kfree(pfns);

	return rc;
}
static long privcmd_ioctl_pcidev_get_gsi(struct file *file, void __user *udata)
{
#if defined(CONFIG_XEN_ACPI)
	int rc;
	struct privcmd_pcidev_get_gsi kdata;

	if (copy_from_user(&kdata, udata, sizeof(kdata)))
		return -EFAULT;

	rc = xen_acpi_get_gsi_from_sbdf(kdata.sbdf);
	if (rc < 0)
		return rc;

	kdata.gsi = rc;
	if (copy_to_user(udata, &kdata, sizeof(kdata)))
		return -EFAULT;

	return 0;
#else
	return -EINVAL;
#endif
}
#ifdef CONFIG_XEN_PRIVCMD_EVENTFD
/* Irqfd support */
static struct workqueue_struct *irqfd_cleanup_wq;
static DEFINE_SPINLOCK(irqfds_lock);
DEFINE_STATIC_SRCU(irqfds_srcu);
static LIST_HEAD(irqfds_list);

struct privcmd_kernel_irqfd {
	struct xen_dm_op_buf xbufs;
	domid_t dom;
	bool error;
	struct eventfd_ctx *eventfd;
	struct work_struct shutdown;
	wait_queue_entry_t wait;
	struct list_head list;
	poll_table pt;
};
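/*
 * Flow of an irqfd: privcmd_irqfd_assign() stashes a dm_op buffer and
 * registers irqfd_wakeup() on the eventfd's wait queue.  When user space
 * (e.g. a vhost backend) signals the eventfd, irqfd_wakeup() calls
 * irqfd_inject(), which consumes the event and issues the saved dm_op to
 * raise the interrupt in the guest.  EPOLLHUP tears the entry down via
 * the cleanup workqueue.
 */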
static void irqfd_deactivate(struct privcmd_kernel_irqfd *kirqfd)
{
	lockdep_assert_held(&irqfds_lock);

	list_del_init(&kirqfd->list);
	queue_work(irqfd_cleanup_wq, &kirqfd->shutdown);
}

static void irqfd_shutdown(struct work_struct *work)
{
	struct privcmd_kernel_irqfd *kirqfd =
		container_of(work, struct privcmd_kernel_irqfd, shutdown);
	u64 cnt;

	/* Make sure irqfd has been initialized in assign path */
	synchronize_srcu(&irqfds_srcu);

	eventfd_ctx_remove_wait_queue(kirqfd->eventfd, &kirqfd->wait, &cnt);
	eventfd_ctx_put(kirqfd->eventfd);
	kfree(kirqfd);
}
static void irqfd_inject(struct privcmd_kernel_irqfd *kirqfd)
{
	u64 cnt;
	long rc;

	eventfd_ctx_do_read(kirqfd->eventfd, &cnt);

	xen_preemptible_hcall_begin();
	rc = HYPERVISOR_dm_op(kirqfd->dom, 1, &kirqfd->xbufs);
	xen_preemptible_hcall_end();

	/* Don't repeat the error message for consecutive failures */
	if (rc && !kirqfd->error) {
		pr_err("Failed to configure irq for guest domain: %d\n",
		       kirqfd->dom);
	}

	kirqfd->error = rc;
}
static int
irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key)
{
	struct privcmd_kernel_irqfd *kirqfd =
		container_of(wait, struct privcmd_kernel_irqfd, wait);
	__poll_t flags = key_to_poll(key);

	if (flags & EPOLLIN)
		irqfd_inject(kirqfd);

	if (flags & EPOLLHUP) {
		unsigned long flags;

		spin_lock_irqsave(&irqfds_lock, flags);
		irqfd_deactivate(kirqfd);
		spin_unlock_irqrestore(&irqfds_lock, flags);
	}

	return 0;
}
static void
irqfd_poll_func(struct file *file, wait_queue_head_t *wqh, poll_table *pt)
{
	struct privcmd_kernel_irqfd *kirqfd =
		container_of(pt, struct privcmd_kernel_irqfd, pt);

	add_wait_queue_priority(wqh, &kirqfd->wait);
}
static int privcmd_irqfd_assign(struct privcmd_irqfd *irqfd)
{
	struct privcmd_kernel_irqfd *kirqfd, *tmp;
	unsigned long flags;
	__poll_t events;
	void *dm_op;
	int ret, idx;

	CLASS(fd, f)(irqfd->fd);

	kirqfd = kzalloc(sizeof(*kirqfd) + irqfd->size, GFP_KERNEL);
	if (!kirqfd)
		return -ENOMEM;
	dm_op = kirqfd + 1;

	if (copy_from_user(dm_op, u64_to_user_ptr(irqfd->dm_op), irqfd->size)) {
		ret = -EFAULT;
		goto error_kfree;
	}

	kirqfd->xbufs.size = irqfd->size;
	set_xen_guest_handle(kirqfd->xbufs.h, dm_op);
	kirqfd->dom = irqfd->dom;
	INIT_WORK(&kirqfd->shutdown, irqfd_shutdown);

	kirqfd->eventfd = eventfd_ctx_fileget(fd_file(f));
	if (IS_ERR(kirqfd->eventfd)) {
		ret = PTR_ERR(kirqfd->eventfd);
		goto error_kfree;
	}

	/*
	 * Install our own custom wake-up handling so we are notified via a
	 * callback whenever someone signals the underlying eventfd.
	 */
	init_waitqueue_func_entry(&kirqfd->wait, irqfd_wakeup);
	init_poll_funcptr(&kirqfd->pt, irqfd_poll_func);

	spin_lock_irqsave(&irqfds_lock, flags);

	list_for_each_entry(tmp, &irqfds_list, list) {
		if (kirqfd->eventfd == tmp->eventfd) {
			ret = -EBUSY;
			spin_unlock_irqrestore(&irqfds_lock, flags);
			goto error_eventfd;
		}
	}

	idx = srcu_read_lock(&irqfds_srcu);
	list_add_tail(&kirqfd->list, &irqfds_list);
	spin_unlock_irqrestore(&irqfds_lock, flags);

	/*
	 * Check if there was an event already pending on the eventfd before we
	 * registered, and trigger it as if we didn't miss it.
	 */
	events = vfs_poll(fd_file(f), &kirqfd->pt);
	if (events & EPOLLIN)
		irqfd_inject(kirqfd);

	srcu_read_unlock(&irqfds_srcu, idx);
	return 0;

error_eventfd:
	eventfd_ctx_put(kirqfd->eventfd);

error_kfree:
	kfree(kirqfd);
	return ret;
}
static int privcmd_irqfd_deassign(struct privcmd_irqfd *irqfd)
{
	struct privcmd_kernel_irqfd *kirqfd;
	struct eventfd_ctx *eventfd;
	unsigned long flags;

	eventfd = eventfd_ctx_fdget(irqfd->fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	spin_lock_irqsave(&irqfds_lock, flags);

	list_for_each_entry(kirqfd, &irqfds_list, list) {
		if (kirqfd->eventfd == eventfd) {
			irqfd_deactivate(kirqfd);
			break;
		}
	}

	spin_unlock_irqrestore(&irqfds_lock, flags);

	eventfd_ctx_put(eventfd);

	/*
	 * Block until we know all outstanding shutdown jobs have completed so
	 * that we guarantee there will not be any more interrupts once this
	 * deassign function returns.
	 */
	flush_workqueue(irqfd_cleanup_wq);

	return 0;
}
static long privcmd_ioctl_irqfd(struct file *file, void __user *udata)
{
	struct privcmd_data *data = file->private_data;
	struct privcmd_irqfd irqfd;

	if (copy_from_user(&irqfd, udata, sizeof(irqfd)))
		return -EFAULT;

	/* No other flags should be set */
	if (irqfd.flags & ~PRIVCMD_IRQFD_FLAG_DEASSIGN)
		return -EINVAL;

	/* If restriction is in place, check the domid matches */
	if (data->domid != DOMID_INVALID && data->domid != irqfd.dom)
		return -EPERM;

	if (irqfd.flags & PRIVCMD_IRQFD_FLAG_DEASSIGN)
		return privcmd_irqfd_deassign(&irqfd);

	return privcmd_irqfd_assign(&irqfd);
}
static int privcmd_irqfd_init(void)
{
	irqfd_cleanup_wq = alloc_workqueue("privcmd-irqfd-cleanup", 0, 0);
	if (!irqfd_cleanup_wq)
		return -ENOMEM;

	return 0;
}

static void privcmd_irqfd_exit(void)
{
	struct privcmd_kernel_irqfd *kirqfd, *tmp;
	unsigned long flags;

	spin_lock_irqsave(&irqfds_lock, flags);

	list_for_each_entry_safe(kirqfd, tmp, &irqfds_list, list)
		irqfd_deactivate(kirqfd);

	spin_unlock_irqrestore(&irqfds_lock, flags);

	destroy_workqueue(irqfd_cleanup_wq);
}
/* Ioeventfd Support */
#define QUEUE_NOTIFY_VQ_MASK 0xFFFF

static DEFINE_MUTEX(ioreq_lock);
static LIST_HEAD(ioreq_list);

/* per-eventfd structure */
struct privcmd_kernel_ioeventfd {
	struct eventfd_ctx *eventfd;
	struct list_head list;
	u64 addr;
	unsigned int addr_len;
	unsigned int vq;
};

/* per-guest CPU / port structure */
struct ioreq_port {
	int vcpu;
	unsigned int port;
	struct privcmd_kernel_ioreq *kioreq;
};

/* per-guest structure */
struct privcmd_kernel_ioreq {
	domid_t dom;
	unsigned int vcpus;
	u64 uioreq;
	struct ioreq *ioreq;
	spinlock_t lock; /* Protects ioeventfds list */
	struct list_head ioeventfds;
	struct list_head list;
	struct ioreq_port ports[] __counted_by(vcpus);
};
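/*
 * Flow of an ioeventfd: the emulator maps the guest's ioreq server pages
 * through privcmd and registers an eventfd for a (MMIO address, vq)
 * pair.  When the guest writes VIRTIO_MMIO_QUEUE_NOTIFY, Xen pauses the
 * vCPU and raises the per-vCPU event channel; ioeventfd_interrupt()
 * below matches the ioreq against the registered eventfds, signals the
 * matching one (typically consumed by a vhost backend) and, on a match,
 * completes the ioreq with STATE_IORESP_READY so the vCPU can resume.
 */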
static irqreturn_t ioeventfd_interrupt(int irq, void *dev_id)
{
	struct ioreq_port *port = dev_id;
	struct privcmd_kernel_ioreq *kioreq = port->kioreq;
	struct ioreq *ioreq = &kioreq->ioreq[port->vcpu];
	struct privcmd_kernel_ioeventfd *kioeventfd;
	unsigned int state = STATE_IOREQ_READY;

	if (ioreq->state != STATE_IOREQ_READY ||
	    ioreq->type != IOREQ_TYPE_COPY || ioreq->dir != IOREQ_WRITE)
		return IRQ_NONE;

	/*
	 * We need a barrier, smp_mb(), here to ensure reads are finished before
	 * `state` is updated. Since the lock implementation ensures that
	 * appropriate barrier will be added anyway, we can avoid adding
	 * explicit barrier here.
	 *
	 * Ideally we don't need to update `state` within the locks, but we do
	 * that here to avoid adding explicit barrier.
	 */

	spin_lock(&kioreq->lock);
	ioreq->state = STATE_IOREQ_INPROCESS;

	list_for_each_entry(kioeventfd, &kioreq->ioeventfds, list) {
		if (ioreq->addr == kioeventfd->addr + VIRTIO_MMIO_QUEUE_NOTIFY &&
		    ioreq->size == kioeventfd->addr_len &&
		    (ioreq->data & QUEUE_NOTIFY_VQ_MASK) == kioeventfd->vq) {
			eventfd_signal(kioeventfd->eventfd);
			state = STATE_IORESP_READY;
			break;
		}
	}
	spin_unlock(&kioreq->lock);

	/*
	 * We need a barrier, smp_mb(), here to ensure writes are finished
	 * before `state` is updated. Since the lock implementation ensures that
	 * appropriate barrier will be added anyway, we can avoid adding
	 * explicit barrier here.
	 */

	ioreq->state = state;

	if (state == STATE_IORESP_READY) {
		notify_remote_via_evtchn(port->port);
		return IRQ_HANDLED;
	}

	return IRQ_NONE;
}
static void ioreq_free(struct privcmd_kernel_ioreq *kioreq)
{
	struct ioreq_port *ports = kioreq->ports;
	int i;

	lockdep_assert_held(&ioreq_lock);

	list_del(&kioreq->list);

	for (i = kioreq->vcpus - 1; i >= 0; i--)
		unbind_from_irqhandler(irq_from_evtchn(ports[i].port), &ports[i]);

	kfree(kioreq);
}
static
struct privcmd_kernel_ioreq *alloc_ioreq(struct privcmd_ioeventfd *ioeventfd)
{
	struct privcmd_kernel_ioreq *kioreq;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	struct page **pages;
	unsigned int *ports;
	int ret, size, i;

	lockdep_assert_held(&ioreq_lock);

	size = struct_size(kioreq, ports, ioeventfd->vcpus);
	kioreq = kzalloc(size, GFP_KERNEL);
	if (!kioreq)
		return ERR_PTR(-ENOMEM);

	kioreq->dom = ioeventfd->dom;
	kioreq->vcpus = ioeventfd->vcpus;
	kioreq->uioreq = ioeventfd->ioreq;
	spin_lock_init(&kioreq->lock);
	INIT_LIST_HEAD(&kioreq->ioeventfds);

	/* The memory for ioreq server must have been mapped earlier */
	mmap_write_lock(mm);
	vma = find_vma(mm, (unsigned long)ioeventfd->ioreq);
	if (!vma) {
		pr_err("Failed to find vma for ioreq page!\n");
		mmap_write_unlock(mm);
		ret = -EFAULT;
		goto error_kfree;
	}

	pages = vma->vm_private_data;
	kioreq->ioreq = (struct ioreq *)(page_to_virt(pages[0]));
	mmap_write_unlock(mm);

	ports = memdup_array_user(u64_to_user_ptr(ioeventfd->ports),
				  kioreq->vcpus, sizeof(*ports));
	if (IS_ERR(ports)) {
		ret = PTR_ERR(ports);
		goto error_kfree;
	}

	for (i = 0; i < kioreq->vcpus; i++) {
		kioreq->ports[i].vcpu = i;
		kioreq->ports[i].port = ports[i];
		kioreq->ports[i].kioreq = kioreq;

		ret = bind_evtchn_to_irqhandler_lateeoi(ports[i],
				ioeventfd_interrupt, IRQF_SHARED, "ioeventfd",
				&kioreq->ports[i]);
		if (ret < 0)
			goto error_unbind;
	}

	kfree(ports);

	list_add_tail(&kioreq->list, &ioreq_list);

	return kioreq;

error_unbind:
	while (--i >= 0)
		unbind_from_irqhandler(irq_from_evtchn(ports[i]), &kioreq->ports[i]);

	kfree(ports);
error_kfree:
	kfree(kioreq);
	return ERR_PTR(ret);
}
static struct privcmd_kernel_ioreq *
get_ioreq(struct privcmd_ioeventfd *ioeventfd, struct eventfd_ctx *eventfd)
{
	struct privcmd_kernel_ioreq *kioreq;
	unsigned long flags;

	list_for_each_entry(kioreq, &ioreq_list, list) {
		struct privcmd_kernel_ioeventfd *kioeventfd;

		/*
		 * kioreq fields can be accessed here without a lock as they are
		 * never updated after being added to the ioreq_list.
		 */
		if (kioreq->uioreq != ioeventfd->ioreq) {
			continue;
		} else if (kioreq->dom != ioeventfd->dom ||
			   kioreq->vcpus != ioeventfd->vcpus) {
			pr_err("Invalid ioeventfd configuration mismatch, dom (%u vs %u), vcpus (%u vs %u)\n",
			       kioreq->dom, ioeventfd->dom, kioreq->vcpus,
			       ioeventfd->vcpus);
			return ERR_PTR(-EINVAL);
		}

		/* Look for a duplicate eventfd for the same guest */
		spin_lock_irqsave(&kioreq->lock, flags);
		list_for_each_entry(kioeventfd, &kioreq->ioeventfds, list) {
			if (eventfd == kioeventfd->eventfd) {
				spin_unlock_irqrestore(&kioreq->lock, flags);
				return ERR_PTR(-EBUSY);
			}
		}
		spin_unlock_irqrestore(&kioreq->lock, flags);

		return kioreq;
	}

	/* Matching kioreq isn't found, allocate a new one */
	return alloc_ioreq(ioeventfd);
}
static void ioeventfd_free(struct privcmd_kernel_ioeventfd *kioeventfd)
{
	list_del(&kioeventfd->list);
	eventfd_ctx_put(kioeventfd->eventfd);
	kfree(kioeventfd);
}
static int privcmd_ioeventfd_assign(struct privcmd_ioeventfd *ioeventfd)
{
	struct privcmd_kernel_ioeventfd *kioeventfd;
	struct privcmd_kernel_ioreq *kioreq;
	unsigned long flags;
	int ret;

	/* Check for range overflow */
	if (ioeventfd->addr + ioeventfd->addr_len < ioeventfd->addr)
		return -EINVAL;

	/* Vhost requires us to support length 1, 2, 4, and 8 */
	if (!(ioeventfd->addr_len == 1 || ioeventfd->addr_len == 2 ||
	      ioeventfd->addr_len == 4 || ioeventfd->addr_len == 8))
		return -EINVAL;

	/* 4096 vcpus limit enough ? */
	if (!ioeventfd->vcpus || ioeventfd->vcpus > 4096)
		return -EINVAL;

	kioeventfd = kzalloc(sizeof(*kioeventfd), GFP_KERNEL);
	if (!kioeventfd)
		return -ENOMEM;

	kioeventfd->eventfd = eventfd_ctx_fdget(ioeventfd->event_fd);
	if (IS_ERR(kioeventfd->eventfd)) {
		ret = PTR_ERR(kioeventfd->eventfd);
		goto error_kfree;
	}

	kioeventfd->addr = ioeventfd->addr;
	kioeventfd->addr_len = ioeventfd->addr_len;
	kioeventfd->vq = ioeventfd->vq;

	mutex_lock(&ioreq_lock);
	kioreq = get_ioreq(ioeventfd, kioeventfd->eventfd);
	if (IS_ERR(kioreq)) {
		mutex_unlock(&ioreq_lock);
		ret = PTR_ERR(kioreq);
		goto error_eventfd;
	}

	spin_lock_irqsave(&kioreq->lock, flags);
	list_add_tail(&kioeventfd->list, &kioreq->ioeventfds);
	spin_unlock_irqrestore(&kioreq->lock, flags);

	mutex_unlock(&ioreq_lock);

	return 0;

error_eventfd:
	eventfd_ctx_put(kioeventfd->eventfd);

error_kfree:
	kfree(kioeventfd);
	return ret;
}
static int privcmd_ioeventfd_deassign(struct privcmd_ioeventfd *ioeventfd)
{
	struct privcmd_kernel_ioreq *kioreq, *tkioreq;
	struct eventfd_ctx *eventfd;
	unsigned long flags;
	int ret = 0;

	eventfd = eventfd_ctx_fdget(ioeventfd->event_fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	mutex_lock(&ioreq_lock);
	list_for_each_entry_safe(kioreq, tkioreq, &ioreq_list, list) {
		struct privcmd_kernel_ioeventfd *kioeventfd, *tmp;
		/*
		 * kioreq fields can be accessed here without a lock as they are
		 * never updated after being added to the ioreq_list.
		 */
		if (kioreq->dom != ioeventfd->dom ||
		    kioreq->uioreq != ioeventfd->ioreq ||
		    kioreq->vcpus != ioeventfd->vcpus)
			continue;

		spin_lock_irqsave(&kioreq->lock, flags);
		list_for_each_entry_safe(kioeventfd, tmp, &kioreq->ioeventfds, list) {
			if (eventfd == kioeventfd->eventfd) {
				ioeventfd_free(kioeventfd);
				spin_unlock_irqrestore(&kioreq->lock, flags);

				if (list_empty(&kioreq->ioeventfds))
					ioreq_free(kioreq);
				goto unlock;
			}
		}
		spin_unlock_irqrestore(&kioreq->lock, flags);
		break;
	}

	pr_err("Ioeventfd isn't already assigned, dom: %u, addr: %llu\n",
	       ioeventfd->dom, ioeventfd->addr);
	ret = -ENODEV;

unlock:
	mutex_unlock(&ioreq_lock);
	eventfd_ctx_put(eventfd);

	return ret;
}
static long privcmd_ioctl_ioeventfd(struct file *file, void __user *udata)
{
	struct privcmd_data *data = file->private_data;
	struct privcmd_ioeventfd ioeventfd;

	if (copy_from_user(&ioeventfd, udata, sizeof(ioeventfd)))
		return -EFAULT;

	/* No other flags should be set */
	if (ioeventfd.flags & ~PRIVCMD_IOEVENTFD_FLAG_DEASSIGN)
		return -EINVAL;

	/* If restriction is in place, check the domid matches */
	if (data->domid != DOMID_INVALID && data->domid != ioeventfd.dom)
		return -EPERM;

	if (ioeventfd.flags & PRIVCMD_IOEVENTFD_FLAG_DEASSIGN)
		return privcmd_ioeventfd_deassign(&ioeventfd);

	return privcmd_ioeventfd_assign(&ioeventfd);
}
static void privcmd_ioeventfd_exit(void)
{
	struct privcmd_kernel_ioreq *kioreq, *tmp;
	unsigned long flags;

	mutex_lock(&ioreq_lock);
	list_for_each_entry_safe(kioreq, tmp, &ioreq_list, list) {
		struct privcmd_kernel_ioeventfd *kioeventfd, *tmp;

		spin_lock_irqsave(&kioreq->lock, flags);
		list_for_each_entry_safe(kioeventfd, tmp, &kioreq->ioeventfds, list)
			ioeventfd_free(kioeventfd);
		spin_unlock_irqrestore(&kioreq->lock, flags);

		ioreq_free(kioreq);
	}
	mutex_unlock(&ioreq_lock);
}
#else /* CONFIG_XEN_PRIVCMD_EVENTFD */
static inline long privcmd_ioctl_irqfd(struct file *file, void __user *udata)
{
	return -EOPNOTSUPP;
}

static inline int privcmd_irqfd_init(void)
{
	return 0;
}

static inline void privcmd_irqfd_exit(void)
{
}

static inline long privcmd_ioctl_ioeventfd(struct file *file, void __user *udata)
{
	return -EOPNOTSUPP;
}

static inline void privcmd_ioeventfd_exit(void)
{
}
#endif /* CONFIG_XEN_PRIVCMD_EVENTFD */
static long privcmd_ioctl(struct file *file,
			  unsigned int cmd, unsigned long data)
{
	int ret = -ENOTTY;
	void __user *udata = (void __user *) data;

	switch (cmd) {
	case IOCTL_PRIVCMD_HYPERCALL:
		ret = privcmd_ioctl_hypercall(file, udata);
		break;

	case IOCTL_PRIVCMD_MMAP:
		ret = privcmd_ioctl_mmap(file, udata);
		break;

	case IOCTL_PRIVCMD_MMAPBATCH:
		ret = privcmd_ioctl_mmap_batch(file, udata, 1);
		break;

	case IOCTL_PRIVCMD_MMAPBATCH_V2:
		ret = privcmd_ioctl_mmap_batch(file, udata, 2);
		break;

	case IOCTL_PRIVCMD_DM_OP:
		ret = privcmd_ioctl_dm_op(file, udata);
		break;

	case IOCTL_PRIVCMD_RESTRICT:
		ret = privcmd_ioctl_restrict(file, udata);
		break;

	case IOCTL_PRIVCMD_MMAP_RESOURCE:
		ret = privcmd_ioctl_mmap_resource(file, udata);
		break;

	case IOCTL_PRIVCMD_IRQFD:
		ret = privcmd_ioctl_irqfd(file, udata);
		break;

	case IOCTL_PRIVCMD_IOEVENTFD:
		ret = privcmd_ioctl_ioeventfd(file, udata);
		break;

	case IOCTL_PRIVCMD_PCIDEV_GET_GSI:
		ret = privcmd_ioctl_pcidev_get_gsi(file, udata);
		break;

	default:
		break;
	}

	return ret;
}
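/*
 * Typical user-space usage (illustrative sketch, not part of this file):
 * a device-model or toolstack opens /dev/xen/privcmd, optionally issues
 * IOCTL_PRIVCMD_RESTRICT to pin the fd to a single domid, mmap()s a
 * region backed by this driver and then drives it with
 * IOCTL_PRIVCMD_MMAPBATCH_V2, IOCTL_PRIVCMD_MMAP_RESOURCE or
 * IOCTL_PRIVCMD_DM_OP as needed.
 */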
static int privcmd_open(struct inode *ino, struct file *file)
{
	struct privcmd_data *data = kzalloc(sizeof(*data), GFP_KERNEL);

	if (!data)
		return -ENOMEM;

	/* DOMID_INVALID implies no restriction */
	data->domid = DOMID_INVALID;

	file->private_data = data;
	return 0;
}

static int privcmd_release(struct inode *ino, struct file *file)
{
	struct privcmd_data *data = file->private_data;

	kfree(data);
	return 0;
}
static void privcmd_close(struct vm_area_struct *vma)
{
	struct page **pages = vma->vm_private_data;
	int numpgs = vma_pages(vma);
	int numgfns = (vma->vm_end - vma->vm_start) >> XEN_PAGE_SHIFT;
	int rc;

	if (!xen_feature(XENFEAT_auto_translated_physmap) || !numpgs || !pages)
		return;

	rc = xen_unmap_domain_gfn_range(vma, numgfns, pages);
	if (rc == 0)
		xen_free_unpopulated_pages(numpgs, pages);
	else
		pr_crit("unable to unmap MFN range: leaking %d pages. rc=%d\n",
			numpgs, rc);
	kvfree(pages);
}
static vm_fault_t privcmd_fault(struct vm_fault *vmf)
{
	printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n",
	       vmf->vma, vmf->vma->vm_start, vmf->vma->vm_end,
	       vmf->pgoff, (void *)vmf->address);

	return VM_FAULT_SIGBUS;
}

static const struct vm_operations_struct privcmd_vm_ops = {
	.close = privcmd_close,
	.fault = privcmd_fault
};
static int privcmd_mmap(struct file *file, struct vm_area_struct *vma)
{
	/* DONTCOPY is essential for Xen because copy_page_range doesn't know
	 * how to recreate these mappings */
	vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTCOPY |
			 VM_DONTEXPAND | VM_DONTDUMP);
	vma->vm_ops = &privcmd_vm_ops;
	vma->vm_private_data = NULL;

	return 0;
}
/*
 * For MMAPBATCH*. This allows asserting the singleshot mapping
 * on a per pfn/pte basis. Mapping calls that fail with ENOENT
 * can be then retried until success.
 */
static int is_mapped_fn(pte_t *pte, unsigned long addr, void *data)
{
	return pte_none(ptep_get(pte)) ? 0 : -EBUSY;
}

static int privcmd_vma_range_is_mapped(
	           struct vm_area_struct *vma,
	           unsigned long addr,
	           unsigned long nr_pages)
{
	return apply_to_page_range(vma->vm_mm, addr, nr_pages << PAGE_SHIFT,
				   is_mapped_fn, NULL) != 0;
}
const struct file_operations xen_privcmd_fops = {
	.owner = THIS_MODULE,
	.unlocked_ioctl = privcmd_ioctl,
	.open = privcmd_open,
	.release = privcmd_release,
	.mmap = privcmd_mmap,
};
EXPORT_SYMBOL_GPL(xen_privcmd_fops);

static struct miscdevice privcmd_dev = {
	.minor = MISC_DYNAMIC_MINOR,
	.name = "xen/privcmd",
	.fops = &xen_privcmd_fops,
};
static int __init privcmd_init(void)
{
	int err;

	if (!xen_domain())
		return -ENODEV;

	err = misc_register(&privcmd_dev);
	if (err != 0) {
		pr_err("Could not register Xen privcmd device\n");
		return err;
	}

	err = misc_register(&xen_privcmdbuf_dev);
	if (err != 0) {
		pr_err("Could not register Xen hypercall-buf device\n");
		goto err_privcmdbuf;
	}

	err = privcmd_irqfd_init();
	if (err != 0) {
		pr_err("irqfd init failed\n");
		goto err_irqfd;
	}

	return 0;

err_irqfd:
	misc_deregister(&xen_privcmdbuf_dev);
err_privcmdbuf:
	misc_deregister(&privcmd_dev);
	return err;
}
static void __exit privcmd_exit(void)
{
	privcmd_ioeventfd_exit();
	privcmd_irqfd_exit();
	misc_deregister(&privcmd_dev);
	misc_deregister(&xen_privcmdbuf_dev);
}

module_init(privcmd_init);
module_exit(privcmd_exit);