/*
 * Xen HVM emulation support in KVM
 *
 * Copyright © 2019 Oracle and/or its affiliates. All rights reserved.
 * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */
#include "qemu/osdep.h"
#include "qemu/log.h"
#include "qemu/main-loop.h"
#include "qemu/error-report.h"
#include "hw/xen/xen.h"
#include "sysemu/kvm_int.h"
#include "sysemu/kvm_xen.h"
#include "kvm/kvm_i386.h"
#include "exec/address-spaces.h"
#include "xen-emu.h"
#include "trace.h"
#include "sysemu/runstate.h"

#include "hw/pci/msi.h"
#include "hw/i386/apic-msidef.h"
#include "hw/i386/e820_memory_layout.h"
#include "hw/i386/kvm/xen_overlay.h"
#include "hw/i386/kvm/xen_evtchn.h"
#include "hw/i386/kvm/xen_gnttab.h"
#include "hw/i386/kvm/xen_primary_console.h"
#include "hw/i386/kvm/xen_xenstore.h"

#include "hw/xen/interface/version.h"
#include "hw/xen/interface/sched.h"
#include "hw/xen/interface/memory.h"
#include "hw/xen/interface/hvm/hvm_op.h"
#include "hw/xen/interface/hvm/params.h"
#include "hw/xen/interface/vcpu.h"
#include "hw/xen/interface/event_channel.h"
#include "hw/xen/interface/grant_table.h"

#include "xen-compat.h"
static void xen_vcpu_singleshot_timer_event(void *opaque);
static void xen_vcpu_periodic_timer_event(void *opaque);
static int vcpuop_stop_singleshot_timer(CPUState *cs);
#ifdef TARGET_X86_64
#define hypercall_compat32(longmode) (!(longmode))
#else
#define hypercall_compat32(longmode) (false)
#endif
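/*
 * Translate a guest virtual address to a guest physical address using the
 * KVM_TRANSLATE ioctl. On success, *len is set to the number of bytes
 * remaining in the page containing 'gva'.
 */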
static bool kvm_gva_to_gpa(CPUState *cs, uint64_t gva, uint64_t *gpa,
                           size_t *len, bool is_write)
{
    struct kvm_translation tr = {
        .linear_address = gva,
    };

    if (len) {
        *len = TARGET_PAGE_SIZE - (gva & ~TARGET_PAGE_MASK);
    }

    if (kvm_vcpu_ioctl(cs, KVM_TRANSLATE, &tr) || !tr.valid ||
        (is_write && !tr.writeable)) {
        return false;
    }
    *gpa = tr.physical_address;
    return true;
}
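/*
 * Copy between a guest virtual address range and a host buffer, walking the
 * range page by page and failing if any page cannot be translated (or is
 * not writeable, for writes).
 */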
static int kvm_gva_rw(CPUState *cs, uint64_t gva, void *_buf, size_t sz,
                      bool is_write)
{
    uint8_t *buf = (uint8_t *)_buf;
    uint64_t gpa;
    size_t len;

    while (sz) {
        if (!kvm_gva_to_gpa(cs, gva, &gpa, &len, is_write)) {
            return -EFAULT;
        }
        if (len > sz) {
            len = sz;
        }
        cpu_physical_memory_rw(gpa, buf, len, is_write);

        buf += len;
        sz -= len;
        gva += len;
    }

    return 0;
}
static inline int kvm_copy_from_gva(CPUState *cs, uint64_t gva, void *buf,
                                    size_t sz)
{
    return kvm_gva_rw(cs, gva, buf, sz, false);
}

static inline int kvm_copy_to_gva(CPUState *cs, uint64_t gva, void *buf,
                                  size_t sz)
{
    return kvm_gva_rw(cs, gva, buf, sz, true);
}
int kvm_xen_init(KVMState *s, uint32_t hypercall_msr)
{
    const int required_caps = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
        KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | KVM_XEN_HVM_CONFIG_SHARED_INFO;
    struct kvm_xen_hvm_config cfg = {
        .msr = hypercall_msr,
        .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL,
    };
    int xen_caps, ret;

    xen_caps = kvm_check_extension(s, KVM_CAP_XEN_HVM);
    if (required_caps & ~xen_caps) {
        error_report("kvm: Xen HVM guest support not present or insufficient");
        return -ENOSYS;
    }

    if (xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND) {
        struct kvm_xen_hvm_attr ha = {
            .type = KVM_XEN_ATTR_TYPE_XEN_VERSION,
            .u.xen_version = s->xen_version,
        };
        (void)kvm_vm_ioctl(s, KVM_XEN_HVM_SET_ATTR, &ha);

        cfg.flags |= KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
    }

    ret = kvm_vm_ioctl(s, KVM_XEN_HVM_CONFIG, &cfg);
    if (ret < 0) {
        error_report("kvm: Failed to enable Xen HVM support: %s",
                     strerror(-ret));
        return ret;
    }

    /* If called a second time, don't repeat the rest of the setup. */
    if (s->xen_caps) {
        return 0;
    }

    /*
     * Event channel delivery via GSI/PCI_INTX needs to poll the vcpu_info
     * of vCPU0 to deassert the IRQ when ->evtchn_upcall_pending is cleared.
     *
     * In the kernel, there's a notifier hook on the PIC/IOAPIC which allows
     * such things to be polled at precisely the right time. We *could* do
     * it nicely in the kernel: check vcpu_info[0]->evtchn_upcall_pending at
     * the moment the IRQ is acked, and see if it should be reasserted.
     *
     * But the in-kernel irqchip is deprecated, so we're unlikely to add
     * that support in the kernel. Insist on using the split irqchip mode
     * instead.
     *
     * This leaves us polling for the level going low in QEMU, which lacks
     * the appropriate hooks in its PIC/IOAPIC code. Even VFIO is sending a
     * spurious 'ack' to an INTX IRQ every time there's any MMIO access to
     * the device (for which it has to unmap the device and trap access, for
     * some period after an IRQ!!). In the Xen case, we do it on exit from
     * KVM_RUN, if the flag is set to say that the GSI is currently asserted.
     * Which is kind of icky, but less so than the VFIO one. I may fix them
     * both later...
     */
    if (!kvm_kernel_irqchip_split()) {
        error_report("kvm: Xen support requires kernel-irqchip=split");
        return -EINVAL;
    }

    s->xen_caps = xen_caps;

    /* Tell fw_cfg to notify the BIOS to reserve the range. */
    e820_add_entry(XEN_SPECIAL_AREA_ADDR, XEN_SPECIAL_AREA_SIZE, E820_RESERVED);

    /* The pages couldn't be overlaid until KVM was initialized */
    xen_primary_console_reset();
    xen_xenstore_reset();

    return 0;
}
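/*
 * Per-vCPU initialisation: tell the kernel the Xen/ACPI vCPU ID, reset the
 * per-vCPU GPA pointers, and create the QEMU timers used for the Xen
 * periodic and singleshot timer hypercalls.
 */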
int kvm_xen_init_vcpu(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    int err;

    /*
     * The kernel needs to know the Xen/ACPI vCPU ID because that's
     * what the guest uses in hypercalls such as timers. It doesn't
     * match the APIC ID which is generally used for talking to the
     * kernel about vCPUs. And if vCPU threads race with creating
     * their KVM vCPUs out of order, it doesn't necessarily match
     * with the kernel's internal vCPU indices either.
     */
    if (kvm_xen_has_cap(EVTCHN_SEND)) {
        struct kvm_xen_vcpu_attr va = {
            .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID,
            .u.vcpu_id = cs->cpu_index,
        };
        err = kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &va);
        if (err) {
            error_report("kvm: Failed to set Xen vCPU ID attribute: %s",
                         strerror(-err));
            return err;
        }
    }

    env->xen_vcpu_info_gpa = INVALID_GPA;
    env->xen_vcpu_info_default_gpa = INVALID_GPA;
    env->xen_vcpu_time_info_gpa = INVALID_GPA;
    env->xen_vcpu_runstate_gpa = INVALID_GPA;

    qemu_mutex_init(&env->xen_timers_lock);
    env->xen_singleshot_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                             xen_vcpu_singleshot_timer_event,
                                             cs);
    if (!env->xen_singleshot_timer) {
        return -ENOMEM;
    }
    env->xen_singleshot_timer->opaque = cs;

    env->xen_periodic_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                           xen_vcpu_periodic_timer_event,
                                           cs);
    if (!env->xen_periodic_timer) {
        return -ENOMEM;
    }
    env->xen_periodic_timer->opaque = cs;

    return 0;
}
uint32_t kvm_xen_get_caps(void)
{
    return kvm_state->xen_caps;
}
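/*
 * __HYPERVISOR_xen_version handler: currently only XENVER_get_features is
 * implemented, advertising the feature bits we support to the guest.
 */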
static bool kvm_xen_hcall_xen_version(struct kvm_xen_exit *exit, X86CPU *cpu,
                                      int cmd, uint64_t arg)
{
    int err = 0;

    switch (cmd) {
    case XENVER_get_features: {
        struct xen_feature_info fi;

        /* No need for 32/64 compat handling */
        qemu_build_assert(sizeof(fi) == 8);

        err = kvm_copy_from_gva(CPU(cpu), arg, &fi, sizeof(fi));
        if (err) {
            break;
        }

        fi.submap = 0;
        if (fi.submap_idx == 0) {
            fi.submap |= 1 << XENFEAT_writable_page_tables |
                         1 << XENFEAT_writable_descriptor_tables |
                         1 << XENFEAT_auto_translated_physmap |
                         1 << XENFEAT_hvm_callback_vector |
                         1 << XENFEAT_hvm_safe_pvclock |
                         1 << XENFEAT_hvm_pirqs;
        }

        err = kvm_copy_to_gva(CPU(cpu), arg, &fi, sizeof(fi));
        break;
    }

    default:
        return false;
    }

    exit->u.hcall.result = err;
    return true;
}
static int kvm_xen_set_vcpu_attr(CPUState *cs, uint16_t type, uint64_t gpa)
{
    struct kvm_xen_vcpu_attr xhsi;

    xhsi.type = type;
    xhsi.u.gpa = gpa;

    trace_kvm_xen_set_vcpu_attr(cs->cpu_index, type, gpa);

    return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &xhsi);
}
static int kvm_xen_set_vcpu_callback_vector(CPUState *cs)
{
    uint8_t vector = X86_CPU(cs)->env.xen_vcpu_callback_vector;
    struct kvm_xen_vcpu_attr xva;

    xva.type = KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR;
    xva.u.vector = vector;

    trace_kvm_xen_set_vcpu_callback(cs->cpu_index, vector);

    return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &xva);
}
static void do_set_vcpu_callback_vector(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_callback_vector = data.host_int;

    if (kvm_xen_has_cap(EVTCHN_SEND)) {
        kvm_xen_set_vcpu_callback_vector(cs);
    }
}
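/*
 * Point KVM at the guest's vcpu_info structure and, if the GPA maps to
 * guest RAM, cache a host virtual address for it so the fast path in
 * kvm_xen_maybe_deassert_callback() can read evtchn_upcall_pending directly.
 */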
static int set_vcpu_info(CPUState *cs, uint64_t gpa)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    MemoryRegionSection mrs = { .mr = NULL };
    void *vcpu_info_hva = NULL;
    int ret;

    ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO, gpa);
    if (ret || gpa == INVALID_GPA) {
        goto out;
    }

    mrs = memory_region_find(get_system_memory(), gpa,
                             sizeof(struct vcpu_info));
    if (mrs.mr && mrs.mr->ram_block &&
        !int128_lt(mrs.size, int128_make64(sizeof(struct vcpu_info)))) {
        vcpu_info_hva = qemu_map_ram_ptr(mrs.mr->ram_block,
                                         mrs.offset_within_region);
    }
    if (!vcpu_info_hva) {
        if (mrs.mr) {
            memory_region_unref(mrs.mr);
            mrs.mr = NULL;
        }
        ret = -EINVAL;
    }

 out:
    if (env->xen_vcpu_info_mr) {
        memory_region_unref(env->xen_vcpu_info_mr);
    }
    env->xen_vcpu_info_hva = vcpu_info_hva;
    env->xen_vcpu_info_mr = mrs.mr;
    return ret;
}
static void do_set_vcpu_info_default_gpa(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_info_default_gpa = data.host_ulong;

    /* Changing the default does nothing if a vcpu_info was explicitly set. */
    if (env->xen_vcpu_info_gpa == INVALID_GPA) {
        set_vcpu_info(cs, env->xen_vcpu_info_default_gpa);
    }
}
static void do_set_vcpu_info_gpa(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_info_gpa = data.host_ulong;

    set_vcpu_info(cs, env->xen_vcpu_info_gpa);
}
void *kvm_xen_get_vcpu_info_hva(uint32_t vcpu_id)
{
    CPUState *cs = qemu_get_cpu(vcpu_id);
    if (!cs) {
        return NULL;
    }

    return X86_CPU(cs)->env.xen_vcpu_info_hva;
}
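/*
 * Called on return from KVM_RUN when the callback GSI is believed to be
 * asserted: if the guest has cleared evtchn_upcall_pending in its vcpu_info,
 * lower the GSI again.
 */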
void kvm_xen_maybe_deassert_callback(CPUState *cs)
{
    CPUX86State *env = &X86_CPU(cs)->env;
    struct vcpu_info *vi = env->xen_vcpu_info_hva;
    if (!vi) {
        return;
    }

    /* If the evtchn_upcall_pending flag is cleared, turn the GSI off. */
    if (!vi->evtchn_upcall_pending) {
        bql_lock();
        /*
         * Check again now we have the lock, because it may have been
         * asserted in the interim. And we don't want to take the lock
         * every time because this is a fast path.
         */
        if (!vi->evtchn_upcall_pending) {
            X86_CPU(cs)->env.xen_callback_asserted = false;
            xen_evtchn_set_callback_level(0);
        }
        bql_unlock();
    }
}
void kvm_xen_set_callback_asserted(void)
{
    CPUState *cs = qemu_get_cpu(0);

    if (cs) {
        X86_CPU(cs)->env.xen_callback_asserted = true;
    }
}
bool kvm_xen_has_vcpu_callback_vector(void)
{
    CPUState *cs = qemu_get_cpu(0);

    return cs && !!X86_CPU(cs)->env.xen_vcpu_callback_vector;
}
void kvm_xen_inject_vcpu_callback_vector(uint32_t vcpu_id, int type)
{
    CPUState *cs = qemu_get_cpu(vcpu_id);
    uint8_t vector;

    if (!cs) {
        return;
    }

    vector = X86_CPU(cs)->env.xen_vcpu_callback_vector;
    if (vector) {
        /*
         * The per-vCPU callback vector injected via lapic. Just
         * deliver it as an MSI.
         */
        MSIMessage msg = {
            .address = APIC_DEFAULT_ADDRESS |
                       (X86_CPU(cs)->apic_id << MSI_ADDR_DEST_ID_SHIFT),
            .data = vector | (1UL << MSI_DATA_LEVEL_SHIFT),
        };
        kvm_irqchip_send_msi(kvm_state, msg);
        return;
    }

    switch (type) {
    case HVM_PARAM_CALLBACK_TYPE_VECTOR:
        /*
         * If the evtchn_upcall_pending field in the vcpu_info is set, then
         * KVM will automatically deliver the vector on entering the vCPU
         * so all we have to do is kick it out.
         */
        qemu_cpu_kick(cs);
        break;

    case HVM_PARAM_CALLBACK_TYPE_GSI:
    case HVM_PARAM_CALLBACK_TYPE_PCI_INTX:
        if (vcpu_id == 0) {
            xen_evtchn_set_callback_level(1);
        }
        break;
    }
}
/* Must always be called with xen_timers_lock held */
static int kvm_xen_set_vcpu_timer(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    struct kvm_xen_vcpu_attr va = {
        .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
        .u.timer.port = env->xen_virq[VIRQ_TIMER],
        .u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
        .u.timer.expires_ns = env->xen_singleshot_timer_ns,
    };

    return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &va);
}
static void do_set_vcpu_timer_virq(CPUState *cs, run_on_cpu_data data)
{
    QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock);
    kvm_xen_set_vcpu_timer(cs);
}
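/*
 * Bind a VIRQ to an event channel port for the given vCPU. A port of zero
 * clears the binding; binding VIRQ_TIMER also programs the in-kernel Xen
 * timer when the kernel has EVTCHN_SEND support.
 */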
int kvm_xen_set_vcpu_virq(uint32_t vcpu_id, uint16_t virq, uint16_t port)
{
    CPUState *cs = qemu_get_cpu(vcpu_id);

    if (!cs) {
        return -ENOENT;
    }

    /* cpu.h doesn't include the actual Xen header. */
    qemu_build_assert(NR_VIRQS == XEN_NR_VIRQS);

    if (virq >= NR_VIRQS) {
        return -EINVAL;
    }

    if (port && X86_CPU(cs)->env.xen_virq[virq]) {
        return -EEXIST;
    }

    X86_CPU(cs)->env.xen_virq[virq] = port;
    if (virq == VIRQ_TIMER && kvm_xen_has_cap(EVTCHN_SEND)) {
        async_run_on_cpu(cs, do_set_vcpu_timer_virq,
                         RUN_ON_CPU_HOST_INT(port));
    }

    return 0;
}
static void do_set_vcpu_time_info_gpa(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_time_info_gpa = data.host_ulong;

    kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
                          env->xen_vcpu_time_info_gpa);
}
static void do_set_vcpu_runstate_gpa(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_runstate_gpa = data.host_ulong;

    kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
                          env->xen_vcpu_runstate_gpa);
}
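/*
 * Reset all per-vCPU Xen state (vcpu_info, time info, runstate, callback
 * vector, VIRQ bindings and timers) back to its power-on defaults.
 */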
static void do_vcpu_soft_reset(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_info_gpa = INVALID_GPA;
    env->xen_vcpu_info_default_gpa = INVALID_GPA;
    env->xen_vcpu_time_info_gpa = INVALID_GPA;
    env->xen_vcpu_runstate_gpa = INVALID_GPA;
    env->xen_vcpu_callback_vector = 0;
    memset(env->xen_virq, 0, sizeof(env->xen_virq));

    set_vcpu_info(cs, INVALID_GPA);
    kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
                          INVALID_GPA);
    kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
                          INVALID_GPA);
    if (kvm_xen_has_cap(EVTCHN_SEND)) {
        kvm_xen_set_vcpu_callback_vector(cs);

        QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock);
        env->xen_singleshot_timer_ns = 0;
        kvm_xen_set_vcpu_timer(cs);
    } else {
        vcpuop_stop_singleshot_timer(cs);
    }
}
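/*
 * Map the shared_info overlay page at the given GFN and point the default
 * vcpu_info of each legacy vCPU at its slot within that page.
 */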
static int xen_set_shared_info(uint64_t gfn)
{
    uint64_t gpa = gfn << TARGET_PAGE_BITS;
    int i, err;

    /*
     * The xen_overlay device tells KVM about it too, since it had to
     * do that on migration load anyway (unless we're going to jump
     * through lots of hoops to maintain the fiction that this isn't
     * KVM-specific).
     */
    err = xen_overlay_map_shinfo_page(gpa);
    if (err) {
        return err;
    }

    trace_kvm_xen_set_shared_info(gfn);

    for (i = 0; i < XEN_LEGACY_MAX_VCPUS; i++) {
        CPUState *cpu = qemu_get_cpu(i);
        if (cpu) {
            async_run_on_cpu(cpu, do_set_vcpu_info_default_gpa,
                             RUN_ON_CPU_HOST_ULONG(gpa));
        }
        gpa += sizeof(vcpu_info_t);
    }

    return err;
}
static int add_to_physmap_one(uint32_t space, uint64_t idx, uint64_t gfn)
{
    switch (space) {
    case XENMAPSPACE_shared_info:
        if (idx > 0) {
            return -EINVAL;
        }
        return xen_set_shared_info(gfn);

    case XENMAPSPACE_grant_table:
        return xen_gnttab_map_page(idx, gfn);

    case XENMAPSPACE_gmfn:
    case XENMAPSPACE_gmfn_range:
        return -ENOTSUP;

    case XENMAPSPACE_gmfn_foreign:
    case XENMAPSPACE_dev_mmio:
        return -EPERM;

    default:
        return -EINVAL;
    }
}
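/*
 * XENMEM_add_to_physmap: copy in the argument structure (with 32-bit compat
 * handling), check the target domain, and hand off to add_to_physmap_one().
 */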
static int do_add_to_physmap(struct kvm_xen_exit *exit, X86CPU *cpu,
                             uint64_t arg)
{
    struct xen_add_to_physmap xatp;
    CPUState *cs = CPU(cpu);

    if (hypercall_compat32(exit->u.hcall.longmode)) {
        struct compat_xen_add_to_physmap xatp32;

        qemu_build_assert(sizeof(struct compat_xen_add_to_physmap) == 16);
        if (kvm_copy_from_gva(cs, arg, &xatp32, sizeof(xatp32))) {
            return -EFAULT;
        }
        xatp.domid = xatp32.domid;
        xatp.size = xatp32.size;
        xatp.space = xatp32.space;
        xatp.idx = xatp32.idx;
        xatp.gpfn = xatp32.gpfn;
    } else {
        if (kvm_copy_from_gva(cs, arg, &xatp, sizeof(xatp))) {
            return -EFAULT;
        }
    }

    if (xatp.domid != DOMID_SELF && xatp.domid != xen_domid) {
        return -ESRCH;
    }

    return add_to_physmap_one(xatp.space, xatp.idx, xatp.gpfn);
}
static int do_add_to_physmap_batch(struct kvm_xen_exit *exit, X86CPU *cpu,
                                   uint64_t arg)
{
    struct xen_add_to_physmap_batch xatpb;
    unsigned long idxs_gva, gpfns_gva, errs_gva;
    CPUState *cs = CPU(cpu);
    size_t op_sz;

    if (hypercall_compat32(exit->u.hcall.longmode)) {
        struct compat_xen_add_to_physmap_batch xatpb32;

        qemu_build_assert(sizeof(struct compat_xen_add_to_physmap_batch) == 20);
        if (kvm_copy_from_gva(cs, arg, &xatpb32, sizeof(xatpb32))) {
            return -EFAULT;
        }
        xatpb.domid = xatpb32.domid;
        xatpb.space = xatpb32.space;
        xatpb.size = xatpb32.size;

        idxs_gva = xatpb32.idxs.c;
        gpfns_gva = xatpb32.gpfns.c;
        errs_gva = xatpb32.errs.c;
        op_sz = sizeof(uint32_t);
    } else {
        if (kvm_copy_from_gva(cs, arg, &xatpb, sizeof(xatpb))) {
            return -EFAULT;
        }
        op_sz = sizeof(unsigned long);
        idxs_gva = (unsigned long)xatpb.idxs.p;
        gpfns_gva = (unsigned long)xatpb.gpfns.p;
        errs_gva = (unsigned long)xatpb.errs.p;
    }

    if (xatpb.domid != DOMID_SELF && xatpb.domid != xen_domid) {
        return -ESRCH;
    }

    /* Explicitly invalid for the batch op. Not that we implement it anyway. */
    if (xatpb.space == XENMAPSPACE_gmfn_range) {
        return -EINVAL;
    }

    while (xatpb.size--) {
        unsigned long idx = 0;
        unsigned long gpfn = 0;
        int err;

        /* For 32-bit compat this only copies the low 32 bits of each */
        if (kvm_copy_from_gva(cs, idxs_gva, &idx, op_sz) ||
            kvm_copy_from_gva(cs, gpfns_gva, &gpfn, op_sz)) {
            return -EFAULT;
        }
        idxs_gva += op_sz;
        gpfns_gva += op_sz;

        err = add_to_physmap_one(xatpb.space, idx, gpfn);

        if (kvm_copy_to_gva(cs, errs_gva, &err, sizeof(err))) {
            return -EFAULT;
        }
        errs_gva += sizeof(err);
    }
    return 0;
}
static bool kvm_xen_hcall_memory_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                    int cmd, uint64_t arg)
{
    int err;

    switch (cmd) {
    case XENMEM_add_to_physmap:
        err = do_add_to_physmap(exit, cpu, arg);
        break;

    case XENMEM_add_to_physmap_batch:
        err = do_add_to_physmap_batch(exit, cpu, arg);
        break;

    default:
        return false;
    }

    exit->u.hcall.result = err;
    return true;
}
static bool handle_set_param(struct kvm_xen_exit *exit, X86CPU *cpu,
                             uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    struct xen_hvm_param hp;
    int err = 0;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(hp) == 16);

    if (kvm_copy_from_gva(cs, arg, &hp, sizeof(hp))) {
        err = -EFAULT;
        goto out;
    }

    if (hp.domid != DOMID_SELF && hp.domid != xen_domid) {
        err = -ESRCH;
        goto out;
    }

    switch (hp.index) {
    case HVM_PARAM_CALLBACK_IRQ:
        bql_lock();
        err = xen_evtchn_set_callback_param(hp.value);
        bql_unlock();
        xen_set_long_mode(exit->u.hcall.longmode);
        break;
    default:
        return false;
    }

out:
    exit->u.hcall.result = err;
    return true;
}
static bool handle_get_param(struct kvm_xen_exit *exit, X86CPU *cpu,
                             uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    struct xen_hvm_param hp;
    int err = 0;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(hp) == 16);

    if (kvm_copy_from_gva(cs, arg, &hp, sizeof(hp))) {
        err = -EFAULT;
        goto out;
    }

    if (hp.domid != DOMID_SELF && hp.domid != xen_domid) {
        err = -ESRCH;
        goto out;
    }

    switch (hp.index) {
    case HVM_PARAM_STORE_PFN:
        hp.value = XEN_SPECIAL_PFN(XENSTORE);
        break;
    case HVM_PARAM_STORE_EVTCHN:
        hp.value = xen_xenstore_get_port();
        break;
    case HVM_PARAM_CONSOLE_PFN:
        hp.value = xen_primary_console_get_pfn();
        if (!hp.value) {
            err = -EINVAL;
        }
        break;
    case HVM_PARAM_CONSOLE_EVTCHN:
        hp.value = xen_primary_console_get_port();
        if (!hp.value) {
            err = -EINVAL;
        }
        break;
    default:
        return false;
    }

    if (!err && kvm_copy_to_gva(cs, arg, &hp, sizeof(hp))) {
        err = -EFAULT;
    }

out:
    exit->u.hcall.result = err;
    return true;
}
static int kvm_xen_hcall_evtchn_upcall_vector(struct kvm_xen_exit *exit,
                                              X86CPU *cpu, uint64_t arg)
{
    struct xen_hvm_evtchn_upcall_vector up;
    CPUState *target_cs;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(up) == 8);

    if (kvm_copy_from_gva(CPU(cpu), arg, &up, sizeof(up))) {
        return -EFAULT;
    }

    if (up.vector < 0x10) {
        return -EINVAL;
    }

    target_cs = qemu_get_cpu(up.vcpu);
    if (!target_cs) {
        return -EINVAL;
    }

    async_run_on_cpu(target_cs, do_set_vcpu_callback_vector,
                     RUN_ON_CPU_HOST_INT(up.vector));
    return 0;
}
static bool kvm_xen_hcall_hvm_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                 int cmd, uint64_t arg)
{
    int ret = -ENOSYS;
    switch (cmd) {
    case HVMOP_set_evtchn_upcall_vector:
        ret = kvm_xen_hcall_evtchn_upcall_vector(exit, cpu, arg);
        break;

    case HVMOP_pagetable_dying:
        ret = -ENOSYS;
        break;

    case HVMOP_set_param:
        return handle_set_param(exit, cpu, arg);

    case HVMOP_get_param:
        return handle_get_param(exit, cpu, arg);

    default:
        return false;
    }

    exit->u.hcall.result = ret;
    return true;
}
static int vcpuop_register_vcpu_info(CPUState *cs, CPUState *target,
                                     uint64_t arg)
{
    struct vcpu_register_vcpu_info rvi;
    uint64_t gpa;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(rvi) == 16);
    qemu_build_assert(sizeof(struct vcpu_info) == 64);

    if (!target) {
        return -ENOENT;
    }

    if (kvm_copy_from_gva(cs, arg, &rvi, sizeof(rvi))) {
        return -EFAULT;
    }

    if (rvi.offset > TARGET_PAGE_SIZE - sizeof(struct vcpu_info)) {
        return -EINVAL;
    }

    gpa = ((rvi.mfn << TARGET_PAGE_BITS) + rvi.offset);
    async_run_on_cpu(target, do_set_vcpu_info_gpa, RUN_ON_CPU_HOST_ULONG(gpa));
    return 0;
}
static int vcpuop_register_vcpu_time_info(CPUState *cs, CPUState *target,
                                          uint64_t arg)
{
    struct vcpu_register_time_memory_area tma;
    uint64_t gpa;
    size_t len;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(tma) == 8);
    qemu_build_assert(sizeof(struct vcpu_time_info) == 32);

    if (!target) {
        return -ENOENT;
    }

    if (kvm_copy_from_gva(cs, arg, &tma, sizeof(tma))) {
        return -EFAULT;
    }

    /*
     * Xen actually uses the GVA and does the translation through the guest
     * page tables each time. But Linux/KVM uses the GPA, on the assumption
     * that guests only ever use *global* addresses (kernel virtual addresses)
     * for it. If Linux is changed to redo the GVA→GPA translation each time,
     * it will offer a new vCPU attribute for that, and we'll use it instead.
     */
    if (!kvm_gva_to_gpa(cs, tma.addr.p, &gpa, &len, false) ||
        len < sizeof(struct vcpu_time_info)) {
        return -EFAULT;
    }

    async_run_on_cpu(target, do_set_vcpu_time_info_gpa,
                     RUN_ON_CPU_HOST_ULONG(gpa));
    return 0;
}
static int vcpuop_register_runstate_info(CPUState *cs, CPUState *target,
                                         uint64_t arg)
{
    struct vcpu_register_runstate_memory_area rma;
    uint64_t gpa;
    size_t len;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(rma) == 8);
    /* The runstate area actually does change size, but Linux copes. */

    if (!target) {
        return -ENOENT;
    }

    if (kvm_copy_from_gva(cs, arg, &rma, sizeof(rma))) {
        return -EFAULT;
    }

    /* As with vcpu_time_info, Xen actually uses the GVA but KVM doesn't. */
    if (!kvm_gva_to_gpa(cs, rma.addr.p, &gpa, &len, false)) {
        return -EFAULT;
    }

    async_run_on_cpu(target, do_set_vcpu_runstate_gpa,
                     RUN_ON_CPU_HOST_ULONG(gpa));
    return 0;
}
static uint64_t kvm_get_current_ns(void)
{
    struct kvm_clock_data data;
    int ret;

    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
    if (ret < 0) {
        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
        abort();
    }

    return data.clock;
}
static void xen_vcpu_singleshot_timer_event(void *opaque)
{
    CPUState *cpu = opaque;
    CPUX86State *env = &X86_CPU(cpu)->env;
    uint16_t port = env->xen_virq[VIRQ_TIMER];

    if (likely(port)) {
        xen_evtchn_set_port(port);
    }

    qemu_mutex_lock(&env->xen_timers_lock);
    env->xen_singleshot_timer_ns = 0;
    qemu_mutex_unlock(&env->xen_timers_lock);
}
static void xen_vcpu_periodic_timer_event(void *opaque)
{
    CPUState *cpu = opaque;
    CPUX86State *env = &X86_CPU(cpu)->env;
    uint16_t port = env->xen_virq[VIRQ_TIMER];
    int64_t qemu_now;

    if (likely(port)) {
        xen_evtchn_set_port(port);
    }

    qemu_mutex_lock(&env->xen_timers_lock);

    qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    timer_mod_ns(env->xen_periodic_timer,
                 qemu_now + env->xen_periodic_timer_period);

    qemu_mutex_unlock(&env->xen_timers_lock);
}
static int do_set_periodic_timer(CPUState *target, uint64_t period_ns)
{
    CPUX86State *tenv = &X86_CPU(target)->env;
    int64_t qemu_now;

    timer_del(tenv->xen_periodic_timer);

    qemu_mutex_lock(&tenv->xen_timers_lock);

    qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    timer_mod_ns(tenv->xen_periodic_timer, qemu_now + period_ns);
    tenv->xen_periodic_timer_period = period_ns;

    qemu_mutex_unlock(&tenv->xen_timers_lock);
    return 0;
}
#define MILLISECS(_ms)  ((int64_t)((_ms) * 1000000ULL))
#define MICROSECS(_us)  ((int64_t)((_us) * 1000ULL))
#define STIME_MAX ((time_t)((int64_t)~0ull >> 1))
/* Chosen so (NOW() + delta) won't overflow without an uptime of 200 years */
#define STIME_DELTA_MAX ((int64_t)((uint64_t)~0ull >> 2))
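/*
 * VCPUOP_set_periodic_timer: the requested period must be at least 1 ms and
 * no larger than STIME_DELTA_MAX.
 */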
static int vcpuop_set_periodic_timer(CPUState *cs, CPUState *target,
                                     uint64_t arg)
{
    struct vcpu_set_periodic_timer spt;

    qemu_build_assert(sizeof(spt) == 8);
    if (kvm_copy_from_gva(cs, arg, &spt, sizeof(spt))) {
        return -EFAULT;
    }

    if (spt.period_ns < MILLISECS(1) || spt.period_ns > STIME_DELTA_MAX) {
        return -EINVAL;
    }

    return do_set_periodic_timer(target, spt.period_ns);
}
static int vcpuop_stop_periodic_timer(CPUState *target)
{
    CPUX86State *tenv = &X86_CPU(target)->env;

    qemu_mutex_lock(&tenv->xen_timers_lock);

    timer_del(tenv->xen_periodic_timer);
    tenv->xen_periodic_timer_period = 0;

    qemu_mutex_unlock(&tenv->xen_timers_lock);
    return 0;
}
/*
 * Userspace handling of timer, for older kernels.
 * Must always be called with xen_timers_lock held.
 */
static int do_set_singleshot_timer(CPUState *cs, uint64_t timeout_abs,
                                   bool linux_wa)
{
    CPUX86State *env = &X86_CPU(cs)->env;
    int64_t now = kvm_get_current_ns();
    int64_t qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    int64_t delta = timeout_abs - now;

    if (linux_wa && unlikely((int64_t)timeout_abs < 0 ||
                             (delta > 0 && (uint32_t)(delta >> 50) != 0))) {
        /*
         * Xen has a 'Linux workaround' in do_set_timer_op() which checks
         * for negative absolute timeout values (caused by integer
         * overflow), and for values about 13 days in the future (2^50ns)
         * which would be caused by jiffies overflow. For those cases, it
         * sets the timeout 100ms in the future (not *too* soon, since if
         * a guest really did set a long timeout on purpose we don't want
         * to keep churning CPU time by waking it up).
         */
        delta = (100 * SCALE_MS);
        timeout_abs = now + delta;
    }

    timer_mod_ns(env->xen_singleshot_timer, qemu_now + delta);
    env->xen_singleshot_timer_ns = now + delta;
    return 0;
}
static int vcpuop_set_singleshot_timer(CPUState *cs, uint64_t arg)
{
    struct vcpu_set_singleshot_timer sst = { 0 };

    /*
     * The struct is a uint64_t followed by a uint32_t. On 32-bit that
     * makes it 12 bytes. On 64-bit it gets padded to 16. The parts
     * that get used are identical, and there's four bytes of padding
     * unused at the end. For true Xen compatibility we should attempt
     * to copy the full 16 bytes from 64-bit guests, and return -EFAULT
     * if we can't get the padding too. But that's daft. Just copy what
     * we need.
     */
    qemu_build_assert(offsetof(struct vcpu_set_singleshot_timer, flags) == 8);
    qemu_build_assert(sizeof(sst) >= 12);

    if (kvm_copy_from_gva(cs, arg, &sst, 12)) {
        return -EFAULT;
    }

    QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock);

    /*
     * We ignore the VCPU_SSHOTTMR_future flag, just as Xen now does.
     * The only guest that ever used it, got it wrong.
     * https://xenbits.xen.org/gitweb/?p=xen.git;a=commitdiff;h=19c6cbd909
     */
    return do_set_singleshot_timer(cs, sst.timeout_abs_ns, false);
}
static int vcpuop_stop_singleshot_timer(CPUState *cs)
{
    CPUX86State *env = &X86_CPU(cs)->env;

    qemu_mutex_lock(&env->xen_timers_lock);

    timer_del(env->xen_singleshot_timer);
    env->xen_singleshot_timer_ns = 0;

    qemu_mutex_unlock(&env->xen_timers_lock);
    return 0;
}
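/*
 * set_timer_op hypercall: a timeout of zero cancels the singleshot timer,
 * anything else arms it (with the Linux jiffies-overflow workaround applied).
 */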
static bool kvm_xen_hcall_set_timer_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                       uint64_t timeout)
{
    int err;

    if (unlikely(timeout == 0)) {
        err = vcpuop_stop_singleshot_timer(CPU(cpu));
    } else {
        QEMU_LOCK_GUARD(&X86_CPU(cpu)->env.xen_timers_lock);
        err = do_set_singleshot_timer(CPU(cpu), timeout, true);
    }
    exit->u.hcall.result = err;
    return true;
}
static bool kvm_xen_hcall_vcpu_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                  int cmd, int vcpu_id, uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    CPUState *dest = cs->cpu_index == vcpu_id ? cs : qemu_get_cpu(vcpu_id);
    int err;

    if (!dest) {
        err = -ENOENT;
        goto out;
    }

    switch (cmd) {
    case VCPUOP_register_runstate_memory_area:
        err = vcpuop_register_runstate_info(cs, dest, arg);
        break;
    case VCPUOP_register_vcpu_time_memory_area:
        err = vcpuop_register_vcpu_time_info(cs, dest, arg);
        break;
    case VCPUOP_register_vcpu_info:
        err = vcpuop_register_vcpu_info(cs, dest, arg);
        break;
    case VCPUOP_set_singleshot_timer: {
        if (cs->cpu_index == vcpu_id) {
            err = vcpuop_set_singleshot_timer(dest, arg);
        } else {
            err = -EINVAL;
        }
        break;
    }
    case VCPUOP_stop_singleshot_timer:
        if (cs->cpu_index == vcpu_id) {
            err = vcpuop_stop_singleshot_timer(dest);
        } else {
            err = -EINVAL;
        }
        break;
    case VCPUOP_set_periodic_timer: {
        err = vcpuop_set_periodic_timer(cs, dest, arg);
        break;
    }
    case VCPUOP_stop_periodic_timer:
        err = vcpuop_stop_periodic_timer(dest);
        break;

    default:
        return false;
    }

 out:
    exit->u.hcall.result = err;
    return true;
}
static bool kvm_xen_hcall_evtchn_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                    int cmd, uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    int err = -ENOSYS;

    switch (cmd) {
    case EVTCHNOP_init_control:
    case EVTCHNOP_expand_array:
    case EVTCHNOP_set_priority:
        /* We do not support FIFO channels at this point */
        err = -ENOSYS;
        break;

    case EVTCHNOP_status: {
        struct evtchn_status status;

        qemu_build_assert(sizeof(status) == 24);
        if (kvm_copy_from_gva(cs, arg, &status, sizeof(status))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_status_op(&status);
        if (!err && kvm_copy_to_gva(cs, arg, &status, sizeof(status))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_close: {
        struct evtchn_close close;

        qemu_build_assert(sizeof(close) == 4);
        if (kvm_copy_from_gva(cs, arg, &close, sizeof(close))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_close_op(&close);
        break;
    }
    case EVTCHNOP_unmask: {
        struct evtchn_unmask unmask;

        qemu_build_assert(sizeof(unmask) == 4);
        if (kvm_copy_from_gva(cs, arg, &unmask, sizeof(unmask))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_unmask_op(&unmask);
        break;
    }
    case EVTCHNOP_bind_virq: {
        struct evtchn_bind_virq virq;

        qemu_build_assert(sizeof(virq) == 12);
        if (kvm_copy_from_gva(cs, arg, &virq, sizeof(virq))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_bind_virq_op(&virq);
        if (!err && kvm_copy_to_gva(cs, arg, &virq, sizeof(virq))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_bind_pirq: {
        struct evtchn_bind_pirq pirq;

        qemu_build_assert(sizeof(pirq) == 12);
        if (kvm_copy_from_gva(cs, arg, &pirq, sizeof(pirq))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_bind_pirq_op(&pirq);
        if (!err && kvm_copy_to_gva(cs, arg, &pirq, sizeof(pirq))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_bind_ipi: {
        struct evtchn_bind_ipi ipi;

        qemu_build_assert(sizeof(ipi) == 8);
        if (kvm_copy_from_gva(cs, arg, &ipi, sizeof(ipi))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_bind_ipi_op(&ipi);
        if (!err && kvm_copy_to_gva(cs, arg, &ipi, sizeof(ipi))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_send: {
        struct evtchn_send send;

        qemu_build_assert(sizeof(send) == 4);
        if (kvm_copy_from_gva(cs, arg, &send, sizeof(send))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_send_op(&send);
        break;
    }
    case EVTCHNOP_alloc_unbound: {
        struct evtchn_alloc_unbound alloc;

        qemu_build_assert(sizeof(alloc) == 8);
        if (kvm_copy_from_gva(cs, arg, &alloc, sizeof(alloc))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_alloc_unbound_op(&alloc);
        if (!err && kvm_copy_to_gva(cs, arg, &alloc, sizeof(alloc))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_bind_interdomain: {
        struct evtchn_bind_interdomain interdomain;

        qemu_build_assert(sizeof(interdomain) == 12);
        if (kvm_copy_from_gva(cs, arg, &interdomain, sizeof(interdomain))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_bind_interdomain_op(&interdomain);
        if (!err &&
            kvm_copy_to_gva(cs, arg, &interdomain, sizeof(interdomain))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_bind_vcpu: {
        struct evtchn_bind_vcpu vcpu;

        qemu_build_assert(sizeof(vcpu) == 8);
        if (kvm_copy_from_gva(cs, arg, &vcpu, sizeof(vcpu))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_bind_vcpu_op(&vcpu);
        break;
    }
    case EVTCHNOP_reset: {
        struct evtchn_reset reset;

        qemu_build_assert(sizeof(reset) == 2);
        if (kvm_copy_from_gva(cs, arg, &reset, sizeof(reset))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_reset_op(&reset);
        break;
    }
    default:
        return false;
    }

    exit->u.hcall.result = err;
    return true;
}
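/*
 * Soft reset: tear down event channels, the callback GSI, the shared_info
 * overlay, grant tables, the primary console and xenstore, and reset the
 * per-vCPU state on every vCPU.
 */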
int kvm_xen_soft_reset(void)
{
    CPUState *cpu;
    int err;

    assert(bql_locked());

    trace_kvm_xen_soft_reset();

    err = xen_evtchn_soft_reset();
    if (err) {
        return err;
    }

    /*
     * Zero is the reset/startup state for HVM_PARAM_CALLBACK_IRQ. Strictly,
     * it maps to HVM_PARAM_CALLBACK_TYPE_GSI with GSI#0, but Xen refuses
     * to deliver to the timer interrupt and treats that as 'disabled'.
     */
    err = xen_evtchn_set_callback_param(0);
    if (err) {
        return err;
    }

    CPU_FOREACH(cpu) {
        async_run_on_cpu(cpu, do_vcpu_soft_reset, RUN_ON_CPU_NULL);
    }

    err = xen_overlay_map_shinfo_page(INVALID_GFN);
    if (err) {
        return err;
    }

    err = xen_gnttab_reset();
    if (err) {
        return err;
    }

    err = xen_primary_console_reset();
    if (err) {
        return err;
    }

    err = xen_xenstore_reset();
    if (err) {
        return err;
    }

    return 0;
}
static int schedop_shutdown(CPUState *cs, uint64_t arg)
{
    struct sched_shutdown shutdown;
    int ret = 0;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(shutdown) == 4);

    if (kvm_copy_from_gva(cs, arg, &shutdown, sizeof(shutdown))) {
        return -EFAULT;
    }

    switch (shutdown.reason) {
    case SHUTDOWN_crash:
        cpu_dump_state(cs, stderr, CPU_DUMP_CODE);
        qemu_system_guest_panicked(NULL);
        break;

    case SHUTDOWN_reboot:
        qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
        break;

    case SHUTDOWN_poweroff:
        qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
        break;

    case SHUTDOWN_soft_reset:
        bql_lock();
        ret = kvm_xen_soft_reset();
        bql_unlock();
        break;

    default:
        ret = -EINVAL;
        break;
    }

    return ret;
}
static bool kvm_xen_hcall_sched_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                   int cmd, uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    int err = -ENOSYS;

    switch (cmd) {
    case SCHEDOP_shutdown:
        err = schedop_shutdown(cs, arg);
        break;

    case SCHEDOP_poll:
        /*
         * Linux will panic if this doesn't work. Just yield; it's not
         * worth overthinking it because with event channel handling
         * in KVM, the kernel will intercept this and it will never
         * reach QEMU anyway. The semantics of the hypercall explicitly
         * permit spurious wakeups.
         */
    case SCHEDOP_yield:
        sched_yield();
        err = 0;
        break;

    default:
        return false;
    }

    exit->u.hcall.result = err;
    return true;
}
static bool kvm_xen_hcall_gnttab_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                    int cmd, uint64_t arg, int count)
{
    CPUState *cs = CPU(cpu);
    int err;

    switch (cmd) {
    case GNTTABOP_set_version: {
        struct gnttab_set_version set;

        qemu_build_assert(sizeof(set) == 4);
        if (kvm_copy_from_gva(cs, arg, &set, sizeof(set))) {
            err = -EFAULT;
            break;
        }

        err = xen_gnttab_set_version_op(&set);
        if (!err && kvm_copy_to_gva(cs, arg, &set, sizeof(set))) {
            err = -EFAULT;
        }
        break;
    }
    case GNTTABOP_get_version: {
        struct gnttab_get_version get;

        qemu_build_assert(sizeof(get) == 8);
        if (kvm_copy_from_gva(cs, arg, &get, sizeof(get))) {
            err = -EFAULT;
            break;
        }

        err = xen_gnttab_get_version_op(&get);
        if (!err && kvm_copy_to_gva(cs, arg, &get, sizeof(get))) {
            err = -EFAULT;
        }
        break;
    }
    case GNTTABOP_query_size: {
        struct gnttab_query_size size;

        qemu_build_assert(sizeof(size) == 16);
        if (kvm_copy_from_gva(cs, arg, &size, sizeof(size))) {
            err = -EFAULT;
            break;
        }

        err = xen_gnttab_query_size_op(&size);
        if (!err && kvm_copy_to_gva(cs, arg, &size, sizeof(size))) {
            err = -EFAULT;
        }
        break;
    }
    case GNTTABOP_setup_table:
    case GNTTABOP_map_grant_ref:
    case GNTTABOP_unmap_grant_ref:
    case GNTTABOP_swap_grant_ref:
        return false;

    default:
        /* Xen explicitly returns -ENOSYS to HVM guests for all others */
        err = -ENOSYS;
        break;
    }

    exit->u.hcall.result = err;
    return true;
}
static bool kvm_xen_hcall_physdev_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                     int cmd, uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    int err;

    switch (cmd) {
    case PHYSDEVOP_map_pirq: {
        struct physdev_map_pirq map;

        if (hypercall_compat32(exit->u.hcall.longmode)) {
            struct compat_physdev_map_pirq *map32 = (void *)&map;

            if (kvm_copy_from_gva(cs, arg, map32, sizeof(*map32))) {
                err = -EFAULT;
                break;
            }

            /*
             * The only thing that's different is the alignment of the
             * uint64_t table_base at the end, which gets padding to make
             * it 64-bit aligned in the 64-bit version.
             */
            qemu_build_assert(sizeof(*map32) == 36);
            qemu_build_assert(offsetof(struct physdev_map_pirq, entry_nr) ==
                              offsetof(struct compat_physdev_map_pirq, entry_nr));
            memmove(&map.table_base, &map32->table_base, sizeof(map.table_base));
        } else {
            if (kvm_copy_from_gva(cs, arg, &map, sizeof(map))) {
                err = -EFAULT;
                break;
            }
        }
        err = xen_physdev_map_pirq(&map);
        /*
         * Since table_base is an IN parameter and won't be changed, just
         * copy the size of the compat structure back to the guest.
         */
        if (!err && kvm_copy_to_gva(cs, arg, &map,
                                    sizeof(struct compat_physdev_map_pirq))) {
            err = -EFAULT;
        }
        break;
    }
    case PHYSDEVOP_unmap_pirq: {
        struct physdev_unmap_pirq unmap;

        qemu_build_assert(sizeof(unmap) == 8);
        if (kvm_copy_from_gva(cs, arg, &unmap, sizeof(unmap))) {
            err = -EFAULT;
            break;
        }

        err = xen_physdev_unmap_pirq(&unmap);
        if (!err && kvm_copy_to_gva(cs, arg, &unmap, sizeof(unmap))) {
            err = -EFAULT;
        }
        break;
    }
    case PHYSDEVOP_eoi: {
        struct physdev_eoi eoi;

        qemu_build_assert(sizeof(eoi) == 4);
        if (kvm_copy_from_gva(cs, arg, &eoi, sizeof(eoi))) {
            err = -EFAULT;
            break;
        }

        err = xen_physdev_eoi_pirq(&eoi);
        if (!err && kvm_copy_to_gva(cs, arg, &eoi, sizeof(eoi))) {
            err = -EFAULT;
        }
        break;
    }
    case PHYSDEVOP_irq_status_query: {
        struct physdev_irq_status_query query;

        qemu_build_assert(sizeof(query) == 8);
        if (kvm_copy_from_gva(cs, arg, &query, sizeof(query))) {
            err = -EFAULT;
            break;
        }

        err = xen_physdev_query_pirq(&query);
        if (!err && kvm_copy_to_gva(cs, arg, &query, sizeof(query))) {
            err = -EFAULT;
        }
        break;
    }
    case PHYSDEVOP_get_free_pirq: {
        struct physdev_get_free_pirq get;

        qemu_build_assert(sizeof(get) == 8);
        if (kvm_copy_from_gva(cs, arg, &get, sizeof(get))) {
            err = -EFAULT;
            break;
        }

        err = xen_physdev_get_free_pirq(&get);
        if (!err && kvm_copy_to_gva(cs, arg, &get, sizeof(get))) {
            err = -EFAULT;
        }
        break;
    }
    case PHYSDEVOP_pirq_eoi_gmfn_v2: /* FreeBSD 13 makes this hypercall */
        err = -ENOSYS;
        break;

    default:
        return false;
    }

    exit->u.hcall.result = err;
    return true;
}
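/*
 * Dispatch a Xen hypercall by number. Hypercalls from guest user mode
 * (CPL > 0) are rejected with -EPERM before dispatch.
 */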
static bool do_kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit)
{
    uint16_t code = exit->u.hcall.input;

    if (exit->u.hcall.cpl > 0) {
        exit->u.hcall.result = -EPERM;
        return true;
    }

    switch (code) {
    case __HYPERVISOR_set_timer_op:
        if (exit->u.hcall.longmode) {
            return kvm_xen_hcall_set_timer_op(exit, cpu,
                                              exit->u.hcall.params[0]);
        } else {
            /* In 32-bit mode, the 64-bit timer value is in two args. */
            uint64_t val = ((uint64_t)exit->u.hcall.params[1]) << 32 |
                (uint32_t)exit->u.hcall.params[0];
            return kvm_xen_hcall_set_timer_op(exit, cpu, val);
        }
    case __HYPERVISOR_grant_table_op:
        return kvm_xen_hcall_gnttab_op(exit, cpu, exit->u.hcall.params[0],
                                       exit->u.hcall.params[1],
                                       exit->u.hcall.params[2]);
    case __HYPERVISOR_sched_op:
        return kvm_xen_hcall_sched_op(exit, cpu, exit->u.hcall.params[0],
                                      exit->u.hcall.params[1]);
    case __HYPERVISOR_event_channel_op:
        return kvm_xen_hcall_evtchn_op(exit, cpu, exit->u.hcall.params[0],
                                       exit->u.hcall.params[1]);
    case __HYPERVISOR_vcpu_op:
        return kvm_xen_hcall_vcpu_op(exit, cpu,
                                     exit->u.hcall.params[0],
                                     exit->u.hcall.params[1],
                                     exit->u.hcall.params[2]);
    case __HYPERVISOR_hvm_op:
        return kvm_xen_hcall_hvm_op(exit, cpu, exit->u.hcall.params[0],
                                    exit->u.hcall.params[1]);
    case __HYPERVISOR_memory_op:
        return kvm_xen_hcall_memory_op(exit, cpu, exit->u.hcall.params[0],
                                       exit->u.hcall.params[1]);
    case __HYPERVISOR_physdev_op:
        return kvm_xen_hcall_physdev_op(exit, cpu, exit->u.hcall.params[0],
                                        exit->u.hcall.params[1]);
    case __HYPERVISOR_xen_version:
        return kvm_xen_hcall_xen_version(exit, cpu, exit->u.hcall.params[0],
                                         exit->u.hcall.params[1]);
    default:
        return false;
    }
}
*cpu
, struct kvm_xen_exit
*exit
)
1762 if (exit
->type
!= KVM_EXIT_XEN_HCALL
) {
1767 * The kernel latches the guest 32/64 mode when the MSR is used to fill
1768 * the hypercall page. So if we see a hypercall in a mode that doesn't
1769 * match our own idea of the guest mode, fetch the kernel's idea of the
1770 * "long mode" to remain in sync.
1772 if (exit
->u
.hcall
.longmode
!= xen_is_long_mode()) {
1773 xen_sync_long_mode();
1776 if (!do_kvm_xen_handle_exit(cpu
, exit
)) {
1778 * Some hypercalls will be deliberately "implemented" by returning
1779 * -ENOSYS. This case is for hypercalls which are unexpected.
1781 exit
->u
.hcall
.result
= -ENOSYS
;
1782 qemu_log_mask(LOG_UNIMP
, "Unimplemented Xen hypercall %"
1783 PRId64
" (0x%" PRIx64
" 0x%" PRIx64
" 0x%" PRIx64
")\n",
1784 (uint64_t)exit
->u
.hcall
.input
,
1785 (uint64_t)exit
->u
.hcall
.params
[0],
1786 (uint64_t)exit
->u
.hcall
.params
[1],
1787 (uint64_t)exit
->u
.hcall
.params
[2]);
1790 trace_kvm_xen_hypercall(CPU(cpu
)->cpu_index
, exit
->u
.hcall
.cpl
,
1791 exit
->u
.hcall
.input
, exit
->u
.hcall
.params
[0],
1792 exit
->u
.hcall
.params
[1], exit
->u
.hcall
.params
[2],
1793 exit
->u
.hcall
.result
);
uint16_t kvm_xen_get_gnttab_max_frames(void)
{
    KVMState *s = KVM_STATE(current_accel());
    return s->xen_gnttab_max_frames;
}
uint16_t kvm_xen_get_evtchn_max_pirq(void)
{
    KVMState *s = KVM_STATE(current_accel());
    return s->xen_evtchn_max_pirq;
}
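/*
 * Restore the per-vCPU Xen state into KVM (vcpu_info, time info, runstate,
 * callback vector and timers), e.g. on vmstate load after migration.
 */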
int kvm_put_xen_state(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    uint64_t gpa;
    int ret;

    gpa = env->xen_vcpu_info_gpa;
    if (gpa == INVALID_GPA) {
        gpa = env->xen_vcpu_info_default_gpa;
    }

    if (gpa != INVALID_GPA) {
        ret = set_vcpu_info(cs, gpa);
        if (ret < 0) {
            return ret;
        }
    }

    gpa = env->xen_vcpu_time_info_gpa;
    if (gpa != INVALID_GPA) {
        ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
                                    gpa);
        if (ret < 0) {
            return ret;
        }
    }

    gpa = env->xen_vcpu_runstate_gpa;
    if (gpa != INVALID_GPA) {
        ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
                                    gpa);
        if (ret < 0) {
            return ret;
        }
    }

    if (env->xen_periodic_timer_period) {
        ret = do_set_periodic_timer(cs, env->xen_periodic_timer_period);
        if (ret < 0) {
            return ret;
        }
    }

    if (!kvm_xen_has_cap(EVTCHN_SEND)) {
        /*
         * If the kernel has EVTCHN_SEND support then it handles timers too,
         * so the timer will be restored by kvm_xen_set_vcpu_timer() below.
         */
        QEMU_LOCK_GUARD(&env->xen_timers_lock);
        if (env->xen_singleshot_timer_ns) {
            ret = do_set_singleshot_timer(cs, env->xen_singleshot_timer_ns,
                                          false);
            if (ret < 0) {
                return ret;
            }
        }
        return 0;
    }

    if (env->xen_vcpu_callback_vector) {
        ret = kvm_xen_set_vcpu_callback_vector(cs);
        if (ret < 0) {
            return ret;
        }
    }

    if (env->xen_virq[VIRQ_TIMER]) {
        do_set_vcpu_timer_virq(cs,
                               RUN_ON_CPU_HOST_INT(env->xen_virq[VIRQ_TIMER]));
    }
    return 0;
}
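/*
 * Before saving state, mark the vcpu_info page dirty (the kernel writes to
 * it without dirty logging) and read back the in-kernel singleshot timer
 * deadline so it can be migrated.
 */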
int kvm_get_xen_state(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    uint64_t gpa;
    int ret;

    /*
     * The kernel does not mark vcpu_info as dirty when it delivers interrupts
     * to it. It's up to userspace to *assume* that any page shared thus is
     * always considered dirty. The shared_info page is different since it's
     * an overlay and migrated separately anyway.
     */
    gpa = env->xen_vcpu_info_gpa;
    if (gpa == INVALID_GPA) {
        gpa = env->xen_vcpu_info_default_gpa;
    }
    if (gpa != INVALID_GPA) {
        MemoryRegionSection mrs = memory_region_find(get_system_memory(),
                                                     gpa,
                                                     sizeof(struct vcpu_info));
        if (mrs.mr &&
            !int128_lt(mrs.size, int128_make64(sizeof(struct vcpu_info)))) {
            memory_region_set_dirty(mrs.mr, mrs.offset_within_region,
                                    sizeof(struct vcpu_info));
        }
    }

    if (!kvm_xen_has_cap(EVTCHN_SEND)) {
        return 0;
    }

    /*
     * If the kernel is accelerating timers, read out the current value of the
     * singleshot timer deadline.
     */
    if (env->xen_virq[VIRQ_TIMER]) {
        struct kvm_xen_vcpu_attr va = {
            .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
        };
        ret = kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_GET_ATTR, &va);
        if (ret < 0) {
            return ret;
        }

        /*
         * This locking is fairly pointless, and is here to appease Coverity.
         * There is an unavoidable race condition if a different vCPU sets a
         * timer for this vCPU after the value has been read out. But that's
         * OK in practice because *all* the vCPUs need to be stopped before
         * we set about migrating their state.
         */
        QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock);
        env->xen_singleshot_timer_ns = va.u.timer.expires_ns;
    }

    return 0;
}