target/i386/kvm/xen-emu.c

/*
 * Xen HVM emulation support in KVM
 *
 * Copyright © 2019 Oracle and/or its affiliates. All rights reserved.
 * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include "qemu/log.h"
#include "qemu/main-loop.h"
#include "qemu/error-report.h"
#include "hw/xen/xen.h"
#include "sysemu/kvm_int.h"
#include "sysemu/kvm_xen.h"
#include "kvm/kvm_i386.h"
#include "exec/address-spaces.h"
#include "xen-emu.h"
#include "trace.h"
#include "sysemu/runstate.h"

#include "hw/pci/msi.h"
#include "hw/i386/apic-msidef.h"
#include "hw/i386/e820_memory_layout.h"
#include "hw/i386/kvm/xen_overlay.h"
#include "hw/i386/kvm/xen_evtchn.h"
#include "hw/i386/kvm/xen_gnttab.h"
#include "hw/i386/kvm/xen_xenstore.h"

#include "hw/xen/interface/version.h"
#include "hw/xen/interface/sched.h"
#include "hw/xen/interface/memory.h"
#include "hw/xen/interface/hvm/hvm_op.h"
#include "hw/xen/interface/hvm/params.h"
#include "hw/xen/interface/vcpu.h"
#include "hw/xen/interface/event_channel.h"
#include "hw/xen/interface/grant_table.h"

#include "xen-compat.h"

static void xen_vcpu_singleshot_timer_event(void *opaque);
static void xen_vcpu_periodic_timer_event(void *opaque);

#ifdef TARGET_X86_64
#define hypercall_compat32(longmode) (!(longmode))
#else
#define hypercall_compat32(longmode) (false)
#endif

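/*
 * Helpers to copy data between QEMU and guest virtual addresses: the
 * guest page tables are walked one page at a time via KVM_TRANSLATE.
 */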
static bool kvm_gva_to_gpa(CPUState *cs, uint64_t gva, uint64_t *gpa,
                           size_t *len, bool is_write)
{
    struct kvm_translation tr = {
        .linear_address = gva,
    };

    if (len) {
        *len = TARGET_PAGE_SIZE - (gva & ~TARGET_PAGE_MASK);
    }

    if (kvm_vcpu_ioctl(cs, KVM_TRANSLATE, &tr) || !tr.valid ||
        (is_write && !tr.writeable)) {
        return false;
    }
    *gpa = tr.physical_address;
    return true;
}

static int kvm_gva_rw(CPUState *cs, uint64_t gva, void *_buf, size_t sz,
                      bool is_write)
{
    uint8_t *buf = (uint8_t *)_buf;
    uint64_t gpa;
    size_t len;

    while (sz) {
        if (!kvm_gva_to_gpa(cs, gva, &gpa, &len, is_write)) {
            return -EFAULT;
        }
        if (len > sz) {
            len = sz;
        }

        cpu_physical_memory_rw(gpa, buf, len, is_write);

        buf += len;
        sz -= len;
        gva += len;
    }

    return 0;
}

static inline int kvm_copy_from_gva(CPUState *cs, uint64_t gva, void *buf,
                                    size_t sz)
{
    return kvm_gva_rw(cs, gva, buf, sz, false);
}

static inline int kvm_copy_to_gva(CPUState *cs, uint64_t gva, void *buf,
                                  size_t sz)
{
    return kvm_gva_rw(cs, gva, buf, sz, true);
}

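/*
 * Negotiate Xen HVM support with the kernel: check that the required
 * KVM_CAP_XEN_HVM capabilities are present and configure the hypercall
 * MSR and intercept.
 */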
int kvm_xen_init(KVMState *s, uint32_t hypercall_msr)
{
    const int required_caps = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
        KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | KVM_XEN_HVM_CONFIG_SHARED_INFO;
    struct kvm_xen_hvm_config cfg = {
        .msr = hypercall_msr,
        .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL,
    };
    int xen_caps, ret;

    xen_caps = kvm_check_extension(s, KVM_CAP_XEN_HVM);
    if (required_caps & ~xen_caps) {
        error_report("kvm: Xen HVM guest support not present or insufficient");
        return -ENOSYS;
    }

    if (xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND) {
        struct kvm_xen_hvm_attr ha = {
            .type = KVM_XEN_ATTR_TYPE_XEN_VERSION,
            .u.xen_version = s->xen_version,
        };
        (void)kvm_vm_ioctl(s, KVM_XEN_HVM_SET_ATTR, &ha);

        cfg.flags |= KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
    }

    ret = kvm_vm_ioctl(s, KVM_XEN_HVM_CONFIG, &cfg);
    if (ret < 0) {
        error_report("kvm: Failed to enable Xen HVM support: %s",
                     strerror(-ret));
        return ret;
    }

    /* If called a second time, don't repeat the rest of the setup. */
    if (s->xen_caps) {
        return 0;
    }

    /*
     * Event channel delivery via GSI/PCI_INTX needs to poll the vcpu_info
     * of vCPU0 to deassert the IRQ when ->evtchn_upcall_pending is cleared.
     *
     * In the kernel, there's a notifier hook on the PIC/IOAPIC which allows
     * such things to be polled at precisely the right time. We *could* do
     * it nicely in the kernel: check vcpu_info[0]->evtchn_upcall_pending at
     * the moment the IRQ is acked, and see if it should be reasserted.
     *
     * But the in-kernel irqchip is deprecated, so we're unlikely to add
     * that support in the kernel. Insist on using the split irqchip mode
     * instead.
     *
     * This leaves us polling for the level going low in QEMU, which lacks
     * the appropriate hooks in its PIC/IOAPIC code. Even VFIO is sending a
     * spurious 'ack' to an INTX IRQ every time there's any MMIO access to
     * the device (for which it has to unmap the device and trap access, for
     * some period after an IRQ!!). In the Xen case, we do it on exit from
     * KVM_RUN, if the flag is set to say that the GSI is currently asserted.
     * Which is kind of icky, but less so than the VFIO one. I may fix them
     * both later...
     */
    if (!kvm_kernel_irqchip_split()) {
        error_report("kvm: Xen support requires kernel-irqchip=split");
        return -EINVAL;
    }

    s->xen_caps = xen_caps;

    /* Tell fw_cfg to notify the BIOS to reserve the range. */
    ret = e820_add_entry(XEN_SPECIAL_AREA_ADDR, XEN_SPECIAL_AREA_SIZE,
                         E820_RESERVED);
    if (ret < 0) {
        fprintf(stderr, "e820_add_entry() table is full\n");
        return ret;
    }

    /* The page couldn't be overlaid until KVM was initialized */
    xen_xenstore_reset();

    return 0;
}

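/*
 * Per-vCPU setup: tell the kernel the Xen/ACPI vCPU ID and create the
 * per-vCPU singleshot and periodic timers.
 */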
int kvm_xen_init_vcpu(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    int err;

    /*
     * The kernel needs to know the Xen/ACPI vCPU ID because that's
     * what the guest uses in hypercalls such as timers. It doesn't
     * match the APIC ID which is generally used for talking to the
     * kernel about vCPUs. And if vCPU threads race with creating
     * their KVM vCPUs out of order, it doesn't necessarily match
     * with the kernel's internal vCPU indices either.
     */
    if (kvm_xen_has_cap(EVTCHN_SEND)) {
        struct kvm_xen_vcpu_attr va = {
            .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID,
            .u.vcpu_id = cs->cpu_index,
        };
        err = kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &va);
        if (err) {
            error_report("kvm: Failed to set Xen vCPU ID attribute: %s",
                         strerror(-err));
            return err;
        }
    }

    env->xen_vcpu_info_gpa = INVALID_GPA;
    env->xen_vcpu_info_default_gpa = INVALID_GPA;
    env->xen_vcpu_time_info_gpa = INVALID_GPA;
    env->xen_vcpu_runstate_gpa = INVALID_GPA;

    qemu_mutex_init(&env->xen_timers_lock);
    env->xen_singleshot_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                             xen_vcpu_singleshot_timer_event,
                                             cpu);
    if (!env->xen_singleshot_timer) {
        return -ENOMEM;
    }
    env->xen_singleshot_timer->opaque = cs;

    env->xen_periodic_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                           xen_vcpu_periodic_timer_event,
                                           cpu);
    if (!env->xen_periodic_timer) {
        return -ENOMEM;
    }
    env->xen_periodic_timer->opaque = cs;

    return 0;
}

uint32_t kvm_xen_get_caps(void)
{
    return kvm_state->xen_caps;
}

static bool kvm_xen_hcall_xen_version(struct kvm_xen_exit *exit, X86CPU *cpu,
                                      int cmd, uint64_t arg)
{
    int err = 0;

    switch (cmd) {
    case XENVER_get_features: {
        struct xen_feature_info fi;

        /* No need for 32/64 compat handling */
        qemu_build_assert(sizeof(fi) == 8);

        err = kvm_copy_from_gva(CPU(cpu), arg, &fi, sizeof(fi));
        if (err) {
            break;
        }

        fi.submap = 0;
        if (fi.submap_idx == 0) {
            fi.submap |= 1 << XENFEAT_writable_page_tables |
                         1 << XENFEAT_writable_descriptor_tables |
                         1 << XENFEAT_auto_translated_physmap |
                         1 << XENFEAT_supervisor_mode_kernel |
                         1 << XENFEAT_hvm_callback_vector |
                         1 << XENFEAT_hvm_safe_pvclock |
                         1 << XENFEAT_hvm_pirqs;
        }

        err = kvm_copy_to_gva(CPU(cpu), arg, &fi, sizeof(fi));
        break;
    }

    default:
        return false;
    }

    exit->u.hcall.result = err;
    return true;
}

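/*
 * Thin wrappers for pushing per-vCPU Xen attributes into the kernel,
 * used both at hypercall time and when restoring state after migration.
 */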
static int kvm_xen_set_vcpu_attr(CPUState *cs, uint16_t type, uint64_t gpa)
{
    struct kvm_xen_vcpu_attr xhsi;

    xhsi.type = type;
    xhsi.u.gpa = gpa;

    trace_kvm_xen_set_vcpu_attr(cs->cpu_index, type, gpa);

    return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &xhsi);
}

static int kvm_xen_set_vcpu_callback_vector(CPUState *cs)
{
    uint8_t vector = X86_CPU(cs)->env.xen_vcpu_callback_vector;
    struct kvm_xen_vcpu_attr xva;

    xva.type = KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR;
    xva.u.vector = vector;

    trace_kvm_xen_set_vcpu_callback(cs->cpu_index, vector);

    return kvm_vcpu_ioctl(cs, KVM_XEN_HVM_SET_ATTR, &xva);
}

static void do_set_vcpu_callback_vector(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_callback_vector = data.host_int;

    if (kvm_xen_has_cap(EVTCHN_SEND)) {
        kvm_xen_set_vcpu_callback_vector(cs);
    }
}

static int set_vcpu_info(CPUState *cs, uint64_t gpa)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    MemoryRegionSection mrs = { .mr = NULL };
    void *vcpu_info_hva = NULL;
    int ret;

    ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO, gpa);
    if (ret || gpa == INVALID_GPA) {
        goto out;
    }

    mrs = memory_region_find(get_system_memory(), gpa,
                             sizeof(struct vcpu_info));
    if (mrs.mr && mrs.mr->ram_block &&
        !int128_lt(mrs.size, int128_make64(sizeof(struct vcpu_info)))) {
        vcpu_info_hva = qemu_map_ram_ptr(mrs.mr->ram_block,
                                         mrs.offset_within_region);
    }
    if (!vcpu_info_hva) {
        if (mrs.mr) {
            memory_region_unref(mrs.mr);
            mrs.mr = NULL;
        }
        ret = -EINVAL;
    }

out:
    if (env->xen_vcpu_info_mr) {
        memory_region_unref(env->xen_vcpu_info_mr);
    }
    env->xen_vcpu_info_hva = vcpu_info_hva;
    env->xen_vcpu_info_mr = mrs.mr;
    return ret;
}

static void do_set_vcpu_info_default_gpa(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_info_default_gpa = data.host_ulong;

    /* Changing the default does nothing if a vcpu_info was explicitly set. */
    if (env->xen_vcpu_info_gpa == INVALID_GPA) {
        set_vcpu_info(cs, env->xen_vcpu_info_default_gpa);
    }
}

static void do_set_vcpu_info_gpa(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_info_gpa = data.host_ulong;

    set_vcpu_info(cs, env->xen_vcpu_info_gpa);
}

void *kvm_xen_get_vcpu_info_hva(uint32_t vcpu_id)
{
    CPUState *cs = qemu_get_cpu(vcpu_id);
    if (!cs) {
        return NULL;
    }

    return X86_CPU(cs)->env.xen_vcpu_info_hva;
}

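/*
 * Called on return from KVM_RUN while the callback GSI is asserted; see
 * the discussion of the split-irqchip requirement in kvm_xen_init().
 */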
void kvm_xen_maybe_deassert_callback(CPUState *cs)
{
    CPUX86State *env = &X86_CPU(cs)->env;
    struct vcpu_info *vi = env->xen_vcpu_info_hva;
    if (!vi) {
        return;
    }

    /* If the evtchn_upcall_pending flag is cleared, turn the GSI off. */
    if (!vi->evtchn_upcall_pending) {
        qemu_mutex_lock_iothread();
        /*
         * Check again now we have the lock, because it may have been
         * asserted in the interim. And we don't want to take the lock
         * every time because this is a fast path.
         */
        if (!vi->evtchn_upcall_pending) {
            X86_CPU(cs)->env.xen_callback_asserted = false;
            xen_evtchn_set_callback_level(0);
        }
        qemu_mutex_unlock_iothread();
    }
}

void kvm_xen_set_callback_asserted(void)
{
    CPUState *cs = qemu_get_cpu(0);

    if (cs) {
        X86_CPU(cs)->env.xen_callback_asserted = true;
    }
}

void kvm_xen_inject_vcpu_callback_vector(uint32_t vcpu_id, int type)
{
    CPUState *cs = qemu_get_cpu(vcpu_id);
    uint8_t vector;

    if (!cs) {
        return;
    }

    vector = X86_CPU(cs)->env.xen_vcpu_callback_vector;
    if (vector) {
        /*
         * The per-vCPU callback vector injected via lapic. Just
         * deliver it as an MSI.
         */
        MSIMessage msg = {
            .address = APIC_DEFAULT_ADDRESS | X86_CPU(cs)->apic_id,
            .data = vector | (1UL << MSI_DATA_LEVEL_SHIFT),
        };
        kvm_irqchip_send_msi(kvm_state, msg);
        return;
    }

    switch (type) {
    case HVM_PARAM_CALLBACK_TYPE_VECTOR:
        /*
         * If the evtchn_upcall_pending field in the vcpu_info is set, then
         * KVM will automatically deliver the vector on entering the vCPU
         * so all we have to do is kick it out.
         */
        qemu_cpu_kick(cs);
        break;

    case HVM_PARAM_CALLBACK_TYPE_GSI:
    case HVM_PARAM_CALLBACK_TYPE_PCI_INTX:
        if (vcpu_id == 0) {
            xen_evtchn_set_callback_level(1);
        }
        break;
    }
}

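/*
 * Push the current VIRQ_TIMER port and pending expiry into the kernel's
 * Xen timer attribute, for kernels which accelerate the timer VIRQ.
 */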
static int kvm_xen_set_vcpu_timer(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    struct kvm_xen_vcpu_attr va = {
        .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
        .u.timer.port = env->xen_virq[VIRQ_TIMER],
        .u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
        .u.timer.expires_ns = env->xen_singleshot_timer_ns,
    };

    return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &va);
}

static void do_set_vcpu_timer_virq(CPUState *cs, run_on_cpu_data data)
{
    kvm_xen_set_vcpu_timer(cs);
}

int kvm_xen_set_vcpu_virq(uint32_t vcpu_id, uint16_t virq, uint16_t port)
{
    CPUState *cs = qemu_get_cpu(vcpu_id);

    if (!cs) {
        return -ENOENT;
    }

    /* cpu.h doesn't include the actual Xen header. */
    qemu_build_assert(NR_VIRQS == XEN_NR_VIRQS);

    if (virq >= NR_VIRQS) {
        return -EINVAL;
    }

    if (port && X86_CPU(cs)->env.xen_virq[virq]) {
        return -EEXIST;
    }

    X86_CPU(cs)->env.xen_virq[virq] = port;
    if (virq == VIRQ_TIMER && kvm_xen_has_cap(EVTCHN_SEND)) {
        async_run_on_cpu(cs, do_set_vcpu_timer_virq,
                         RUN_ON_CPU_HOST_INT(port));
    }
    return 0;
}

static void do_set_vcpu_time_info_gpa(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_time_info_gpa = data.host_ulong;

    kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
                          env->xen_vcpu_time_info_gpa);
}

static void do_set_vcpu_runstate_gpa(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_runstate_gpa = data.host_ulong;

    kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
                          env->xen_vcpu_runstate_gpa);
}

static void do_vcpu_soft_reset(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_info_gpa = INVALID_GPA;
    env->xen_vcpu_info_default_gpa = INVALID_GPA;
    env->xen_vcpu_time_info_gpa = INVALID_GPA;
    env->xen_vcpu_runstate_gpa = INVALID_GPA;
    env->xen_vcpu_callback_vector = 0;
    env->xen_singleshot_timer_ns = 0;
    memset(env->xen_virq, 0, sizeof(env->xen_virq));

    set_vcpu_info(cs, INVALID_GPA);
    kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
                          INVALID_GPA);
    kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
                          INVALID_GPA);
    if (kvm_xen_has_cap(EVTCHN_SEND)) {
        kvm_xen_set_vcpu_callback_vector(cs);
        kvm_xen_set_vcpu_timer(cs);
    }
}

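/*
 * The shared_info page embeds the vcpu_info structures for the first
 * XEN_LEGACY_MAX_VCPUS vCPUs, so mapping it also sets the default
 * vcpu_info location for each of those vCPUs.
 */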
static int xen_set_shared_info(uint64_t gfn)
{
    uint64_t gpa = gfn << TARGET_PAGE_BITS;
    int i, err;

    QEMU_IOTHREAD_LOCK_GUARD();

    /*
     * The xen_overlay device tells KVM about it too, since it had to
     * do that on migration load anyway (unless we're going to jump
     * through lots of hoops to maintain the fiction that this isn't
     * KVM-specific).
     */
    err = xen_overlay_map_shinfo_page(gpa);
    if (err) {
        return err;
    }

    trace_kvm_xen_set_shared_info(gfn);

    for (i = 0; i < XEN_LEGACY_MAX_VCPUS; i++) {
        CPUState *cpu = qemu_get_cpu(i);
        if (cpu) {
            async_run_on_cpu(cpu, do_set_vcpu_info_default_gpa,
                             RUN_ON_CPU_HOST_ULONG(gpa));
        }
        gpa += sizeof(vcpu_info_t);
    }

    return err;
}

static int add_to_physmap_one(uint32_t space, uint64_t idx, uint64_t gfn)
{
    switch (space) {
    case XENMAPSPACE_shared_info:
        if (idx > 0) {
            return -EINVAL;
        }
        return xen_set_shared_info(gfn);

    case XENMAPSPACE_grant_table:
        return xen_gnttab_map_page(idx, gfn);

    case XENMAPSPACE_gmfn:
    case XENMAPSPACE_gmfn_range:
        return -ENOTSUP;

    case XENMAPSPACE_gmfn_foreign:
    case XENMAPSPACE_dev_mmio:
        return -EPERM;

    default:
        return -EINVAL;
    }
}

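/*
 * XENMEM_add_to_physmap handler. The 32-bit compat version of the
 * structure differs in layout, so its fields are copied one by one.
 */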
static int do_add_to_physmap(struct kvm_xen_exit *exit, X86CPU *cpu,
                             uint64_t arg)
{
    struct xen_add_to_physmap xatp;
    CPUState *cs = CPU(cpu);

    if (hypercall_compat32(exit->u.hcall.longmode)) {
        struct compat_xen_add_to_physmap xatp32;

        qemu_build_assert(sizeof(struct compat_xen_add_to_physmap) == 16);
        if (kvm_copy_from_gva(cs, arg, &xatp32, sizeof(xatp32))) {
            return -EFAULT;
        }
        xatp.domid = xatp32.domid;
        xatp.size = xatp32.size;
        xatp.space = xatp32.space;
        xatp.idx = xatp32.idx;
        xatp.gpfn = xatp32.gpfn;
    } else {
        if (kvm_copy_from_gva(cs, arg, &xatp, sizeof(xatp))) {
            return -EFAULT;
        }
    }

    if (xatp.domid != DOMID_SELF && xatp.domid != xen_domid) {
        return -ESRCH;
    }

    return add_to_physmap_one(xatp.space, xatp.idx, xatp.gpfn);
}

static int do_add_to_physmap_batch(struct kvm_xen_exit *exit, X86CPU *cpu,
                                   uint64_t arg)
{
    struct xen_add_to_physmap_batch xatpb;
    unsigned long idxs_gva, gpfns_gva, errs_gva;
    CPUState *cs = CPU(cpu);
    size_t op_sz;

    if (hypercall_compat32(exit->u.hcall.longmode)) {
        struct compat_xen_add_to_physmap_batch xatpb32;

        qemu_build_assert(sizeof(struct compat_xen_add_to_physmap_batch) == 20);
        if (kvm_copy_from_gva(cs, arg, &xatpb32, sizeof(xatpb32))) {
            return -EFAULT;
        }
        xatpb.domid = xatpb32.domid;
        xatpb.space = xatpb32.space;
        xatpb.size = xatpb32.size;

        idxs_gva = xatpb32.idxs.c;
        gpfns_gva = xatpb32.gpfns.c;
        errs_gva = xatpb32.errs.c;
        op_sz = sizeof(uint32_t);
    } else {
        if (kvm_copy_from_gva(cs, arg, &xatpb, sizeof(xatpb))) {
            return -EFAULT;
        }
        op_sz = sizeof(unsigned long);
        idxs_gva = (unsigned long)xatpb.idxs.p;
        gpfns_gva = (unsigned long)xatpb.gpfns.p;
        errs_gva = (unsigned long)xatpb.errs.p;
    }

    if (xatpb.domid != DOMID_SELF && xatpb.domid != xen_domid) {
        return -ESRCH;
    }

    /* Explicitly invalid for the batch op. Not that we implement it anyway. */
    if (xatpb.space == XENMAPSPACE_gmfn_range) {
        return -EINVAL;
    }

    while (xatpb.size--) {
        unsigned long idx = 0;
        unsigned long gpfn = 0;
        int err;

        /* For 32-bit compat this only copies the low 32 bits of each */
        if (kvm_copy_from_gva(cs, idxs_gva, &idx, op_sz) ||
            kvm_copy_from_gva(cs, gpfns_gva, &gpfn, op_sz)) {
            return -EFAULT;
        }
        idxs_gva += op_sz;
        gpfns_gva += op_sz;

        err = add_to_physmap_one(xatpb.space, idx, gpfn);

        if (kvm_copy_to_gva(cs, errs_gva, &err, sizeof(err))) {
            return -EFAULT;
        }
        errs_gva += sizeof(err);
    }
    return 0;
}

static bool kvm_xen_hcall_memory_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                    int cmd, uint64_t arg)
{
    int err;

    switch (cmd) {
    case XENMEM_add_to_physmap:
        err = do_add_to_physmap(exit, cpu, arg);
        break;

    case XENMEM_add_to_physmap_batch:
        err = do_add_to_physmap_batch(exit, cpu, arg);
        break;

    default:
        return false;
    }

    exit->u.hcall.result = err;
    return true;
}

static bool handle_set_param(struct kvm_xen_exit *exit, X86CPU *cpu,
                             uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    struct xen_hvm_param hp;
    int err = 0;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(hp) == 16);

    if (kvm_copy_from_gva(cs, arg, &hp, sizeof(hp))) {
        err = -EFAULT;
        goto out;
    }

    if (hp.domid != DOMID_SELF && hp.domid != xen_domid) {
        err = -ESRCH;
        goto out;
    }

    switch (hp.index) {
    case HVM_PARAM_CALLBACK_IRQ:
        qemu_mutex_lock_iothread();
        err = xen_evtchn_set_callback_param(hp.value);
        qemu_mutex_unlock_iothread();
        xen_set_long_mode(exit->u.hcall.longmode);
        break;
    default:
        return false;
    }

out:
    exit->u.hcall.result = err;
    return true;
}

static bool handle_get_param(struct kvm_xen_exit *exit, X86CPU *cpu,
                             uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    struct xen_hvm_param hp;
    int err = 0;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(hp) == 16);

    if (kvm_copy_from_gva(cs, arg, &hp, sizeof(hp))) {
        err = -EFAULT;
        goto out;
    }

    if (hp.domid != DOMID_SELF && hp.domid != xen_domid) {
        err = -ESRCH;
        goto out;
    }

    switch (hp.index) {
    case HVM_PARAM_STORE_PFN:
        hp.value = XEN_SPECIAL_PFN(XENSTORE);
        break;
    case HVM_PARAM_STORE_EVTCHN:
        hp.value = xen_xenstore_get_port();
        break;
    default:
        return false;
    }

    if (kvm_copy_to_gva(cs, arg, &hp, sizeof(hp))) {
        err = -EFAULT;
    }

out:
    exit->u.hcall.result = err;
    return true;
}

static int kvm_xen_hcall_evtchn_upcall_vector(struct kvm_xen_exit *exit,
                                              X86CPU *cpu, uint64_t arg)
{
    struct xen_hvm_evtchn_upcall_vector up;
    CPUState *target_cs;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(up) == 8);

    if (kvm_copy_from_gva(CPU(cpu), arg, &up, sizeof(up))) {
        return -EFAULT;
    }

    if (up.vector < 0x10) {
        return -EINVAL;
    }

    target_cs = qemu_get_cpu(up.vcpu);
    if (!target_cs) {
        return -EINVAL;
    }

    async_run_on_cpu(target_cs, do_set_vcpu_callback_vector,
                     RUN_ON_CPU_HOST_INT(up.vector));
    return 0;
}

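/*
 * Dispatch HVMOP_* hypercalls. Note that handle_set_param() and
 * handle_get_param() fill in exit->u.hcall.result themselves.
 */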
static bool kvm_xen_hcall_hvm_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                 int cmd, uint64_t arg)
{
    int ret = -ENOSYS;
    switch (cmd) {
    case HVMOP_set_evtchn_upcall_vector:
        ret = kvm_xen_hcall_evtchn_upcall_vector(exit, cpu,
                                                 exit->u.hcall.params[0]);
        break;

    case HVMOP_pagetable_dying:
        ret = -ENOSYS;
        break;

    case HVMOP_set_param:
        return handle_set_param(exit, cpu, arg);

    case HVMOP_get_param:
        return handle_get_param(exit, cpu, arg);

    default:
        return false;
    }

    exit->u.hcall.result = ret;
    return true;
}

static int vcpuop_register_vcpu_info(CPUState *cs, CPUState *target,
                                     uint64_t arg)
{
    struct vcpu_register_vcpu_info rvi;
    uint64_t gpa;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(rvi) == 16);
    qemu_build_assert(sizeof(struct vcpu_info) == 64);

    if (!target) {
        return -ENOENT;
    }

    if (kvm_copy_from_gva(cs, arg, &rvi, sizeof(rvi))) {
        return -EFAULT;
    }

    if (rvi.offset > TARGET_PAGE_SIZE - sizeof(struct vcpu_info)) {
        return -EINVAL;
    }

    gpa = ((rvi.mfn << TARGET_PAGE_BITS) + rvi.offset);
    async_run_on_cpu(target, do_set_vcpu_info_gpa, RUN_ON_CPU_HOST_ULONG(gpa));
    return 0;
}

static int vcpuop_register_vcpu_time_info(CPUState *cs, CPUState *target,
                                          uint64_t arg)
{
    struct vcpu_register_time_memory_area tma;
    uint64_t gpa;
    size_t len;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(tma) == 8);
    qemu_build_assert(sizeof(struct vcpu_time_info) == 32);

    if (!target) {
        return -ENOENT;
    }

    if (kvm_copy_from_gva(cs, arg, &tma, sizeof(tma))) {
        return -EFAULT;
    }

    /*
     * Xen actually uses the GVA and does the translation through the guest
     * page tables each time. But Linux/KVM uses the GPA, on the assumption
     * that guests only ever use *global* addresses (kernel virtual addresses)
     * for it. If Linux is changed to redo the GVA→GPA translation each time,
     * it will offer a new vCPU attribute for that, and we'll use it instead.
     */
    if (!kvm_gva_to_gpa(cs, tma.addr.p, &gpa, &len, false) ||
        len < sizeof(struct vcpu_time_info)) {
        return -EFAULT;
    }

    async_run_on_cpu(target, do_set_vcpu_time_info_gpa,
                     RUN_ON_CPU_HOST_ULONG(gpa));
    return 0;
}

static int vcpuop_register_runstate_info(CPUState *cs, CPUState *target,
                                         uint64_t arg)
{
    struct vcpu_register_runstate_memory_area rma;
    uint64_t gpa;
    size_t len;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(rma) == 8);
    /* The runstate area actually does change size, but Linux copes. */

    if (!target) {
        return -ENOENT;
    }

    if (kvm_copy_from_gva(cs, arg, &rma, sizeof(rma))) {
        return -EFAULT;
    }

    /* As with vcpu_time_info, Xen actually uses the GVA but KVM doesn't. */
    if (!kvm_gva_to_gpa(cs, rma.addr.p, &gpa, &len, false)) {
        return -EFAULT;
    }

    async_run_on_cpu(target, do_set_vcpu_runstate_gpa,
                     RUN_ON_CPU_HOST_ULONG(gpa));
    return 0;
}

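/*
 * Xen system time is based on the KVM clock, so guest timer deadlines
 * are interpreted against the current KVM_GET_CLOCK value.
 */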
static uint64_t kvm_get_current_ns(void)
{
    struct kvm_clock_data data;
    int ret;

    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
    if (ret < 0) {
        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
        abort();
    }

    return data.clock;
}

static void xen_vcpu_singleshot_timer_event(void *opaque)
{
    CPUState *cpu = opaque;
    CPUX86State *env = &X86_CPU(cpu)->env;
    uint16_t port = env->xen_virq[VIRQ_TIMER];

    if (likely(port)) {
        xen_evtchn_set_port(port);
    }

    qemu_mutex_lock(&env->xen_timers_lock);
    env->xen_singleshot_timer_ns = 0;
    qemu_mutex_unlock(&env->xen_timers_lock);
}

static void xen_vcpu_periodic_timer_event(void *opaque)
{
    CPUState *cpu = opaque;
    CPUX86State *env = &X86_CPU(cpu)->env;
    uint16_t port = env->xen_virq[VIRQ_TIMER];
    int64_t qemu_now;

    if (likely(port)) {
        xen_evtchn_set_port(port);
    }

    qemu_mutex_lock(&env->xen_timers_lock);

    qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    timer_mod_ns(env->xen_periodic_timer,
                 qemu_now + env->xen_periodic_timer_period);

    qemu_mutex_unlock(&env->xen_timers_lock);
}

static int do_set_periodic_timer(CPUState *target, uint64_t period_ns)
{
    CPUX86State *tenv = &X86_CPU(target)->env;
    int64_t qemu_now;

    timer_del(tenv->xen_periodic_timer);

    qemu_mutex_lock(&tenv->xen_timers_lock);

    qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    timer_mod_ns(tenv->xen_periodic_timer, qemu_now + period_ns);
    tenv->xen_periodic_timer_period = period_ns;

    qemu_mutex_unlock(&tenv->xen_timers_lock);
    return 0;
}

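/* Time macros matching Xen's own definitions (cf. xen/include/xen/time.h). */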
#define MILLISECS(_ms)  ((int64_t)((_ms) * 1000000ULL))
#define MICROSECS(_us)  ((int64_t)((_us) * 1000ULL))
#define STIME_MAX ((time_t)((int64_t)~0ull >> 1))
/* Chosen so (NOW() + delta) won't overflow without an uptime of 200 years */
#define STIME_DELTA_MAX ((int64_t)((uint64_t)~0ull >> 2))

static int vcpuop_set_periodic_timer(CPUState *cs, CPUState *target,
                                     uint64_t arg)
{
    struct vcpu_set_periodic_timer spt;

    qemu_build_assert(sizeof(spt) == 8);
    if (kvm_copy_from_gva(cs, arg, &spt, sizeof(spt))) {
        return -EFAULT;
    }

    if (spt.period_ns < MILLISECS(1) || spt.period_ns > STIME_DELTA_MAX) {
        return -EINVAL;
    }

    return do_set_periodic_timer(target, spt.period_ns);
}

static int vcpuop_stop_periodic_timer(CPUState *target)
{
    CPUX86State *tenv = &X86_CPU(target)->env;

    qemu_mutex_lock(&tenv->xen_timers_lock);

    timer_del(tenv->xen_periodic_timer);
    tenv->xen_periodic_timer_period = 0;

    qemu_mutex_unlock(&tenv->xen_timers_lock);
    return 0;
}

static int do_set_singleshot_timer(CPUState *cs, uint64_t timeout_abs,
                                   bool future, bool linux_wa)
{
    CPUX86State *env = &X86_CPU(cs)->env;
    int64_t now = kvm_get_current_ns();
    int64_t qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    int64_t delta = timeout_abs - now;

    if (future && timeout_abs < now) {
        return -ETIME;
    }

    if (linux_wa && unlikely((int64_t)timeout_abs < 0 ||
                             (delta > 0 && (uint32_t)(delta >> 50) != 0))) {
        /*
         * Xen has a 'Linux workaround' in do_set_timer_op() which checks
         * for negative absolute timeout values (caused by integer
         * overflow), and for values about 13 days in the future (2^50ns)
         * which would be caused by jiffies overflow. For those cases, it
         * sets the timeout 100ms in the future (not *too* soon, since if
         * a guest really did set a long timeout on purpose we don't want
         * to keep churning CPU time by waking it up).
         */
        delta = (100 * SCALE_MS);
        timeout_abs = now + delta;
    }

    qemu_mutex_lock(&env->xen_timers_lock);

    timer_mod_ns(env->xen_singleshot_timer, qemu_now + delta);
    env->xen_singleshot_timer_ns = now + delta;

    qemu_mutex_unlock(&env->xen_timers_lock);
    return 0;
}

static int vcpuop_set_singleshot_timer(CPUState *cs, uint64_t arg)
{
    struct vcpu_set_singleshot_timer sst = { 0 };

    /*
     * The struct is a uint64_t followed by a uint32_t. On 32-bit that
     * makes it 12 bytes. On 64-bit it gets padded to 16. The parts
     * that get used are identical, and there's four bytes of padding
     * unused at the end. For true Xen compatibility we should attempt
     * to copy the full 16 bytes from 64-bit guests, and return -EFAULT
     * if we can't get the padding too. But that's daft. Just copy what
     * we need.
     */
    qemu_build_assert(offsetof(struct vcpu_set_singleshot_timer, flags) == 8);
    qemu_build_assert(sizeof(sst) >= 12);

    if (kvm_copy_from_gva(cs, arg, &sst, 12)) {
        return -EFAULT;
    }

    return do_set_singleshot_timer(cs, sst.timeout_abs_ns,
                                   !!(sst.flags & VCPU_SSHOTTMR_future),
                                   false);
}

static int vcpuop_stop_singleshot_timer(CPUState *cs)
{
    CPUX86State *env = &X86_CPU(cs)->env;

    qemu_mutex_lock(&env->xen_timers_lock);

    timer_del(env->xen_singleshot_timer);
    env->xen_singleshot_timer_ns = 0;

    qemu_mutex_unlock(&env->xen_timers_lock);
    return 0;
}

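/*
 * The set_timer_op hypercall takes an absolute expiry time; a timeout
 * of zero cancels any pending singleshot timer.
 */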
static bool kvm_xen_hcall_set_timer_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                       uint64_t timeout)
{
    int err;

    if (unlikely(timeout == 0)) {
        err = vcpuop_stop_singleshot_timer(CPU(cpu));
    } else {
        err = do_set_singleshot_timer(CPU(cpu), timeout, false, true);
    }
    exit->u.hcall.result = err;
    return true;
}

static bool kvm_xen_hcall_vcpu_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                  int cmd, int vcpu_id, uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    CPUState *dest = cs->cpu_index == vcpu_id ? cs : qemu_get_cpu(vcpu_id);
    int err;

    if (!dest) {
        err = -ENOENT;
        goto out;
    }

    switch (cmd) {
    case VCPUOP_register_runstate_memory_area:
        err = vcpuop_register_runstate_info(cs, dest, arg);
        break;
    case VCPUOP_register_vcpu_time_memory_area:
        err = vcpuop_register_vcpu_time_info(cs, dest, arg);
        break;
    case VCPUOP_register_vcpu_info:
        err = vcpuop_register_vcpu_info(cs, dest, arg);
        break;
    case VCPUOP_set_singleshot_timer: {
        if (cs->cpu_index == vcpu_id) {
            err = vcpuop_set_singleshot_timer(dest, arg);
        } else {
            err = -EINVAL;
        }
        break;
    }
    case VCPUOP_stop_singleshot_timer:
        if (cs->cpu_index == vcpu_id) {
            err = vcpuop_stop_singleshot_timer(dest);
        } else {
            err = -EINVAL;
        }
        break;
    case VCPUOP_set_periodic_timer: {
        err = vcpuop_set_periodic_timer(cs, dest, arg);
        break;
    }
    case VCPUOP_stop_periodic_timer:
        err = vcpuop_stop_periodic_timer(dest);
        break;

    default:
        return false;
    }

out:
    exit->u.hcall.result = err;
    return true;
}

static bool kvm_xen_hcall_evtchn_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                    int cmd, uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    int err = -ENOSYS;

    switch (cmd) {
    case EVTCHNOP_init_control:
    case EVTCHNOP_expand_array:
    case EVTCHNOP_set_priority:
        /* We do not support FIFO channels at this point */
        err = -ENOSYS;
        break;

    case EVTCHNOP_status: {
        struct evtchn_status status;

        qemu_build_assert(sizeof(status) == 24);
        if (kvm_copy_from_gva(cs, arg, &status, sizeof(status))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_status_op(&status);
        if (!err && kvm_copy_to_gva(cs, arg, &status, sizeof(status))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_close: {
        struct evtchn_close close;

        qemu_build_assert(sizeof(close) == 4);
        if (kvm_copy_from_gva(cs, arg, &close, sizeof(close))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_close_op(&close);
        break;
    }
    case EVTCHNOP_unmask: {
        struct evtchn_unmask unmask;

        qemu_build_assert(sizeof(unmask) == 4);
        if (kvm_copy_from_gva(cs, arg, &unmask, sizeof(unmask))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_unmask_op(&unmask);
        break;
    }
    case EVTCHNOP_bind_virq: {
        struct evtchn_bind_virq virq;

        qemu_build_assert(sizeof(virq) == 12);
        if (kvm_copy_from_gva(cs, arg, &virq, sizeof(virq))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_bind_virq_op(&virq);
        if (!err && kvm_copy_to_gva(cs, arg, &virq, sizeof(virq))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_bind_pirq: {
        struct evtchn_bind_pirq pirq;

        qemu_build_assert(sizeof(pirq) == 12);
        if (kvm_copy_from_gva(cs, arg, &pirq, sizeof(pirq))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_bind_pirq_op(&pirq);
        if (!err && kvm_copy_to_gva(cs, arg, &pirq, sizeof(pirq))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_bind_ipi: {
        struct evtchn_bind_ipi ipi;

        qemu_build_assert(sizeof(ipi) == 8);
        if (kvm_copy_from_gva(cs, arg, &ipi, sizeof(ipi))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_bind_ipi_op(&ipi);
        if (!err && kvm_copy_to_gva(cs, arg, &ipi, sizeof(ipi))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_send: {
        struct evtchn_send send;

        qemu_build_assert(sizeof(send) == 4);
        if (kvm_copy_from_gva(cs, arg, &send, sizeof(send))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_send_op(&send);
        break;
    }
    case EVTCHNOP_alloc_unbound: {
        struct evtchn_alloc_unbound alloc;

        qemu_build_assert(sizeof(alloc) == 8);
        if (kvm_copy_from_gva(cs, arg, &alloc, sizeof(alloc))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_alloc_unbound_op(&alloc);
        if (!err && kvm_copy_to_gva(cs, arg, &alloc, sizeof(alloc))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_bind_interdomain: {
        struct evtchn_bind_interdomain interdomain;

        qemu_build_assert(sizeof(interdomain) == 12);
        if (kvm_copy_from_gva(cs, arg, &interdomain, sizeof(interdomain))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_bind_interdomain_op(&interdomain);
        if (!err &&
            kvm_copy_to_gva(cs, arg, &interdomain, sizeof(interdomain))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_bind_vcpu: {
        struct evtchn_bind_vcpu vcpu;

        qemu_build_assert(sizeof(vcpu) == 8);
        if (kvm_copy_from_gva(cs, arg, &vcpu, sizeof(vcpu))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_bind_vcpu_op(&vcpu);
        break;
    }
    case EVTCHNOP_reset: {
        struct evtchn_reset reset;

        qemu_build_assert(sizeof(reset) == 2);
        if (kvm_copy_from_gva(cs, arg, &reset, sizeof(reset))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_reset_op(&reset);
        break;
    }
    default:
        return false;
    }

    exit->u.hcall.result = err;
    return true;
}

int kvm_xen_soft_reset(void)
{
    CPUState *cpu;
    int err;

    assert(qemu_mutex_iothread_locked());

    trace_kvm_xen_soft_reset();

    err = xen_evtchn_soft_reset();
    if (err) {
        return err;
    }

    /*
     * Zero is the reset/startup state for HVM_PARAM_CALLBACK_IRQ. Strictly,
     * it maps to HVM_PARAM_CALLBACK_TYPE_GSI with GSI#0, but Xen refuses
     * to deliver to the timer interrupt and treats that as 'disabled'.
     */
    err = xen_evtchn_set_callback_param(0);
    if (err) {
        return err;
    }

    CPU_FOREACH(cpu) {
        async_run_on_cpu(cpu, do_vcpu_soft_reset, RUN_ON_CPU_NULL);
    }

    err = xen_overlay_map_shinfo_page(INVALID_GFN);
    if (err) {
        return err;
    }

    err = xen_gnttab_reset();
    if (err) {
        return err;
    }

    err = xen_xenstore_reset();
    if (err) {
        return err;
    }

    return 0;
}

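/*
 * SCHEDOP_shutdown: map the guest's requested shutdown reason onto the
 * corresponding QEMU system events.
 */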
static int schedop_shutdown(CPUState *cs, uint64_t arg)
{
    struct sched_shutdown shutdown;
    int ret = 0;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(shutdown) == 4);

    if (kvm_copy_from_gva(cs, arg, &shutdown, sizeof(shutdown))) {
        return -EFAULT;
    }

    switch (shutdown.reason) {
    case SHUTDOWN_crash:
        cpu_dump_state(cs, stderr, CPU_DUMP_CODE);
        qemu_system_guest_panicked(NULL);
        break;

    case SHUTDOWN_reboot:
        qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
        break;

    case SHUTDOWN_poweroff:
        qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
        break;

    case SHUTDOWN_soft_reset:
        qemu_mutex_lock_iothread();
        ret = kvm_xen_soft_reset();
        qemu_mutex_unlock_iothread();
        break;

    default:
        ret = -EINVAL;
        break;
    }

    return ret;
}

static bool kvm_xen_hcall_sched_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                   int cmd, uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    int err = -ENOSYS;

    switch (cmd) {
    case SCHEDOP_shutdown:
        err = schedop_shutdown(cs, arg);
        break;

    case SCHEDOP_poll:
        /*
         * Linux will panic if this doesn't work. Just yield; it's not
         * worth overthinking it because with event channel handling
         * in KVM, the kernel will intercept this and it will never
         * reach QEMU anyway. The semantics of the hypercall explicitly
         * permit spurious wakeups.
         */
    case SCHEDOP_yield:
        sched_yield();
        err = 0;
        break;

    default:
        return false;
    }

    exit->u.hcall.result = err;
    return true;
}

static bool kvm_xen_hcall_gnttab_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                    int cmd, uint64_t arg, int count)
{
    CPUState *cs = CPU(cpu);
    int err;

    switch (cmd) {
    case GNTTABOP_set_version: {
        struct gnttab_set_version set;

        qemu_build_assert(sizeof(set) == 4);
        if (kvm_copy_from_gva(cs, arg, &set, sizeof(set))) {
            err = -EFAULT;
            break;
        }

        err = xen_gnttab_set_version_op(&set);
        if (!err && kvm_copy_to_gva(cs, arg, &set, sizeof(set))) {
            err = -EFAULT;
        }
        break;
    }
    case GNTTABOP_get_version: {
        struct gnttab_get_version get;

        qemu_build_assert(sizeof(get) == 8);
        if (kvm_copy_from_gva(cs, arg, &get, sizeof(get))) {
            err = -EFAULT;
            break;
        }

        err = xen_gnttab_get_version_op(&get);
        if (!err && kvm_copy_to_gva(cs, arg, &get, sizeof(get))) {
            err = -EFAULT;
        }
        break;
    }
    case GNTTABOP_query_size: {
        struct gnttab_query_size size;

        qemu_build_assert(sizeof(size) == 16);
        if (kvm_copy_from_gva(cs, arg, &size, sizeof(size))) {
            err = -EFAULT;
            break;
        }

        err = xen_gnttab_query_size_op(&size);
        if (!err && kvm_copy_to_gva(cs, arg, &size, sizeof(size))) {
            err = -EFAULT;
        }
        break;
    }
    case GNTTABOP_setup_table:
    case GNTTABOP_copy:
    case GNTTABOP_map_grant_ref:
    case GNTTABOP_unmap_grant_ref:
    case GNTTABOP_swap_grant_ref:
        return false;

    default:
        /* Xen explicitly returns -ENOSYS to HVM guests for all others */
        err = -ENOSYS;
        break;
    }

    exit->u.hcall.result = err;
    return true;
}

static bool kvm_xen_hcall_physdev_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                     int cmd, uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    int err;

    switch (cmd) {
    case PHYSDEVOP_map_pirq: {
        struct physdev_map_pirq map;

        if (hypercall_compat32(exit->u.hcall.longmode)) {
            struct compat_physdev_map_pirq *map32 = (void *)&map;

            if (kvm_copy_from_gva(cs, arg, map32, sizeof(*map32))) {
                return -EFAULT;
            }

            /*
             * The only thing that's different is the alignment of the
             * uint64_t table_base at the end, which gets padding to make
             * it 64-bit aligned in the 64-bit version.
             */
            qemu_build_assert(sizeof(*map32) == 36);
            qemu_build_assert(offsetof(struct physdev_map_pirq, entry_nr) ==
                              offsetof(struct compat_physdev_map_pirq, entry_nr));
            memmove(&map.table_base, &map32->table_base, sizeof(map.table_base));
        } else {
            if (kvm_copy_from_gva(cs, arg, &map, sizeof(map))) {
                err = -EFAULT;
                break;
            }
        }
        err = xen_physdev_map_pirq(&map);
        /*
         * Since table_base is an IN parameter and won't be changed, just
         * copy the size of the compat structure back to the guest.
         */
        if (!err && kvm_copy_to_gva(cs, arg, &map,
                                    sizeof(struct compat_physdev_map_pirq))) {
            err = -EFAULT;
        }
        break;
    }
    case PHYSDEVOP_unmap_pirq: {
        struct physdev_unmap_pirq unmap;

        qemu_build_assert(sizeof(unmap) == 8);
        if (kvm_copy_from_gva(cs, arg, &unmap, sizeof(unmap))) {
            err = -EFAULT;
            break;
        }

        err = xen_physdev_unmap_pirq(&unmap);
        if (!err && kvm_copy_to_gva(cs, arg, &unmap, sizeof(unmap))) {
            err = -EFAULT;
        }
        break;
    }
    case PHYSDEVOP_eoi: {
        struct physdev_eoi eoi;

        qemu_build_assert(sizeof(eoi) == 4);
        if (kvm_copy_from_gva(cs, arg, &eoi, sizeof(eoi))) {
            err = -EFAULT;
            break;
        }

        err = xen_physdev_eoi_pirq(&eoi);
        if (!err && kvm_copy_to_gva(cs, arg, &eoi, sizeof(eoi))) {
            err = -EFAULT;
        }
        break;
    }
    case PHYSDEVOP_irq_status_query: {
        struct physdev_irq_status_query query;

        qemu_build_assert(sizeof(query) == 8);
        if (kvm_copy_from_gva(cs, arg, &query, sizeof(query))) {
            err = -EFAULT;
            break;
        }

        err = xen_physdev_query_pirq(&query);
        if (!err && kvm_copy_to_gva(cs, arg, &query, sizeof(query))) {
            err = -EFAULT;
        }
        break;
    }
    case PHYSDEVOP_get_free_pirq: {
        struct physdev_get_free_pirq get;

        qemu_build_assert(sizeof(get) == 8);
        if (kvm_copy_from_gva(cs, arg, &get, sizeof(get))) {
            err = -EFAULT;
            break;
        }

        err = xen_physdev_get_free_pirq(&get);
        if (!err && kvm_copy_to_gva(cs, arg, &get, sizeof(get))) {
            err = -EFAULT;
        }
        break;
    }
    case PHYSDEVOP_pirq_eoi_gmfn_v2: /* FreeBSD 13 makes this hypercall */
        err = -ENOSYS;
        break;

    default:
        return false;
    }

    exit->u.hcall.result = err;
    return true;
}

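/*
 * Central hypercall dispatch. Returns false if the hypercall is not
 * recognised at all, in which case the caller logs it as unimplemented
 * and fails it with -ENOSYS.
 */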
static bool do_kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit)
{
    uint16_t code = exit->u.hcall.input;

    if (exit->u.hcall.cpl > 0) {
        exit->u.hcall.result = -EPERM;
        return true;
    }

    switch (code) {
    case __HYPERVISOR_set_timer_op:
        if (exit->u.hcall.longmode) {
            return kvm_xen_hcall_set_timer_op(exit, cpu,
                                              exit->u.hcall.params[0]);
        } else {
            /* In 32-bit mode, the 64-bit timer value is in two args. */
            uint64_t val = ((uint64_t)exit->u.hcall.params[1]) << 32 |
                (uint32_t)exit->u.hcall.params[0];
            return kvm_xen_hcall_set_timer_op(exit, cpu, val);
        }
    case __HYPERVISOR_grant_table_op:
        return kvm_xen_hcall_gnttab_op(exit, cpu, exit->u.hcall.params[0],
                                       exit->u.hcall.params[1],
                                       exit->u.hcall.params[2]);
    case __HYPERVISOR_sched_op:
        return kvm_xen_hcall_sched_op(exit, cpu, exit->u.hcall.params[0],
                                      exit->u.hcall.params[1]);
    case __HYPERVISOR_event_channel_op:
        return kvm_xen_hcall_evtchn_op(exit, cpu, exit->u.hcall.params[0],
                                       exit->u.hcall.params[1]);
    case __HYPERVISOR_vcpu_op:
        return kvm_xen_hcall_vcpu_op(exit, cpu,
                                     exit->u.hcall.params[0],
                                     exit->u.hcall.params[1],
                                     exit->u.hcall.params[2]);
    case __HYPERVISOR_hvm_op:
        return kvm_xen_hcall_hvm_op(exit, cpu, exit->u.hcall.params[0],
                                    exit->u.hcall.params[1]);
    case __HYPERVISOR_memory_op:
        return kvm_xen_hcall_memory_op(exit, cpu, exit->u.hcall.params[0],
                                       exit->u.hcall.params[1]);
    case __HYPERVISOR_physdev_op:
        return kvm_xen_hcall_physdev_op(exit, cpu, exit->u.hcall.params[0],
                                        exit->u.hcall.params[1]);
    case __HYPERVISOR_xen_version:
        return kvm_xen_hcall_xen_version(exit, cpu, exit->u.hcall.params[0],
                                         exit->u.hcall.params[1]);
    default:
        return false;
    }
}

int kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit)
{
    if (exit->type != KVM_EXIT_XEN_HCALL) {
        return -1;
    }

    /*
     * The kernel latches the guest 32/64 mode when the MSR is used to fill
     * the hypercall page. So if we see a hypercall in a mode that doesn't
     * match our own idea of the guest mode, fetch the kernel's idea of the
     * "long mode" to remain in sync.
     */
    if (exit->u.hcall.longmode != xen_is_long_mode()) {
        xen_sync_long_mode();
    }

    if (!do_kvm_xen_handle_exit(cpu, exit)) {
        /*
         * Some hypercalls will be deliberately "implemented" by returning
         * -ENOSYS. This case is for hypercalls which are unexpected.
         */
        exit->u.hcall.result = -ENOSYS;
        qemu_log_mask(LOG_UNIMP, "Unimplemented Xen hypercall %"
                      PRId64 " (0x%" PRIx64 " 0x%" PRIx64 " 0x%" PRIx64 ")\n",
                      (uint64_t)exit->u.hcall.input,
                      (uint64_t)exit->u.hcall.params[0],
                      (uint64_t)exit->u.hcall.params[1],
                      (uint64_t)exit->u.hcall.params[2]);
    }

    trace_kvm_xen_hypercall(CPU(cpu)->cpu_index, exit->u.hcall.cpl,
                            exit->u.hcall.input, exit->u.hcall.params[0],
                            exit->u.hcall.params[1], exit->u.hcall.params[2],
                            exit->u.hcall.result);
    return 0;
}

uint16_t kvm_xen_get_gnttab_max_frames(void)
{
    KVMState *s = KVM_STATE(current_accel());
    return s->xen_gnttab_max_frames;
}

uint16_t kvm_xen_get_evtchn_max_pirq(void)
{
    KVMState *s = KVM_STATE(current_accel());
    return s->xen_evtchn_max_pirq;
}

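/*
 * Migration support: kvm_put_xen_state() pushes the QEMU-side Xen vCPU
 * state back into the kernel, and kvm_get_xen_state() reads back the
 * state (such as the singleshot timer deadline) that the kernel owns.
 */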
int kvm_put_xen_state(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    uint64_t gpa;
    int ret;

    gpa = env->xen_vcpu_info_gpa;
    if (gpa == INVALID_GPA) {
        gpa = env->xen_vcpu_info_default_gpa;
    }

    if (gpa != INVALID_GPA) {
        ret = set_vcpu_info(cs, gpa);
        if (ret < 0) {
            return ret;
        }
    }

    gpa = env->xen_vcpu_time_info_gpa;
    if (gpa != INVALID_GPA) {
        ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
                                    gpa);
        if (ret < 0) {
            return ret;
        }
    }

    gpa = env->xen_vcpu_runstate_gpa;
    if (gpa != INVALID_GPA) {
        ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
                                    gpa);
        if (ret < 0) {
            return ret;
        }
    }

    if (env->xen_periodic_timer_period) {
        ret = do_set_periodic_timer(cs, env->xen_periodic_timer_period);
        if (ret < 0) {
            return ret;
        }
    }

    if (!kvm_xen_has_cap(EVTCHN_SEND)) {
        /*
         * If the kernel has EVTCHN_SEND support then it handles timers too,
         * so the timer will be restored by kvm_xen_set_vcpu_timer() below.
         */
        if (env->xen_singleshot_timer_ns) {
            ret = do_set_singleshot_timer(cs, env->xen_singleshot_timer_ns,
                                          false, false);
            if (ret < 0) {
                return ret;
            }
        }
        return 0;
    }

    if (env->xen_vcpu_callback_vector) {
        ret = kvm_xen_set_vcpu_callback_vector(cs);
        if (ret < 0) {
            return ret;
        }
    }

    if (env->xen_virq[VIRQ_TIMER]) {
        ret = kvm_xen_set_vcpu_timer(cs);
        if (ret < 0) {
            return ret;
        }
    }
    return 0;
}

int kvm_get_xen_state(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    uint64_t gpa;
    int ret;

    /*
     * The kernel does not mark vcpu_info as dirty when it delivers interrupts
     * to it. It's up to userspace to *assume* that any page shared thus is
     * always considered dirty. The shared_info page is different since it's
     * an overlay and migrated separately anyway.
     */
    gpa = env->xen_vcpu_info_gpa;
    if (gpa == INVALID_GPA) {
        gpa = env->xen_vcpu_info_default_gpa;
    }
    if (gpa != INVALID_GPA) {
        MemoryRegionSection mrs = memory_region_find(get_system_memory(),
                                                     gpa,
                                                     sizeof(struct vcpu_info));
        if (mrs.mr &&
            !int128_lt(mrs.size, int128_make64(sizeof(struct vcpu_info)))) {
            memory_region_set_dirty(mrs.mr, mrs.offset_within_region,
                                    sizeof(struct vcpu_info));
        }
    }

    if (!kvm_xen_has_cap(EVTCHN_SEND)) {
        return 0;
    }

    /*
     * If the kernel is accelerating timers, read out the current value of the
     * singleshot timer deadline.
     */
    if (env->xen_virq[VIRQ_TIMER]) {
        struct kvm_xen_vcpu_attr va = {
            .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
        };
        ret = kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_GET_ATTR, &va);
        if (ret < 0) {
            return ret;
        }
        env->xen_singleshot_timer_ns = va.u.timer.expires_ns;
    }

    return 0;
}