target/i386/kvm/xen-emu.c

/*
 * Xen HVM emulation support in KVM
 *
 * Copyright © 2019 Oracle and/or its affiliates. All rights reserved.
 * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include "qemu/log.h"
#include "qemu/main-loop.h"
#include "qemu/error-report.h"
#include "hw/xen/xen.h"
#include "sysemu/kvm_int.h"
#include "sysemu/kvm_xen.h"
#include "kvm/kvm_i386.h"
#include "exec/address-spaces.h"
#include "xen-emu.h"
#include "trace.h"
#include "sysemu/runstate.h"

#include "hw/pci/msi.h"
#include "hw/i386/apic-msidef.h"
#include "hw/i386/e820_memory_layout.h"
#include "hw/i386/kvm/xen_overlay.h"
#include "hw/i386/kvm/xen_evtchn.h"
#include "hw/i386/kvm/xen_gnttab.h"
#include "hw/i386/kvm/xen_xenstore.h"

#include "hw/xen/interface/version.h"
#include "hw/xen/interface/sched.h"
#include "hw/xen/interface/memory.h"
#include "hw/xen/interface/hvm/hvm_op.h"
#include "hw/xen/interface/hvm/params.h"
#include "hw/xen/interface/vcpu.h"
#include "hw/xen/interface/event_channel.h"
#include "hw/xen/interface/grant_table.h"

#include "xen-compat.h"

static void xen_vcpu_singleshot_timer_event(void *opaque);
static void xen_vcpu_periodic_timer_event(void *opaque);

#ifdef TARGET_X86_64
#define hypercall_compat32(longmode) (!(longmode))
#else
#define hypercall_compat32(longmode) (false)
#endif

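/*
 * Helpers to copy data between QEMU and guest virtual addresses: the
 * guest page tables are walked one page at a time via KVM_TRANSLATE.
 */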
static bool kvm_gva_to_gpa(CPUState *cs, uint64_t gva, uint64_t *gpa,
                           size_t *len, bool is_write)
{
    struct kvm_translation tr = {
        .linear_address = gva,
    };

    if (len) {
        *len = TARGET_PAGE_SIZE - (gva & ~TARGET_PAGE_MASK);
    }

    if (kvm_vcpu_ioctl(cs, KVM_TRANSLATE, &tr) || !tr.valid ||
        (is_write && !tr.writeable)) {
        return false;
    }
    *gpa = tr.physical_address;
    return true;
}

static int kvm_gva_rw(CPUState *cs, uint64_t gva, void *_buf, size_t sz,
                      bool is_write)
{
    uint8_t *buf = (uint8_t *)_buf;
    uint64_t gpa;
    size_t len;

    while (sz) {
        if (!kvm_gva_to_gpa(cs, gva, &gpa, &len, is_write)) {
            return -EFAULT;
        }
        if (len > sz) {
            len = sz;
        }

        cpu_physical_memory_rw(gpa, buf, len, is_write);

        buf += len;
        sz -= len;
        gva += len;
    }

    return 0;
}

static inline int kvm_copy_from_gva(CPUState *cs, uint64_t gva, void *buf,
                                    size_t sz)
{
    return kvm_gva_rw(cs, gva, buf, sz, false);
}

static inline int kvm_copy_to_gva(CPUState *cs, uint64_t gva, void *buf,
                                  size_t sz)
{
    return kvm_gva_rw(cs, gva, buf, sz, true);
}

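/*
 * Negotiate Xen HVM support with the kernel: check that the required
 * KVM_CAP_XEN_HVM capabilities are present and configure the hypercall
 * MSR and intercept.
 */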
int kvm_xen_init(KVMState *s, uint32_t hypercall_msr)
{
    const int required_caps = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
        KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | KVM_XEN_HVM_CONFIG_SHARED_INFO;
    struct kvm_xen_hvm_config cfg = {
        .msr = hypercall_msr,
        .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL,
    };
    int xen_caps, ret;

    xen_caps = kvm_check_extension(s, KVM_CAP_XEN_HVM);
    if (required_caps & ~xen_caps) {
        error_report("kvm: Xen HVM guest support not present or insufficient");
        return -ENOSYS;
    }

    if (xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND) {
        struct kvm_xen_hvm_attr ha = {
            .type = KVM_XEN_ATTR_TYPE_XEN_VERSION,
            .u.xen_version = s->xen_version,
        };
        (void)kvm_vm_ioctl(s, KVM_XEN_HVM_SET_ATTR, &ha);

        cfg.flags |= KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
    }

    ret = kvm_vm_ioctl(s, KVM_XEN_HVM_CONFIG, &cfg);
    if (ret < 0) {
        error_report("kvm: Failed to enable Xen HVM support: %s",
                     strerror(-ret));
        return ret;
    }

    /* If called a second time, don't repeat the rest of the setup. */
    if (s->xen_caps) {
        return 0;
    }

    /*
     * Event channel delivery via GSI/PCI_INTX needs to poll the vcpu_info
     * of vCPU0 to deassert the IRQ when ->evtchn_upcall_pending is cleared.
     *
     * In the kernel, there's a notifier hook on the PIC/IOAPIC which allows
     * such things to be polled at precisely the right time. We *could* do
     * it nicely in the kernel: check vcpu_info[0]->evtchn_upcall_pending at
     * the moment the IRQ is acked, and see if it should be reasserted.
     *
     * But the in-kernel irqchip is deprecated, so we're unlikely to add
     * that support in the kernel. Insist on using the split irqchip mode
     * instead.
     *
     * This leaves us polling for the level going low in QEMU, which lacks
     * the appropriate hooks in its PIC/IOAPIC code. Even VFIO is sending a
     * spurious 'ack' to an INTX IRQ every time there's any MMIO access to
     * the device (for which it has to unmap the device and trap access, for
     * some period after an IRQ!!). In the Xen case, we do it on exit from
     * KVM_RUN, if the flag is set to say that the GSI is currently asserted.
     * Which is kind of icky, but less so than the VFIO one. I may fix them
     * both later...
     */
    if (!kvm_kernel_irqchip_split()) {
        error_report("kvm: Xen support requires kernel-irqchip=split");
        return -EINVAL;
    }

    s->xen_caps = xen_caps;

    /* Tell fw_cfg to notify the BIOS to reserve the range. */
    ret = e820_add_entry(XEN_SPECIAL_AREA_ADDR, XEN_SPECIAL_AREA_SIZE,
                         E820_RESERVED);
    if (ret < 0) {
        fprintf(stderr, "e820_add_entry() table is full\n");
        return ret;
    }

    /* The page couldn't be overlaid until KVM was initialized */
    xen_xenstore_reset();

    return 0;
}

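/*
 * Per-vCPU setup: tell the kernel the Xen/ACPI vCPU ID and create the
 * per-vCPU singleshot and periodic timers.
 */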
int kvm_xen_init_vcpu(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    int err;

    /*
     * The kernel needs to know the Xen/ACPI vCPU ID because that's
     * what the guest uses in hypercalls such as timers. It doesn't
     * match the APIC ID which is generally used for talking to the
     * kernel about vCPUs. And if vCPU threads race with creating
     * their KVM vCPUs out of order, it doesn't necessarily match
     * with the kernel's internal vCPU indices either.
     */
    if (kvm_xen_has_cap(EVTCHN_SEND)) {
        struct kvm_xen_vcpu_attr va = {
            .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID,
            .u.vcpu_id = cs->cpu_index,
        };
        err = kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &va);
        if (err) {
            error_report("kvm: Failed to set Xen vCPU ID attribute: %s",
                         strerror(-err));
            return err;
        }
    }

    env->xen_vcpu_info_gpa = INVALID_GPA;
    env->xen_vcpu_info_default_gpa = INVALID_GPA;
    env->xen_vcpu_time_info_gpa = INVALID_GPA;
    env->xen_vcpu_runstate_gpa = INVALID_GPA;

    qemu_mutex_init(&env->xen_timers_lock);
    env->xen_singleshot_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                             xen_vcpu_singleshot_timer_event,
                                             cpu);
    if (!env->xen_singleshot_timer) {
        return -ENOMEM;
    }
    env->xen_singleshot_timer->opaque = cs;

    env->xen_periodic_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                           xen_vcpu_periodic_timer_event,
                                           cpu);
    if (!env->xen_periodic_timer) {
        return -ENOMEM;
    }
    env->xen_periodic_timer->opaque = cs;

    return 0;
}

uint32_t kvm_xen_get_caps(void)
{
    return kvm_state->xen_caps;
}

static bool kvm_xen_hcall_xen_version(struct kvm_xen_exit *exit, X86CPU *cpu,
                                      int cmd, uint64_t arg)
{
    int err = 0;

    switch (cmd) {
    case XENVER_get_features: {
        struct xen_feature_info fi;

        /* No need for 32/64 compat handling */
        qemu_build_assert(sizeof(fi) == 8);

        err = kvm_copy_from_gva(CPU(cpu), arg, &fi, sizeof(fi));
        if (err) {
            break;
        }

        fi.submap = 0;
        if (fi.submap_idx == 0) {
            fi.submap |= 1 << XENFEAT_writable_page_tables |
                         1 << XENFEAT_writable_descriptor_tables |
                         1 << XENFEAT_auto_translated_physmap |
                         1 << XENFEAT_supervisor_mode_kernel |
                         1 << XENFEAT_hvm_callback_vector |
                         1 << XENFEAT_hvm_safe_pvclock |
                         1 << XENFEAT_hvm_pirqs;
        }

        err = kvm_copy_to_gva(CPU(cpu), arg, &fi, sizeof(fi));
        break;
    }

    default:
        return false;
    }

    exit->u.hcall.result = err;
    return true;
}

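/*
 * Thin wrappers for pushing per-vCPU Xen attributes into the kernel,
 * used both at hypercall time and when restoring state after migration.
 */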
static int kvm_xen_set_vcpu_attr(CPUState *cs, uint16_t type, uint64_t gpa)
{
    struct kvm_xen_vcpu_attr xhsi;

    xhsi.type = type;
    xhsi.u.gpa = gpa;

    trace_kvm_xen_set_vcpu_attr(cs->cpu_index, type, gpa);

    return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &xhsi);
}

static int kvm_xen_set_vcpu_callback_vector(CPUState *cs)
{
    uint8_t vector = X86_CPU(cs)->env.xen_vcpu_callback_vector;
    struct kvm_xen_vcpu_attr xva;

    xva.type = KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR;
    xva.u.vector = vector;

    trace_kvm_xen_set_vcpu_callback(cs->cpu_index, vector);

    return kvm_vcpu_ioctl(cs, KVM_XEN_HVM_SET_ATTR, &xva);
}

static void do_set_vcpu_callback_vector(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_callback_vector = data.host_int;

    if (kvm_xen_has_cap(EVTCHN_SEND)) {
        kvm_xen_set_vcpu_callback_vector(cs);
    }
}

static int set_vcpu_info(CPUState *cs, uint64_t gpa)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    MemoryRegionSection mrs = { .mr = NULL };
    void *vcpu_info_hva = NULL;
    int ret;

    ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO, gpa);
    if (ret || gpa == INVALID_GPA) {
        goto out;
    }

    mrs = memory_region_find(get_system_memory(), gpa,
                             sizeof(struct vcpu_info));
    if (mrs.mr && mrs.mr->ram_block &&
        !int128_lt(mrs.size, int128_make64(sizeof(struct vcpu_info)))) {
        vcpu_info_hva = qemu_map_ram_ptr(mrs.mr->ram_block,
                                         mrs.offset_within_region);
    }
    if (!vcpu_info_hva) {
        if (mrs.mr) {
            memory_region_unref(mrs.mr);
            mrs.mr = NULL;
        }
        ret = -EINVAL;
    }

out:
    if (env->xen_vcpu_info_mr) {
        memory_region_unref(env->xen_vcpu_info_mr);
    }
    env->xen_vcpu_info_hva = vcpu_info_hva;
    env->xen_vcpu_info_mr = mrs.mr;
    return ret;
}

static void do_set_vcpu_info_default_gpa(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_info_default_gpa = data.host_ulong;

    /* Changing the default does nothing if a vcpu_info was explicitly set. */
    if (env->xen_vcpu_info_gpa == INVALID_GPA) {
        set_vcpu_info(cs, env->xen_vcpu_info_default_gpa);
    }
}

static void do_set_vcpu_info_gpa(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_info_gpa = data.host_ulong;

    set_vcpu_info(cs, env->xen_vcpu_info_gpa);
}

void *kvm_xen_get_vcpu_info_hva(uint32_t vcpu_id)
{
    CPUState *cs = qemu_get_cpu(vcpu_id);
    if (!cs) {
        return NULL;
    }

    return X86_CPU(cs)->env.xen_vcpu_info_hva;
}

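/*
 * Called on return from KVM_RUN while the callback GSI is asserted; see
 * the discussion of the split-irqchip requirement in kvm_xen_init().
 */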
void kvm_xen_maybe_deassert_callback(CPUState *cs)
{
    CPUX86State *env = &X86_CPU(cs)->env;
    struct vcpu_info *vi = env->xen_vcpu_info_hva;
    if (!vi) {
        return;
    }

    /* If the evtchn_upcall_pending flag is cleared, turn the GSI off. */
    if (!vi->evtchn_upcall_pending) {
        qemu_mutex_lock_iothread();
        /*
         * Check again now we have the lock, because it may have been
         * asserted in the interim. And we don't want to take the lock
         * every time because this is a fast path.
         */
        if (!vi->evtchn_upcall_pending) {
            X86_CPU(cs)->env.xen_callback_asserted = false;
            xen_evtchn_set_callback_level(0);
        }
        qemu_mutex_unlock_iothread();
    }
}

void kvm_xen_set_callback_asserted(void)
{
    CPUState *cs = qemu_get_cpu(0);

    if (cs) {
        X86_CPU(cs)->env.xen_callback_asserted = true;
    }
}

void kvm_xen_inject_vcpu_callback_vector(uint32_t vcpu_id, int type)
{
    CPUState *cs = qemu_get_cpu(vcpu_id);
    uint8_t vector;

    if (!cs) {
        return;
    }

    vector = X86_CPU(cs)->env.xen_vcpu_callback_vector;
    if (vector) {
        /*
         * The per-vCPU callback vector injected via lapic. Just
         * deliver it as an MSI.
         */
        MSIMessage msg = {
            .address = APIC_DEFAULT_ADDRESS | X86_CPU(cs)->apic_id,
            .data = vector | (1UL << MSI_DATA_LEVEL_SHIFT),
        };
        kvm_irqchip_send_msi(kvm_state, msg);
        return;
    }

    switch (type) {
    case HVM_PARAM_CALLBACK_TYPE_VECTOR:
        /*
         * If the evtchn_upcall_pending field in the vcpu_info is set, then
         * KVM will automatically deliver the vector on entering the vCPU
         * so all we have to do is kick it out.
         */
        qemu_cpu_kick(cs);
        break;

    case HVM_PARAM_CALLBACK_TYPE_GSI:
    case HVM_PARAM_CALLBACK_TYPE_PCI_INTX:
        if (vcpu_id == 0) {
            xen_evtchn_set_callback_level(1);
        }
        break;
    }
}

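/*
 * Push the current VIRQ_TIMER port and pending expiry into the kernel's
 * Xen timer attribute, for kernels which accelerate the timer VIRQ.
 */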
static int kvm_xen_set_vcpu_timer(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    struct kvm_xen_vcpu_attr va = {
        .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
        .u.timer.port = env->xen_virq[VIRQ_TIMER],
        .u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
        .u.timer.expires_ns = env->xen_singleshot_timer_ns,
    };

    return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &va);
}

static void do_set_vcpu_timer_virq(CPUState *cs, run_on_cpu_data data)
{
    kvm_xen_set_vcpu_timer(cs);
}

int kvm_xen_set_vcpu_virq(uint32_t vcpu_id, uint16_t virq, uint16_t port)
{
    CPUState *cs = qemu_get_cpu(vcpu_id);

    if (!cs) {
        return -ENOENT;
    }

    /* cpu.h doesn't include the actual Xen header. */
    qemu_build_assert(NR_VIRQS == XEN_NR_VIRQS);

    if (virq >= NR_VIRQS) {
        return -EINVAL;
    }

    if (port && X86_CPU(cs)->env.xen_virq[virq]) {
        return -EEXIST;
    }

    X86_CPU(cs)->env.xen_virq[virq] = port;
    if (virq == VIRQ_TIMER && kvm_xen_has_cap(EVTCHN_SEND)) {
        async_run_on_cpu(cs, do_set_vcpu_timer_virq,
                         RUN_ON_CPU_HOST_INT(port));
    }
    return 0;
}

static void do_set_vcpu_time_info_gpa(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_time_info_gpa = data.host_ulong;

    kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
                          env->xen_vcpu_time_info_gpa);
}

static void do_set_vcpu_runstate_gpa(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_runstate_gpa = data.host_ulong;

    kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
                          env->xen_vcpu_runstate_gpa);
}

static void do_vcpu_soft_reset(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_info_gpa = INVALID_GPA;
    env->xen_vcpu_info_default_gpa = INVALID_GPA;
    env->xen_vcpu_time_info_gpa = INVALID_GPA;
    env->xen_vcpu_runstate_gpa = INVALID_GPA;
    env->xen_vcpu_callback_vector = 0;
    env->xen_singleshot_timer_ns = 0;
    memset(env->xen_virq, 0, sizeof(env->xen_virq));

    set_vcpu_info(cs, INVALID_GPA);
    kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
                          INVALID_GPA);
    kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
                          INVALID_GPA);
    if (kvm_xen_has_cap(EVTCHN_SEND)) {
        kvm_xen_set_vcpu_callback_vector(cs);
        kvm_xen_set_vcpu_timer(cs);
    }
}

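/*
 * The shared_info page embeds the vcpu_info structures for the first
 * XEN_LEGACY_MAX_VCPUS vCPUs, so mapping it also sets the default
 * vcpu_info location for each of those vCPUs.
 */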
static int xen_set_shared_info(uint64_t gfn)
{
    uint64_t gpa = gfn << TARGET_PAGE_BITS;
    int i, err;

    QEMU_IOTHREAD_LOCK_GUARD();

    /*
     * The xen_overlay device tells KVM about it too, since it had to
     * do that on migration load anyway (unless we're going to jump
     * through lots of hoops to maintain the fiction that this isn't
     * KVM-specific).
     */
    err = xen_overlay_map_shinfo_page(gpa);
    if (err) {
        return err;
    }

    trace_kvm_xen_set_shared_info(gfn);

    for (i = 0; i < XEN_LEGACY_MAX_VCPUS; i++) {
        CPUState *cpu = qemu_get_cpu(i);
        if (cpu) {
            async_run_on_cpu(cpu, do_set_vcpu_info_default_gpa,
                             RUN_ON_CPU_HOST_ULONG(gpa));
        }
        gpa += sizeof(vcpu_info_t);
    }

    return err;
}

static int add_to_physmap_one(uint32_t space, uint64_t idx, uint64_t gfn)
{
    switch (space) {
    case XENMAPSPACE_shared_info:
        if (idx > 0) {
            return -EINVAL;
        }
        return xen_set_shared_info(gfn);

    case XENMAPSPACE_grant_table:
        return xen_gnttab_map_page(idx, gfn);

    case XENMAPSPACE_gmfn:
    case XENMAPSPACE_gmfn_range:
        return -ENOTSUP;

    case XENMAPSPACE_gmfn_foreign:
    case XENMAPSPACE_dev_mmio:
        return -EPERM;

    default:
        return -EINVAL;
    }
}

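/*
 * XENMEM_add_to_physmap handler. The 32-bit compat version of the
 * structure differs in layout, so its fields are copied one by one.
 */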
static int do_add_to_physmap(struct kvm_xen_exit *exit, X86CPU *cpu,
                             uint64_t arg)
{
    struct xen_add_to_physmap xatp;
    CPUState *cs = CPU(cpu);

    if (hypercall_compat32(exit->u.hcall.longmode)) {
        struct compat_xen_add_to_physmap xatp32;

        qemu_build_assert(sizeof(struct compat_xen_add_to_physmap) == 16);
        if (kvm_copy_from_gva(cs, arg, &xatp32, sizeof(xatp32))) {
            return -EFAULT;
        }
        xatp.domid = xatp32.domid;
        xatp.size = xatp32.size;
        xatp.space = xatp32.space;
        xatp.idx = xatp32.idx;
        xatp.gpfn = xatp32.gpfn;
    } else {
        if (kvm_copy_from_gva(cs, arg, &xatp, sizeof(xatp))) {
            return -EFAULT;
        }
    }

    if (xatp.domid != DOMID_SELF && xatp.domid != xen_domid) {
        return -ESRCH;
    }

    return add_to_physmap_one(xatp.space, xatp.idx, xatp.gpfn);
}

static int do_add_to_physmap_batch(struct kvm_xen_exit *exit, X86CPU *cpu,
                                   uint64_t arg)
{
    struct xen_add_to_physmap_batch xatpb;
    unsigned long idxs_gva, gpfns_gva, errs_gva;
    CPUState *cs = CPU(cpu);
    size_t op_sz;

    if (hypercall_compat32(exit->u.hcall.longmode)) {
        struct compat_xen_add_to_physmap_batch xatpb32;

        qemu_build_assert(sizeof(struct compat_xen_add_to_physmap_batch) == 20);
        if (kvm_copy_from_gva(cs, arg, &xatpb32, sizeof(xatpb32))) {
            return -EFAULT;
        }
        xatpb.domid = xatpb32.domid;
        xatpb.space = xatpb32.space;
        xatpb.size = xatpb32.size;

        idxs_gva = xatpb32.idxs.c;
        gpfns_gva = xatpb32.gpfns.c;
        errs_gva = xatpb32.errs.c;
        op_sz = sizeof(uint32_t);
    } else {
        if (kvm_copy_from_gva(cs, arg, &xatpb, sizeof(xatpb))) {
            return -EFAULT;
        }
        op_sz = sizeof(unsigned long);
        idxs_gva = (unsigned long)xatpb.idxs.p;
        gpfns_gva = (unsigned long)xatpb.gpfns.p;
        errs_gva = (unsigned long)xatpb.errs.p;
    }

    if (xatpb.domid != DOMID_SELF && xatpb.domid != xen_domid) {
        return -ESRCH;
    }

    /* Explicitly invalid for the batch op. Not that we implement it anyway. */
    if (xatpb.space == XENMAPSPACE_gmfn_range) {
        return -EINVAL;
    }

    while (xatpb.size--) {
        unsigned long idx = 0;
        unsigned long gpfn = 0;
        int err;

        /* For 32-bit compat this only copies the low 32 bits of each */
        if (kvm_copy_from_gva(cs, idxs_gva, &idx, op_sz) ||
            kvm_copy_from_gva(cs, gpfns_gva, &gpfn, op_sz)) {
            return -EFAULT;
        }
        idxs_gva += op_sz;
        gpfns_gva += op_sz;

        err = add_to_physmap_one(xatpb.space, idx, gpfn);

        if (kvm_copy_to_gva(cs, errs_gva, &err, sizeof(err))) {
            return -EFAULT;
        }
        errs_gva += sizeof(err);
    }
    return 0;
}

static bool kvm_xen_hcall_memory_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                    int cmd, uint64_t arg)
{
    int err;

    switch (cmd) {
    case XENMEM_add_to_physmap:
        err = do_add_to_physmap(exit, cpu, arg);
        break;

    case XENMEM_add_to_physmap_batch:
        err = do_add_to_physmap_batch(exit, cpu, arg);
        break;

    default:
        return false;
    }

    exit->u.hcall.result = err;
    return true;
}

static bool handle_set_param(struct kvm_xen_exit *exit, X86CPU *cpu,
                             uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    struct xen_hvm_param hp;
    int err = 0;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(hp) == 16);

    if (kvm_copy_from_gva(cs, arg, &hp, sizeof(hp))) {
        err = -EFAULT;
        goto out;
    }

    if (hp.domid != DOMID_SELF && hp.domid != xen_domid) {
        err = -ESRCH;
        goto out;
    }

    switch (hp.index) {
    case HVM_PARAM_CALLBACK_IRQ:
        qemu_mutex_lock_iothread();
        err = xen_evtchn_set_callback_param(hp.value);
        qemu_mutex_unlock_iothread();
        xen_set_long_mode(exit->u.hcall.longmode);
        break;
    default:
        return false;
    }

out:
    exit->u.hcall.result = err;
    return true;
}

static bool handle_get_param(struct kvm_xen_exit *exit, X86CPU *cpu,
                             uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    struct xen_hvm_param hp;
    int err = 0;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(hp) == 16);

    if (kvm_copy_from_gva(cs, arg, &hp, sizeof(hp))) {
        err = -EFAULT;
        goto out;
    }

    if (hp.domid != DOMID_SELF && hp.domid != xen_domid) {
        err = -ESRCH;
        goto out;
    }

    switch (hp.index) {
    case HVM_PARAM_STORE_PFN:
        hp.value = XEN_SPECIAL_PFN(XENSTORE);
        break;
    case HVM_PARAM_STORE_EVTCHN:
        hp.value = xen_xenstore_get_port();
        break;
    default:
        return false;
    }

    if (kvm_copy_to_gva(cs, arg, &hp, sizeof(hp))) {
        err = -EFAULT;
    }

out:
    exit->u.hcall.result = err;
    return true;
}

static int kvm_xen_hcall_evtchn_upcall_vector(struct kvm_xen_exit *exit,
                                              X86CPU *cpu, uint64_t arg)
{
    struct xen_hvm_evtchn_upcall_vector up;
    CPUState *target_cs;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(up) == 8);

    if (kvm_copy_from_gva(CPU(cpu), arg, &up, sizeof(up))) {
        return -EFAULT;
    }

    if (up.vector < 0x10) {
        return -EINVAL;
    }

    target_cs = qemu_get_cpu(up.vcpu);
    if (!target_cs) {
        return -EINVAL;
    }

    async_run_on_cpu(target_cs, do_set_vcpu_callback_vector,
                     RUN_ON_CPU_HOST_INT(up.vector));
    return 0;
}

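/*
 * Dispatch HVMOP_* hypercalls. Note that handle_set_param() and
 * handle_get_param() fill in exit->u.hcall.result themselves.
 */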
static bool kvm_xen_hcall_hvm_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                 int cmd, uint64_t arg)
{
    int ret = -ENOSYS;
    switch (cmd) {
    case HVMOP_set_evtchn_upcall_vector:
        ret = kvm_xen_hcall_evtchn_upcall_vector(exit, cpu,
                                                 exit->u.hcall.params[0]);
        break;

    case HVMOP_pagetable_dying:
        ret = -ENOSYS;
        break;

    case HVMOP_set_param:
        return handle_set_param(exit, cpu, arg);

    case HVMOP_get_param:
        return handle_get_param(exit, cpu, arg);

    default:
        return false;
    }

    exit->u.hcall.result = ret;
    return true;
}

static int vcpuop_register_vcpu_info(CPUState *cs, CPUState *target,
                                     uint64_t arg)
{
    struct vcpu_register_vcpu_info rvi;
    uint64_t gpa;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(rvi) == 16);
    qemu_build_assert(sizeof(struct vcpu_info) == 64);

    if (!target) {
        return -ENOENT;
    }

    if (kvm_copy_from_gva(cs, arg, &rvi, sizeof(rvi))) {
        return -EFAULT;
    }

    if (rvi.offset > TARGET_PAGE_SIZE - sizeof(struct vcpu_info)) {
        return -EINVAL;
    }

    gpa = ((rvi.mfn << TARGET_PAGE_BITS) + rvi.offset);
    async_run_on_cpu(target, do_set_vcpu_info_gpa, RUN_ON_CPU_HOST_ULONG(gpa));
    return 0;
}

static int vcpuop_register_vcpu_time_info(CPUState *cs, CPUState *target,
                                          uint64_t arg)
{
    struct vcpu_register_time_memory_area tma;
    uint64_t gpa;
    size_t len;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(tma) == 8);
    qemu_build_assert(sizeof(struct vcpu_time_info) == 32);

    if (!target) {
        return -ENOENT;
    }

    if (kvm_copy_from_gva(cs, arg, &tma, sizeof(tma))) {
        return -EFAULT;
    }

    /*
     * Xen actually uses the GVA and does the translation through the guest
     * page tables each time. But Linux/KVM uses the GPA, on the assumption
     * that guests only ever use *global* addresses (kernel virtual addresses)
     * for it. If Linux is changed to redo the GVA→GPA translation each time,
     * it will offer a new vCPU attribute for that, and we'll use it instead.
     */
    if (!kvm_gva_to_gpa(cs, tma.addr.p, &gpa, &len, false) ||
        len < sizeof(struct vcpu_time_info)) {
        return -EFAULT;
    }

    async_run_on_cpu(target, do_set_vcpu_time_info_gpa,
                     RUN_ON_CPU_HOST_ULONG(gpa));
    return 0;
}

static int vcpuop_register_runstate_info(CPUState *cs, CPUState *target,
                                         uint64_t arg)
{
    struct vcpu_register_runstate_memory_area rma;
    uint64_t gpa;
    size_t len;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(rma) == 8);
    /* The runstate area actually does change size, but Linux copes. */

    if (!target) {
        return -ENOENT;
    }

    if (kvm_copy_from_gva(cs, arg, &rma, sizeof(rma))) {
        return -EFAULT;
    }

    /* As with vcpu_time_info, Xen actually uses the GVA but KVM doesn't. */
    if (!kvm_gva_to_gpa(cs, rma.addr.p, &gpa, &len, false)) {
        return -EFAULT;
    }

    async_run_on_cpu(target, do_set_vcpu_runstate_gpa,
                     RUN_ON_CPU_HOST_ULONG(gpa));
    return 0;
}

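/*
 * Xen system time is based on the KVM clock, so guest timer deadlines
 * are interpreted against the current KVM_GET_CLOCK value.
 */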
static uint64_t kvm_get_current_ns(void)
{
    struct kvm_clock_data data;
    int ret;

    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
    if (ret < 0) {
        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
        abort();
    }

    return data.clock;
}

static void xen_vcpu_singleshot_timer_event(void *opaque)
{
    CPUState *cpu = opaque;
    CPUX86State *env = &X86_CPU(cpu)->env;
    uint16_t port = env->xen_virq[VIRQ_TIMER];

    if (likely(port)) {
        xen_evtchn_set_port(port);
    }

    qemu_mutex_lock(&env->xen_timers_lock);
    env->xen_singleshot_timer_ns = 0;
    qemu_mutex_unlock(&env->xen_timers_lock);
}

static void xen_vcpu_periodic_timer_event(void *opaque)
{
    CPUState *cpu = opaque;
    CPUX86State *env = &X86_CPU(cpu)->env;
    uint16_t port = env->xen_virq[VIRQ_TIMER];
    int64_t qemu_now;

    if (likely(port)) {
        xen_evtchn_set_port(port);
    }

    qemu_mutex_lock(&env->xen_timers_lock);

    qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    timer_mod_ns(env->xen_periodic_timer,
                 qemu_now + env->xen_periodic_timer_period);

    qemu_mutex_unlock(&env->xen_timers_lock);
}

static int do_set_periodic_timer(CPUState *target, uint64_t period_ns)
{
    CPUX86State *tenv = &X86_CPU(target)->env;
    int64_t qemu_now;

    timer_del(tenv->xen_periodic_timer);

    qemu_mutex_lock(&tenv->xen_timers_lock);

    qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    timer_mod_ns(tenv->xen_periodic_timer, qemu_now + period_ns);
    tenv->xen_periodic_timer_period = period_ns;

    qemu_mutex_unlock(&tenv->xen_timers_lock);
    return 0;
}

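/* Time macros matching Xen's own definitions (cf. xen/include/xen/time.h). */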
#define MILLISECS(_ms)  ((int64_t)((_ms) * 1000000ULL))
#define MICROSECS(_us)  ((int64_t)((_us) * 1000ULL))
#define STIME_MAX ((time_t)((int64_t)~0ull >> 1))
/* Chosen so (NOW() + delta) won't overflow without an uptime of 200 years */
#define STIME_DELTA_MAX ((int64_t)((uint64_t)~0ull >> 2))

static int vcpuop_set_periodic_timer(CPUState *cs, CPUState *target,
                                     uint64_t arg)
{
    struct vcpu_set_periodic_timer spt;

    qemu_build_assert(sizeof(spt) == 8);
    if (kvm_copy_from_gva(cs, arg, &spt, sizeof(spt))) {
        return -EFAULT;
    }

    if (spt.period_ns < MILLISECS(1) || spt.period_ns > STIME_DELTA_MAX) {
        return -EINVAL;
    }

    return do_set_periodic_timer(target, spt.period_ns);
}

static int vcpuop_stop_periodic_timer(CPUState *target)
{
    CPUX86State *tenv = &X86_CPU(target)->env;

    qemu_mutex_lock(&tenv->xen_timers_lock);

    timer_del(tenv->xen_periodic_timer);
    tenv->xen_periodic_timer_period = 0;

    qemu_mutex_unlock(&tenv->xen_timers_lock);
    return 0;
}

static int do_set_singleshot_timer(CPUState *cs, uint64_t timeout_abs,
                                   bool future, bool linux_wa)
{
    CPUX86State *env = &X86_CPU(cs)->env;
    int64_t now = kvm_get_current_ns();
    int64_t qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    int64_t delta = timeout_abs - now;

    if (future && timeout_abs < now) {
        return -ETIME;
    }

    if (linux_wa && unlikely((int64_t)timeout_abs < 0 ||
                             (delta > 0 && (uint32_t)(delta >> 50) != 0))) {
        /*
         * Xen has a 'Linux workaround' in do_set_timer_op() which checks
         * for negative absolute timeout values (caused by integer
         * overflow), and for values about 13 days in the future (2^50ns)
         * which would be caused by jiffies overflow. For those cases, it
         * sets the timeout 100ms in the future (not *too* soon, since if
         * a guest really did set a long timeout on purpose we don't want
         * to keep churning CPU time by waking it up).
         */
        delta = (100 * SCALE_MS);
        timeout_abs = now + delta;
    }

    qemu_mutex_lock(&env->xen_timers_lock);

    timer_mod_ns(env->xen_singleshot_timer, qemu_now + delta);
    env->xen_singleshot_timer_ns = now + delta;

    qemu_mutex_unlock(&env->xen_timers_lock);
    return 0;
}

static int vcpuop_set_singleshot_timer(CPUState *cs, uint64_t arg)
{
    struct vcpu_set_singleshot_timer sst = { 0 };

    /*
     * The struct is a uint64_t followed by a uint32_t. On 32-bit that
     * makes it 12 bytes. On 64-bit it gets padded to 16. The parts
     * that get used are identical, and there's four bytes of padding
     * unused at the end. For true Xen compatibility we should attempt
     * to copy the full 16 bytes from 64-bit guests, and return -EFAULT
     * if we can't get the padding too. But that's daft. Just copy what
     * we need.
     */
    qemu_build_assert(offsetof(struct vcpu_set_singleshot_timer, flags) == 8);
    qemu_build_assert(sizeof(sst) >= 12);

    if (kvm_copy_from_gva(cs, arg, &sst, 12)) {
        return -EFAULT;
    }

    return do_set_singleshot_timer(cs, sst.timeout_abs_ns,
                                   !!(sst.flags & VCPU_SSHOTTMR_future),
                                   false);
}

static int vcpuop_stop_singleshot_timer(CPUState *cs)
{
    CPUX86State *env = &X86_CPU(cs)->env;

    qemu_mutex_lock(&env->xen_timers_lock);

    timer_del(env->xen_singleshot_timer);
    env->xen_singleshot_timer_ns = 0;

    qemu_mutex_unlock(&env->xen_timers_lock);
    return 0;
}

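/*
 * The set_timer_op hypercall takes an absolute expiry time; a timeout
 * of zero cancels any pending singleshot timer.
 */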
static bool kvm_xen_hcall_set_timer_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                       uint64_t timeout)
{
    int err;

    if (unlikely(timeout == 0)) {
        err = vcpuop_stop_singleshot_timer(CPU(cpu));
    } else {
        err = do_set_singleshot_timer(CPU(cpu), timeout, false, true);
    }
    exit->u.hcall.result = err;
    return true;
}

static bool kvm_xen_hcall_vcpu_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                  int cmd, int vcpu_id, uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    CPUState *dest = cs->cpu_index == vcpu_id ? cs : qemu_get_cpu(vcpu_id);
    int err;

    if (!dest) {
        err = -ENOENT;
        goto out;
    }

    switch (cmd) {
    case VCPUOP_register_runstate_memory_area:
        err = vcpuop_register_runstate_info(cs, dest, arg);
        break;
    case VCPUOP_register_vcpu_time_memory_area:
        err = vcpuop_register_vcpu_time_info(cs, dest, arg);
        break;
    case VCPUOP_register_vcpu_info:
        err = vcpuop_register_vcpu_info(cs, dest, arg);
        break;
    case VCPUOP_set_singleshot_timer: {
        if (cs->cpu_index == vcpu_id) {
            err = vcpuop_set_singleshot_timer(dest, arg);
        } else {
            err = -EINVAL;
        }
        break;
    }
    case VCPUOP_stop_singleshot_timer:
        if (cs->cpu_index == vcpu_id) {
            err = vcpuop_stop_singleshot_timer(dest);
        } else {
            err = -EINVAL;
        }
        break;
    case VCPUOP_set_periodic_timer: {
        err = vcpuop_set_periodic_timer(cs, dest, arg);
        break;
    }
    case VCPUOP_stop_periodic_timer:
        err = vcpuop_stop_periodic_timer(dest);
        break;

    default:
        return false;
    }

out:
    exit->u.hcall.result = err;
    return true;
}

static bool kvm_xen_hcall_evtchn_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                    int cmd, uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    int err = -ENOSYS;

    switch (cmd) {
    case EVTCHNOP_init_control:
    case EVTCHNOP_expand_array:
    case EVTCHNOP_set_priority:
        /* We do not support FIFO channels at this point */
        err = -ENOSYS;
        break;

    case EVTCHNOP_status: {
        struct evtchn_status status;

        qemu_build_assert(sizeof(status) == 24);
        if (kvm_copy_from_gva(cs, arg, &status, sizeof(status))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_status_op(&status);
        if (!err && kvm_copy_to_gva(cs, arg, &status, sizeof(status))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_close: {
        struct evtchn_close close;

        qemu_build_assert(sizeof(close) == 4);
        if (kvm_copy_from_gva(cs, arg, &close, sizeof(close))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_close_op(&close);
        break;
    }
    case EVTCHNOP_unmask: {
        struct evtchn_unmask unmask;

        qemu_build_assert(sizeof(unmask) == 4);
        if (kvm_copy_from_gva(cs, arg, &unmask, sizeof(unmask))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_unmask_op(&unmask);
        break;
    }
    case EVTCHNOP_bind_virq: {
        struct evtchn_bind_virq virq;

        qemu_build_assert(sizeof(virq) == 12);
        if (kvm_copy_from_gva(cs, arg, &virq, sizeof(virq))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_bind_virq_op(&virq);
        if (!err && kvm_copy_to_gva(cs, arg, &virq, sizeof(virq))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_bind_pirq: {
        struct evtchn_bind_pirq pirq;

        qemu_build_assert(sizeof(pirq) == 12);
        if (kvm_copy_from_gva(cs, arg, &pirq, sizeof(pirq))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_bind_pirq_op(&pirq);
        if (!err && kvm_copy_to_gva(cs, arg, &pirq, sizeof(pirq))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_bind_ipi: {
        struct evtchn_bind_ipi ipi;

        qemu_build_assert(sizeof(ipi) == 8);
        if (kvm_copy_from_gva(cs, arg, &ipi, sizeof(ipi))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_bind_ipi_op(&ipi);
        if (!err && kvm_copy_to_gva(cs, arg, &ipi, sizeof(ipi))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_send: {
        struct evtchn_send send;

        qemu_build_assert(sizeof(send) == 4);
        if (kvm_copy_from_gva(cs, arg, &send, sizeof(send))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_send_op(&send);
        break;
    }
    case EVTCHNOP_alloc_unbound: {
        struct evtchn_alloc_unbound alloc;

        qemu_build_assert(sizeof(alloc) == 8);
        if (kvm_copy_from_gva(cs, arg, &alloc, sizeof(alloc))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_alloc_unbound_op(&alloc);
        if (!err && kvm_copy_to_gva(cs, arg, &alloc, sizeof(alloc))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_bind_interdomain: {
        struct evtchn_bind_interdomain interdomain;

        qemu_build_assert(sizeof(interdomain) == 12);
        if (kvm_copy_from_gva(cs, arg, &interdomain, sizeof(interdomain))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_bind_interdomain_op(&interdomain);
        if (!err &&
            kvm_copy_to_gva(cs, arg, &interdomain, sizeof(interdomain))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_bind_vcpu: {
        struct evtchn_bind_vcpu vcpu;

        qemu_build_assert(sizeof(vcpu) == 8);
        if (kvm_copy_from_gva(cs, arg, &vcpu, sizeof(vcpu))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_bind_vcpu_op(&vcpu);
        break;
    }
    case EVTCHNOP_reset: {
        struct evtchn_reset reset;

        qemu_build_assert(sizeof(reset) == 2);
        if (kvm_copy_from_gva(cs, arg, &reset, sizeof(reset))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_reset_op(&reset);
        break;
    }
    default:
        return false;
    }

    exit->u.hcall.result = err;
    return true;
}

int kvm_xen_soft_reset(void)
{
    CPUState *cpu;
    int err;

    assert(qemu_mutex_iothread_locked());

    trace_kvm_xen_soft_reset();

    err = xen_evtchn_soft_reset();
    if (err) {
        return err;
    }

    /*
     * Zero is the reset/startup state for HVM_PARAM_CALLBACK_IRQ. Strictly,
     * it maps to HVM_PARAM_CALLBACK_TYPE_GSI with GSI#0, but Xen refuses
     * to deliver to the timer interrupt and treats that as 'disabled'.
     */
    err = xen_evtchn_set_callback_param(0);
    if (err) {
        return err;
    }

    CPU_FOREACH(cpu) {
        async_run_on_cpu(cpu, do_vcpu_soft_reset, RUN_ON_CPU_NULL);
    }

    err = xen_overlay_map_shinfo_page(INVALID_GFN);
    if (err) {
        return err;
    }

    err = xen_gnttab_reset();
    if (err) {
        return err;
    }

    err = xen_xenstore_reset();
    if (err) {
        return err;
    }

    return 0;
}

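/*
 * SCHEDOP_shutdown: map the guest's requested shutdown reason onto the
 * corresponding QEMU system events.
 */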
static int schedop_shutdown(CPUState *cs, uint64_t arg)
{
    struct sched_shutdown shutdown;
    int ret = 0;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(shutdown) == 4);

    if (kvm_copy_from_gva(cs, arg, &shutdown, sizeof(shutdown))) {
        return -EFAULT;
    }

    switch (shutdown.reason) {
    case SHUTDOWN_crash:
        cpu_dump_state(cs, stderr, CPU_DUMP_CODE);
        qemu_system_guest_panicked(NULL);
        break;

    case SHUTDOWN_reboot:
        qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
        break;

    case SHUTDOWN_poweroff:
        qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
        break;

    case SHUTDOWN_soft_reset:
        qemu_mutex_lock_iothread();
        ret = kvm_xen_soft_reset();
        qemu_mutex_unlock_iothread();
        break;

    default:
        ret = -EINVAL;
        break;
    }

    return ret;
}

static bool kvm_xen_hcall_sched_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                   int cmd, uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    int err = -ENOSYS;

    switch (cmd) {
    case SCHEDOP_shutdown:
        err = schedop_shutdown(cs, arg);
        break;

    case SCHEDOP_poll:
        /*
         * Linux will panic if this doesn't work. Just yield; it's not
         * worth overthinking it because with event channel handling
         * in KVM, the kernel will intercept this and it will never
         * reach QEMU anyway. The semantics of the hypercall explicitly
         * permit spurious wakeups.
         */
    case SCHEDOP_yield:
        sched_yield();
        err = 0;
        break;

    default:
        return false;
    }

    exit->u.hcall.result = err;
    return true;
}

static bool kvm_xen_hcall_gnttab_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                    int cmd, uint64_t arg, int count)
{
    CPUState *cs = CPU(cpu);
    int err;

    switch (cmd) {
    case GNTTABOP_set_version: {
        struct gnttab_set_version set;

        qemu_build_assert(sizeof(set) == 4);
        if (kvm_copy_from_gva(cs, arg, &set, sizeof(set))) {
            err = -EFAULT;
            break;
        }

        err = xen_gnttab_set_version_op(&set);
        if (!err && kvm_copy_to_gva(cs, arg, &set, sizeof(set))) {
            err = -EFAULT;
        }
        break;
    }
    case GNTTABOP_get_version: {
        struct gnttab_get_version get;

        qemu_build_assert(sizeof(get) == 8);
        if (kvm_copy_from_gva(cs, arg, &get, sizeof(get))) {
            err = -EFAULT;
            break;
        }

        err = xen_gnttab_get_version_op(&get);
        if (!err && kvm_copy_to_gva(cs, arg, &get, sizeof(get))) {
            err = -EFAULT;
        }
        break;
    }
    case GNTTABOP_query_size: {
        struct gnttab_query_size size;

        qemu_build_assert(sizeof(size) == 16);
        if (kvm_copy_from_gva(cs, arg, &size, sizeof(size))) {
            err = -EFAULT;
            break;
        }

        err = xen_gnttab_query_size_op(&size);
        if (!err && kvm_copy_to_gva(cs, arg, &size, sizeof(size))) {
            err = -EFAULT;
        }
        break;
    }
    case GNTTABOP_setup_table:
    case GNTTABOP_copy:
    case GNTTABOP_map_grant_ref:
    case GNTTABOP_unmap_grant_ref:
    case GNTTABOP_swap_grant_ref:
        return false;

    default:
        /* Xen explicitly returns -ENOSYS to HVM guests for all others */
        err = -ENOSYS;
        break;
    }

    exit->u.hcall.result = err;
    return true;
}

static bool kvm_xen_hcall_physdev_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                     int cmd, uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    int err;

    switch (cmd) {
    case PHYSDEVOP_map_pirq: {
        struct physdev_map_pirq map;

        if (hypercall_compat32(exit->u.hcall.longmode)) {
            struct compat_physdev_map_pirq *map32 = (void *)&map;

            if (kvm_copy_from_gva(cs, arg, map32, sizeof(*map32))) {
                return -EFAULT;
            }

            /*
             * The only thing that's different is the alignment of the
             * uint64_t table_base at the end, which gets padding to make
             * it 64-bit aligned in the 64-bit version.
             */
            qemu_build_assert(sizeof(*map32) == 36);
            qemu_build_assert(offsetof(struct physdev_map_pirq, entry_nr) ==
                              offsetof(struct compat_physdev_map_pirq, entry_nr));
            memmove(&map.table_base, &map32->table_base, sizeof(map.table_base));
        } else {
            if (kvm_copy_from_gva(cs, arg, &map, sizeof(map))) {
                err = -EFAULT;
                break;
            }
        }
        err = xen_physdev_map_pirq(&map);
        /*
         * Since table_base is an IN parameter and won't be changed, just
         * copy the size of the compat structure back to the guest.
         */
        if (!err && kvm_copy_to_gva(cs, arg, &map,
                                    sizeof(struct compat_physdev_map_pirq))) {
            err = -EFAULT;
        }
        break;
    }
    case PHYSDEVOP_unmap_pirq: {
        struct physdev_unmap_pirq unmap;

        qemu_build_assert(sizeof(unmap) == 8);
        if (kvm_copy_from_gva(cs, arg, &unmap, sizeof(unmap))) {
            err = -EFAULT;
            break;
        }

        err = xen_physdev_unmap_pirq(&unmap);
        if (!err && kvm_copy_to_gva(cs, arg, &unmap, sizeof(unmap))) {
            err = -EFAULT;
        }
        break;
    }
    case PHYSDEVOP_eoi: {
        struct physdev_eoi eoi;

        qemu_build_assert(sizeof(eoi) == 4);
        if (kvm_copy_from_gva(cs, arg, &eoi, sizeof(eoi))) {
            err = -EFAULT;
            break;
        }

        err = xen_physdev_eoi_pirq(&eoi);
        if (!err && kvm_copy_to_gva(cs, arg, &eoi, sizeof(eoi))) {
            err = -EFAULT;
        }
        break;
    }
    case PHYSDEVOP_irq_status_query: {
        struct physdev_irq_status_query query;

        qemu_build_assert(sizeof(query) == 8);
        if (kvm_copy_from_gva(cs, arg, &query, sizeof(query))) {
            err = -EFAULT;
            break;
        }

        err = xen_physdev_query_pirq(&query);
        if (!err && kvm_copy_to_gva(cs, arg, &query, sizeof(query))) {
            err = -EFAULT;
        }
        break;
    }
    case PHYSDEVOP_get_free_pirq: {
        struct physdev_get_free_pirq get;

        qemu_build_assert(sizeof(get) == 8);
        if (kvm_copy_from_gva(cs, arg, &get, sizeof(get))) {
            err = -EFAULT;
            break;
        }

        err = xen_physdev_get_free_pirq(&get);
        if (!err && kvm_copy_to_gva(cs, arg, &get, sizeof(get))) {
            err = -EFAULT;
        }
        break;
    }
    case PHYSDEVOP_pirq_eoi_gmfn_v2: /* FreeBSD 13 makes this hypercall */
        err = -ENOSYS;
        break;

    default:
        return false;
    }

    exit->u.hcall.result = err;
    return true;
}

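/*
 * Central hypercall dispatch. Returns false if the hypercall is not
 * recognised at all, in which case the caller logs it as unimplemented
 * and fails it with -ENOSYS.
 */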
static bool do_kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit)
{
    uint16_t code = exit->u.hcall.input;

    if (exit->u.hcall.cpl > 0) {
        exit->u.hcall.result = -EPERM;
        return true;
    }

    switch (code) {
    case __HYPERVISOR_set_timer_op:
        if (exit->u.hcall.longmode) {
            return kvm_xen_hcall_set_timer_op(exit, cpu,
                                              exit->u.hcall.params[0]);
        } else {
            /* In 32-bit mode, the 64-bit timer value is in two args. */
            uint64_t val = ((uint64_t)exit->u.hcall.params[1]) << 32 |
                (uint32_t)exit->u.hcall.params[0];
            return kvm_xen_hcall_set_timer_op(exit, cpu, val);
        }
    case __HYPERVISOR_grant_table_op:
        return kvm_xen_hcall_gnttab_op(exit, cpu, exit->u.hcall.params[0],
                                       exit->u.hcall.params[1],
                                       exit->u.hcall.params[2]);
    case __HYPERVISOR_sched_op:
        return kvm_xen_hcall_sched_op(exit, cpu, exit->u.hcall.params[0],
                                      exit->u.hcall.params[1]);
    case __HYPERVISOR_event_channel_op:
        return kvm_xen_hcall_evtchn_op(exit, cpu, exit->u.hcall.params[0],
                                       exit->u.hcall.params[1]);
    case __HYPERVISOR_vcpu_op:
        return kvm_xen_hcall_vcpu_op(exit, cpu,
                                     exit->u.hcall.params[0],
                                     exit->u.hcall.params[1],
                                     exit->u.hcall.params[2]);
    case __HYPERVISOR_hvm_op:
        return kvm_xen_hcall_hvm_op(exit, cpu, exit->u.hcall.params[0],
                                    exit->u.hcall.params[1]);
    case __HYPERVISOR_memory_op:
        return kvm_xen_hcall_memory_op(exit, cpu, exit->u.hcall.params[0],
                                       exit->u.hcall.params[1]);
    case __HYPERVISOR_physdev_op:
        return kvm_xen_hcall_physdev_op(exit, cpu, exit->u.hcall.params[0],
                                        exit->u.hcall.params[1]);
    case __HYPERVISOR_xen_version:
        return kvm_xen_hcall_xen_version(exit, cpu, exit->u.hcall.params[0],
                                         exit->u.hcall.params[1]);
    default:
        return false;
    }
}

int kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit)
{
    if (exit->type != KVM_EXIT_XEN_HCALL) {
        return -1;
    }

    /*
     * The kernel latches the guest 32/64 mode when the MSR is used to fill
     * the hypercall page. So if we see a hypercall in a mode that doesn't
     * match our own idea of the guest mode, fetch the kernel's idea of the
     * "long mode" to remain in sync.
     */
    if (exit->u.hcall.longmode != xen_is_long_mode()) {
        xen_sync_long_mode();
    }

    if (!do_kvm_xen_handle_exit(cpu, exit)) {
        /*
         * Some hypercalls will be deliberately "implemented" by returning
         * -ENOSYS. This case is for hypercalls which are unexpected.
         */
        exit->u.hcall.result = -ENOSYS;
        qemu_log_mask(LOG_UNIMP, "Unimplemented Xen hypercall %"
                      PRId64 " (0x%" PRIx64 " 0x%" PRIx64 " 0x%" PRIx64 ")\n",
                      (uint64_t)exit->u.hcall.input,
                      (uint64_t)exit->u.hcall.params[0],
                      (uint64_t)exit->u.hcall.params[1],
                      (uint64_t)exit->u.hcall.params[2]);
    }

    trace_kvm_xen_hypercall(CPU(cpu)->cpu_index, exit->u.hcall.cpl,
                            exit->u.hcall.input, exit->u.hcall.params[0],
                            exit->u.hcall.params[1], exit->u.hcall.params[2],
                            exit->u.hcall.result);
    return 0;
}

uint16_t kvm_xen_get_gnttab_max_frames(void)
{
    KVMState *s = KVM_STATE(current_accel());
    return s->xen_gnttab_max_frames;
}

uint16_t kvm_xen_get_evtchn_max_pirq(void)
{
    KVMState *s = KVM_STATE(current_accel());
    return s->xen_evtchn_max_pirq;
}

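/*
 * Migration support: kvm_put_xen_state() pushes the QEMU-side Xen vCPU
 * state back into the kernel, and kvm_get_xen_state() reads back the
 * state (such as the singleshot timer deadline) that the kernel owns.
 */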
int kvm_put_xen_state(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    uint64_t gpa;
    int ret;

    gpa = env->xen_vcpu_info_gpa;
    if (gpa == INVALID_GPA) {
        gpa = env->xen_vcpu_info_default_gpa;
    }

    if (gpa != INVALID_GPA) {
        ret = set_vcpu_info(cs, gpa);
        if (ret < 0) {
            return ret;
        }
    }

    gpa = env->xen_vcpu_time_info_gpa;
    if (gpa != INVALID_GPA) {
        ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
                                    gpa);
        if (ret < 0) {
            return ret;
        }
    }

    gpa = env->xen_vcpu_runstate_gpa;
    if (gpa != INVALID_GPA) {
        ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
                                    gpa);
        if (ret < 0) {
            return ret;
        }
    }

    if (env->xen_periodic_timer_period) {
        ret = do_set_periodic_timer(cs, env->xen_periodic_timer_period);
        if (ret < 0) {
            return ret;
        }
    }

    if (!kvm_xen_has_cap(EVTCHN_SEND)) {
        /*
         * If the kernel has EVTCHN_SEND support then it handles timers too,
         * so the timer will be restored by kvm_xen_set_vcpu_timer() below.
         */
        if (env->xen_singleshot_timer_ns) {
            ret = do_set_singleshot_timer(cs, env->xen_singleshot_timer_ns,
                                          false, false);
            if (ret < 0) {
                return ret;
            }
        }
        return 0;
    }

    if (env->xen_vcpu_callback_vector) {
        ret = kvm_xen_set_vcpu_callback_vector(cs);
        if (ret < 0) {
            return ret;
        }
    }

    if (env->xen_virq[VIRQ_TIMER]) {
        ret = kvm_xen_set_vcpu_timer(cs);
        if (ret < 0) {
            return ret;
        }
    }
    return 0;
}

int kvm_get_xen_state(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    uint64_t gpa;
    int ret;

    /*
     * The kernel does not mark vcpu_info as dirty when it delivers interrupts
     * to it. It's up to userspace to *assume* that any page shared thus is
     * always considered dirty. The shared_info page is different since it's
     * an overlay and migrated separately anyway.
     */
    gpa = env->xen_vcpu_info_gpa;
    if (gpa == INVALID_GPA) {
        gpa = env->xen_vcpu_info_default_gpa;
    }
    if (gpa != INVALID_GPA) {
        MemoryRegionSection mrs = memory_region_find(get_system_memory(),
                                                     gpa,
                                                     sizeof(struct vcpu_info));
        if (mrs.mr &&
            !int128_lt(mrs.size, int128_make64(sizeof(struct vcpu_info)))) {
            memory_region_set_dirty(mrs.mr, mrs.offset_within_region,
                                    sizeof(struct vcpu_info));
        }
    }

    if (!kvm_xen_has_cap(EVTCHN_SEND)) {
        return 0;
    }

    /*
     * If the kernel is accelerating timers, read out the current value of the
     * singleshot timer deadline.
     */
    if (env->xen_virq[VIRQ_TIMER]) {
        struct kvm_xen_vcpu_attr va = {
            .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
        };
        ret = kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_GET_ATTR, &va);
        if (ret < 0) {
            return ret;
        }
        env->xen_singleshot_timer_ns = va.u.timer.expires_ns;
    }

    return 0;
}