// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright © 2019 Oracle and/or its affiliates. All rights reserved.
 * Copyright © 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * KVM Xen emulation
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include "x86.h"
#include "xen.h"
#include "hyperv.h"
#include "lapic.h"

#include <linux/eventfd.h>
#include <linux/kvm_host.h>
#include <linux/sched/stat.h>

#include <trace/events/kvm.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>
#include <xen/interface/version.h>
#include <xen/interface/event_channel.h>
#include <xen/interface/sched.h>

#include <asm/xen/cpuid.h>
#include <asm/pvclock.h>
static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm);
static int kvm_xen_setattr_evtchn(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r);

DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ);
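/*
 * A short orientation note (a sketch, not copied from the original source):
 * kvm_xen_enabled is a deferred static key. kvm_xen_hvm_config() below
 * increments it when userspace configures the Xen hypercall MSR and drops it
 * with an HZ grace period when the MSR is cleared, so repeatedly toggling the
 * configuration does not thrash the static-branch patching machinery. The
 * consumers of the key (helpers such as kvm_xen_hypercall_enabled(), used
 * later in this file) live in the accompanying xen.h.
 */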
static int kvm_xen_shared_info_init(struct kvm *kvm)
{
	struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
	struct pvclock_wall_clock *wc;
	u32 *wc_sec_hi;
	u32 wc_version;
	u64 wall_nsec;
	int ret = 0;
	int idx = srcu_read_lock(&kvm->srcu);

	read_lock_irq(&gpc->lock);
	while (!kvm_gpc_check(gpc, PAGE_SIZE)) {
		read_unlock_irq(&gpc->lock);

		ret = kvm_gpc_refresh(gpc, PAGE_SIZE);
		if (ret)
			goto out;

		read_lock_irq(&gpc->lock);
	}

	/*
	 * This code mirrors kvm_write_wall_clock() except that it writes
	 * directly through the pfn cache and doesn't mark the page dirty.
	 */
	wall_nsec = kvm_get_wall_clock_epoch(kvm);

	/* Paranoia checks on the 32-bit struct layout */
	BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900);
	BUILD_BUG_ON(offsetof(struct compat_shared_info, arch.wc_sec_hi) != 0x924);
	BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);

#ifdef CONFIG_X86_64
	/* Paranoia checks on the 64-bit struct layout */
	BUILD_BUG_ON(offsetof(struct shared_info, wc) != 0xc00);
	BUILD_BUG_ON(offsetof(struct shared_info, wc_sec_hi) != 0xc0c);

	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
		struct shared_info *shinfo = gpc->khva;

		wc_sec_hi = &shinfo->wc_sec_hi;
		wc = &shinfo->wc;
	} else
#endif
	{
		struct compat_shared_info *shinfo = gpc->khva;

		wc_sec_hi = &shinfo->arch.wc_sec_hi;
		wc = &shinfo->wc;
	}

	/* Increment and ensure an odd value */
	wc_version = wc->version = (wc->version + 1) | 1;
	smp_wmb();

	wc->nsec = do_div(wall_nsec, NSEC_PER_SEC);
	wc->sec = (u32)wall_nsec;
	*wc_sec_hi = wall_nsec >> 32;
	smp_wmb();

	wc->version = wc_version + 1;
	read_unlock_irq(&gpc->lock);

	kvm_make_all_cpus_request(kvm, KVM_REQ_MASTERCLOCK_UPDATE);

out:
	srcu_read_unlock(&kvm->srcu, idx);
	return ret;
}
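/*
 * The wc->version handling above follows the usual Xen/pvclock seqcount
 * convention: the writer makes the version odd before touching the wallclock
 * fields and bumps it back to an even value afterwards. A guest reader is
 * expected to loop along these lines (a minimal sketch of the *guest* side,
 * not code from this file):
 *
 *	do {
 *		version = wc->version;
 *		rmb();
 *		sec  = wc->sec;
 *		nsec = wc->nsec;
 *		rmb();
 *	} while ((version & 1) || version != wc->version);
 */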
void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu)
{
	if (atomic_read(&vcpu->arch.xen.timer_pending) > 0) {
		struct kvm_xen_evtchn e;

		e.vcpu_id = vcpu->vcpu_id;
		e.vcpu_idx = vcpu->vcpu_idx;
		e.port = vcpu->arch.xen.timer_virq;
		e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;

		kvm_xen_set_evtchn(&e, vcpu->kvm);

		vcpu->arch.xen.timer_expires = 0;
		atomic_set(&vcpu->arch.xen.timer_pending, 0);
	}
}
static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
{
	struct kvm_vcpu *vcpu = container_of(timer, struct kvm_vcpu,
					     arch.xen.timer);
	struct kvm_xen_evtchn e;
	int rc;

	if (atomic_read(&vcpu->arch.xen.timer_pending))
		return HRTIMER_NORESTART;

	e.vcpu_id = vcpu->vcpu_id;
	e.vcpu_idx = vcpu->vcpu_idx;
	e.port = vcpu->arch.xen.timer_virq;
	e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;

	rc = kvm_xen_set_evtchn_fast(&e, vcpu->kvm);
	if (rc != -EWOULDBLOCK) {
		vcpu->arch.xen.timer_expires = 0;
		return HRTIMER_NORESTART;
	}

	atomic_inc(&vcpu->arch.xen.timer_pending);
	kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
	kvm_vcpu_kick(vcpu);

	return HRTIMER_NORESTART;
}
static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs,
				bool linux_wa)
{
	int64_t kernel_now, delta;
	uint64_t guest_now;

	/*
	 * The guest provides the requested timeout in absolute nanoseconds
	 * of the KVM clock — as *it* sees it, based on the scaled TSC and
	 * the pvclock information provided by KVM.
	 *
	 * The kernel doesn't support hrtimers based on CLOCK_MONOTONIC_RAW
	 * so use CLOCK_MONOTONIC. In the timescales covered by timers, the
	 * difference won't matter much as there is no cumulative effect.
	 *
	 * Calculate the time for some arbitrary point in time around "now"
	 * in terms of both kvmclock and CLOCK_MONOTONIC. Calculate the
	 * delta between the kvmclock "now" value and the guest's requested
	 * timeout, apply the "Linux workaround" described below, and add
	 * the resulting delta to the CLOCK_MONOTONIC "now" value, to get
	 * the absolute CLOCK_MONOTONIC time at which the timer should
	 * fire.
	 */
	if (vcpu->arch.hv_clock.version && vcpu->kvm->arch.use_master_clock &&
	    static_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
		uint64_t host_tsc, guest_tsc;

		if (!IS_ENABLED(CONFIG_64BIT) ||
		    !kvm_get_monotonic_and_clockread(&kernel_now, &host_tsc)) {
			/*
			 * Don't fall back to get_kvmclock_ns() because it's
			 * broken; it has a systemic error in its results
			 * because it scales directly from host TSC to
			 * nanoseconds, and doesn't scale first to guest TSC
			 * and *then* to nanoseconds as the guest does.
			 *
			 * There is a small error introduced here because time
			 * continues to elapse between the ktime_get() and the
			 * subsequent rdtsc(). But not the systemic drift due
			 * to get_kvmclock_ns().
			 */
			kernel_now = ktime_get(); /* This is CLOCK_MONOTONIC */
			host_tsc = rdtsc();
		}

		/* Calculate the guest kvmclock as the guest would do it. */
		guest_tsc = kvm_read_l1_tsc(vcpu, host_tsc);
		guest_now = __pvclock_read_cycles(&vcpu->arch.hv_clock,
						  guest_tsc);
	} else {
		/*
		 * Without CONSTANT_TSC, get_kvmclock_ns() is the only option.
		 *
		 * Also if the guest PV clock hasn't been set up yet, as is
		 * likely to be the case during migration when the vCPU has
		 * not been run yet. It would be possible to calculate the
		 * scaling factors properly in that case but there's not much
		 * point in doing so. The get_kvmclock_ns() drift accumulates
		 * over time, so it's OK to use it at startup. Besides, on
		 * migration there's going to be a little bit of skew in the
		 * precise moment at which timers fire anyway. Often they'll
		 * be in the "past" by the time the VM is running again after
		 * migration.
		 */
		guest_now = get_kvmclock_ns(vcpu->kvm);
		kernel_now = ktime_get();
	}

	delta = guest_abs - guest_now;

	/*
	 * Xen has a 'Linux workaround' in do_set_timer_op() which checks for
	 * negative absolute timeout values (caused by integer overflow), and
	 * for values about 13 days in the future (2^50ns) which would be
	 * caused by jiffies overflow. For those cases, Xen sets the timeout
	 * 100ms in the future (not *too* soon, since if a guest really did
	 * set a long timeout on purpose we don't want to keep churning CPU
	 * time by waking it up). Emulate Xen's workaround when starting the
	 * timer in response to __HYPERVISOR_set_timer_op.
	 */
	if (linux_wa &&
	    unlikely((int64_t)guest_abs < 0 ||
		     (delta > 0 && (uint32_t) (delta >> 50) != 0))) {
		delta = 100 * NSEC_PER_MSEC;
		guest_abs = guest_now + delta;
	}

	/*
	 * Avoid races with the old timer firing. Checking timer_expires
	 * to avoid calling hrtimer_cancel() will only have false positives
	 * so is fine.
	 */
	if (vcpu->arch.xen.timer_expires)
		hrtimer_cancel(&vcpu->arch.xen.timer);

	atomic_set(&vcpu->arch.xen.timer_pending, 0);
	vcpu->arch.xen.timer_expires = guest_abs;

	if (delta <= 0)
		xen_timer_callback(&vcpu->arch.xen.timer);
	else
		hrtimer_start(&vcpu->arch.xen.timer,
			      ktime_add_ns(kernel_now, delta),
			      HRTIMER_MODE_ABS_HARD);
}
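/*
 * Summarising the arithmetic in kvm_xen_start_timer(): both guest_abs and
 * guest_now are in kvmclock nanoseconds, so
 *
 *	delta  = guest_abs - guest_now;
 *	expiry = kernel_now + delta;	(kernel_now is CLOCK_MONOTONIC)
 *
 * e.g. a guest asking for an absolute timeout 250000 ns past its current
 * kvmclock reading gets an hrtimer armed 250000 ns past the corresponding
 * CLOCK_MONOTONIC timestamp. (Illustrative numbers only.)
 */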
static void kvm_xen_stop_timer(struct kvm_vcpu *vcpu)
{
	hrtimer_cancel(&vcpu->arch.xen.timer);
	vcpu->arch.xen.timer_expires = 0;
	atomic_set(&vcpu->arch.xen.timer_pending, 0);
}
static void kvm_xen_init_timer(struct kvm_vcpu *vcpu)
{
	hrtimer_init(&vcpu->arch.xen.timer, CLOCK_MONOTONIC,
		     HRTIMER_MODE_ABS_HARD);
	vcpu->arch.xen.timer.function = xen_timer_callback;
}
static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
{
	struct kvm_vcpu_xen *vx = &v->arch.xen;
	struct gfn_to_pfn_cache *gpc1 = &vx->runstate_cache;
	struct gfn_to_pfn_cache *gpc2 = &vx->runstate2_cache;
	size_t user_len, user_len1, user_len2;
	struct vcpu_runstate_info rs;
	unsigned long flags;
	size_t times_ofs;
	uint8_t *update_bit = NULL;
	uint64_t entry_time;
	uint64_t *rs_times;
	int *rs_state;

	/*
	 * The only difference between 32-bit and 64-bit versions of the
	 * runstate struct is the alignment of uint64_t in 32-bit, which
	 * means that the 64-bit version has an additional 4 bytes of
	 * padding after the first field 'state'. Let's be really really
	 * paranoid about that, and matching it with our internal data
	 * structures that we memcpy into it...
	 */
	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != 0);
	BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != 0);
	BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c);
#ifdef CONFIG_X86_64
	/*
	 * The 64-bit structure has 4 bytes of padding before 'state_entry_time'
	 * so each subsequent field is shifted by 4, and it's 4 bytes longer.
	 */
	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
		     offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4);
	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) !=
		     offsetof(struct compat_vcpu_runstate_info, time) + 4);
	BUILD_BUG_ON(sizeof(struct vcpu_runstate_info) != 0x2c + 4);
#endif
	/*
	 * The state field is in the same place at the start of both structs,
	 * and is the same size (int) as vx->current_runstate.
	 */
	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) !=
		     offsetof(struct compat_vcpu_runstate_info, state));
	BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state) !=
		     sizeof(vx->current_runstate));
	BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) !=
		     sizeof(vx->current_runstate));

	/*
	 * The state_entry_time field is 64 bits in both versions, and the
	 * XEN_RUNSTATE_UPDATE flag is in the top bit, which given that x86
	 * is little-endian means that it's in the last *byte* of the word.
	 * That detail is important later.
	 */
	BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) !=
		     sizeof(uint64_t));
	BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) !=
		     sizeof(uint64_t));
	BUILD_BUG_ON((XEN_RUNSTATE_UPDATE >> 56) != 0x80);

	/*
	 * The time array is four 64-bit quantities in both versions, matching
	 * the vx->runstate_times and immediately following state_entry_time.
	 */
	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
		     offsetof(struct vcpu_runstate_info, time) - sizeof(uint64_t));
	BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state_entry_time) !=
		     offsetof(struct compat_vcpu_runstate_info, time) - sizeof(uint64_t));
	BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
		     sizeof_field(struct compat_vcpu_runstate_info, time));
	BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
		     sizeof(vx->runstate_times));

	if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
		user_len = sizeof(struct vcpu_runstate_info);
		times_ofs = offsetof(struct vcpu_runstate_info,
				     state_entry_time);
	} else {
		user_len = sizeof(struct compat_vcpu_runstate_info);
		times_ofs = offsetof(struct compat_vcpu_runstate_info,
				     state_entry_time);
	}

	/*
	 * There are basically no alignment constraints. The guest can set it
	 * up so it crosses from one page to the next, and at arbitrary byte
	 * alignment (and the 32-bit ABI doesn't align the 64-bit integers
	 * anyway, even if the overall struct had been 64-bit aligned).
	 */
	if ((gpc1->gpa & ~PAGE_MASK) + user_len >= PAGE_SIZE) {
		user_len1 = PAGE_SIZE - (gpc1->gpa & ~PAGE_MASK);
		user_len2 = user_len - user_len1;
	} else {
		user_len1 = user_len;
		user_len2 = 0;
	}
	BUG_ON(user_len1 + user_len2 != user_len);

 retry:
	/*
	 * Attempt to obtain the GPC lock on *both* (if there are two)
	 * gfn_to_pfn caches that cover the region.
	 */
	if (atomic) {
		local_irq_save(flags);
		if (!read_trylock(&gpc1->lock)) {
			local_irq_restore(flags);
			return;
		}
	} else {
		read_lock_irqsave(&gpc1->lock, flags);
	}
	while (!kvm_gpc_check(gpc1, user_len1)) {
		read_unlock_irqrestore(&gpc1->lock, flags);

		/* When invoked from kvm_sched_out() we cannot sleep */
		if (atomic)
			return;

		if (kvm_gpc_refresh(gpc1, user_len1))
			return;

		read_lock_irqsave(&gpc1->lock, flags);
	}

	if (likely(!user_len2)) {
		/*
		 * Set up three pointers directly to the runstate_info
		 * struct in the guest (via the GPC).
		 *
		 *  • @rs_state   → state field
		 *  • @rs_times   → state_entry_time field.
		 *  • @update_bit → last byte of state_entry_time, which
		 *                  contains the XEN_RUNSTATE_UPDATE bit.
		 */
		rs_state = gpc1->khva;
		rs_times = gpc1->khva + times_ofs;
		if (v->kvm->arch.xen.runstate_update_flag)
			update_bit = ((void *)(&rs_times[1])) - 1;
	} else {
		/*
		 * The guest's runstate_info is split across two pages and we
		 * need to hold and validate both GPCs simultaneously. We can
		 * declare a lock ordering GPC1 > GPC2 because nothing else
		 * takes them more than one at a time. Set a subclass on the
		 * gpc1 lock to make lockdep shut up about it.
		 */
		lock_set_subclass(&gpc1->lock.dep_map, 1, _THIS_IP_);
		if (atomic) {
			if (!read_trylock(&gpc2->lock)) {
				read_unlock_irqrestore(&gpc1->lock, flags);
				return;
			}
		} else {
			read_lock(&gpc2->lock);
		}

		if (!kvm_gpc_check(gpc2, user_len2)) {
			read_unlock(&gpc2->lock);
			read_unlock_irqrestore(&gpc1->lock, flags);

			/* When invoked from kvm_sched_out() we cannot sleep */
			if (atomic)
				return;

			/*
			 * Use kvm_gpc_activate() here because if the runstate
			 * area was configured in 32-bit mode and only extends
			 * to the second page now because the guest changed to
			 * 64-bit mode, the second GPC won't have been set up.
			 */
			if (kvm_gpc_activate(gpc2, gpc1->gpa + user_len1,
					     user_len2))
				return;

			/*
			 * We dropped the lock on GPC1 so we have to go all the
			 * way back and revalidate that too.
			 */
			goto retry;
		}

		/*
		 * In this case, the runstate_info struct will be assembled on
		 * the kernel stack (compat or not as appropriate) and will
		 * be copied to GPC1/GPC2 with a dual memcpy. Set up the three
		 * rs pointers accordingly.
		 */
		rs_times = &rs.state_entry_time;

		/*
		 * The rs_state pointer points to the start of what we'll
		 * copy to the guest, which in the case of a compat guest
		 * is the 32-bit field that the compiler thinks is padding.
		 */
		rs_state = ((void *)rs_times) - times_ofs;

		/*
		 * The update_bit is still directly in the guest memory,
		 * via one GPC or the other.
		 */
		if (v->kvm->arch.xen.runstate_update_flag) {
			if (user_len1 >= times_ofs + sizeof(uint64_t))
				update_bit = gpc1->khva + times_ofs +
					sizeof(uint64_t) - 1;
			else
				update_bit = gpc2->khva + times_ofs +
					sizeof(uint64_t) - 1 - user_len1;
		}

#ifdef CONFIG_X86_64
		/*
		 * Don't leak kernel memory through the padding in the 64-bit
		 * version of the struct.
		 */
		memset(&rs, 0, offsetof(struct vcpu_runstate_info, state_entry_time));
#endif
	}

	/*
	 * First, set the XEN_RUNSTATE_UPDATE bit in the top bit of the
	 * state_entry_time field, directly in the guest. We need to set
	 * that (and write-barrier) before writing to the rest of the
	 * structure, and clear it last. Just as Xen does, we address the
	 * single *byte* in which it resides because it might be in a
	 * different cache line to the rest of the 64-bit word, due to
	 * the (lack of) alignment constraints.
	 */
	entry_time = vx->runstate_entry_time;
	if (update_bit) {
		entry_time |= XEN_RUNSTATE_UPDATE;
		*update_bit = (vx->runstate_entry_time | XEN_RUNSTATE_UPDATE) >> 56;
		smp_wmb();
	}

	/*
	 * Now assemble the actual structure, either on our kernel stack
	 * or directly in the guest according to how the rs_state and
	 * rs_times pointers were set up above.
	 */
	*rs_state = vx->current_runstate;
	rs_times[0] = entry_time;
	memcpy(rs_times + 1, vx->runstate_times, sizeof(vx->runstate_times));

	/* For the split case, we have to then copy it to the guest. */
	if (user_len2) {
		memcpy(gpc1->khva, rs_state, user_len1);
		memcpy(gpc2->khva, ((void *)rs_state) + user_len1, user_len2);
	}
	smp_wmb();

	/* Finally, clear the XEN_RUNSTATE_UPDATE bit. */
	if (update_bit) {
		entry_time &= ~XEN_RUNSTATE_UPDATE;
		*update_bit = entry_time >> 56;
		smp_wmb();
	}

	if (user_len2) {
		kvm_gpc_mark_dirty_in_slot(gpc2);
		read_unlock(&gpc2->lock);
	}

	kvm_gpc_mark_dirty_in_slot(gpc1);
	read_unlock_irqrestore(&gpc1->lock, flags);
}
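/*
 * For reference, the two guest-visible runstate layouts that the checks in
 * kvm_xen_update_runstate_guest() pin down (offsets in bytes, derived from
 * the BUILD_BUG_ON()s above rather than copied from the Xen headers):
 *
 *	field			compat (0x2c total)	64-bit (0x30 total)
 *	state			0x00			0x00
 *	(padding)		-			0x04
 *	state_entry_time	0x04			0x08
 *	time[4]			0x0c			0x10
 *
 * The XEN_RUNSTATE_UPDATE flag lives in the top bit of state_entry_time,
 * i.e. the last byte of that word on little-endian x86, which is why the
 * code above pokes a single byte through @update_bit.
 */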
void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
{
	struct kvm_vcpu_xen *vx = &v->arch.xen;
	u64 now = get_kvmclock_ns(v->kvm);
	u64 delta_ns = now - vx->runstate_entry_time;
	u64 run_delay = current->sched_info.run_delay;

	if (unlikely(!vx->runstate_entry_time))
		vx->current_runstate = RUNSTATE_offline;

	/*
	 * Time waiting for the scheduler isn't "stolen" if the
	 * vCPU wasn't running anyway.
	 */
	if (vx->current_runstate == RUNSTATE_running) {
		u64 steal_ns = run_delay - vx->last_steal;

		delta_ns -= steal_ns;

		vx->runstate_times[RUNSTATE_runnable] += steal_ns;
	}
	vx->last_steal = run_delay;

	vx->runstate_times[vx->current_runstate] += delta_ns;
	vx->current_runstate = state;
	vx->runstate_entry_time = now;

	if (vx->runstate_cache.active)
		kvm_xen_update_runstate_guest(v, state == RUNSTATE_runnable);
}
void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
{
	struct kvm_lapic_irq irq = { };

	irq.dest_id = v->vcpu_id;
	irq.vector = v->arch.xen.upcall_vector;
	irq.dest_mode = APIC_DEST_PHYSICAL;
	irq.shorthand = APIC_DEST_NOSHORT;
	irq.delivery_mode = APIC_DM_FIXED;
	irq.level = 1;

	kvm_irq_delivery_to_apic(v->kvm, NULL, &irq, NULL);
}
/*
 * On event channel delivery, the vcpu_info may not have been accessible.
 * In that case, there are bits in vcpu->arch.xen.evtchn_pending_sel which
 * need to be marked into the vcpu_info (and evtchn_upcall_pending set).
 * Do so now that we can sleep in the context of the vCPU to bring the
 * page in, and refresh the pfn cache for it.
 */
void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
{
	unsigned long evtchn_pending_sel = READ_ONCE(v->arch.xen.evtchn_pending_sel);
	struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache;
	unsigned long flags;

	if (!evtchn_pending_sel)
		return;

	/*
	 * Yes, this is an open-coded loop. But that's just what put_user()
	 * does anyway. Page it in and retry the instruction. We're just a
	 * little more honest about it.
	 */
	read_lock_irqsave(&gpc->lock, flags);
	while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
		read_unlock_irqrestore(&gpc->lock, flags);

		if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info)))
			return;

		read_lock_irqsave(&gpc->lock, flags);
	}

	/* Now gpc->khva is a valid kernel address for the vcpu_info */
	if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
		struct vcpu_info *vi = gpc->khva;

		asm volatile(LOCK_PREFIX "orq %0, %1\n"
			     "notq %0\n"
			     LOCK_PREFIX "andq %0, %2\n"
			     : "=r" (evtchn_pending_sel),
			       "+m" (vi->evtchn_pending_sel),
			       "+m" (v->arch.xen.evtchn_pending_sel)
			     : "0" (evtchn_pending_sel));
		WRITE_ONCE(vi->evtchn_upcall_pending, 1);
	} else {
		u32 evtchn_pending_sel32 = evtchn_pending_sel;
		struct compat_vcpu_info *vi = gpc->khva;

		asm volatile(LOCK_PREFIX "orl %0, %1\n"
			     "notl %0\n"
			     LOCK_PREFIX "andl %0, %2\n"
			     : "=r" (evtchn_pending_sel32),
			       "+m" (vi->evtchn_pending_sel),
			       "+m" (v->arch.xen.evtchn_pending_sel)
			     : "0" (evtchn_pending_sel32));
		WRITE_ONCE(vi->evtchn_upcall_pending, 1);
	}

	kvm_gpc_mark_dirty_in_slot(gpc);
	read_unlock_irqrestore(&gpc->lock, flags);

	/* For the per-vCPU lapic vector, deliver it as MSI. */
	if (v->arch.xen.upcall_vector)
		kvm_xen_inject_vcpu_vector(v);
}
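/*
 * The locked OR / NOT / AND sequence above is, in effect (a sketch only,
 * written for the 64-bit layout):
 *
 *	vi->evtchn_pending_sel         |=  pending;	// publish to the guest
 *	v->arch.xen.evtchn_pending_sel &= ~pending;	// drop what we published
 *
 * done with locked instructions so that bits set concurrently in the
 * in-kernel shadow word by kvm_xen_set_evtchn_fast() are never lost.
 */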
int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
{
	struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache;
	unsigned long flags;
	u8 rc = 0;

	/*
	 * If the global upcall vector (HVMIRQ_callback_vector) is set and
	 * the vCPU's evtchn_upcall_pending flag is set, the IRQ is pending.
	 */

	/* No need for compat handling here */
	BUILD_BUG_ON(offsetof(struct vcpu_info, evtchn_upcall_pending) !=
		     offsetof(struct compat_vcpu_info, evtchn_upcall_pending));
	BUILD_BUG_ON(sizeof(rc) !=
		     sizeof_field(struct vcpu_info, evtchn_upcall_pending));
	BUILD_BUG_ON(sizeof(rc) !=
		     sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending));

	read_lock_irqsave(&gpc->lock, flags);
	while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
		read_unlock_irqrestore(&gpc->lock, flags);

		/*
		 * This function gets called from kvm_vcpu_block() after setting the
		 * task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately
		 * from a HLT. So we really mustn't sleep. If the page ended up absent
		 * at that point, just return 1 in order to trigger an immediate wake,
		 * and we'll end up getting called again from a context where we *can*
		 * fault in the page and wait for it.
		 */
		if (in_atomic() || !task_is_running(current))
			return 1;

		if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info))) {
			/*
			 * If this failed, userspace has screwed up the
			 * vcpu_info mapping. No interrupts for you.
			 */
			return 0;
		}
		read_lock_irqsave(&gpc->lock, flags);
	}

	rc = ((struct vcpu_info *)gpc->khva)->evtchn_upcall_pending;
	read_unlock_irqrestore(&gpc->lock, flags);
	return rc;
}
int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
{
	int r = -ENOENT;

	switch (data->type) {
	case KVM_XEN_ATTR_TYPE_LONG_MODE:
		if (!IS_ENABLED(CONFIG_64BIT) && data->u.long_mode) {
			r = -EINVAL;
		} else {
			mutex_lock(&kvm->arch.xen.xen_lock);
			kvm->arch.xen.long_mode = !!data->u.long_mode;

			/*
			 * Re-initialize shared_info to put the wallclock in the
			 * correct place. Whilst it's not necessary to do this
			 * unless the mode is actually changed, it does no harm
			 * to make the call anyway.
			 */
			r = kvm->arch.xen.shinfo_cache.active ?
				kvm_xen_shared_info_init(kvm) : 0;
			mutex_unlock(&kvm->arch.xen.xen_lock);
		}
		break;

	case KVM_XEN_ATTR_TYPE_SHARED_INFO:
	case KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA: {
		int idx;

		mutex_lock(&kvm->arch.xen.xen_lock);

		idx = srcu_read_lock(&kvm->srcu);

		if (data->type == KVM_XEN_ATTR_TYPE_SHARED_INFO) {
			gfn_t gfn = data->u.shared_info.gfn;

			if (gfn == KVM_XEN_INVALID_GFN) {
				kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache);
				r = 0;
			} else {
				r = kvm_gpc_activate(&kvm->arch.xen.shinfo_cache,
						     gfn_to_gpa(gfn), PAGE_SIZE);
			}
		} else {
			void __user * hva = u64_to_user_ptr(data->u.shared_info.hva);

			if (!PAGE_ALIGNED(hva)) {
				r = -EINVAL;
			} else if (!hva) {
				kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache);
				r = 0;
			} else {
				r = kvm_gpc_activate_hva(&kvm->arch.xen.shinfo_cache,
							 (unsigned long)hva, PAGE_SIZE);
			}
		}

		srcu_read_unlock(&kvm->srcu, idx);

		if (!r && kvm->arch.xen.shinfo_cache.active)
			r = kvm_xen_shared_info_init(kvm);

		mutex_unlock(&kvm->arch.xen.xen_lock);
		break;
	}
	case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
		if (data->u.vector && data->u.vector < 0x10)
			r = -EINVAL;
		else {
			mutex_lock(&kvm->arch.xen.xen_lock);
			kvm->arch.xen.upcall_vector = data->u.vector;
			mutex_unlock(&kvm->arch.xen.xen_lock);
			r = 0;
		}
		break;

	case KVM_XEN_ATTR_TYPE_EVTCHN:
		r = kvm_xen_setattr_evtchn(kvm, data);
		break;

	case KVM_XEN_ATTR_TYPE_XEN_VERSION:
		mutex_lock(&kvm->arch.xen.xen_lock);
		kvm->arch.xen.xen_version = data->u.xen_version;
		mutex_unlock(&kvm->arch.xen.xen_lock);
		r = 0;
		break;

	case KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG:
		if (!sched_info_on()) {
			r = -EOPNOTSUPP;
			break;
		}
		mutex_lock(&kvm->arch.xen.xen_lock);
		kvm->arch.xen.runstate_update_flag = !!data->u.runstate_update_flag;
		mutex_unlock(&kvm->arch.xen.xen_lock);
		r = 0;
		break;

	default:
		break;
	}

	return r;
}
int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
{
	int r = -ENOENT;

	mutex_lock(&kvm->arch.xen.xen_lock);

	switch (data->type) {
	case KVM_XEN_ATTR_TYPE_LONG_MODE:
		data->u.long_mode = kvm->arch.xen.long_mode;
		r = 0;
		break;

	case KVM_XEN_ATTR_TYPE_SHARED_INFO:
		if (kvm_gpc_is_gpa_active(&kvm->arch.xen.shinfo_cache))
			data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa);
		else
			data->u.shared_info.gfn = KVM_XEN_INVALID_GFN;
		r = 0;
		break;

	case KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA:
		if (kvm_gpc_is_hva_active(&kvm->arch.xen.shinfo_cache))
			data->u.shared_info.hva = kvm->arch.xen.shinfo_cache.uhva;
		else
			data->u.shared_info.hva = 0;
		r = 0;
		break;

	case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
		data->u.vector = kvm->arch.xen.upcall_vector;
		r = 0;
		break;

	case KVM_XEN_ATTR_TYPE_XEN_VERSION:
		data->u.xen_version = kvm->arch.xen.xen_version;
		r = 0;
		break;

	case KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG:
		if (!sched_info_on()) {
			r = -EOPNOTSUPP;
			break;
		}
		data->u.runstate_update_flag = kvm->arch.xen.runstate_update_flag;
		r = 0;
		break;

	default:
		break;
	}

	mutex_unlock(&kvm->arch.xen.xen_lock);
	return r;
}
int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
{
	int idx, r = -ENOENT;

	mutex_lock(&vcpu->kvm->arch.xen.xen_lock);
	idx = srcu_read_lock(&vcpu->kvm->srcu);

	switch (data->type) {
	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA:
		/* No compat necessary here. */
		BUILD_BUG_ON(sizeof(struct vcpu_info) !=
			     sizeof(struct compat_vcpu_info));
		BUILD_BUG_ON(offsetof(struct vcpu_info, time) !=
			     offsetof(struct compat_vcpu_info, time));

		if (data->type == KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO) {
			if (data->u.gpa == KVM_XEN_INVALID_GPA) {
				kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
				r = 0;
				break;
			}

			r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_info_cache,
					     data->u.gpa, sizeof(struct vcpu_info));
		} else {
			if (data->u.hva == 0) {
				kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
				r = 0;
				break;
			}

			r = kvm_gpc_activate_hva(&vcpu->arch.xen.vcpu_info_cache,
						 data->u.hva, sizeof(struct vcpu_info));
		}

		if (!r)
			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);

		break;

	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
		if (data->u.gpa == KVM_XEN_INVALID_GPA) {
			kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_time_info_cache);
			r = 0;
			break;
		}

		r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_time_info_cache,
				     data->u.gpa,
				     sizeof(struct pvclock_vcpu_time_info));
		if (!r)
			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
		break;

	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR: {
		size_t sz, sz1, sz2;

		if (!sched_info_on()) {
			r = -EOPNOTSUPP;
			break;
		}
		if (data->u.gpa == KVM_XEN_INVALID_GPA) {
			r = 0;
		deactivate_out:
			kvm_gpc_deactivate(&vcpu->arch.xen.runstate_cache);
			kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache);
			break;
		}

		/*
		 * If the guest switches to 64-bit mode after setting the runstate
		 * address, that's actually OK. kvm_xen_update_runstate_guest()
		 * will cope.
		 */
		if (IS_ENABLED(CONFIG_64BIT) && vcpu->kvm->arch.xen.long_mode)
			sz = sizeof(struct vcpu_runstate_info);
		else
			sz = sizeof(struct compat_vcpu_runstate_info);

		/* How much fits in the (first) page? */
		sz1 = PAGE_SIZE - (data->u.gpa & ~PAGE_MASK);
		r = kvm_gpc_activate(&vcpu->arch.xen.runstate_cache,
				     data->u.gpa, sz1);
		if (r)
			goto deactivate_out;

		/* Either map the second page, or deactivate the second GPC */
		if (sz1 >= sz) {
			kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache);
		} else {
			sz2 = sz - sz1;
			BUG_ON((data->u.gpa + sz1) & ~PAGE_MASK);
			r = kvm_gpc_activate(&vcpu->arch.xen.runstate2_cache,
					     data->u.gpa + sz1, sz2);
			if (r)
				goto deactivate_out;
		}

		kvm_xen_update_runstate_guest(vcpu, false);
		break;
	}
	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
		if (!sched_info_on()) {
			r = -EOPNOTSUPP;
			break;
		}
		if (data->u.runstate.state > RUNSTATE_offline) {
			r = -EINVAL;
			break;
		}

		kvm_xen_update_runstate(vcpu, data->u.runstate.state);
		r = 0;
		break;

	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA:
		if (!sched_info_on()) {
			r = -EOPNOTSUPP;
			break;
		}
		if (data->u.runstate.state > RUNSTATE_offline) {
			r = -EINVAL;
			break;
		}
		if (data->u.runstate.state_entry_time !=
		    (data->u.runstate.time_running +
		     data->u.runstate.time_runnable +
		     data->u.runstate.time_blocked +
		     data->u.runstate.time_offline)) {
			r = -EINVAL;
			break;
		}
		if (get_kvmclock_ns(vcpu->kvm) <
		    data->u.runstate.state_entry_time) {
			r = -EINVAL;
			break;
		}

		vcpu->arch.xen.current_runstate = data->u.runstate.state;
		vcpu->arch.xen.runstate_entry_time =
			data->u.runstate.state_entry_time;
		vcpu->arch.xen.runstate_times[RUNSTATE_running] =
			data->u.runstate.time_running;
		vcpu->arch.xen.runstate_times[RUNSTATE_runnable] =
			data->u.runstate.time_runnable;
		vcpu->arch.xen.runstate_times[RUNSTATE_blocked] =
			data->u.runstate.time_blocked;
		vcpu->arch.xen.runstate_times[RUNSTATE_offline] =
			data->u.runstate.time_offline;
		vcpu->arch.xen.last_steal = current->sched_info.run_delay;

		r = 0;
		break;

	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST:
		if (!sched_info_on()) {
			r = -EOPNOTSUPP;
			break;
		}
		if (data->u.runstate.state > RUNSTATE_offline &&
		    data->u.runstate.state != (u64)-1) {
			r = -EINVAL;
			break;
		}
		/* The adjustment must add up */
		if (data->u.runstate.state_entry_time !=
		    (data->u.runstate.time_running +
		     data->u.runstate.time_runnable +
		     data->u.runstate.time_blocked +
		     data->u.runstate.time_offline)) {
			r = -EINVAL;
			break;
		}

		if (get_kvmclock_ns(vcpu->kvm) <
		    (vcpu->arch.xen.runstate_entry_time +
		     data->u.runstate.state_entry_time)) {
			r = -EINVAL;
			break;
		}

		vcpu->arch.xen.runstate_entry_time +=
			data->u.runstate.state_entry_time;
		vcpu->arch.xen.runstate_times[RUNSTATE_running] +=
			data->u.runstate.time_running;
		vcpu->arch.xen.runstate_times[RUNSTATE_runnable] +=
			data->u.runstate.time_runnable;
		vcpu->arch.xen.runstate_times[RUNSTATE_blocked] +=
			data->u.runstate.time_blocked;
		vcpu->arch.xen.runstate_times[RUNSTATE_offline] +=
			data->u.runstate.time_offline;

		if (data->u.runstate.state <= RUNSTATE_offline)
			kvm_xen_update_runstate(vcpu, data->u.runstate.state);
		else if (vcpu->arch.xen.runstate_cache.active)
			kvm_xen_update_runstate_guest(vcpu, false);
		r = 0;
		break;

	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID:
		if (data->u.vcpu_id >= KVM_MAX_VCPUS)
			r = -EINVAL;
		else {
			vcpu->arch.xen.vcpu_id = data->u.vcpu_id;
			r = 0;
		}
		break;

	case KVM_XEN_VCPU_ATTR_TYPE_TIMER:
		if (data->u.timer.port &&
		    data->u.timer.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) {
			r = -EINVAL;
			break;
		}

		if (!vcpu->arch.xen.timer.function)
			kvm_xen_init_timer(vcpu);

		/* Stop the timer (if it's running) before changing the vector */
		kvm_xen_stop_timer(vcpu);
		vcpu->arch.xen.timer_virq = data->u.timer.port;

		/* Start the timer if the new value has a valid vector+expiry. */
		if (data->u.timer.port && data->u.timer.expires_ns)
			kvm_xen_start_timer(vcpu, data->u.timer.expires_ns, false);

		r = 0;
		break;

	case KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR:
		if (data->u.vector && data->u.vector < 0x10)
			r = -EINVAL;
		else {
			vcpu->arch.xen.upcall_vector = data->u.vector;
			r = 0;
		}
		break;

	default:
		break;
	}

	srcu_read_unlock(&vcpu->kvm->srcu, idx);
	mutex_unlock(&vcpu->kvm->arch.xen.xen_lock);
	return r;
}
int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
{
	int r = -ENOENT;

	mutex_lock(&vcpu->kvm->arch.xen.xen_lock);

	switch (data->type) {
	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
		if (kvm_gpc_is_gpa_active(&vcpu->arch.xen.vcpu_info_cache))
			data->u.gpa = vcpu->arch.xen.vcpu_info_cache.gpa;
		else
			data->u.gpa = KVM_XEN_INVALID_GPA;
		r = 0;
		break;

	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA:
		if (kvm_gpc_is_hva_active(&vcpu->arch.xen.vcpu_info_cache))
			data->u.hva = vcpu->arch.xen.vcpu_info_cache.uhva;
		else
			data->u.hva = 0;
		r = 0;
		break;

	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
		if (vcpu->arch.xen.vcpu_time_info_cache.active)
			data->u.gpa = vcpu->arch.xen.vcpu_time_info_cache.gpa;
		else
			data->u.gpa = KVM_XEN_INVALID_GPA;
		r = 0;
		break;

	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR:
		if (!sched_info_on()) {
			r = -EOPNOTSUPP;
			break;
		}
		if (vcpu->arch.xen.runstate_cache.active) {
			data->u.gpa = vcpu->arch.xen.runstate_cache.gpa;
			r = 0;
		}
		break;

	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
		if (!sched_info_on()) {
			r = -EOPNOTSUPP;
			break;
		}
		data->u.runstate.state = vcpu->arch.xen.current_runstate;
		r = 0;
		break;

	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA:
		if (!sched_info_on()) {
			r = -EOPNOTSUPP;
			break;
		}
		data->u.runstate.state = vcpu->arch.xen.current_runstate;
		data->u.runstate.state_entry_time =
			vcpu->arch.xen.runstate_entry_time;
		data->u.runstate.time_running =
			vcpu->arch.xen.runstate_times[RUNSTATE_running];
		data->u.runstate.time_runnable =
			vcpu->arch.xen.runstate_times[RUNSTATE_runnable];
		data->u.runstate.time_blocked =
			vcpu->arch.xen.runstate_times[RUNSTATE_blocked];
		data->u.runstate.time_offline =
			vcpu->arch.xen.runstate_times[RUNSTATE_offline];
		r = 0;
		break;

	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST:
		r = -EINVAL;
		break;

	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID:
		data->u.vcpu_id = vcpu->arch.xen.vcpu_id;
		r = 0;
		break;

	case KVM_XEN_VCPU_ATTR_TYPE_TIMER:
		/*
		 * Ensure a consistent snapshot of state is captured, with a
		 * timer either being pending, or the event channel delivered
		 * to the corresponding bit in the shared_info. Not still
		 * lurking in the timer_pending flag for deferred delivery.
		 * Purely as an optimisation, if the timer_expires field is
		 * zero, that means the timer isn't active (or even in the
		 * timer_pending flag) and there is no need to cancel it.
		 */
		if (vcpu->arch.xen.timer_expires) {
			hrtimer_cancel(&vcpu->arch.xen.timer);
			kvm_xen_inject_timer_irqs(vcpu);
		}

		data->u.timer.port = vcpu->arch.xen.timer_virq;
		data->u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
		data->u.timer.expires_ns = vcpu->arch.xen.timer_expires;

		/*
		 * The hrtimer may trigger and raise the IRQ immediately,
		 * while the returned state causes it to be set up and
		 * raised again on the destination system after migration.
		 * That's fine, as the guest won't even have had a chance
		 * to run and handle the interrupt. Asserting an already
		 * pending event channel is idempotent.
		 */
		if (vcpu->arch.xen.timer_expires)
			hrtimer_start_expires(&vcpu->arch.xen.timer,
					      HRTIMER_MODE_ABS_HARD);

		r = 0;
		break;

	case KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR:
		data->u.vector = vcpu->arch.xen.upcall_vector;
		r = 0;
		break;

	default:
		break;
	}

	mutex_unlock(&vcpu->kvm->arch.xen.xen_lock);
	return r;
}
int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
{
	struct kvm *kvm = vcpu->kvm;
	u32 page_num = data & ~PAGE_MASK;
	u64 page_addr = data & PAGE_MASK;
	bool lm = is_long_mode(vcpu);
	int r = 0;

	mutex_lock(&kvm->arch.xen.xen_lock);
	if (kvm->arch.xen.long_mode != lm) {
		kvm->arch.xen.long_mode = lm;

		/*
		 * Re-initialize shared_info to put the wallclock in the
		 * correct place.
		 */
		if (kvm->arch.xen.shinfo_cache.active &&
		    kvm_xen_shared_info_init(kvm))
			r = 1;
	}
	mutex_unlock(&kvm->arch.xen.xen_lock);

	if (r)
		return r;

	/*
	 * If Xen hypercall intercept is enabled, fill the hypercall
	 * page with VMCALL/VMMCALL instructions since that's what
	 * we catch. Else the VMM has provided the hypercall pages
	 * with instructions of its own choosing, so use those.
	 */
	if (kvm_xen_hypercall_enabled(kvm)) {
		u8 instructions[32];
		int i;

		/* mov imm32, %eax */
		instructions[0] = 0xb8;

		/* vmcall / vmmcall */
		kvm_x86_call(patch_hypercall)(vcpu, instructions + 5);

		/* ret */
		instructions[8] = 0xc3;

		/* int3 to pad */
		memset(instructions + 9, 0xcc, sizeof(instructions) - 9);

		for (i = 0; i < PAGE_SIZE / sizeof(instructions); i++) {
			*(u32 *)&instructions[1] = i;
			if (kvm_vcpu_write_guest(vcpu,
						 page_addr + (i * sizeof(instructions)),
						 instructions, sizeof(instructions)))
				return 1;
		}
	} else {
		/*
		 * Note, truncation is a non-issue as 'lm' is guaranteed to be
		 * false for a 32-bit kernel, i.e. when hva_t is only 4 bytes.
		 */
		hva_t blob_addr = lm ? kvm->arch.xen_hvm_config.blob_addr_64
				     : kvm->arch.xen_hvm_config.blob_addr_32;
		u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
				  : kvm->arch.xen_hvm_config.blob_size_32;
		u8 *page;
		int ret;

		if (page_num >= blob_size)
			return 1;

		blob_addr += page_num * PAGE_SIZE;

		page = memdup_user((u8 __user *)blob_addr, PAGE_SIZE);
		if (IS_ERR(page))
			return PTR_ERR(page);

		ret = kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE);
		kfree(page);
		if (ret)
			return 1;
	}
	return 0;
}
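/*
 * Layout of each 32-byte slot that the loop above writes when KVM generates
 * the hypercall page itself (one slot per hypercall number i); this is just
 * a restatement of the instructions[] array built above:
 *
 *	offset 0:	b8 <i imm32>		mov $i, %eax
 *	offset 5:	vmcall / vmmcall	(patched per vendor)
 *	offset 8:	c3			ret
 *	offset 9..31:	cc			int3 padding
 */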
int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
{
	/* Only some feature flags need to be *enabled* by userspace */
	u32 permitted_flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
			      KVM_XEN_HVM_CONFIG_EVTCHN_SEND |
			      KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
	u32 old_flags;

	if (xhc->flags & ~permitted_flags)
		return -EINVAL;

	/*
	 * With hypercall interception the kernel generates its own
	 * hypercall page so it must not be provided.
	 */
	if ((xhc->flags & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL) &&
	    (xhc->blob_addr_32 || xhc->blob_addr_64 ||
	     xhc->blob_size_32 || xhc->blob_size_64))
		return -EINVAL;

	mutex_lock(&kvm->arch.xen.xen_lock);

	if (xhc->msr && !kvm->arch.xen_hvm_config.msr)
		static_branch_inc(&kvm_xen_enabled.key);
	else if (!xhc->msr && kvm->arch.xen_hvm_config.msr)
		static_branch_slow_dec_deferred(&kvm_xen_enabled);

	old_flags = kvm->arch.xen_hvm_config.flags;
	memcpy(&kvm->arch.xen_hvm_config, xhc, sizeof(*xhc));

	mutex_unlock(&kvm->arch.xen.xen_lock);

	if ((old_flags ^ xhc->flags) & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE)
		kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);

	return 0;
}
static int kvm_xen_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
{
	kvm_rax_write(vcpu, result);
	return kvm_skip_emulated_instruction(vcpu);
}
static int kvm_xen_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
{
	struct kvm_run *run = vcpu->run;

	if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.xen.hypercall_rip)))
		return 1;

	return kvm_xen_hypercall_set_result(vcpu, run->xen.u.hcall.result);
}
static inline int max_evtchn_port(struct kvm *kvm)
{
	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode)
		return EVTCHN_2L_NR_CHANNELS;
	else
		return COMPAT_EVTCHN_2L_NR_CHANNELS;
}
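/*
 * For the 2-level event channel ABI the limits above work out to
 * 64 * 64 = 4096 ports (EVTCHN_2L_NR_CHANNELS) for 64-bit guests and
 * 32 * 32 = 1024 ports for 32-bit guests, i.e. one selector bit per word
 * of the shared_info pending bitmap, one bit per port within each word.
 * (Values quoted from the Xen ABI for reference, not derived here.)
 */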
static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports,
			       evtchn_port_t *ports)
{
	struct kvm *kvm = vcpu->kvm;
	struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
	unsigned long *pending_bits;
	unsigned long flags;
	bool ret = true;
	int idx, i;

	idx = srcu_read_lock(&kvm->srcu);
	read_lock_irqsave(&gpc->lock, flags);
	if (!kvm_gpc_check(gpc, PAGE_SIZE))
		goto out_rcu;

	ret = false;
	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
		struct shared_info *shinfo = gpc->khva;
		pending_bits = (unsigned long *)&shinfo->evtchn_pending;
	} else {
		struct compat_shared_info *shinfo = gpc->khva;
		pending_bits = (unsigned long *)&shinfo->evtchn_pending;
	}

	for (i = 0; i < nr_ports; i++) {
		if (test_bit(ports[i], pending_bits)) {
			ret = true;
			break;
		}
	}

 out_rcu:
	read_unlock_irqrestore(&gpc->lock, flags);
	srcu_read_unlock(&kvm->srcu, idx);

	return ret;
}
static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode,
				 u64 param, u64 *r)
{
	struct sched_poll sched_poll;
	evtchn_port_t port, *ports;
	struct x86_exception e;
	int i;

	if (!lapic_in_kernel(vcpu) ||
	    !(vcpu->kvm->arch.xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_EVTCHN_SEND))
		return false;

	if (IS_ENABLED(CONFIG_64BIT) && !longmode) {
		struct compat_sched_poll sp32;

		/* Sanity check that the compat struct definition is correct */
		BUILD_BUG_ON(sizeof(sp32) != 16);

		if (kvm_read_guest_virt(vcpu, param, &sp32, sizeof(sp32), &e)) {
			*r = -EFAULT;
			return true;
		}

		/*
		 * This is a 32-bit pointer to an array of evtchn_port_t which
		 * are uint32_t, so once it's converted no further compat
		 * handling is needed.
		 */
		sched_poll.ports = (void *)(unsigned long)(sp32.ports);
		sched_poll.nr_ports = sp32.nr_ports;
		sched_poll.timeout = sp32.timeout;
	} else {
		if (kvm_read_guest_virt(vcpu, param, &sched_poll,
					sizeof(sched_poll), &e)) {
			*r = -EFAULT;
			return true;
		}
	}

	if (unlikely(sched_poll.nr_ports > 1)) {
		/* Xen (unofficially) limits number of pollers to 128 */
		if (sched_poll.nr_ports > 128) {
			*r = -EINVAL;
			return true;
		}

		ports = kmalloc_array(sched_poll.nr_ports,
				      sizeof(*ports), GFP_KERNEL);
		if (!ports) {
			*r = -ENOMEM;
			return true;
		}
	} else {
		ports = &port;
	}

	if (kvm_read_guest_virt(vcpu, (gva_t)sched_poll.ports, ports,
				sched_poll.nr_ports * sizeof(*ports), &e)) {
		*r = -EFAULT;
		goto out;
	}

	for (i = 0; i < sched_poll.nr_ports; i++) {
		if (ports[i] >= max_evtchn_port(vcpu->kvm)) {
			*r = -EINVAL;
			goto out;
		}
	}

	if (sched_poll.nr_ports == 1)
		vcpu->arch.xen.poll_evtchn = port;
	else
		vcpu->arch.xen.poll_evtchn = -1;

	set_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask);

	if (!wait_pending_event(vcpu, sched_poll.nr_ports, ports)) {
		vcpu->arch.mp_state = KVM_MP_STATE_HALTED;

		if (sched_poll.timeout)
			mod_timer(&vcpu->arch.xen.poll_timer,
				  jiffies + nsecs_to_jiffies(sched_poll.timeout));

		kvm_vcpu_halt(vcpu);

		if (sched_poll.timeout)
			del_timer(&vcpu->arch.xen.poll_timer);

		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
	}

	vcpu->arch.xen.poll_evtchn = 0;
	*r = 0;
out:
	/* Really, this is only needed in case of timeout */
	clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask);

	if (unlikely(sched_poll.nr_ports > 1))
		kfree(ports);
	return true;
}
static void cancel_evtchn_poll(struct timer_list *t)
{
	struct kvm_vcpu *vcpu = from_timer(vcpu, t, arch.xen.poll_timer);

	kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
	kvm_vcpu_kick(vcpu);
}
static bool kvm_xen_hcall_sched_op(struct kvm_vcpu *vcpu, bool longmode,
				   int cmd, u64 param, u64 *r)
{
	switch (cmd) {
	case SCHEDOP_poll:
		if (kvm_xen_schedop_poll(vcpu, longmode, param, r))
			return true;
		fallthrough;
	case SCHEDOP_yield:
		kvm_vcpu_on_spin(vcpu, true);
		*r = 0;
		return true;
	default:
		break;
	}

	return false;
}
struct compat_vcpu_set_singleshot_timer {
	uint64_t timeout_abs_ns;
	uint32_t flags;
} __attribute__((packed));
static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd,
				  int vcpu_id, u64 param, u64 *r)
{
	struct vcpu_set_singleshot_timer oneshot;
	struct x86_exception e;

	if (!kvm_xen_timer_enabled(vcpu))
		return false;

	switch (cmd) {
	case VCPUOP_set_singleshot_timer:
		if (vcpu->arch.xen.vcpu_id != vcpu_id) {
			*r = -EINVAL;
			return true;
		}

		/*
		 * The only difference for 32-bit compat is the 4 bytes of
		 * padding after the interesting part of the structure. So
		 * for a faithful emulation of Xen we have to *try* to copy
		 * the padding and return -EFAULT if we can't. Otherwise we
		 * might as well just have copied the 12-byte 32-bit struct.
		 */
		BUILD_BUG_ON(offsetof(struct compat_vcpu_set_singleshot_timer, timeout_abs_ns) !=
			     offsetof(struct vcpu_set_singleshot_timer, timeout_abs_ns));
		BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, timeout_abs_ns) !=
			     sizeof_field(struct vcpu_set_singleshot_timer, timeout_abs_ns));
		BUILD_BUG_ON(offsetof(struct compat_vcpu_set_singleshot_timer, flags) !=
			     offsetof(struct vcpu_set_singleshot_timer, flags));
		BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, flags) !=
			     sizeof_field(struct vcpu_set_singleshot_timer, flags));

		if (kvm_read_guest_virt(vcpu, param, &oneshot, longmode ? sizeof(oneshot) :
					sizeof(struct compat_vcpu_set_singleshot_timer), &e)) {
			*r = -EFAULT;
			return true;
		}

		kvm_xen_start_timer(vcpu, oneshot.timeout_abs_ns, false);
		*r = 0;
		return true;

	case VCPUOP_stop_singleshot_timer:
		if (vcpu->arch.xen.vcpu_id != vcpu_id) {
			*r = -EINVAL;
			return true;
		}
		kvm_xen_stop_timer(vcpu);
		*r = 0;
		return true;
	}

	return false;
}
static bool kvm_xen_hcall_set_timer_op(struct kvm_vcpu *vcpu, uint64_t timeout,
				       u64 *r)
{
	if (!kvm_xen_timer_enabled(vcpu))
		return false;

	if (timeout)
		kvm_xen_start_timer(vcpu, timeout, true);
	else
		kvm_xen_stop_timer(vcpu);

	*r = 0;
	return true;
}
int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
{
	bool longmode;
	u64 input, params[6], r = -ENOSYS;
	bool handled = false;
	u8 cpl;

	input = (u64)kvm_register_read(vcpu, VCPU_REGS_RAX);

	/* Hyper-V hypercalls get bit 31 set in EAX */
	if ((input & 0x80000000) &&
	    kvm_hv_hypercall_enabled(vcpu))
		return kvm_hv_hypercall(vcpu);

	longmode = is_64_bit_hypercall(vcpu);
	if (!longmode) {
		params[0] = (u32)kvm_rbx_read(vcpu);
		params[1] = (u32)kvm_rcx_read(vcpu);
		params[2] = (u32)kvm_rdx_read(vcpu);
		params[3] = (u32)kvm_rsi_read(vcpu);
		params[4] = (u32)kvm_rdi_read(vcpu);
		params[5] = (u32)kvm_rbp_read(vcpu);
	}
#ifdef CONFIG_X86_64
	else {
		params[0] = (u64)kvm_rdi_read(vcpu);
		params[1] = (u64)kvm_rsi_read(vcpu);
		params[2] = (u64)kvm_rdx_read(vcpu);
		params[3] = (u64)kvm_r10_read(vcpu);
		params[4] = (u64)kvm_r8_read(vcpu);
		params[5] = (u64)kvm_r9_read(vcpu);
	}
#endif
	cpl = kvm_x86_call(get_cpl)(vcpu);
	trace_kvm_xen_hypercall(cpl, input, params[0], params[1], params[2],
				params[3], params[4], params[5]);

	/*
	 * Only allow hypercall acceleration for CPL0. The rare hypercalls that
	 * are permitted in guest userspace can be handled by the VMM.
	 */
	if (unlikely(cpl > 0))
		goto handle_in_userspace;

	switch (input) {
	case __HYPERVISOR_xen_version:
		if (params[0] == XENVER_version && vcpu->kvm->arch.xen.xen_version) {
			r = vcpu->kvm->arch.xen.xen_version;
			handled = true;
		}
		break;
	case __HYPERVISOR_event_channel_op:
		if (params[0] == EVTCHNOP_send)
			handled = kvm_xen_hcall_evtchn_send(vcpu, params[1], &r);
		break;
	case __HYPERVISOR_sched_op:
		handled = kvm_xen_hcall_sched_op(vcpu, longmode, params[0],
						 params[1], &r);
		break;
	case __HYPERVISOR_vcpu_op:
		handled = kvm_xen_hcall_vcpu_op(vcpu, longmode, params[0], params[1],
						params[2], &r);
		break;
	case __HYPERVISOR_set_timer_op: {
		u64 timeout = params[0];
		/* In 32-bit mode, the 64-bit timeout is in two 32-bit params. */
		if (!longmode)
			timeout |= params[1] << 32;
		handled = kvm_xen_hcall_set_timer_op(vcpu, timeout, &r);
		break;
	}
	default:
		break;
	}

	if (handled)
		return kvm_xen_hypercall_set_result(vcpu, r);

handle_in_userspace:
	vcpu->run->exit_reason = KVM_EXIT_XEN;
	vcpu->run->xen.type = KVM_EXIT_XEN_HCALL;
	vcpu->run->xen.u.hcall.longmode = longmode;
	vcpu->run->xen.u.hcall.cpl = cpl;
	vcpu->run->xen.u.hcall.input = input;
	vcpu->run->xen.u.hcall.params[0] = params[0];
	vcpu->run->xen.u.hcall.params[1] = params[1];
	vcpu->run->xen.u.hcall.params[2] = params[2];
	vcpu->run->xen.u.hcall.params[3] = params[3];
	vcpu->run->xen.u.hcall.params[4] = params[4];
	vcpu->run->xen.u.hcall.params[5] = params[5];
	vcpu->arch.xen.hypercall_rip = kvm_get_linear_rip(vcpu);
	vcpu->arch.complete_userspace_io =
		kvm_xen_hypercall_complete_userspace;

	return 0;
}
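/*
 * Hypercall argument marshalling used above, matching the Xen hypercall
 * calling convention (the hypercall number is in rax/eax and the result is
 * returned in rax):
 *
 *	arg:	 1    2    3    4    5    6
 *	32-bit:	 ebx  ecx  edx  esi  edi  ebp
 *	64-bit:	 rdi  rsi  rdx  r10  r8   r9
 */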
static void kvm_xen_check_poller(struct kvm_vcpu *vcpu, int port)
{
	int poll_evtchn = vcpu->arch.xen.poll_evtchn;

	if ((poll_evtchn == port || poll_evtchn == -1) &&
	    test_and_clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask)) {
		kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
		kvm_vcpu_kick(vcpu);
	}
}
/*
 * The return value from this function is propagated to kvm_set_irq() API,
 * so it returns:
 *  < 0   Interrupt was ignored (masked or not delivered for other reasons)
 *  = 0   Interrupt was coalesced (previous irq is still pending)
 *  > 0   Number of CPUs interrupt was delivered to
 *
 * It is also called directly from kvm_arch_set_irq_inatomic(), where the
 * only check on its return value is a comparison with -EWOULDBLOCK.
 */
int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)
{
	struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
	struct kvm_vcpu *vcpu;
	unsigned long *pending_bits, *mask_bits;
	unsigned long flags;
	int port_word_bit;
	bool kick_vcpu = false;
	int vcpu_idx, idx, rc;

	vcpu_idx = READ_ONCE(xe->vcpu_idx);
	if (vcpu_idx >= 0)
		vcpu = kvm_get_vcpu(kvm, vcpu_idx);
	else {
		vcpu = kvm_get_vcpu_by_id(kvm, xe->vcpu_id);
		if (!vcpu)
			return -EINVAL;
		WRITE_ONCE(xe->vcpu_idx, vcpu->vcpu_idx);
	}

	if (xe->port >= max_evtchn_port(kvm))
		return -EINVAL;

	rc = -EWOULDBLOCK;

	idx = srcu_read_lock(&kvm->srcu);

	read_lock_irqsave(&gpc->lock, flags);
	if (!kvm_gpc_check(gpc, PAGE_SIZE))
		goto out_rcu;

	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
		struct shared_info *shinfo = gpc->khva;
		pending_bits = (unsigned long *)&shinfo->evtchn_pending;
		mask_bits = (unsigned long *)&shinfo->evtchn_mask;
		port_word_bit = xe->port / 64;
	} else {
		struct compat_shared_info *shinfo = gpc->khva;
		pending_bits = (unsigned long *)&shinfo->evtchn_pending;
		mask_bits = (unsigned long *)&shinfo->evtchn_mask;
		port_word_bit = xe->port / 32;
	}

	/*
	 * If this port wasn't already set, and if it isn't masked, then
	 * we try to set the corresponding bit in the in-kernel shadow of
	 * evtchn_pending_sel for the target vCPU. And if *that* wasn't
	 * already set, then we kick the vCPU in question to write to the
	 * *real* evtchn_pending_sel in its own guest vcpu_info struct.
	 */
	if (test_and_set_bit(xe->port, pending_bits)) {
		rc = 0; /* It was already raised */
	} else if (test_bit(xe->port, mask_bits)) {
		rc = -ENOTCONN; /* Masked */
		kvm_xen_check_poller(vcpu, xe->port);
	} else {
		rc = 1; /* Delivered to the bitmap in shared_info. */
		/* Now switch to the vCPU's vcpu_info to set the index and pending_sel */
		read_unlock_irqrestore(&gpc->lock, flags);
		gpc = &vcpu->arch.xen.vcpu_info_cache;

		read_lock_irqsave(&gpc->lock, flags);
		if (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
			/*
			 * Could not access the vcpu_info. Set the bit in-kernel
			 * and prod the vCPU to deliver it for itself.
			 */
			if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel))
				kick_vcpu = true;
			goto out_rcu;
		}

		if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
			struct vcpu_info *vcpu_info = gpc->khva;
			if (!test_and_set_bit(port_word_bit, &vcpu_info->evtchn_pending_sel)) {
				WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1);
				kick_vcpu = true;
			}
		} else {
			struct compat_vcpu_info *vcpu_info = gpc->khva;
			if (!test_and_set_bit(port_word_bit,
					      (unsigned long *)&vcpu_info->evtchn_pending_sel)) {
				WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1);
				kick_vcpu = true;
			}
		}

		/* For the per-vCPU lapic vector, deliver it as MSI. */
		if (kick_vcpu && vcpu->arch.xen.upcall_vector) {
			kvm_xen_inject_vcpu_vector(vcpu);
			kick_vcpu = false;
		}
	}

 out_rcu:
	read_unlock_irqrestore(&gpc->lock, flags);
	srcu_read_unlock(&kvm->srcu, idx);

	if (kick_vcpu) {
		kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
		kvm_vcpu_kick(vcpu);
	}

	return rc;
}
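/*
 * Putting the fast path above together, 2-level delivery of port P is:
 *
 *	1. set bit P in shared_info->evtchn_pending (unless it was already
 *	   set, or is masked in evtchn_mask);
 *	2. set the selector bit (P / 64, or P / 32 for compat, as computed
 *	   into port_word_bit above) in the vCPU's vcpu_info->evtchn_pending_sel;
 *	3. set vcpu_info->evtchn_upcall_pending and kick the vCPU, or raise
 *	   the per-vCPU upcall vector as an MSI.
 *
 * If the vcpu_info page isn't mapped, step 2 lands in the in-kernel shadow
 * word instead and kvm_xen_inject_pending_events() replays it later.
 */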
static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm)
{
	bool mm_borrowed = false;
	int rc;

	rc = kvm_xen_set_evtchn_fast(xe, kvm);
	if (rc != -EWOULDBLOCK)
		return rc;

	if (current->mm != kvm->mm) {
		/*
		 * If not on a thread which already belongs to this KVM,
		 * we'd better be in the irqfd workqueue.
		 */
		if (WARN_ON_ONCE(current->mm))
			return -EINVAL;

		kthread_use_mm(kvm->mm);
		mm_borrowed = true;
	}

	/*
	 * It is theoretically possible for the page to be unmapped
	 * and the MMU notifier to invalidate the shared_info before
	 * we even get to use it. In that case, this looks like an
	 * infinite loop. It was tempting to do it via the userspace
	 * HVA instead... but that just *hides* the fact that it's
	 * an infinite loop, because if a fault occurs and it waits
	 * for the page to come back, it can *still* immediately
	 * fault and have to wait again, repeatedly.
	 *
	 * Conversely, the page could also have been reinstated by
	 * another thread before we even obtain the mutex above, so
	 * check again *first* before remapping it.
	 */
	do {
		struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
		int idx;

		rc = kvm_xen_set_evtchn_fast(xe, kvm);
		if (rc != -EWOULDBLOCK)
			break;

		idx = srcu_read_lock(&kvm->srcu);
		rc = kvm_gpc_refresh(gpc, PAGE_SIZE);
		srcu_read_unlock(&kvm->srcu, idx);
	} while (!rc);

	if (mm_borrowed)
		kthread_unuse_mm(kvm->mm);

	return rc;
}
/* This is the version called from kvm_set_irq() as the .set function */
static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
			 int irq_source_id, int level, bool line_status)
{
	if (!level)
		return -1;

	return kvm_xen_set_evtchn(&e->xen_evtchn, kvm);
}
/*
 * Set up an event channel interrupt from the KVM IRQ routing table.
 * Used for e.g. PIRQ from passed through physical devices.
 */
int kvm_xen_setup_evtchn(struct kvm *kvm,
			 struct kvm_kernel_irq_routing_entry *e,
			 const struct kvm_irq_routing_entry *ue)
{
	struct kvm_vcpu *vcpu;

	if (ue->u.xen_evtchn.port >= max_evtchn_port(kvm))
		return -EINVAL;

	/* We only support 2 level event channels for now */
	if (ue->u.xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
		return -EINVAL;

	/*
	 * Xen gives us interesting mappings from vCPU index to APIC ID,
	 * which means kvm_get_vcpu_by_id() has to iterate over all vCPUs
	 * to find it. Do that once at setup time, instead of every time.
	 * But beware that on live update / live migration, the routing
	 * table might be reinstated before the vCPU threads have finished
	 * recreating their vCPUs.
	 */
	vcpu = kvm_get_vcpu_by_id(kvm, ue->u.xen_evtchn.vcpu);
	if (vcpu)
		e->xen_evtchn.vcpu_idx = vcpu->vcpu_idx;
	else
		e->xen_evtchn.vcpu_idx = -1;

	e->xen_evtchn.port = ue->u.xen_evtchn.port;
	e->xen_evtchn.vcpu_id = ue->u.xen_evtchn.vcpu;
	e->xen_evtchn.priority = ue->u.xen_evtchn.priority;
	e->set = evtchn_set_fn;

	return 0;
}
/*
 * Explicit event sending from userspace with KVM_XEN_HVM_EVTCHN_SEND ioctl.
 */
int kvm_xen_hvm_evtchn_send(struct kvm *kvm, struct kvm_irq_routing_xen_evtchn *uxe)
{
	struct kvm_xen_evtchn e;
	int ret;

	if (!uxe->port || uxe->port >= max_evtchn_port(kvm))
		return -EINVAL;

	/* We only support 2 level event channels for now */
	if (uxe->priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
		return -EINVAL;

	e.port = uxe->port;
	e.vcpu_id = uxe->vcpu;
	e.vcpu_idx = -1;
	e.priority = uxe->priority;

	ret = kvm_xen_set_evtchn(&e, kvm);

	/*
	 * None of that 'return 1 if it actually got delivered' nonsense.
	 * We don't care if it was masked (-ENOTCONN) either.
	 */
	if (ret > 0 || ret == -ENOTCONN)
		ret = 0;

	return ret;
}
/*
 * Support for *outbound* event channel events via the EVTCHNOP_send hypercall.
 */
struct evtchnfd {
	u32 send_port;
	u32 type;
	union {
		struct kvm_xen_evtchn port;
		struct {
			u32 port; /* zero */
			struct eventfd_ctx *ctx;
		} eventfd;
	} deliver;
};
/*
 * Update target vCPU or priority for a registered sending channel.
 */
static int kvm_xen_eventfd_update(struct kvm *kvm,
				  struct kvm_xen_hvm_attr *data)
{
	u32 port = data->u.evtchn.send_port;
	struct evtchnfd *evtchnfd;
	int ret;

	/* Protect writes to evtchnfd as well as the idr lookup. */
	mutex_lock(&kvm->arch.xen.xen_lock);
	evtchnfd = idr_find(&kvm->arch.xen.evtchn_ports, port);

	ret = -ENOENT;
	if (!evtchnfd)
		goto out_unlock;

	/* For an UPDATE, nothing may change except the priority/vcpu */
	ret = -EINVAL;
	if (evtchnfd->type != data->u.evtchn.type)
		goto out_unlock;

	/*
	 * Port cannot change, and if it's zero that was an eventfd
	 * which can't be changed either.
	 */
	if (!evtchnfd->deliver.port.port ||
	    evtchnfd->deliver.port.port != data->u.evtchn.deliver.port.port)
		goto out_unlock;

	/* We only support 2 level event channels for now */
	if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
		goto out_unlock;

	evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority;
	if (evtchnfd->deliver.port.vcpu_id != data->u.evtchn.deliver.port.vcpu) {
		evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu;
		evtchnfd->deliver.port.vcpu_idx = -1;
	}
	ret = 0;

out_unlock:
	mutex_unlock(&kvm->arch.xen.xen_lock);
	return ret;
}
/*
 * Configure the target (eventfd or local port delivery) for sending on
 * a given event channel.
 */
static int kvm_xen_eventfd_assign(struct kvm *kvm,
				  struct kvm_xen_hvm_attr *data)
{
	u32 port = data->u.evtchn.send_port;
	struct eventfd_ctx *eventfd = NULL;
	struct evtchnfd *evtchnfd;
	int ret = -EINVAL;

	evtchnfd = kzalloc(sizeof(struct evtchnfd), GFP_KERNEL);
	if (!evtchnfd)
		return -ENOMEM;

	switch(data->u.evtchn.type) {
	case EVTCHNSTAT_ipi:
		/* IPI must map back to the same port# */
		if (data->u.evtchn.deliver.port.port != data->u.evtchn.send_port)
			goto out_noeventfd; /* -EINVAL */
		break;

	case EVTCHNSTAT_interdomain:
		if (data->u.evtchn.deliver.port.port) {
			if (data->u.evtchn.deliver.port.port >= max_evtchn_port(kvm))
				goto out_noeventfd; /* -EINVAL */
		} else {
			eventfd = eventfd_ctx_fdget(data->u.evtchn.deliver.eventfd.fd);
			if (IS_ERR(eventfd)) {
				ret = PTR_ERR(eventfd);
				goto out_noeventfd;
			}
		}
		break;

	case EVTCHNSTAT_virq:
	case EVTCHNSTAT_closed:
	case EVTCHNSTAT_unbound:
	case EVTCHNSTAT_pirq:
	default: /* Unknown event channel type */
		goto out; /* -EINVAL */
	}

	evtchnfd->send_port = data->u.evtchn.send_port;
	evtchnfd->type = data->u.evtchn.type;
	if (eventfd) {
		evtchnfd->deliver.eventfd.ctx = eventfd;
	} else {
		/* We only support 2 level event channels for now */
		if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
			goto out; /* -EINVAL; */

		evtchnfd->deliver.port.port = data->u.evtchn.deliver.port.port;
		evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu;
		evtchnfd->deliver.port.vcpu_idx = -1;
		evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority;
	}

	mutex_lock(&kvm->arch.xen.xen_lock);
	ret = idr_alloc(&kvm->arch.xen.evtchn_ports, evtchnfd, port, port + 1,
			GFP_KERNEL);
	mutex_unlock(&kvm->arch.xen.xen_lock);
	if (ret >= 0)
		return 0;

	if (ret == -ENOSPC)
		ret = -EEXIST;
out:
	if (eventfd)
		eventfd_ctx_put(eventfd);
out_noeventfd:
	kfree(evtchnfd);
	return ret;
}
static int kvm_xen_eventfd_deassign(struct kvm *kvm, u32 port)
{
	struct evtchnfd *evtchnfd;

	mutex_lock(&kvm->arch.xen.xen_lock);
	evtchnfd = idr_remove(&kvm->arch.xen.evtchn_ports, port);
	mutex_unlock(&kvm->arch.xen.xen_lock);

	if (!evtchnfd)
		return -ENOENT;

	synchronize_srcu(&kvm->srcu);
	if (!evtchnfd->deliver.port.port)
		eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx);
	kfree(evtchnfd);
	return 0;
}
static int kvm_xen_eventfd_reset(struct kvm *kvm)
{
	struct evtchnfd *evtchnfd, **all_evtchnfds;
	int i;
	int n = 0;

	mutex_lock(&kvm->arch.xen.xen_lock);

	/*
	 * Because synchronize_srcu() cannot be called inside the
	 * critical section, first collect all the evtchnfd objects
	 * in an array as they are removed from evtchn_ports.
	 */
	idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i)
		n++;

	all_evtchnfds = kmalloc_array(n, sizeof(struct evtchnfd *), GFP_KERNEL);
	if (!all_evtchnfds) {
		mutex_unlock(&kvm->arch.xen.xen_lock);
		return -ENOMEM;
	}

	n = 0;
	idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) {
		all_evtchnfds[n++] = evtchnfd;
		idr_remove(&kvm->arch.xen.evtchn_ports, evtchnfd->send_port);
	}
	mutex_unlock(&kvm->arch.xen.xen_lock);

	synchronize_srcu(&kvm->srcu);

	while (n--) {
		evtchnfd = all_evtchnfds[n];
		if (!evtchnfd->deliver.port.port)
			eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx);
		kfree(evtchnfd);
	}
	kfree(all_evtchnfds);

	return 0;
}
static int kvm_xen_setattr_evtchn(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
{
	u32 port = data->u.evtchn.send_port;

	if (data->u.evtchn.flags == KVM_XEN_EVTCHN_RESET)
		return kvm_xen_eventfd_reset(kvm);

	if (!port || port >= max_evtchn_port(kvm))
		return -EINVAL;

	if (data->u.evtchn.flags == KVM_XEN_EVTCHN_DEASSIGN)
		return kvm_xen_eventfd_deassign(kvm, port);
	if (data->u.evtchn.flags == KVM_XEN_EVTCHN_UPDATE)
		return kvm_xen_eventfd_update(kvm, data);
	if (data->u.evtchn.flags)
		return -EINVAL;

	return kvm_xen_eventfd_assign(kvm, data);
}
static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r)
{
	struct evtchnfd *evtchnfd;
	struct evtchn_send send;
	struct x86_exception e;

	/* Sanity check: this structure is the same for 32-bit and 64-bit */
	BUILD_BUG_ON(sizeof(send) != 4);
	if (kvm_read_guest_virt(vcpu, param, &send, sizeof(send), &e)) {
		*r = -EFAULT;
		return true;
	}

	/*
	 * evtchnfd is protected by kvm->srcu; the idr lookup instead
	 * is protected by RCU.
	 */
	rcu_read_lock();
	evtchnfd = idr_find(&vcpu->kvm->arch.xen.evtchn_ports, send.port);
	rcu_read_unlock();
	if (!evtchnfd)
		return false;

	if (evtchnfd->deliver.port.port) {
		int ret = kvm_xen_set_evtchn(&evtchnfd->deliver.port, vcpu->kvm);
		if (ret < 0 && ret != -ENOTCONN)
			return false;
	} else {
		eventfd_signal(evtchnfd->deliver.eventfd.ctx);
	}

	*r = 0;
	return true;
}
void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu)
{
	vcpu->arch.xen.vcpu_id = vcpu->vcpu_idx;
	vcpu->arch.xen.poll_evtchn = 0;

	timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0);

	kvm_gpc_init(&vcpu->arch.xen.runstate_cache, vcpu->kvm);
	kvm_gpc_init(&vcpu->arch.xen.runstate2_cache, vcpu->kvm);
	kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache, vcpu->kvm);
	kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache, vcpu->kvm);
}
void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
{
	if (kvm_xen_timer_enabled(vcpu))
		kvm_xen_stop_timer(vcpu);

	kvm_gpc_deactivate(&vcpu->arch.xen.runstate_cache);
	kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache);
	kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
	kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_time_info_cache);

	del_timer_sync(&vcpu->arch.xen.poll_timer);
}
void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu)
{
	struct kvm_cpuid_entry2 *entry;
	u32 function;

	if (!vcpu->arch.xen.cpuid.base)
		return;

	function = vcpu->arch.xen.cpuid.base | XEN_CPUID_LEAF(3);
	if (function > vcpu->arch.xen.cpuid.limit)
		return;

	entry = kvm_find_cpuid_entry_index(vcpu, function, 1);
	if (entry) {
		entry->ecx = vcpu->arch.hv_clock.tsc_to_system_mul;
		entry->edx = vcpu->arch.hv_clock.tsc_shift;
	}

	entry = kvm_find_cpuid_entry_index(vcpu, function, 2);
	if (entry)
		entry->eax = vcpu->arch.hw_tsc_khz;
}
void kvm_xen_init_vm(struct kvm *kvm)
{
	mutex_init(&kvm->arch.xen.xen_lock);
	idr_init(&kvm->arch.xen.evtchn_ports);
	kvm_gpc_init(&kvm->arch.xen.shinfo_cache, kvm);
}
void kvm_xen_destroy_vm(struct kvm *kvm)
{
	struct evtchnfd *evtchnfd;
	int i;

	kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache);

	idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) {
		if (!evtchnfd->deliver.port.port)
			eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx);
		kfree(evtchnfd);
	}
	idr_destroy(&kvm->arch.xen.evtchn_ports);

	if (kvm->arch.xen_hvm_config.msr)
		static_branch_slow_dec_deferred(&kvm_xen_enabled);
}