kvm tools, setup: Create private directory
[linux-2.6/next.git] / arch / x86 / xen / time.c
blob5158c505bef9772400d263c3e895178820f5b305
1 /*
2 * Xen time implementation.
4 * This is implemented in terms of a clocksource driver which uses
5 * the hypervisor clock as a nanosecond timebase, and a clockevent
6 * driver which uses the hypervisor's timer mechanism.
8 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
9 */
10 #include <linux/kernel.h>
11 #include <linux/interrupt.h>
12 #include <linux/clocksource.h>
13 #include <linux/clockchips.h>
14 #include <linux/kernel_stat.h>
15 #include <linux/math64.h>
16 #include <linux/gfp.h>
18 #include <asm/pvclock.h>
19 #include <asm/xen/hypervisor.h>
20 #include <asm/xen/hypercall.h>
22 #include <xen/events.h>
23 #include <xen/features.h>
24 #include <xen/interface/xen.h>
25 #include <xen/interface/vcpu.h>
27 #include "xen-ops.h"
29 /* Xen may fire a timer up to this many ns early */
30 #define TIMER_SLOP 100000
31 #define NS_PER_TICK (1000000000LL / HZ)
33 /* runstate info updated by Xen */
34 static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate);
36 /* snapshots of runstate info */
37 static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate_snapshot);
39 /* unused ns of stolen and blocked time */
40 static DEFINE_PER_CPU(u64, xen_residual_stolen);
41 static DEFINE_PER_CPU(u64, xen_residual_blocked);
43 /* return an consistent snapshot of 64-bit time/counter value */
44 static u64 get64(const u64 *p)
46 u64 ret;
48 if (BITS_PER_LONG < 64) {
49 u32 *p32 = (u32 *)p;
50 u32 h, l;
53 * Read high then low, and then make sure high is
54 * still the same; this will only loop if low wraps
55 * and carries into high.
56 * XXX some clean way to make this endian-proof?
58 do {
59 h = p32[1];
60 barrier();
61 l = p32[0];
62 barrier();
63 } while (p32[1] != h);
65 ret = (((u64)h) << 32) | l;
66 } else
67 ret = *p;
69 return ret;
73 * Runstate accounting
75 static void get_runstate_snapshot(struct vcpu_runstate_info *res)
77 u64 state_time;
78 struct vcpu_runstate_info *state;
80 BUG_ON(preemptible());
82 state = &__get_cpu_var(xen_runstate);
85 * The runstate info is always updated by the hypervisor on
86 * the current CPU, so there's no need to use anything
87 * stronger than a compiler barrier when fetching it.
89 do {
90 state_time = get64(&state->state_entry_time);
91 barrier();
92 *res = *state;
93 barrier();
94 } while (get64(&state->state_entry_time) != state_time);
97 /* return true when a vcpu could run but has no real cpu to run on */
98 bool xen_vcpu_stolen(int vcpu)
100 return per_cpu(xen_runstate, vcpu).state == RUNSTATE_runnable;
103 void xen_setup_runstate_info(int cpu)
105 struct vcpu_register_runstate_memory_area area;
107 area.addr.v = &per_cpu(xen_runstate, cpu);
109 if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
110 cpu, &area))
111 BUG();
114 static void do_stolen_accounting(void)
116 struct vcpu_runstate_info state;
117 struct vcpu_runstate_info *snap;
118 s64 blocked, runnable, offline, stolen;
119 cputime_t ticks;
121 get_runstate_snapshot(&state);
123 WARN_ON(state.state != RUNSTATE_running);
125 snap = &__get_cpu_var(xen_runstate_snapshot);
127 /* work out how much time the VCPU has not been runn*ing* */
128 blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
129 runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
130 offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];
132 *snap = state;
134 /* Add the appropriate number of ticks of stolen time,
135 including any left-overs from last time. */
136 stolen = runnable + offline + __this_cpu_read(xen_residual_stolen);
138 if (stolen < 0)
139 stolen = 0;
141 ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen);
142 __this_cpu_write(xen_residual_stolen, stolen);
143 account_steal_ticks(ticks);
145 /* Add the appropriate number of ticks of blocked time,
146 including any left-overs from last time. */
147 blocked += __this_cpu_read(xen_residual_blocked);
149 if (blocked < 0)
150 blocked = 0;
152 ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked);
153 __this_cpu_write(xen_residual_blocked, blocked);
154 account_idle_ticks(ticks);
157 /* Get the TSC speed from Xen */
158 static unsigned long xen_tsc_khz(void)
160 struct pvclock_vcpu_time_info *info =
161 &HYPERVISOR_shared_info->vcpu_info[0].time;
163 return pvclock_tsc_khz(info);
166 cycle_t xen_clocksource_read(void)
168 struct pvclock_vcpu_time_info *src;
169 cycle_t ret;
171 src = &get_cpu_var(xen_vcpu)->time;
172 ret = pvclock_clocksource_read(src);
173 put_cpu_var(xen_vcpu);
174 return ret;
177 static cycle_t xen_clocksource_get_cycles(struct clocksource *cs)
179 return xen_clocksource_read();
182 static void xen_read_wallclock(struct timespec *ts)
184 struct shared_info *s = HYPERVISOR_shared_info;
185 struct pvclock_wall_clock *wall_clock = &(s->wc);
186 struct pvclock_vcpu_time_info *vcpu_time;
188 vcpu_time = &get_cpu_var(xen_vcpu)->time;
189 pvclock_read_wallclock(wall_clock, vcpu_time, ts);
190 put_cpu_var(xen_vcpu);
193 static unsigned long xen_get_wallclock(void)
195 struct timespec ts;
197 xen_read_wallclock(&ts);
198 return ts.tv_sec;
/*
 * Setting the wall clock is not implemented here; nothing is done for
 * a domU and -1 is always returned.
 *
 * @now: requested wall-clock time (ignored).
 */
static int xen_set_wallclock(unsigned long now)
{
	/* do nothing for domU */
	return -1;
}
207 static struct clocksource xen_clocksource __read_mostly = {
208 .name = "xen",
209 .rating = 400,
210 .read = xen_clocksource_get_cycles,
211 .mask = ~0,
212 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
/*
   Xen clockevent implementation

   Xen has two clockevent implementations:

   The old timer_op one works with all released versions of Xen prior
   to version 3.0.4.  This version of the hypervisor provides a
   single-shot timer with nanosecond resolution.  However, sharing the
   same event channel is a 100Hz tick which is delivered while the
   vcpu is running.  We don't care about or use this tick, but it will
   cause the core time code to think the timer fired too soon, and
   will end up resetting it each time.  It could be filtered, but
   doing so has complications when the ktime clocksource is not yet
   the xen clocksource (i.e., at boot time).

   The new vcpu_op-based timer interface allows the tick timer period
   to be changed or turned off.  The tick timer is not useful as a
   periodic timer because events are only delivered to running vcpus.
   The one-shot timer can report when a timeout is in the past, so
   set_next_event is capable of returning -ETIME when appropriate.
   This interface is used when available.
*/
240 Get a hypervisor absolute time. In theory we could maintain an
241 offset between the kernel's time and the hypervisor's time, and
242 apply that to a kernel's absolute timeout. Unfortunately the
243 hypervisor and kernel times can drift even if the kernel is using
244 the Xen clocksource, because ntp can warp the kernel's clocksource.
246 static s64 get_abs_timeout(unsigned long delta)
248 return xen_clocksource_read() + delta;
251 static void xen_timerop_set_mode(enum clock_event_mode mode,
252 struct clock_event_device *evt)
254 switch (mode) {
255 case CLOCK_EVT_MODE_PERIODIC:
256 /* unsupported */
257 WARN_ON(1);
258 break;
260 case CLOCK_EVT_MODE_ONESHOT:
261 case CLOCK_EVT_MODE_RESUME:
262 break;
264 case CLOCK_EVT_MODE_UNUSED:
265 case CLOCK_EVT_MODE_SHUTDOWN:
266 HYPERVISOR_set_timer_op(0); /* cancel timeout */
267 break;
271 static int xen_timerop_set_next_event(unsigned long delta,
272 struct clock_event_device *evt)
274 WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
276 if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
277 BUG();
279 /* We may have missed the deadline, but there's no real way of
280 knowing for sure. If the event was in the past, then we'll
281 get an immediate interrupt. */
283 return 0;
286 static const struct clock_event_device xen_timerop_clockevent = {
287 .name = "xen",
288 .features = CLOCK_EVT_FEAT_ONESHOT,
290 .max_delta_ns = 0xffffffff,
291 .min_delta_ns = TIMER_SLOP,
293 .mult = 1,
294 .shift = 0,
295 .rating = 500,
297 .set_mode = xen_timerop_set_mode,
298 .set_next_event = xen_timerop_set_next_event,
303 static void xen_vcpuop_set_mode(enum clock_event_mode mode,
304 struct clock_event_device *evt)
306 int cpu = smp_processor_id();
308 switch (mode) {
309 case CLOCK_EVT_MODE_PERIODIC:
310 WARN_ON(1); /* unsupported */
311 break;
313 case CLOCK_EVT_MODE_ONESHOT:
314 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
315 BUG();
316 break;
318 case CLOCK_EVT_MODE_UNUSED:
319 case CLOCK_EVT_MODE_SHUTDOWN:
320 if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
321 HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
322 BUG();
323 break;
324 case CLOCK_EVT_MODE_RESUME:
325 break;
329 static int xen_vcpuop_set_next_event(unsigned long delta,
330 struct clock_event_device *evt)
332 int cpu = smp_processor_id();
333 struct vcpu_set_singleshot_timer single;
334 int ret;
336 WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
338 single.timeout_abs_ns = get_abs_timeout(delta);
339 single.flags = VCPU_SSHOTTMR_future;
341 ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);
343 BUG_ON(ret != 0 && ret != -ETIME);
345 return ret;
348 static const struct clock_event_device xen_vcpuop_clockevent = {
349 .name = "xen",
350 .features = CLOCK_EVT_FEAT_ONESHOT,
352 .max_delta_ns = 0xffffffff,
353 .min_delta_ns = TIMER_SLOP,
355 .mult = 1,
356 .shift = 0,
357 .rating = 500,
359 .set_mode = xen_vcpuop_set_mode,
360 .set_next_event = xen_vcpuop_set_next_event,
363 static const struct clock_event_device *xen_clockevent =
364 &xen_timerop_clockevent;
365 static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);
367 static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
369 struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
370 irqreturn_t ret;
372 ret = IRQ_NONE;
373 if (evt->event_handler) {
374 evt->event_handler(evt);
375 ret = IRQ_HANDLED;
378 do_stolen_accounting();
380 return ret;
383 void xen_setup_timer(int cpu)
385 const char *name;
386 struct clock_event_device *evt;
387 int irq;
389 printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);
391 name = kasprintf(GFP_KERNEL, "timer%d", cpu);
392 if (!name)
393 name = "<timer kasprintf failed>";
395 irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
396 IRQF_DISABLED|IRQF_PERCPU|
397 IRQF_NOBALANCING|IRQF_TIMER|
398 IRQF_FORCE_RESUME,
399 name, NULL);
401 evt = &per_cpu(xen_clock_events, cpu);
402 memcpy(evt, xen_clockevent, sizeof(*evt));
404 evt->cpumask = cpumask_of(cpu);
405 evt->irq = irq;
408 void xen_teardown_timer(int cpu)
410 struct clock_event_device *evt;
411 BUG_ON(cpu == 0);
412 evt = &per_cpu(xen_clock_events, cpu);
413 unbind_from_irqhandler(evt->irq, NULL);
416 void xen_setup_cpu_clockevents(void)
418 BUG_ON(preemptible());
420 clockevents_register_device(&__get_cpu_var(xen_clock_events));
423 void xen_timer_resume(void)
425 int cpu;
427 pvclock_resume();
429 if (xen_clockevent != &xen_vcpuop_clockevent)
430 return;
432 for_each_online_cpu(cpu) {
433 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
434 BUG();
438 static const struct pv_time_ops xen_time_ops __initconst = {
439 .sched_clock = xen_clocksource_read,
442 static void __init xen_time_init(void)
444 int cpu = smp_processor_id();
445 struct timespec tp;
447 clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC);
449 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
450 /* Successfully turned off 100Hz tick, so we have the
451 vcpuop-based timer interface */
452 printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
453 xen_clockevent = &xen_vcpuop_clockevent;
456 /* Set initial system time with full resolution */
457 xen_read_wallclock(&tp);
458 do_settimeofday(&tp);
460 setup_force_cpu_cap(X86_FEATURE_TSC);
462 xen_setup_runstate_info(cpu);
463 xen_setup_timer(cpu);
464 xen_setup_cpu_clockevents();
467 void __init xen_init_time_ops(void)
469 pv_time_ops = xen_time_ops;
471 x86_init.timers.timer_init = xen_time_init;
472 x86_init.timers.setup_percpu_clockev = x86_init_noop;
473 x86_cpuinit.setup_percpu_clockev = x86_init_noop;
475 x86_platform.calibrate_tsc = xen_tsc_khz;
476 x86_platform.get_wallclock = xen_get_wallclock;
477 x86_platform.set_wallclock = xen_set_wallclock;
480 #ifdef CONFIG_XEN_PVHVM
/*
 * Per-cpu clockevent setup for the HVM case: runstate info, timer irq
 * and clock event device for the current cpu.
 */
static void xen_hvm_setup_cpu_clockevents(void)
{
	const int this_cpu = smp_processor_id();

	xen_setup_runstate_info(this_cpu);
	xen_setup_timer(this_cpu);
	xen_setup_cpu_clockevents();
}
489 void __init xen_hvm_init_time_ops(void)
491 /* vector callback is needed otherwise we cannot receive interrupts
492 * on cpu > 0 and at this point we don't know how many cpus are
493 * available */
494 if (!xen_have_vector_callback)
495 return;
496 if (!xen_feature(XENFEAT_hvm_safe_pvclock)) {
497 printk(KERN_INFO "Xen doesn't support pvclock on HVM,"
498 "disable pv timer\n");
499 return;
502 pv_time_ops = xen_time_ops;
503 x86_init.timers.setup_percpu_clockev = xen_time_init;
504 x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents;
506 x86_platform.calibrate_tsc = xen_tsc_khz;
507 x86_platform.get_wallclock = xen_get_wallclock;
508 x86_platform.set_wallclock = xen_set_wallclock;
510 #endif