kvm: release: merge from trunk
[kvm-userspace.git] / qemu / qemu-kvm.c
#include "config.h"
#include "config-host.h"

#ifdef USE_KVM

#include "exec.h"

#include "qemu-kvm.h"
#include <kvmctl.h>
#include <string.h>

#define MSR_IA32_TSC 0x10

extern void perror(const char *s);

int kvm_allowed = 1;
kvm_context_t kvm_context;
static struct kvm_msr_list *kvm_msr_list;
static int kvm_has_msr_star;

#define NR_CPU 16
static CPUState *saved_env[NR_CPU];
static void set_msr_entry(struct kvm_msr_entry *entry, uint32_t index,
                          uint64_t data)
{
    entry->index = index;
    entry->data = data;
}
/* returns 0 on success, non-0 on failure */
static int get_msr_entry(struct kvm_msr_entry *entry, CPUState *env)
{
    switch (entry->index) {
    case MSR_IA32_SYSENTER_CS:
        env->sysenter_cs = entry->data;
        break;
    case MSR_IA32_SYSENTER_ESP:
        env->sysenter_esp = entry->data;
        break;
    case MSR_IA32_SYSENTER_EIP:
        env->sysenter_eip = entry->data;
        break;
    case MSR_STAR:
        env->star = entry->data;
        break;
#ifdef TARGET_X86_64
    case MSR_CSTAR:
        env->cstar = entry->data;
        break;
    case MSR_KERNELGSBASE:
        env->kernelgsbase = entry->data;
        break;
    case MSR_FMASK:
        env->fmask = entry->data;
        break;
    case MSR_LSTAR:
        env->lstar = entry->data;
        break;
#endif
    case MSR_IA32_TSC:
        env->tsc = entry->data;
        break;
    default:
        printf("Warning: unknown MSR index 0x%x\n", entry->index);
        return 1;
    }
    return 0;
}
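/* Worst-case number of MSRs moved by load_regs()/save_regs(): the three
 * SYSENTER MSRs, MSR_STAR and the TSC, plus the four syscall-related
 * MSRs that only exist on TARGET_X86_64. */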
#ifdef TARGET_X86_64
#define MSR_COUNT 9
#else
#define MSR_COUNT 5
#endif
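/* Segments in virtual-8086 mode have architecturally fixed attributes
 * (writable data, DPL 3, 16-bit, byte granularity); only the selector,
 * base and limit come from qemu's segment cache. */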
static void set_v8086_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
{
    lhs->selector = rhs->selector;
    lhs->base = rhs->base;
    lhs->limit = rhs->limit;
    lhs->type = 3;
    lhs->present = 1;
    lhs->dpl = 3;
    lhs->db = 0;
    lhs->s = 1;
    lhs->l = 0;
    lhs->g = 0;
    lhs->avl = 0;
    lhs->unusable = 0;
}
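/* set_seg()/get_seg() convert between qemu's packed SegmentCache flags
 * word and the explicit bitfields of struct kvm_segment. */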
static void set_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
{
    unsigned flags = rhs->flags;

    lhs->selector = rhs->selector;
    lhs->base = rhs->base;
    lhs->limit = rhs->limit;
    lhs->type = (flags >> DESC_TYPE_SHIFT) & 15;
    lhs->present = (flags & DESC_P_MASK) != 0;
    lhs->dpl = rhs->selector & 3;
    lhs->db = (flags >> DESC_B_SHIFT) & 1;
    lhs->s = (flags & DESC_S_MASK) != 0;
    lhs->l = (flags >> DESC_L_SHIFT) & 1;
    lhs->g = (flags & DESC_G_MASK) != 0;
    lhs->avl = (flags & DESC_AVL_MASK) != 0;
    lhs->unusable = 0;
}
static void get_seg(SegmentCache *lhs, const struct kvm_segment *rhs)
{
    lhs->selector = rhs->selector;
    lhs->base = rhs->base;
    lhs->limit = rhs->limit;
    lhs->flags =
        (rhs->type << DESC_TYPE_SHIFT)
        | (rhs->present * DESC_P_MASK)
        | (rhs->dpl << DESC_DPL_SHIFT)
        | (rhs->db << DESC_B_SHIFT)
        | (rhs->s * DESC_S_MASK)
        | (rhs->l << DESC_L_SHIFT)
        | (rhs->g * DESC_G_MASK)
        | (rhs->avl * DESC_AVL_MASK);
}
/* qemu's reset values are not compatible with SVM: fix up the
 * descriptor attributes for real-mode data segments */
static void fix_realmode_dataseg(struct kvm_segment *seg)
{
    seg->type = 0x02;
    seg->present = 1;
    seg->s = 1;
}
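/* Push the full qemu-side CPU state of vcpu 0 (general-purpose,
 * segment and control registers, plus MSRs) into the kernel. */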
static void load_regs(CPUState *env)
{
    struct kvm_regs regs;
    struct kvm_sregs sregs;
    struct kvm_msr_entry msrs[MSR_COUNT];
    int rc, n;

    /* hack: save env */
    if (!saved_env[0])
        saved_env[0] = env;

    regs.rax = env->regs[R_EAX];
    regs.rbx = env->regs[R_EBX];
    regs.rcx = env->regs[R_ECX];
    regs.rdx = env->regs[R_EDX];
    regs.rsi = env->regs[R_ESI];
    regs.rdi = env->regs[R_EDI];
    regs.rsp = env->regs[R_ESP];
    regs.rbp = env->regs[R_EBP];
#ifdef TARGET_X86_64
    regs.r8 = env->regs[8];
    regs.r9 = env->regs[9];
    regs.r10 = env->regs[10];
    regs.r11 = env->regs[11];
    regs.r12 = env->regs[12];
    regs.r13 = env->regs[13];
    regs.r14 = env->regs[14];
    regs.r15 = env->regs[15];
#endif

    regs.rflags = env->eflags;
    regs.rip = env->eip;

    kvm_set_regs(kvm_context, 0, &regs);

    memcpy(sregs.interrupt_bitmap, env->kvm_interrupt_bitmap,
           sizeof(sregs.interrupt_bitmap));

    if ((env->eflags & VM_MASK)) {
        set_v8086_seg(&sregs.cs, &env->segs[R_CS]);
        set_v8086_seg(&sregs.ds, &env->segs[R_DS]);
        set_v8086_seg(&sregs.es, &env->segs[R_ES]);
        set_v8086_seg(&sregs.fs, &env->segs[R_FS]);
        set_v8086_seg(&sregs.gs, &env->segs[R_GS]);
        set_v8086_seg(&sregs.ss, &env->segs[R_SS]);
    } else {
        set_seg(&sregs.cs, &env->segs[R_CS]);
        set_seg(&sregs.ds, &env->segs[R_DS]);
        set_seg(&sregs.es, &env->segs[R_ES]);
        set_seg(&sregs.fs, &env->segs[R_FS]);
        set_seg(&sregs.gs, &env->segs[R_GS]);
        set_seg(&sregs.ss, &env->segs[R_SS]);

        if (env->cr[0] & CR0_PE_MASK) {
            /* force ss cpl to cs cpl */
            sregs.ss.selector = (sregs.ss.selector & ~3) |
                                (sregs.cs.selector & 3);
            sregs.ss.dpl = sregs.ss.selector & 3;
        }

        if (!(env->cr[0] & CR0_PG_MASK)) {
            fix_realmode_dataseg(&sregs.ds);
            fix_realmode_dataseg(&sregs.es);
            fix_realmode_dataseg(&sregs.fs);
            fix_realmode_dataseg(&sregs.gs);
            fix_realmode_dataseg(&sregs.ss);
        }
    }

    set_seg(&sregs.tr, &env->tr);
    set_seg(&sregs.ldt, &env->ldt);

    sregs.idt.limit = env->idt.limit;
    sregs.idt.base = env->idt.base;
    sregs.gdt.limit = env->gdt.limit;
    sregs.gdt.base = env->gdt.base;

    sregs.cr0 = env->cr[0];
    sregs.cr2 = env->cr[2];
    sregs.cr3 = env->cr[3];
    sregs.cr4 = env->cr[4];

    sregs.apic_base = cpu_get_apic_base(env);
    sregs.efer = env->efer;
    sregs.cr8 = cpu_get_apic_tpr(env);

    kvm_set_sregs(kvm_context, 0, &sregs);

    /* msrs */
    n = 0;
    set_msr_entry(&msrs[n++], MSR_IA32_SYSENTER_CS, env->sysenter_cs);
    set_msr_entry(&msrs[n++], MSR_IA32_SYSENTER_ESP, env->sysenter_esp);
    set_msr_entry(&msrs[n++], MSR_IA32_SYSENTER_EIP, env->sysenter_eip);
    if (kvm_has_msr_star)
        set_msr_entry(&msrs[n++], MSR_STAR, env->star);
    set_msr_entry(&msrs[n++], MSR_IA32_TSC, env->tsc);
#ifdef TARGET_X86_64
    set_msr_entry(&msrs[n++], MSR_CSTAR, env->cstar);
    set_msr_entry(&msrs[n++], MSR_KERNELGSBASE, env->kernelgsbase);
    set_msr_entry(&msrs[n++], MSR_FMASK, env->fmask);
    set_msr_entry(&msrs[n++], MSR_LSTAR, env->lstar);
#endif

    rc = kvm_set_msrs(kvm_context, 0, msrs, n);
    if (rc == -1)
        perror("kvm_set_msrs FAILED");
}
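/* The inverse of load_regs(): read vcpu 0 state back from the kernel
 * and rebuild qemu's derived state from it. */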
static void save_regs(CPUState *env)
{
    struct kvm_regs regs;
    struct kvm_sregs sregs;
    struct kvm_msr_entry msrs[MSR_COUNT];
    uint32_t hflags;
    uint32_t i, n, rc;

    kvm_get_regs(kvm_context, 0, &regs);

    env->regs[R_EAX] = regs.rax;
    env->regs[R_EBX] = regs.rbx;
    env->regs[R_ECX] = regs.rcx;
    env->regs[R_EDX] = regs.rdx;
    env->regs[R_ESI] = regs.rsi;
    env->regs[R_EDI] = regs.rdi;
    env->regs[R_ESP] = regs.rsp;
    env->regs[R_EBP] = regs.rbp;
#ifdef TARGET_X86_64
    env->regs[8] = regs.r8;
    env->regs[9] = regs.r9;
    env->regs[10] = regs.r10;
    env->regs[11] = regs.r11;
    env->regs[12] = regs.r12;
    env->regs[13] = regs.r13;
    env->regs[14] = regs.r14;
    env->regs[15] = regs.r15;
#endif

    env->eflags = regs.rflags;
    env->eip = regs.rip;

    kvm_get_sregs(kvm_context, 0, &sregs);

    memcpy(env->kvm_interrupt_bitmap, sregs.interrupt_bitmap,
           sizeof(env->kvm_interrupt_bitmap));

    get_seg(&env->segs[R_CS], &sregs.cs);
    get_seg(&env->segs[R_DS], &sregs.ds);
    get_seg(&env->segs[R_ES], &sregs.es);
    get_seg(&env->segs[R_FS], &sregs.fs);
    get_seg(&env->segs[R_GS], &sregs.gs);
    get_seg(&env->segs[R_SS], &sregs.ss);

    get_seg(&env->tr, &sregs.tr);
    get_seg(&env->ldt, &sregs.ldt);

    env->idt.limit = sregs.idt.limit;
    env->idt.base = sregs.idt.base;
    env->gdt.limit = sregs.gdt.limit;
    env->gdt.base = sregs.gdt.base;

    env->cr[0] = sregs.cr0;
    env->cr[2] = sregs.cr2;
    env->cr[3] = sregs.cr3;
    env->cr[4] = sregs.cr4;

    cpu_set_apic_base(env, sregs.apic_base);

    env->efer = sregs.efer;
    cpu_set_apic_tpr(env, sregs.cr8);
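/* env->hflags and the lazy condition-code state are derived, not
 * architectural: recompute them from the registers just read back.
 * HFLAG_COPY_MASK keeps only the bits that are not recomputed here. */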
#define HFLAG_COPY_MASK ~( \
    HF_CPL_MASK | HF_PE_MASK | HF_MP_MASK | HF_EM_MASK | \
    HF_TS_MASK | HF_TF_MASK | HF_VM_MASK | HF_IOPL_MASK | \
    HF_OSFXSR_MASK | HF_LMA_MASK | HF_CS32_MASK | \
    HF_SS32_MASK | HF_CS64_MASK | HF_ADDSEG_MASK)

    hflags = (env->segs[R_CS].flags >> DESC_DPL_SHIFT) & HF_CPL_MASK;
    hflags |= (env->cr[0] & CR0_PE_MASK) << (HF_PE_SHIFT - CR0_PE_SHIFT);
    hflags |= (env->cr[0] << (HF_MP_SHIFT - CR0_MP_SHIFT)) &
              (HF_MP_MASK | HF_EM_MASK | HF_TS_MASK);
    hflags |= (env->eflags & (HF_TF_MASK | HF_VM_MASK | HF_IOPL_MASK));
    hflags |= (env->cr[4] & CR4_OSFXSR_MASK) <<
              (HF_OSFXSR_SHIFT - CR4_OSFXSR_SHIFT);

    if (env->efer & MSR_EFER_LMA) {
        hflags |= HF_LMA_MASK;
    }

    if ((hflags & HF_LMA_MASK) && (env->segs[R_CS].flags & DESC_L_MASK)) {
        hflags |= HF_CS32_MASK | HF_SS32_MASK | HF_CS64_MASK;
    } else {
        hflags |= (env->segs[R_CS].flags & DESC_B_MASK) >>
                  (DESC_B_SHIFT - HF_CS32_SHIFT);
        hflags |= (env->segs[R_SS].flags & DESC_B_MASK) >>
                  (DESC_B_SHIFT - HF_SS32_SHIFT);
        if (!(env->cr[0] & CR0_PE_MASK) ||
            (env->eflags & VM_MASK) ||
            !(hflags & HF_CS32_MASK)) {
            hflags |= HF_ADDSEG_MASK;
        } else {
            hflags |= ((env->segs[R_DS].base |
                        env->segs[R_ES].base |
                        env->segs[R_SS].base) != 0) <<
                      HF_ADDSEG_SHIFT;
        }
    }
    env->hflags = (env->hflags & HFLAG_COPY_MASK) | hflags;
    CC_SRC = env->eflags & (CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C);
    DF = 1 - (2 * ((env->eflags >> 10) & 1));
    CC_OP = CC_OP_EFLAGS;
    env->eflags &= ~(DF_MASK | CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C);

    tlb_flush(env, 1);

    /* msrs */
    n = 0;
    msrs[n++].index = MSR_IA32_SYSENTER_CS;
    msrs[n++].index = MSR_IA32_SYSENTER_ESP;
    msrs[n++].index = MSR_IA32_SYSENTER_EIP;
    if (kvm_has_msr_star)
        msrs[n++].index = MSR_STAR;
    msrs[n++].index = MSR_IA32_TSC;
#ifdef TARGET_X86_64
    msrs[n++].index = MSR_CSTAR;
    msrs[n++].index = MSR_KERNELGSBASE;
    msrs[n++].index = MSR_FMASK;
    msrs[n++].index = MSR_LSTAR;
#endif
    rc = kvm_get_msrs(kvm_context, 0, msrs, n);
    if (rc == -1) {
        perror("kvm_get_msrs FAILED");
    } else {
        n = rc; /* actual number of MSRs */
        for (i = 0; i < n; i++) {
            if (get_msr_entry(&msrs[i], env))
                return;
        }
    }
}
#include <signal.h>
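/* Callbacks handed to libkvm below.  Everything operates on vcpu 0
 * (envs[0]) for now, matching the single-cpu assumption elsewhere. */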
static int try_push_interrupts(void *opaque)
{
    CPUState **envs = opaque, *env;
    env = envs[0];

    if (env->ready_for_interrupt_injection &&
        (env->interrupt_request & CPU_INTERRUPT_HARD) &&
        (env->eflags & IF_MASK)) {
        env->interrupt_request &= ~CPU_INTERRUPT_HARD;
        // for now using cpu 0
        kvm_inject_irq(kvm_context, 0, cpu_get_pic_interrupt(env));
    }

    return (env->interrupt_request & CPU_INTERRUPT_HARD) != 0;
}
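/* After each kvm_run(): pull back the state the kernel may have
 * changed (IF flag, interrupt-injection readiness, TPR, APIC base). */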
static void post_kvm_run(void *opaque, struct kvm_run *kvm_run)
{
    CPUState **envs = opaque, *env;
    env = envs[0];

    env->eflags = kvm_run->if_flag ? env->eflags | IF_MASK
                                   : env->eflags & ~IF_MASK;
    env->ready_for_interrupt_injection = kvm_run->ready_for_interrupt_injection;
    cpu_set_apic_tpr(env, kvm_run->cr8);
    cpu_set_apic_base(env, kvm_run->apic_base);
}
static void pre_kvm_run(void *opaque, struct kvm_run *kvm_run)
{
    CPUState **envs = opaque, *env;
    env = envs[0];

    kvm_run->cr8 = cpu_get_apic_tpr(env);
}
void kvm_load_registers(CPUState *env)
{
    load_regs(env);
}

void kvm_save_registers(CPUState *env)
{
    save_regs(env);
}
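/* Execute guest code through the kernel.  A pending CPU_INTERRUPT_EXIT
 * request is honoured first when no interrupt injection is outstanding. */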
int kvm_cpu_exec(CPUState *env)
{
    int r;
    int pending = (!env->ready_for_interrupt_injection ||
                   ((env->interrupt_request & CPU_INTERRUPT_HARD) &&
                    (env->eflags & IF_MASK)));

    if (!pending && (env->interrupt_request & CPU_INTERRUPT_EXIT)) {
        env->interrupt_request &= ~CPU_INTERRUPT_EXIT;
        env->exception_index = EXCP_INTERRUPT;
        cpu_loop_exit();
    }

    if (!saved_env[0])
        saved_env[0] = env;

    r = kvm_run(kvm_context, 0);
    if (r < 0) {
        printf("kvm_run returned %d\n", r);
        exit(1);
    }

    return 0;
}
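/* CPUID exits are resolved with qemu's own helper_cpuid(), then the
 * 0x80000001 leaf is masked against the host's feature bits so the
 * guest never sees long mode, syscall or NX without native support. */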
static int kvm_cpuid(void *opaque, uint64_t *rax, uint64_t *rbx,
                     uint64_t *rcx, uint64_t *rdx)
{
    CPUState **envs = opaque;
    CPUState *saved_env;
    uint32_t eax = *rax;

    saved_env = env;
    env = envs[0];

    env->regs[R_EAX] = *rax;
    env->regs[R_EBX] = *rbx;
    env->regs[R_ECX] = *rcx;
    env->regs[R_EDX] = *rdx;
    helper_cpuid();
    *rdx = env->regs[R_EDX];
    *rcx = env->regs[R_ECX];
    *rbx = env->regs[R_EBX];
    *rax = env->regs[R_EAX];
    // don't report long mode/syscall/nx if no native support
    if (eax == 0x80000001) {
        unsigned long h_eax = eax, h_edx;

        // push/pop hack to workaround gcc 3 register pressure trouble
        asm (
#ifdef __x86_64__
             "push %%rbx; push %%rcx; cpuid; pop %%rcx; pop %%rbx"
#else
             "push %%ebx; push %%ecx; cpuid; pop %%ecx; pop %%ebx"
#endif
             : "+a"(h_eax), "=d"(h_edx));

        // long mode
        if ((h_edx & 0x20000000) == 0)
            *rdx &= ~0x20000000ull;
        // syscall
        if ((h_edx & 0x00000800) == 0)
            *rdx &= ~0x00000800ull;
        // nx
        if ((h_edx & 0x00100000) == 0)
            *rdx &= ~0x00100000ull;
    }

    env = saved_env;
    return 0;
}
static int kvm_debug(void *opaque, int vcpu)
{
    CPUState **envs = opaque;

    env = envs[0];
    env->exception_index = EXCP_DEBUG;
    return 1;
}
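/* Guest I/O exits are forwarded to qemu's normal emulation paths:
 * cpu_inX/cpu_outX for port I/O, ldX_phys/stX_phys for memory. */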
static int kvm_inb(void *opaque, uint16_t addr, uint8_t *data)
{
    *data = cpu_inb(0, addr);
    return 0;
}

static int kvm_inw(void *opaque, uint16_t addr, uint16_t *data)
{
    *data = cpu_inw(0, addr);
    return 0;
}

static int kvm_inl(void *opaque, uint16_t addr, uint32_t *data)
{
    *data = cpu_inl(0, addr);
    return 0;
}

static int kvm_outb(void *opaque, uint16_t addr, uint8_t data)
{
    cpu_outb(0, addr, data);
    return 0;
}

static int kvm_outw(void *opaque, uint16_t addr, uint16_t data)
{
    cpu_outw(0, addr, data);
    return 0;
}

static int kvm_outl(void *opaque, uint16_t addr, uint32_t data)
{
    cpu_outl(0, addr, data);
    return 0;
}
static int kvm_readb(void *opaque, uint64_t addr, uint8_t *data)
{
    *data = ldub_phys(addr);
    return 0;
}

static int kvm_readw(void *opaque, uint64_t addr, uint16_t *data)
{
    *data = lduw_phys(addr);
    return 0;
}

static int kvm_readl(void *opaque, uint64_t addr, uint32_t *data)
{
    *data = ldl_phys(addr);
    return 0;
}

static int kvm_readq(void *opaque, uint64_t addr, uint64_t *data)
{
    *data = ldq_phys(addr);
    return 0;
}

static int kvm_writeb(void *opaque, uint64_t addr, uint8_t data)
{
    stb_phys(addr, data);
    return 0;
}

static int kvm_writew(void *opaque, uint64_t addr, uint16_t data)
{
    stw_phys(addr, data);
    return 0;
}

static int kvm_writel(void *opaque, uint64_t addr, uint32_t data)
{
    stl_phys(addr, data);
    return 0;
}

static int kvm_writeq(void *opaque, uint64_t addr, uint64_t data)
{
    stq_phys(addr, data);
    return 0;
}
static int kvm_io_window(void *opaque)
{
    return 1;
}
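/* HLT exit: halt the qemu-side cpu unless an unmasked hard interrupt
 * is already pending. */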
static int kvm_halt(void *opaque, int vcpu)
{
    CPUState **envs = opaque, *env;

    env = envs[0];
    if (!((env->interrupt_request & CPU_INTERRUPT_HARD) &&
          (env->eflags & IF_MASK))) {
        env->hflags |= HF_HALTED_MASK;
        env->exception_index = EXCP_HLT;
    }

    return 1;
}
static int kvm_shutdown(void *opaque, int vcpu)
{
    qemu_system_reset_request();
    return 1;
}
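/* Dispatch table registered with libkvm by kvm_qemu_init(). */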
static struct kvm_callbacks qemu_kvm_ops = {
    .cpuid = kvm_cpuid,
    .debug = kvm_debug,
    .inb = kvm_inb,
    .inw = kvm_inw,
    .inl = kvm_inl,
    .outb = kvm_outb,
    .outw = kvm_outw,
    .outl = kvm_outl,
    .readb = kvm_readb,
    .readw = kvm_readw,
    .readl = kvm_readl,
    .readq = kvm_readq,
    .writeb = kvm_writeb,
    .writew = kvm_writew,
    .writel = kvm_writel,
    .writeq = kvm_writeq,
    .halt = kvm_halt,
    .shutdown = kvm_shutdown,
    .io_window = kvm_io_window,
    .try_push_interrupts = try_push_interrupts,
    .post_kvm_run = post_kvm_run,
    .pre_kvm_run = pre_kvm_run,
};
int kvm_qemu_init()
{
    /* Try to initialize kvm */
    kvm_context = kvm_init(&qemu_kvm_ops, saved_env);
    if (!kvm_context) {
        return -1;
    }

    return 0;
}
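/* Create the VM and its memory, then probe the supported MSR list once
 * so the register sync code knows whether MSR_STAR exists. */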
int kvm_qemu_create_context(void)
{
    int i;

    if (kvm_create(kvm_context, phys_ram_size, (void**)&phys_ram_base) < 0) {
        kvm_qemu_destroy();
        return -1;
    }
    kvm_msr_list = kvm_get_msr_list(kvm_context);
    if (!kvm_msr_list) {
        kvm_qemu_destroy();
        return -1;
    }
    for (i = 0; i < kvm_msr_list->nmsrs; ++i)
        if (kvm_msr_list->indices[i] == MSR_STAR)
            kvm_has_msr_star = 1;
    return 0;
}
void kvm_qemu_destroy(void)
{
    kvm_finalize(kvm_context);
}
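/* Push qemu's breakpoint and single-step settings into the kernel;
 * struct kvm_debug_guest has room for only four breakpoints. */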
int kvm_update_debugger(CPUState *env)
{
    struct kvm_debug_guest dbg;
    int i;

    dbg.enabled = 0;
    if (env->nb_breakpoints || env->singlestep_enabled) {
        dbg.enabled = 1;
        for (i = 0; i < 4 && i < env->nb_breakpoints; ++i) {
            dbg.breakpoints[i].enabled = 1;
            dbg.breakpoints[i].address = env->breakpoints[i];
        }
        dbg.singlestep = env->singlestep_enabled;
    }
    return kvm_guest_debug(kvm_context, 0, &dbg);
}
#endif /* USE_KVM */