#include "kvm/boot-protocol.h"
#include "kvm/cpufeature.h"
#include "kvm/read-write.h"
#include "kvm/interrupt.h"
#include "kvm/mptable.h"
#include "kvm/kvm-cpu.h"

#include <linux/kvm.h>

#include <asm/bootparam.h>

#include <sys/ioctl.h>
#include <sys/eventfd.h>
#include <asm/unistd.h>
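/*
 * The headers below are assumptions added so the code shown in this excerpt is
 * self-contained (open(), mmap(), opendir(), POSIX timers, pthread_kill(), ...);
 * the exact include list in the full source file may differ. Project-local
 * headers such as "kvm/kvm.h", "kvm/util.h" and "kvm/mutex.h" (providing
 * struct kvm, die(), pr_error(), u64, DEFINE_MUTEX(), guest_flat_to_host(), ...)
 * are assumed as well.
 */
#include "kvm/kvm.h"
#include "kvm/util.h"
#include "kvm/mutex.h"

#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <pthread.h>
#include <signal.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>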
#define DEFINE_KVM_EXIT_REASON(reason) [reason] = #reason

#define KVM_PID_FILE_PATH	"/.kvm-tools/"
#define HOME_DIR		getenv("HOME")
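/*
 * Human-readable names for KVM exit codes: DEFINE_KVM_EXIT_REASON expands to a
 * designated initializer, so kvm_exit_reasons[KVM_EXIT_FOO] yields the string
 * "KVM_EXIT_FOO" for debug output.
 */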
const char *kvm_exit_reasons[] = {
	DEFINE_KVM_EXIT_REASON(KVM_EXIT_UNKNOWN),
	DEFINE_KVM_EXIT_REASON(KVM_EXIT_EXCEPTION),
	DEFINE_KVM_EXIT_REASON(KVM_EXIT_IO),
	DEFINE_KVM_EXIT_REASON(KVM_EXIT_HYPERCALL),
	DEFINE_KVM_EXIT_REASON(KVM_EXIT_DEBUG),
	DEFINE_KVM_EXIT_REASON(KVM_EXIT_HLT),
	DEFINE_KVM_EXIT_REASON(KVM_EXIT_MMIO),
	DEFINE_KVM_EXIT_REASON(KVM_EXIT_IRQ_WINDOW_OPEN),
	DEFINE_KVM_EXIT_REASON(KVM_EXIT_SHUTDOWN),
	DEFINE_KVM_EXIT_REASON(KVM_EXIT_FAIL_ENTRY),
	DEFINE_KVM_EXIT_REASON(KVM_EXIT_INTR),
	DEFINE_KVM_EXIT_REASON(KVM_EXIT_SET_TPR),
	DEFINE_KVM_EXIT_REASON(KVM_EXIT_TPR_ACCESS),
	DEFINE_KVM_EXIT_REASON(KVM_EXIT_S390_SIEIC),
	DEFINE_KVM_EXIT_REASON(KVM_EXIT_S390_RESET),
	DEFINE_KVM_EXIT_REASON(KVM_EXIT_DCR),
	DEFINE_KVM_EXIT_REASON(KVM_EXIT_NMI),
	DEFINE_KVM_EXIT_REASON(KVM_EXIT_INTERNAL_ERROR),
};
#define DEFINE_KVM_EXT(ext)	\
	.name = #ext,		\
	.code = ext
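/*
 * KVM extensions the setup code below depends on; they are checked at init
 * time by kvm__check_extensions(). The element type is reconstructed from how
 * kvm_req_ext is used (a name string plus a capability code); the exact
 * declaration in the full source may differ.
 */
static struct {
	const char *name;
	int code;
} kvm_req_ext[] = {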
	{ DEFINE_KVM_EXT(KVM_CAP_COALESCED_MMIO) },
	{ DEFINE_KVM_EXT(KVM_CAP_SET_TSS_ADDR) },
	{ DEFINE_KVM_EXT(KVM_CAP_PIT2) },
	{ DEFINE_KVM_EXT(KVM_CAP_USER_MEMORY) },
	{ DEFINE_KVM_EXT(KVM_CAP_IRQ_ROUTING) },
	{ DEFINE_KVM_EXT(KVM_CAP_IRQCHIP) },
	{ DEFINE_KVM_EXT(KVM_CAP_HLT) },
	{ DEFINE_KVM_EXT(KVM_CAP_IRQ_INJECT_STATUS) },
	{ DEFINE_KVM_EXT(KVM_CAP_EXT_CPUID) },
};
extern struct kvm *kvm;
extern struct kvm_cpu *kvm_cpus[KVM_NR_CPUS];

static int pause_event;
static DEFINE_MUTEX(pause_lock);
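/*
 * KVM_CHECK_EXTENSION returns a negative value on ioctl failure and otherwise
 * a non-negative capability value (0 = absent, >0 = present); the helper
 * below only screens out ioctl failures.
 */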
static bool kvm__supports_extension(struct kvm *kvm, unsigned int extension)
{
	int ret;

	ret = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, extension);
	if (ret < 0)
		return false;

	return true;
}
static int kvm__check_extensions(struct kvm *kvm)
{
	unsigned int i;

	for (i = 0; i < ARRAY_SIZE(kvm_req_ext); i++) {
		if (!kvm__supports_extension(kvm, kvm_req_ext[i].code)) {
			pr_error("Unsupported KVM extension detected: %s",
				kvm_req_ext[i].name);
			return -1;	/* caller only checks for non-zero */
		}
	}

	return 0;
}
static struct kvm *kvm__new(void)
{
	struct kvm *kvm = calloc(1, sizeof *kvm);

	if (!kvm)
		die("out of memory");

	return kvm;
}
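/*
 * Per-instance PID files live under $HOME/.kvm-tools/<name>.pid; they are
 * created at init time, looked up by kvm__get_pid_by_instance() and removed
 * again from kvm__delete().
 */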
static void kvm__create_pidfile(struct kvm *kvm)
{
	int fd;
	char full_name[PATH_MAX], pid[10];

	sprintf(full_name, "%s/%s", HOME_DIR, KVM_PID_FILE_PATH);
	mkdir(full_name, 0777);
	sprintf(full_name, "%s/%s/%s.pid", HOME_DIR, KVM_PID_FILE_PATH, kvm->name);
	fd = open(full_name, O_CREAT | O_WRONLY, 0666);
	sprintf(pid, "%u\n", getpid());
	if (write(fd, pid, strlen(pid)) <= 0)
		die("Failed creating PID file");
	close(fd);
}
void kvm__remove_pidfile(const char *name)
{
	char full_name[PATH_MAX];

	sprintf(full_name, "%s/%s/%s.pid", HOME_DIR, KVM_PID_FILE_PATH, name);
	unlink(full_name);
}
pid_t kvm__get_pid_by_instance(const char *name)
{
	int fd, pid;
	char pid_str[10], pid_file[PATH_MAX];

	sprintf(pid_file, "%s/%s/%s.pid", HOME_DIR, KVM_PID_FILE_PATH, name);
	fd = open(pid_file, O_RDONLY);
	if (fd < 0)
		return -1;

	if (read(fd, pid_str, 10) == 0) {
		close(fd);
		return -1;
	}

	pid = atoi(pid_str);
	close(fd);

	return pid;
}
int kvm__enumerate_instances(int (*callback)(const char *name, int pid))
{
	char full_name[PATH_MAX];
	int pid, ret = 0;
	DIR *dir;
	struct dirent entry, *result;

	sprintf(full_name, "%s/%s", HOME_DIR, KVM_PID_FILE_PATH);
	dir = opendir(full_name);

	while (dir != NULL) {
		readdir_r(dir, &entry, &result);
		if (result == NULL)
			break;
		if (entry.d_type == DT_REG) {
			/* strip the ".pid" suffix to recover the instance name */
			entry.d_name[strlen(entry.d_name)-4] = 0;
			pid = kvm__get_pid_by_instance(entry.d_name);
			ret = callback(entry.d_name, pid);
		}
	}

	if (dir)
		closedir(dir);

	return ret;
}
void kvm__delete(struct kvm *kvm)
{
	kvm__stop_timer(kvm);

	munmap(kvm->ram_start, kvm->ram_size);
	kvm__remove_pidfile(kvm->name);
	free(kvm);
}
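/*
 * Hardware virtualization support is probed via CPUID: the vendor id selects
 * which feature bit to test (VMX for Intel, SVM for AMD) and which leaf range
 * to use, then the feature leaf's ECX is checked for that bit.
 */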
static bool kvm__cpu_supports_vm(void)
{
	struct cpuid_regs regs;
	u32 eax_base;
	int feature;

	/* host_cpuid() (assumed helper from "kvm/cpufeature.h") runs CPUID with regs as in/out */
	regs = (struct cpuid_regs) {
		.eax = 0x00,
	};
	host_cpuid(&regs);

	switch (regs.ebx) {
	case CPUID_VENDOR_INTEL_1:
		eax_base = 0x00;
		feature  = KVM__X86_FEATURE_VMX;
		break;
	case CPUID_VENDOR_AMD_1:
		eax_base = 0x80000000;
		feature  = KVM__X86_FEATURE_SVM;
		break;
	default:
		return false;
	}

	regs = (struct cpuid_regs) {
		.eax = eax_base,
	};
	host_cpuid(&regs);

	if (regs.eax < eax_base + 0x01)
		return false;

	regs = (struct cpuid_regs) {
		.eax = eax_base + 0x01
	};
	host_cpuid(&regs);

	return regs.ecx & (1 << feature);
}
/*
 * Note: KVM_SET_USER_MEMORY_REGION assumes that we don't pass overlapping
 * memory regions to it. Therefore, be careful if you use this function for
 * registering memory regions for emulating hardware.
 */
void kvm__register_mem(struct kvm *kvm, u64 guest_phys, u64 size, void *userspace_addr)
{
	struct kvm_userspace_memory_region mem;
	int ret;

	mem = (struct kvm_userspace_memory_region) {
		.slot			= kvm->mem_slots++,
		.guest_phys_addr	= guest_phys,
		.memory_size		= size,
		.userspace_addr		= (unsigned long)userspace_addr,
	};

	ret = ioctl(kvm->vm_fd, KVM_SET_USER_MEMORY_REGION, &mem);
	if (ret < 0)
		die_perror("KVM_SET_USER_MEMORY_REGION ioctl");
}
/*
 * Allocating RAM size bigger than 4GB requires us to leave a gap
 * in the RAM which is used for PCI MMIO, hotplug, and unconfigured
 * devices (see documentation of e820_setup_gap() for details).
 *
 * If we're required to initialize RAM bigger than 4GB, we will create
 * a gap between 0xe0000000 and 0x100000000 in the guest physical address space.
 */
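/*
 * Example: with 6GB of guest RAM the guest sees two slots,
 * [0, KVM_32BIT_GAP_START) and [0x100000000, 0x100000000 + (6GB - KVM_32BIT_GAP_START)),
 * while the host side stays one contiguous mmap() area whose gap pages are
 * simply never registered (and are mprotect()ed PROT_NONE at init time).
 */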
void kvm__init_ram(struct kvm *kvm)
{
	u64 phys_start, phys_size;
	void *host_mem;

	if (kvm->ram_size < KVM_32BIT_GAP_START) {
		/* Use a single block of RAM for 32bit RAM */

		phys_start = 0;
		phys_size  = kvm->ram_size;
		host_mem   = kvm->ram_start;

		kvm__register_mem(kvm, phys_start, phys_size, host_mem);
	} else {
		/* First RAM range from zero to the PCI gap: */

		phys_start = 0;
		phys_size  = KVM_32BIT_GAP_START;
		host_mem   = kvm->ram_start;

		kvm__register_mem(kvm, phys_start, phys_size, host_mem);

		/* Second RAM range from 4GB to the end of RAM: */

		phys_start = 0x100000000ULL;
		phys_size  = kvm->ram_size - phys_size;
		host_mem   = kvm->ram_start + phys_start;

		kvm__register_mem(kvm, phys_start, phys_size, host_mem);
	}
}
int kvm__recommended_cpus(struct kvm *kvm)
{
	int ret;

	ret = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, KVM_CAP_NR_VCPUS);
	if (ret <= 0)
		die_perror("KVM_CAP_NR_VCPUS");

	return ret;
}
/*
 * The following hack should be removed once 'x86: Raise the hard
 * VCPU count limit' makes its way into the mainline.
 */
#ifndef KVM_CAP_MAX_VCPUS
#define KVM_CAP_MAX_VCPUS 66
#endif
int kvm__max_cpus(struct kvm *kvm)
{
	int ret;

	ret = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, KVM_CAP_MAX_VCPUS);
	if (ret <= 0)
		ret = kvm__recommended_cpus(kvm);

	return ret;
}
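/*
 * kvm__init() brings up a VM in the usual KVM order: open the device node,
 * verify KVM_GET_API_VERSION, create the VM fd, check required extensions,
 * set the TSS address, create the in-kernel PIT and IRQCHIP, mmap() and
 * register guest RAM, and finally drop a pidfile for this instance.
 */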
struct kvm *kvm__init(const char *kvm_dev, u64 ram_size, const char *name)
{
	struct kvm_pit_config pit_config = { .flags = 0, };
	struct kvm *kvm;
	int ret;

	if (!kvm__cpu_supports_vm())
		die("Your CPU does not support hardware virtualization");

	kvm = kvm__new();

	kvm->sys_fd = open(kvm_dev, O_RDWR);
	if (kvm->sys_fd < 0) {
		if (errno == ENOENT)
			die("'%s' not found. Please make sure your kernel has CONFIG_KVM enabled and that the KVM modules are loaded.", kvm_dev);
		if (errno == ENODEV)
			die("'%s' KVM driver not available.\n # (If the KVM module is loaded then 'dmesg' may offer further clues about the failure.)", kvm_dev);

		fprintf(stderr, " Fatal, could not open %s: ", kvm_dev);
		perror(NULL);
		exit(1);
	}

	ret = ioctl(kvm->sys_fd, KVM_GET_API_VERSION, 0);
	if (ret != KVM_API_VERSION)
		die_perror("KVM_API_VERSION ioctl");

	kvm->vm_fd = ioctl(kvm->sys_fd, KVM_CREATE_VM, 0);
	if (kvm->vm_fd < 0)
		die_perror("KVM_CREATE_VM ioctl");

	if (kvm__check_extensions(kvm))
		die("A required KVM extension is not supported by OS");

	ret = ioctl(kvm->vm_fd, KVM_SET_TSS_ADDR, 0xfffbd000);
	if (ret < 0)
		die_perror("KVM_SET_TSS_ADDR ioctl");

	ret = ioctl(kvm->vm_fd, KVM_CREATE_PIT2, &pit_config);
	if (ret < 0)
		die_perror("KVM_CREATE_PIT2 ioctl");

	kvm->ram_size = ram_size;

	if (kvm->ram_size < KVM_32BIT_GAP_START) {
		kvm->ram_start = mmap(NULL, ram_size, PROT_RW, MAP_ANON_NORESERVE, -1, 0);
	} else {
		kvm->ram_start = mmap(NULL, ram_size + KVM_32BIT_GAP_SIZE, PROT_RW, MAP_ANON_NORESERVE, -1, 0);
		if (kvm->ram_start != MAP_FAILED) {
			/*
			 * We mprotect the gap (see kvm__init_ram() for details) PROT_NONE so that
			 * if we accidentally write to it, we will know.
			 */
			mprotect(kvm->ram_start + KVM_32BIT_GAP_START, KVM_32BIT_GAP_SIZE, PROT_NONE);
		}
	}
	if (kvm->ram_start == MAP_FAILED)
		die("out of memory");

	madvise(kvm->ram_start, kvm->ram_size, MADV_MERGEABLE);

	ret = ioctl(kvm->vm_fd, KVM_CREATE_IRQCHIP);
	if (ret < 0)
		die_perror("KVM_CREATE_IRQCHIP ioctl");

	kvm->name = strdup(name);

	kvm__create_pidfile(kvm);

	return kvm;
}
#define BOOT_LOADER_SELECTOR	0x1000
#define BOOT_LOADER_IP		0x0000
#define BOOT_LOADER_SP		0x8000
#define BOOT_CMDLINE_OFFSET	0x20000

#define BOOT_PROTOCOL_REQUIRED	0x206
#define LOAD_HIGH		0x01
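/*
 * BOOT_LOADER_SELECTOR:BOOT_LOADER_IP is a real-mode segment:offset pair, so
 * the kernel image is loaded at physical address 0x1000 << 4 = 0x10000, with
 * the boot stack pointer at 0x8000 in the same segment and the command line
 * at flat address 0x20000.
 */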
static int load_flat_binary(struct kvm *kvm, int fd)
{
	void *p;
	int nr;

	if (lseek(fd, 0, SEEK_SET) < 0)
		die_perror("lseek");

	p = guest_real_to_host(kvm, BOOT_LOADER_SELECTOR, BOOT_LOADER_IP);

	while ((nr = read(fd, p, 65536)) > 0)
		p += nr;

	kvm->boot_selector	= BOOT_LOADER_SELECTOR;
	kvm->boot_ip		= BOOT_LOADER_IP;
	kvm->boot_sp		= BOOT_LOADER_SP;

	return true;
}
static const char *BZIMAGE_MAGIC = "HdrS";
static bool load_bzimage(struct kvm *kvm, int fd_kernel,
			int fd_initrd, const char *kernel_cmdline, u16 vidmode)
{
	struct boot_params *kern_boot;
	unsigned long setup_sects;
	struct boot_params boot;
	size_t cmdline_size;
	ssize_t setup_size;
	void *p;
	int nr;

	/*
	 * See Documentation/x86/boot.txt for details on bzImage on-disk and
	 * memory layout.
	 */
	if (lseek(fd_kernel, 0, SEEK_SET) < 0)
		die_perror("lseek");

	if (read(fd_kernel, &boot, sizeof(boot)) != sizeof(boot))
		return false;

	if (memcmp(&boot.hdr.header, BZIMAGE_MAGIC, strlen(BZIMAGE_MAGIC)))
		return false;

	if (boot.hdr.version < BOOT_PROTOCOL_REQUIRED)
		die("Too old kernel");

	if (lseek(fd_kernel, 0, SEEK_SET) < 0)
		die_perror("lseek");

	if (!boot.hdr.setup_sects)
		boot.hdr.setup_sects = BZ_DEFAULT_SETUP_SECTS;
	setup_sects = boot.hdr.setup_sects + 1;

	setup_size = setup_sects << 9;
	p = guest_real_to_host(kvm, BOOT_LOADER_SELECTOR, BOOT_LOADER_IP);

	/* copy setup.bin to mem */
	if (read(fd_kernel, p, setup_size) != setup_size)
		die_perror("read");

	/* copy vmlinux.bin to BZ_KERNEL_START */
	p = guest_flat_to_host(kvm, BZ_KERNEL_START);

	while ((nr = read(fd_kernel, p, 65536)) > 0)
		p += nr;

	p = guest_flat_to_host(kvm, BOOT_CMDLINE_OFFSET);
	if (kernel_cmdline) {
		cmdline_size = strlen(kernel_cmdline) + 1;
		if (cmdline_size > boot.hdr.cmdline_size)
			cmdline_size = boot.hdr.cmdline_size;

		memset(p, 0, boot.hdr.cmdline_size);
		memcpy(p, kernel_cmdline, cmdline_size - 1);
	}

	kern_boot = guest_real_to_host(kvm, BOOT_LOADER_SELECTOR, 0x00);

	kern_boot->hdr.cmd_line_ptr	= BOOT_CMDLINE_OFFSET;
	kern_boot->hdr.type_of_loader	= 0xff;
	kern_boot->hdr.heap_end_ptr	= 0xfe00;
	kern_boot->hdr.loadflags	|= CAN_USE_HEAP;
	kern_boot->hdr.vid_mode		= vidmode;

	/*
	 * Read initrd image into guest memory
	 */
	if (fd_initrd >= 0) {
		struct stat initrd_stat;
		unsigned long addr;

		if (fstat(fd_initrd, &initrd_stat))
			die_perror("fstat");

		addr = boot.hdr.initrd_addr_max & ~0xfffff;
		for (;;) {
			if (addr < BZ_KERNEL_START)
				die("Not enough memory for initrd");
			else if (addr < (kvm->ram_size - initrd_stat.st_size))
				break;
			addr -= 0x100000;
		}

		p = guest_flat_to_host(kvm, addr);
		nr = read(fd_initrd, p, initrd_stat.st_size);
		if (nr != initrd_stat.st_size)
			die("Failed to read initrd");

		kern_boot->hdr.ramdisk_image	= addr;
		kern_boot->hdr.ramdisk_size	= initrd_stat.st_size;
	}

	kvm->boot_selector = BOOT_LOADER_SELECTOR;
	/*
	 * The real-mode setup code starts at offset 0x200 of a bzImage. See
	 * Documentation/x86/boot.txt for details.
	 */
	kvm->boot_ip = BOOT_LOADER_IP + 0x200;
	kvm->boot_sp = BOOT_LOADER_SP;

	return true;
}
#define GZIP_ID1	0x1f
#define GZIP_ID2	0x8b
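/*
 * First two bytes of a gzip stream; used below to sanity-check that the file
 * passed as an initrd really looks like a compressed image.
 */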
static bool initrd_check(int fd)
{
	unsigned char id[2];

	if (read_in_full(fd, id, ARRAY_SIZE(id)) < 0)
		return false;

	if (lseek(fd, 0, SEEK_SET) < 0)
		die_perror("lseek");

	return id[0] == GZIP_ID1 && id[1] == GZIP_ID2;
}
bool kvm__load_kernel(struct kvm *kvm, const char *kernel_filename,
		const char *initrd_filename, const char *kernel_cmdline, u16 vidmode)
{
	bool ret;
	int fd_kernel = -1, fd_initrd = -1;

	fd_kernel = open(kernel_filename, O_RDONLY);
	if (fd_kernel < 0)
		die("Unable to open kernel %s", kernel_filename);

	if (initrd_filename) {
		fd_initrd = open(initrd_filename, O_RDONLY);
		if (fd_initrd < 0)
			die("Unable to open initrd %s", initrd_filename);

		if (!initrd_check(fd_initrd))
			die("%s is not an initrd", initrd_filename);
	}

	ret = load_bzimage(kvm, fd_kernel, fd_initrd, kernel_cmdline, vidmode);
	if (!ret) {
		pr_warning("%s is not a bzImage. Trying to load it as a flat binary...", kernel_filename);

		ret = load_flat_binary(kvm, fd_kernel);
	}

	if (fd_initrd >= 0)
		close(fd_initrd);
	close(fd_kernel);

	if (!ret)
		die("%s is not a valid bzImage or flat binary", kernel_filename);

	return ret;
}
/**
 * kvm__setup_bios - inject BIOS into guest system memory
 * @kvm - guest system descriptor
 *
 * This function is a main routine where we poke guest memory
 * and install BIOS there.
 */
void kvm__setup_bios(struct kvm *kvm)
{
	/* standard minimal configuration */

	/* FIXME: SMP, ACPI and friends here */

	mptable_setup(kvm, kvm->nrcpus);
}
#define TIMER_INTERVAL_NS 1000000	/* 1 msec */

/*
 * This function sets up a timer that's used to inject interrupts from the
 * userspace hypervisor into the guest at periodic intervals. Please note
 * that the clock interrupt, for example, is not handled here.
 */
void kvm__start_timer(struct kvm *kvm)
{
	struct itimerspec its;
	struct sigevent sev;

	memset(&sev, 0, sizeof(struct sigevent));
	sev.sigev_value.sival_int	= 0;
	sev.sigev_notify		= SIGEV_THREAD_ID;
	sev.sigev_signo			= SIGALRM;
	sev._sigev_un._tid		= syscall(__NR_gettid);

	if (timer_create(CLOCK_REALTIME, &sev, &kvm->timerid) < 0)
		die("timer_create()");

	its.it_value.tv_sec	= TIMER_INTERVAL_NS / 1000000000;
	its.it_value.tv_nsec	= TIMER_INTERVAL_NS % 1000000000;
	its.it_interval.tv_sec	= its.it_value.tv_sec;
	its.it_interval.tv_nsec	= its.it_value.tv_nsec;

	if (timer_settime(kvm->timerid, 0, &its, NULL) < 0)
		die("timer_settime()");
}
void kvm__stop_timer(struct kvm *kvm)
{
	if (kvm->timerid)
		if (timer_delete(kvm->timerid) < 0)
			die("timer_delete()");

	kvm->timerid = 0;
}
void kvm__irq_line(struct kvm *kvm, int irq, int level)
{
	struct kvm_irq_level irq_level;

	irq_level = (struct kvm_irq_level) {
		{
			.irq	= irq,
		},
		.level	= level,
	};

	if (ioctl(kvm->vm_fd, KVM_IRQ_LINE, &irq_level) < 0)
		die_perror("KVM_IRQ_LINE failed");
}
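/*
 * Edge-triggered interrupts are delivered by pulsing the line: raise it and
 * then immediately lower it again.
 */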
void kvm__irq_trigger(struct kvm *kvm, int irq)
{
	kvm__irq_line(kvm, irq, 1);
	kvm__irq_line(kvm, irq, 0);
}
void kvm__dump_mem(struct kvm *kvm, unsigned long addr, unsigned long size)
{
	unsigned char *p;
	unsigned long n;

	size &= ~7; /* mod 8 */

	p = guest_flat_to_host(kvm, addr);

	for (n = 0; n < size; n += 8) {
		if (!host_ptr_in_ram(kvm, p + n))
			break;

		printf(" 0x%08lx: %02x %02x %02x %02x %02x %02x %02x %02x\n",
			addr + n, p[n + 0], p[n + 1], p[n + 2], p[n + 3],
			p[n + 4], p[n + 5], p[n + 6], p[n + 7]);
	}
}
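/*
 * Pausing works with a simple handshake: kvm__pause() signals every VCPU
 * thread with SIGKVMPAUSE while holding pause_lock, each VCPU acknowledges
 * through the pause_event eventfd from kvm__notify_paused() and then blocks
 * on pause_lock until kvm__continue() releases it.
 */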
void kvm__pause(void)
{
	int i, paused_vcpus = 0;

	/* Check if the guest is running */
	if (!kvm_cpus[0] || kvm_cpus[0]->thread == 0)
		return;

	mutex_lock(&pause_lock);

	pause_event = eventfd(0, 0);
	if (pause_event < 0)
		die("Failed creating pause notification event");
	for (i = 0; i < kvm->nrcpus; i++)
		pthread_kill(kvm_cpus[i]->thread, SIGKVMPAUSE);

	while (paused_vcpus < kvm->nrcpus) {
		u64 cur_read;

		if (read(pause_event, &cur_read, sizeof(cur_read)) < 0)
			die("Failed reading pause event");
		paused_vcpus += cur_read;
	}
}
void kvm__continue(void)
{
	/* Check if the guest is running */
	if (!kvm_cpus[0] || kvm_cpus[0]->thread == 0)
		return;

	mutex_unlock(&pause_lock);
}
void kvm__notify_paused(void)
{
	u64 p = 1;

	if (write(pause_event, &p, sizeof(p)) < 0)
		die("Failed notifying of paused VCPU.");

	mutex_lock(&pause_lock);
	mutex_unlock(&pause_lock);
}