4 * Copyright (C) 2006-2008 Qumranet Technologies
6 * Licensed under the terms of the GNU GPL version 2 or higher.
9 #include "config-host.h"
19 #include "qemu-common.h"
26 #include <sys/utsname.h>
27 #include <sys/syscall.h>
34 extern void perror(const char *s
);
36 kvm_context_t kvm_context
;
40 pthread_mutex_t qemu_mutex
= PTHREAD_MUTEX_INITIALIZER
;
41 pthread_cond_t qemu_vcpu_cond
= PTHREAD_COND_INITIALIZER
;
42 pthread_cond_t qemu_system_cond
= PTHREAD_COND_INITIALIZER
;
43 pthread_cond_t qemu_pause_cond
= PTHREAD_COND_INITIALIZER
;
44 pthread_cond_t qemu_work_cond
= PTHREAD_COND_INITIALIZER
;
45 __thread
struct vcpu_info
*vcpu
;
47 static int qemu_system_ready
;
49 #define SIG_IPI (SIGRTMIN+4)
51 struct qemu_kvm_work_item
{
52 struct qemu_kvm_work_item
*next
;
53 void (*func
)(void *data
);
67 struct qemu_kvm_work_item
*queued_work_first
, *queued_work_last
;
71 static int io_thread_fd
= -1;
72 static int io_thread_sigfd
= -1;
74 static int kvm_debug_stop_requested
;
/* Return the Linux kernel thread id (tid) of the calling thread — this is
 * the per-thread id, not the process pid and not a pthread_t.  glibc of
 * this era had no gettid() wrapper, hence the raw syscall. */
static inline unsigned long kvm_get_thread_id(void)
{
    return syscall(SYS_gettid);
}
81 static void qemu_cond_wait(pthread_cond_t
*cond
)
83 CPUState
*env
= cpu_single_env
;
84 static const struct timespec ts
= {
89 pthread_cond_timedwait(cond
, &qemu_mutex
, &ts
);
90 /* If we're the I/O thread, some other thread may be waiting for aio
97 CPUState
*qemu_kvm_cpu_env(int index
)
99 return vcpu_info
[index
].env
;
102 static void sig_ipi_handler(int n
)
106 static void on_vcpu(CPUState
*env
, void (*func
)(void *data
), void *data
)
108 struct vcpu_info
*vi
= &vcpu_info
[env
->cpu_index
];
109 struct qemu_kvm_work_item wi
;
118 if (!vi
->queued_work_first
)
119 vi
->queued_work_first
= &wi
;
121 vi
->queued_work_last
->next
= &wi
;
122 vi
->queued_work_last
= &wi
;
126 pthread_kill(vi
->thread
, SIG_IPI
);
128 qemu_cond_wait(&qemu_work_cond
);
131 void kvm_update_interrupt_request(CPUState
*env
)
138 if (vcpu
&& env
!= vcpu
->env
&& !vcpu_info
[env
->cpu_index
].signalled
)
142 vcpu_info
[env
->cpu_index
].signalled
= 1;
143 if (vcpu_info
[env
->cpu_index
].thread
)
144 pthread_kill(vcpu_info
[env
->cpu_index
].thread
, SIG_IPI
);
149 void kvm_update_after_sipi(CPUState
*env
)
151 vcpu_info
[env
->cpu_index
].sipi_needed
= 1;
152 kvm_update_interrupt_request(env
);
155 void kvm_apic_init(CPUState
*env
)
157 if (env
->cpu_index
!= 0)
158 vcpu_info
[env
->cpu_index
].init
= 1;
159 kvm_update_interrupt_request(env
);
/* kvm_callbacks hook: delegate interrupt injection to the arch layer. */
static int try_push_interrupts(void *opaque)
{
    return kvm_arch_try_push_interrupts(opaque);
}
169 static void post_kvm_run(void *opaque
, int vcpu
)
172 pthread_mutex_lock(&qemu_mutex
);
173 kvm_arch_post_kvm_run(opaque
, vcpu
);
176 static int pre_kvm_run(void *opaque
, int vcpu
)
178 CPUState
*env
= qemu_kvm_cpu_env(vcpu
);
180 kvm_arch_pre_kvm_run(opaque
, vcpu
);
182 if (env
->interrupt_request
& CPU_INTERRUPT_EXIT
)
184 pthread_mutex_unlock(&qemu_mutex
);
188 static void kvm_do_load_registers(void *_env
)
190 CPUState
*env
= _env
;
192 kvm_arch_load_regs(env
);
195 void kvm_load_registers(CPUState
*env
)
197 if (kvm_enabled() && qemu_system_ready
)
198 on_vcpu(env
, kvm_do_load_registers
, env
);
201 static void kvm_do_save_registers(void *_env
)
203 CPUState
*env
= _env
;
205 kvm_arch_save_regs(env
);
208 void kvm_save_registers(CPUState
*env
)
211 on_vcpu(env
, kvm_do_save_registers
, env
);
214 int kvm_cpu_exec(CPUState
*env
)
218 r
= kvm_run(kvm_context
, env
->cpu_index
);
220 printf("kvm_run returned %d\n", r
);
227 extern int vm_running
;
229 static int has_work(CPUState
*env
)
231 if (!vm_running
|| (env
&& vcpu_info
[env
->cpu_index
].stopped
))
235 return kvm_arch_has_work(env
);
238 static void flush_queued_work(CPUState
*env
)
240 struct vcpu_info
*vi
= &vcpu_info
[env
->cpu_index
];
241 struct qemu_kvm_work_item
*wi
;
243 if (!vi
->queued_work_first
)
246 while ((wi
= vi
->queued_work_first
)) {
247 vi
->queued_work_first
= wi
->next
;
251 vi
->queued_work_last
= NULL
;
252 pthread_cond_broadcast(&qemu_work_cond
);
255 static void kvm_main_loop_wait(CPUState
*env
, int timeout
)
262 pthread_mutex_unlock(&qemu_mutex
);
264 ts
.tv_sec
= timeout
/ 1000;
265 ts
.tv_nsec
= (timeout
% 1000) * 1000000;
266 sigemptyset(&waitset
);
267 sigaddset(&waitset
, SIG_IPI
);
269 r
= sigtimedwait(&waitset
, &siginfo
, &ts
);
272 pthread_mutex_lock(&qemu_mutex
);
274 if (r
== -1 && !(e
== EAGAIN
|| e
== EINTR
)) {
275 printf("sigtimedwait: %s\n", strerror(e
));
279 cpu_single_env
= env
;
280 flush_queued_work(env
);
282 if (vcpu_info
[env
->cpu_index
].stop
) {
283 vcpu_info
[env
->cpu_index
].stop
= 0;
284 vcpu_info
[env
->cpu_index
].stopped
= 1;
285 pthread_cond_signal(&qemu_pause_cond
);
288 vcpu_info
[env
->cpu_index
].signalled
= 0;
291 static int all_threads_paused(void)
295 for (i
= 0; i
< smp_cpus
; ++i
)
296 if (vcpu_info
[i
].stop
)
301 static void pause_all_threads(void)
305 assert(!cpu_single_env
);
307 for (i
= 0; i
< smp_cpus
; ++i
) {
308 vcpu_info
[i
].stop
= 1;
309 pthread_kill(vcpu_info
[i
].thread
, SIG_IPI
);
311 while (!all_threads_paused())
312 qemu_cond_wait(&qemu_pause_cond
);
315 static void resume_all_threads(void)
319 assert(!cpu_single_env
);
321 for (i
= 0; i
< smp_cpus
; ++i
) {
322 vcpu_info
[i
].stop
= 0;
323 vcpu_info
[i
].stopped
= 0;
324 pthread_kill(vcpu_info
[i
].thread
, SIG_IPI
);
/* VM run-state change hook: resume the vcpu threads when the VM starts
 * running, pause them otherwise.
 * NOTE(review): only the resume branch is visible in the mangled source;
 * the if/else shape was reconstructed — confirm against the original. */
static void kvm_vm_state_change_handler(void *context, int running)
{
    if (running)
        resume_all_threads();
    else
        pause_all_threads();
}
336 static void update_regs_for_sipi(CPUState
*env
)
338 kvm_arch_update_regs_for_sipi(env
);
339 vcpu_info
[env
->cpu_index
].sipi_needed
= 0;
340 vcpu_info
[env
->cpu_index
].init
= 0;
343 static void update_regs_for_init(CPUState
*env
)
346 kvm_arch_load_regs(env
);
349 static void setup_kernel_sigmask(CPUState
*env
)
354 sigaddset(&set
, SIGUSR2
);
355 sigaddset(&set
, SIGIO
);
356 sigaddset(&set
, SIGALRM
);
357 sigprocmask(SIG_BLOCK
, &set
, NULL
);
359 sigprocmask(SIG_BLOCK
, NULL
, &set
);
360 sigdelset(&set
, SIG_IPI
);
362 kvm_set_signal_mask(kvm_context
, env
->cpu_index
, &set
);
365 void qemu_kvm_system_reset(void)
373 for (i
= 0; i
< smp_cpus
; ++i
)
374 kvm_arch_cpu_reset(vcpu_info
[i
].env
);
376 resume_all_threads();
379 static int kvm_main_loop_cpu(CPUState
*env
)
381 struct vcpu_info
*info
= &vcpu_info
[env
->cpu_index
];
383 setup_kernel_sigmask(env
);
385 pthread_mutex_lock(&qemu_mutex
);
386 if (kvm_irqchip_in_kernel(kvm_context
))
389 kvm_qemu_init_env(env
);
391 kvm_tpr_vcpu_start(env
);
394 cpu_single_env
= env
;
395 kvm_load_registers(env
);
398 while (!has_work(env
))
399 kvm_main_loop_wait(env
, 1000);
400 if (env
->interrupt_request
& CPU_INTERRUPT_HARD
)
402 if (!kvm_irqchip_in_kernel(kvm_context
) && info
->sipi_needed
)
403 update_regs_for_sipi(env
);
404 if (!kvm_irqchip_in_kernel(kvm_context
) && info
->init
)
405 update_regs_for_init(env
);
406 if (!env
->halted
&& !info
->init
)
408 env
->interrupt_request
&= ~CPU_INTERRUPT_EXIT
;
409 kvm_main_loop_wait(env
, 0);
411 pthread_mutex_unlock(&qemu_mutex
);
415 static void *ap_main_loop(void *_env
)
417 CPUState
*env
= _env
;
420 vcpu
= &vcpu_info
[env
->cpu_index
];
422 vcpu
->env
->thread_id
= kvm_get_thread_id();
423 sigfillset(&signals
);
424 sigprocmask(SIG_BLOCK
, &signals
, NULL
);
425 kvm_create_vcpu(kvm_context
, env
->cpu_index
);
426 kvm_qemu_init_env(env
);
428 /* signal VCPU creation */
429 pthread_mutex_lock(&qemu_mutex
);
431 pthread_cond_signal(&qemu_vcpu_cond
);
433 /* and wait for machine initialization */
434 while (!qemu_system_ready
)
435 qemu_cond_wait(&qemu_system_cond
);
436 pthread_mutex_unlock(&qemu_mutex
);
438 kvm_main_loop_cpu(env
);
442 void kvm_init_new_ap(int cpu
, CPUState
*env
)
444 pthread_create(&vcpu_info
[cpu
].thread
, NULL
, ap_main_loop
, env
);
446 while (vcpu_info
[cpu
].created
== 0)
447 qemu_cond_wait(&qemu_vcpu_cond
);
450 int kvm_init_ap(void)
455 qemu_add_vm_change_state_handler(kvm_vm_state_change_handler
, NULL
);
457 signal(SIG_IPI
, sig_ipi_handler
);
461 void qemu_kvm_notify_work(void)
467 if (io_thread_fd
== -1)
470 memcpy(buffer
, &value
, sizeof(value
));
475 len
= write(io_thread_fd
, buffer
+ offset
, 8 - offset
);
476 if (len
== -1 && errno
== EINTR
)
486 fprintf(stderr
, "failed to notify io thread\n");
489 /* If we have signalfd, we mask out the signals we want to handle and then
490 * use signalfd to listen for them. We rely on whatever the current signal
491 * handler is to dispatch the signals when we receive them.
494 static void sigfd_handler(void *opaque
)
496 int fd
= (unsigned long)opaque
;
497 struct signalfd_siginfo info
;
498 struct sigaction action
;
503 len
= read(fd
, &info
, sizeof(info
));
504 } while (len
== -1 && errno
== EINTR
);
506 if (len
== -1 && errno
== EAGAIN
)
509 if (len
!= sizeof(info
)) {
510 printf("read from sigfd returned %ld: %m\n", len
);
514 sigaction(info
.ssi_signo
, NULL
, &action
);
515 if (action
.sa_handler
)
516 action
.sa_handler(info
.ssi_signo
);
521 /* Used to break IO thread out of select */
522 static void io_thread_wakeup(void *opaque
)
524 int fd
= (unsigned long)opaque
;
531 len
= read(fd
, buffer
+ offset
, 8 - offset
);
532 if (len
== -1 && errno
== EINTR
)
542 int kvm_main_loop(void)
548 io_thread
= pthread_self();
549 qemu_system_ready
= 1;
551 if (kvm_eventfd(fds
) == -1) {
552 fprintf(stderr
, "failed to create eventfd\n");
556 qemu_set_fd_handler2(fds
[0], NULL
, io_thread_wakeup
, NULL
,
557 (void *)(unsigned long)fds
[0]);
559 io_thread_fd
= fds
[1];
562 sigaddset(&mask
, SIGIO
);
563 sigaddset(&mask
, SIGALRM
);
564 sigprocmask(SIG_BLOCK
, &mask
, NULL
);
566 sigfd
= kvm_signalfd(&mask
);
568 fprintf(stderr
, "failed to create signalfd\n");
572 fcntl(sigfd
, F_SETFL
, O_NONBLOCK
);
574 qemu_set_fd_handler2(sigfd
, NULL
, sigfd_handler
, NULL
,
575 (void *)(unsigned long)sigfd
);
577 pthread_cond_broadcast(&qemu_system_cond
);
579 io_thread_sigfd
= sigfd
;
580 cpu_single_env
= NULL
;
583 main_loop_wait(1000);
584 if (qemu_shutdown_requested())
586 else if (qemu_powerdown_requested())
587 qemu_system_powerdown();
588 else if (qemu_reset_requested())
589 qemu_kvm_system_reset();
590 else if (kvm_debug_stop_requested
) {
592 kvm_debug_stop_requested
= 0;
597 pthread_mutex_unlock(&qemu_mutex
);
602 static int kvm_debug(void *opaque
, int vcpu
)
604 kvm_debug_stop_requested
= 1;
605 vcpu_info
[vcpu
].stopped
= 1;
/* kvm_callbacks PIO-in hook (byte): forward to qemu's ioport dispatch. */
static int kvm_inb(void *opaque, uint16_t addr, uint8_t *data)
{
    *data = cpu_inb(0, addr);
    return 0;
}
/* kvm_callbacks PIO-in hook (word): forward to qemu's ioport dispatch. */
static int kvm_inw(void *opaque, uint16_t addr, uint16_t *data)
{
    *data = cpu_inw(0, addr);
    return 0;
}
/* kvm_callbacks PIO-in hook (long): forward to qemu's ioport dispatch. */
static int kvm_inl(void *opaque, uint16_t addr, uint32_t *data)
{
    *data = cpu_inl(0, addr);
    return 0;
}
627 #define PM_IO_BASE 0xb000
629 static int kvm_outb(void *opaque
, uint16_t addr
, uint8_t data
)
634 cpu_outb(0, 0xb3, 0);
641 x
= cpu_inw(0, PM_IO_BASE
+ 4);
643 cpu_outw(0, PM_IO_BASE
+ 4, x
);
650 x
= cpu_inw(0, PM_IO_BASE
+ 4);
652 cpu_outw(0, PM_IO_BASE
+ 4, x
);
660 cpu_outb(0, addr
, data
);
/* kvm_callbacks PIO-out hook (word): forward to qemu's ioport dispatch. */
static int kvm_outw(void *opaque, uint16_t addr, uint16_t data)
{
    cpu_outw(0, addr, data);
    return 0;
}
/* kvm_callbacks PIO-out hook (long): forward to qemu's ioport dispatch. */
static int kvm_outl(void *opaque, uint16_t addr, uint32_t data)
{
    cpu_outl(0, addr, data);
    return 0;
}
/* kvm_callbacks MMIO-read hook: copy @len bytes from guest physical
 * memory at @addr into @data (is_write = 0). */
static int kvm_mmio_read(void *opaque, uint64_t addr, uint8_t *data, int len)
{
    cpu_physical_memory_rw(addr, data, len, 0);
    return 0;
}
/* kvm_callbacks MMIO-write hook: copy @len bytes from @data into guest
 * physical memory at @addr (is_write = 1). */
static int kvm_mmio_write(void *opaque, uint64_t addr, uint8_t *data, int len)
{
    cpu_physical_memory_rw(addr, data, len, 1);
    return 0;
}
/* kvm_callbacks io-window hook.
 * NOTE(review): body reconstructed (extraction lost it); upstream
 * returns 1 to bounce back to the main loop — confirm against the tree. */
static int kvm_io_window(void *opaque)
{
    return 1;
}
/* kvm_callbacks halt hook: delegate HLT handling to the arch layer. */
static int kvm_halt(void *opaque, int vcpu)
{
    return kvm_arch_halt(opaque, vcpu);
}
699 static int kvm_shutdown(void *opaque
, int vcpu
)
701 /* stop the current vcpu from going back to guest mode */
702 vcpu_info
[cpu_single_env
->cpu_index
].stopped
= 1;
704 qemu_system_reset_request();
708 static struct kvm_callbacks qemu_kvm_ops
= {
716 .mmio_read
= kvm_mmio_read
,
717 .mmio_write
= kvm_mmio_write
,
719 .shutdown
= kvm_shutdown
,
720 .io_window
= kvm_io_window
,
721 .try_push_interrupts
= try_push_interrupts
,
722 .post_kvm_run
= post_kvm_run
,
723 .pre_kvm_run
= pre_kvm_run
,
725 .tpr_access
= handle_tpr_access
,
728 .powerpc_dcr_read
= handle_powerpc_dcr_read
,
729 .powerpc_dcr_write
= handle_powerpc_dcr_write
,
735 /* Try to initialize kvm */
736 kvm_context
= kvm_init(&qemu_kvm_ops
, cpu_single_env
);
740 pthread_mutex_lock(&qemu_mutex
);
745 int kvm_qemu_create_context(void)
749 kvm_disable_irqchip_creation(kvm_context
);
752 kvm_disable_pit_creation(kvm_context
);
754 if (kvm_create(kvm_context
, phys_ram_size
, (void**)&phys_ram_base
) < 0) {
758 r
= kvm_arch_qemu_create_context();
764 void kvm_qemu_destroy(void)
766 kvm_finalize(kvm_context
);
769 void kvm_cpu_register_physical_memory(target_phys_addr_t start_addr
,
771 unsigned long phys_offset
)
774 if (!(phys_offset
& ~TARGET_PAGE_MASK
)) {
775 r
= kvm_is_allocated_mem(kvm_context
, start_addr
, size
);
778 r
= kvm_is_intersecting_mem(kvm_context
, start_addr
);
780 kvm_create_mem_hole(kvm_context
, start_addr
, size
);
781 r
= kvm_register_phys_mem(kvm_context
, start_addr
,
782 phys_ram_base
+ phys_offset
,
785 if (phys_offset
& IO_MEM_ROM
) {
786 phys_offset
&= ~IO_MEM_ROM
;
787 r
= kvm_is_intersecting_mem(kvm_context
, start_addr
);
789 kvm_create_mem_hole(kvm_context
, start_addr
, size
);
790 r
= kvm_register_phys_mem(kvm_context
, start_addr
,
791 phys_ram_base
+ phys_offset
,
795 printf("kvm_cpu_register_physical_memory: failed\n");
801 int kvm_setup_guest_memory(void *area
, unsigned long size
)
806 if (kvm_enabled() && !kvm_has_sync_mmu(kvm_context
))
807 ret
= madvise(area
, size
, MADV_DONTFORK
);
816 int kvm_qemu_check_extension(int ext
)
818 return kvm_check_extension(kvm_context
, ext
);
821 int kvm_qemu_init_env(CPUState
*cenv
)
823 return kvm_arch_qemu_init_env(cenv
);
826 struct kvm_guest_debug_data
{
827 struct kvm_debug_guest dbg
;
831 void kvm_invoke_guest_debug(void *data
)
833 struct kvm_guest_debug_data
*dbg_data
= data
;
835 dbg_data
->err
= kvm_guest_debug(kvm_context
, cpu_single_env
->cpu_index
,
839 int kvm_update_debugger(CPUState
*env
)
841 struct kvm_guest_debug_data data
;
844 memset(data
.dbg
.breakpoints
, 0, sizeof(data
.dbg
.breakpoints
));
846 data
.dbg
.enabled
= 0;
847 if (env
->nb_breakpoints
|| env
->singlestep_enabled
) {
848 data
.dbg
.enabled
= 1;
849 for (i
= 0; i
< 4 && i
< env
->nb_breakpoints
; ++i
) {
850 data
.dbg
.breakpoints
[i
].enabled
= 1;
851 data
.dbg
.breakpoints
[i
].address
= env
->breakpoints
[i
];
853 data
.dbg
.singlestep
= env
->singlestep_enabled
;
855 on_vcpu(env
, kvm_invoke_guest_debug
, &data
);
861 * dirty pages logging
863 /* FIXME: use unsigned long pointer instead of unsigned char */
864 unsigned char *kvm_dirty_bitmap
= NULL
;
865 int kvm_physical_memory_set_dirty_tracking(int enable
)
873 if (!kvm_dirty_bitmap
) {
874 unsigned bitmap_size
= BITMAP_SIZE(phys_ram_size
);
875 kvm_dirty_bitmap
= qemu_malloc(bitmap_size
);
876 if (kvm_dirty_bitmap
== NULL
) {
877 perror("Failed to allocate dirty pages bitmap");
881 r
= kvm_dirty_pages_log_enable_all(kvm_context
);
886 if (kvm_dirty_bitmap
) {
887 r
= kvm_dirty_pages_log_reset(kvm_context
);
888 qemu_free(kvm_dirty_bitmap
);
889 kvm_dirty_bitmap
= NULL
;
895 /* get kvm's dirty pages bitmap and update qemu's */
896 int kvm_get_dirty_pages_log_range(unsigned long start_addr
,
897 unsigned char *bitmap
,
899 unsigned long mem_size
)
901 unsigned int i
, j
, n
=0;
903 unsigned page_number
, addr
, addr1
;
904 unsigned int len
= ((mem_size
/TARGET_PAGE_SIZE
) + 7) / 8;
907 * bitmap-traveling is faster than memory-traveling (for addr...)
908 * especially when most of the memory is not dirty.
910 for (i
=0; i
<len
; i
++) {
915 page_number
= i
* 8 + j
;
916 addr1
= page_number
* TARGET_PAGE_SIZE
;
917 addr
= offset
+ addr1
;
918 cpu_physical_memory_set_dirty(addr
);
/* Per-slot callback for kvm_get_dirty_pages_range: fold the slot's
 * dirty bitmap into qemu's dirty tracking.  @start is passed both as
 * the range start and as the bitmap offset. */
int kvm_get_dirty_bitmap_cb(unsigned long start, unsigned long len,
                            void *bitmap, void *opaque)
{
    return kvm_get_dirty_pages_log_range(start, bitmap, start, len);
}
931 * get kvm's dirty pages bitmap and update qemu's
932 * we only care about physical ram, which resides in slots 0 and 3
934 int kvm_update_dirty_pages_log(void)
939 r
= kvm_get_dirty_pages_range(kvm_context
, 0, phys_ram_size
,
940 kvm_dirty_bitmap
, NULL
,
941 kvm_get_dirty_bitmap_cb
);
945 int kvm_get_phys_ram_page_bitmap(unsigned char *bitmap
)
947 unsigned int bsize
= BITMAP_SIZE(phys_ram_size
);
948 unsigned int brsize
= BITMAP_SIZE(ram_size
);
949 unsigned int extra_pages
= (phys_ram_size
- ram_size
) / TARGET_PAGE_SIZE
;
950 unsigned int extra_bytes
= (extra_pages
+7)/8;
951 unsigned int hole_start
= BITMAP_SIZE(0xa0000);
952 unsigned int hole_end
= BITMAP_SIZE(0xc0000);
954 memset(bitmap
, 0xFF, brsize
+ extra_bytes
);
955 memset(bitmap
+ hole_start
, 0, hole_end
- hole_start
);
956 memset(bitmap
+ brsize
+ extra_bytes
, 0, bsize
- brsize
- extra_bytes
);
961 #ifdef KVM_CAP_IRQCHIP
963 int kvm_set_irq(int irq
, int level
)
965 return kvm_set_irq_level(kvm_context
, irq
, level
);
970 int qemu_kvm_get_dirty_pages(unsigned long phys_addr
, void *buf
)
972 return kvm_get_dirty_pages(kvm_context
, phys_addr
, buf
);
975 void *kvm_cpu_create_phys_mem(target_phys_addr_t start_addr
,
976 unsigned long size
, int log
, int writable
)
978 return kvm_create_phys_mem(kvm_context
, start_addr
, size
, log
, writable
);
981 void kvm_cpu_destroy_phys_mem(target_phys_addr_t start_addr
,
984 kvm_destroy_phys_mem(kvm_context
, start_addr
, size
);
987 void kvm_mutex_unlock(void)
989 assert(!cpu_single_env
);
990 pthread_mutex_unlock(&qemu_mutex
);
993 void kvm_mutex_lock(void)
995 pthread_mutex_lock(&qemu_mutex
);
996 cpu_single_env
= NULL
;
999 int qemu_kvm_register_coalesced_mmio(target_phys_addr_t addr
, unsigned int size
)
1001 return kvm_register_coalesced_mmio(kvm_context
, addr
, size
);
1004 int qemu_kvm_unregister_coalesced_mmio(target_phys_addr_t addr
,
1007 return kvm_unregister_coalesced_mmio(kvm_context
, addr
, size
);