4 * Copyright (C) 2004-2008 Fabrice Bellard
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * version 2 as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 #include "kqemu_int.h"
21 #include "monitor-image.h"
24 //#define DEBUG_INVALIDATE
26 static int mon_set_pte(struct kqemu_state
*s
, unsigned long vaddr
,
27 unsigned long page_index
, uint32_t pte_flags
);
28 static void *mon_alloc_page(struct kqemu_state
*s
,
29 unsigned long *ppage_index
);
/* Freestanding memcpy: the monitor/kernel module cannot rely on libc.
   Copies 'len' bytes from 's1' to 'd1' (regions must not overlap) and
   returns 'd1', matching the standard contract.
   NOTE(review): body reconstructed from a garbled extraction — the
   original loop lines were missing; confirm against upstream kqemu. */
void *memcpy(void *d1, const void *s1, size_t len)
{
    uint8_t *d = d1;
    const uint8_t *s = s1;

    while (len--)
        *d++ = *s++;
    return d1;
}
/* Freestanding memset: fills 'len' bytes of 'd1' with the low byte of
   'val' and returns 'd1', matching the standard contract.
   NOTE(review): body reconstructed — original loop lines were missing
   from the extraction; confirm against upstream kqemu. */
void *memset(void *d1, int val, size_t len)
{
    uint8_t *d = d1;

    while (len--)
        *d++ = val;
    return d1;
}
/* Pack a legacy 8-byte x86 segment descriptor into p[0..1].
   'addr' is the 32-bit base, 'limit' the 20-bit limit, 'flags' the
   access byte plus the high flag nibble (e.g. 0xc09a), placed at
   bits 8..23 of the second word.
   NOTE(review): the trailing '(flags << 8)' term and the p[] stores were
   missing from the extraction and are reconstructed — confirm. */
static void set_seg(uint32_t *p, unsigned long addr, unsigned long limit,
                    int flags)
{
    unsigned int e1, e2;

    e1 = (addr << 16) | (limit & 0xffff);
    e2 = ((addr >> 16) & 0xff) | (addr & 0xff000000) | (limit & 0x000f0000) |
        (flags << 8);
    p[0] = e1;
    p[1] = e2;
}
/* Pack a 16-byte system descriptor (TSS/LDT in long mode) into p[0..3]:
   same low 8 bytes as set_seg(), then bits 32..63 of the base in p[2]
   and a reserved zero word in p[3].
   NOTE(review): p[2]/p[3] stores reconstructed (missing from the
   extraction); the __x86_64__ guard is assumed — confirm upstream. */
static void set_seg64(uint32_t *p, unsigned long addr, unsigned long limit,
                      int flags)
{
    unsigned int e1, e2;

    e1 = (addr << 16) | (limit & 0xffff);
    e2 = ((addr >> 16) & 0xff) | (addr & 0xff000000) | (limit & 0x000f0000) |
        (flags << 8);
    p[0] = e1;
    p[1] = e2;
#ifdef __x86_64__
    p[2] = addr >> 32;
    p[3] = 0;
#endif
}
/* Pack an IDT gate descriptor into p[]: handler offset 'addr', selector
   'sel', gate 'type' (14 = interrupt gate, 15 = trap gate), privilege
   'dpl'; bit 15 of the second word is the present bit (0x8000).
   NOTE(review): the p[] stores and the 64-bit upper-offset words were
   missing from the extraction and are reconstructed — confirm. */
static void set_gate(uint32_t *p, unsigned int type, unsigned int dpl,
                     unsigned long addr, unsigned int sel)
{
    unsigned int e1, e2;

    e1 = (addr & 0xffff) | (sel << 16);
    e2 = (addr & 0xffff0000) | 0x8000 | (dpl << 13) | (type << 8);
    p[0] = e1;
    p[1] = e2;
#ifdef __x86_64__
    /* 16-byte IDT entries in long mode: offset bits 32..63 + reserved */
    p[2] = addr >> 32;
    p[3] = 0;
#endif
}
95 static void set_trap_gate(struct kqemu_state
*s
, int n
, int dpl
, void *addr
)
97 set_gate((uint32_t *)(s
->idt_table
+ IDT_ENTRY_SIZE
* n
),
98 15, dpl
, (unsigned long )addr
, s
->monitor_cs_sel
);
102 static void set_intr_gate(struct kqemu_state
*s
, int n
, int dpl
, unsigned long addr
)
104 set_gate((uint32_t *)(s
->idt_table
+ IDT_ENTRY_SIZE
* n
),
105 14, dpl
, addr
, s
->monitor_cs_sel
);
108 static void mon_set_interrupt(struct kqemu_state
*s
, int intno
, int is_int
)
110 const struct monitor_code_header
*m
= (void *)monitor_code
;
123 set_intr_gate(s
, intno
, dpl
, m
->interrupt_table
+
124 INTERRUPT_ENTRY_SIZE
* intno
+ s
->monitor_vaddr
);
128 /* only used during init */
130 static void mon_map_page_init(struct kqemu_state
*s
)
134 s
->first_mapped_page
= (s
->monitor_end_vaddr
- s
->monitor_vaddr
) >> PAGE_SHIFT
;
135 for(i
= s
->first_mapped_page
; i
< MAX_MAPPED_PAGES
- 1; i
++) {
136 s
->mapped_pages
[i
].next
= i
+ 1;
138 s
->mapped_pages
[MAX_MAPPED_PAGES
- 1].next
= -1;
139 for(i
= 0; i
< MAX_MAPPED_PAGES
; i
++) {
140 s
->mapped_pages
[i
].page_index
= -1;
141 s
->mapped_pages
[i
].host_page
= NULL
;
145 /* return NULL if error */
146 static void *mon_alloc_page(struct kqemu_state
*s
,
147 unsigned long *ppage_index
)
149 unsigned long vaddr
, page_index
;
150 struct kqemu_page
*host_page
;
151 host_page
= kqemu_alloc_zeroed_page(&page_index
);
154 kqemu_log("mon_alloc_page: NULL\n");
158 vaddr
= get_vaddr(s
);
159 set_vaddr_page_index(s
, vaddr
, page_index
, host_page
, 0);
160 /* avoid recursion during init */
161 if (!s
->in_page_init
)
162 mon_set_pte(s
, vaddr
, page_index
, PG_PRESENT_MASK
| PG_GLOBAL(s
) | PG_RW_MASK
);
164 kqemu_log("mon_alloc_page: vaddr=%p page_index=%08lx\n",
165 (void *)vaddr
, (void *)page_index
);
168 *ppage_index
= page_index
;
169 return (void *)vaddr
;
172 static int mon_set_pte(struct kqemu_state
*s
, unsigned long vaddr
,
173 unsigned long page_index
, uint32_t pte_flags
)
176 kqemu_log("mon_set_pte: vaddr=0x%lx page_index=0x%lx pte_flags=0x%x\n",
177 vaddr
, page_index
, pte_flags
);
181 ptep
= mon_get_ptep_l3(s
, 0, vaddr
, 1, NULL
);
184 *ptep
= ((uint64_t)page_index
<< PAGE_SHIFT
) | pte_flags
;
187 ptep
= mon_get_ptep_l2(s
, 0, vaddr
, 1, NULL
);
190 *ptep
= (page_index
<< PAGE_SHIFT
) | pte_flags
;
195 /* return NULL if error */
196 static void *mon_user_map(struct kqemu_state
*s
, void *uaddr
, int size
,
199 unsigned long page_index
, vaddr
, i
;
201 struct kqemu_user_page
*host_page
;
203 size
= PAGE_ALIGN(size
);
205 /* NOTE: we use the fact that getvaddr returns contiguous pages */
206 for(i
= 0; i
< size
; i
+= 4096) {
207 host_page
= kqemu_lock_user_page(&page_index
,
208 (unsigned long)uaddr
+ i
);
211 vaddr
= get_vaddr(s
);
212 set_vaddr_page_index(s
, vaddr
, page_index
, host_page
, 1);
213 mon_set_pte(s
, vaddr
, page_index
,
214 PG_PRESENT_MASK
| PG_GLOBAL(s
) | pte_flags
);
221 #define cpuid(index, eax, ebx, ecx, edx) \
222 asm volatile ("cpuid" \
223 : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) \
#ifdef __x86_64__
/* CPUID always exists in long mode. */
static int is_cpuid_supported(void)
{
    return 1;
}
#else
/* Detect CPUID on 32-bit CPUs by checking whether the EFLAGS.ID bit
   (0x00200000) can be toggled.
   NOTE(review): the exact asm sequence was missing from the extraction;
   only the xor constant and the "=a"/"=d" outputs survived — this is a
   conventional reconstruction, confirm against upstream kqemu. */
static int is_cpuid_supported(void)
{
    int v0, v1;
    asm volatile ("pushf\n"
                  "popl %0\n"
                  "movl %0, %1\n"
                  "xorl $0x00200000, %0\n"
                  "pushl %0\n"
                  "popf\n"
                  "pushf\n"
                  "popl %0\n"
                  : "=a" (v0), "=d" (v1)
                  :
                  : "cc");
    return (v0 != v1);
}
#endif
250 static void get_cpuid_features(struct kqemu_state
*s
)
252 uint32_t eax
, ebx
, ecx
, edx
;
255 if (!is_cpuid_supported()) {
256 s
->cpuid_features
= 0;
259 cpuid(0, eax
, ebx
, ecx
, edx
);
260 is_intel
= (ebx
== 0x756e6547 && edx
== 0x49656e69 &&
262 cpuid(1, eax
, ebx
, ecx
, edx
);
263 /* SEP is buggy on some pentium pros */
264 if (is_intel
&& (edx
& CPUID_SEP
) &&
265 (eax
& 0xfff) < 0x633) {
268 s
->cpuid_features
= edx
;
270 s
->cpuid_ext2_features
= 0;
271 cpuid(0x80000000, eax
, ebx
, ecx
, edx
);
272 if (eax
>= 0x80000001) {
273 cpuid(0x80000001, eax
, ebx
, ecx
, edx
);
274 s
->cpuid_ext2_features
= edx
;
279 /* per instance locked ram page allocation logic */
280 static void kqemu_update_locked_ram_pages(struct kqemu_global_state
*g
)
282 struct kqemu_state
*s
;
283 unsigned long total_ram_pages
, max_locked_ram_pages
;
286 for(s
= g
->first_state
; s
!= NULL
; s
= s
->next_state
) {
287 total_ram_pages
+= s
->nb_ram_pages
;
290 /* XXX: better logic to guaranty no overflow ? */
291 for(s
= g
->first_state
; s
!= NULL
; s
= s
->next_state
) {
292 max_locked_ram_pages
= (g
->max_locked_ram_pages
* s
->nb_ram_pages
) /
294 if (max_locked_ram_pages
< MIN_LOCKED_RAM_PAGES
)
295 max_locked_ram_pages
= MIN_LOCKED_RAM_PAGES
;
296 s
->max_locked_ram_pages
= max_locked_ram_pages
;
298 kqemu_log("state %p: max locked ram=%d KB\n",
299 s
, s
->max_locked_ram_pages
* 4);
304 static int kqemu_add_state(struct kqemu_global_state
*g
,
305 struct kqemu_state
*s
)
310 if (((g
->nb_kqemu_states
+ 1) * MIN_LOCKED_RAM_PAGES
) >
311 g
->max_locked_ram_pages
) {
315 s
->next_state
= g
->first_state
;
317 g
->nb_kqemu_states
++;
318 kqemu_update_locked_ram_pages(g
);
321 spin_unlock(&g
->lock
);
325 static void kqemu_del_state(struct kqemu_state
*s
)
327 struct kqemu_global_state
*g
= s
->global_state
;
328 struct kqemu_state
**ps
;
332 for(ps
= &g
->first_state
; *ps
!= NULL
; ps
= &(*ps
)->next_state
) {
338 g
->nb_kqemu_states
--;
339 kqemu_update_locked_ram_pages(g
);
340 spin_unlock(&g
->lock
);
344 struct kqemu_global_state
*kqemu_global_init(int max_locked_pages
)
346 struct kqemu_global_state
*g
;
348 g
= kqemu_vmalloc(PAGE_ALIGN(sizeof(struct kqemu_global_state
)));
351 memset(g
, 0, sizeof(struct kqemu_global_state
));
352 spin_lock_init(&g
->lock
);
353 g
->max_locked_ram_pages
= max_locked_pages
;
357 void kqemu_global_delete(struct kqemu_global_state
*g
)
359 /* XXX: free all existing states ? */
363 struct kqemu_state
*kqemu_init(struct kqemu_init
*d
,
364 struct kqemu_global_state
*g
)
366 struct kqemu_state
*s
;
367 const struct monitor_code_header
*m
= (void *)monitor_code
;
369 const uint8_t *kernel_vaddr
;
370 int i
, j
, n
, kqemu_state_size
;
374 /* some consistency checks */
375 if (((unsigned long)d
->ram_base
& ~PAGE_MASK
) != 0 ||
376 ((unsigned long)d
->ram_dirty
& ~PAGE_MASK
) != 0 ||
377 (d
->ram_size
& ~PAGE_MASK
) != 0 ||
378 d
->ram_size
>= 0x7ffff000 ||
379 ((unsigned long)d
->pages_to_flush
& ~PAGE_MASK
) != 0 ||
380 ((unsigned long)d
->ram_pages_to_update
& ~PAGE_MASK
) != 0 ||
381 ((unsigned long)d
->modified_ram_pages
& ~PAGE_MASK
) != 0) {
382 kqemu_log("Invalid kqemu_init data alignment\n");
386 n
= d
->ram_size
>> PAGE_SHIFT
;
387 kqemu_state_size
= PAGE_ALIGN(sizeof(monitor_code
)) +
388 PAGE_ALIGN(sizeof(struct kqemu_state
) +
389 n
* sizeof(struct kqemu_ram_page
));
390 s1
= kqemu_vmalloc(kqemu_state_size
);
393 memset(s1
, 0, kqemu_state_size
);
394 memcpy(s1
, monitor_code
, sizeof(monitor_code
));
395 s
= (void *)(s1
+ PAGE_ALIGN(sizeof(monitor_code
)));
398 /* check PAE state */
400 unsigned long host_cr4
;
401 asm volatile ("movl %%cr4, %0" : "=r" (host_cr4
));
402 s
->use_pae
= (host_cr4
& CR4_PAE_MASK
) != 0;
406 /* the following can be initialized with any value */
408 // s->monitor_vaddr = 0xffff900000000000;
409 /* must stay in low 4GB for easier 16 bit ESP fix */
410 s
->monitor_vaddr
= 0xf0000000;
412 s
->monitor_vaddr
= 0xf0000000;
414 s
->monitor_selector_base
= 0xf180;
416 kqemu_log("kqemu_init monitor_vaddr=0x%08lx sel_base=0x%04x\n",
417 s
->monitor_vaddr
, s
->monitor_selector_base
);
421 s
->monitor_cs_sel
= s
->monitor_selector_base
+ (0 << 3);
423 s
->monitor_ds_sel
= 0; /* no need for a specific data segment */
424 /* used for 16 bit esp fix */
425 s
->monitor_cs32_sel
= (s
->monitor_selector_base
+ (7 << 3)) | 1;
426 s
->monitor_ss16_sel
= (s
->monitor_selector_base
+ (6 << 3)) | 1;
427 s
->monitor_ss_null_sel
= (s
->monitor_selector_base
+ (1 << 3)) | 3;
429 s
->monitor_ds_sel
= s
->monitor_selector_base
+ (1 << 3);
430 s
->monitor_ss16_sel
= s
->monitor_selector_base
+ (6 << 3);
432 s
->monitor_ldt_sel
= s
->monitor_selector_base
+ (2 << 3);
434 s
->monitor_data_vaddr
= s
->monitor_vaddr
+
435 PAGE_ALIGN(sizeof(monitor_code
));
436 s
->monitor_end_vaddr
= s
->monitor_vaddr
+ kqemu_state_size
;
437 s
->monitor_to_kernel_offset
= (unsigned long)s
- s
->monitor_data_vaddr
;
439 /* must be done easly so that 'fail' case works */
443 s
->monitor_idt
.base
= s
->monitor_data_vaddr
+
444 offsetof(struct kqemu_state
, idt_table
);
445 s
->monitor_idt
.limit
= sizeof(s
->idt_table
) - 1;
447 /* we use interrupt gates to disable IF */
448 for(i
= 0; i
<= 0x13; i
++) {
449 mon_set_interrupt(s
, i
, 0);
451 for(i
= 0x14; i
< 256; i
++) {
452 mon_set_interrupt(s
, i
, 1);
456 s
->monitor_gdt
.limit
= 0xffff;
460 s
->monitor_tss
.rsp0
= s
->monitor_data_vaddr
+
461 offsetof(struct kqemu_state
, regs1
.dummy
[0]);
462 s
->monitor_tss
.bitmap
= 0x8000; /* no I/O permitted */
465 s
->monitor_tss
.esp0
= s
->monitor_data_vaddr
+
466 offsetof(struct kqemu_state
, regs1
.dummy
[0]);
467 s
->monitor_tss
.ss0
= s
->monitor_ds_sel
;
468 s
->monitor_tss
.bitmap
= 0x8000; /* no I/O permitted */
469 s
->monitor_tss
.back_link
= 0xffff; /* generates error if iret with
474 set_seg64(s
->tr_desc_cache
,
475 s
->monitor_data_vaddr
+
476 offsetof(struct kqemu_state
, monitor_tss
),
477 sizeof(struct kqemu_tss
) - 1, 0x89);
479 set_seg(s
->tr_desc_cache
,
480 s
->monitor_data_vaddr
+
481 offsetof(struct kqemu_state
, monitor_tss
),
485 /* for each CPL we create a LDT and GDT */
486 for(i
= 0; i
< NB_DT_TABLES
; i
++) {
487 unsigned long ldt_addr
;
488 dt_table
= s
->dt_table
+ i
* 16384;
489 ldt_addr
= s
->monitor_data_vaddr
+
490 offsetof(struct kqemu_state
, dt_table
) + 0x10000 + 0x20000 * i
;
492 set_seg64((uint32_t *)(dt_table
+ (s
->monitor_ldt_sel
>> 3)),
493 ldt_addr
, 0xffff, 0x82);
494 set_seg((uint32_t *)(dt_table
+ (s
->monitor_cs_sel
>> 3)),
495 0, 0xfffff, 0xa09a); /* long mode segment */
496 set_seg((uint32_t *)(dt_table
+ (s
->monitor_ss16_sel
>> 3)),
497 (s
->monitor_data_vaddr
+ offsetof(struct kqemu_state
, stack
)) & ~0xffff,
498 0xffff, 0x00b2); /* SS16 segment for 16 bit ESP fix */
499 set_seg((uint32_t *)(dt_table
+ (s
->monitor_cs32_sel
>> 3)),
500 0, 0xfffff, 0xc0ba); /* CS32 segment for 16 bit ESP fix */
501 set_seg((uint32_t *)(dt_table
+ (s
->monitor_ss_null_sel
>> 3)),
502 0, 0, 0x40f2); /* substitute for null SS segment */
504 set_seg((uint32_t *)(dt_table
+ (s
->monitor_ldt_sel
>> 3)),
505 ldt_addr
, 0xffff, 0x82);
506 set_seg((uint32_t *)(dt_table
+ (s
->monitor_cs_sel
>> 3)),
508 set_seg((uint32_t *)(dt_table
+ (s
->monitor_ds_sel
>> 3)),
510 set_seg((uint32_t *)(dt_table
+ (s
->monitor_ss16_sel
>> 3)),
511 (s
->monitor_data_vaddr
+ offsetof(struct kqemu_state
, stack
)) & ~0xffff,
516 /* page table init */
517 mon_map_page_init(s
);
519 s
->in_page_init
= 1; /* avoid recursion in page allocator */
521 /* make sure we allocate enough PTE for the monitor itself (2 MB
522 is OK for both PAE and normal MMU) */
523 for(i
= 0; i
< MONITOR_MEM_SIZE
; i
+= 2048 * 1024) {
524 mon_set_pte(s
, s
->monitor_vaddr
+ i
, 0, 0);
527 /* set the pte of the allocated pages (no page_alloc is needed) */
528 for(i
= 0; i
< MAX_MAPPED_PAGES
; i
++) {
529 unsigned long page_index
;
530 page_index
= s
->mapped_pages
[i
].page_index
;
531 if (page_index
!= -1) {
532 mon_set_pte(s
, s
->monitor_vaddr
+ ((unsigned long)i
<< PAGE_SHIFT
),
534 PG_PRESENT_MASK
| PG_GLOBAL(s
) | PG_RW_MASK
);
540 for(vaddr
= s
->monitor_vaddr
; vaddr
< s
->monitor_data_vaddr
;
541 vaddr
+= PAGE_SIZE
) {
542 /* XXX: RW because of data, need to set it only to the right
544 mon_set_pte(s
, vaddr
, kqemu_vmalloc_to_phys(kernel_vaddr
),
545 PG_PRESENT_MASK
| PG_GLOBAL(s
) | PG_RW_MASK
);
546 kernel_vaddr
+= PAGE_SIZE
;
548 for(; vaddr
< s
->monitor_end_vaddr
;
549 vaddr
+= PAGE_SIZE
) {
550 mon_set_pte(s
, vaddr
, kqemu_vmalloc_to_phys(kernel_vaddr
),
551 PG_PRESENT_MASK
| PG_GLOBAL(s
) | PG_RW_MASK
);
552 kernel_vaddr
+= PAGE_SIZE
;
555 /* clone the monitor PTE pages in each address space */
556 for(i
= 1; i
< NB_ADDRESS_SPACES
; i
++) {
558 uint64_t *pdep
, *pdep1
;
559 for(j
= 0; j
< MONITOR_MEM_SIZE
; j
+= 2048 * 1024) {
560 vaddr
= s
->monitor_vaddr
+ j
;
561 pdep
= mon_get_ptep_l3(s
, 0, vaddr
, 2, NULL
);
562 pdep1
= mon_get_ptep_l3(s
, i
, vaddr
, 2, NULL
);
566 uint32_t *pdep
, *pdep1
;
567 for(j
= 0; j
< MONITOR_MEM_SIZE
; j
+= 4096 * 1024) {
568 vaddr
= s
->monitor_vaddr
+ j
;
569 pdep
= mon_get_ptep_l2(s
, 0, vaddr
, 2, NULL
);
570 pdep1
= mon_get_ptep_l2(s
, i
, vaddr
, 2, NULL
);
576 /* set the cr3 register of each address space */
577 for(i
= 0; i
< NB_ADDRESS_SPACES
; i
++) {
580 pfn
= kqemu_vmalloc_to_phys(&s
->pgds
[i
]);
583 if (pfn
>= (1 << (32 - PAGE_SHIFT
))) {
584 kqemu_log("Error: invalid cr3 (%p)\n", (void *)pfn
);
588 s
->pgds_cr3
[i
] = pfn
<< PAGE_SHIFT
;
590 kqemu_log("pgds_cr3[%d] = %p\n", i
, (void *)s
->pgds_cr3
[i
]);
593 start
= s
->monitor_vaddr
>> 22;
594 end
= start
+ (MONITOR_MEM_SIZE
>> 22);
595 for(j
=start
;j
<end
;j
++) {
596 kqemu_log("%03x: %08x\n", j
, s
->pgds
[i
].l2
[j
]);
602 /* prepare nexus page switch logic */
604 unsigned long monitor_page
;
606 s
->nexus_kaddr
= (unsigned long)s1
;
607 monitor_page
= kqemu_vmalloc_to_phys((void *)s
->nexus_kaddr
);
609 s
->nexus_pte
= ((uint64_t)monitor_page
<< PAGE_SHIFT
) |
610 PG_PRESENT_MASK
| PG_GLOBAL(s
);
611 for(i
= 0; i
< NB_ADDRESS_SPACES
; i
++) {
612 s
->nexus_kaddr_ptep
[i
] =
613 mon_get_ptep_l3(s
, i
, s
->nexus_kaddr
, 1,
614 (unsigned long *)&s
->nexus_kaddr_vptep
[i
]);
617 s
->nexus_pte
= (monitor_page
<< PAGE_SHIFT
) |
618 PG_PRESENT_MASK
| PG_GLOBAL(s
);
619 for(i
= 0; i
< NB_ADDRESS_SPACES
; i
++) {
620 s
->nexus_kaddr_ptep
[i
] =
621 mon_get_ptep_l2(s
, i
, s
->nexus_kaddr
, 1,
622 (unsigned long *)&s
->nexus_kaddr_vptep
[i
]);
626 kqemu_log("nexus_kaddr=%p nexus_pte=0x%08x vptep0=%p vptep1=%p\n",
627 (void *)s
->nexus_kaddr
,
629 (void *)s
->nexus_kaddr_vptep
[0],
630 (void *)s
->nexus_kaddr_vptep
[1]);
633 s
->monitor_data_kaddr
= (unsigned long)s
;
634 s
->monitor_jmp
= m
->kernel2monitor_jmp_offset
+ s
->monitor_vaddr
;
635 s
->kernel_jmp
= m
->monitor2kernel_jmp_offset
+ (unsigned long)s1
;
637 /* communication page */
638 s
->comm_page_index
= kqemu_vmalloc_to_phys(&s
->comm_page
);
643 struct kqemu_ram_page
*p
;
645 s
->ram_size
= d
->ram_size
;
646 s
->nb_ram_pages
= s
->ram_size
>> PAGE_SHIFT
;
647 s
->ram_base_uaddr
= (unsigned long)d
->ram_base
;
649 kqemu_log("nb_ram_pages=%d\n", s
->nb_ram_pages
);
652 for(i
= 0; i
< s
->nb_ram_pages
; i
++) {
657 /* init mapped ram page list */
661 s
->ram_dirty
= mon_user_map(s
, d
->ram_dirty
, s
->ram_size
>> PAGE_SHIFT
,
666 s
->pages_to_flush
= mon_user_map(s
, d
->pages_to_flush
, PAGE_SIZE
,
668 if (!s
->pages_to_flush
)
671 s
->ram_pages_to_update
= mon_user_map(s
, d
->ram_pages_to_update
,
673 if (!s
->ram_pages_to_update
)
676 s
->modified_ram_pages
= mon_user_map(s
, d
->modified_ram_pages
,
677 PAGE_SIZE
, PG_RW_MASK
);
678 if (!s
->modified_ram_pages
)
681 for(i
= 0;i
< RAM_PAGE_CACHE_SIZE
;i
++) {
682 vaddr
= get_vaddr(s
);
684 s
->ram_page_cache_base
= vaddr
;
686 for(i
= 0;i
< RAM_PAGE_CACHE_SIZE
;i
++) {
687 s
->slot_to_ram_addr
[i
] = -1;
692 get_cpuid_features(s
);
694 /* disable SEP code if sysenter is not supported by the CPU or not
697 if (s
->cpuid_features
& CPUID_SEP
) {
698 uint32_t dummy
, cs_val
;
699 rdmsr(MSR_IA32_SYSENTER_CS
, cs_val
, dummy
);
704 /* syscall support */
706 if (s
->cpuid_ext2_features
& CPUID_EXT2_SYSCALL
) {
707 uint32_t efer_low
, efer_high
;
708 rdmsr(MSR_EFER
, efer_low
, efer_high
);
709 if (efer_low
& MSR_EFER_SCE
) {
713 /* apic to disable NMI if required */
715 if (s
->cpuid_features
& CPUID_APIC
) {
716 uint32_t apic_base
, apic_baseh
;
717 rdmsr(MSR_IA32_APICBASE
, apic_base
, apic_baseh
);
718 if (apic_base
& MSR_IA32_APICBASE_ENABLE
) {
719 apic_base
= apic_base
& MSR_IA32_APICBASE_BASE
;
720 s
->apic_regs
= kqemu_io_map(apic_base
>> PAGE_SHIFT
, PAGE_SIZE
);
722 s
->apic_lvt_max
= (s
->apic_regs
[APIC_LVR
>> 2] >> 16) & 0xff;
723 if (s
->apic_lvt_max
< 3)
725 else if (s
->apic_lvt_max
> 5)
729 kqemu_log("apic_base=%p (virt=%p) apic_lvt_max=%d\n",
730 (void *)apic_base
, (void *)s
->apic_regs
,
739 s
->pg_global_mask
= 0;
740 if (s
->cpuid_features
& CPUID_PGE
)
741 s
->pg_global_mask
= PG_GLOBAL_MASK
;
747 s
->tab_insn_cycles_min
[i
] = 0x7fffffff;
751 if (kqemu_add_state(g
, s
) < 0)
759 int kqemu_set_phys_mem(struct kqemu_state
*s
,
760 const struct kqemu_phys_mem
*kphys_mem
)
762 uint64_t start
, size
, end
, addr
;
763 uint32_t ram_addr
, ram_end
, *ptr
, pd
, io_index
;
765 start
= kphys_mem
->phys_addr
;
766 size
= kphys_mem
->size
;
768 if ((start
& ~PAGE_MASK
) != 0 || (end
& ~PAGE_MASK
) != 0)
770 /* XXX: we only support 32 bit physical address space */
771 if ((start
& ~0xffffffffULL
) != 0 ||
772 ((end
- 1) & ~0xffffffffULL
) != 0)
774 io_index
= kphys_mem
->io_index
;
775 if (io_index
> KQEMU_IO_MEM_UNASSIGNED
)
778 if (io_index
<= KQEMU_IO_MEM_ROM
) {
779 ram_addr
= kphys_mem
->ram_addr
;
780 if ((ram_addr
& ~PAGE_MASK
) != 0)
782 ram_end
= ram_addr
+ size
;
784 if (ram_end
< ram_addr
)
786 if (ram_end
> s
->ram_size
)
788 pd
|= (ram_addr
& PAGE_MASK
);
790 for(addr
= start
; addr
!= end
; addr
+= PAGE_SIZE
) {
791 ptr
= phys_page_findp(s
, addr
>> PAGE_SHIFT
, 1);
795 if (io_index
<= KQEMU_IO_MEM_ROM
)
801 #ifdef PROFILE_INTERP2
804 static inline unsigned int lldiv(uint64_t a
, uint64_t b
)
815 static unsigned int lldiv(uint64_t a
, uint64_t b
)
823 while (b
>= 0x100000000LL
) {
828 asm volatile ("divl %2"
830 : "m" (b32
), "a" ((uint32_t )a
), "d" ((uint32_t )(a
>> 32)));
836 #define CYCLES_TO_MS(x) lldiv(x, 2400000)
837 #define EXCP_CYCLES 1200 /* approximate cycles to handle one exception */
839 static void profile_dump(struct kqemu_state
*s
)
843 if (s
->tab_insn_count
[i
] != 0) {
844 kqemu_log("%02x: %9lld %4d %4d %4d %11lld\n",
846 s
->tab_insn_count
[i
],
847 s
->tab_insn_cycles_min
[i
],
848 lldiv(s
->tab_insn_cycles
[i
], s
->tab_insn_count
[i
]),
849 s
->tab_insn_cycles_max
[i
],
850 s
->tab_insn_cycles
[i
]);
854 #ifdef PROFILE_INTERP_PC
857 ProfileInterpEntry
*pe
, *pe1
, *pe2
, tmp
;
858 int64_t cycles_tot
, cycles_sum
;
860 kqemu_log("Interp PC dump:\n");
861 kqemu_log("n: EIP count avg_insn_count avg_cycles cumulative_time\n");
863 /* add exception cost */
864 for(i
= 0; i
< s
->nb_profile_interp_entries
; i
++) {
865 pe
= &s
->profile_interp_entries
[i
];
866 pe
->cycles
+= pe
->count
* EXCP_CYCLES
;
870 for(i
= 0; i
< (s
->nb_profile_interp_entries
- 1); i
++) {
871 for(j
= i
+ 1; j
< s
->nb_profile_interp_entries
; j
++) {
872 pe1
= &s
->profile_interp_entries
[i
];
873 pe2
= &s
->profile_interp_entries
[j
];
874 if (pe1
->cycles
< pe2
->cycles
) {
883 for(i
= 0; i
< s
->nb_profile_interp_entries
; i
++)
884 cycles_tot
+= s
->profile_interp_entries
[i
].cycles
;
887 n
= s
->nb_profile_interp_entries
;
890 for(i
= 0; i
< n
; i
++) {
891 pe
= &s
->profile_interp_entries
[i
];
892 cycles_sum
+= pe
->cycles
;
893 kqemu_log("%4d: " FMT_lx
" %lld %d %d %d%%\n",
897 lldiv(pe
->insn_count
, pe
->count
),
898 lldiv(pe
->cycles
, pe
->count
),
899 lldiv(cycles_sum
* 100, cycles_tot
));
903 kqemu_log("Execution statistics:\n");
904 kqemu_log("total_interp_count=%lld\n",
905 s
->total_interp_count
);
906 kqemu_log("exc_interp: count=%lld avg_insn=%d (%lld)\n",
908 lldiv(s
->exc_insn_count
, s
->exc_interp_count
),
910 kqemu_log("exc_interp: max=%d EIP=%08lx\n",
911 s
->exc_insn_count_max
,
912 s
->exc_start_eip_max
);
913 kqemu_log("exc_seg_cycles=%d cycles/insn=%d (%d ms)\n",
914 lldiv(s
->exc_seg_cycles
, s
->exc_interp_count
),
915 lldiv(s
->exc_interp_cycles
, s
->exc_insn_count
),
916 CYCLES_TO_MS(s
->exc_interp_cycles
+ s
->exc_seg_cycles
+ s
->exc_interp_count
* EXCP_CYCLES
));
917 kqemu_log("interp_interrupt: count=%lld cycles=%d (%d ms)\n",
918 s
->interp_interrupt_count
,
919 lldiv(s
->interp_interrupt_cycles
, s
->interp_interrupt_count
),
920 CYCLES_TO_MS(s
->interp_interrupt_cycles
));
922 kqemu_log("tlb_flush: count=%lld cycles=%d (%d ms)\n",
924 lldiv(s
->tlb_flush_cycles
, s
->tlb_flush_count
),
925 CYCLES_TO_MS(s
->tlb_flush_cycles
));
926 kqemu_log("tlb_flush_page: count=%lld cycles=%d (%d ms)\n",
927 s
->tlb_flush_page_count
,
928 lldiv(s
->tlb_flush_page_cycles
, s
->tlb_flush_page_count
),
929 CYCLES_TO_MS(s
->tlb_flush_page_cycles
));
930 kqemu_log("page faults: total=%lld mmu=%lld cycles=%d (%d ms)\n",
931 s
->total_page_fault_count
,
932 s
->mmu_page_fault_count
,
933 lldiv(s
->mmu_page_fault_cycles
+ s
->tlb_page_fault_cycles
, s
->mmu_page_fault_count
),
934 CYCLES_TO_MS(s
->mmu_page_fault_cycles
+ s
->tlb_page_fault_cycles
+ EXCP_CYCLES
* s
->total_page_fault_count
));
935 kqemu_log("page faults tlb: count=%lld (interp_count=%lld) cycles=%d (%d ms)\n",
936 s
->tlb_page_fault_count
,
937 s
->tlb_interp_page_fault_count
,
938 lldiv(s
->tlb_page_fault_cycles
, s
->tlb_page_fault_count
),
939 CYCLES_TO_MS(s
->tlb_page_fault_cycles
+ EXCP_CYCLES
* s
->tlb_page_fault_count
));
940 kqemu_log("exec_init: count=%lld cycles=%d (%d ms)\n",
942 lldiv(s
->exec_init_cycles
, s
->exec_init_count
),
943 CYCLES_TO_MS(s
->exec_init_cycles
));
944 kqemu_log("hw_interrupt: count=%lld cycles=%d (%d ms)\n",
945 s
->hw_interrupt_count
,
946 lldiv(s
->hw_interrupt_cycles
, s
->hw_interrupt_count
),
947 CYCLES_TO_MS(s
->hw_interrupt_cycles
+ EXCP_CYCLES
* s
->hw_interrupt_count
));
948 kqemu_log("ram_map: count=%lld miss=%d%%\n",
950 lldiv(s
->ram_map_miss_count
* 100, s
->ram_map_count
));
954 void kqemu_delete(struct kqemu_state
*s
)
957 struct kqemu_ram_page
*rp
;
958 struct mapped_page
*p
;
961 #ifdef PROFILE_INTERP2
964 /* unlock the user pages */
965 for(rp
= s
->locked_page_head
.lock_next
;
966 rp
!= KER_RP_PTR(s
, &s
->locked_page_head
);
967 rp
= rp
->lock_next
) {
968 rp
= MON_RP_PTR(s
, rp
);
969 kqemu_unlock_user_page(rp
->host_page
);
972 /* free all user and kernel pages */
973 for(i
= 0; i
< MAX_MAPPED_PAGES
; i
++) {
974 p
= &s
->mapped_pages
[i
];
975 if (p
->host_page
!= NULL
) {
977 kqemu_unlock_user_page(p
->host_page
);
979 kqemu_free_page(p
->host_page
);
985 kqemu_io_unmap((void *)s
->apic_regs
, PAGE_SIZE
);
989 s1
= (uint8_t *)s
- PAGE_ALIGN(sizeof(monitor_code
));
993 struct kqemu_cpu_state
*kqemu_get_cpu_state(struct kqemu_state
*s
)
995 return &s
->cpu_state
;
998 static inline int apic_check_lvt(struct kqemu_state
*s
, int lvt
)
1001 val
= s
->apic_regs
[(APIC_LVTT
>> 2) + lvt
* 4];
1002 if (!(val
& APIC_LVT_MASKED
) &&
1003 (val
& APIC_DM_MASK
) == APIC_DM_NMI
) {
1004 val
|= APIC_LVT_MASKED
;
1005 s
->apic_regs
[(APIC_LVTT
>> 2) + lvt
* 4] = val
;
1012 static inline void apic_restore_lvt(struct kqemu_state
*s
, int lvt
,
1015 if (lvt_mask
& (1 << lvt
))
1016 s
->apic_regs
[(APIC_LVTT
>> 2) + lvt
* 4] &= ~APIC_LVT_MASKED
;
1019 static int apic_save_and_disable_nmi(struct kqemu_state
*s
)
1024 switch(s
->apic_lvt_max
) {
1027 lvt_mask
|= apic_check_lvt(s
, 1); /* APIC_LVTTHMR */
1030 lvt_mask
|= apic_check_lvt(s
, 2); /* APIC_LVTPC */
1031 lvt_mask
|= apic_check_lvt(s
, 2); /* APIC_LVTPC (twice because
1032 could be masked by hardware) */
1035 lvt_mask
|= apic_check_lvt(s
, 0); /* APIC_LVTT */
1036 lvt_mask
|= apic_check_lvt(s
, 3); /* APIC_LVT0 */
1037 lvt_mask
|= apic_check_lvt(s
, 4); /* APIC_LVT1 */
1038 lvt_mask
|= apic_check_lvt(s
, 5); /* APIC_LVTERR */
1044 static void apic_restore_nmi(struct kqemu_state
*s
, int lvt_mask
)
1047 apic_restore_lvt(s
, 0, lvt_mask
);
1048 apic_restore_lvt(s
, 1, lvt_mask
);
1049 apic_restore_lvt(s
, 2, lvt_mask
);
1050 apic_restore_lvt(s
, 3, lvt_mask
);
1051 apic_restore_lvt(s
, 4, lvt_mask
);
1052 apic_restore_lvt(s
, 5, lvt_mask
);
1058 if ((s->cpu_state.dr ## n - s->monitor_vaddr) < MONITOR_MEM_SIZE) {\
1059 /* cannot set breakpoint */\
1060 s->monitor_dr7 &= ~(3 << (2 * n));\
1062 asm volatile ("mov %0, %%dr" #n : : "r" (s->cpu_state.dr ## n));\
1066 long kqemu_exec(struct kqemu_state
*s
)
1068 const struct monitor_code_header
*m
= (void *)monitor_code
;
1069 void (*kernel2monitor
)(struct kqemu_state
*s
) =
1070 (void *)(m
->kernel2monitor
+ s
->nexus_kaddr
);
1072 int ret
, apic_nmi_mask
, cpl
;
1074 unsigned long flags
;
1075 uint32_t efer_low
, efer_high
, efer_low1
;
1077 uint16_t saved_fs
, saved_gs
;
1079 uint16_t saved_ds
, saved_es
;
1080 unsigned long fs_base
, gs_base
;
1084 s
->nb_profile_ts
= 0;
1089 cs_val
= 0; /* avoid warning */
1090 efer_low
= 0; /* avoid warning */
1091 efer_high
= 0; /* avoid warning */
1092 apic_nmi_mask
= 0; /* avoid warning */
1094 /* NOTE: we do not abort here because we need to execute the
1095 various page commands before */
1096 if ((s
->cpu_state
.tr
.selector
& 0xfffc) == 0 ||
1097 (s
->cpu_state
.tr
.selector
& 4) != 0) {
1098 s
->monitor_tr_sel
= s
->monitor_selector_base
+ (4 << 3);
1100 s
->monitor_tr_sel
= s
->cpu_state
.tr
.selector
& 0xfff8;
1103 /* init the initial cr3 */
1104 cpl
= s
->cpu_state
.cpl
;
1106 s
->cpu_state
.cpl
= cpl
;
1107 s
->monitor_cr3
= s
->pgds_cr3
[(cpl
== 3)];
1108 /* init the initial GDT */
1110 s
->monitor_gdt
.base
= s
->monitor_data_vaddr
+
1111 offsetof(struct kqemu_state
, dt_table
) + 0x20000 * (cpl
== 3);
1113 s
->monitor_gdt
.base
= s
->monitor_data_vaddr
+
1114 offsetof(struct kqemu_state
, dt_table
) + 0x20000 * cpl
;
1117 /* push stack frame to call monitor_exec() */
1118 /* reserve space for the registers */
1119 ptr
= (void *)(s
->stack_end
- sizeof(struct kqemu_exception_regs
));
1121 *--ptr
= 0; /* no return addr */
1122 *--ptr
= m
->monitor_exec
+ s
->monitor_vaddr
;
1123 *--ptr
= 0; /* rbp */
1124 *--ptr
= 0; /* rbx */
1125 *--ptr
= 0; /* r12 */
1126 *--ptr
= 0; /* r13 */
1127 *--ptr
= 0; /* r14 */
1128 *--ptr
= 0; /* r15 */
1130 *--ptr
= s
->monitor_data_vaddr
; /* parameter = kqemu_state */
1131 *--ptr
= 0; /* no return addr */
1132 *--ptr
= m
->monitor_exec
+ s
->monitor_vaddr
;
1133 *--ptr
= 0; /* ebp */
1134 *--ptr
= 0; /* ebx */
1135 *--ptr
= 0; /* esi */
1136 *--ptr
= 0; /* edi */
1138 s
->monitor_esp
= s
->monitor_data_vaddr
+ (unsigned long)ptr
-
1142 /* currently we execute all the monitor code with interrupt
1143 masked. It is not optimal but simpler */
1149 apic_nmi_mask
= apic_save_and_disable_nmi(s
);
1152 /* load breakpoint registers and avoid setting them if in the
1153 monitor address space. We suppose that no breakpoints are
1154 set by the host OS for this process */
1155 if (s
->cpu_state
.dr7
& 0xff) {
1156 s
->monitor_dr7
= s
->cpu_state
.dr7
;
1161 asm volatile ("mov %0, %%dr6" : : "r" (s
->cpu_state
.dr6
));
1170 rdmsr(MSR_IA32_SYSENTER_CS
, cs_val
, dummy
);
1171 wrmsr(MSR_IA32_SYSENTER_CS
, 0, 0);
1174 if (s
->use_syscall
) {
1175 rdmsr(MSR_EFER
, efer_low
, efer_high
);
1176 efer_low1
= efer_low
& ~MSR_EFER_SCE
;
1177 wrmsr(MSR_EFER
, efer_low1
, efer_high
);
1181 /* disable syscall/sysret (will generate ILLOP execption) */
1182 /* save segment registers */
1183 asm volatile ("movw %%ds, %0" : "=m" (saved_ds
));
1184 asm volatile ("movw %%es, %0" : "=m" (saved_es
));
1185 rdmsrl(MSR_FSBASE
, fs_base
);
1186 rdmsrl(MSR_GSBASE
, gs_base
);
1188 asm volatile ("movw %%fs, %0" : "=m" (saved_fs
));
1189 asm volatile ("movw %%gs, %0" : "=m" (saved_gs
));
1192 /* write the nexus PTE - we assume the pointer does not change */
1193 is_user
= (s
->cpu_state
.cpl
== 3);
1196 ptep
= s
->nexus_kaddr_ptep
[is_user
];
1197 s
->nexus_orig_pte
= *ptep
;
1198 *ptep
= s
->nexus_pte
;
1201 ptep
= s
->nexus_kaddr_ptep
[is_user
];
1202 s
->nexus_orig_pte
= *ptep
;
1203 *ptep
= s
->nexus_pte
;
1208 /* restore the original PTE (note that the CPL can change) */
1209 is_user
= (s
->cpu_state
.cpl
== 3);
1212 ptep
= s
->nexus_kaddr_ptep
[is_user
];
1213 *ptep
= s
->nexus_orig_pte
;
1216 ptep
= s
->nexus_kaddr_ptep
[is_user
];
1217 *ptep
= s
->nexus_orig_pte
;
1221 /* restore segments */
1222 asm volatile ("movw %0, %%fs" : : "m" (saved_fs
));
1223 asm volatile ("movw %0, %%gs" : : "m" (saved_gs
));
1225 wrmsrl(MSR_FSBASE
, fs_base
);
1226 wrmsrl(MSR_GSBASE
, gs_base
);
1227 asm volatile ("movw %0, %%ds" : : "m" (saved_ds
));
1228 asm volatile ("movw %0, %%es" : : "m" (saved_es
));
1231 if (s
->use_syscall
) {
1232 /* restore syscall/sysret */
1233 wrmsr(MSR_EFER
, efer_low
, efer_high
);
1237 wrmsr(MSR_IA32_SYSENTER_CS
, cs_val
, 0);
1241 apic_restore_nmi(s
, apic_nmi_mask
);
1245 if (s
->mon_req
== MON_REQ_IRQ
) {
1246 struct kqemu_exception_regs
*r
;
1247 /* execute the requested host interrupt and then schedule
1249 exec_irq(s
->arg0
); /* side effect: restore the IRQs */
1250 r
= (void *)((unsigned long)s
->regs
- s
->monitor_data_vaddr
+ (unsigned long)s
);
1251 if ((r
->cs_sel
& 3) == 3) {
1252 /* if interrupting user code, we schedule to give time
1253 to the other processes. We can be interrupted by a
1254 signal a that case. */
1255 if (kqemu_schedule()) {
1256 restore_cpu_state_from_regs(s
, r
);
1257 ret
= KQEMU_RET_INTR
;
1262 unsigned long page_index
;
1264 restore_flags(flags
);
1265 switch(s
->mon_req
) {
1267 kqemu_log("aborting: %s", s
->log_buf
);
1268 ret
= KQEMU_RET_ABORT
;
1273 struct kqemu_exception_regs
*r
;
1274 r
= (void *)((unsigned long)s
->regs
- s
->monitor_data_vaddr
+ (unsigned long)s
);
1275 restore_cpu_state_from_regs(s
, r
);
1279 kqemu_log("%s", s
->log_buf
);
1281 case MON_REQ_ALLOC_PAGE
:
1282 s
->ret
= (unsigned long)kqemu_alloc_zeroed_page(&page_index
);
1283 s
->ret2
= page_index
;
1285 case MON_REQ_LOCK_USER_PAGE
:
1286 s
->ret
= (unsigned long)kqemu_lock_user_page(&page_index
,
1288 s
->ret2
= page_index
;
1290 case MON_REQ_UNLOCK_USER_PAGE
:
1291 kqemu_unlock_user_page((struct kqemu_user_page
*)s
->arg0
);
1293 case MON_REQ_EXCEPTION
:
1294 exec_exception(s
->arg0
);
1297 kqemu_log("invalid mon request: %d\n", s
->mon_req
);
1307 int i
, last
, first
, overhead
;
1308 first
= s
->profile_ts
[0];
1310 overhead
= s
->profile_ts
[1] - s
->profile_ts
[0];
1311 kqemu_log("profile (overhead=%d):\n", overhead
);
1312 for(i
= 1; i
< s
->nb_profile_ts
; i
++) {
1313 kqemu_log("%3d@%4d: %6d %6d\n",
1314 i
, s
->profile_line
[i
],
1315 s
->profile_ts
[i
] - first
- i
* overhead
,
1316 s
->profile_ts
[i
] - last
- overhead
);
1317 last
= s
->profile_ts
[i
];
1321 s
->cpu_state
.retval
= ret
;