common/monitor.c

   1 /*
   2  * KQEMU
   3  *
   4  * Copyright (C) 2004-2008 Fabrice Bellard
   5  *
   6  * This program is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU General Public License
   8  * version 2 as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13  * GNU General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License
  16  * along with this program; if not, write to the Free Software
  17  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  18  */
  19 #include "kqemu_int.h"
  20
  21 //#define DEBUG_TLB
  22 //#define DEBUG_MMU
  23 //#define DEBUG_PHYS_LOAD_STORE
  24 //#define DEBUG_RAM
  25 //#define DEBUG_LOCK
  26 //#define DEBUG_SOFT_TLB
  27 //#define DEBUG_INVALIDATE
  28
  29 //#define PROFILE_SOFTMMU
  30 //#define DEBUG_DT_CACHE
  31
  32 static void mon_set_pte(struct kqemu_state *s,
  33                         int as_index, unsigned long vaddr,
  34                         unsigned long paddr, int pte_flags);
  35 static void unmap_ram_page(struct kqemu_state *s,
  36                            struct kqemu_ram_page *rp);
  37 static void unlock_ram_page(struct kqemu_state *s,
  38                             struct kqemu_ram_page *rp);
  39 static void *mon_alloc_page(struct kqemu_state *s,
  40                             unsigned long *ppage_index);
  41
  42 #define IN_MONITOR
  43 #include "common.c"
  44
  45 /*
  46  * Segment state in monitor code:
  47  *
  48  * If CPL = 3 or not USE_SEG_GP:
  49  *   FS, GS are stored in %fs, %gs.
  50  *   CS, SS, DS, ES are stored in s->reg1.xx_sel
  51  *   the content of the CPU seg desc caches are consistent with the dt_table
  52  *
  53  * If CPL != 3 and USE_SEG_GP:
  54  *
  55  *   FS, GS are stored in %fs, %gs. If not null and different from
  56  *   s->reg1.cs_sel and s->reg1.ss_sel, then the content of the CPU
  57  *   seg desc caches are consistent with s->seg_desc_cache[R_xx]
  58  *
  59  *   DS, ES are stored in s1->reg1.xx_sel. Same remark as FS and FS
  60  *   for CPU seg desc cache consistency.
  61  *
  62  *   CS, SS are stored in s1->reg1.xx_sel. The content of the CPU seg
  63  *   desc caches are consistent with the dt_table
  64  *
  65  * If seg_cache_loaded is true, then s->cpu_state.segs[].base is
  66  * updated. For CS and SS, s->cpu_state.segs[].flags is updated too.
  67  *
  68  */
  69
  70 static inline void save_segs(struct kqemu_state *s)
  71 {
  72     struct kqemu_cpu_state *env = &s->cpu_state;
  73
  74     asm volatile ("movw %%fs, %0" : "=m" (env->segs[R_FS].selector));
  75     asm volatile ("movw %%gs, %0" : "=m" (env->segs[R_GS].selector));
  76 #ifdef __x86_64__
  77     rdmsrl(MSR_FSBASE, env->segs[R_FS].base);
  78     rdmsrl(MSR_GSBASE, env->segs[R_GS].base);
  79
  80     asm volatile ("movw %%ds, %0" : "=m" (env->segs[R_DS].selector));
  81     asm volatile ("movw %%es, %0" : "=m" (env->segs[R_ES].selector));
  82 #endif
  83 }
  84
  85 static inline void reload_segs(struct kqemu_state *s)
  86 {
  87     struct kqemu_cpu_state *env = &s->cpu_state;
  88
  89 #ifdef USE_SEG_GP
  90     if (s->cpu_state.cpl != 3) {
  91         set_cpu_seg_cache(s, R_FS, env->segs[R_FS].selector);
  92         set_cpu_seg_cache(s, R_GS, env->segs[R_GS].selector);
  93 #ifdef __x86_64__
  94         set_cpu_seg_cache(s, R_DS, env->segs[R_DS].selector);
  95         set_cpu_seg_cache(s, R_ES, env->segs[R_ES].selector);
  96 #endif
  97     } else
  98 #endif
  99     {
 100         LOAD_SEG(fs, env->segs[R_FS].selector);
 101         LOAD_SEG(gs, env->segs[R_GS].selector);
 102 #ifdef __x86_64__
 103         LOAD_SEG(ds, env->segs[R_DS].selector);
 104         LOAD_SEG(es, env->segs[R_ES].selector);
 105 #endif
 106     }
 107 #ifdef __x86_64__
 108     wrmsrl(MSR_FSBASE, env->segs[R_FS].base);
 109     wrmsrl(MSR_GSBASE, env->segs[R_GS].base);
 110 #endif
 111 }
 112
 113 void update_host_cr0(struct kqemu_state *s)
 114 {
 115     unsigned long guest_cr0, host_cr0;
 116
 117     guest_cr0 = s->cpu_state.cr0;
 118     host_cr0 = s->kernel_cr0;
 119     if (guest_cr0 & (CR0_TS_MASK | CR0_EM_MASK)) {
 120         host_cr0 |= CR0_TS_MASK;
 121     }
 122     host_cr0 = (host_cr0 & ~(CR0_MP_MASK)) | (guest_cr0 & CR0_MP_MASK);
 123     host_cr0 &= ~CR0_AM_MASK;
 124     if ((guest_cr0 & CR0_AM_MASK) && s->cpu_state.cpl == 3)
 125         host_cr0 |= CR0_AM_MASK;
 126     asm volatile ("mov %0, %%cr0" : : "r" (host_cr0));
 127 }
 128
 129 void update_host_cr4(struct kqemu_state *s)
 130 {
 131     unsigned long guest_cr4, host_cr4, mask;
 132     asm volatile("mov %%cr4, %0" : "=r" (host_cr4));
 133     mask = 0;
 134     if (s->cpuid_features & CPUID_FXSR)
 135         mask |= CR4_OSFXSR_MASK;
 136     if (s->cpuid_features & CPUID_SSE)
 137         mask |= CR4_OSXMMEXCPT_MASK;
 138     guest_cr4 = s->cpu_state.cr4;
 139     host_cr4 = (guest_cr4 & mask) | (host_cr4 & ~mask);
 140     if (s->cpu_state.cpl == 0) {
 141         host_cr4 &= ~CR4_TSD_MASK; /* rdtsc is enabled */
 142     } else {
 143         host_cr4 = (guest_cr4 & CR4_TSD_MASK) | (host_cr4 & ~CR4_TSD_MASK);
 144     }
 145     asm volatile ("mov %0, %%cr4" : : "r" (host_cr4));
 146 }
 147
 148 static inline void restore_monitor_nexus_mapping(struct kqemu_state *s)
 149 {
 150     int is_user;
 151     /* restore the original mapping */
 152     is_user = (s->cpu_state.cpl == 3);
 153     if (USE_PAE(s)) {
 154         uint64_t *ptep;
 155         ptep = s->nexus_kaddr_vptep[is_user];
 156         *ptep = s->nexus_orig_pte;
 157     } else {
 158         uint32_t *ptep;
 159         ptep = s->nexus_kaddr_vptep[is_user];
 160         *ptep = s->nexus_orig_pte;
 161     }
 162     asm volatile ("invlpg (%0)" : : "r" (s->nexus_kaddr));
 163 }
 164
 165 static void monitor2kernel1(struct kqemu_state *s)
 166 {
 167     struct kqemu_exception_regs *r;
 168     int is_user;
 169
 170     r = s->regs;
 171     if (r) {
 172         save_segs(s);
 173     }
 174
 175     /* map the nexus page to its kernel address */
 176     is_user = (s->cpu_state.cpl == 3);
 177     if (USE_PAE(s)) {
 178         uint64_t *ptep;
 179         ptep = s->nexus_kaddr_vptep[is_user];
 180         s->nexus_orig_pte = *ptep;
 181         *ptep = s->nexus_pte;
 182     } else {
 183         uint32_t *ptep;
 184         ptep = s->nexus_kaddr_vptep[is_user];
 185         s->nexus_orig_pte = *ptep;
 186         *ptep = s->nexus_pte;
 187     }
 188     asm volatile ("invlpg (%0)" : : "r" (s->nexus_kaddr));
 189
 190     monitor2kernel(s);
 191
 192     update_host_cr0(s);
 193
 194     update_host_cr4(s);
 195
 196     restore_monitor_nexus_mapping(s);
 197
 198     if (r) {
 199         reload_segs(s);
 200     }
 201 }
 202
 203 void monitor_log(struct kqemu_state *s, const char *fmt, ...)
 204 {
 205     va_list ap;
 206     va_start(ap, fmt);
 207     mon_vsnprintf(s->log_buf, sizeof(s->log_buf), fmt, ap);
 208     s->mon_req = MON_REQ_LOG;
 209     monitor2kernel1(s);
 210     va_end(ap);
 211 }
 212
 213 void monitor_panic(struct kqemu_state *s, const char *fmt, ...)
 214 {
 215     va_list ap;
 216     va_start(ap, fmt);
 217     mon_vsnprintf(s->log_buf, sizeof(s->log_buf), fmt, ap);
 218     s->mon_req = MON_REQ_ABORT;
 219     monitor2kernel1(s);
 220     /* should never come here */
 221     while (1);
 222 }
 223
 224 void __attribute__((noreturn, format (printf, 3, 4)))
 225 monitor_panic_regs(struct kqemu_state *s, struct kqemu_exception_regs *r,
 226                    const char *fmt, ...)
 227 {
 228     va_list ap;
 229     int len;
 230     va_start(ap, fmt);
 231     mon_vsnprintf(s->log_buf, sizeof(s->log_buf), fmt, ap);
 232     len = strlen(s->log_buf);
 233     mon_snprintf(s->log_buf + len, sizeof(s->log_buf) - len,
 234                  "err=%04x CS:EIP=%04x:" FMT_lx " SS:SP=%04x:" FMT_lx "\n",
 235                  (int)r->error_code, r->cs_sel, (long)r->eip,
 236                  r->ss_sel, (long)r->esp);
 237     s->mon_req = MON_REQ_ABORT;
 238     monitor2kernel1(s);
 239     /* should never come here */
 240     while (1);
 241 }
 242
 243 struct kqemu_page *monitor_alloc_page(struct kqemu_state *s,
 244                                       unsigned long *ppage_index)
 245 {
 246     s->mon_req = MON_REQ_ALLOC_PAGE;
 247     monitor2kernel1(s);
 248     *ppage_index = s->ret2;
 249     return (void *)s->ret;
 250 }
 251
 252 static struct kqemu_user_page *monitor_lock_user_page(struct kqemu_state *s,
 253                                                       unsigned long *ppage_index,
 254                                                       unsigned long uaddr)
 255 {
 256     s->mon_req = MON_REQ_LOCK_USER_PAGE;
 257     s->arg0 = uaddr;
 258     monitor2kernel1(s);
 259     *ppage_index = s->ret2;
 260     return (void *)s->ret;
 261 }
 262
 263 static void monitor_unlock_user_page(struct kqemu_state *s,
 264                                      struct kqemu_user_page *page)
 265 {
 266     s->mon_req = MON_REQ_UNLOCK_USER_PAGE;
 267     s->arg0 = (long)page;
 268     monitor2kernel1(s);
 269 }
 270
 271 /* return NULL if error */
 272 static void *mon_alloc_page(struct kqemu_state *s,
 273                             unsigned long *ppage_index)
 274 {
 275     unsigned long vaddr, page_index;
 276     struct kqemu_page *host_page;
 277
 278     host_page = monitor_alloc_page(s, &page_index);
 279     if (!host_page) {
 280         return NULL;
 281     }
 282     vaddr = get_vaddr(s);
 283     /* XXX: check error */
 284     set_vaddr_page_index(s, vaddr, page_index, host_page, 0);
 285     mon_set_pte(s, 0, vaddr, page_index,
 286                 PG_PRESENT_MASK | PG_GLOBAL(s) | PG_RW_MASK);
 287     if (ppage_index)
 288         *ppage_index = page_index;
 289     return (void *)vaddr;
 290 }
 291
 292 static void mon_set_pte(struct kqemu_state *s,
 293                        int as_index, unsigned long vaddr,
 294                        unsigned long page_index, int pte_flags)
 295 {
 296     if (USE_PAE(s)) {
 297         uint64_t *ptep;
 298         ptep = mon_get_ptep_l3(s, as_index, vaddr, 1);
 299         *ptep = ((uint64_t)page_index << PAGE_SHIFT) | pte_flags;
 300     } else {
 301         uint32_t *ptep;
 302         ptep = mon_get_ptep_l2(s, as_index, vaddr, 1);
 303         *ptep = (page_index << PAGE_SHIFT) | pte_flags;
 304     }
 305     asm volatile("invlpg %0" : : "m" (*(uint8_t *)vaddr));
 306 }
 307
 308 static uint32_t phys_page_find(struct kqemu_state *s,
 309                                unsigned long page_index)
 310 {
 311     uint32_t *ptr, pd;
 312
 313     ptr = phys_page_findp(s, page_index, 0);
 314     if (!ptr)
 315         return KQEMU_IO_MEM_UNASSIGNED;
 316     pd = *ptr;
 317 #ifdef DEBUG_TLB
 318     monitor_log(s, "pd=%08x\n", pd);
 319 #endif
 320     return pd;
 321 }
 322
 323 /* return the ram page only if it is already locked */
 324 static struct kqemu_ram_page *get_locked_ram_page(struct kqemu_state *s,
 325                                                   unsigned long ram_addr)
 326 {
 327     int ram_page_index;
 328     struct kqemu_ram_page *rp;
 329     ram_page_index = ram_addr >> PAGE_SHIFT;
 330     rp = &s->ram_pages[ram_page_index];
 331     if (rp->paddr == -1)
 332         return NULL;
 333     return rp;
 334 }
 335
 336 /* unlock some pages to be able to allocate at least one page */
 337 static void unlock_pages(struct kqemu_state *s)
 338 {
 339     while (s->nb_locked_ram_pages >= s->max_locked_ram_pages) {
 340         /* unlock the least recently used pages */
 341         unlock_ram_page(s, s->locked_page_head.lock_prev);
 342     }
 343 }
 344
 345 static struct kqemu_ram_page *lock_ram_page(struct kqemu_state *s,
 346                                             unsigned long ram_addr)
 347 {
 348     int ram_page_index;
 349     struct kqemu_ram_page *rp, **p, *rp_prev, *rp_next;
 350     unsigned long uaddr, page_index;
 351     struct kqemu_user_page *host_page;
 352
 353     ram_page_index = ram_addr >> PAGE_SHIFT;
 354     rp = &s->ram_pages[ram_page_index];
 355     if (rp->paddr == -1) {
 356
 357         unlock_pages(s);
 358
 359         uaddr = ram_addr + s->ram_base_uaddr;
 360         host_page = monitor_lock_user_page(s, &page_index, uaddr);
 361         if (!host_page)
 362             monitor_panic(s, "Could not lock user page %p", (void *)uaddr);
 363         rp->paddr = page_index;
 364         rp->host_page = host_page;
 365
 366         /* insert in hash table */
 367         p = &s->ram_page_hash[ram_page_hash_func(page_index)];
 368         rp->hash_next = *p;
 369         *p = rp;
 370
 371         /* insert at lock list head */
 372         rp_prev = &s->locked_page_head;
 373         rp_next = s->locked_page_head.lock_next;
 374         rp_next->lock_prev = rp;
 375         rp->lock_next = rp_next;
 376         rp_prev->lock_next = rp;
 377         rp->lock_prev = rp_prev;
 378         s->nb_locked_ram_pages++;
 379 #ifdef DEBUG_LOCK
 380         monitor_log(s, "lock_ram_page: %p rp=%p\n", (void *)ram_addr, rp);
 381 #endif
 382     }
 383     return rp;
 384 }
 385
 386 static void unlock_ram_page(struct kqemu_state *s,
 387                             struct kqemu_ram_page *rp)
 388 {
 389     struct kqemu_ram_page **prp;
 390
 391     if (rp->paddr == -1)
 392         return;
 393 #ifdef DEBUG_LOCK
 394     monitor_log(s, "unlock_ram_page: rp=%p\n", rp);
 395 #endif
 396     unmap_ram_page(s, rp);
 397
 398     /* remove it from the hash list */
 399     prp = &s->ram_page_hash[ram_page_hash_func(rp->paddr)];
 400     for(;;) {
 401         if (*prp == NULL)
 402             break;
 403         if (*prp == rp) {
 404             *prp = rp->hash_next;
 405             break;
 406         }
 407         prp = &(*prp)->hash_next;
 408     }
 409
 410     /* unlock it in the kernel */
 411     monitor_unlock_user_page(s, rp->host_page);
 412
 413     rp->paddr = -1;
 414
 415     /* remove from lock list */
 416     rp->lock_prev->lock_next = rp->lock_next;
 417     rp->lock_next->lock_prev = rp->lock_prev;
 418     s->nb_locked_ram_pages--;
 419 }
 420
 421 static void map_ram_page(struct kqemu_state *s,
 422                          int as_index, unsigned long vaddr,
 423                          struct kqemu_ram_page *rp, int pte_flags)
 424 {
 425     unsigned long *rptep;
 426     struct kqemu_ram_page *rp_prev, *rp_next;
 427
 428 #ifdef DEBUG_RAM
 429     monitor_log(s, "map_ram_page: vaddr=%p rp=%p pte_flags=0x%x\n",
 430                 (void *)vaddr, rp, pte_flags);
 431 #endif
 432     unmap_virtual_ram_page(s, as_index, vaddr);
 433
 434     mon_set_pte(s, as_index, vaddr, rp->paddr, pte_flags);
 435
 436     if (rp->vaddr == -1) {
 437         /* most common case */
 438         rp->vaddr = vaddr | (as_index << 1);
 439
 440         /* add in mapping list */
 441         rp_prev = s->mapped_page_head.map_prev;
 442         rp_next = &s->mapped_page_head;
 443         rp_next->map_prev = rp;
 444         rp->map_next = rp_next;
 445         rp_prev->map_next = rp;
 446         rp->map_prev = rp_prev;
 447     } else {
 448         /* add a new mapping (there is already at least one mapping) */
 449         rptep = get_ram_page_next_mapping_alloc(s, as_index, vaddr, 1);
 450         if (!rptep)
 451             monitor_panic(s, "next_mapping: could not alloc page");
 452         *rptep = rp->vaddr;
 453         rp->vaddr = vaddr | (as_index << 1) | 1;
 454     }
 455
 456     /* move to head in locked list */
 457     rp_prev = &s->locked_page_head;
 458     if (rp != rp_prev->lock_next) {
 459         /* delete */
 460         rp->lock_prev->lock_next = rp->lock_next;
 461         rp->lock_next->lock_prev = rp->lock_prev;
 462
 463         /* insert at head */
 464         rp_next = s->locked_page_head.lock_next;
 465         rp_next->lock_prev = rp;
 466         rp->lock_next = rp_next;
 467         rp_prev->lock_next = rp;
 468         rp->lock_prev = rp_prev;
 469     }
 470 }
 471
 472 static unsigned long ram_ptr_to_ram_addr(struct kqemu_state *s, void *ptr)
 473 {
 474     int slot;
 475     slot = ((unsigned long)ptr - s->ram_page_cache_base) >> PAGE_SHIFT;
 476     return s->slot_to_ram_addr[slot];
 477 }
 478
 479 static void *get_ram_ptr_slow(struct kqemu_state *s, int slot,
 480                               unsigned long ram_addr)
 481 {
 482     struct kqemu_ram_page *rp;
 483     unsigned long vaddr;
 484     void *ptr;
 485
 486 #ifdef PROFILE_INTERP2
 487     s->ram_map_miss_count++;
 488 #endif
 489     rp = lock_ram_page(s, ram_addr);
 490     vaddr = (slot << PAGE_SHIFT) + s->ram_page_cache_base;
 491     /* map the ram page */
 492     map_ram_page(s, 0, vaddr, rp,
 493                  PG_PRESENT_MASK | PG_GLOBAL(s) |
 494                  PG_ACCESSED_MASK | PG_DIRTY_MASK |
 495                  PG_RW_MASK);
 496     s->slot_to_ram_addr[slot] = ram_addr;
 497     ptr = (void *)vaddr;
 498 #if defined(DEBUG_SOFT_TLB)
 499     monitor_log(s, "get_ram_ptr: slot=%d ram_addr=%p ptr=%p\n",
 500                 slot, (void *)ram_addr, ptr);
 501 #endif
 502     return ptr;
 503 }
 504
 505 static inline void *get_ram_ptr(struct kqemu_state *s, int slot,
 506                                 unsigned long ram_addr)
 507 {
 508     unsigned long vaddr;
 509 #ifdef PROFILE_INTERP2
 510     s->ram_map_count++;
 511 #endif
 512     if (likely(s->slot_to_ram_addr[slot] == ram_addr)) {
 513         vaddr = (slot << PAGE_SHIFT) + s->ram_page_cache_base;
 514         return (void *)vaddr;
 515     } else {
 516         return get_ram_ptr_slow(s, slot, ram_addr);
 517     }
 518 }
 519
 520 static inline int ram_is_dirty(struct kqemu_state *s, unsigned long ram_addr)
 521 {
 522     return s->ram_dirty[ram_addr >> PAGE_SHIFT] == 0xff;
 523 }
 524
 525 static inline int ram_get_dirty(struct kqemu_state *s, unsigned long ram_addr,
 526                                 int dirty_flags)
 527 {
 528     return s->ram_dirty[ram_addr >> PAGE_SHIFT] & dirty_flags;
 529 }
 530
 531 static void ram_set_read_only(struct kqemu_state *s,
 532                               unsigned long ram_addr)
 533 {
 534     struct kqemu_ram_page *rp;
 535     unsigned long addr, vaddr;
 536     unsigned long *nptep;
 537     uint32_t *ptep;
 538
 539     rp = get_locked_ram_page(s, ram_addr);
 540     if (rp) {
 541         vaddr = rp->vaddr;
 542         if (vaddr == -1)
 543             return;
 544         for(;;) {
 545             addr = vaddr & ~0xfff;
 546             if ((addr - s->ram_page_cache_base) < SOFT_TLB_SIZE * PAGE_SIZE) {
 547                 /* XXX: do it too */
 548             } else {
 549                 if (USE_PAE(s))
 550                     ptep = (uint32_t *)mon_get_ptep_l3(s,
 551                                                        GET_AS(vaddr), addr, 0);
 552                 else
 553                     ptep = mon_get_ptep_l2(s, GET_AS(vaddr), addr, 0);
 554                 *ptep &= ~PG_RW_MASK;
 555                 asm volatile("invlpg %0" : : "m" (*(uint8_t *)addr));
 556             }
 557             if (IS_LAST_VADDR(vaddr))
 558                 break;
 559             nptep = get_ram_page_next_mapping(s, GET_AS(vaddr), addr);
 560             vaddr = *nptep;
 561         }
 562     }
 563 }
 564
 565 /* XXX: need to reset user space structures too */
 566 static void ram_reset_dirty(struct kqemu_state *s,
 567                             unsigned long ram_addr, int dirty_flag)
 568 {
 569
 570     /* we must modify the protection of all the user pages if it is
 571        not already done */
 572     if (ram_is_dirty(s, ram_addr)) {
 573         ram_set_read_only(s, ram_addr);
 574         /* signal QEMU that it needs to update its TLB info */
 575         s->cpu_state.nb_ram_pages_to_update = 1;
 576     }
 577     s->ram_dirty[ram_addr >> PAGE_SHIFT] &= ~dirty_flag;
 578 }
 579
 580 static inline void *get_phys_mem_ptr(struct kqemu_state *s,
 581                                      unsigned long paddr, int write)
 582 {
 583     int io_index, slot;
 584     unsigned long pd, ram_addr;
 585     uint8_t *ptr;
 586
 587     pd = phys_page_find(s, paddr >> PAGE_SHIFT);
 588     io_index = (pd & ~PAGE_MASK);
 589     if (unlikely(io_index != KQEMU_IO_MEM_RAM)) {
 590         if (io_index != KQEMU_IO_MEM_ROM)
 591             return NULL;
 592         if (write)
 593             return NULL;
 594     }
 595     ram_addr = pd & PAGE_MASK;
 596     slot = (ram_addr >> PAGE_SHIFT);
 597     slot = slot ^ (slot >> PHYS_SLOT_BITS) ^ (slot >> (2 * PHYS_SLOT_BITS));
 598     slot = (slot & (PHYS_NB_SLOTS - 1)) + SOFT_TLB_SIZE;
 599     ptr = get_ram_ptr(s, slot, ram_addr);
 600 #if defined(DEBUG_TLB)
 601     monitor_log(s, "get_phys_mem_ptr: paddr=%p ram_addr=%p ptr=%p\n",
 602                 (void *)paddr,
 603                 (void *)ram_addr,
 604                 (void *)ptr);
 605 #endif
 606     return ptr + (paddr & ~PAGE_MASK);
 607 }
 608
 609 static uint32_t ldl_phys_mmu(struct kqemu_state *s, unsigned long addr)
 610 {
 611     uint32_t *ptr;
 612     uint32_t val;
 613     ptr = get_phys_mem_ptr(s, addr, 0);
 614     if (!ptr)
 615         val = 0;
 616     else
 617         val = *ptr;
 618 #ifdef DEBUG_PHYS_LOAD_STORE
 619     monitor_log(s, "ldl_phys_mmu: %p = 0x%08x\n", (void *)addr, val);
 620 #endif
 621     return val;
 622 }
 623
 624 /* NOTE: we do not update the dirty bits. This function is only used
 625    to update the D and A bits, so it is not critical */
 626 static void stl_phys_mmu(struct kqemu_state *s, unsigned long addr,
 627                              uint32_t val)
 628 {
 629     uint32_t *ptr;
 630 #ifdef DEBUG_PHYS_LOAD_STORE
 631     monitor_log(s, "st_phys_mmu: %p = 0x%08x\n", (void *)addr, val);
 632 #endif
 633     ptr = get_phys_mem_ptr(s, addr, 1);
 634     if (ptr)
 635         *ptr = val;
 636 }
 637
 638 /* return 0 if OK, 2 if the mapping could not be done because I/O
 639    memory region or monitor memory area */
 640 static long tlb_set_page(struct kqemu_state *s,
 641                         unsigned long vaddr, unsigned long paddr,
 642                         int prot, int is_softmmu)
 643 {
 644     unsigned long pd;
 645     int pte_flags, mask, is_user;
 646     long ret;
 647     struct kqemu_ram_page *rp;
 648
 649 #ifdef DEBUG_RAM
 650     monitor_log(s, "tlb_set_page: vaddr=%p paddr=%p prot=0x%02x s=%d\n",
 651                 (void *)vaddr, (void *)paddr, prot, is_softmmu);
 652 #endif
 653     pd = phys_page_find(s, paddr >> PAGE_SHIFT);
 654
 655     if ((pd & ~PAGE_MASK) > KQEMU_IO_MEM_ROM) {
 656         if ((pd & ~PAGE_MASK) == KQEMU_IO_MEM_COMM) {
 657             /* special case: mapping of the kqemu communication page */
 658             pte_flags = PG_PRESENT_MASK | PG_USER_MASK |
 659                 PG_ACCESSED_MASK | PG_DIRTY_MASK;
 660             is_user = (s->cpu_state.cpl == 3);
 661             if (is_user)
 662                 mask = PAGE_UWRITE;
 663             else
 664                 mask = PAGE_KWRITE;
 665             if (prot & mask)
 666                 pte_flags |= PG_ORIG_RW_MASK | PG_RW_MASK;
 667             mon_set_pte(s, is_user, vaddr, s->comm_page_index, pte_flags);
 668             ret = 0;
 669         } else {
 670             /* IO access: no mapping is done as it will be handled by the
 671                soft MMU */
 672             ret = 2;
 673         }
 674     } else {
 675         if (is_softmmu) {
 676             /* XXX: dirty ram support */
 677             /* XXX: rom support */
 678             TLBEntry *e;
 679             unsigned long vaddr1;
 680             int slot;
 681             void *ptr;
 682             slot = (vaddr >> PAGE_SHIFT) & (SOFT_TLB_SIZE - 1);
 683             e = &s->soft_tlb[slot];
 684             vaddr1 = vaddr & PAGE_MASK;
 685             if (prot & PAGE_KREAD)
 686                 e->vaddr[0] = vaddr1;
 687             else
 688                 e->vaddr[0] = -1;
 689             if (prot & PAGE_KWRITE)
 690                 e->vaddr[1] = vaddr1;
 691             else
 692                 e->vaddr[1] = -1;
 693             if (prot & PAGE_UREAD)
 694                 e->vaddr[2] = vaddr1;
 695             else
 696                 e->vaddr[2] = -1;
 697             if (prot & PAGE_UWRITE)
 698                 e->vaddr[3] = vaddr1;
 699             else
 700                 e->vaddr[3] = -1;
 701             ptr = get_ram_ptr(s, slot, pd & PAGE_MASK);
 702             e->addend = (unsigned long)ptr - vaddr1;
 703 #ifdef DEBUG_SOFT_TLB
 704             monitor_log(s, "tlb_set_page: vaddr=%p paddr=%p prot=0x%02x s=%d\n",
 705                         (void *)vaddr, (void *)paddr, prot, is_softmmu);
 706 #endif
 707             ret = 0;
 708         } else if ((vaddr - s->monitor_vaddr) < MONITOR_MEM_SIZE) {
 709             ret = 2;
 710         } else {
 711             pte_flags = PG_PRESENT_MASK | PG_USER_MASK |
 712                 PG_ACCESSED_MASK | PG_DIRTY_MASK;
 713 #ifdef USE_USER_PG_GLOBAL
 714             /* user pages are marked as global to stay in TLB when
 715                switching to kernel mode */
 716             /* XXX: check WP bit or ensure once that WP is set in
 717                kqemu */
 718             if (prot & PAGE_UREAD)
 719                 pte_flags |= PG_GLOBAL(s);
 720 #endif
 721             is_user = (s->cpu_state.cpl == 3);
 722             if (is_user)
 723                 mask = PAGE_UWRITE;
 724             else
 725                 mask = PAGE_KWRITE;
 726             if (prot & mask) {
 727                 pte_flags |= PG_ORIG_RW_MASK | PG_RW_MASK;
 728                 if ((pd & ~PAGE_MASK) == KQEMU_IO_MEM_ROM ||
 729                     ((pd & ~PAGE_MASK) == KQEMU_IO_MEM_RAM &&
 730                      !ram_is_dirty(s, pd))) {
 731                     pte_flags &= ~PG_RW_MASK;
 732                 }
 733             }
 734             rp = lock_ram_page(s, pd & PAGE_MASK);
 735             map_ram_page(s, is_user, vaddr, rp, pte_flags);
 736             ret = 0;
 737         }
 738     }
 739     return ret;
 740 }
 741
 742 /* return value:
 743    0  = nothing more to do
 744    1  = generate PF fault
 745    2  = soft MMU activation required for this block
 746 */
 747 long cpu_x86_handle_mmu_fault(struct kqemu_state *s, unsigned long addr,
 748                               int is_write, int is_user, int is_softmmu)
 749 {
 750     struct kqemu_cpu_state *env = &s->cpu_state;
 751     uint32_t pdpe_addr, pde_addr, pte_addr;
 752     uint32_t pde, pte, ptep, pdpe;
 753     int error_code, is_dirty, prot, page_size;
 754     unsigned long paddr, page_offset;
 755     unsigned long vaddr, virt_addr;
 756     long ret;
 757
 758 #ifdef DEBUG_MMU
 759     monitor_log(s, "mmu_fault: addr=%08lx w=%d u=%d s=%d\n",
 760                 addr, is_write, is_user, is_softmmu);
 761 #endif
 762
 763     is_write &= 1;
 764
 765     if (!(env->cr0 & CR0_PG_MASK)) {
 766         pte = addr;
 767         virt_addr = addr & PAGE_MASK;
 768         prot = PAGE_KREAD | PAGE_KWRITE | PAGE_UREAD | PAGE_UWRITE;
 769         page_size = 4096;
 770         goto do_mapping;
 771     }
 772
 773
 774     if (env->cr4 & CR4_PAE_MASK) {
 775         /* XXX: we only use 32 bit physical addresses */
 776 #ifdef __x86_64__
 777         if (env->efer & MSR_EFER_LMA) {
 778             uint32_t pml4e_addr, pml4e;
 779             int32_t sext;
 780
 781             /* XXX: handle user + rw rights */
 782             /* XXX: handle NX flag */
 783             /* test virtual address sign extension */
 784             sext = (int64_t)addr >> 47;
 785             if (sext != 0 && sext != -1) {
 786                 error_code = 0;
 787                 goto do_fault;
 788             }
 789
 790             pml4e_addr = ((env->cr3 & ~0xfff) + (((addr >> 39) & 0x1ff) << 3)) &
 791                 env->a20_mask;
 792             pml4e = ldl_phys_mmu(s, pml4e_addr);
 793             if (!(pml4e & PG_PRESENT_MASK)) {
 794                 error_code = 0;
 795                 goto do_fault;
 796             }
 797             if (!(pml4e & PG_ACCESSED_MASK)) {
 798                 pml4e |= PG_ACCESSED_MASK;
 799                 stl_phys_mmu(s, pml4e_addr, pml4e);
 800             }
 801
 802             pdpe_addr = ((pml4e & ~0xfff) + (((addr >> 30) & 0x1ff) << 3)) &
 803                 env->a20_mask;
 804             pdpe = ldl_phys_mmu(s, pdpe_addr);
 805             if (!(pdpe & PG_PRESENT_MASK)) {
 806                 error_code = 0;
 807                 goto do_fault;
 808             }
 809             if (!(pdpe & PG_ACCESSED_MASK)) {
 810                 pdpe |= PG_ACCESSED_MASK;
 811                 stl_phys_mmu(s, pdpe_addr, pdpe);
 812             }
 813         } else
 814 #endif
 815         {
 816             pdpe_addr = ((env->cr3 & ~0x1f) + ((addr >> 30) << 3)) &
 817                 env->a20_mask;
 818             pdpe = ldl_phys_mmu(s, pdpe_addr);
 819             if (!(pdpe & PG_PRESENT_MASK)) {
 820                 error_code = 0;
 821                 goto do_fault;
 822             }
 823         }
 824
 825         pde_addr = ((pdpe & ~0xfff) + (((addr >> 21) & 0x1ff) << 3)) &
 826             env->a20_mask;
 827         pde = ldl_phys_mmu(s, pde_addr);
 828         if (!(pde & PG_PRESENT_MASK)) {
 829             error_code = 0;
 830             goto do_fault;
 831         }
 832         if (pde & PG_PSE_MASK) {
 833             /* 2 MB page */
 834             page_size = 2048 * 1024;
 835             goto handle_big_page;
 836         } else {
 837             /* 4 KB page */
 838             if (!(pde & PG_ACCESSED_MASK)) {
 839                 pde |= PG_ACCESSED_MASK;
 840                 stl_phys_mmu(s, pde_addr, pde);
 841             }
 842             pte_addr = ((pde & ~0xfff) + (((addr >> 12) & 0x1ff) << 3)) &
 843                 env->a20_mask;
 844             goto handle_4k_page;
 845         }
 846     } else {
 847         /* page directory entry */
 848         pde_addr = ((env->cr3 & ~0xfff) + ((addr >> 20) & ~3)) &
 849             env->a20_mask;
 850         pde = ldl_phys_mmu(s, pde_addr);
 851         if (!(pde & PG_PRESENT_MASK)) {
 852             error_code = 0;
 853             goto do_fault;
 854         }
 855         /* if PSE bit is set, then we use a 4MB page */
 856         if ((pde & PG_PSE_MASK) && (env->cr4 & CR4_PSE_MASK)) {
 857             page_size = 4096 * 1024;
 858         handle_big_page:
 859             if (is_user) {
 860                 if (!(pde & PG_USER_MASK))
 861                     goto do_fault_protect;
 862                 if (is_write && !(pde & PG_RW_MASK))
 863                     goto do_fault_protect;
 864             } else {
 865                 if ((env->cr0 & CR0_WP_MASK) &&
 866                     is_write && !(pde & PG_RW_MASK))
 867                     goto do_fault_protect;
 868             }
 869             is_dirty = is_write && !(pde & PG_DIRTY_MASK);
 870             if (!(pde & PG_ACCESSED_MASK) || is_dirty) {
 871                 pde |= PG_ACCESSED_MASK;
 872                 if (is_dirty)
 873                     pde |= PG_DIRTY_MASK;
 874                 stl_phys_mmu(s, pde_addr, pde);
 875             }
 876
 877             pte = pde & ~( (page_size - 1) & ~0xfff); /* align to page_size */
 878             ptep = pte;
 879             virt_addr = addr & ~(page_size - 1);
 880         } else {
 881             if (!(pde & PG_ACCESSED_MASK)) {
 882                 pde |= PG_ACCESSED_MASK;
 883                 stl_phys_mmu(s, pde_addr, pde);
 884             }
 885
 886             /* page directory entry */
 887             pte_addr = ((pde & ~0xfff) + ((addr >> 10) & 0xffc)) &
 888                 env->a20_mask;
 889         handle_4k_page:
 890             pte = ldl_phys_mmu(s, pte_addr);
 891             if (!(pte & PG_PRESENT_MASK)) {
 892                 error_code = 0;
 893                 goto do_fault;
 894             }
 895             /* combine pde and pte user and rw protections */
 896             ptep = pte & pde;
 897             if (is_user) {
 898                 if (!(ptep & PG_USER_MASK))
 899                     goto do_fault_protect;
 900                 if (is_write && !(ptep & PG_RW_MASK))
 901                     goto do_fault_protect;
 902             } else {
 903                 if ((env->cr0 & CR0_WP_MASK) &&
 904                     is_write && !(ptep & PG_RW_MASK))
 905                     goto do_fault_protect;
 906             }
 907             is_dirty = is_write && !(pte & PG_DIRTY_MASK);
 908             if (!(pte & PG_ACCESSED_MASK) || is_dirty) {
 909                 pte |= PG_ACCESSED_MASK;
 910                 if (is_dirty)
 911                     pte |= PG_DIRTY_MASK;
 912                 stl_phys_mmu(s, pte_addr, pte);
 913             }
 914             page_size = 4096;
 915             virt_addr = addr & ~0xfff;
 916         }
 917
 918         /* the page can be put in the TLB */
 919         prot = PAGE_KREAD;
 920         if (ptep & PG_USER_MASK)
 921             prot |= PAGE_UREAD;
 922         if (pte & PG_DIRTY_MASK) {
 923             /* only set write access if already dirty... otherwise wait
 924                for dirty access */
 925             if (ptep & PG_USER_MASK) {
 926                 if (ptep & PG_RW_MASK)
 927                     prot |= PAGE_UWRITE;
 928             }
 929             if (!(env->cr0 & CR0_WP_MASK) ||
 930                 (ptep & PG_RW_MASK))
 931                 prot |= PAGE_KWRITE;
 932         }
 933     }
 934  do_mapping:
 935     pte = pte & env->a20_mask;
 936
 937     /* Even if 4MB pages, we map only one 4KB page in the cache to
 938        avoid filling it too fast */
 939     page_offset = (addr & PAGE_MASK) & (page_size - 1);
 940     paddr = (pte & PAGE_MASK) + page_offset;
 941     vaddr = virt_addr + page_offset;
 942
 943     ret = tlb_set_page(s, vaddr, paddr, prot, is_softmmu);
 944     return ret;
 945
 946  do_fault_protect:
 947     error_code = PG_ERROR_P_MASK;
 948  do_fault:
 949     env->cr2 = addr;
 950     env->error_code = (is_write << PG_ERROR_W_BIT) | error_code;
 951     if (is_user)
 952         env->error_code |= PG_ERROR_U_MASK;
 953     return 1;
 954 }
 955
 956 static void soft_tlb_fill(struct kqemu_state *s, unsigned long vaddr,
 957                           int is_write, int is_user)
 958 {
 959     long ret;
 960 #ifdef PROFILE_SOFTMMU
 961     int ti;
 962     ti = getclock();
 963 #endif
 964     ret = cpu_x86_handle_mmu_fault(s, vaddr, is_write, is_user, 1);
 965 #ifdef PROFILE_SOFTMMU
 966     ti = getclock() - ti;
 967     monitor_log(s, "soft_tlb_fill: w=%d u=%d addr=%p cycle=%d\n",
 968                 is_write, is_user, (void *)vaddr, ti);
 969 #endif
 970     if (ret == 1)
 971         raise_exception(s, EXCP0E_PAGE);
 972     else if (ret == 2)
 973         raise_exception(s, KQEMU_RET_SOFTMMU);
 974 }
 975
 976 static void *map_vaddr(struct kqemu_state *s, unsigned long addr,
 977                        int is_write, int is_user)
 978 {
 979     TLBEntry *e;
 980     unsigned long taddr;
 981
 982     e = &s->soft_tlb[(addr >> PAGE_SHIFT) & (SOFT_TLB_SIZE - 1)];
 983  redo:
 984     if (e->vaddr[(is_user << 1) + is_write] != (addr & PAGE_MASK)) {
 985         soft_tlb_fill(s, addr, is_write, is_user);
 986         goto redo;
 987     } else {
 988         taddr = e->addend + addr;
 989     }
 990     return (void *)taddr;
 991 }
 992
 993 uint32_t ldub_slow(struct kqemu_state *s, unsigned long addr,
 994                    int is_user)
 995 {
 996     TLBEntry *e;
 997     uint32_t val;
 998     unsigned long taddr;
 999
1000     e = &s->soft_tlb[(addr >> PAGE_SHIFT) & (SOFT_TLB_SIZE - 1)];
1001  redo:
1002     if (unlikely(e->vaddr[(is_user << 1)] != (addr & PAGE_MASK))) {
1003         soft_tlb_fill(s, addr, 0, is_user);
1004         goto redo;
1005     } else {
1006         taddr = e->addend + addr;
1007         val = *(uint8_t *)taddr;
1008     }
1009     return val;
1010 }
1011
1012 uint32_t lduw_slow(struct kqemu_state *s, unsigned long addr,
1013                    int is_user)
1014 {
1015     TLBEntry *e;
1016     uint32_t val;
1017     unsigned long taddr;
1018
1019     e = &s->soft_tlb[(addr >> PAGE_SHIFT) & (SOFT_TLB_SIZE - 1)];
1020  redo:
1021     if (unlikely(e->vaddr[(is_user << 1)] != (addr & (PAGE_MASK | 1)))) {
1022         if (e->vaddr[(is_user << 1)] == (addr & PAGE_MASK)) {
1023             /* unaligned access */
1024             if (((addr + 1) & PAGE_MASK) == (addr & PAGE_MASK)) {
1025                 goto access_ok;
1026             } else {
1027                 uint32_t v0, v1;
1028                 /* access spans two pages (rare case) */
1029                 v0 = ldub_slow(s, addr, is_user);
1030                 v1 = ldub_slow(s, addr + 1, is_user);
1031                 val = v0 | (v1 << 8);
1032             }
1033         } else {
1034             soft_tlb_fill(s, addr, 0, is_user);
1035             goto redo;
1036         }
1037     } else {
1038     access_ok:
1039         taddr = e->addend + addr;
1040         val = *(uint16_t *)taddr;
1041     }
1042     return val;
1043 }
1044
1045 uint32_t ldl_slow(struct kqemu_state *s, unsigned long addr,
1046                   int is_user)
1047 {
1048     TLBEntry *e;
1049     uint32_t val;
1050     unsigned long taddr;
1051
1052     e = &s->soft_tlb[(addr >> PAGE_SHIFT) & (SOFT_TLB_SIZE - 1)];
1053  redo:
1054     if (unlikely(e->vaddr[(is_user << 1)] != (addr & (PAGE_MASK | 3)))) {
1055         if (e->vaddr[(is_user << 1)] == (addr & PAGE_MASK)) {
1056             /* unaligned access */
1057             if (((addr + 3) & PAGE_MASK) == (addr & PAGE_MASK)) {
1058                 goto access_ok;
1059             } else {
1060                 uint32_t v0, v1;
1061                 int shift;
1062                 /* access spans two pages (rare case) */
1063                 shift = (addr & 3) * 8;
1064                 addr &= ~3;
1065                 v0 = ldl_slow(s, addr, is_user);
1066                 v1 = ldl_slow(s, addr + 4, is_user);
1067                 val = (v0 >> shift) | (v1 << (32 - shift));
1068             }
1069         } else {
1070             soft_tlb_fill(s, addr, 0, is_user);
1071             goto redo;
1072         }
1073     } else {
1074     access_ok:
1075         taddr = e->addend + addr;
1076         val = *(uint32_t *)taddr;
1077     }
1078     return val;
1079 }
1080
1081 uint64_t ldq_slow(struct kqemu_state *s, unsigned long addr,
1082                   int is_user)
1083 {
1084     TLBEntry *e;
1085     uint64_t val;
1086     unsigned long taddr;
1087
1088     e = &s->soft_tlb[(addr >> PAGE_SHIFT) & (SOFT_TLB_SIZE - 1)];
1089  redo:
1090     if (unlikely(e->vaddr[(is_user << 1)] != (addr & (PAGE_MASK | 7)))) {
1091         if (e->vaddr[(is_user << 1)] == (addr & PAGE_MASK)) {
1092             /* unaligned access */
1093             if (((addr + 7) & PAGE_MASK) == (addr & PAGE_MASK)) {
1094                 goto access_ok;
1095             } else {
1096                 uint64_t v0, v1;
1097                 int shift;
1098                 /* access spans two pages (rare case) */
1099                 shift = (addr & 7) * 8;
1100                 addr &= ~7;
1101                 v0 = ldq_slow(s, addr, is_user);
1102                 v1 = ldq_slow(s, addr + 8, is_user);
1103                 val = (v0 >> shift) | (v1 << (64 - shift));
1104             }
1105         } else {
1106             soft_tlb_fill(s, addr, 0, is_user);
1107             goto redo;
1108         }
1109     } else {
1110     access_ok:
1111         taddr = e->addend + addr;
1112         val = *(uint64_t *)taddr;
1113     }
1114     return val;
1115 }
1116
1117 void stb_slow(struct kqemu_state *s, unsigned long addr,
1118               uint32_t val, int is_user)
1119 {
1120     TLBEntry *e;
1121     unsigned long taddr;
1122
1123     e = &s->soft_tlb[(addr >> PAGE_SHIFT) & (SOFT_TLB_SIZE - 1)];
1124  redo:
1125     if (unlikely(e->vaddr[(is_user << 1) + 1] != (addr & PAGE_MASK))) {
1126         soft_tlb_fill(s, addr, 1, is_user);
1127         goto redo;
1128     } else {
1129         taddr = e->addend + addr;
1130         *(uint8_t *)taddr = val;
1131     }
1132 }
1133
1134 void stw_slow(struct kqemu_state *s, unsigned long addr,
1135               uint32_t val, int is_user)
1136 {
1137     TLBEntry *e;
1138     unsigned long taddr;
1139
1140     e = &s->soft_tlb[(addr >> PAGE_SHIFT) & (SOFT_TLB_SIZE - 1)];
1141  redo:
1142     if (unlikely(e->vaddr[(is_user << 1) + 1] != (addr & (PAGE_MASK | 1)))) {
1143         if (e->vaddr[(is_user << 1) + 1] == (addr & PAGE_MASK)) {
1144             /* unaligned access */
1145             if (((addr + 1) & PAGE_MASK) == (addr & PAGE_MASK)) {
1146                 goto access_ok;
1147             } else {
1148                 /* access spans two pages (rare case) */
1149                 stb_slow(s, addr, val, is_user);
1150                 stb_slow(s, addr + 1, val >> 8, is_user);
1151             }
1152         } else {
1153             soft_tlb_fill(s, addr, 1, is_user);
1154             goto redo;
1155         }
1156     } else {
1157     access_ok:
1158         taddr = e->addend + addr;
1159         *(uint16_t *)taddr = val;
1160     }
1161 }
1162
1163 void stl_slow(struct kqemu_state *s, unsigned long addr,
1164               uint32_t val, int is_user)
1165 {
1166     TLBEntry *e;
1167     unsigned long taddr;
1168
1169     e = &s->soft_tlb[(addr >> PAGE_SHIFT) & (SOFT_TLB_SIZE - 1)];
1170  redo:
1171     if (unlikely(e->vaddr[(is_user << 1) + 1] != (addr & (PAGE_MASK | 3)))) {
1172         if (e->vaddr[(is_user << 1) + 1] == (addr & PAGE_MASK)) {
1173             /* unaligned access */
1174             if (((addr + 3) & PAGE_MASK) == (addr & PAGE_MASK)) {
1175                 goto access_ok;
1176             } else {
1177                 /* access spans two pages (rare case) */
1178                 stb_slow(s, addr, val, is_user);
1179                 stb_slow(s, addr + 1, val >> 8, is_user);
1180                 stb_slow(s, addr + 2, val >> 16, is_user);
1181                 stb_slow(s, addr + 3, val >> 24, is_user);
1182             }
1183         } else {
1184             soft_tlb_fill(s, addr, 1, is_user);
1185             goto redo;
1186         }
1187     } else {
1188     access_ok:
1189         taddr = e->addend + addr;
1190         *(uint32_t *)taddr = val;
1191     }
1192 }
1193
1194 void stq_slow(struct kqemu_state *s, unsigned long addr,
1195               uint64_t val, int is_user)
1196 {
1197     TLBEntry *e;
1198     unsigned long taddr;
1199
1200     e = &s->soft_tlb[(addr >> PAGE_SHIFT) & (SOFT_TLB_SIZE - 1)];
1201  redo:
1202     if (unlikely(e->vaddr[(is_user << 1) + 1] != (addr & (PAGE_MASK | 7)))) {
1203         if (e->vaddr[(is_user << 1) + 1] == (addr & PAGE_MASK)) {
1204             /* unaligned access */
1205             if (((addr + 7) & PAGE_MASK) == (addr & PAGE_MASK)) {
1206                 goto access_ok;
1207             } else {
1208                 /* access spans two pages (rare case) */
1209                 stb_slow(s, addr, val, is_user);
1210                 stb_slow(s, addr + 1, val >> 8, is_user);
1211                 stb_slow(s, addr + 2, val >> 16, is_user);
1212                 stb_slow(s, addr + 3, val >> 24, is_user);
1213                 stb_slow(s, addr + 4, val >> 32, is_user);
1214                 stb_slow(s, addr + 5, val >> 40, is_user);
1215                 stb_slow(s, addr + 6, val >> 48, is_user);
1216                 stb_slow(s, addr + 7, val >> 56, is_user);
1217             }
1218         } else {
1219             soft_tlb_fill(s, addr, 1, is_user);
1220             goto redo;
1221         }
1222     } else {
1223     access_ok:
1224         taddr = e->addend + addr;
1225         *(uint64_t *)taddr = val;
1226     }
1227 }
1228
1229 extern unsigned long __start_mmu_ex_table;
1230 extern unsigned long __stop_mmu_ex_table;
1231 int sorted = 0;
1232
1233 void lsort(unsigned long *tab, int n)
1234 {
1235     int i, j;
1236     unsigned long tmp;
1237
1238     for(i = 0; i < n - 1; i++) {
1239         for(j = i + 1; j < n;j++) {
1240             if (tab[i] > tab[j]) {
1241                 tmp = tab[i];
1242                 tab[i] = tab[j];
1243                 tab[j] = tmp;
1244             }
1245         }
1246     }
1247 #if 0
1248     for(i = 0; i < n - 1; i++) {
1249         if (tab[i] > tab[i + 1])
1250             asm volatile("ud2");
1251     }
1252 #endif
1253 }
1254
1255 static int expected_monitor_exception(unsigned long pc)
1256 {
1257     unsigned long *tab, v;
1258     int a, b, m;
1259     if (unlikely(!sorted)) {
1260         lsort(&__start_mmu_ex_table,
1261               &__stop_mmu_ex_table - &__start_mmu_ex_table);
1262         sorted = 1;
1263     }
1264
1265     tab = &__start_mmu_ex_table;
1266     a = 0;
1267     b = &__stop_mmu_ex_table - &__start_mmu_ex_table - 1;
1268     while (a <= b) {
1269         m = (a + b) >> 1;
1270         v = tab[m];
1271         if (v == pc)
1272             return 1;
1273         else if (v > pc) {
1274             b = m - 1;
1275         } else {
1276             a = m + 1;
1277         }
1278     }
1279     return 0;
1280 }
1281
1282 /* page fault */
1283 void kqemu_exception_0e(struct kqemu_state *s,
1284                         struct kqemu_exception_regs regs)
1285 {
1286     unsigned long address;
1287     int is_write, is_user;
1288     long ret;
1289 #ifdef PROFILE_INTERP2
1290     int64_t ti;
1291 #endif
1292     asm volatile ("mov %%cr2, %0" : "=r" (address));
1293 #ifdef PROFILE_INTERP2
1294     ti = getclock();
1295 #endif
1296
1297     if ((regs.cs_sel & 3) != 3) {
1298         if (!expected_monitor_exception(regs.eip)) {
1299             /* exception in monitor space - we may accept it someday if it
1300                is a user access indicated as such */
1301             monitor_panic_regs(s, &regs,
1302                                "Paging exception in monitor address space. CR2=%p\n",
1303                                (void *)address);
1304         }
1305         /* do not reload s->regs because we are already in interpreter */
1306         s->seg_cache_loaded = 1;
1307     } else {
1308         s->regs = &regs;
1309         s->seg_cache_loaded = 0;
1310     }
1311     is_write = (regs.error_code >> 1) & 1;
1312 #ifdef PROFILE_INTERP2
1313     s->total_page_fault_count++;
1314 #endif
1315     /* see if the page is write protected -> mark it dirty if needed */
1316     is_user = (s->cpu_state.cpl == 3);
1317     if (is_write && (regs.error_code & 1)) {
1318         uint32_t ram_index, *ptep;
1319         struct kqemu_ram_page *rp;
1320         int dirty_mask;
1321
1322         /* get the original writable flag */
1323         if (USE_PAE(s)) {
1324             uint64_t pte;
1325             ptep = (uint32_t *)mon_get_ptep_l3(s, is_user, address, 0);
1326             if (!ptep)
1327                 goto fail;
1328             pte = *(uint64_t *)ptep;
1329             if (!(pte & PG_PRESENT_MASK))
1330                 goto fail;
1331             if (!(pte & PG_ORIG_RW_MASK))
1332                 goto fail;
1333             rp = find_ram_page_from_paddr(s, pte >> PAGE_SHIFT);
1334         } else {
1335             uint32_t pte;
1336             ptep = mon_get_ptep_l2(s, is_user, address, 0);
1337             if (!ptep)
1338                 goto fail;
1339             pte = *ptep;
1340             if (!(pte & PG_PRESENT_MASK))
1341                 goto fail;
1342             if (!(pte & PG_ORIG_RW_MASK))
1343                 goto fail;
1344             rp = find_ram_page_from_paddr(s, pte >> PAGE_SHIFT);
1345         }
1346         if (!rp)
1347             goto fail;
1348         ram_index = rp - s->ram_pages;
1349         /* cannot write directly on GDT/LDT pages or in pages where
1350            code was translated  */
1351         /* XXX: should revalidate or interpret the code to go faster */
1352 #ifdef USE_SEG_GP
1353         dirty_mask = 0;
1354         if (s->cpu_state.cpl == 3)
1355             dirty_mask |= DT_DIRTY_FLAG;
1356 #else
1357         dirty_mask = DT_DIRTY_FLAG;
1358 #endif
1359         if ((s->ram_dirty[ram_index] & dirty_mask) != dirty_mask) {
1360             raise_exception(s, KQEMU_RET_SOFTMMU);
1361         }
1362         /* code updates need to be signaled */
1363         if ((s->ram_dirty[ram_index] & CODE_DIRTY_FLAG) !=
1364             CODE_DIRTY_FLAG) {
1365             s->modified_ram_pages[s->cpu_state.nb_modified_ram_pages++] =
1366                 ram_index << PAGE_SHIFT;
1367             /* too many modified pages: exit */
1368             if (s->cpu_state.nb_modified_ram_pages >=
1369                 KQEMU_MAX_MODIFIED_RAM_PAGES)
1370                 raise_exception(s, KQEMU_RET_SOFTMMU);
1371         }
1372
1373         /* set the page as RW and mark the corresponding ram page as
1374            dirty */
1375         s->ram_dirty[ram_index] = 0xff;
1376         *ptep |= PG_RW_MASK;
1377         asm volatile("invlpg %0" : : "m" (*(uint8_t *)address));
1378         return;
1379     fail: ;
1380     }
1381
1382 #ifdef PROFILE_INTERP2
1383     s->mmu_page_fault_count++;
1384 #endif
1385     /* see if it is an MMU fault */
1386     ret = cpu_x86_handle_mmu_fault(s, address, is_write, is_user, 0);
1387     switch(ret) {
1388     case 0:
1389 #ifdef PROFILE_INTERP2
1390         if ((regs.cs_sel & 3) != 3)
1391             s->tlb_interp_page_fault_count++;
1392         s->tlb_page_fault_count++;
1393         s->tlb_page_fault_cycles += (getclock() - ti);
1394 #endif
1395         break;
1396     case 1:
1397 #ifdef PROFILE_INTERP2
1398         s->mmu_page_fault_cycles += (getclock() - ti);
1399 #endif
1400         /* real MMU fault */
1401         raise_exception(s, EXCP0E_PAGE);
1402     case 2:
1403     default:
1404 #ifdef PROFILE_INTERP2
1405         s->mmu_page_fault_cycles += (getclock() - ti);
1406 #endif
1407         /* cannot map: I/O  */
1408         raise_exception(s, KQEMU_RET_SOFTMMU);
1409     }
1410 }
1411
1412 /* exit the virtual cpu by raising an exception */
1413 void raise_exception(struct kqemu_state *s, int intno)
1414 {
1415     /* XXX: the exclusion of exception GPF is needed for correct
1416        Windows XP boot. I don't know the precise explanation yet. */
1417     if (s->cpu_state.user_only || (unsigned int)intno >= 0x20 ||
1418         intno == 0x0d) {
1419         /* exit the monitor if user only */
1420         profile_record(s);
1421         s->mon_req = MON_REQ_EXIT;
1422         s->arg0 = intno;
1423         profile_record(s);
1424         monitor2kernel1(s);
1425     } else {
1426         s->arg0 = intno;
1427         start_func(raise_exception_interp, s,
1428                    s->stack_end - sizeof(struct kqemu_exception_regs));
1429     }
1430     /* never returns */
1431     while (1);
1432 }
1433
1434 void __raise_exception_err(struct kqemu_state *s,
1435                            int intno, int error_code)
1436 {
1437     s->cpu_state.error_code = error_code;
1438     raise_exception(s, intno);
1439 }
1440
1441 void do_update_cr3(struct kqemu_state *s, unsigned long new_cr3)
1442 {
1443     if (s->cpu_state.cr0 & CR0_PG_MASK) {
1444         tlb_flush(s, 1);
1445         /* indicate that all the pages must be flushed in user space */
1446         s->cpu_state.nb_pages_to_flush = KQEMU_FLUSH_ALL;
1447     }
1448     s->cpu_state.cr3 = new_cr3;
1449 }
1450
1451 #define CR0_UPDATE_MASK (CR0_TS_MASK | CR0_MP_MASK | CR0_EM_MASK | CR0_AM_MASK)
1452
1453 void do_update_cr0(struct kqemu_state *s, unsigned long new_cr0)
1454 {
1455     if ((new_cr0 & ~CR0_UPDATE_MASK) !=
1456         (s->cpu_state.cr0 & ~CR0_UPDATE_MASK))
1457         raise_exception(s, KQEMU_RET_SOFTMMU);
1458     if ((new_cr0 & CR0_UPDATE_MASK) !=
1459         (s->cpu_state.cr0 & CR0_UPDATE_MASK)) {
1460         s->cpu_state.cr0 = new_cr0;
1461         update_host_cr0(s);
1462     }
1463 }
1464
1465 #define CR4_UPDATE_MASK (CR4_TSD_MASK | CR4_OSFXSR_MASK | CR4_OSXMMEXCPT_MASK)
1466
1467 void do_update_cr4(struct kqemu_state *s, unsigned long new_cr4)
1468 {
1469     if ((new_cr4 & ~CR4_UPDATE_MASK) !=
1470         (s->cpu_state.cr4 & ~CR4_UPDATE_MASK))
1471         raise_exception(s, KQEMU_RET_SOFTMMU);
1472     if ((new_cr4 & CR4_UPDATE_MASK) !=
1473         (s->cpu_state.cr4 & CR4_UPDATE_MASK)) {
1474         s->cpu_state.cr4 = new_cr4;
1475         update_host_cr4(s);
1476     }
1477 }
1478
1479 void do_invlpg(struct kqemu_state *s, unsigned long vaddr)
1480 {
1481     tlb_flush_page(s, vaddr);
1482     if (s->cpu_state.nb_pages_to_flush >= KQEMU_MAX_PAGES_TO_FLUSH) {
1483         s->cpu_state.nb_pages_to_flush = KQEMU_FLUSH_ALL;
1484     } else {
1485         s->pages_to_flush[s->cpu_state.nb_pages_to_flush++] = vaddr;
1486     }
1487 }
1488
1489 extern unsigned long __start_seg_ex_table;
1490 extern unsigned long __stop_seg_ex_table;
1491
1492 static void handle_mon_exception(struct kqemu_state *s,
1493                                  struct kqemu_exception_regs *regs,
1494                                  int intno)
1495 {
1496     unsigned long pc, *p;
1497
1498     pc = regs->eip;
1499     for(p = &__start_seg_ex_table; p != &__stop_seg_ex_table; p++) {
1500         if (*p == pc) goto found;
1501     }
1502     monitor_panic_regs(s, regs,
1503                        "Unexpected exception 0x%02x in monitor space\n",
1504                        intno);
1505  found:
1506     if (intno == 0x00) {
1507         /* division exception from interp */
1508         /* XXX: verify for fxsave/fxrstor */
1509         s->regs = &s->regs1;
1510     } else {
1511         /* Note: the exception state is reliable only for goto_user
1512            handling */
1513         s->regs = NULL;
1514     }
1515     raise_exception_err(s, intno, regs->error_code);
1516 }
1517
1518 #ifdef PROFILE_INTERP_PC
1519 static void profile_interp_add(struct kqemu_state *s,
1520                                unsigned long eip,
1521                                int64_t cycles,
1522                                int insn_count)
1523 {
1524     int h, idx;
1525     ProfileInterpEntry *pe;
1526
1527     h = (eip ^ (eip >> PROFILE_INTERP_PC_HASH_BITS) ^
1528          (eip >> (2 * PROFILE_INTERP_PC_HASH_BITS))) &
1529         (PROFILE_INTERP_PC_HASH_SIZE - 1);
1530     idx = s->profile_interp_hash_table[h];
1531     while (idx != 0) {
1532         pe = &s->profile_interp_entries[idx - 1];
1533         if (pe->eip == eip)
1534             goto found;
1535         idx = pe->next;
1536     }
1537     /* not found */
1538     if (s->nb_profile_interp_entries >= (PROFILE_INTERP_PC_NB_ENTRIES - 1)) {
1539         /* too many entries : use last entry */
1540         if (s->nb_profile_interp_entries < PROFILE_INTERP_PC_NB_ENTRIES)
1541             s->nb_profile_interp_entries++;
1542         pe = &s->profile_interp_entries[PROFILE_INTERP_PC_NB_ENTRIES - 1];
1543     } else {
1544         /* add one more entry */
1545         pe = &s->profile_interp_entries[s->nb_profile_interp_entries++];
1546         pe->next = s->profile_interp_hash_table[h];
1547         s->profile_interp_hash_table[h] = s->nb_profile_interp_entries;
1548         pe->eip = eip;
1549     }
1550  found:
1551     pe->count++;
1552     pe->cycles += cycles;
1553     pe->insn_count += insn_count;
1554 }
1555 #endif
1556
1557 static inline void kqemu_exception_interp(struct kqemu_state *s, int intno,
1558                                           struct kqemu_exception_regs *regs)
1559 {
1560 #ifdef PROFILE_INTERP2
1561     int64_t ti0, ti1, ti2;
1562     int c1;
1563     unsigned long start_eip;
1564     ti0 = getclock();
1565 #endif
1566     if ((regs->cs_sel & 3) != 3)
1567         handle_mon_exception(s, regs, intno);
1568
1569     profile_record(s);
1570
1571     s->regs = regs;
1572
1573     profile_record(s);
1574     update_seg_cache(s);
1575 #ifdef PROFILE_INTERP2
1576     ti1 = getclock();
1577     c1 = s->insn_count;
1578     start_eip = s->regs1.eip;
1579 #endif
1580
1581     insn_interp(s);
1582 #ifdef PROFILE_INTERP2
1583     ti2 = getclock();
1584     s->exc_interp_count++;
1585     s->exc_seg_cycles += ti1 - ti0;
1586     s->exc_interp_cycles += ti2 - ti1;
1587     c1 -= s->insn_count;
1588     s->exc_insn_count += c1;
1589     if (c1 > s->exc_insn_count_max) {
1590         s->exc_insn_count_max = c1;
1591         s->exc_start_eip_max = start_eip;
1592     }
1593 #ifdef PROFILE_INTERP_PC
1594     profile_interp_add(s, start_eip, ti2 - ti0, c1 + 1);
1595 #endif
1596 #endif
1597 }
1598
1599 /* XXX: remove L bit on x86_64 in legacy emulation ? */
1600 static void check_dt_entries(uint8_t *d, const uint8_t *s, int n)
1601 {
1602     int i;
1603     uint32_t e1, e2;
1604     for(i = 0; i < n; i++) {
1605         e1 = ((uint32_t *)s)[0];
1606         e2 = ((uint32_t *)s)[1];
1607         if (!(e2 & DESC_S_MASK)) {
1608             /* not a segment: reset DPL to ensure it cannot be used
1609                from user space */
1610             e2 &= ~(3 << DESC_DPL_SHIFT);
1611 #ifndef USE_SEG_GP
1612             ((uint32_t *)d)[32768 * 0 + 0] = e1; /* CPL = 0 */
1613             ((uint32_t *)d)[32768 * 0 + 1] = e2;
1614             ((uint32_t *)d)[32768 * 1 + 0] = e1; /* CPL = 1 */
1615             ((uint32_t *)d)[32768 * 1 + 1] = e2;
1616             ((uint32_t *)d)[32768 * 2 + 0] = e1; /* CPL = 2 */
1617             ((uint32_t *)d)[32768 * 2 + 1] = e2;
1618 #endif
1619             ((uint32_t *)d)[32768 * (NB_DT_TABLES - 1) + 0] = e1; /* CPL = 3 */
1620             ((uint32_t *)d)[32768 * (NB_DT_TABLES - 1) + 1] = e2;
1621         } else if (unlikely(((e2 & (DESC_CS_MASK | DESC_C_MASK)) ==
1622                              (DESC_CS_MASK | DESC_C_MASK)))) {
1623             /* conforming segment : no need to modify */
1624 #ifndef USE_SEG_GP
1625             ((uint32_t *)d)[32768 * 0 + 0] = e1; /* CPL = 0 */
1626             ((uint32_t *)d)[32768 * 0 + 1] = e2;
1627             ((uint32_t *)d)[32768 * 1 + 0] = e1; /* CPL = 1 */
1628             ((uint32_t *)d)[32768 * 1 + 1] = e2;
1629             ((uint32_t *)d)[32768 * 2 + 0] = e1; /* CPL = 2 */
1630             ((uint32_t *)d)[32768 * 2 + 1] = e2;
1631 #endif
1632             ((uint32_t *)d)[32768 * (NB_DT_TABLES - 1) + 0] = e1; /* CPL = 3 */
1633             ((uint32_t *)d)[32768 * (NB_DT_TABLES - 1) + 1] = e2;
1634         } else {
1635 #ifndef USE_SEG_GP
1636             int dpl;
1637             uint32_t e2tmp, e2dpl3;
1638
1639             dpl = (e2 >> DESC_DPL_SHIFT) & 3;
1640             /* standard segment: need to patch the DPL so that
1641                if (DPL >= CPL) then DPL = 3
1642             */
1643             e2dpl3 = e2 | (3 << DESC_DPL_SHIFT);
1644             ((uint32_t *)d)[32768 * 0 + 0] = e1; /* CPL = 0 */
1645             ((uint32_t *)d)[32768 * 0 + 1] = e2dpl3;
1646
1647             e2tmp = e2;
1648             if (dpl >= 1)
1649                 e2tmp = e2dpl3;
1650             ((uint32_t *)d)[32768 * 1 + 0] = e1; /* CPL = 1 */
1651             ((uint32_t *)d)[32768 * 1 + 1] = e2tmp;
1652
1653             e2tmp = e2;
1654             if (dpl >= 2)
1655                 e2tmp = e2dpl3;
1656             ((uint32_t *)d)[32768 * 2 + 0] = e1; /* CPL = 2 */
1657             ((uint32_t *)d)[32768 * 2 + 1] = e2tmp;
1658 #endif
1659             ((uint32_t *)d)[32768 * (NB_DT_TABLES - 1) + 0] = e1; /* CPL = 3 */
1660             ((uint32_t *)d)[32768 * (NB_DT_TABLES - 1) + 1] = e2;
1661
1662         }
1663         s += 8;
1664         d += 8;
1665     }
1666 }
1667
1668 static void check_dt_entries_page(struct kqemu_state *s, int dt_type,
1669                                   int sel, int sel_end, const uint8_t *src)
1670 {
1671     uint8_t *dt;
1672     int mon_sel_start, mon_sel_end, sel1, sel2;
1673
1674     dt = (uint8_t *)(s->dt_table + (dt_type * 8192));
1675     if (dt_type == 0) {
1676         mon_sel_start = s->monitor_selector_base;
1677         mon_sel_end = s->monitor_selector_base + MONITOR_SEL_RANGE;
1678         sel1 = sel;
1679         while (sel1 < sel_end) {
1680             if (sel1 >= mon_sel_start && sel1 < mon_sel_end)
1681                 sel1 = mon_sel_end;
1682             if (sel1 < mon_sel_start) {
1683                 sel2 = mon_sel_start;
1684                 if (sel2 > sel_end)
1685                     sel2 = sel_end;
1686             } else {
1687                 sel2 = sel_end;
1688             }
1689             if (sel1 >= sel2)
1690                 break;
1691 #ifdef DEBUG_DT_CACHE
1692             monitor_log(s, "check_dt: type=%d sel=%d-%d\n",
1693                         dt_type, sel1, sel2);
1694 #endif
1695             check_dt_entries(dt + sel1,
1696                              src + sel1 - sel, (sel2 - sel1) >> 3);
1697             sel1 = sel2;
1698         }
1699     } else {
1700 #ifdef DEBUG_DT_CACHE
1701             monitor_log(s, "check_dt: type=%d sel=%d-%d\n",
1702                         dt_type, sel, sel_end);
1703 #endif
1704         check_dt_entries(dt + sel, src, (sel_end - sel) >> 3);
1705     }
1706 }
1707
1708 static void reset_dt_entries2(void *dt1, int n)
1709 {
1710     uint32_t *dt = dt1;
1711 #ifndef USE_SEG_GP
1712     memset(dt + 32768 * 0, 0, n);
1713     memset(dt + 32768 * 1, 0, n);
1714     memset(dt + 32768 * 2, 0, n);
1715 #endif
1716     memset(dt + 32768 * (NB_DT_TABLES - 1), 0, n);
1717 }
1718
1719 static void reset_dt_entries(struct kqemu_state *s, int dt_type,
1720                              int sel, int sel_end)
1721 {
1722     uint8_t *dt;
1723     int mon_sel_start, mon_sel_end, sel1, sel2;
1724
1725     dt = (uint8_t *)(s->dt_table + (dt_type * 8192));
1726     if (dt_type == 0) {
1727         mon_sel_start = s->monitor_selector_base;
1728         mon_sel_end = s->monitor_selector_base + MONITOR_SEL_RANGE;
1729         sel1 = sel;
1730         while (sel1 < sel_end) {
1731             if (sel1 >= mon_sel_start && sel1 < mon_sel_end)
1732                 sel1 = mon_sel_end;
1733             if (sel1 < mon_sel_start) {
1734                 sel2 = mon_sel_start;
1735                 if (sel2 > sel_end)
1736                     sel2 = sel_end;
1737             } else {
1738                 sel2 = sel_end;
1739             }
1740             if (sel1 >= sel2)
1741                 break;
1742 #ifdef DEBUG_DT_CACHE
1743             monitor_log(s, "reset_dt: type=%d sel=%d-%d\n",
1744                         dt_type, sel1, sel2);
1745 #endif
1746             reset_dt_entries2(dt + sel1, sel2 - sel1);
1747             sel1 = sel2;
1748         }
1749     } else {
1750 #ifdef DEBUG_DT_CACHE
1751             monitor_log(s, "reset_dt: type=%d sel=%d-%d\n",
1752                         dt_type, sel, sel_end);
1753 #endif
1754             reset_dt_entries2(dt + sel, sel_end - sel);
1755     }
1756 }
1757
1758 /* Note: this function can raise an exception in case of MMU fault or
1759    unaligned DT table */
1760 static void update_dt_cache(struct kqemu_state *s, int dt_type)
1761 {
1762     unsigned long base, dt_end, page_end, dt_ptr, ram_addr;
1763     uint32_t limit;
1764     uint8_t *ptr;
1765     int pindex, sel, sel_end, dt_changed, sel2;
1766
1767     if (dt_type) {
1768         /* XXX: check the exact behaviour of zero LDT */
1769         if ((s->cpu_state.ldt.selector & 0xfffc) == 0) {
1770             base = 0;
1771             limit = 0;
1772         } else {
1773             base = s->cpu_state.ldt.base;
1774             limit = s->cpu_state.ldt.limit;
1775         }
1776     } else {
1777         base = s->cpu_state.gdt.base;
1778         limit = s->cpu_state.gdt.limit;
1779     }
1780     dt_changed = (base != s->dt_base[dt_type] ||
1781                   limit != s->dt_limit[dt_type]);
1782
1783     sel_end = (limit + 1) & ~7;
1784     dt_end = base + sel_end;
1785     if (dt_end < base || (base & 7) != 0)
1786         raise_exception(s, KQEMU_RET_SOFTMMU);
1787
1788     pindex = 0;
1789     sel = 0;
1790     while (sel < sel_end) {
1791         dt_ptr = base + sel;
1792         page_end = (dt_ptr & PAGE_MASK) + PAGE_SIZE;
1793         if (page_end > dt_end)
1794             page_end = dt_end;
1795         sel2 = sel + (page_end - dt_ptr);
1796         ptr = map_vaddr(s, dt_ptr, 0, 0);
1797         ram_addr = ram_ptr_to_ram_addr(s, ptr);
1798         if (dt_changed ||
1799             s->dt_ram_addr[dt_type][pindex] != ram_addr ||
1800             ram_get_dirty(s, ram_addr, DT_DIRTY_FLAG)) {
1801             s->dt_ram_addr[dt_type][pindex] = ram_addr;
1802             check_dt_entries_page(s, dt_type, sel, sel2, ptr);
1803             ram_reset_dirty(s, ram_addr, DT_DIRTY_FLAG);
1804         }
1805         sel = sel2;
1806         pindex++;
1807     }
1808
1809     /* reset the remaining DT entries up to the last limit */
1810     sel_end = (s->dt_limit[dt_type] + 1) & ~7;
1811     if (sel < sel_end)
1812         reset_dt_entries(s, dt_type, sel, sel_end);
1813
1814     s->dt_base[dt_type] = base;
1815     s->dt_limit[dt_type] = limit;
1816 }
1817
1818 void update_gdt_ldt_cache(struct kqemu_state *s)
1819 {
1820     update_dt_cache(s, 0);
1821     update_dt_cache(s, 1);
1822 }
1823
1824 void monitor_exec(struct kqemu_state *s)
1825 {
1826     struct kqemu_cpu_state *env = &s->cpu_state;
1827     struct kqemu_exception_regs *r =
1828         (void *)(s->stack_end - sizeof(struct kqemu_exception_regs));
1829 #ifdef PROFILE_INTERP2
1830     int64_t ti = getclock();
1831 #endif
1832     update_host_cr0(s);
1833
1834     update_host_cr4(s);
1835
1836     restore_monitor_nexus_mapping(s);
1837
1838     s->regs = NULL;
1839
1840     /* if max_locked_ram_pages was modified because some instances
1841        were added, we unlock some pages here */
1842     unlock_pages(s);
1843
1844     /* first we flush the pages if needed */
1845     if (env->nb_pages_to_flush != 0) {
1846         if (env->nb_pages_to_flush > KQEMU_MAX_PAGES_TO_FLUSH) {
1847             tlb_flush(s, 1);
1848         } else {
1849             int i;
1850             for(i = 0; i < env->nb_pages_to_flush; i++) {
1851                 tlb_flush_page(s, s->pages_to_flush[i]);
1852             }
1853         }
1854         env->nb_pages_to_flush = 0;
1855     }
1856
1857     /* XXX: invalidate modified ram pages */
1858     env->nb_modified_ram_pages = 0;
1859
1860     /* unmap pages corresponding to notdirty ram pages */
1861     if (env->nb_ram_pages_to_update != 0) {
1862         unsigned long ram_addr;
1863         int i;
1864
1865         if (env->nb_ram_pages_to_update > KQEMU_MAX_RAM_PAGES_TO_UPDATE) {
1866             for(ram_addr = 0; ram_addr < s->ram_size; ram_addr += PAGE_SIZE) {
1867                 if (!ram_is_dirty(s, ram_addr)) {
1868                     ram_set_read_only(s, ram_addr);
1869                 }
1870             }
1871         } else {
1872             for(i = 0; i < env->nb_ram_pages_to_update; i++) {
1873                 ram_addr = s->ram_pages_to_update[i];
1874                 if (ram_addr < s->ram_size &&
1875                     !ram_is_dirty(s, ram_addr)) {
1876                     ram_set_read_only(s, ram_addr);
1877                 }
1878             }
1879         }
1880         env->nb_ram_pages_to_update = 0;
1881     }
1882
1883 #ifdef USE_SEG_GP
1884     if (s->cpu_state.cpl == 3)
1885         update_gdt_ldt_cache(s);
1886 #else
1887     update_gdt_ldt_cache(s);
1888 #endif
1889
1890 #ifdef PROFILE_INTERP2
1891     s->exec_init_cycles += (getclock() - ti);
1892     s->exec_init_count++;
1893 #endif
1894
1895     /* since this is not costly, we ensure here that the CPU state is
1896        consistent with what we can handle */
1897     if (!(env->cr0 & CR0_PE_MASK) ||
1898         (env->eflags & VM_MASK)) {
1899         raise_exception(s, KQEMU_RET_SOFTMMU);
1900     }
1901
1902     r->eip = env->eip;
1903     r->eflags = compute_eflags_user(s, env->eflags);
1904     s->comm_page.virt_eflags = env->eflags & EFLAGS_MASK;
1905     r->cs_sel = env->segs[R_CS].selector | 3;
1906     r->ss_sel = env->segs[R_SS].selector | 3;
1907
1908     r->eax = env->regs[R_EAX];
1909     r->ecx = env->regs[R_ECX];
1910     r->edx = env->regs[R_EDX];
1911     r->ebx = env->regs[R_EBX];
1912     r->esp = env->regs[R_ESP];
1913     r->ebp = env->regs[R_EBP];
1914     r->esi = env->regs[R_ESI];
1915     r->edi = env->regs[R_EDI];
1916 #ifdef __x86_64__
1917     r->r8 = env->regs[8];
1918     r->r9 = env->regs[9];
1919     r->r10 = env->regs[10];
1920     r->r11 = env->regs[11];
1921     r->r12 = env->regs[12];
1922     r->r13 = env->regs[13];
1923     r->r14 = env->regs[14];
1924     r->r15 = env->regs[15];
1925 #else
1926     r->ds_sel = env->segs[R_DS].selector;
1927     r->es_sel = env->segs[R_ES].selector;
1928 #endif
1929
1930     update_seg_desc_caches(s);
1931
1932     /* NOTE: exceptions can occur here */
1933     reload_segs(s);
1934
1935     /* for consistency, we accept to start the interpreter here if
1936        needed */
1937     if (!(s->comm_page.virt_eflags & IF_MASK)) {
1938         s->regs = r;
1939         s->seg_cache_loaded = 1;
1940         s->insn_count = MAX_INSN_COUNT;
1941         insn_interp(s);
1942     }
1943
1944     goto_user(s, r);
1945 }
1946
1947 /* General Protection Fault. In all cases we need to interpret the
1948    code to know more */
1949 void kqemu_exception_0d(struct kqemu_state *s,
1950                         struct kqemu_exception_regs regs)
1951 {
1952     kqemu_exception_interp(s, 0x0d, &regs);
1953 }
1954
1955 /* illegal intruction. We need to interpret just for the syscall case */
1956 void kqemu_exception_06(struct kqemu_state *s,
1957                         struct kqemu_exception_regs regs)
1958 {
1959     kqemu_exception_interp(s, 0x06, &regs);
1960 }
1961
1962 /* Coproprocessor emulation fault. We handle here the fact that the
1963    FPU state can be temporarily stored in the host OS */
1964 void kqemu_exception_07(struct kqemu_state *s,
1965                         struct kqemu_exception_regs regs)
1966 {
1967     if ((regs.cs_sel & 3) != 3) {
1968         if (!expected_monitor_exception(regs.eip)) {
1969             monitor_panic_regs(s, &regs, "Unexpected exception 0x%02x in monitor space\n", 0x07);
1970         }
1971         /* this can happen for fxsave/fxrstor instructions in the
1972            interpreter */
1973         s->seg_cache_loaded = 1;
1974     } else {
1975         s->seg_cache_loaded = 0;
1976     }
1977     s->regs = &s->regs1;
1978     if (s->cpu_state.cr0 & (CR0_TS_MASK | CR0_EM_MASK)) {
1979         /* real FPU fault needed */
1980         raise_exception_err(s, EXCP07_PREX, 0);
1981     } else {
1982         /* the host needs to restore the FPU state for us */
1983         s->mon_req = MON_REQ_EXCEPTION;
1984         s->arg0 = 0x07;
1985         monitor2kernel1(s);
1986     }
1987 }
1988
1989 /* single step/debug */
1990 void kqemu_exception_01(struct kqemu_state *s,
1991                         struct kqemu_exception_regs regs)
1992 {
1993     unsigned long dr6, val;
1994
1995     asm volatile ("mov %%dr6, %0" : "=r" (dr6));
1996     /* Linux uses lazy dr7 clearing, so we must verify we are in this
1997        case */
1998     /* XXX: check that because TF should have the priority */
1999     if ((dr6 & 0xf) != 0 && !s->monitor_dr7)
2000         goto clear_dr7;
2001
2002     if ((regs.cs_sel & 3) != 3)
2003         monitor_panic_regs(s, &regs, "Unexpected exception 0x%02x in monitor space\n", 0x07);
2004
2005     s->regs = &regs;
2006     s->seg_cache_loaded = 0;
2007     /* update DR6 register */
2008     s->cpu_state.dr6 = dr6;
2009     raise_exception_err(s, EXCP01_SSTP, 0);
2010  clear_dr7:
2011     val = 0;
2012     asm volatile ("mov %0, %%dr7" : : "r" (val));
2013 }
2014
2015 #define DEFAULT_EXCEPTION(n) \
2016 void kqemu_exception_ ## n (struct kqemu_state *s, \
2017                             struct kqemu_exception_regs regs) \
2018 { \
2019     if ((regs.cs_sel & 3) != 3)\
2020         handle_mon_exception(s, &regs, 0x ## n);\
2021     s->regs = &regs;\
2022     s->seg_cache_loaded = 0;\
2023     s->cpu_state.error_code = regs.error_code;\
2024     raise_exception(s, 0x ## n);\
2025 }
2026
2027 DEFAULT_EXCEPTION(00)
2028 DEFAULT_EXCEPTION(02)
2029 DEFAULT_EXCEPTION(03)
2030 DEFAULT_EXCEPTION(04)
2031 DEFAULT_EXCEPTION(05)
2032 DEFAULT_EXCEPTION(08)
2033 DEFAULT_EXCEPTION(09)
2034 DEFAULT_EXCEPTION(0a)
2035 DEFAULT_EXCEPTION(0b)
2036 DEFAULT_EXCEPTION(0c)
2037 DEFAULT_EXCEPTION(0f)
2038 DEFAULT_EXCEPTION(10)
2039 DEFAULT_EXCEPTION(11)
2040 DEFAULT_EXCEPTION(12)
2041 DEFAULT_EXCEPTION(13)
2042
2043 void monitor_interrupt(struct kqemu_state *s, struct kqemu_exception_regs regs)
2044 {
2045     int intno;
2046 #ifdef PROFILE_INTERP2
2047     int64_t ti = getclock();
2048     s->hw_interrupt_start_count++;
2049 #endif
2050
2051     intno = regs.error_code;
2052
2053     if ((regs.cs_sel & 3) != 3) {
2054         monitor_panic_regs(s, &regs, "Interrupt 0x%02x in monitor space\n",
2055                            intno);
2056     }
2057
2058     s->regs = &regs;
2059     s->seg_cache_loaded = 0;
2060     /* execute the irq code in kernel space */
2061     s->mon_req = MON_REQ_IRQ;
2062     s->arg0 = intno;
2063     /* NOTE: if interrupting user code, the host kernel will schedule
2064        and eventually exit from the monitor_exec loop */
2065     monitor2kernel1(s);
2066     /* ... and come back to monitor space */
2067
2068 #ifdef PROFILE_INTERP2
2069     s->hw_interrupt_count++;
2070     s->hw_interrupt_cycles += (getclock() - ti);
2071 #endif
2072 }