softmmu/cpus.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to deal
   8  * in the Software without restriction, including without limitation the rights
   9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10  * copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22  * THE SOFTWARE.
  23  */
  24
  25 #include "qemu/osdep.h"
  26 #include "qemu-common.h"
  27 #include "qemu/config-file.h"
  28 #include "qemu/cutils.h"
  29 #include "migration/vmstate.h"
  30 #include "monitor/monitor.h"
  31 #include "qapi/error.h"
  32 #include "qapi/qapi-commands-misc.h"
  33 #include "qapi/qapi-events-run-state.h"
  34 #include "qapi/qmp/qerror.h"
  35 #include "qemu/error-report.h"
  36 #include "qemu/qemu-print.h"
  37 #include "sysemu/tcg.h"
  38 #include "sysemu/block-backend.h"
  39 #include "exec/gdbstub.h"
  40 #include "sysemu/dma.h"
  41 #include "sysemu/hw_accel.h"
  42 #include "sysemu/kvm.h"
  43 #include "sysemu/hax.h"
  44 #include "sysemu/hvf.h"
  45 #include "sysemu/whpx.h"
  46 #include "exec/exec-all.h"
  47
  48 #include "qemu/thread.h"
  49 #include "qemu/plugin.h"
  50 #include "sysemu/cpus.h"
  51 #include "sysemu/qtest.h"
  52 #include "qemu/main-loop.h"
  53 #include "qemu/option.h"
  54 #include "qemu/bitmap.h"
  55 #include "qemu/seqlock.h"
  56 #include "qemu/guest-random.h"
  57 #include "tcg/tcg.h"
  58 #include "hw/nmi.h"
  59 #include "sysemu/replay.h"
  60 #include "sysemu/runstate.h"
  61 #include "hw/boards.h"
  62 #include "hw/hw.h"
  63
  64 #include "sysemu/cpu-throttle.h"
  65
  66 #ifdef CONFIG_LINUX
  67
  68 #include <sys/prctl.h>
  69
  70 #ifndef PR_MCE_KILL
  71 #define PR_MCE_KILL 33
  72 #endif
  73
  74 #ifndef PR_MCE_KILL_SET
  75 #define PR_MCE_KILL_SET 1
  76 #endif
  77
  78 #ifndef PR_MCE_KILL_EARLY
  79 #define PR_MCE_KILL_EARLY 1
  80 #endif
  81
  82 #endif /* CONFIG_LINUX */
  83
/* File-local mutex object; the code that locks it is not in this part of the file. */
static QemuMutex qemu_global_mutex;

/*
 * NOTE(review): neither variable is read or written in this part of the
 * file; their units and meaning should be confirmed against their users.
 */
int64_t max_delay;
int64_t max_advance;
  88
  89 bool cpu_is_stopped(CPUState *cpu)
  90 {
  91     return cpu->stopped || !runstate_is_running();
  92 }
  93
  94 static inline bool cpu_work_list_empty(CPUState *cpu)
  95 {
  96     bool ret;
  97
  98     qemu_mutex_lock(&cpu->work_mutex);
  99     ret = QSIMPLEQ_EMPTY(&cpu->work_list);
 100     qemu_mutex_unlock(&cpu->work_mutex);
 101     return ret;
 102 }
 103
 104 static bool cpu_thread_is_idle(CPUState *cpu)
 105 {
 106     if (cpu->stop || !cpu_work_list_empty(cpu)) {
 107         return false;
 108     }
 109     if (cpu_is_stopped(cpu)) {
 110         return true;
 111     }
 112     if (!cpu->halted || cpu_has_work(cpu) ||
 113         kvm_halt_in_kernel()) {
 114         return false;
 115     }
 116     return true;
 117 }
 118
 119 static bool all_cpu_threads_idle(void)
 120 {
 121     CPUState *cpu;
 122
 123     CPU_FOREACH(cpu) {
 124         if (!cpu_thread_is_idle(cpu)) {
 125             return false;
 126         }
 127     }
 128     return true;
 129 }
 130
 131 /***********************************************************/
 132 /* guest cycle counter */
 133
 134 /* Protected by TimersState seqlock */
 135
/*
 * Mirrors the "sleep" icount option (see configure_icount()).  When false,
 * qemu_start_warp_timer() advances the icount bias straight to the next
 * deadline instead of arming the warp timer.
 */
static bool icount_sleep = true;
/* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
/* Upper bound for icount_time_shift: at most 1 << 10 ns per instruction. */
#define MAX_ICOUNT_SHIFT 10
 139
/*
 * Timing state for the emulated machine: host tick/clock offsets and the
 * instruction-counter (icount) bookkeeping, together with the locks that
 * guard them.
 */
typedef struct TimersState {
    /* Protected by BQL.  */
    /* Last value returned by cpu_get_ticks_locked(); keeps ticks monotonic. */
    int64_t cpu_ticks_prev;
    /* Offset added to the host tick counter. */
    int64_t cpu_ticks_offset;

    /* Protect fields that can be respectively read outside the
     * BQL, and written from multiple threads.
     */
    QemuSeqLock vm_clock_seqlock;
    QemuSpin vm_clock_lock;

    /* Set to 1 by cpu_enable_ticks() and to 0 by cpu_disable_ticks(). */
    int16_t cpu_ticks_enabled;

    /* Conversion factor from emulated instructions to virtual clock ticks.  */
    int16_t icount_time_shift;

    /* Compensate for varying guest execution speed.  */
    int64_t qemu_icount_bias;

    /* Start time of the pending clock warp, or -1 when none is pending. */
    int64_t vm_clock_warp_start;
    /* Offset added to get_clock() by cpu_get_clock_locked(). */
    int64_t cpu_clock_offset;

    /* Only written by TCG thread */
    int64_t qemu_icount;

    /* for adjusting icount */
    /* rt/vm timers are created in adaptive mode, the warp timer when sleep is on. */
    QEMUTimer *icount_rt_timer;
    QEMUTimer *icount_vm_timer;
    QEMUTimer *icount_warp_timer;
} TimersState;
 170
/* The single instance of the timing state used throughout this file. */
static TimersState timers_state;
/* Asserted to be false by the round-robin kick timer helpers below. */
bool mttcg_enabled;
 173
 174
 175 /* The current number of executed instructions is based on what we
 176  * originally budgeted minus the current state of the decrementing
 177  * icount counters in extra/u16.low.
 178  */
 179 static int64_t cpu_get_icount_executed(CPUState *cpu)
 180 {
 181     return (cpu->icount_budget -
 182             (cpu_neg(cpu)->icount_decr.u16.low + cpu->icount_extra));
 183 }
 184
 185 /*
 186  * Update the global shared timer_state.qemu_icount to take into
 187  * account executed instructions. This is done by the TCG vCPU
 188  * thread so the main-loop can see time has moved forward.
 189  */
 190 static void cpu_update_icount_locked(CPUState *cpu)
 191 {
 192     int64_t executed = cpu_get_icount_executed(cpu);
 193     cpu->icount_budget -= executed;
 194
 195     atomic_set_i64(&timers_state.qemu_icount,
 196                    timers_state.qemu_icount + executed);
 197 }
 198
 199 /*
 200  * Update the global shared timer_state.qemu_icount to take into
 201  * account executed instructions. This is done by the TCG vCPU
 202  * thread so the main-loop can see time has moved forward.
 203  */
 204 void cpu_update_icount(CPUState *cpu)
 205 {
 206     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 207                        &timers_state.vm_clock_lock);
 208     cpu_update_icount_locked(cpu);
 209     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 210                          &timers_state.vm_clock_lock);
 211 }
 212
 213 static int64_t cpu_get_icount_raw_locked(void)
 214 {
 215     CPUState *cpu = current_cpu;
 216
 217     if (cpu && cpu->running) {
 218         if (!cpu->can_do_io) {
 219             error_report("Bad icount read");
 220             exit(1);
 221         }
 222         /* Take into account what has run */
 223         cpu_update_icount_locked(cpu);
 224     }
 225     /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
 226     return atomic_read_i64(&timers_state.qemu_icount);
 227 }
 228
 229 static int64_t cpu_get_icount_locked(void)
 230 {
 231     int64_t icount = cpu_get_icount_raw_locked();
 232     return atomic_read_i64(&timers_state.qemu_icount_bias) +
 233         cpu_icount_to_ns(icount);
 234 }
 235
 236 int64_t cpu_get_icount_raw(void)
 237 {
 238     int64_t icount;
 239     unsigned start;
 240
 241     do {
 242         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 243         icount = cpu_get_icount_raw_locked();
 244     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 245
 246     return icount;
 247 }
 248
 249 /* Return the virtual CPU time, based on the instruction counter.  */
 250 int64_t cpu_get_icount(void)
 251 {
 252     int64_t icount;
 253     unsigned start;
 254
 255     do {
 256         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 257         icount = cpu_get_icount_locked();
 258     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 259
 260     return icount;
 261 }
 262
 263 int64_t cpu_icount_to_ns(int64_t icount)
 264 {
 265     return icount << atomic_read(&timers_state.icount_time_shift);
 266 }
 267
 268 static int64_t cpu_get_ticks_locked(void)
 269 {
 270     int64_t ticks = timers_state.cpu_ticks_offset;
 271     if (timers_state.cpu_ticks_enabled) {
 272         ticks += cpu_get_host_ticks();
 273     }
 274
 275     if (timers_state.cpu_ticks_prev > ticks) {
 276         /* Non increasing ticks may happen if the host uses software suspend.  */
 277         timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
 278         ticks = timers_state.cpu_ticks_prev;
 279     }
 280
 281     timers_state.cpu_ticks_prev = ticks;
 282     return ticks;
 283 }
 284
 285 /* return the time elapsed in VM between vm_start and vm_stop.  Unless
 286  * icount is active, cpu_get_ticks() uses units of the host CPU cycle
 287  * counter.
 288  */
 289 int64_t cpu_get_ticks(void)
 290 {
 291     int64_t ticks;
 292
 293     if (use_icount) {
 294         return cpu_get_icount();
 295     }
 296
 297     qemu_spin_lock(&timers_state.vm_clock_lock);
 298     ticks = cpu_get_ticks_locked();
 299     qemu_spin_unlock(&timers_state.vm_clock_lock);
 300     return ticks;
 301 }
 302
 303 static int64_t cpu_get_clock_locked(void)
 304 {
 305     int64_t time;
 306
 307     time = timers_state.cpu_clock_offset;
 308     if (timers_state.cpu_ticks_enabled) {
 309         time += get_clock();
 310     }
 311
 312     return time;
 313 }
 314
 315 /* Return the monotonic time elapsed in VM, i.e.,
 316  * the time between vm_start and vm_stop
 317  */
 318 int64_t cpu_get_clock(void)
 319 {
 320     int64_t ti;
 321     unsigned start;
 322
 323     do {
 324         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 325         ti = cpu_get_clock_locked();
 326     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 327
 328     return ti;
 329 }
 330
 331 /* enable cpu_get_ticks()
 332  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 333  */
 334 void cpu_enable_ticks(void)
 335 {
 336     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 337                        &timers_state.vm_clock_lock);
 338     if (!timers_state.cpu_ticks_enabled) {
 339         timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
 340         timers_state.cpu_clock_offset -= get_clock();
 341         timers_state.cpu_ticks_enabled = 1;
 342     }
 343     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 344                        &timers_state.vm_clock_lock);
 345 }
 346
 347 /* disable cpu_get_ticks() : the clock is stopped. You must not call
 348  * cpu_get_ticks() after that.
 349  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 350  */
 351 void cpu_disable_ticks(void)
 352 {
 353     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 354                        &timers_state.vm_clock_lock);
 355     if (timers_state.cpu_ticks_enabled) {
 356         timers_state.cpu_ticks_offset += cpu_get_host_ticks();
 357         timers_state.cpu_clock_offset = cpu_get_clock_locked();
 358         timers_state.cpu_ticks_enabled = 0;
 359     }
 360     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 361                          &timers_state.vm_clock_lock);
 362 }
 363
/* Correlation between real and virtual time is always going to be
   fairly approximate, so ignore small variation.
   When the guest is idle real and virtual time will be aligned in
   the IO wait loop.
   The tolerance used by icount_adjust() is a tenth of a second,
   expressed in nanoseconds.  */
#define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
 369
 370 static void icount_adjust(void)
 371 {
 372     int64_t cur_time;
 373     int64_t cur_icount;
 374     int64_t delta;
 375
 376     /* Protected by TimersState mutex.  */
 377     static int64_t last_delta;
 378
 379     /* If the VM is not running, then do nothing.  */
 380     if (!runstate_is_running()) {
 381         return;
 382     }
 383
 384     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 385                        &timers_state.vm_clock_lock);
 386     cur_time = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
 387                                    cpu_get_clock_locked());
 388     cur_icount = cpu_get_icount_locked();
 389
 390     delta = cur_icount - cur_time;
 391     /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
 392     if (delta > 0
 393         && last_delta + ICOUNT_WOBBLE < delta * 2
 394         && timers_state.icount_time_shift > 0) {
 395         /* The guest is getting too far ahead.  Slow time down.  */
 396         atomic_set(&timers_state.icount_time_shift,
 397                    timers_state.icount_time_shift - 1);
 398     }
 399     if (delta < 0
 400         && last_delta - ICOUNT_WOBBLE > delta * 2
 401         && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
 402         /* The guest is getting too far behind.  Speed time up.  */
 403         atomic_set(&timers_state.icount_time_shift,
 404                    timers_state.icount_time_shift + 1);
 405     }
 406     last_delta = delta;
 407     atomic_set_i64(&timers_state.qemu_icount_bias,
 408                    cur_icount - (timers_state.qemu_icount
 409                                  << timers_state.icount_time_shift));
 410     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 411                          &timers_state.vm_clock_lock);
 412 }
 413
 414 static void icount_adjust_rt(void *opaque)
 415 {
 416     timer_mod(timers_state.icount_rt_timer,
 417               qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 418     icount_adjust();
 419 }
 420
 421 static void icount_adjust_vm(void *opaque)
 422 {
 423     timer_mod(timers_state.icount_vm_timer,
 424                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 425                    NANOSECONDS_PER_SECOND / 10);
 426     icount_adjust();
 427 }
 428
 429 static int64_t qemu_icount_round(int64_t count)
 430 {
 431     int shift = atomic_read(&timers_state.icount_time_shift);
 432     return (count + (1 << shift) - 1) >> shift;
 433 }
 434
 435 static void icount_warp_rt(void)
 436 {
 437     unsigned seq;
 438     int64_t warp_start;
 439
 440     /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
 441      * changes from -1 to another value, so the race here is okay.
 442      */
 443     do {
 444         seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 445         warp_start = timers_state.vm_clock_warp_start;
 446     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
 447
 448     if (warp_start == -1) {
 449         return;
 450     }
 451
 452     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 453                        &timers_state.vm_clock_lock);
 454     if (runstate_is_running()) {
 455         int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
 456                                             cpu_get_clock_locked());
 457         int64_t warp_delta;
 458
 459         warp_delta = clock - timers_state.vm_clock_warp_start;
 460         if (use_icount == 2) {
 461             /*
 462              * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
 463              * far ahead of real time.
 464              */
 465             int64_t cur_icount = cpu_get_icount_locked();
 466             int64_t delta = clock - cur_icount;
 467             warp_delta = MIN(warp_delta, delta);
 468         }
 469         atomic_set_i64(&timers_state.qemu_icount_bias,
 470                        timers_state.qemu_icount_bias + warp_delta);
 471     }
 472     timers_state.vm_clock_warp_start = -1;
 473     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 474                        &timers_state.vm_clock_lock);
 475
 476     if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
 477         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 478     }
 479 }
 480
 481 static void icount_timer_cb(void *opaque)
 482 {
 483     /* No need for a checkpoint because the timer already synchronizes
 484      * with CHECKPOINT_CLOCK_VIRTUAL_RT.
 485      */
 486     icount_warp_rt();
 487 }
 488
 489 void qtest_clock_warp(int64_t dest)
 490 {
 491     int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 492     AioContext *aio_context;
 493     assert(qtest_enabled());
 494     aio_context = qemu_get_aio_context();
 495     while (clock < dest) {
 496         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
 497                                                       QEMU_TIMER_ATTR_ALL);
 498         int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
 499
 500         seqlock_write_lock(&timers_state.vm_clock_seqlock,
 501                            &timers_state.vm_clock_lock);
 502         atomic_set_i64(&timers_state.qemu_icount_bias,
 503                        timers_state.qemu_icount_bias + warp);
 504         seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 505                              &timers_state.vm_clock_lock);
 506
 507         qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
 508         timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
 509         clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 510     }
 511     qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 512 }
 513
 514 void qemu_start_warp_timer(void)
 515 {
 516     int64_t clock;
 517     int64_t deadline;
 518
 519     if (!use_icount) {
 520         return;
 521     }
 522
 523     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 524      * do not fire, so computing the deadline does not make sense.
 525      */
 526     if (!runstate_is_running()) {
 527         return;
 528     }
 529
 530     if (replay_mode != REPLAY_MODE_PLAY) {
 531         if (!all_cpu_threads_idle()) {
 532             return;
 533         }
 534
 535         if (qtest_enabled()) {
 536             /* When testing, qtest commands advance icount.  */
 537             return;
 538         }
 539
 540         replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
 541     } else {
 542         /* warp clock deterministically in record/replay mode */
 543         if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
 544             /* vCPU is sleeping and warp can't be started.
 545                It is probably a race condition: notification sent
 546                to vCPU was processed in advance and vCPU went to sleep.
 547                Therefore we have to wake it up for doing someting. */
 548             if (replay_has_checkpoint()) {
 549                 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 550             }
 551             return;
 552         }
 553     }
 554
 555     /* We want to use the earliest deadline from ALL vm_clocks */
 556     clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
 557     deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
 558                                           ~QEMU_TIMER_ATTR_EXTERNAL);
 559     if (deadline < 0) {
 560         static bool notified;
 561         if (!icount_sleep && !notified) {
 562             warn_report("icount sleep disabled and no active timers");
 563             notified = true;
 564         }
 565         return;
 566     }
 567
 568     if (deadline > 0) {
 569         /*
 570          * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
 571          * sleep.  Otherwise, the CPU might be waiting for a future timer
 572          * interrupt to wake it up, but the interrupt never comes because
 573          * the vCPU isn't running any insns and thus doesn't advance the
 574          * QEMU_CLOCK_VIRTUAL.
 575          */
 576         if (!icount_sleep) {
 577             /*
 578              * We never let VCPUs sleep in no sleep icount mode.
 579              * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
 580              * to the next QEMU_CLOCK_VIRTUAL event and notify it.
 581              * It is useful when we want a deterministic execution time,
 582              * isolated from host latencies.
 583              */
 584             seqlock_write_lock(&timers_state.vm_clock_seqlock,
 585                                &timers_state.vm_clock_lock);
 586             atomic_set_i64(&timers_state.qemu_icount_bias,
 587                            timers_state.qemu_icount_bias + deadline);
 588             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 589                                  &timers_state.vm_clock_lock);
 590             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 591         } else {
 592             /*
 593              * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
 594              * "real" time, (related to the time left until the next event) has
 595              * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
 596              * This avoids that the warps are visible externally; for example,
 597              * you will not be sending network packets continuously instead of
 598              * every 100ms.
 599              */
 600             seqlock_write_lock(&timers_state.vm_clock_seqlock,
 601                                &timers_state.vm_clock_lock);
 602             if (timers_state.vm_clock_warp_start == -1
 603                 || timers_state.vm_clock_warp_start > clock) {
 604                 timers_state.vm_clock_warp_start = clock;
 605             }
 606             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 607                                  &timers_state.vm_clock_lock);
 608             timer_mod_anticipate(timers_state.icount_warp_timer,
 609                                  clock + deadline);
 610         }
 611     } else if (deadline == 0) {
 612         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 613     }
 614 }
 615
 616 static void qemu_account_warp_timer(void)
 617 {
 618     if (!use_icount || !icount_sleep) {
 619         return;
 620     }
 621
 622     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 623      * do not fire, so computing the deadline does not make sense.
 624      */
 625     if (!runstate_is_running()) {
 626         return;
 627     }
 628
 629     /* warp clock deterministically in record/replay mode */
 630     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
 631         return;
 632     }
 633
 634     timer_del(timers_state.icount_warp_timer);
 635     icount_warp_rt();
 636 }
 637
 638 static bool icount_state_needed(void *opaque)
 639 {
 640     return use_icount;
 641 }
 642
 643 static bool warp_timer_state_needed(void *opaque)
 644 {
 645     TimersState *s = opaque;
 646     return s->icount_warp_timer != NULL;
 647 }
 648
 649 static bool adjust_timers_state_needed(void *opaque)
 650 {
 651     TimersState *s = opaque;
 652     return s->icount_rt_timer != NULL;
 653 }
 654
 655 static bool shift_state_needed(void *opaque)
 656 {
 657     return use_icount == 2;
 658 }
 659
 660 /*
 661  * Subsection for warp timer migration is optional, because may not be created
 662  */
 663 static const VMStateDescription icount_vmstate_warp_timer = {
 664     .name = "timer/icount/warp_timer",
 665     .version_id = 1,
 666     .minimum_version_id = 1,
 667     .needed = warp_timer_state_needed,
 668     .fields = (VMStateField[]) {
 669         VMSTATE_INT64(vm_clock_warp_start, TimersState),
 670         VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
 671         VMSTATE_END_OF_LIST()
 672     }
 673 };
 674
 675 static const VMStateDescription icount_vmstate_adjust_timers = {
 676     .name = "timer/icount/timers",
 677     .version_id = 1,
 678     .minimum_version_id = 1,
 679     .needed = adjust_timers_state_needed,
 680     .fields = (VMStateField[]) {
 681         VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
 682         VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
 683         VMSTATE_END_OF_LIST()
 684     }
 685 };
 686
 687 static const VMStateDescription icount_vmstate_shift = {
 688     .name = "timer/icount/shift",
 689     .version_id = 1,
 690     .minimum_version_id = 1,
 691     .needed = shift_state_needed,
 692     .fields = (VMStateField[]) {
 693         VMSTATE_INT16(icount_time_shift, TimersState),
 694         VMSTATE_END_OF_LIST()
 695     }
 696 };
 697
 698 /*
 699  * This is a subsection for icount migration.
 700  */
 701 static const VMStateDescription icount_vmstate_timers = {
 702     .name = "timer/icount",
 703     .version_id = 1,
 704     .minimum_version_id = 1,
 705     .needed = icount_state_needed,
 706     .fields = (VMStateField[]) {
 707         VMSTATE_INT64(qemu_icount_bias, TimersState),
 708         VMSTATE_INT64(qemu_icount, TimersState),
 709         VMSTATE_END_OF_LIST()
 710     },
 711     .subsections = (const VMStateDescription*[]) {
 712         &icount_vmstate_warp_timer,
 713         &icount_vmstate_adjust_timers,
 714         &icount_vmstate_shift,
 715         NULL
 716     }
 717 };
 718
 719 static const VMStateDescription vmstate_timers = {
 720     .name = "timer",
 721     .version_id = 2,
 722     .minimum_version_id = 1,
 723     .fields = (VMStateField[]) {
 724         VMSTATE_INT64(cpu_ticks_offset, TimersState),
 725         VMSTATE_UNUSED(8),
 726         VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
 727         VMSTATE_END_OF_LIST()
 728     },
 729     .subsections = (const VMStateDescription*[]) {
 730         &icount_vmstate_timers,
 731         NULL
 732     }
 733 };
 734
 735 void cpu_ticks_init(void)
 736 {
 737     seqlock_init(&timers_state.vm_clock_seqlock);
 738     qemu_spin_init(&timers_state.vm_clock_lock);
 739     vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
 740     cpu_throttle_init();
 741 }
 742
 743 void configure_icount(QemuOpts *opts, Error **errp)
 744 {
 745     const char *option = qemu_opt_get(opts, "shift");
 746     bool sleep = qemu_opt_get_bool(opts, "sleep", true);
 747     bool align = qemu_opt_get_bool(opts, "align", false);
 748     long time_shift = -1;
 749
 750     if (!option) {
 751         if (qemu_opt_get(opts, "align") != NULL) {
 752             error_setg(errp, "Please specify shift option when using align");
 753         }
 754         return;
 755     }
 756
 757     if (align && !sleep) {
 758         error_setg(errp, "align=on and sleep=off are incompatible");
 759         return;
 760     }
 761
 762     if (strcmp(option, "auto") != 0) {
 763         if (qemu_strtol(option, NULL, 0, &time_shift) < 0
 764             || time_shift < 0 || time_shift > MAX_ICOUNT_SHIFT) {
 765             error_setg(errp, "icount: Invalid shift value");
 766             return;
 767         }
 768     } else if (icount_align_option) {
 769         error_setg(errp, "shift=auto and align=on are incompatible");
 770         return;
 771     } else if (!icount_sleep) {
 772         error_setg(errp, "shift=auto and sleep=off are incompatible");
 773         return;
 774     }
 775
 776     icount_sleep = sleep;
 777     if (icount_sleep) {
 778         timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 779                                          icount_timer_cb, NULL);
 780     }
 781
 782     icount_align_option = align;
 783
 784     if (time_shift >= 0) {
 785         timers_state.icount_time_shift = time_shift;
 786         use_icount = 1;
 787         return;
 788     }
 789
 790     use_icount = 2;
 791
 792     /* 125MIPS seems a reasonable initial guess at the guest speed.
 793        It will be corrected fairly quickly anyway.  */
 794     timers_state.icount_time_shift = 3;
 795
 796     /* Have both realtime and virtual time triggers for speed adjustment.
 797        The realtime trigger catches emulated time passing too slowly,
 798        the virtual time trigger catches emulated time passing too fast.
 799        Realtime triggers occur even when idle, so use them less frequently
 800        than VM triggers.  */
 801     timers_state.vm_clock_warp_start = -1;
 802     timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
 803                                    icount_adjust_rt, NULL);
 804     timer_mod(timers_state.icount_rt_timer,
 805                    qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 806     timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 807                                         icount_adjust_vm, NULL);
 808     timer_mod(timers_state.icount_vm_timer,
 809                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 810                    NANOSECONDS_PER_SECOND / 10);
 811 }
 812
 813 /***********************************************************/
 814 /* TCG vCPU kick timer
 815  *
 * The kick timer is responsible for moving single threaded vCPU
 * emulation on to the next vCPU. If more than one vCPU is running a
 * timer event will force a cpu->exit so the next vCPU can get
 * scheduled.
 820  *
 821  * The timer is removed if all vCPUs are idle and restarted again once
 822  * idleness is complete.
 823  */
 824
/* Created on demand by start_tcg_kick_timer(); NULL until then. */
static QEMUTimer *tcg_kick_vcpu_timer;
/* vCPU that qemu_cpu_kick_rr_next_cpu() kicks; read with atomic_mb_read(). */
static CPUState *tcg_current_rr_cpu;

/* Interval between kicks: a tenth of a second of QEMU_CLOCK_VIRTUAL time. */
#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
 829
 830 static inline int64_t qemu_tcg_next_kick(void)
 831 {
 832     return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
 833 }
 834
 835 /* Kick the currently round-robin scheduled vCPU to next */
 836 static void qemu_cpu_kick_rr_next_cpu(void)
 837 {
 838     CPUState *cpu;
 839     do {
 840         cpu = atomic_mb_read(&tcg_current_rr_cpu);
 841         if (cpu) {
 842             cpu_exit(cpu);
 843         }
 844     } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
 845 }
 846
 847 /* Kick all RR vCPUs */
 848 static void qemu_cpu_kick_rr_cpus(void)
 849 {
 850     CPUState *cpu;
 851
 852     CPU_FOREACH(cpu) {
 853         cpu_exit(cpu);
 854     };
 855 }
 856
 857 static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
 858 {
 859 }
 860
 861 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
 862 {
 863     if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
 864         qemu_notify_event();
 865         return;
 866     }
 867
 868     if (qemu_in_vcpu_thread()) {
 869         /* A CPU is currently running; kick it back out to the
 870          * tcg_cpu_exec() loop so it will recalculate its
 871          * icount deadline immediately.
 872          */
 873         qemu_cpu_kick(current_cpu);
 874     } else if (first_cpu) {
 875         /* qemu_cpu_kick is not enough to kick a halted CPU out of
 876          * qemu_tcg_wait_io_event.  async_run_on_cpu, instead,
 877          * causes cpu_thread_is_idle to return false.  This way,
 878          * handle_icount_deadline can run.
 879          * If we have no CPUs at all for some reason, we don't
 880          * need to do anything.
 881          */
 882         async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
 883     }
 884 }
 885
 886 static void kick_tcg_thread(void *opaque)
 887 {
 888     timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 889     qemu_cpu_kick_rr_next_cpu();
 890 }
 891
 892 static void start_tcg_kick_timer(void)
 893 {
 894     assert(!mttcg_enabled);
 895     if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
 896         tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 897                                            kick_tcg_thread, NULL);
 898     }
 899     if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
 900         timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 901     }
 902 }
 903
 904 static void stop_tcg_kick_timer(void)
 905 {
 906     assert(!mttcg_enabled);
 907     if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
 908         timer_del(tcg_kick_vcpu_timer);
 909     }
 910 }
 911
 912 /***********************************************************/
 913 void hw_error(const char *fmt, ...)
 914 {
 915     va_list ap;
 916     CPUState *cpu;
 917
 918     va_start(ap, fmt);
 919     fprintf(stderr, "qemu: hardware error: ");
 920     vfprintf(stderr, fmt, ap);
 921     fprintf(stderr, "\n");
 922     CPU_FOREACH(cpu) {
 923         fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
 924         cpu_dump_state(cpu, stderr, CPU_DUMP_FPU);
 925     }
 926     va_end(ap);
 927     abort();
 928 }
 929
 930 void cpu_synchronize_all_states(void)
 931 {
 932     CPUState *cpu;
 933
 934     CPU_FOREACH(cpu) {
 935         cpu_synchronize_state(cpu);
 936     }
 937 }
 938
 939 void cpu_synchronize_all_post_reset(void)
 940 {
 941     CPUState *cpu;
 942
 943     CPU_FOREACH(cpu) {
 944         cpu_synchronize_post_reset(cpu);
 945     }
 946 }
 947
 948 void cpu_synchronize_all_post_init(void)
 949 {
 950     CPUState *cpu;
 951
 952     CPU_FOREACH(cpu) {
 953         cpu_synchronize_post_init(cpu);
 954     }
 955 }
 956
 957 void cpu_synchronize_all_pre_loadvm(void)
 958 {
 959     CPUState *cpu;
 960
 961     CPU_FOREACH(cpu) {
 962         cpu_synchronize_pre_loadvm(cpu);
 963     }
 964 }
 965
 966 static int do_vm_stop(RunState state, bool send_stop)
 967 {
 968     int ret = 0;
 969
 970     if (runstate_is_running()) {
 971         runstate_set(state);
 972         cpu_disable_ticks();
 973         pause_all_vcpus();
 974         vm_state_notify(0, state);
 975         if (send_stop) {
 976             qapi_event_send_stop();
 977         }
 978     }
 979
 980     bdrv_drain_all();
 981     ret = bdrv_flush_all();
 982
 983     return ret;
 984 }
 985
 986 /* Special vm_stop() variant for terminating the process.  Historically clients
 987  * did not expect a QMP STOP event and so we need to retain compatibility.
 988  */
 989 int vm_shutdown(void)
 990 {
 991     return do_vm_stop(RUN_STATE_SHUTDOWN, false);
 992 }
 993
 994 static bool cpu_can_run(CPUState *cpu)
 995 {
 996     if (cpu->stop) {
 997         return false;
 998     }
 999     if (cpu_is_stopped(cpu)) {
1000         return false;
1001     }
1002     return true;
1003 }
1004
/*
 * Handle an EXCP_DEBUG exit from a vCPU execution loop: tell the gdbstub
 * which CPU stopped, raise a system debug request and mark the CPU as
 * stopped.
 */
static void cpu_handle_guest_debug(CPUState *cpu)
{
    gdb_set_stop_cpu(cpu);
    qemu_system_debug_request();
    cpu->stopped = true;
}
1011
1012 #ifdef CONFIG_LINUX
1013 static void sigbus_reraise(void)
1014 {
1015     sigset_t set;
1016     struct sigaction action;
1017
1018     memset(&action, 0, sizeof(action));
1019     action.sa_handler = SIG_DFL;
1020     if (!sigaction(SIGBUS, &action, NULL)) {
1021         raise(SIGBUS);
1022         sigemptyset(&set);
1023         sigaddset(&set, SIGBUS);
1024         pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1025     }
1026     perror("Failed to re-raise SIGBUS!\n");
1027     abort();
1028 }
1029
1030 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1031 {
1032     if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1033         sigbus_reraise();
1034     }
1035
1036     if (current_cpu) {
1037         /* Called asynchronously in VCPU thread.  */
1038         if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1039             sigbus_reraise();
1040         }
1041     } else {
1042         /* Called synchronously (via signalfd) in main thread.  */
1043         if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1044             sigbus_reraise();
1045         }
1046     }
1047 }
1048
1049 static void qemu_init_sigbus(void)
1050 {
1051     struct sigaction action;
1052
1053     memset(&action, 0, sizeof(action));
1054     action.sa_flags = SA_SIGINFO;
1055     action.sa_sigaction = sigbus_handler;
1056     sigaction(SIGBUS, &action, NULL);
1057
1058     prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1059 }
1060 #else /* !CONFIG_LINUX */
/* Without CONFIG_LINUX no SIGBUS handler is installed; nothing to do. */
static void qemu_init_sigbus(void)
{
}
1064 #endif /* !CONFIG_LINUX */
1065
/* Filled in by qemu_init_cpu_loop() with the thread that calls it */
static QemuThread io_thread;

/* cpu creation: signalled by vCPU threads when cpu->created changes */
static QemuCond qemu_cpu_cond;
/* system init; also broadcast by qemu_cpu_stop() for pause_all_vcpus() */
static QemuCond qemu_pause_cond;
1072
1073 void qemu_init_cpu_loop(void)
1074 {
1075     qemu_init_sigbus();
1076     qemu_cond_init(&qemu_cpu_cond);
1077     qemu_cond_init(&qemu_pause_cond);
1078     qemu_mutex_init(&qemu_global_mutex);
1079
1080     qemu_thread_get_self(&io_thread);
1081 }
1082
/*
 * Run @func(@cpu, @data) via do_run_on_cpu(), passing the global mutex
 * (qemu_global_mutex) as the lock argument.
 */
void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
{
    do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
}
1087
1088 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1089 {
1090     if (kvm_destroy_vcpu(cpu) < 0) {
1091         error_report("kvm_destroy_vcpu failed");
1092         exit(EXIT_FAILURE);
1093     }
1094 }
1095
/* TCG counterpart of qemu_kvm_destroy_vcpu(); currently has nothing to do. */
static void qemu_tcg_destroy_vcpu(CPUState *cpu)
{
}
1099
/*
 * Mark @cpu as stopped and wake everyone waiting on qemu_pause_cond.
 *
 * Must be called from @cpu's own thread (asserted).  Clears the pending
 * stop request, sets cpu->stopped, and, when @exit is true, also calls
 * cpu_exit() on the CPU.
 */
static void qemu_cpu_stop(CPUState *cpu, bool exit)
{
    g_assert(qemu_cpu_is_self(cpu));
    cpu->stop = false;
    cpu->stopped = true;
    if (exit) {
        cpu_exit(cpu);
    }
    /* pause_all_vcpus() waits on this condition for all CPUs to stop */
    qemu_cond_broadcast(&qemu_pause_cond);
}
1110
/*
 * Common tail of the per-accelerator "wait for I/O event" paths: clear
 * the kicked flag, honour a pending stop request, then run any work
 * queued for @cpu.
 */
static void qemu_wait_io_event_common(CPUState *cpu)
{
    /* Allow qemu_cpu_kick_thread() to signal this thread again */
    atomic_mb_set(&cpu->thread_kicked, false);
    if (cpu->stop) {
        qemu_cpu_stop(cpu, false);
    }
    process_queued_cpu_work(cpu);
}
1119
1120 static void qemu_tcg_rr_wait_io_event(void)
1121 {
1122     CPUState *cpu;
1123
1124     while (all_cpu_threads_idle()) {
1125         stop_tcg_kick_timer();
1126         qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1127     }
1128
1129     start_tcg_kick_timer();
1130
1131     CPU_FOREACH(cpu) {
1132         qemu_wait_io_event_common(cpu);
1133     }
1134 }
1135
1136 static void qemu_wait_io_event(CPUState *cpu)
1137 {
1138     bool slept = false;
1139
1140     while (cpu_thread_is_idle(cpu)) {
1141         if (!slept) {
1142             slept = true;
1143             qemu_plugin_vcpu_idle_cb(cpu);
1144         }
1145         qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1146     }
1147     if (slept) {
1148         qemu_plugin_vcpu_resume_cb(cpu);
1149     }
1150
1151 #ifdef _WIN32
1152     /* Eat dummy APC queued by qemu_cpu_kick_thread.  */
1153     if (!tcg_enabled()) {
1154         SleepEx(0, TRUE);
1155     }
1156 #endif
1157     qemu_wait_io_event_common(cpu);
1158 }
1159
1160 static void *qemu_kvm_cpu_thread_fn(void *arg)
1161 {
1162     CPUState *cpu = arg;
1163     int r;
1164
1165     rcu_register_thread();
1166
1167     qemu_mutex_lock_iothread();
1168     qemu_thread_get_self(cpu->thread);
1169     cpu->thread_id = qemu_get_thread_id();
1170     cpu->can_do_io = 1;
1171     current_cpu = cpu;
1172
1173     r = kvm_init_vcpu(cpu);
1174     if (r < 0) {
1175         error_report("kvm_init_vcpu failed: %s", strerror(-r));
1176         exit(1);
1177     }
1178
1179     kvm_init_cpu_signals(cpu);
1180
1181     /* signal CPU creation */
1182     cpu->created = true;
1183     qemu_cond_signal(&qemu_cpu_cond);
1184     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1185
1186     do {
1187         if (cpu_can_run(cpu)) {
1188             r = kvm_cpu_exec(cpu);
1189             if (r == EXCP_DEBUG) {
1190                 cpu_handle_guest_debug(cpu);
1191             }
1192         }
1193         qemu_wait_io_event(cpu);
1194     } while (!cpu->unplug || cpu_can_run(cpu));
1195
1196     qemu_kvm_destroy_vcpu(cpu);
1197     cpu->created = false;
1198     qemu_cond_signal(&qemu_cpu_cond);
1199     qemu_mutex_unlock_iothread();
1200     rcu_unregister_thread();
1201     return NULL;
1202 }
1203
1204 static void *qemu_dummy_cpu_thread_fn(void *arg)
1205 {
1206 #ifdef _WIN32
1207     error_report("qtest is not supported under Windows");
1208     exit(1);
1209 #else
1210     CPUState *cpu = arg;
1211     sigset_t waitset;
1212     int r;
1213
1214     rcu_register_thread();
1215
1216     qemu_mutex_lock_iothread();
1217     qemu_thread_get_self(cpu->thread);
1218     cpu->thread_id = qemu_get_thread_id();
1219     cpu->can_do_io = 1;
1220     current_cpu = cpu;
1221
1222     sigemptyset(&waitset);
1223     sigaddset(&waitset, SIG_IPI);
1224
1225     /* signal CPU creation */
1226     cpu->created = true;
1227     qemu_cond_signal(&qemu_cpu_cond);
1228     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1229
1230     do {
1231         qemu_mutex_unlock_iothread();
1232         do {
1233             int sig;
1234             r = sigwait(&waitset, &sig);
1235         } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1236         if (r == -1) {
1237             perror("sigwait");
1238             exit(1);
1239         }
1240         qemu_mutex_lock_iothread();
1241         qemu_wait_io_event(cpu);
1242     } while (!cpu->unplug);
1243
1244     qemu_mutex_unlock_iothread();
1245     rcu_unregister_thread();
1246     return NULL;
1247 #endif
1248 }
1249
1250 static int64_t tcg_get_icount_limit(void)
1251 {
1252     int64_t deadline;
1253
1254     if (replay_mode != REPLAY_MODE_PLAY) {
1255         /*
1256          * Include all the timers, because they may need an attention.
1257          * Too long CPU execution may create unnecessary delay in UI.
1258          */
1259         deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1260                                               QEMU_TIMER_ATTR_ALL);
1261         /* Check realtime timers, because they help with input processing */
1262         deadline = qemu_soonest_timeout(deadline,
1263                 qemu_clock_deadline_ns_all(QEMU_CLOCK_REALTIME,
1264                                            QEMU_TIMER_ATTR_ALL));
1265
1266         /* Maintain prior (possibly buggy) behaviour where if no deadline
1267          * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1268          * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1269          * nanoseconds.
1270          */
1271         if ((deadline < 0) || (deadline > INT32_MAX)) {
1272             deadline = INT32_MAX;
1273         }
1274
1275         return qemu_icount_round(deadline);
1276     } else {
1277         return replay_get_instructions();
1278     }
1279 }
1280
/*
 * Notify QEMU_CLOCK_VIRTUAL users and then run the expired virtual-clock
 * timers from the calling thread.
 */
static void notify_aio_contexts(void)
{
    /* Wake up other AioContexts.  */
    qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
    qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
}
1287
1288 static void handle_icount_deadline(void)
1289 {
1290     assert(qemu_in_vcpu_thread());
1291     if (use_icount) {
1292         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1293                                                       QEMU_TIMER_ATTR_ALL);
1294
1295         if (deadline == 0) {
1296             notify_aio_contexts();
1297         }
1298     }
1299 }
1300
1301 static void prepare_icount_for_run(CPUState *cpu)
1302 {
1303     if (use_icount) {
1304         int insns_left;
1305
1306         /* These should always be cleared by process_icount_data after
1307          * each vCPU execution. However u16.high can be raised
1308          * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1309          */
1310         g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
1311         g_assert(cpu->icount_extra == 0);
1312
1313         cpu->icount_budget = tcg_get_icount_limit();
1314         insns_left = MIN(0xffff, cpu->icount_budget);
1315         cpu_neg(cpu)->icount_decr.u16.low = insns_left;
1316         cpu->icount_extra = cpu->icount_budget - insns_left;
1317
1318         replay_mutex_lock();
1319
1320         if (cpu->icount_budget == 0 && replay_has_checkpoint()) {
1321             notify_aio_contexts();
1322         }
1323     }
1324 }
1325
1326 static void process_icount_data(CPUState *cpu)
1327 {
1328     if (use_icount) {
1329         /* Account for executed instructions */
1330         cpu_update_icount(cpu);
1331
1332         /* Reset the counters */
1333         cpu_neg(cpu)->icount_decr.u16.low = 0;
1334         cpu->icount_extra = 0;
1335         cpu->icount_budget = 0;
1336
1337         replay_account_executed_instructions();
1338
1339         replay_mutex_unlock();
1340     }
1341 }
1342
1343
1344 static int tcg_cpu_exec(CPUState *cpu)
1345 {
1346     int ret;
1347 #ifdef CONFIG_PROFILER
1348     int64_t ti;
1349 #endif
1350
1351     assert(tcg_enabled());
1352 #ifdef CONFIG_PROFILER
1353     ti = profile_getclock();
1354 #endif
1355     cpu_exec_start(cpu);
1356     ret = cpu_exec(cpu);
1357     cpu_exec_end(cpu);
1358 #ifdef CONFIG_PROFILER
1359     atomic_set(&tcg_ctx->prof.cpu_exec_time,
1360                tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
1361 #endif
1362     return ret;
1363 }
1364
1365 /* Destroy any remaining vCPUs which have been unplugged and have
1366  * finished running
1367  */
1368 static void deal_with_unplugged_cpus(void)
1369 {
1370     CPUState *cpu;
1371
1372     CPU_FOREACH(cpu) {
1373         if (cpu->unplug && !cpu_can_run(cpu)) {
1374             qemu_tcg_destroy_vcpu(cpu);
1375             cpu->created = false;
1376             qemu_cond_signal(&qemu_cpu_cond);
1377             break;
1378         }
1379     }
1380 }
1381
/* Single-threaded TCG
 *
 * In the single-threaded case each vCPU is simulated in turn. If
 * there is more than a single vCPU we create a simple timer to kick
 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
 * This is done explicitly rather than relying on side-effects
 * elsewhere.
 *
 * @arg is the CPUState the thread was created for; after start-up the
 * local 'cpu' pointer is reused as the round-robin cursor over all CPUs.
 */

static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    assert(tcg_enabled());
    rcu_register_thread();
    tcg_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->can_do_io = 1;
    /* signal CPU creation */
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    /* wait for initial kick-off after machine start */
    while (first_cpu->stopped) {
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);

        /* process any pending work */
        CPU_FOREACH(cpu) {
            current_cpu = cpu;
            qemu_wait_io_event_common(cpu);
        }
    }

    start_tcg_kick_timer();

    cpu = first_cpu;

    /* process any pending work */
    cpu->exit_request = 1;

    while (1) {
        /*
         * The iothread lock is dropped before the replay mutex is taken
         * and re-acquired afterwards.
         */
        qemu_mutex_unlock_iothread();
        replay_mutex_lock();
        qemu_mutex_lock_iothread();
        /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
        qemu_account_warp_timer();

        /* Run the timers here.  This is much more efficient than
         * waking up the I/O thread and waiting for completion.
         */
        handle_icount_deadline();

        replay_mutex_unlock();

        /* Wrap around once the cursor has run off the end of the list */
        if (!cpu) {
            cpu = first_cpu;
        }

        /*
         * Run CPUs in turn until one has queued work or an exit request,
         * or a break below hands control back to the outer loop.
         */
        while (cpu && cpu_work_list_empty(cpu) && !cpu->exit_request) {

            atomic_mb_set(&tcg_current_rr_cpu, cpu);
            current_cpu = cpu;

            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);

            if (cpu_can_run(cpu)) {
                int r;

                /* Guest code executes without the iothread lock held */
                qemu_mutex_unlock_iothread();
                prepare_icount_for_run(cpu);

                r = tcg_cpu_exec(cpu);

                process_icount_data(cpu);
                qemu_mutex_lock_iothread();

                if (r == EXCP_DEBUG) {
                    cpu_handle_guest_debug(cpu);
                    break;
                } else if (r == EXCP_ATOMIC) {
                    qemu_mutex_unlock_iothread();
                    cpu_exec_step_atomic(cpu);
                    qemu_mutex_lock_iothread();
                    break;
                }
            } else if (cpu->stop) {
                /* Skip over a CPU that is being unplugged */
                if (cpu->unplug) {
                    cpu = CPU_NEXT(cpu);
                }
                break;
            }

            cpu = CPU_NEXT(cpu);
        } /* while (cpu && cpu_work_list_empty(cpu) && !cpu->exit_request) */

        /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
        atomic_set(&tcg_current_rr_cpu, NULL);

        if (cpu && cpu->exit_request) {
            atomic_mb_set(&cpu->exit_request, 0);
        }

        if (use_icount && all_cpu_threads_idle()) {
            /*
             * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
             * in the main_loop, wake it up in order to start the warp timer.
             */
            qemu_notify_event();
        }

        qemu_tcg_rr_wait_io_event();
        deal_with_unplugged_cpus();
    }

    /* Not reached: nothing breaks out of the while (1) loop above */
    rcu_unregister_thread();
    return NULL;
}
1504
1505 static void *qemu_hax_cpu_thread_fn(void *arg)
1506 {
1507     CPUState *cpu = arg;
1508     int r;
1509
1510     rcu_register_thread();
1511     qemu_mutex_lock_iothread();
1512     qemu_thread_get_self(cpu->thread);
1513
1514     cpu->thread_id = qemu_get_thread_id();
1515     cpu->created = true;
1516     current_cpu = cpu;
1517
1518     hax_init_vcpu(cpu);
1519     qemu_cond_signal(&qemu_cpu_cond);
1520     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1521
1522     do {
1523         if (cpu_can_run(cpu)) {
1524             r = hax_smp_cpu_exec(cpu);
1525             if (r == EXCP_DEBUG) {
1526                 cpu_handle_guest_debug(cpu);
1527             }
1528         }
1529
1530         qemu_wait_io_event(cpu);
1531     } while (!cpu->unplug || cpu_can_run(cpu));
1532     rcu_unregister_thread();
1533     return NULL;
1534 }
1535
1536 /* The HVF-specific vCPU thread function. This one should only run when the host
1537  * CPU supports the VMX "unrestricted guest" feature. */
1538 static void *qemu_hvf_cpu_thread_fn(void *arg)
1539 {
1540     CPUState *cpu = arg;
1541
1542     int r;
1543
1544     assert(hvf_enabled());
1545
1546     rcu_register_thread();
1547
1548     qemu_mutex_lock_iothread();
1549     qemu_thread_get_self(cpu->thread);
1550
1551     cpu->thread_id = qemu_get_thread_id();
1552     cpu->can_do_io = 1;
1553     current_cpu = cpu;
1554
1555     hvf_init_vcpu(cpu);
1556
1557     /* signal CPU creation */
1558     cpu->created = true;
1559     qemu_cond_signal(&qemu_cpu_cond);
1560     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1561
1562     do {
1563         if (cpu_can_run(cpu)) {
1564             r = hvf_vcpu_exec(cpu);
1565             if (r == EXCP_DEBUG) {
1566                 cpu_handle_guest_debug(cpu);
1567             }
1568         }
1569         qemu_wait_io_event(cpu);
1570     } while (!cpu->unplug || cpu_can_run(cpu));
1571
1572     hvf_vcpu_destroy(cpu);
1573     cpu->created = false;
1574     qemu_cond_signal(&qemu_cpu_cond);
1575     qemu_mutex_unlock_iothread();
1576     rcu_unregister_thread();
1577     return NULL;
1578 }
1579
1580 static void *qemu_whpx_cpu_thread_fn(void *arg)
1581 {
1582     CPUState *cpu = arg;
1583     int r;
1584
1585     rcu_register_thread();
1586
1587     qemu_mutex_lock_iothread();
1588     qemu_thread_get_self(cpu->thread);
1589     cpu->thread_id = qemu_get_thread_id();
1590     current_cpu = cpu;
1591
1592     r = whpx_init_vcpu(cpu);
1593     if (r < 0) {
1594         fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
1595         exit(1);
1596     }
1597
1598     /* signal CPU creation */
1599     cpu->created = true;
1600     qemu_cond_signal(&qemu_cpu_cond);
1601     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1602
1603     do {
1604         if (cpu_can_run(cpu)) {
1605             r = whpx_vcpu_exec(cpu);
1606             if (r == EXCP_DEBUG) {
1607                 cpu_handle_guest_debug(cpu);
1608             }
1609         }
1610         while (cpu_thread_is_idle(cpu)) {
1611             qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1612         }
1613         qemu_wait_io_event_common(cpu);
1614     } while (!cpu->unplug || cpu_can_run(cpu));
1615
1616     whpx_destroy_vcpu(cpu);
1617     cpu->created = false;
1618     qemu_cond_signal(&qemu_cpu_cond);
1619     qemu_mutex_unlock_iothread();
1620     rcu_unregister_thread();
1621     return NULL;
1622 }
1623
1624 #ifdef _WIN32
/*
 * Empty APC routine.  qemu_cpu_kick_thread() queues it with
 * QueueUserAPC() on the vCPU thread; the function itself does nothing.
 */
static void CALLBACK dummy_apc_func(ULONG_PTR unused)
{
}
1628 #endif
1629
1630 /* Multi-threaded TCG
1631  *
1632  * In the multi-threaded case each vCPU has its own thread. The TLS
1633  * variable current_cpu can be used deep in the code to find the
1634  * current CPUState for a given thread.
1635  */
1636
1637 static void *qemu_tcg_cpu_thread_fn(void *arg)
1638 {
1639     CPUState *cpu = arg;
1640
1641     assert(tcg_enabled());
1642     g_assert(!use_icount);
1643
1644     rcu_register_thread();
1645     tcg_register_thread();
1646
1647     qemu_mutex_lock_iothread();
1648     qemu_thread_get_self(cpu->thread);
1649
1650     cpu->thread_id = qemu_get_thread_id();
1651     cpu->created = true;
1652     cpu->can_do_io = 1;
1653     current_cpu = cpu;
1654     qemu_cond_signal(&qemu_cpu_cond);
1655     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1656
1657     /* process any pending work */
1658     cpu->exit_request = 1;
1659
1660     do {
1661         if (cpu_can_run(cpu)) {
1662             int r;
1663             qemu_mutex_unlock_iothread();
1664             r = tcg_cpu_exec(cpu);
1665             qemu_mutex_lock_iothread();
1666             switch (r) {
1667             case EXCP_DEBUG:
1668                 cpu_handle_guest_debug(cpu);
1669                 break;
1670             case EXCP_HALTED:
1671                 /* during start-up the vCPU is reset and the thread is
1672                  * kicked several times. If we don't ensure we go back
1673                  * to sleep in the halted state we won't cleanly
1674                  * start-up when the vCPU is enabled.
1675                  *
1676                  * cpu->halted should ensure we sleep in wait_io_event
1677                  */
1678                 g_assert(cpu->halted);
1679                 break;
1680             case EXCP_ATOMIC:
1681                 qemu_mutex_unlock_iothread();
1682                 cpu_exec_step_atomic(cpu);
1683                 qemu_mutex_lock_iothread();
1684             default:
1685                 /* Ignore everything else? */
1686                 break;
1687             }
1688         }
1689
1690         atomic_mb_set(&cpu->exit_request, 0);
1691         qemu_wait_io_event(cpu);
1692     } while (!cpu->unplug || cpu_can_run(cpu));
1693
1694     qemu_tcg_destroy_vcpu(cpu);
1695     cpu->created = false;
1696     qemu_cond_signal(&qemu_cpu_cond);
1697     qemu_mutex_unlock_iothread();
1698     rcu_unregister_thread();
1699     return NULL;
1700 }
1701
1702 static void qemu_cpu_kick_thread(CPUState *cpu)
1703 {
1704 #ifndef _WIN32
1705     int err;
1706
1707     if (cpu->thread_kicked) {
1708         return;
1709     }
1710     cpu->thread_kicked = true;
1711     err = pthread_kill(cpu->thread->thread, SIG_IPI);
1712     if (err && err != ESRCH) {
1713         fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1714         exit(1);
1715     }
1716 #else /* _WIN32 */
1717     if (!qemu_cpu_is_self(cpu)) {
1718         if (whpx_enabled()) {
1719             whpx_vcpu_kick(cpu);
1720         } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1721             fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1722                     __func__, GetLastError());
1723             exit(1);
1724         }
1725     }
1726 #endif
1727 }
1728
1729 void qemu_cpu_kick(CPUState *cpu)
1730 {
1731     qemu_cond_broadcast(cpu->halt_cond);
1732     if (tcg_enabled()) {
1733         if (qemu_tcg_mttcg_enabled()) {
1734             cpu_exit(cpu);
1735         } else {
1736             qemu_cpu_kick_rr_cpus();
1737         }
1738     } else {
1739         if (hax_enabled()) {
1740             /*
1741              * FIXME: race condition with the exit_request check in
1742              * hax_vcpu_hax_exec
1743              */
1744             cpu->exit_request = 1;
1745         }
1746         qemu_cpu_kick_thread(cpu);
1747     }
1748 }
1749
/*
 * Kick the thread of the CPU recorded in current_cpu.  current_cpu must
 * be set (asserted).
 */
void qemu_cpu_kick_self(void)
{
    assert(current_cpu);
    qemu_cpu_kick_thread(current_cpu);
}
1755
/* Return true if the calling thread is the thread recorded in cpu->thread. */
bool qemu_cpu_is_self(CPUState *cpu)
{
    return qemu_thread_is_self(cpu->thread);
}
1760
1761 bool qemu_in_vcpu_thread(void)
1762 {
1763     return current_cpu && qemu_cpu_is_self(current_cpu);
1764 }
1765
/*
 * Per-thread flag: true while this thread holds the iothread lock (BQL).
 * Static storage starts out zeroed, i.e. false.
 */
static __thread bool iothread_locked;

/* Return whether the calling thread currently holds the iothread lock. */
bool qemu_mutex_iothread_locked(void)
{
    return iothread_locked;
}
1772
/*
 * The BQL is taken from so many places that it is worth profiling the
 * callers directly, instead of funneling them all through a single function.
 *
 * @file and @line identify the call site and are forwarded to the lock
 * function currently installed in qemu_bql_mutex_lock_func.  The calling
 * thread must not already hold the lock (asserted).
 */
void qemu_mutex_lock_iothread_impl(const char *file, int line)
{
    QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);

    g_assert(!qemu_mutex_iothread_locked());
    bql_lock(&qemu_global_mutex, file, line);
    /* Record ownership only once the lock is actually held */
    iothread_locked = true;
}
1785
/*
 * Release the iothread lock.  The calling thread must hold it (asserted).
 * The per-thread flag is cleared before the mutex is released.
 */
void qemu_mutex_unlock_iothread(void)
{
    g_assert(qemu_mutex_iothread_locked());
    iothread_locked = false;
    qemu_mutex_unlock(&qemu_global_mutex);
}
1792
/* Wait on @cond using the global mutex (qemu_global_mutex). */
void qemu_cond_wait_iothread(QemuCond *cond)
{
    qemu_cond_wait(cond, &qemu_global_mutex);
}
1797
/*
 * Wait on @cond using the global mutex (qemu_global_mutex), passing @ms
 * to qemu_cond_timedwait() as the timeout (presumably milliseconds, going
 * by the parameter name).
 */
void qemu_cond_timedwait_iothread(QemuCond *cond, int ms)
{
    qemu_cond_timedwait(cond, &qemu_global_mutex, ms);
}
1802
1803 static bool all_vcpus_paused(void)
1804 {
1805     CPUState *cpu;
1806
1807     CPU_FOREACH(cpu) {
1808         if (!cpu->stopped) {
1809             return false;
1810         }
1811     }
1812
1813     return true;
1814 }
1815
/*
 * Stop every vCPU and wait until all of them report 'stopped'.
 *
 * The virtual clock is disabled first.  The calling CPU, if any, is
 * stopped directly; every other CPU gets a stop request and a kick.
 * The function waits on qemu_pause_cond with the global mutex, so it is
 * presumably meant to be called with the iothread lock held.
 */
void pause_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
    CPU_FOREACH(cpu) {
        if (qemu_cpu_is_self(cpu)) {
            qemu_cpu_stop(cpu, true);
        } else {
            cpu->stop = true;
            qemu_cpu_kick(cpu);
        }
    }

    /* We need to drop the replay_lock so any vCPU threads woken up
     * can finish their replay tasks
     */
    replay_mutex_unlock();

    /* Re-kick on every wakeup until all CPUs have stopped */
    while (!all_vcpus_paused()) {
        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
        CPU_FOREACH(cpu) {
            qemu_cpu_kick(cpu);
        }
    }

    /*
     * Retake the replay lock: the iothread lock is dropped first and
     * re-acquired after replay_mutex_lock().
     */
    qemu_mutex_unlock_iothread();
    replay_mutex_lock();
    qemu_mutex_lock_iothread();
}
1846
/*
 * Let @cpu run again: clear both the pending stop request and the
 * stopped state, then kick the CPU.
 */
void cpu_resume(CPUState *cpu)
{
    cpu->stop = false;
    cpu->stopped = false;
    qemu_cpu_kick(cpu);
}
1853
1854 void resume_all_vcpus(void)
1855 {
1856     CPUState *cpu;
1857
1858     if (!runstate_is_running()) {
1859         return;
1860     }
1861
1862     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1863     CPU_FOREACH(cpu) {
1864         cpu_resume(cpu);
1865     }
1866 }
1867
/*
 * Request the unplug of @cpu and wait for its thread to exit.
 *
 * The iothread lock is released around qemu_thread_join() and taken
 * again afterwards, so the caller must hold it on entry (the unlock
 * asserts this).
 */
void cpu_remove_sync(CPUState *cpu)
{
    cpu->stop = true;
    cpu->unplug = true;
    qemu_cpu_kick(cpu);
    qemu_mutex_unlock_iothread();
    qemu_thread_join(cpu->thread);
    qemu_mutex_lock_iothread();
}
1877
1878 /* For temporary buffers for forming a name */
1879 #define VCPU_THREAD_NAME_SIZE 16
1880
1881 static void qemu_tcg_init_vcpu(CPUState *cpu)
1882 {
1883     char thread_name[VCPU_THREAD_NAME_SIZE];
1884     static QemuCond *single_tcg_halt_cond;
1885     static QemuThread *single_tcg_cpu_thread;
1886     static int tcg_region_inited;
1887
1888     assert(tcg_enabled());
1889     /*
1890      * Initialize TCG regions--once. Now is a good time, because:
1891      * (1) TCG's init context, prologue and target globals have been set up.
1892      * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1893      *     -accel flag is processed, so the check doesn't work then).
1894      */
1895     if (!tcg_region_inited) {
1896         tcg_region_inited = 1;
1897         tcg_region_init();
1898     }
1899
1900     if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1901         cpu->thread = g_malloc0(sizeof(QemuThread));
1902         cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1903         qemu_cond_init(cpu->halt_cond);
1904
1905         if (qemu_tcg_mttcg_enabled()) {
1906             /* create a thread per vCPU with TCG (MTTCG) */
1907             parallel_cpus = true;
1908             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1909                  cpu->cpu_index);
1910
1911             qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1912                                cpu, QEMU_THREAD_JOINABLE);
1913
1914         } else {
1915             /* share a single thread for all cpus with TCG */
1916             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1917             qemu_thread_create(cpu->thread, thread_name,
1918                                qemu_tcg_rr_cpu_thread_fn,
1919                                cpu, QEMU_THREAD_JOINABLE);
1920
1921             single_tcg_halt_cond = cpu->halt_cond;
1922             single_tcg_cpu_thread = cpu->thread;
1923         }
1924 #ifdef _WIN32
1925         cpu->hThread = qemu_thread_get_handle(cpu->thread);
1926 #endif
1927     } else {
1928         /* For non-MTTCG cases we share the thread */
1929         cpu->thread = single_tcg_cpu_thread;
1930         cpu->halt_cond = single_tcg_halt_cond;
1931         cpu->thread_id = first_cpu->thread_id;
1932         cpu->can_do_io = 1;
1933         cpu->created = true;
1934     }
1935 }
1936
1937 static void qemu_hax_start_vcpu(CPUState *cpu)
1938 {
1939     char thread_name[VCPU_THREAD_NAME_SIZE];
1940
1941     cpu->thread = g_malloc0(sizeof(QemuThread));
1942     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1943     qemu_cond_init(cpu->halt_cond);
1944
1945     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
1946              cpu->cpu_index);
1947     qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
1948                        cpu, QEMU_THREAD_JOINABLE);
1949 #ifdef _WIN32
1950     cpu->hThread = qemu_thread_get_handle(cpu->thread);
1951 #endif
1952 }
1953
1954 static void qemu_kvm_start_vcpu(CPUState *cpu)
1955 {
1956     char thread_name[VCPU_THREAD_NAME_SIZE];
1957
1958     cpu->thread = g_malloc0(sizeof(QemuThread));
1959     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1960     qemu_cond_init(cpu->halt_cond);
1961     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
1962              cpu->cpu_index);
1963     qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
1964                        cpu, QEMU_THREAD_JOINABLE);
1965 }
1966
1967 static void qemu_hvf_start_vcpu(CPUState *cpu)
1968 {
1969     char thread_name[VCPU_THREAD_NAME_SIZE];
1970
1971     /* HVF currently does not support TCG, and only runs in
1972      * unrestricted-guest mode. */
1973     assert(hvf_enabled());
1974
1975     cpu->thread = g_malloc0(sizeof(QemuThread));
1976     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1977     qemu_cond_init(cpu->halt_cond);
1978
1979     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
1980              cpu->cpu_index);
1981     qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
1982                        cpu, QEMU_THREAD_JOINABLE);
1983 }
1984
1985 static void qemu_whpx_start_vcpu(CPUState *cpu)
1986 {
1987     char thread_name[VCPU_THREAD_NAME_SIZE];
1988
1989     cpu->thread = g_malloc0(sizeof(QemuThread));
1990     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1991     qemu_cond_init(cpu->halt_cond);
1992     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
1993              cpu->cpu_index);
1994     qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
1995                        cpu, QEMU_THREAD_JOINABLE);
1996 #ifdef _WIN32
1997     cpu->hThread = qemu_thread_get_handle(cpu->thread);
1998 #endif
1999 }
2000
2001 static void qemu_dummy_start_vcpu(CPUState *cpu)
2002 {
2003     char thread_name[VCPU_THREAD_NAME_SIZE];
2004
2005     cpu->thread = g_malloc0(sizeof(QemuThread));
2006     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2007     qemu_cond_init(cpu->halt_cond);
2008     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2009              cpu->cpu_index);
2010     qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2011                        QEMU_THREAD_JOINABLE);
2012 }
2013
2014 void qemu_init_vcpu(CPUState *cpu)
2015 {
2016     MachineState *ms = MACHINE(qdev_get_machine());
2017
2018     cpu->nr_cores = ms->smp.cores;
2019     cpu->nr_threads =  ms->smp.threads;
2020     cpu->stopped = true;
2021     cpu->random_seed = qemu_guest_random_seed_thread_part1();
2022
2023     if (!cpu->as) {
2024         /* If the target cpu hasn't set up any address spaces itself,
2025          * give it the default one.
2026          */
2027         cpu->num_ases = 1;
2028         cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2029     }
2030
2031     if (kvm_enabled()) {
2032         qemu_kvm_start_vcpu(cpu);
2033     } else if (hax_enabled()) {
2034         qemu_hax_start_vcpu(cpu);
2035     } else if (hvf_enabled()) {
2036         qemu_hvf_start_vcpu(cpu);
2037     } else if (tcg_enabled()) {
2038         qemu_tcg_init_vcpu(cpu);
2039     } else if (whpx_enabled()) {
2040         qemu_whpx_start_vcpu(cpu);
2041     } else {
2042         qemu_dummy_start_vcpu(cpu);
2043     }
2044
2045     while (!cpu->created) {
2046         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2047     }
2048 }
2049
2050 void cpu_stop_current(void)
2051 {
2052     if (current_cpu) {
2053         current_cpu->stop = true;
2054         cpu_exit(current_cpu);
2055     }
2056 }
2057
2058 int vm_stop(RunState state)
2059 {
2060     if (qemu_in_vcpu_thread()) {
2061         qemu_system_vmstop_request_prepare();
2062         qemu_system_vmstop_request(state);
2063         /*
2064          * FIXME: should not return to device code in case
2065          * vm_stop() has been requested.
2066          */
2067         cpu_stop_current();
2068         return 0;
2069     }
2070
2071     return do_vm_stop(state, true);
2072 }
2073
2074 /**
2075  * Prepare for (re)starting the VM.
2076  * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2077  * running or in case of an error condition), 0 otherwise.
2078  */
2079 int vm_prepare_start(void)
2080 {
2081     RunState requested;
2082
2083     qemu_vmstop_requested(&requested);
2084     if (runstate_is_running() && requested == RUN_STATE__MAX) {
2085         return -1;
2086     }
2087
2088     /* Ensure that a STOP/RESUME pair of events is emitted if a
2089      * vmstop request was pending.  The BLOCK_IO_ERROR event, for
2090      * example, according to documentation is always followed by
2091      * the STOP event.
2092      */
2093     if (runstate_is_running()) {
2094         qapi_event_send_stop();
2095         qapi_event_send_resume();
2096         return -1;
2097     }
2098
2099     /* We are sending this now, but the CPUs will be resumed shortly later */
2100     qapi_event_send_resume();
2101
2102     cpu_enable_ticks();
2103     runstate_set(RUN_STATE_RUNNING);
2104     vm_state_notify(1, RUN_STATE_RUNNING);
2105     return 0;
2106 }
2107
/*
 * Start (or resume) the VM: resume all vCPUs, but only when
 * vm_prepare_start() reports success by returning 0.
 */
void vm_start(void)
{
    if (vm_prepare_start() == 0) {
        resume_all_vcpus();
    }
}
2114
2115 /* does a state transition even if the VM is already stopped,
2116    current state is forgotten forever */
2117 int vm_stop_force_state(RunState state)
2118 {
2119     if (runstate_is_running()) {
2120         return vm_stop(state);
2121     } else {
2122         runstate_set(state);
2123
2124         bdrv_drain_all();
2125         /* Make sure to return an error if the flush in a previous vm_stop()
2126          * failed. */
2127         return bdrv_flush_all();
2128     }
2129 }
2130
/*
 * Print the list of CPU models by invoking the cpu_list macro when one is
 * defined; otherwise this is a no-op.  @optarg is not used.
 */
void list_cpus(const char *optarg)
{
    /* XXX: targets that still lack xxx_cpu_list should implement it */
#ifdef cpu_list
    cpu_list();
#endif
}
2138
2139 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2140                  bool has_cpu, int64_t cpu_index, Error **errp)
2141 {
2142     FILE *f;
2143     uint32_t l;
2144     CPUState *cpu;
2145     uint8_t buf[1024];
2146     int64_t orig_addr = addr, orig_size = size;
2147
2148     if (!has_cpu) {
2149         cpu_index = 0;
2150     }
2151
2152     cpu = qemu_get_cpu(cpu_index);
2153     if (cpu == NULL) {
2154         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2155                    "a CPU number");
2156         return;
2157     }
2158
2159     f = fopen(filename, "wb");
2160     if (!f) {
2161         error_setg_file_open(errp, errno, filename);
2162         return;
2163     }
2164
2165     while (size != 0) {
2166         l = sizeof(buf);
2167         if (l > size)
2168             l = size;
2169         if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2170             error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2171                              " specified", orig_addr, orig_size);
2172             goto exit;
2173         }
2174         if (fwrite(buf, 1, l, f) != l) {
2175             error_setg(errp, QERR_IO_ERROR);
2176             goto exit;
2177         }
2178         addr += l;
2179         size -= l;
2180     }
2181
2182 exit:
2183     fclose(f);
2184 }
2185
2186 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2187                   Error **errp)
2188 {
2189     FILE *f;
2190     uint32_t l;
2191     uint8_t buf[1024];
2192
2193     f = fopen(filename, "wb");
2194     if (!f) {
2195         error_setg_file_open(errp, errno, filename);
2196         return;
2197     }
2198
2199     while (size != 0) {
2200         l = sizeof(buf);
2201         if (l > size)
2202             l = size;
2203         cpu_physical_memory_read(addr, buf, l);
2204         if (fwrite(buf, 1, l, f) != l) {
2205             error_setg(errp, QERR_IO_ERROR);
2206             goto exit;
2207         }
2208         addr += l;
2209         size -= l;
2210     }
2211
2212 exit:
2213     fclose(f);
2214 }
2215
2216 void qmp_inject_nmi(Error **errp)
2217 {
2218     nmi_monitor_handle(monitor_get_cpu_index(), errp);
2219 }
2220
2221 void dump_drift_info(void)
2222 {
2223     if (!use_icount) {
2224         return;
2225     }
2226
2227     qemu_printf("Host - Guest clock  %"PRIi64" ms\n",
2228                 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2229     if (icount_align_option) {
2230         qemu_printf("Max guest delay     %"PRIi64" ms\n",
2231                     -max_delay / SCALE_MS);
2232         qemu_printf("Max guest advance   %"PRIi64" ms\n",
2233                     max_advance / SCALE_MS);
2234     } else {
2235         qemu_printf("Max guest delay     NA\n");
2236         qemu_printf("Max guest advance   NA\n");
2237     }
2238 }