migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "cpu.h"
  31 #include <zlib.h>
  32 #include "qemu/cutils.h"
  33 #include "qemu/bitops.h"
  34 #include "qemu/bitmap.h"
  35 #include "qemu/main-loop.h"
  36 #include "xbzrle.h"
  37 #include "ram.h"
  38 #include "migration.h"
  39 #include "socket.h"
  40 #include "migration/register.h"
  41 #include "migration/misc.h"
  42 #include "qemu-file.h"
  43 #include "postcopy-ram.h"
  44 #include "page_cache.h"
  45 #include "qemu/error-report.h"
  46 #include "qapi/error.h"
  47 #include "qapi/qapi-events-migration.h"
  48 #include "qapi/qmp/qerror.h"
  49 #include "trace.h"
  50 #include "exec/ram_addr.h"
  51 #include "exec/target_page.h"
  52 #include "qemu/rcu_queue.h"
  53 #include "migration/colo.h"
  54 #include "block.h"
  55 #include "sysemu/sysemu.h"
  56 #include "qemu/uuid.h"
  57 #include "savevm.h"
  58 #include "qemu/iov.h"
  59
  60 /***********************************************************/
  61 /* ram save/restore */
  62
/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  And to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
 */
  68
/*
 * Flag values used by the RAM save/restore code.  Every flag is a
 * distinct single bit, so several of them can be OR-ed together.
 */
#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  78
  79 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  80 {
  81     return buffer_is_zero(p, size);
  82 }
  83
/* XBZRLE statistics counters (global, visible outside this file) */
XBZRLECacheStats xbzrle_counters;

/*
 * File-local XBZRLE state: the page cache plus the scratch buffers
 * used while encoding/decoding.  Only the cache is documented as
 * being protected by the lock.
 */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    /* taken/released through XBZRLE_cache_lock()/XBZRLE_cache_unlock() */
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
 101
 102 static void XBZRLE_cache_lock(void)
 103 {
 104     if (migrate_use_xbzrle())
 105         qemu_mutex_lock(&XBZRLE.lock);
 106 }
 107
 108 static void XBZRLE_cache_unlock(void)
 109 {
 110     if (migrate_use_xbzrle())
 111         qemu_mutex_unlock(&XBZRLE.lock);
 112 }
 113
 114 /**
 115  * xbzrle_cache_resize: resize the xbzrle cache
 116  *
 117  * This function is called from qmp_migrate_set_cache_size in main
 118  * thread, possibly while a migration is in progress.  A running
 119  * migration may be using the cache and might finish during this call,
 120  * hence changes to the cache are protected by XBZRLE.lock().
 121  *
 122  * Returns 0 for success or -1 for error
 123  *
 124  * @new_size: new cache size
 125  * @errp: set *errp if the check failed, with reason
 126  */
 127 int xbzrle_cache_resize(int64_t new_size, Error **errp)
 128 {
 129     PageCache *new_cache;
 130     int64_t ret = 0;
 131
 132     /* Check for truncation */
 133     if (new_size != (size_t)new_size) {
 134         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 135                    "exceeding address space");
 136         return -1;
 137     }
 138
 139     if (new_size == migrate_xbzrle_cache_size()) {
 140         /* nothing to do */
 141         return 0;
 142     }
 143
 144     XBZRLE_cache_lock();
 145
 146     if (XBZRLE.cache != NULL) {
 147         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 148         if (!new_cache) {
 149             ret = -1;
 150             goto out;
 151         }
 152
 153         cache_fini(XBZRLE.cache);
 154         XBZRLE.cache = new_cache;
 155     }
 156 out:
 157     XBZRLE_cache_unlock();
 158     return ret;
 159 }
 160
/*
 * Iterate over the RAM blocks for which qemu_ram_is_migratable() is
 * true.  Built on INTERNAL_RAMBLOCK_FOREACH(); the empty "if" branch
 * skips non-migratable blocks so that the statement following the
 * macro becomes the loop body.
 *
 * Should be holding either ram_list.mutex, or the RCU lock.
 */
#define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (!qemu_ram_is_migratable(block)) {} else

/*
 * NOTE(review): presumably undefined so that code in this file cannot
 * iterate over non-migratable blocks by accident - confirm.
 */
#undef RAMBLOCK_FOREACH
 167
 168 static void ramblock_recv_map_init(void)
 169 {
 170     RAMBlock *rb;
 171
 172     RAMBLOCK_FOREACH_MIGRATABLE(rb) {
 173         assert(!rb->receivedmap);
 174         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 175     }
 176 }
 177
/*
 * ramblock_recv_bitmap_test: query the received bitmap of @rb
 *
 * @rb: RAM block whose receivedmap is consulted
 * @host_addr: host address, turned into a bit index by
 *             ramblock_recv_bitmap_offset()
 *
 * Returns the result of test_bit() for that bit.
 */
int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
{
    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
                    rb->receivedmap);
}
 183
 184 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 185 {
 186     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 187 }
 188
/*
 * ramblock_recv_bitmap_set: atomically set one bit in the received
 * bitmap of @rb
 *
 * @rb: RAM block whose receivedmap is updated
 * @host_addr: host address, turned into a bit index by
 *             ramblock_recv_bitmap_offset()
 */
void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
{
    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
}
 193
/*
 * ramblock_recv_bitmap_set_range: atomically set a run of bits in the
 * received bitmap of @rb
 *
 * @rb: RAM block whose receivedmap is updated
 * @host_addr: host address of the first page; turned into the first
 *             bit index by ramblock_recv_bitmap_offset()
 * @nr: number of consecutive bits to set
 */
void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
                                    size_t nr)
{
    bitmap_set_atomic(rb->receivedmap,
                      ramblock_recv_bitmap_offset(host_addr, rb),
                      nr);
}
 201
 202 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 203
 204 /*
 205  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 206  *
 207  * Returns >0 if success with sent bytes, or <0 if error.
 208  */
 209 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 210                                   const char *block_name)
 211 {
 212     RAMBlock *block = qemu_ram_block_by_name(block_name);
 213     unsigned long *le_bitmap, nbits;
 214     uint64_t size;
 215
 216     if (!block) {
 217         error_report("%s: invalid block name: %s", __func__, block_name);
 218         return -1;
 219     }
 220
 221     nbits = block->used_length >> TARGET_PAGE_BITS;
 222
 223     /*
 224      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 225      * machines we may need 4 more bytes for padding (see below
 226      * comment). So extend it a bit before hand.
 227      */
 228     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 229
 230     /*
 231      * Always use little endian when sending the bitmap. This is
 232      * required that when source and destination VMs are not using the
 233      * same endianess. (Note: big endian won't work.)
 234      */
 235     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 236
 237     /* Size of the bitmap, in bytes */
 238     size = DIV_ROUND_UP(nbits, 8);
 239
 240     /*
 241      * size is always aligned to 8 bytes for 64bit machines, but it
 242      * may not be true for 32bit machines. We need this padding to
 243      * make sure the migration can survive even between 32bit and
 244      * 64bit machines.
 245      */
 246     size = ROUND_UP(size, 8);
 247
 248     qemu_put_be64(file, size);
 249     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 250     /*
 251      * Mark as an end, in case the middle part is screwed up due to
 252      * some "misterious" reason.
 253      */
 254     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 255     qemu_fflush(file);
 256
 257     g_free(le_bitmap);
 258
 259     if (qemu_file_get_error(file)) {
 260         return qemu_file_get_error(file);
 261     }
 262
 263     return size + sizeof(size);
 264 }
 265
/*
 * An outstanding page request, on the source, having been received
 * and queued.  Entries are linked into RAMState.src_page_requests
 * through next_req.
 */
struct RAMSrcPageRequest {
    /* RAM block the request refers to */
    RAMBlock *rb;
    /* start of the requested range (presumably relative to rb - confirm) */
    hwaddr    offset;
    /* length of the requested range */
    hwaddr    len;

    /* queue linkage */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 277
/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* How many times we have dirtied too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* number of iterations at the beginning of period */
    uint64_t iterations_prev;
    /* Iterations since start */
    uint64_t iterations;
    /*
     * number of dirty bits in the bitmap; ram_bytes_remaining()
     * multiplies this by TARGET_PAGE_SIZE
     */
    uint64_t migration_dirty_pages;
    /* protects modification of the bitmap */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;

/* The RAM migration state; NULL is handled by ram_bytes_remaining() */
static RAMState *ram_state;
 320
 321 uint64_t ram_bytes_remaining(void)
 322 {
 323     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 324                        0;
 325 }
 326
/*
 * Global RAM migration counters; multifd_send_pages() adds to the
 * multifd_bytes and transferred fields.
 */
MigrationStats ram_counters;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;
 339
/* Per-thread state of a compression worker (see do_data_compress()) */
struct CompressParam {
    /* set to true by the worker, under comp_done_lock, after a page */
    bool done;
    /* set to true, under mutex, to make the worker leave its loop */
    bool quit;
    /* dummy QEMUFile the worker writes its output into */
    QEMUFile *file;
    /* protects block/offset/quit as accessed by the worker */
    QemuMutex mutex;
    /* the worker waits on this while it has no block to compress */
    QemuCond cond;
    /* pending request: block to compress, NULL when there is none */
    RAMBlock *block;
    /* pending request: offset handed to do_compress_ram_page() */
    ram_addr_t offset;

    /* internally used fields */
    /* zlib stream, set up with deflateInit() and ended with deflateEnd() */
    z_stream stream;
    /* TARGET_PAGE_SIZE scratch buffer passed as source_buf */
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;
 354
/*
 * Per-thread state of a decompression worker.
 * NOTE(review): the users of this struct are outside the visible part
 * of the file; the field notes below are assumptions drawn from the
 * names and from the parallel CompressParam - confirm.
 */
struct DecompressParam {
    /* presumably: the worker has finished its current job */
    bool done;
    /* presumably: the worker should exit */
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    /* presumably: destination of the decompressed data */
    void *des;
    /* presumably: buffer holding the compressed input */
    uint8_t *compbuf;
    /* presumably: length of the data in compbuf */
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;
 366
/*
 * Arrays with one entry per compression thread; allocated in
 * compress_threads_save_setup() and freed (and reset to NULL) in
 * compress_threads_save_cleanup().
 */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/* Decompression-side counterparts; their users are not in view here */
static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

/* Defined further down in the file; called by do_data_compress() */
static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                ram_addr_t offset, uint8_t *source_buf);
 386
/*
 * do_data_compress: body of a compression worker thread
 *
 * @opaque: the CompressParam of this worker
 *
 * Runs until param->quit is set.  Whenever param->block is non-NULL
 * the request (block, offset) is copied out and param->block cleared,
 * then do_compress_ram_page() runs with param->mutex released.
 * Afterwards param->done is set and comp_done_cond signalled, both
 * under comp_done_lock.  With no request pending the thread sleeps on
 * param->cond.
 *
 * Always returns NULL.
 */
static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            /* take ownership of the request before dropping the lock */
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            /* the return value of the compression is not checked here */
            do_compress_ram_page(param->file, &param->stream, block, offset,
                                 param->originbuf);

            /* done is published under comp_done_lock, not param->mutex */
            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            /* re-take the lock before re-checking quit/block */
            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}
 418
 419 static inline void terminate_compression_threads(void)
 420 {
 421     int idx, thread_count;
 422
 423     thread_count = migrate_compress_threads();
 424
 425     for (idx = 0; idx < thread_count; idx++) {
 426         qemu_mutex_lock(&comp_param[idx].mutex);
 427         comp_param[idx].quit = true;
 428         qemu_cond_signal(&comp_param[idx].cond);
 429         qemu_mutex_unlock(&comp_param[idx].mutex);
 430     }
 431 }
 432
 433 static void compress_threads_save_cleanup(void)
 434 {
 435     int i, thread_count;
 436
 437     if (!migrate_use_compression()) {
 438         return;
 439     }
 440     terminate_compression_threads();
 441     thread_count = migrate_compress_threads();
 442     for (i = 0; i < thread_count; i++) {
 443         /*
 444          * we use it as a indicator which shows if the thread is
 445          * properly init'd or not
 446          */
 447         if (!comp_param[i].file) {
 448             break;
 449         }
 450         qemu_thread_join(compress_threads + i);
 451         qemu_mutex_destroy(&comp_param[i].mutex);
 452         qemu_cond_destroy(&comp_param[i].cond);
 453         deflateEnd(&comp_param[i].stream);
 454         g_free(comp_param[i].originbuf);
 455         qemu_fclose(comp_param[i].file);
 456         comp_param[i].file = NULL;
 457     }
 458     qemu_mutex_destroy(&comp_done_lock);
 459     qemu_cond_destroy(&comp_done_cond);
 460     g_free(compress_threads);
 461     g_free(comp_param);
 462     compress_threads = NULL;
 463     comp_param = NULL;
 464 }
 465
 466 static int compress_threads_save_setup(void)
 467 {
 468     int i, thread_count;
 469
 470     if (!migrate_use_compression()) {
 471         return 0;
 472     }
 473     thread_count = migrate_compress_threads();
 474     compress_threads = g_new0(QemuThread, thread_count);
 475     comp_param = g_new0(CompressParam, thread_count);
 476     qemu_cond_init(&comp_done_cond);
 477     qemu_mutex_init(&comp_done_lock);
 478     for (i = 0; i < thread_count; i++) {
 479         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 480         if (!comp_param[i].originbuf) {
 481             goto exit;
 482         }
 483
 484         if (deflateInit(&comp_param[i].stream,
 485                         migrate_compress_level()) != Z_OK) {
 486             g_free(comp_param[i].originbuf);
 487             goto exit;
 488         }
 489
 490         /* comp_param[i].file is just used as a dummy buffer to save data,
 491          * set its ops to empty.
 492          */
 493         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 494         comp_param[i].done = true;
 495         comp_param[i].quit = false;
 496         qemu_mutex_init(&comp_param[i].mutex);
 497         qemu_cond_init(&comp_param[i].cond);
 498         qemu_thread_create(compress_threads + i, "compress",
 499                            do_data_compress, comp_param + i,
 500                            QEMU_THREAD_JOINABLE);
 501     }
 502     return 0;
 503
 504 exit:
 505     compress_threads_save_cleanup();
 506     return -1;
 507 }
 508
/* Multiple fd's */

/* Magic number and version carried by every multifd packet */
#define MULTIFD_MAGIC 0x11223344U
#define MULTIFD_VERSION 1

/* Packet flag set by multifd_send_sync_main() on its sync packets */
#define MULTIFD_FLAG_SYNC (1 << 0)

/*
 * Initial packet of a multifd channel, written by
 * multifd_send_initial_packet() and checked by
 * multifd_recv_initial_packet().  magic and version travel in big
 * endian.
 */
typedef struct {
    uint32_t magic;
    uint32_t version;
    unsigned char uuid[16]; /* QemuUUID */
    /* channel id of the sender */
    uint8_t id;
} __attribute__((packed)) MultiFDInit_t;
 522
/*
 * Header of a multifd data packet.  multifd_send_fill_packet() stores
 * all integer fields in big endian; multifd_recv_unfill_packet()
 * converts and validates them.
 */
typedef struct {
    uint32_t magic;
    uint32_t version;
    /* MULTIFD_FLAG_* */
    uint32_t flags;
    /* the sender's migrate_multifd_page_count() */
    uint32_t size;
    /* number of valid entries in offset[] */
    uint32_t used;
    uint64_t packet_num;
    /* RAM block name; the receiver forces NUL termination */
    char ramblock[256];
    /* one page offset per used page */
    uint64_t offset[];
} __attribute__((packed)) MultiFDPacket_t;
 533
/*
 * A batch of pages, all belonging to the same RAM block.  Created by
 * multifd_pages_init() and released by multifd_pages_clear().
 */
typedef struct {
    /* number of used pages */
    uint32_t used;
    /* number of allocated pages */
    uint32_t allocated;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* offset of each page */
    ram_addr_t *offset;
    /* pointer to each page */
    struct iovec *iov;
    /* RAM block the pages belong to, NULL while the batch is empty */
    RAMBlock *block;
} MultiFDPages_t;
 547
/* Per-channel state of a multifd send thread */
typedef struct {
    /* these fields are not changed once the thread is created */
    /* channel number */
    uint8_t id;
    /* channel thread name */
    char *name;
    /* channel thread id */
    QemuThread thread;
    /* communication channel */
    QIOChannel *c;
    /* sem where to wait for more work */
    QemuSemaphore sem;
    /* this mutex protects the following parameters */
    QemuMutex mutex;
    /* is this channel thread running */
    bool running;
    /* should this thread finish */
    bool quit;
    /* thread has work to do */
    int pending_job;
    /* array of pages to send */
    MultiFDPages_t *pages;
    /* packet allocated len */
    uint32_t packet_len;
    /* pointer to the packet */
    MultiFDPacket_t *packet;
    /* multifd flags for each packet */
    uint32_t flags;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* thread local variables */
    /* packets sent through this channel */
    uint64_t num_packets;
    /* pages sent through this channel */
    uint64_t num_pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
}  MultiFDSendParams;
 586
/* Per-channel state of a multifd receive thread */
typedef struct {
    /* these fields are not changed once the thread is created */
    /* channel number */
    uint8_t id;
    /* channel thread name */
    char *name;
    /* channel thread id */
    QemuThread thread;
    /* communication channel */
    QIOChannel *c;
    /* this mutex protects the following parameters */
    QemuMutex mutex;
    /* is this channel thread running */
    bool running;
    /* array of pages to receive */
    MultiFDPages_t *pages;
    /* packet allocated len */
    uint32_t packet_len;
    /* pointer to the packet */
    MultiFDPacket_t *packet;
    /* multifd flags for each packet */
    uint32_t flags;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* thread local variables */
    /*
     * packets through this channel
     * NOTE(review): the original comment said "sent"; on the receive
     * side this is presumably the number received - confirm
     */
    uint64_t num_packets;
    /* pages through this channel (same remark as num_packets) */
    uint64_t num_pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
} MultiFDRecvParams;
 619
 620 static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
 621 {
 622     MultiFDInit_t msg;
 623     int ret;
 624
 625     msg.magic = cpu_to_be32(MULTIFD_MAGIC);
 626     msg.version = cpu_to_be32(MULTIFD_VERSION);
 627     msg.id = p->id;
 628     memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
 629
 630     ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
 631     if (ret != 0) {
 632         return -1;
 633     }
 634     return 0;
 635 }
 636
 637 static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
 638 {
 639     MultiFDInit_t msg;
 640     int ret;
 641
 642     ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
 643     if (ret != 0) {
 644         return -1;
 645     }
 646
 647     be32_to_cpus(&msg.magic);
 648     be32_to_cpus(&msg.version);
 649
 650     if (msg.magic != MULTIFD_MAGIC) {
 651         error_setg(errp, "multifd: received packet magic %x "
 652                    "expected %x", msg.magic, MULTIFD_MAGIC);
 653         return -1;
 654     }
 655
 656     if (msg.version != MULTIFD_VERSION) {
 657         error_setg(errp, "multifd: received packet version %d "
 658                    "expected %d", msg.version, MULTIFD_VERSION);
 659         return -1;
 660     }
 661
 662     if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
 663         char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
 664         char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
 665
 666         error_setg(errp, "multifd: received uuid '%s' and expected "
 667                    "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
 668         g_free(uuid);
 669         g_free(msg_uuid);
 670         return -1;
 671     }
 672
 673     if (msg.id > migrate_multifd_channels()) {
 674         error_setg(errp, "multifd: received channel version %d "
 675                    "expected %d", msg.version, MULTIFD_VERSION);
 676         return -1;
 677     }
 678
 679     return msg.id;
 680 }
 681
 682 static MultiFDPages_t *multifd_pages_init(size_t size)
 683 {
 684     MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1);
 685
 686     pages->allocated = size;
 687     pages->iov = g_new0(struct iovec, size);
 688     pages->offset = g_new0(ram_addr_t, size);
 689
 690     return pages;
 691 }
 692
 693 static void multifd_pages_clear(MultiFDPages_t *pages)
 694 {
 695     pages->used = 0;
 696     pages->allocated = 0;
 697     pages->packet_num = 0;
 698     pages->block = NULL;
 699     g_free(pages->iov);
 700     pages->iov = NULL;
 701     g_free(pages->offset);
 702     pages->offset = NULL;
 703     g_free(pages);
 704 }
 705
 706 static void multifd_send_fill_packet(MultiFDSendParams *p)
 707 {
 708     MultiFDPacket_t *packet = p->packet;
 709     int i;
 710
 711     packet->magic = cpu_to_be32(MULTIFD_MAGIC);
 712     packet->version = cpu_to_be32(MULTIFD_VERSION);
 713     packet->flags = cpu_to_be32(p->flags);
 714     packet->size = cpu_to_be32(migrate_multifd_page_count());
 715     packet->used = cpu_to_be32(p->pages->used);
 716     packet->packet_num = cpu_to_be64(p->packet_num);
 717
 718     if (p->pages->block) {
 719         strncpy(packet->ramblock, p->pages->block->idstr, 256);
 720     }
 721
 722     for (i = 0; i < p->pages->used; i++) {
 723         packet->offset[i] = cpu_to_be64(p->pages->offset[i]);
 724     }
 725 }
 726
 727 static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
 728 {
 729     MultiFDPacket_t *packet = p->packet;
 730     RAMBlock *block;
 731     int i;
 732
 733     be32_to_cpus(&packet->magic);
 734     if (packet->magic != MULTIFD_MAGIC) {
 735         error_setg(errp, "multifd: received packet "
 736                    "magic %x and expected magic %x",
 737                    packet->magic, MULTIFD_MAGIC);
 738         return -1;
 739     }
 740
 741     be32_to_cpus(&packet->version);
 742     if (packet->version != MULTIFD_VERSION) {
 743         error_setg(errp, "multifd: received packet "
 744                    "version %d and expected version %d",
 745                    packet->version, MULTIFD_VERSION);
 746         return -1;
 747     }
 748
 749     p->flags = be32_to_cpu(packet->flags);
 750
 751     be32_to_cpus(&packet->size);
 752     if (packet->size > migrate_multifd_page_count()) {
 753         error_setg(errp, "multifd: received packet "
 754                    "with size %d and expected maximum size %d",
 755                    packet->size, migrate_multifd_page_count()) ;
 756         return -1;
 757     }
 758
 759     p->pages->used = be32_to_cpu(packet->used);
 760     if (p->pages->used > packet->size) {
 761         error_setg(errp, "multifd: received packet "
 762                    "with size %d and expected maximum size %d",
 763                    p->pages->used, packet->size) ;
 764         return -1;
 765     }
 766
 767     p->packet_num = be64_to_cpu(packet->packet_num);
 768
 769     if (p->pages->used) {
 770         /* make sure that ramblock is 0 terminated */
 771         packet->ramblock[255] = 0;
 772         block = qemu_ram_block_by_name(packet->ramblock);
 773         if (!block) {
 774             error_setg(errp, "multifd: unknown ram block %s",
 775                        packet->ramblock);
 776             return -1;
 777         }
 778     }
 779
 780     for (i = 0; i < p->pages->used; i++) {
 781         ram_addr_t offset = be64_to_cpu(packet->offset[i]);
 782
 783         if (offset > (block->used_length - TARGET_PAGE_SIZE)) {
 784             error_setg(errp, "multifd: offset too long " RAM_ADDR_FMT
 785                        " (max " RAM_ADDR_FMT ")",
 786                        offset, block->max_length);
 787             return -1;
 788         }
 789         p->pages->iov[i].iov_base = block->host + offset;
 790         p->pages->iov[i].iov_len = TARGET_PAGE_SIZE;
 791     }
 792
 793     return 0;
 794 }
 795
/*
 * Global state of the multifd send side; freed and reset to NULL by
 * multifd_save_cleanup().
 */
struct {
    /* per-channel parameters, one per multifd channel */
    MultiFDSendParams *params;
    /* number of created threads */
    int count;
    /* array of pages to send */
    MultiFDPages_t *pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* send channels ready */
    QemuSemaphore channels_ready;
} *multifd_send_state;
 809
/*
 * How do we use multifd_send_state->pages and channel->pages?
 *
 * We create a pages struct for each channel, and a main one.  Each
 * time that we need to send a batch of pages we exchange the ones
 * between multifd_send_state and the channel that is sending it.
 * There are two reasons for that:
 *    - to not have to do so many mallocs during migration
 *    - to make it easier to know what to free at the end of migration
 *
 * This way we always know who is the owner of each "pages" struct,
 * and we don't need any locking.  It belongs to the migration thread
 * or to the channel thread.  Switching is safe because the migration
 * thread is using the channel mutex when changing it, and the channel
 * has to have finished with its own, otherwise pending_job can't be
 * false.
 */

 827
 828 static void multifd_send_pages(void)
 829 {
 830     int i;
 831     static int next_channel;
 832     MultiFDSendParams *p = NULL; /* make happy gcc */
 833     MultiFDPages_t *pages = multifd_send_state->pages;
 834     uint64_t transferred;
 835
 836     qemu_sem_wait(&multifd_send_state->channels_ready);
 837     for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
 838         p = &multifd_send_state->params[i];
 839
 840         qemu_mutex_lock(&p->mutex);
 841         if (!p->pending_job) {
 842             p->pending_job++;
 843             next_channel = (i + 1) % migrate_multifd_channels();
 844             break;
 845         }
 846         qemu_mutex_unlock(&p->mutex);
 847     }
 848     p->pages->used = 0;
 849
 850     p->packet_num = multifd_send_state->packet_num++;
 851     p->pages->block = NULL;
 852     multifd_send_state->pages = p->pages;
 853     p->pages = pages;
 854     transferred = pages->used * TARGET_PAGE_SIZE + p->packet_len;
 855     ram_counters.multifd_bytes += transferred;
 856     ram_counters.transferred += transferred;;
 857     qemu_mutex_unlock(&p->mutex);
 858     qemu_sem_post(&p->sem);
 859 }
 860
/*
 * multifd_queue_page: add one page to the batch being filled
 *
 * @block: RAM block the page belongs to
 * @offset: offset of the page inside @block
 *
 * A batch only holds pages of a single block.  A page of the batch's
 * block is appended, and the batch is sent once it is full.  A page of
 * another block makes the current batch get sent first, after which
 * the page is queued through a recursive call.
 */
static void multifd_queue_page(RAMBlock *block, ram_addr_t offset)
{
    MultiFDPages_t *pages = multifd_send_state->pages;

    /* an empty batch adopts the block of its first page */
    if (!pages->block) {
        pages->block = block;
    }

    if (pages->block == block) {
        pages->offset[pages->used] = offset;
        pages->iov[pages->used].iov_base = block->host + offset;
        pages->iov[pages->used].iov_len = TARGET_PAGE_SIZE;
        pages->used++;

        /* still room left: keep filling */
        if (pages->used < pages->allocated) {
            return;
        }
    }

    /* the batch is full, or the page belongs to a different block */
    multifd_send_pages();

    /*
     * NOTE(review): "pages" still points at the batch that
     * multifd_send_pages() has just handed to a channel, not at the
     * new multifd_send_state->pages - confirm that reading it here is
     * intended.
     */
    if (pages->block != block) {
        multifd_queue_page(block, offset);
    }
}
 886
 887 static void multifd_send_terminate_threads(Error *err)
 888 {
 889     int i;
 890
 891     if (err) {
 892         MigrationState *s = migrate_get_current();
 893         migrate_set_error(s, err);
 894         if (s->state == MIGRATION_STATUS_SETUP ||
 895             s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
 896             s->state == MIGRATION_STATUS_DEVICE ||
 897             s->state == MIGRATION_STATUS_ACTIVE) {
 898             migrate_set_state(&s->state, s->state,
 899                               MIGRATION_STATUS_FAILED);
 900         }
 901     }
 902
 903     for (i = 0; i < migrate_multifd_channels(); i++) {
 904         MultiFDSendParams *p = &multifd_send_state->params[i];
 905
 906         qemu_mutex_lock(&p->mutex);
 907         p->quit = true;
 908         qemu_sem_post(&p->sem);
 909         qemu_mutex_unlock(&p->mutex);
 910     }
 911 }
 912
 913 int multifd_save_cleanup(Error **errp)
 914 {
 915     int i;
 916     int ret = 0;
 917
 918     if (!migrate_use_multifd()) {
 919         return 0;
 920     }
 921     multifd_send_terminate_threads(NULL);
 922     for (i = 0; i < migrate_multifd_channels(); i++) {
 923         MultiFDSendParams *p = &multifd_send_state->params[i];
 924
 925         if (p->running) {
 926             qemu_thread_join(&p->thread);
 927         }
 928         socket_send_channel_destroy(p->c);
 929         p->c = NULL;
 930         qemu_mutex_destroy(&p->mutex);
 931         qemu_sem_destroy(&p->sem);
 932         qemu_sem_destroy(&p->sem_sync);
 933         g_free(p->name);
 934         p->name = NULL;
 935         multifd_pages_clear(p->pages);
 936         p->pages = NULL;
 937         p->packet_len = 0;
 938         g_free(p->packet);
 939         p->packet = NULL;
 940     }
 941     qemu_sem_destroy(&multifd_send_state->channels_ready);
 942     qemu_sem_destroy(&multifd_send_state->sem_sync);
 943     g_free(multifd_send_state->params);
 944     multifd_send_state->params = NULL;
 945     multifd_pages_clear(multifd_send_state->pages);
 946     multifd_send_state->pages = NULL;
 947     g_free(multifd_send_state);
 948     multifd_send_state = NULL;
 949     return ret;
 950 }
 951
 952 static void multifd_send_sync_main(void)
 953 {
 954     int i;
 955
 956     if (!migrate_use_multifd()) {
 957         return;
 958     }
 959     if (multifd_send_state->pages->used) {
 960         multifd_send_pages();
 961     }
 962     for (i = 0; i < migrate_multifd_channels(); i++) {
 963         MultiFDSendParams *p = &multifd_send_state->params[i];
 964
 965         trace_multifd_send_sync_main_signal(p->id);
 966
 967         qemu_mutex_lock(&p->mutex);
 968
 969         p->packet_num = multifd_send_state->packet_num++;
 970         p->flags |= MULTIFD_FLAG_SYNC;
 971         p->pending_job++;
 972         qemu_mutex_unlock(&p->mutex);
 973         qemu_sem_post(&p->sem);
 974     }
 975     for (i = 0; i < migrate_multifd_channels(); i++) {
 976         MultiFDSendParams *p = &multifd_send_state->params[i];
 977
 978         trace_multifd_send_sync_main_wait(p->id);
 979         qemu_sem_wait(&multifd_send_state->sem_sync);
 980     }
 981     trace_multifd_send_sync_main(multifd_send_state->packet_num);
 982 }
 983
 984 static void *multifd_send_thread(void *opaque)
 985 {
 986     MultiFDSendParams *p = opaque;
 987     Error *local_err = NULL;
 988     int ret;
 989
 990     trace_multifd_send_thread_start(p->id);
 991
 992     if (multifd_send_initial_packet(p, &local_err) < 0) {
 993         goto out;
 994     }
 995     /* initial packet */
 996     p->num_packets = 1;
 997
 998     while (true) {
 999         qemu_sem_wait(&p->sem);
1000         qemu_mutex_lock(&p->mutex);
1001
1002         if (p->pending_job) {
1003             uint32_t used = p->pages->used;
1004             uint64_t packet_num = p->packet_num;
1005             uint32_t flags = p->flags;
1006
1007             multifd_send_fill_packet(p);
1008             p->flags = 0;
1009             p->num_packets++;
1010             p->num_pages += used;
1011             p->pages->used = 0;
1012             qemu_mutex_unlock(&p->mutex);
1013
1014             trace_multifd_send(p->id, packet_num, used, flags);
1015
1016             ret = qio_channel_write_all(p->c, (void *)p->packet,
1017                                         p->packet_len, &local_err);
1018             if (ret != 0) {
1019                 break;
1020             }
1021
1022             ret = qio_channel_writev_all(p->c, p->pages->iov, used, &local_err);
1023             if (ret != 0) {
1024                 break;
1025             }
1026
1027             qemu_mutex_lock(&p->mutex);
1028             p->pending_job--;
1029             qemu_mutex_unlock(&p->mutex);
1030
1031             if (flags & MULTIFD_FLAG_SYNC) {
1032                 qemu_sem_post(&multifd_send_state->sem_sync);
1033             }
1034             qemu_sem_post(&multifd_send_state->channels_ready);
1035         } else if (p->quit) {
1036             qemu_mutex_unlock(&p->mutex);
1037             break;
1038         } else {
1039             qemu_mutex_unlock(&p->mutex);
1040             /* sometimes there are spurious wakeups */
1041         }
1042     }
1043
1044 out:
1045     if (local_err) {
1046         multifd_send_terminate_threads(local_err);
1047     }
1048
1049     qemu_mutex_lock(&p->mutex);
1050     p->running = false;
1051     qemu_mutex_unlock(&p->mutex);
1052
1053     trace_multifd_send_thread_end(p->id, p->num_packets, p->num_pages);
1054
1055     return NULL;
1056 }
1057
1058 static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
1059 {
1060     MultiFDSendParams *p = opaque;
1061     QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
1062     Error *local_err = NULL;
1063
1064     if (qio_task_propagate_error(task, &local_err)) {
1065         if (multifd_save_cleanup(&local_err) != 0) {
1066             migrate_set_error(migrate_get_current(), local_err);
1067         }
1068     } else {
1069         p->c = QIO_CHANNEL(sioc);
1070         qio_channel_set_delay(p->c, false);
1071         p->running = true;
1072         qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
1073                            QEMU_THREAD_JOINABLE);
1074
1075         atomic_inc(&multifd_send_state->count);
1076     }
1077 }
1078
1079 int multifd_save_setup(void)
1080 {
1081     int thread_count;
1082     uint32_t page_count = migrate_multifd_page_count();
1083     uint8_t i;
1084
1085     if (!migrate_use_multifd()) {
1086         return 0;
1087     }
1088     thread_count = migrate_multifd_channels();
1089     multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
1090     multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
1091     atomic_set(&multifd_send_state->count, 0);
1092     multifd_send_state->pages = multifd_pages_init(page_count);
1093     qemu_sem_init(&multifd_send_state->sem_sync, 0);
1094     qemu_sem_init(&multifd_send_state->channels_ready, 0);
1095
1096     for (i = 0; i < thread_count; i++) {
1097         MultiFDSendParams *p = &multifd_send_state->params[i];
1098
1099         qemu_mutex_init(&p->mutex);
1100         qemu_sem_init(&p->sem, 0);
1101         qemu_sem_init(&p->sem_sync, 0);
1102         p->quit = false;
1103         p->pending_job = 0;
1104         p->id = i;
1105         p->pages = multifd_pages_init(page_count);
1106         p->packet_len = sizeof(MultiFDPacket_t)
1107                       + sizeof(ram_addr_t) * page_count;
1108         p->packet = g_malloc0(p->packet_len);
1109         p->name = g_strdup_printf("multifdsend_%d", i);
1110         socket_send_channel_create(multifd_new_send_channel_async, p);
1111     }
1112     return 0;
1113 }
1114
/* Global state of the multifd receive side */
struct {
    /* one entry per receive channel */
    MultiFDRecvParams *params;
    /* number of created threads */
    int count;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
    /* global number of generated multifd packets */
    uint64_t packet_num;
} *multifd_recv_state;
1124
1125 static void multifd_recv_terminate_threads(Error *err)
1126 {
1127     int i;
1128
1129     if (err) {
1130         MigrationState *s = migrate_get_current();
1131         migrate_set_error(s, err);
1132         if (s->state == MIGRATION_STATUS_SETUP ||
1133             s->state == MIGRATION_STATUS_ACTIVE) {
1134             migrate_set_state(&s->state, s->state,
1135                               MIGRATION_STATUS_FAILED);
1136         }
1137     }
1138
1139     for (i = 0; i < migrate_multifd_channels(); i++) {
1140         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1141
1142         qemu_mutex_lock(&p->mutex);
1143         /* We could arrive here for two reasons:
1144            - normal quit, i.e. everything went fine, just finished
1145            - error quit: We close the channels so the channel threads
1146              finish the qio_channel_read_all_eof() */
1147         qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
1148         qemu_mutex_unlock(&p->mutex);
1149     }
1150 }
1151
1152 int multifd_load_cleanup(Error **errp)
1153 {
1154     int i;
1155     int ret = 0;
1156
1157     if (!migrate_use_multifd()) {
1158         return 0;
1159     }
1160     multifd_recv_terminate_threads(NULL);
1161     for (i = 0; i < migrate_multifd_channels(); i++) {
1162         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1163
1164         if (p->running) {
1165             qemu_thread_join(&p->thread);
1166         }
1167         object_unref(OBJECT(p->c));
1168         p->c = NULL;
1169         qemu_mutex_destroy(&p->mutex);
1170         qemu_sem_destroy(&p->sem_sync);
1171         g_free(p->name);
1172         p->name = NULL;
1173         multifd_pages_clear(p->pages);
1174         p->pages = NULL;
1175         p->packet_len = 0;
1176         g_free(p->packet);
1177         p->packet = NULL;
1178     }
1179     qemu_sem_destroy(&multifd_recv_state->sem_sync);
1180     g_free(multifd_recv_state->params);
1181     multifd_recv_state->params = NULL;
1182     g_free(multifd_recv_state);
1183     multifd_recv_state = NULL;
1184
1185     return ret;
1186 }
1187
1188 static void multifd_recv_sync_main(void)
1189 {
1190     int i;
1191
1192     if (!migrate_use_multifd()) {
1193         return;
1194     }
1195     for (i = 0; i < migrate_multifd_channels(); i++) {
1196         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1197
1198         trace_multifd_recv_sync_main_wait(p->id);
1199         qemu_sem_wait(&multifd_recv_state->sem_sync);
1200         qemu_mutex_lock(&p->mutex);
1201         if (multifd_recv_state->packet_num < p->packet_num) {
1202             multifd_recv_state->packet_num = p->packet_num;
1203         }
1204         qemu_mutex_unlock(&p->mutex);
1205     }
1206     for (i = 0; i < migrate_multifd_channels(); i++) {
1207         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1208
1209         trace_multifd_recv_sync_main_signal(p->id);
1210         qemu_sem_post(&p->sem_sync);
1211     }
1212     trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
1213 }
1214
1215 static void *multifd_recv_thread(void *opaque)
1216 {
1217     MultiFDRecvParams *p = opaque;
1218     Error *local_err = NULL;
1219     int ret;
1220
1221     trace_multifd_recv_thread_start(p->id);
1222
1223     while (true) {
1224         uint32_t used;
1225         uint32_t flags;
1226
1227         ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
1228                                        p->packet_len, &local_err);
1229         if (ret == 0) {   /* EOF */
1230             break;
1231         }
1232         if (ret == -1) {   /* Error */
1233             break;
1234         }
1235
1236         qemu_mutex_lock(&p->mutex);
1237         ret = multifd_recv_unfill_packet(p, &local_err);
1238         if (ret) {
1239             qemu_mutex_unlock(&p->mutex);
1240             break;
1241         }
1242
1243         used = p->pages->used;
1244         flags = p->flags;
1245         trace_multifd_recv(p->id, p->packet_num, used, flags);
1246         p->num_packets++;
1247         p->num_pages += used;
1248         qemu_mutex_unlock(&p->mutex);
1249
1250         ret = qio_channel_readv_all(p->c, p->pages->iov, used, &local_err);
1251         if (ret != 0) {
1252             break;
1253         }
1254
1255         if (flags & MULTIFD_FLAG_SYNC) {
1256             qemu_sem_post(&multifd_recv_state->sem_sync);
1257             qemu_sem_wait(&p->sem_sync);
1258         }
1259     }
1260
1261     if (local_err) {
1262         multifd_recv_terminate_threads(local_err);
1263     }
1264     qemu_mutex_lock(&p->mutex);
1265     p->running = false;
1266     qemu_mutex_unlock(&p->mutex);
1267
1268     trace_multifd_recv_thread_end(p->id, p->num_packets, p->num_pages);
1269
1270     return NULL;
1271 }
1272
1273 int multifd_load_setup(void)
1274 {
1275     int thread_count;
1276     uint32_t page_count = migrate_multifd_page_count();
1277     uint8_t i;
1278
1279     if (!migrate_use_multifd()) {
1280         return 0;
1281     }
1282     thread_count = migrate_multifd_channels();
1283     multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
1284     multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
1285     atomic_set(&multifd_recv_state->count, 0);
1286     qemu_sem_init(&multifd_recv_state->sem_sync, 0);
1287
1288     for (i = 0; i < thread_count; i++) {
1289         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1290
1291         qemu_mutex_init(&p->mutex);
1292         qemu_sem_init(&p->sem_sync, 0);
1293         p->id = i;
1294         p->pages = multifd_pages_init(page_count);
1295         p->packet_len = sizeof(MultiFDPacket_t)
1296                       + sizeof(ram_addr_t) * page_count;
1297         p->packet = g_malloc0(p->packet_len);
1298         p->name = g_strdup_printf("multifdrecv_%d", i);
1299     }
1300     return 0;
1301 }
1302
1303 bool multifd_recv_all_channels_created(void)
1304 {
1305     int thread_count = migrate_multifd_channels();
1306
1307     if (!migrate_use_multifd()) {
1308         return true;
1309     }
1310
1311     return thread_count == atomic_read(&multifd_recv_state->count);
1312 }
1313
1314 /* Return true if multifd is ready for the migration, otherwise false */
1315 bool multifd_recv_new_channel(QIOChannel *ioc)
1316 {
1317     MultiFDRecvParams *p;
1318     Error *local_err = NULL;
1319     int id;
1320
1321     id = multifd_recv_initial_packet(ioc, &local_err);
1322     if (id < 0) {
1323         multifd_recv_terminate_threads(local_err);
1324         return false;
1325     }
1326
1327     p = &multifd_recv_state->params[id];
1328     if (p->c != NULL) {
1329         error_setg(&local_err, "multifd: received id '%d' already setup'",
1330                    id);
1331         multifd_recv_terminate_threads(local_err);
1332         return false;
1333     }
1334     p->c = ioc;
1335     object_ref(OBJECT(ioc));
1336     /* initial packet */
1337     p->num_packets = 1;
1338
1339     p->running = true;
1340     qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
1341                        QEMU_THREAD_JOINABLE);
1342     atomic_inc(&multifd_recv_state->count);
1343     return multifd_recv_state->count == migrate_multifd_channels();
1344 }
1345
1346 /**
1347  * save_page_header: write page header to wire
1348  *
1349  * If this is the 1st block, it also writes the block identification
1350  *
1351  * Returns the number of bytes written
1352  *
1353  * @f: QEMUFile where to send the data
1354  * @block: block that contains the page we want to send
1355  * @offset: offset inside the block for the page
1356  *          in the lower bits, it contains flags
1357  */
1358 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
1359                                ram_addr_t offset)
1360 {
1361     size_t size, len;
1362
1363     if (block == rs->last_sent_block) {
1364         offset |= RAM_SAVE_FLAG_CONTINUE;
1365     }
1366     qemu_put_be64(f, offset);
1367     size = 8;
1368
1369     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
1370         len = strlen(block->idstr);
1371         qemu_put_byte(f, len);
1372         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
1373         size += 1 + len;
1374         rs->last_sent_block = block;
1375     }
1376     return size;
1377 }
1378
1379 /**
1380  * mig_throttle_guest_down: throotle down the guest
1381  *
1382  * Reduce amount of guest cpu execution to hopefully slow down memory
1383  * writes. If guest dirty memory rate is reduced below the rate at
1384  * which we can transfer pages to the destination then we should be
1385  * able to complete migration. Some workloads dirty memory way too
1386  * fast and will not effectively converge, even with auto-converge.
1387  */
1388 static void mig_throttle_guest_down(void)
1389 {
1390     MigrationState *s = migrate_get_current();
1391     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
1392     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
1393
1394     /* We have not started throttling yet. Let's start it. */
1395     if (!cpu_throttle_active()) {
1396         cpu_throttle_set(pct_initial);
1397     } else {
1398         /* Throttling already on, just increase the rate */
1399         cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
1400     }
1401 }
1402
1403 /**
1404  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
1405  *
1406  * @rs: current RAM state
1407  * @current_addr: address for the zero page
1408  *
1409  * Update the xbzrle cache to reflect a page that's been sent as all 0.
1410  * The important thing is that a stale (not-yet-0'd) page be replaced
1411  * by the new data.
1412  * As a bonus, if the page wasn't in the cache it gets added so that
1413  * when a small write is made into the 0'd page it gets XBZRLE sent.
1414  */
1415 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
1416 {
1417     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1418         return;
1419     }
1420
1421     /* We don't care if this fails to allocate a new cache page
1422      * as long as it updated an old one */
1423     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
1424                  ram_counters.dirty_sync_count);
1425 }
1426
/* byte written right after the page header of an XBZRLE-encoded page */
#define ENCODING_FLAG_XBZRLE 0x1
1428
1429 /**
1430  * save_xbzrle_page: compress and send current page
1431  *
1432  * Returns: 1 means that we wrote the page
1433  *          0 means that page is identical to the one already sent
1434  *          -1 means that xbzrle would be longer than normal
1435  *
1436  * @rs: current RAM state
1437  * @current_data: pointer to the address of the page contents
1438  * @current_addr: addr of the page
1439  * @block: block that contains the page we want to send
1440  * @offset: offset inside the block for the page
1441  * @last_stage: if we are at the completion stage
1442  */
1443 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
1444                             ram_addr_t current_addr, RAMBlock *block,
1445                             ram_addr_t offset, bool last_stage)
1446 {
1447     int encoded_len = 0, bytes_xbzrle;
1448     uint8_t *prev_cached_page;
1449
1450     if (!cache_is_cached(XBZRLE.cache, current_addr,
1451                          ram_counters.dirty_sync_count)) {
1452         xbzrle_counters.cache_miss++;
1453         if (!last_stage) {
1454             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
1455                              ram_counters.dirty_sync_count) == -1) {
1456                 return -1;
1457             } else {
1458                 /* update *current_data when the page has been
1459                    inserted into cache */
1460                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1461             }
1462         }
1463         return -1;
1464     }
1465
1466     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1467
1468     /* save current buffer into memory */
1469     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1470
1471     /* XBZRLE encoding (if there is no overflow) */
1472     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1473                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1474                                        TARGET_PAGE_SIZE);
1475     if (encoded_len == 0) {
1476         trace_save_xbzrle_page_skipping();
1477         return 0;
1478     } else if (encoded_len == -1) {
1479         trace_save_xbzrle_page_overflow();
1480         xbzrle_counters.overflow++;
1481         /* update data in the cache */
1482         if (!last_stage) {
1483             memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
1484             *current_data = prev_cached_page;
1485         }
1486         return -1;
1487     }
1488
1489     /* we need to update the data in the cache, in order to get the same data */
1490     if (!last_stage) {
1491         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1492     }
1493
1494     /* Send XBZRLE based compressed page */
1495     bytes_xbzrle = save_page_header(rs, rs->f, block,
1496                                     offset | RAM_SAVE_FLAG_XBZRLE);
1497     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1498     qemu_put_be16(rs->f, encoded_len);
1499     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
1500     bytes_xbzrle += encoded_len + 1 + 2;
1501     xbzrle_counters.pages++;
1502     xbzrle_counters.bytes += bytes_xbzrle;
1503     ram_counters.transferred += bytes_xbzrle;
1504
1505     return 1;
1506 }
1507
1508 /**
1509  * migration_bitmap_find_dirty: find the next dirty page from start
1510  *
1511  * Called with rcu_read_lock() to protect migration_bitmap
1512  *
1513  * Returns the byte offset within memory region of the start of a dirty page
1514  *
1515  * @rs: current RAM state
1516  * @rb: RAMBlock where to search for dirty pages
1517  * @start: page where we start the search
1518  */
1519 static inline
1520 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
1521                                           unsigned long start)
1522 {
1523     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1524     unsigned long *bitmap = rb->bmap;
1525     unsigned long next;
1526
1527     if (!qemu_ram_is_migratable(rb)) {
1528         return size;
1529     }
1530
1531     if (rs->ram_bulk_stage && start > 0) {
1532         next = start + 1;
1533     } else {
1534         next = find_next_bit(bitmap, size, start);
1535     }
1536
1537     return next;
1538 }
1539
1540 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1541                                                 RAMBlock *rb,
1542                                                 unsigned long page)
1543 {
1544     bool ret;
1545
1546     ret = test_and_clear_bit(page, rb->bmap);
1547
1548     if (ret) {
1549         rs->migration_dirty_pages--;
1550     }
1551     return ret;
1552 }
1553
1554 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
1555                                         ram_addr_t start, ram_addr_t length)
1556 {
1557     rs->migration_dirty_pages +=
1558         cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
1559                                               &rs->num_dirty_pages_period);
1560 }
1561
1562 /**
1563  * ram_pagesize_summary: calculate all the pagesizes of a VM
1564  *
1565  * Returns a summary bitmap of the page sizes of all RAMBlocks
1566  *
1567  * For VMs with just normal pages this is equivalent to the host page
1568  * size. If it's got some huge pages then it's the OR of all the
1569  * different page sizes.
1570  */
1571 uint64_t ram_pagesize_summary(void)
1572 {
1573     RAMBlock *block;
1574     uint64_t summary = 0;
1575
1576     RAMBLOCK_FOREACH_MIGRATABLE(block) {
1577         summary |= block->page_size;
1578     }
1579
1580     return summary;
1581 }
1582
1583 static void migration_update_rates(RAMState *rs, int64_t end_time)
1584 {
1585     uint64_t iter_count = rs->iterations - rs->iterations_prev;
1586
1587     /* calculate period counters */
1588     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1589                 / (end_time - rs->time_last_bitmap_sync);
1590
1591     if (!iter_count) {
1592         return;
1593     }
1594
1595     if (migrate_use_xbzrle()) {
1596         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1597             rs->xbzrle_cache_miss_prev) / iter_count;
1598         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1599     }
1600 }
1601
1602 static void migration_bitmap_sync(RAMState *rs)
1603 {
1604     RAMBlock *block;
1605     int64_t end_time;
1606     uint64_t bytes_xfer_now;
1607
1608     ram_counters.dirty_sync_count++;
1609
1610     if (!rs->time_last_bitmap_sync) {
1611         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1612     }
1613
1614     trace_migration_bitmap_sync_start();
1615     memory_global_dirty_log_sync();
1616
1617     qemu_mutex_lock(&rs->bitmap_mutex);
1618     rcu_read_lock();
1619     RAMBLOCK_FOREACH_MIGRATABLE(block) {
1620         migration_bitmap_sync_range(rs, block, 0, block->used_length);
1621     }
1622     ram_counters.remaining = ram_bytes_remaining();
1623     rcu_read_unlock();
1624     qemu_mutex_unlock(&rs->bitmap_mutex);
1625
1626     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1627
1628     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1629
1630     /* more than 1 second = 1000 millisecons */
1631     if (end_time > rs->time_last_bitmap_sync + 1000) {
1632         bytes_xfer_now = ram_counters.transferred;
1633
1634         /* During block migration the auto-converge logic incorrectly detects
1635          * that ram migration makes no progress. Avoid this by disabling the
1636          * throttling logic during the bulk phase of block migration. */
1637         if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1638             /* The following detection logic can be refined later. For now:
1639                Check to see if the dirtied bytes is 50% more than the approx.
1640                amount of bytes that just got transferred since the last time we
1641                were in this routine. If that happens twice, start or increase
1642                throttling */
1643
1644             if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
1645                    (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
1646                 (++rs->dirty_rate_high_cnt >= 2)) {
1647                     trace_migration_throttle();
1648                     rs->dirty_rate_high_cnt = 0;
1649                     mig_throttle_guest_down();
1650             }
1651         }
1652
1653         migration_update_rates(rs, end_time);
1654
1655         rs->iterations_prev = rs->iterations;
1656
1657         /* reset period counters */
1658         rs->time_last_bitmap_sync = end_time;
1659         rs->num_dirty_pages_period = 0;
1660         rs->bytes_xfer_prev = bytes_xfer_now;
1661     }
1662     if (migrate_use_events()) {
1663         qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
1664     }
1665 }
1666
1667 /**
1668  * save_zero_page: send the zero page to the stream
1669  *
1670  * Returns the number of pages written.
1671  *
1672  * @rs: current RAM state
1673  * @block: block that contains the page we want to send
1674  * @offset: offset inside the block for the page
1675  */
1676 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1677 {
1678     uint8_t *p = block->host + offset;
1679     int pages = -1;
1680
1681     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1682         ram_counters.duplicate++;
1683         ram_counters.transferred +=
1684             save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
1685         qemu_put_byte(rs->f, 0);
1686         ram_counters.transferred += 1;
1687         pages = 1;
1688     }
1689
1690     return pages;
1691 }
1692
1693 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1694 {
1695     if (!migrate_release_ram() || !migration_in_postcopy()) {
1696         return;
1697     }
1698
1699     ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
1700 }
1701
1702 /*
1703  * @pages: the number of pages written by the control path,
1704  *        < 0 - error
1705  *        > 0 - number of pages written
1706  *
1707  * Return true if the pages has been saved, otherwise false is returned.
1708  */
1709 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1710                               int *pages)
1711 {
1712     uint64_t bytes_xmit = 0;
1713     int ret;
1714
1715     *pages = -1;
1716     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1717                                 &bytes_xmit);
1718     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1719         return false;
1720     }
1721
1722     if (bytes_xmit) {
1723         ram_counters.transferred += bytes_xmit;
1724         *pages = 1;
1725     }
1726
1727     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1728         return true;
1729     }
1730
1731     if (bytes_xmit > 0) {
1732         ram_counters.normal++;
1733     } else if (bytes_xmit == 0) {
1734         ram_counters.duplicate++;
1735     }
1736
1737     return true;
1738 }
1739
1740 /*
1741  * directly send the page to the stream
1742  *
1743  * Returns the number of pages written.
1744  *
1745  * @rs: current RAM state
1746  * @block: block that contains the page we want to send
1747  * @offset: offset inside the block for the page
1748  * @buf: the page to be sent
1749  * @async: send to page asyncly
1750  */
1751 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1752                             uint8_t *buf, bool async)
1753 {
1754     ram_counters.transferred += save_page_header(rs, rs->f, block,
1755                                                  offset | RAM_SAVE_FLAG_PAGE);
1756     if (async) {
1757         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1758                               migrate_release_ram() &
1759                               migration_in_postcopy());
1760     } else {
1761         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1762     }
1763     ram_counters.transferred += TARGET_PAGE_SIZE;
1764     ram_counters.normal++;
1765     return 1;
1766 }
1767
1768 /**
1769  * ram_save_page: send the given page to the stream
1770  *
1771  * Returns the number of pages written.
1772  *          < 0 - error
1773  *          >=0 - Number of pages written - this might legally be 0
1774  *                if xbzrle noticed the page was the same.
1775  *
1776  * @rs: current RAM state
1777  * @block: block that contains the page we want to send
1778  * @offset: offset inside the block for the page
1779  * @last_stage: if we are at the completion stage
1780  */
1781 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1782 {
1783     int pages = -1;
1784     uint8_t *p;
1785     bool send_async = true;
1786     RAMBlock *block = pss->block;
1787     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1788     ram_addr_t current_addr = block->offset + offset;
1789
1790     p = block->host + offset;
1791     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1792
1793     XBZRLE_cache_lock();
1794     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1795         migrate_use_xbzrle()) {
1796         pages = save_xbzrle_page(rs, &p, current_addr, block,
1797                                  offset, last_stage);
1798         if (!last_stage) {
1799             /* Can't send this cached data async, since the cache page
1800              * might get updated before it gets to the wire
1801              */
1802             send_async = false;
1803         }
1804     }
1805
1806     /* XBZRLE overflow or normal page */
1807     if (pages == -1) {
1808         pages = save_normal_page(rs, block, offset, p, send_async);
1809     }
1810
1811     XBZRLE_cache_unlock();
1812
1813     return pages;
1814 }
1815
1816 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1817                                  ram_addr_t offset)
1818 {
1819     multifd_queue_page(block, offset);
1820     ram_counters.normal++;
1821
1822     return 1;
1823 }
1824
1825 static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1826                                 ram_addr_t offset, uint8_t *source_buf)
1827 {
1828     RAMState *rs = ram_state;
1829     int bytes_sent, blen;
1830     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1831
1832     bytes_sent = save_page_header(rs, f, block, offset |
1833                                   RAM_SAVE_FLAG_COMPRESS_PAGE);
1834
1835     /*
1836      * copy it to a internal buffer to avoid it being modified by VM
1837      * so that we can catch up the error during compression and
1838      * decompression
1839      */
1840     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1841     blen = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1842     if (blen < 0) {
1843         bytes_sent = 0;
1844         qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
1845         error_report("compressed data failed!");
1846     } else {
1847         bytes_sent += blen;
1848         ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1849     }
1850
1851     return bytes_sent;
1852 }
1853
1854 static void flush_compressed_data(RAMState *rs)
1855 {
1856     int idx, len, thread_count;
1857
1858     if (!migrate_use_compression()) {
1859         return;
1860     }
1861     thread_count = migrate_compress_threads();
1862
1863     qemu_mutex_lock(&comp_done_lock);
1864     for (idx = 0; idx < thread_count; idx++) {
1865         while (!comp_param[idx].done) {
1866             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1867         }
1868     }
1869     qemu_mutex_unlock(&comp_done_lock);
1870
1871     for (idx = 0; idx < thread_count; idx++) {
1872         qemu_mutex_lock(&comp_param[idx].mutex);
1873         if (!comp_param[idx].quit) {
1874             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1875             ram_counters.transferred += len;
1876         }
1877         qemu_mutex_unlock(&comp_param[idx].mutex);
1878     }
1879 }
1880
1881 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1882                                        ram_addr_t offset)
1883 {
1884     param->block = block;
1885     param->offset = offset;
1886 }
1887
1888 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1889                                            ram_addr_t offset)
1890 {
1891     int idx, thread_count, bytes_xmit = -1, pages = -1;
1892
1893     thread_count = migrate_compress_threads();
1894     qemu_mutex_lock(&comp_done_lock);
1895     while (true) {
1896         for (idx = 0; idx < thread_count; idx++) {
1897             if (comp_param[idx].done) {
1898                 comp_param[idx].done = false;
1899                 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1900                 qemu_mutex_lock(&comp_param[idx].mutex);
1901                 set_compress_params(&comp_param[idx], block, offset);
1902                 qemu_cond_signal(&comp_param[idx].cond);
1903                 qemu_mutex_unlock(&comp_param[idx].mutex);
1904                 pages = 1;
1905                 ram_counters.normal++;
1906                 ram_counters.transferred += bytes_xmit;
1907                 break;
1908             }
1909         }
1910         if (pages > 0) {
1911             break;
1912         } else {
1913             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1914         }
1915     }
1916     qemu_mutex_unlock(&comp_done_lock);
1917
1918     return pages;
1919 }
1920
1921 /**
1922  * find_dirty_block: find the next dirty page and update any state
1923  * associated with the search process.
1924  *
1925  * Returns if a page is found
1926  *
1927  * @rs: current RAM state
1928  * @pss: data about the state of the current dirty page scan
1929  * @again: set to false if the search has scanned the whole of RAM
1930  */
1931 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1932 {
1933     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1934     if (pss->complete_round && pss->block == rs->last_seen_block &&
1935         pss->page >= rs->last_page) {
1936         /*
1937          * We've been once around the RAM and haven't found anything.
1938          * Give up.
1939          */
1940         *again = false;
1941         return false;
1942     }
1943     if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1944         /* Didn't find anything in this RAM Block */
1945         pss->page = 0;
1946         pss->block = QLIST_NEXT_RCU(pss->block, next);
1947         if (!pss->block) {
1948             /* Hit the end of the list */
1949             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1950             /* Flag that we've looped */
1951             pss->complete_round = true;
1952             rs->ram_bulk_stage = false;
1953             if (migrate_use_xbzrle()) {
1954                 /* If xbzrle is on, stop using the data compression at this
1955                  * point. In theory, xbzrle can do better than compression.
1956                  */
1957                 flush_compressed_data(rs);
1958             }
1959         }
1960         /* Didn't find anything this time, but try again on the new block */
1961         *again = true;
1962         return false;
1963     } else {
1964         /* Can go around again, but... */
1965         *again = true;
1966         /* We've found something so probably don't need to */
1967         return true;
1968     }
1969 }
1970
1971 /**
1972  * unqueue_page: gets a page of the queue
1973  *
1974  * Helper for 'get_queued_page' - gets a page off the queue
1975  *
1976  * Returns the block of the page (or NULL if none available)
1977  *
1978  * @rs: current RAM state
1979  * @offset: used to return the offset within the RAMBlock
1980  */
1981 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1982 {
1983     RAMBlock *block = NULL;
1984
1985     qemu_mutex_lock(&rs->src_page_req_mutex);
1986     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1987         struct RAMSrcPageRequest *entry =
1988                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1989         block = entry->rb;
1990         *offset = entry->offset;
1991
1992         if (entry->len > TARGET_PAGE_SIZE) {
1993             entry->len -= TARGET_PAGE_SIZE;
1994             entry->offset += TARGET_PAGE_SIZE;
1995         } else {
1996             memory_region_unref(block->mr);
1997             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1998             g_free(entry);
1999             migration_consume_urgent_request();
2000         }
2001     }
2002     qemu_mutex_unlock(&rs->src_page_req_mutex);
2003
2004     return block;
2005 }
2006
2007 /**
2008  * get_queued_page: unqueue a page from the postocpy requests
2009  *
2010  * Skips pages that are already sent (!dirty)
2011  *
2012  * Returns if a queued page is found
2013  *
2014  * @rs: current RAM state
2015  * @pss: data about the state of the current dirty page scan
2016  */
2017 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2018 {
2019     RAMBlock  *block;
2020     ram_addr_t offset;
2021     bool dirty;
2022
2023     do {
2024         block = unqueue_page(rs, &offset);
2025         /*
2026          * We're sending this page, and since it's postcopy nothing else
2027          * will dirty it, and we must make sure it doesn't get sent again
2028          * even if this queue request was received after the background
2029          * search already sent it.
2030          */
2031         if (block) {
2032             unsigned long page;
2033
2034             page = offset >> TARGET_PAGE_BITS;
2035             dirty = test_bit(page, block->bmap);
2036             if (!dirty) {
2037                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2038                        page, test_bit(page, block->unsentmap));
2039             } else {
2040                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2041             }
2042         }
2043
2044     } while (block && !dirty);
2045
2046     if (block) {
2047         /*
2048          * As soon as we start servicing pages out of order, then we have
2049          * to kill the bulk stage, since the bulk stage assumes
2050          * in (migration_bitmap_find_and_reset_dirty) that every page is
2051          * dirty, that's no longer true.
2052          */
2053         rs->ram_bulk_stage = false;
2054
2055         /*
2056          * We want the background search to continue from the queued page
2057          * since the guest is likely to want other pages near to the page
2058          * it just requested.
2059          */
2060         pss->block = block;
2061         pss->page = offset >> TARGET_PAGE_BITS;
2062     }
2063
2064     return !!block;
2065 }
2066
2067 /**
2068  * migration_page_queue_free: drop any remaining pages in the ram
2069  * request queue
2070  *
2071  * It should be empty at the end anyway, but in error cases there may
2072  * be some left.  in case that there is any page left, we drop it.
2073  *
2074  */
2075 static void migration_page_queue_free(RAMState *rs)
2076 {
2077     struct RAMSrcPageRequest *mspr, *next_mspr;
2078     /* This queue generally should be empty - but in the case of a failed
2079      * migration might have some droppings in.
2080      */
2081     rcu_read_lock();
2082     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2083         memory_region_unref(mspr->rb->mr);
2084         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2085         g_free(mspr);
2086     }
2087     rcu_read_unlock();
2088 }
2089
2090 /**
2091  * ram_save_queue_pages: queue the page for transmission
2092  *
2093  * A request from postcopy destination for example.
2094  *
2095  * Returns zero on success or negative on error
2096  *
2097  * @rbname: Name of the RAMBLock of the request. NULL means the
2098  *          same that last one.
2099  * @start: starting address from the start of the RAMBlock
2100  * @len: length (in bytes) to send
2101  */
2102 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2103 {
2104     RAMBlock *ramblock;
2105     RAMState *rs = ram_state;
2106
2107     ram_counters.postcopy_requests++;
2108     rcu_read_lock();
2109     if (!rbname) {
2110         /* Reuse last RAMBlock */
2111         ramblock = rs->last_req_rb;
2112
2113         if (!ramblock) {
2114             /*
2115              * Shouldn't happen, we can't reuse the last RAMBlock if
2116              * it's the 1st request.
2117              */
2118             error_report("ram_save_queue_pages no previous block");
2119             goto err;
2120         }
2121     } else {
2122         ramblock = qemu_ram_block_by_name(rbname);
2123
2124         if (!ramblock) {
2125             /* We shouldn't be asked for a non-existent RAMBlock */
2126             error_report("ram_save_queue_pages no block '%s'", rbname);
2127             goto err;
2128         }
2129         rs->last_req_rb = ramblock;
2130     }
2131     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2132     if (start+len > ramblock->used_length) {
2133         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2134                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2135                      __func__, start, len, ramblock->used_length);
2136         goto err;
2137     }
2138
2139     struct RAMSrcPageRequest *new_entry =
2140         g_malloc0(sizeof(struct RAMSrcPageRequest));
2141     new_entry->rb = ramblock;
2142     new_entry->offset = start;
2143     new_entry->len = len;
2144
2145     memory_region_ref(ramblock->mr);
2146     qemu_mutex_lock(&rs->src_page_req_mutex);
2147     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2148     migration_make_urgent_request();
2149     qemu_mutex_unlock(&rs->src_page_req_mutex);
2150     rcu_read_unlock();
2151
2152     return 0;
2153
2154 err:
2155     rcu_read_unlock();
2156     return -1;
2157 }
2158
2159 static bool save_page_use_compression(RAMState *rs)
2160 {
2161     if (!migrate_use_compression()) {
2162         return false;
2163     }
2164
2165     /*
2166      * If xbzrle is on, stop using the data compression after first
2167      * round of migration even if compression is enabled. In theory,
2168      * xbzrle can do better than compression.
2169      */
2170     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
2171         return true;
2172     }
2173
2174     return false;
2175 }
2176
2177 /**
2178  * ram_save_target_page: save one target page
2179  *
2180  * Returns the number of pages written
2181  *
2182  * @rs: current RAM state
2183  * @pss: data about the page we want to send
2184  * @last_stage: if we are at the completion stage
2185  */
2186 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
2187                                 bool last_stage)
2188 {
2189     RAMBlock *block = pss->block;
2190     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2191     int res;
2192
2193     if (control_save_page(rs, block, offset, &res)) {
2194         return res;
2195     }
2196
2197     /*
2198      * When starting the process of a new block, the first page of
2199      * the block should be sent out before other pages in the same
2200      * block, and all the pages in last block should have been sent
2201      * out, keeping this order is important, because the 'cont' flag
2202      * is used to avoid resending the block name.
2203      */
2204     if (block != rs->last_sent_block && save_page_use_compression(rs)) {
2205             flush_compressed_data(rs);
2206     }
2207
2208     res = save_zero_page(rs, block, offset);
2209     if (res > 0) {
2210         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2211          * page would be stale
2212          */
2213         if (!save_page_use_compression(rs)) {
2214             XBZRLE_cache_lock();
2215             xbzrle_cache_zero_page(rs, block->offset + offset);
2216             XBZRLE_cache_unlock();
2217         }
2218         ram_release_pages(block->idstr, offset, res);
2219         return res;
2220     }
2221
2222     /*
2223      * Make sure the first page is sent out before other pages.
2224      *
2225      * we post it as normal page as compression will take much
2226      * CPU resource.
2227      */
2228     if (block == rs->last_sent_block && save_page_use_compression(rs)) {
2229         return compress_page_with_multi_thread(rs, block, offset);
2230     } else if (migrate_use_multifd()) {
2231         return ram_save_multifd_page(rs, block, offset);
2232     }
2233
2234     return ram_save_page(rs, pss, last_stage);
2235 }
2236
2237 /**
2238  * ram_save_host_page: save a whole host page
2239  *
2240  * Starting at *offset send pages up to the end of the current host
2241  * page. It's valid for the initial offset to point into the middle of
2242  * a host page in which case the remainder of the hostpage is sent.
2243  * Only dirty target pages are sent. Note that the host page size may
2244  * be a huge page for this block.
2245  * The saving stops at the boundary of the used_length of the block
2246  * if the RAMBlock isn't a multiple of the host page size.
2247  *
2248  * Returns the number of pages written or negative on error
2249  *
2250  * @rs: current RAM state
2251  * @ms: current migration state
2252  * @pss: data about the page we want to send
2253  * @last_stage: if we are at the completion stage
2254  */
2255 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2256                               bool last_stage)
2257 {
2258     int tmppages, pages = 0;
2259     size_t pagesize_bits =
2260         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2261
2262     if (!qemu_ram_is_migratable(pss->block)) {
2263         error_report("block %s should not be migrated !", pss->block->idstr);
2264         return 0;
2265     }
2266
2267     do {
2268         /* Check the pages is dirty and if it is send it */
2269         if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2270             pss->page++;
2271             continue;
2272         }
2273
2274         tmppages = ram_save_target_page(rs, pss, last_stage);
2275         if (tmppages < 0) {
2276             return tmppages;
2277         }
2278
2279         pages += tmppages;
2280         if (pss->block->unsentmap) {
2281             clear_bit(pss->page, pss->block->unsentmap);
2282         }
2283
2284         pss->page++;
2285     } while ((pss->page & (pagesize_bits - 1)) &&
2286              offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
2287
2288     /* The offset we leave with is the last one we looked at */
2289     pss->page--;
2290     return pages;
2291 }
2292
2293 /**
2294  * ram_find_and_save_block: finds a dirty page and sends it to f
2295  *
2296  * Called within an RCU critical section.
2297  *
2298  * Returns the number of pages written where zero means no dirty pages
2299  *
2300  * @rs: current RAM state
2301  * @last_stage: if we are at the completion stage
2302  *
2303  * On systems where host-page-size > target-page-size it will send all the
2304  * pages in a host page that are dirty.
2305  */
2306
2307 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2308 {
2309     PageSearchStatus pss;
2310     int pages = 0;
2311     bool again, found;
2312
2313     /* No dirty page as there is zero RAM */
2314     if (!ram_bytes_total()) {
2315         return pages;
2316     }
2317
2318     pss.block = rs->last_seen_block;
2319     pss.page = rs->last_page;
2320     pss.complete_round = false;
2321
2322     if (!pss.block) {
2323         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2324     }
2325
2326     do {
2327         again = true;
2328         found = get_queued_page(rs, &pss);
2329
2330         if (!found) {
2331             /* priority queue empty, so just search for something dirty */
2332             found = find_dirty_block(rs, &pss, &again);
2333         }
2334
2335         if (found) {
2336             pages = ram_save_host_page(rs, &pss, last_stage);
2337         }
2338     } while (!pages && again);
2339
2340     rs->last_seen_block = pss.block;
2341     rs->last_page = pss.page;
2342
2343     return pages;
2344 }
2345
2346 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2347 {
2348     uint64_t pages = size / TARGET_PAGE_SIZE;
2349
2350     if (zero) {
2351         ram_counters.duplicate += pages;
2352     } else {
2353         ram_counters.normal += pages;
2354         ram_counters.transferred += size;
2355         qemu_update_position(f, size);
2356     }
2357 }
2358
2359 uint64_t ram_bytes_total(void)
2360 {
2361     RAMBlock *block;
2362     uint64_t total = 0;
2363
2364     rcu_read_lock();
2365     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2366         total += block->used_length;
2367     }
2368     rcu_read_unlock();
2369     return total;
2370 }
2371
2372 static void xbzrle_load_setup(void)
2373 {
2374     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2375 }
2376
2377 static void xbzrle_load_cleanup(void)
2378 {
2379     g_free(XBZRLE.decoded_buf);
2380     XBZRLE.decoded_buf = NULL;
2381 }
2382
2383 static void ram_state_cleanup(RAMState **rsp)
2384 {
2385     if (*rsp) {
2386         migration_page_queue_free(*rsp);
2387         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2388         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2389         g_free(*rsp);
2390         *rsp = NULL;
2391     }
2392 }
2393
2394 static void xbzrle_cleanup(void)
2395 {
2396     XBZRLE_cache_lock();
2397     if (XBZRLE.cache) {
2398         cache_fini(XBZRLE.cache);
2399         g_free(XBZRLE.encoded_buf);
2400         g_free(XBZRLE.current_buf);
2401         g_free(XBZRLE.zero_target_page);
2402         XBZRLE.cache = NULL;
2403         XBZRLE.encoded_buf = NULL;
2404         XBZRLE.current_buf = NULL;
2405         XBZRLE.zero_target_page = NULL;
2406     }
2407     XBZRLE_cache_unlock();
2408 }
2409
2410 static void ram_save_cleanup(void *opaque)
2411 {
2412     RAMState **rsp = opaque;
2413     RAMBlock *block;
2414
2415     /* caller have hold iothread lock or is in a bh, so there is
2416      * no writing race against this migration_bitmap
2417      */
2418     memory_global_dirty_log_stop();
2419
2420     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2421         g_free(block->bmap);
2422         block->bmap = NULL;
2423         g_free(block->unsentmap);
2424         block->unsentmap = NULL;
2425     }
2426
2427     xbzrle_cleanup();
2428     compress_threads_save_cleanup();
2429     ram_state_cleanup(rsp);
2430 }
2431
2432 static void ram_state_reset(RAMState *rs)
2433 {
2434     rs->last_seen_block = NULL;
2435     rs->last_sent_block = NULL;
2436     rs->last_page = 0;
2437     rs->last_version = ram_list.version;
2438     rs->ram_bulk_stage = true;
2439 }
2440
2441 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2442
/*
 * ram_debug_dump_bitmap: print a bitmap to stderr, 128 bits per line
 *
 * Each printed line starts with the index of its first bit in hex,
 * followed by one character per bit ('1' for set, '.' for clear).
 *
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 * If 'todump' is NULL there is nothing to inspect and nothing is printed.
 *
 * @todump: bitmap to dump
 * @expected: bit value that is considered uninteresting
 * @pages: number of bits in @todump
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    enum { LINE_BITS = 128 };
    char linebuf[LINE_BITS + 1];
    unsigned long cur, linelen;

    if (!todump) {
        return;
    }

    for (cur = 0; cur < pages; cur += linelen) {
        unsigned long curb;
        bool found = false;

        /*
         * Last line; catch the case where the line length
         * is longer than the remaining bits
         */
        linelen = pages - cur < LINE_BITS ? pages - cur : LINE_BITS;

        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);

            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            linebuf[linelen] = '\0';
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", (uint64_t)cur, linebuf);
        }
    }
}
2476
2477 /* **** functions for postcopy ***** */
2478
2479 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2480 {
2481     struct RAMBlock *block;
2482
2483     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2484         unsigned long *bitmap = block->bmap;
2485         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2486         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2487
2488         while (run_start < range) {
2489             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2490             ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
2491                               (run_end - run_start) << TARGET_PAGE_BITS);
2492             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2493         }
2494     }
2495 }
2496
2497 /**
2498  * postcopy_send_discard_bm_ram: discard a RAMBlock
2499  *
2500  * Returns zero on success
2501  *
2502  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2503  * Note: At this point the 'unsentmap' is the processed bitmap combined
2504  *       with the dirtymap; so a '1' means it's either dirty or unsent.
2505  *
2506  * @ms: current migration state
2507  * @pds: state for postcopy
2508  * @start: RAMBlock starting page
2509  * @length: RAMBlock size
2510  */
2511 static int postcopy_send_discard_bm_ram(MigrationState *ms,
2512                                         PostcopyDiscardState *pds,
2513                                         RAMBlock *block)
2514 {
2515     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2516     unsigned long current;
2517     unsigned long *unsentmap = block->unsentmap;
2518
2519     for (current = 0; current < end; ) {
2520         unsigned long one = find_next_bit(unsentmap, end, current);
2521
2522         if (one <= end) {
2523             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
2524             unsigned long discard_length;
2525
2526             if (zero >= end) {
2527                 discard_length = end - one;
2528             } else {
2529                 discard_length = zero - one;
2530             }
2531             if (discard_length) {
2532                 postcopy_discard_send_range(ms, pds, one, discard_length);
2533             }
2534             current = one + discard_length;
2535         } else {
2536             current = one;
2537         }
2538     }
2539
2540     return 0;
2541 }
2542
2543 /**
2544  * postcopy_each_ram_send_discard: discard all RAMBlocks
2545  *
2546  * Returns 0 for success or negative for error
2547  *
2548  * Utility for the outgoing postcopy code.
2549  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2550  *   passing it bitmap indexes and name.
2551  * (qemu_ram_foreach_block ends up passing unscaled lengths
2552  *  which would mean postcopy code would have to deal with target page)
2553  *
2554  * @ms: current migration state
2555  */
2556 static int postcopy_each_ram_send_discard(MigrationState *ms)
2557 {
2558     struct RAMBlock *block;
2559     int ret;
2560
2561     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2562         PostcopyDiscardState *pds =
2563             postcopy_discard_send_init(ms, block->idstr);
2564
2565         /*
2566          * Postcopy sends chunks of bitmap over the wire, but it
2567          * just needs indexes at this point, avoids it having
2568          * target page specific code.
2569          */
2570         ret = postcopy_send_discard_bm_ram(ms, pds, block);
2571         postcopy_discard_send_finish(ms, pds);
2572         if (ret) {
2573             return ret;
2574         }
2575     }
2576
2577     return 0;
2578 }
2579
2580 /**
2581  * postcopy_chunk_hostpages_pass: canocalize bitmap in hostpages
2582  *
2583  * Helper for postcopy_chunk_hostpages; it's called twice to
2584  * canonicalize the two bitmaps, that are similar, but one is
2585  * inverted.
2586  *
2587  * Postcopy requires that all target pages in a hostpage are dirty or
2588  * clean, not a mix.  This function canonicalizes the bitmaps.
2589  *
2590  * @ms: current migration state
2591  * @unsent_pass: if true we need to canonicalize partially unsent host pages
2592  *               otherwise we need to canonicalize partially dirty host pages
2593  * @block: block that contains the page we want to canonicalize
2594  * @pds: state for postcopy
2595  */
2596 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
2597                                           RAMBlock *block,
2598                                           PostcopyDiscardState *pds)
2599 {
2600     RAMState *rs = ram_state;
2601     unsigned long *bitmap = block->bmap;
2602     unsigned long *unsentmap = block->unsentmap;
2603     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2604     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2605     unsigned long run_start;
2606
2607     if (block->page_size == TARGET_PAGE_SIZE) {
2608         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2609         return;
2610     }
2611
2612     if (unsent_pass) {
2613         /* Find a sent page */
2614         run_start = find_next_zero_bit(unsentmap, pages, 0);
2615     } else {
2616         /* Find a dirty page */
2617         run_start = find_next_bit(bitmap, pages, 0);
2618     }
2619
2620     while (run_start < pages) {
2621         bool do_fixup = false;
2622         unsigned long fixup_start_addr;
2623         unsigned long host_offset;
2624
2625         /*
2626          * If the start of this run of pages is in the middle of a host
2627          * page, then we need to fixup this host page.
2628          */
2629         host_offset = run_start % host_ratio;
2630         if (host_offset) {
2631             do_fixup = true;
2632             run_start -= host_offset;
2633             fixup_start_addr = run_start;
2634             /* For the next pass */
2635             run_start = run_start + host_ratio;
2636         } else {
2637             /* Find the end of this run */
2638             unsigned long run_end;
2639             if (unsent_pass) {
2640                 run_end = find_next_bit(unsentmap, pages, run_start + 1);
2641             } else {
2642                 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
2643             }
2644             /*
2645              * If the end isn't at the start of a host page, then the
2646              * run doesn't finish at the end of a host page
2647              * and we need to discard.
2648              */
2649             host_offset = run_end % host_ratio;
2650             if (host_offset) {
2651                 do_fixup = true;
2652                 fixup_start_addr = run_end - host_offset;
2653                 /*
2654                  * This host page has gone, the next loop iteration starts
2655                  * from after the fixup
2656                  */
2657                 run_start = fixup_start_addr + host_ratio;
2658             } else {
2659                 /*
2660                  * No discards on this iteration, next loop starts from
2661                  * next sent/dirty page
2662                  */
2663                 run_start = run_end + 1;
2664             }
2665         }
2666
2667         if (do_fixup) {
2668             unsigned long page;
2669
2670             /* Tell the destination to discard this page */
2671             if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
2672                 /* For the unsent_pass we:
2673                  *     discard partially sent pages
2674                  * For the !unsent_pass (dirty) we:
2675                  *     discard partially dirty pages that were sent
2676                  *     (any partially sent pages were already discarded
2677                  *     by the previous unsent_pass)
2678                  */
2679                 postcopy_discard_send_range(ms, pds, fixup_start_addr,
2680                                             host_ratio);
2681             }
2682
2683             /* Clean up the bitmap */
2684             for (page = fixup_start_addr;
2685                  page < fixup_start_addr + host_ratio; page++) {
2686                 /* All pages in this host page are now not sent */
2687                 set_bit(page, unsentmap);
2688
2689                 /*
2690                  * Remark them as dirty, updating the count for any pages
2691                  * that weren't previously dirty.
2692                  */
2693                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2694             }
2695         }
2696
2697         if (unsent_pass) {
2698             /* Find the next sent page for the next iteration */
2699             run_start = find_next_zero_bit(unsentmap, pages, run_start);
2700         } else {
2701             /* Find the next dirty page for the next iteration */
2702             run_start = find_next_bit(bitmap, pages, run_start);
2703         }
2704     }
2705 }
2706
2707 /**
2708  * postcopy_chuck_hostpages: discrad any partially sent host page
2709  *
2710  * Utility for the outgoing postcopy code.
2711  *
2712  * Discard any partially sent host-page size chunks, mark any partially
2713  * dirty host-page size chunks as all dirty.  In this case the host-page
2714  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2715  *
2716  * Returns zero on success
2717  *
2718  * @ms: current migration state
2719  * @block: block we want to work with
2720  */
2721 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2722 {
2723     PostcopyDiscardState *pds =
2724         postcopy_discard_send_init(ms, block->idstr);
2725
2726     /* First pass: Discard all partially sent host pages */
2727     postcopy_chunk_hostpages_pass(ms, true, block, pds);
2728     /*
2729      * Second pass: Ensure that all partially dirty host pages are made
2730      * fully dirty.
2731      */
2732     postcopy_chunk_hostpages_pass(ms, false, block, pds);
2733
2734     postcopy_discard_send_finish(ms, pds);
2735     return 0;
2736 }
2737
2738 /**
2739  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2740  *
2741  * Returns zero on success
2742  *
2743  * Transmit the set of pages to be discarded after precopy to the target
2744  * these are pages that:
2745  *     a) Have been previously transmitted but are now dirty again
2746  *     b) Pages that have never been transmitted, this ensures that
2747  *        any pages on the destination that have been mapped by background
2748  *        tasks get discarded (transparent huge pages is the specific concern)
2749  * Hopefully this is pretty sparse
2750  *
2751  * @ms: current migration state
2752  */
2753 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2754 {
2755     RAMState *rs = ram_state;
2756     RAMBlock *block;
2757     int ret;
2758
2759     rcu_read_lock();
2760
2761     /* This should be our last sync, the src is now paused */
2762     migration_bitmap_sync(rs);
2763
2764     /* Easiest way to make sure we don't resume in the middle of a host-page */
2765     rs->last_seen_block = NULL;
2766     rs->last_sent_block = NULL;
2767     rs->last_page = 0;
2768
2769     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2770         unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2771         unsigned long *bitmap = block->bmap;
2772         unsigned long *unsentmap = block->unsentmap;
2773
2774         if (!unsentmap) {
2775             /* We don't have a safe way to resize the sentmap, so
2776              * if the bitmap was resized it will be NULL at this
2777              * point.
2778              */
2779             error_report("migration ram resized during precopy phase");
2780             rcu_read_unlock();
2781             return -EINVAL;
2782         }
2783         /* Deal with TPS != HPS and huge pages */
2784         ret = postcopy_chunk_hostpages(ms, block);
2785         if (ret) {
2786             rcu_read_unlock();
2787             return ret;
2788         }
2789
2790         /*
2791          * Update the unsentmap to be unsentmap = unsentmap | dirty
2792          */
2793         bitmap_or(unsentmap, unsentmap, bitmap, pages);
2794 #ifdef DEBUG_POSTCOPY
2795         ram_debug_dump_bitmap(unsentmap, true, pages);
2796 #endif
2797     }
2798     trace_ram_postcopy_send_discard_bitmap();
2799
2800     ret = postcopy_each_ram_send_discard(ms);
2801     rcu_read_unlock();
2802
2803     return ret;
2804 }
2805
2806 /**
2807  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2808  *
2809  * Returns zero on success
2810  *
2811  * @rbname: name of the RAMBlock of the request. NULL means the
2812  *          same that last one.
2813  * @start: RAMBlock starting page
2814  * @length: RAMBlock size
2815  */
2816 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2817 {
2818     int ret = -1;
2819
2820     trace_ram_discard_range(rbname, start, length);
2821
2822     rcu_read_lock();
2823     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2824
2825     if (!rb) {
2826         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2827         goto err;
2828     }
2829
2830     bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2831                  length >> qemu_target_page_bits());
2832     ret = ram_block_discard_range(rb, start, length);
2833
2834 err:
2835     rcu_read_unlock();
2836
2837     return ret;
2838 }
2839
2840 /*
2841  * For every allocation, we will try not to crash the VM if the
2842  * allocation failed.
2843  */
2844 static int xbzrle_init(void)
2845 {
2846     Error *local_err = NULL;
2847
2848     if (!migrate_use_xbzrle()) {
2849         return 0;
2850     }
2851
2852     XBZRLE_cache_lock();
2853
2854     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2855     if (!XBZRLE.zero_target_page) {
2856         error_report("%s: Error allocating zero page", __func__);
2857         goto err_out;
2858     }
2859
2860     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2861                               TARGET_PAGE_SIZE, &local_err);
2862     if (!XBZRLE.cache) {
2863         error_report_err(local_err);
2864         goto free_zero_page;
2865     }
2866
2867     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2868     if (!XBZRLE.encoded_buf) {
2869         error_report("%s: Error allocating encoded_buf", __func__);
2870         goto free_cache;
2871     }
2872
2873     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2874     if (!XBZRLE.current_buf) {
2875         error_report("%s: Error allocating current_buf", __func__);
2876         goto free_encoded_buf;
2877     }
2878
2879     /* We are all good */
2880     XBZRLE_cache_unlock();
2881     return 0;
2882
2883 free_encoded_buf:
2884     g_free(XBZRLE.encoded_buf);
2885     XBZRLE.encoded_buf = NULL;
2886 free_cache:
2887     cache_fini(XBZRLE.cache);
2888     XBZRLE.cache = NULL;
2889 free_zero_page:
2890     g_free(XBZRLE.zero_target_page);
2891     XBZRLE.zero_target_page = NULL;
2892 err_out:
2893     XBZRLE_cache_unlock();
2894     return -ENOMEM;
2895 }
2896
2897 static int ram_state_init(RAMState **rsp)
2898 {
2899     *rsp = g_try_new0(RAMState, 1);
2900
2901     if (!*rsp) {
2902         error_report("%s: Init ramstate fail", __func__);
2903         return -1;
2904     }
2905
2906     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2907     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2908     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2909
2910     /*
2911      * Count the total number of pages used by ram blocks not including any
2912      * gaps due to alignment or unplugs.
2913      */
2914     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2915
2916     ram_state_reset(*rsp);
2917
2918     return 0;
2919 }
2920
2921 static void ram_list_init_bitmaps(void)
2922 {
2923     RAMBlock *block;
2924     unsigned long pages;
2925
2926     /* Skip setting bitmap if there is no RAM */
2927     if (ram_bytes_total()) {
2928         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2929             pages = block->max_length >> TARGET_PAGE_BITS;
2930             block->bmap = bitmap_new(pages);
2931             bitmap_set(block->bmap, 0, pages);
2932             if (migrate_postcopy_ram()) {
2933                 block->unsentmap = bitmap_new(pages);
2934                 bitmap_set(block->unsentmap, 0, pages);
2935             }
2936         }
2937     }
2938 }
2939
2940 static void ram_init_bitmaps(RAMState *rs)
2941 {
2942     /* For memory_global_dirty_log_start below.  */
2943     qemu_mutex_lock_iothread();
2944     qemu_mutex_lock_ramlist();
2945     rcu_read_lock();
2946
2947     ram_list_init_bitmaps();
2948     memory_global_dirty_log_start();
2949     migration_bitmap_sync(rs);
2950
2951     rcu_read_unlock();
2952     qemu_mutex_unlock_ramlist();
2953     qemu_mutex_unlock_iothread();
2954 }
2955
2956 static int ram_init_all(RAMState **rsp)
2957 {
2958     if (ram_state_init(rsp)) {
2959         return -1;
2960     }
2961
2962     if (xbzrle_init()) {
2963         ram_state_cleanup(rsp);
2964         return -1;
2965     }
2966
2967     ram_init_bitmaps(*rsp);
2968
2969     return 0;
2970 }
2971
2972 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2973 {
2974     RAMBlock *block;
2975     uint64_t pages = 0;
2976
2977     /*
2978      * Postcopy is not using xbzrle/compression, so no need for that.
2979      * Also, since source are already halted, we don't need to care
2980      * about dirty page logging as well.
2981      */
2982
2983     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2984         pages += bitmap_count_one(block->bmap,
2985                                   block->used_length >> TARGET_PAGE_BITS);
2986     }
2987
2988     /* This may not be aligned with current bitmaps. Recalculate. */
2989     rs->migration_dirty_pages = pages;
2990
2991     rs->last_seen_block = NULL;
2992     rs->last_sent_block = NULL;
2993     rs->last_page = 0;
2994     rs->last_version = ram_list.version;
2995     /*
2996      * Disable the bulk stage, otherwise we'll resend the whole RAM no
2997      * matter what we have sent.
2998      */
2999     rs->ram_bulk_stage = false;
3000
3001     /* Update RAMState cache of output QEMUFile */
3002     rs->f = out;
3003
3004     trace_ram_state_resume_prepare(pages);
3005 }
3006
3007 /*
3008  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
3009  * long-running RCU critical section.  When rcu-reclaims in the code
3010  * start to become numerous it will be necessary to reduce the
3011  * granularity of these critical sections.
3012  */
3013
3014 /**
3015  * ram_save_setup: Setup RAM for migration
3016  *
3017  * Returns zero to indicate success and negative for error
3018  *
3019  * @f: QEMUFile where to send the data
3020  * @opaque: RAMState pointer
3021  */
3022 static int ram_save_setup(QEMUFile *f, void *opaque)
3023 {
3024     RAMState **rsp = opaque;
3025     RAMBlock *block;
3026
3027     if (compress_threads_save_setup()) {
3028         return -1;
3029     }
3030
3031     /* migration has already setup the bitmap, reuse it. */
3032     if (!migration_in_colo_state()) {
3033         if (ram_init_all(rsp) != 0) {
3034             compress_threads_save_cleanup();
3035             return -1;
3036         }
3037     }
3038     (*rsp)->f = f;
3039
3040     rcu_read_lock();
3041
3042     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
3043
3044     RAMBLOCK_FOREACH_MIGRATABLE(block) {
3045         qemu_put_byte(f, strlen(block->idstr));
3046         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3047         qemu_put_be64(f, block->used_length);
3048         if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
3049             qemu_put_be64(f, block->page_size);
3050         }
3051     }
3052
3053     rcu_read_unlock();
3054
3055     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3056     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3057
3058     multifd_send_sync_main();
3059     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3060     qemu_fflush(f);
3061
3062     return 0;
3063 }
3064
3065 /**
3066  * ram_save_iterate: iterative stage for migration
3067  *
3068  * Returns zero to indicate success and negative for error
3069  *
3070  * @f: QEMUFile where to send the data
3071  * @opaque: RAMState pointer
3072  */
3073 static int ram_save_iterate(QEMUFile *f, void *opaque)
3074 {
3075     RAMState **temp = opaque;
3076     RAMState *rs = *temp;
3077     int ret;
3078     int i;
3079     int64_t t0;
3080     int done = 0;
3081
3082     if (blk_mig_bulk_active()) {
3083         /* Avoid transferring ram during bulk phase of block migration as
3084          * the bulk phase will usually take a long time and transferring
3085          * ram updates during that time is pointless. */
3086         goto out;
3087     }
3088
3089     rcu_read_lock();
3090     if (ram_list.version != rs->last_version) {
3091         ram_state_reset(rs);
3092     }
3093
3094     /* Read version before ram_list.blocks */
3095     smp_rmb();
3096
3097     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3098
3099     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3100     i = 0;
3101     while ((ret = qemu_file_rate_limit(f)) == 0 ||
3102             !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
3103         int pages;
3104
3105         if (qemu_file_get_error(f)) {
3106             break;
3107         }
3108
3109         pages = ram_find_and_save_block(rs, false);
3110         /* no more pages to sent */
3111         if (pages == 0) {
3112             done = 1;
3113             break;
3114         }
3115         rs->iterations++;
3116
3117         /* we want to check in the 1st loop, just in case it was the 1st time
3118            and we had to sync the dirty bitmap.
3119            qemu_get_clock_ns() is a bit expensive, so we only check each some
3120            iterations
3121         */
3122         if ((i & 63) == 0) {
3123             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
3124             if (t1 > MAX_WAIT) {
3125                 trace_ram_save_iterate_big_wait(t1, i);
3126                 break;
3127             }
3128         }
3129         i++;
3130     }
3131     flush_compressed_data(rs);
3132     rcu_read_unlock();
3133
3134     /*
3135      * Must occur before EOS (or any QEMUFile operation)
3136      * because of RDMA protocol.
3137      */
3138     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3139
3140     multifd_send_sync_main();
3141 out:
3142     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3143     qemu_fflush(f);
3144     ram_counters.transferred += 8;
3145
3146     ret = qemu_file_get_error(f);
3147     if (ret < 0) {
3148         return ret;
3149     }
3150
3151     return done;
3152 }
3153
3154 /**
3155  * ram_save_complete: function called to send the remaining amount of ram
3156  *
3157  * Returns zero to indicate success
3158  *
3159  * Called with iothread lock
3160  *
3161  * @f: QEMUFile where to send the data
3162  * @opaque: RAMState pointer
3163  */
3164 static int ram_save_complete(QEMUFile *f, void *opaque)
3165 {
3166     RAMState **temp = opaque;
3167     RAMState *rs = *temp;
3168
3169     rcu_read_lock();
3170
3171     if (!migration_in_postcopy()) {
3172         migration_bitmap_sync(rs);
3173     }
3174
3175     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3176
3177     /* try transferring iterative blocks of memory */
3178
3179     /* flush all remaining blocks regardless of rate limiting */
3180     while (true) {
3181         int pages;
3182
3183         pages = ram_find_and_save_block(rs, !migration_in_colo_state());
3184         /* no more blocks to sent */
3185         if (pages == 0) {
3186             break;
3187         }
3188     }
3189
3190     flush_compressed_data(rs);
3191     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3192
3193     rcu_read_unlock();
3194
3195     multifd_send_sync_main();
3196     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3197     qemu_fflush(f);
3198
3199     return 0;
3200 }
3201
3202 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3203                              uint64_t *res_precopy_only,
3204                              uint64_t *res_compatible,
3205                              uint64_t *res_postcopy_only)
3206 {
3207     RAMState **temp = opaque;
3208     RAMState *rs = *temp;
3209     uint64_t remaining_size;
3210
3211     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3212
3213     if (!migration_in_postcopy() &&
3214         remaining_size < max_size) {
3215         qemu_mutex_lock_iothread();
3216         rcu_read_lock();
3217         migration_bitmap_sync(rs);
3218         rcu_read_unlock();
3219         qemu_mutex_unlock_iothread();
3220         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3221     }
3222
3223     if (migrate_postcopy_ram()) {
3224         /* We can do postcopy, and all the data is postcopiable */
3225         *res_compatible += remaining_size;
3226     } else {
3227         *res_precopy_only += remaining_size;
3228     }
3229 }
3230
3231 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3232 {
3233     unsigned int xh_len;
3234     int xh_flags;
3235     uint8_t *loaded_data;
3236
3237     /* extract RLE header */
3238     xh_flags = qemu_get_byte(f);
3239     xh_len = qemu_get_be16(f);
3240
3241     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3242         error_report("Failed to load XBZRLE page - wrong compression!");
3243         return -1;
3244     }
3245
3246     if (xh_len > TARGET_PAGE_SIZE) {
3247         error_report("Failed to load XBZRLE page - len overflow!");
3248         return -1;
3249     }
3250     loaded_data = XBZRLE.decoded_buf;
3251     /* load data and decode */
3252     /* it can change loaded_data to point to an internal buffer */
3253     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3254
3255     /* decode RLE */
3256     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3257                              TARGET_PAGE_SIZE) == -1) {
3258         error_report("Failed to load XBZRLE page - decode error!");
3259         return -1;
3260     }
3261
3262     return 0;
3263 }
3264
3265 /**
3266  * ram_block_from_stream: read a RAMBlock id from the migration stream
3267  *
3268  * Must be called from within a rcu critical section.
3269  *
3270  * Returns a pointer from within the RCU-protected ram_list.
3271  *
3272  * @f: QEMUFile where to read the data from
3273  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3274  */
3275 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3276 {
3277     static RAMBlock *block = NULL;
3278     char id[256];
3279     uint8_t len;
3280
3281     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3282         if (!block) {
3283             error_report("Ack, bad migration stream!");
3284             return NULL;
3285         }
3286         return block;
3287     }
3288
3289     len = qemu_get_byte(f);
3290     qemu_get_buffer(f, (uint8_t *)id, len);
3291     id[len] = 0;
3292
3293     block = qemu_ram_block_by_name(id);
3294     if (!block) {
3295         error_report("Can't find block %s", id);
3296         return NULL;
3297     }
3298
3299     if (!qemu_ram_is_migratable(block)) {
3300         error_report("block %s should not be migrated !", id);
3301         return NULL;
3302     }
3303
3304     return block;
3305 }
3306
3307 static inline void *host_from_ram_block_offset(RAMBlock *block,
3308                                                ram_addr_t offset)
3309 {
3310     if (!offset_in_ramblock(block, offset)) {
3311         return NULL;
3312     }
3313
3314     return block->host + offset;
3315 }
3316
/**
 * ram_handle_compressed: handle the zero page case
 *
 * Fill a page (or a whole RDMA chunk) with the byte @ch.  Memory that
 * already reads as zero is left alone when @ch is zero, so that it is
 * not written to needlessly.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Already in the wanted all-zero state: nothing to write */
    if (ch == 0 && is_zero_range(host, size)) {
        return;
    }

    memset(host, ch, size);
}
3333
3334 /* return the size after decompression, or negative value on error */
3335 static int
3336 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3337                      const uint8_t *source, size_t source_len)
3338 {
3339     int err;
3340
3341     err = inflateReset(stream);
3342     if (err != Z_OK) {
3343         return -1;
3344     }
3345
3346     stream->avail_in = source_len;
3347     stream->next_in = (uint8_t *)source;
3348     stream->avail_out = dest_len;
3349     stream->next_out = dest;
3350
3351     err = inflate(stream, Z_NO_FLUSH);
3352     if (err != Z_STREAM_END) {
3353         return -1;
3354     }
3355
3356     return stream->total_out;
3357 }
3358
3359 static void *do_data_decompress(void *opaque)
3360 {
3361     DecompressParam *param = opaque;
3362     unsigned long pagesize;
3363     uint8_t *des;
3364     int len, ret;
3365
3366     qemu_mutex_lock(&param->mutex);
3367     while (!param->quit) {
3368         if (param->des) {
3369             des = param->des;
3370             len = param->len;
3371             param->des = 0;
3372             qemu_mutex_unlock(&param->mutex);
3373
3374             pagesize = TARGET_PAGE_SIZE;
3375
3376             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3377                                        param->compbuf, len);
3378             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3379                 error_report("decompress data failed");
3380                 qemu_file_set_error(decomp_file, ret);
3381             }
3382
3383             qemu_mutex_lock(&decomp_done_lock);
3384             param->done = true;
3385             qemu_cond_signal(&decomp_done_cond);
3386             qemu_mutex_unlock(&decomp_done_lock);
3387
3388             qemu_mutex_lock(&param->mutex);
3389         } else {
3390             qemu_cond_wait(&param->cond, &param->mutex);
3391         }
3392     }
3393     qemu_mutex_unlock(&param->mutex);
3394
3395     return NULL;
3396 }
3397
3398 static int wait_for_decompress_done(void)
3399 {
3400     int idx, thread_count;
3401
3402     if (!migrate_use_compression()) {
3403         return 0;
3404     }
3405
3406     thread_count = migrate_decompress_threads();
3407     qemu_mutex_lock(&decomp_done_lock);
3408     for (idx = 0; idx < thread_count; idx++) {
3409         while (!decomp_param[idx].done) {
3410             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3411         }
3412     }
3413     qemu_mutex_unlock(&decomp_done_lock);
3414     return qemu_file_get_error(decomp_file);
3415 }
3416
3417 static void compress_threads_load_cleanup(void)
3418 {
3419     int i, thread_count;
3420
3421     if (!migrate_use_compression()) {
3422         return;
3423     }
3424     thread_count = migrate_decompress_threads();
3425     for (i = 0; i < thread_count; i++) {
3426         /*
3427          * we use it as a indicator which shows if the thread is
3428          * properly init'd or not
3429          */
3430         if (!decomp_param[i].compbuf) {
3431             break;
3432         }
3433
3434         qemu_mutex_lock(&decomp_param[i].mutex);
3435         decomp_param[i].quit = true;
3436         qemu_cond_signal(&decomp_param[i].cond);
3437         qemu_mutex_unlock(&decomp_param[i].mutex);
3438     }
3439     for (i = 0; i < thread_count; i++) {
3440         if (!decomp_param[i].compbuf) {
3441             break;
3442         }
3443
3444         qemu_thread_join(decompress_threads + i);
3445         qemu_mutex_destroy(&decomp_param[i].mutex);
3446         qemu_cond_destroy(&decomp_param[i].cond);
3447         inflateEnd(&decomp_param[i].stream);
3448         g_free(decomp_param[i].compbuf);
3449         decomp_param[i].compbuf = NULL;
3450     }
3451     g_free(decompress_threads);
3452     g_free(decomp_param);
3453     decompress_threads = NULL;
3454     decomp_param = NULL;
3455     decomp_file = NULL;
3456 }
3457
3458 static int compress_threads_load_setup(QEMUFile *f)
3459 {
3460     int i, thread_count;
3461
3462     if (!migrate_use_compression()) {
3463         return 0;
3464     }
3465
3466     thread_count = migrate_decompress_threads();
3467     decompress_threads = g_new0(QemuThread, thread_count);
3468     decomp_param = g_new0(DecompressParam, thread_count);
3469     qemu_mutex_init(&decomp_done_lock);
3470     qemu_cond_init(&decomp_done_cond);
3471     decomp_file = f;
3472     for (i = 0; i < thread_count; i++) {
3473         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3474             goto exit;
3475         }
3476
3477         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3478         qemu_mutex_init(&decomp_param[i].mutex);
3479         qemu_cond_init(&decomp_param[i].cond);
3480         decomp_param[i].done = true;
3481         decomp_param[i].quit = false;
3482         qemu_thread_create(decompress_threads + i, "decompress",
3483                            do_data_decompress, decomp_param + i,
3484                            QEMU_THREAD_JOINABLE);
3485     }
3486     return 0;
3487 exit:
3488     compress_threads_load_cleanup();
3489     return -1;
3490 }
3491
3492 static void decompress_data_with_multi_threads(QEMUFile *f,
3493                                                void *host, int len)
3494 {
3495     int idx, thread_count;
3496
3497     thread_count = migrate_decompress_threads();
3498     qemu_mutex_lock(&decomp_done_lock);
3499     while (true) {
3500         for (idx = 0; idx < thread_count; idx++) {
3501             if (decomp_param[idx].done) {
3502                 decomp_param[idx].done = false;
3503                 qemu_mutex_lock(&decomp_param[idx].mutex);
3504                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3505                 decomp_param[idx].des = host;
3506                 decomp_param[idx].len = len;
3507                 qemu_cond_signal(&decomp_param[idx].cond);
3508                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3509                 break;
3510             }
3511         }
3512         if (idx < thread_count) {
3513             break;
3514         } else {
3515             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3516         }
3517     }
3518     qemu_mutex_unlock(&decomp_done_lock);
3519 }
3520
3521 /**
3522  * ram_load_setup: Setup RAM for migration incoming side
3523  *
3524  * Returns zero to indicate success and negative for error
3525  *
3526  * @f: QEMUFile where to receive the data
3527  * @opaque: RAMState pointer
3528  */
3529 static int ram_load_setup(QEMUFile *f, void *opaque)
3530 {
3531     if (compress_threads_load_setup(f)) {
3532         return -1;
3533     }
3534
3535     xbzrle_load_setup();
3536     ramblock_recv_map_init();
3537     return 0;
3538 }
3539
3540 static int ram_load_cleanup(void *opaque)
3541 {
3542     RAMBlock *rb;
3543     xbzrle_load_cleanup();
3544     compress_threads_load_cleanup();
3545
3546     RAMBLOCK_FOREACH_MIGRATABLE(rb) {
3547         g_free(rb->receivedmap);
3548         rb->receivedmap = NULL;
3549     }
3550     return 0;
3551 }
3552
3553 /**
3554  * ram_postcopy_incoming_init: allocate postcopy data structures
3555  *
3556  * Returns 0 for success and negative if there was one error
3557  *
3558  * @mis: current migration incoming state
3559  *
3560  * Allocate data structures etc needed by incoming migration with
3561  * postcopy-ram. postcopy-ram's similarly names
3562  * postcopy_ram_incoming_init does the work.
3563  */
3564 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3565 {
3566     return postcopy_ram_incoming_init(mis);
3567 }
3568
3569 /**
3570  * ram_load_postcopy: load a page in postcopy case
3571  *
3572  * Returns 0 for success or -errno in case of error
3573  *
3574  * Called in postcopy mode by ram_load().
3575  * rcu_read_lock is taken prior to this being called.
3576  *
3577  * @f: QEMUFile where to send the data
3578  */
3579 static int ram_load_postcopy(QEMUFile *f)
3580 {
3581     int flags = 0, ret = 0;
3582     bool place_needed = false;
3583     bool matches_target_page_size = false;
3584     MigrationIncomingState *mis = migration_incoming_get_current();
3585     /* Temporary page that is later 'placed' */
3586     void *postcopy_host_page = postcopy_get_tmp_page(mis);
3587     void *last_host = NULL;
3588     bool all_zero = false;
3589
3590     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3591         ram_addr_t addr;
3592         void *host = NULL;
3593         void *page_buffer = NULL;
3594         void *place_source = NULL;
3595         RAMBlock *block = NULL;
3596         uint8_t ch;
3597
3598         addr = qemu_get_be64(f);
3599
3600         /*
3601          * If qemu file error, we should stop here, and then "addr"
3602          * may be invalid
3603          */
3604         ret = qemu_file_get_error(f);
3605         if (ret) {
3606             break;
3607         }
3608
3609         flags = addr & ~TARGET_PAGE_MASK;
3610         addr &= TARGET_PAGE_MASK;
3611
3612         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3613         place_needed = false;
3614         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
3615             block = ram_block_from_stream(f, flags);
3616
3617             host = host_from_ram_block_offset(block, addr);
3618             if (!host) {
3619                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3620                 ret = -EINVAL;
3621                 break;
3622             }
3623             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3624             /*
3625              * Postcopy requires that we place whole host pages atomically;
3626              * these may be huge pages for RAMBlocks that are backed by
3627              * hugetlbfs.
3628              * To make it atomic, the data is read into a temporary page
3629              * that's moved into place later.
3630              * The migration protocol uses,  possibly smaller, target-pages
3631              * however the source ensures it always sends all the components
3632              * of a host page in order.
3633              */
3634             page_buffer = postcopy_host_page +
3635                           ((uintptr_t)host & (block->page_size - 1));
3636             /* If all TP are zero then we can optimise the place */
3637             if (!((uintptr_t)host & (block->page_size - 1))) {
3638                 all_zero = true;
3639             } else {
3640                 /* not the 1st TP within the HP */
3641                 if (host != (last_host + TARGET_PAGE_SIZE)) {
3642                     error_report("Non-sequential target page %p/%p",
3643                                   host, last_host);
3644                     ret = -EINVAL;
3645                     break;
3646                 }
3647             }
3648
3649
3650             /*
3651              * If it's the last part of a host page then we place the host
3652              * page
3653              */
3654             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
3655                                      (block->page_size - 1)) == 0;
3656             place_source = postcopy_host_page;
3657         }
3658         last_host = host;
3659
3660         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3661         case RAM_SAVE_FLAG_ZERO:
3662             ch = qemu_get_byte(f);
3663             memset(page_buffer, ch, TARGET_PAGE_SIZE);
3664             if (ch) {
3665                 all_zero = false;
3666             }
3667             break;
3668
3669         case RAM_SAVE_FLAG_PAGE:
3670             all_zero = false;
3671             if (!matches_target_page_size) {
3672                 /* For huge pages, we always use temporary buffer */
3673                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3674             } else {
3675                 /*
3676                  * For small pages that matches target page size, we
3677                  * avoid the qemu_file copy.  Instead we directly use
3678                  * the buffer of QEMUFile to place the page.  Note: we
3679                  * cannot do any QEMUFile operation before using that
3680                  * buffer to make sure the buffer is valid when
3681                  * placing the page.
3682                  */
3683                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3684                                          TARGET_PAGE_SIZE);
3685             }
3686             break;
3687         case RAM_SAVE_FLAG_EOS:
3688             /* normal exit */
3689             multifd_recv_sync_main();
3690             break;
3691         default:
3692             error_report("Unknown combination of migration flags: %#x"
3693                          " (postcopy mode)", flags);
3694             ret = -EINVAL;
3695             break;
3696         }
3697
3698         /* Detect for any possible file errors */
3699         if (!ret && qemu_file_get_error(f)) {
3700             ret = qemu_file_get_error(f);
3701         }
3702
3703         if (!ret && place_needed) {
3704             /* This gets called at the last target page in the host page */
3705             void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
3706
3707             if (all_zero) {
3708                 ret = postcopy_place_page_zero(mis, place_dest,
3709                                                block);
3710             } else {
3711                 ret = postcopy_place_page(mis, place_dest,
3712                                           place_source, block);
3713             }
3714         }
3715     }
3716
3717     return ret;
3718 }
3719
3720 static bool postcopy_is_advised(void)
3721 {
3722     PostcopyState ps = postcopy_state_get();
3723     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3724 }
3725
3726 static bool postcopy_is_running(void)
3727 {
3728     PostcopyState ps = postcopy_state_get();
3729     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3730 }
3731
3732 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3733 {
3734     int flags = 0, ret = 0, invalid_flags = 0;
3735     static uint64_t seq_iter;
3736     int len = 0;
3737     /*
3738      * If system is running in postcopy mode, page inserts to host memory must
3739      * be atomic
3740      */
3741     bool postcopy_running = postcopy_is_running();
3742     /* ADVISE is earlier, it shows the source has the postcopy capability on */
3743     bool postcopy_advised = postcopy_is_advised();
3744
3745     seq_iter++;
3746
3747     if (version_id != 4) {
3748         ret = -EINVAL;
3749     }
3750
3751     if (!migrate_use_compression()) {
3752         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3753     }
3754     /* This RCU critical section can be very long running.
3755      * When RCU reclaims in the code start to become numerous,
3756      * it will be necessary to reduce the granularity of this
3757      * critical section.
3758      */
3759     rcu_read_lock();
3760
3761     if (postcopy_running) {
3762         ret = ram_load_postcopy(f);
3763     }
3764
3765     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3766         ram_addr_t addr, total_ram_bytes;
3767         void *host = NULL;
3768         uint8_t ch;
3769
3770         addr = qemu_get_be64(f);
3771         flags = addr & ~TARGET_PAGE_MASK;
3772         addr &= TARGET_PAGE_MASK;
3773
3774         if (flags & invalid_flags) {
3775             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3776                 error_report("Received an unexpected compressed page");
3777             }
3778
3779             ret = -EINVAL;
3780             break;
3781         }
3782
3783         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3784                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3785             RAMBlock *block = ram_block_from_stream(f, flags);
3786
3787             host = host_from_ram_block_offset(block, addr);
3788             if (!host) {
3789                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3790                 ret = -EINVAL;
3791                 break;
3792             }
3793             ramblock_recv_bitmap_set(block, host);
3794             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3795         }
3796
3797         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3798         case RAM_SAVE_FLAG_MEM_SIZE:
3799             /* Synchronize RAM block list */
3800             total_ram_bytes = addr;
3801             while (!ret && total_ram_bytes) {
3802                 RAMBlock *block;
3803                 char id[256];
3804                 ram_addr_t length;
3805
3806                 len = qemu_get_byte(f);
3807                 qemu_get_buffer(f, (uint8_t *)id, len);
3808                 id[len] = 0;
3809                 length = qemu_get_be64(f);
3810
3811                 block = qemu_ram_block_by_name(id);
3812                 if (block && !qemu_ram_is_migratable(block)) {
3813                     error_report("block %s should not be migrated !", id);
3814                     ret = -EINVAL;
3815                 } else if (block) {
3816                     if (length != block->used_length) {
3817                         Error *local_err = NULL;
3818
3819                         ret = qemu_ram_resize(block, length,
3820                                               &local_err);
3821                         if (local_err) {
3822                             error_report_err(local_err);
3823                         }
3824                     }
3825                     /* For postcopy we need to check hugepage sizes match */
3826                     if (postcopy_advised &&
3827                         block->page_size != qemu_host_page_size) {
3828                         uint64_t remote_page_size = qemu_get_be64(f);
3829                         if (remote_page_size != block->page_size) {
3830                             error_report("Mismatched RAM page size %s "
3831                                          "(local) %zd != %" PRId64,
3832                                          id, block->page_size,
3833                                          remote_page_size);
3834                             ret = -EINVAL;
3835                         }
3836                     }
3837                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3838                                           block->idstr);
3839                 } else {
3840                     error_report("Unknown ramblock \"%s\", cannot "
3841                                  "accept migration", id);
3842                     ret = -EINVAL;
3843                 }
3844
3845                 total_ram_bytes -= length;
3846             }
3847             break;
3848
3849         case RAM_SAVE_FLAG_ZERO:
3850             ch = qemu_get_byte(f);
3851             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3852             break;
3853
3854         case RAM_SAVE_FLAG_PAGE:
3855             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3856             break;
3857
3858         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3859             len = qemu_get_be32(f);
3860             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3861                 error_report("Invalid compressed data length: %d", len);
3862                 ret = -EINVAL;
3863                 break;
3864             }
3865             decompress_data_with_multi_threads(f, host, len);
3866             break;
3867
3868         case RAM_SAVE_FLAG_XBZRLE:
3869             if (load_xbzrle(f, addr, host) < 0) {
3870                 error_report("Failed to decompress XBZRLE page at "
3871                              RAM_ADDR_FMT, addr);
3872                 ret = -EINVAL;
3873                 break;
3874             }
3875             break;
3876         case RAM_SAVE_FLAG_EOS:
3877             /* normal exit */
3878             multifd_recv_sync_main();
3879             break;
3880         default:
3881             if (flags & RAM_SAVE_FLAG_HOOK) {
3882                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3883             } else {
3884                 error_report("Unknown combination of migration flags: %#x",
3885                              flags);
3886                 ret = -EINVAL;
3887             }
3888         }
3889         if (!ret) {
3890             ret = qemu_file_get_error(f);
3891         }
3892     }
3893
3894     ret |= wait_for_decompress_done();
3895     rcu_read_unlock();
3896     trace_ram_load_complete(ret, seq_iter);
3897     return ret;
3898 }
3899
/*
 * Handler wrapper: reports the result of migrate_postcopy_ram().
 * @opaque is not used.
 */
static bool ram_has_postcopy(void *opaque)
{
    bool enabled = migrate_postcopy_ram();

    return enabled;
}
3904
3905 /* Sync all the dirty bitmap with destination VM.  */
3906 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3907 {
3908     RAMBlock *block;
3909     QEMUFile *file = s->to_dst_file;
3910     int ramblock_count = 0;
3911
3912     trace_ram_dirty_bitmap_sync_start();
3913
3914     RAMBLOCK_FOREACH_MIGRATABLE(block) {
3915         qemu_savevm_send_recv_bitmap(file, block->idstr);
3916         trace_ram_dirty_bitmap_request(block->idstr);
3917         ramblock_count++;
3918     }
3919
3920     trace_ram_dirty_bitmap_sync_wait();
3921
3922     /* Wait until all the ramblocks' dirty bitmap synced */
3923     while (ramblock_count--) {
3924         qemu_sem_wait(&s->rp_state.rp_sem);
3925     }
3926
3927     trace_ram_dirty_bitmap_sync_complete();
3928
3929     return 0;
3930 }
3931
/*
 * Post the return-path semaphore (s->rp_state.rp_sem) once.
 * ram_dirty_bitmap_sync_all() waits on the same semaphore, one wait per
 * ramblock it requested a bitmap for.
 */
static void ram_dirty_bitmap_reload_notify(MigrationState *s)
{
    qemu_sem_post(&s->rp_state.rp_sem);
}
3936
3937 /*
3938  * Read the received bitmap, revert it as the initial dirty bitmap.
3939  * This is only used when the postcopy migration is paused but wants
3940  * to resume from a middle point.
3941  */
3942 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3943 {
3944     int ret = -EINVAL;
3945     QEMUFile *file = s->rp_state.from_dst_file;
3946     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3947     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
3948     uint64_t size, end_mark;
3949
3950     trace_ram_dirty_bitmap_reload_begin(block->idstr);
3951
3952     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3953         error_report("%s: incorrect state %s", __func__,
3954                      MigrationStatus_str(s->state));
3955         return -EINVAL;
3956     }
3957
3958     /*
3959      * Note: see comments in ramblock_recv_bitmap_send() on why we
3960      * need the endianess convertion, and the paddings.
3961      */
3962     local_size = ROUND_UP(local_size, 8);
3963
3964     /* Add paddings */
3965     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
3966
3967     size = qemu_get_be64(file);
3968
3969     /* The size of the bitmap should match with our ramblock */
3970     if (size != local_size) {
3971         error_report("%s: ramblock '%s' bitmap size mismatch "
3972                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
3973                      block->idstr, size, local_size);
3974         ret = -EINVAL;
3975         goto out;
3976     }
3977
3978     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
3979     end_mark = qemu_get_be64(file);
3980
3981     ret = qemu_file_get_error(file);
3982     if (ret || size != local_size) {
3983         error_report("%s: read bitmap failed for ramblock '%s': %d"
3984                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
3985                      __func__, block->idstr, ret, local_size, size);
3986         ret = -EIO;
3987         goto out;
3988     }
3989
3990     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
3991         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
3992                      __func__, block->idstr, end_mark);
3993         ret = -EINVAL;
3994         goto out;
3995     }
3996
3997     /*
3998      * Endianess convertion. We are during postcopy (though paused).
3999      * The dirty bitmap won't change. We can directly modify it.
4000      */
4001     bitmap_from_le(block->bmap, le_bitmap, nbits);
4002
4003     /*
4004      * What we received is "received bitmap". Revert it as the initial
4005      * dirty bitmap for this ramblock.
4006      */
4007     bitmap_complement(block->bmap, block->bmap, nbits);
4008
4009     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4010
4011     /*
4012      * We succeeded to sync bitmap for current ramblock. If this is
4013      * the last one to sync, we need to notify the main send thread.
4014      */
4015     ram_dirty_bitmap_reload_notify(s);
4016
4017     ret = 0;
4018 out:
4019     g_free(le_bitmap);
4020     return ret;
4021 }
4022
4023 static int ram_resume_prepare(MigrationState *s, void *opaque)
4024 {
4025     RAMState *rs = *(RAMState **)opaque;
4026     int ret;
4027
4028     ret = ram_dirty_bitmap_sync_all(s, rs);
4029     if (ret) {
4030         return ret;
4031     }
4032
4033     ram_state_resume_prepare(rs, s->to_dst_file);
4034
4035     return 0;
4036 }
4037
4038 static SaveVMHandlers savevm_ram_handlers = {
4039     .save_setup = ram_save_setup,
4040     .save_live_iterate = ram_save_iterate,
4041     .save_live_complete_postcopy = ram_save_complete,
4042     .save_live_complete_precopy = ram_save_complete,
4043     .has_postcopy = ram_has_postcopy,
4044     .save_live_pending = ram_save_pending,
4045     .load_state = ram_load,
4046     .save_cleanup = ram_save_cleanup,
4047     .load_setup = ram_load_setup,
4048     .load_cleanup = ram_load_cleanup,
4049     .resume_prepare = ram_resume_prepare,
4050 };
4051
/*
 * Initialise the XBZRLE lock and register savevm_ram_handlers under the
 * name "ram", with ram_state as the opaque pointer.
 * NOTE(review): the literal 4 is presumably the section version; it
 * matches the only version_id ram_load() accepts -- confirm against
 * register_savevm_live()'s prototype.
 */
void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
}