migration: clear the memory region dirty bitmap when skipping free pages
[qemu/armbru.git] / migration / ram.c
blob7a43bfd7afcbd55b8c7a06731ca0c5a21cb0f1df
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/main-loop.h"
34 #include "xbzrle.h"
35 #include "ram.h"
36 #include "migration.h"
37 #include "migration/register.h"
38 #include "migration/misc.h"
39 #include "qemu-file.h"
40 #include "postcopy-ram.h"
41 #include "page_cache.h"
42 #include "qemu/error-report.h"
43 #include "qapi/error.h"
44 #include "qapi/qapi-types-migration.h"
45 #include "qapi/qapi-events-migration.h"
46 #include "qapi/qmp/qerror.h"
47 #include "trace.h"
48 #include "exec/ram_addr.h"
49 #include "exec/target_page.h"
50 #include "qemu/rcu_queue.h"
51 #include "migration/colo.h"
52 #include "block.h"
53 #include "sysemu/cpu-throttle.h"
54 #include "savevm.h"
55 #include "qemu/iov.h"
56 #include "multifd.h"
57 #include "sysemu/runstate.h"
59 #if defined(__linux__)
60 #include "qemu/userfaultfd.h"
61 #endif /* defined(__linux__) */
63 /***********************************************************/
64 /* ram save/restore */
/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char. We switched
 * it to only search for the zero value. To avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE, it was simply renamed.
 */
/* Flag bits used with the RAM save stream (values must not change) */
#define RAM_SAVE_FLAG_FULL          0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO          0x02
#define RAM_SAVE_FLAG_MEM_SIZE      0x04
#define RAM_SAVE_FLAG_PAGE          0x08
#define RAM_SAVE_FLAG_EOS           0x10
#define RAM_SAVE_FLAG_CONTINUE      0x20
#define RAM_SAVE_FLAG_XBZRLE        0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
/*
 * is_zero_range: tell whether a buffer consists solely of zero bytes
 *
 * @p: start of the buffer to inspect
 * @size: number of bytes to inspect
 *
 * Forwards to buffer_is_zero() and returns its result.
 */
static inline bool is_zero_range(uint8_t *p, uint64_t size)
{
    bool all_zero = buffer_is_zero(p, size);

    return all_zero;
}
87 XBZRLECacheStats xbzrle_counters;
89 /* struct contains XBZRLE cache and a static page
90 used by the compression */
91 static struct {
92 /* buffer used for XBZRLE encoding */
93 uint8_t *encoded_buf;
94 /* buffer for storing page content */
95 uint8_t *current_buf;
96 /* Cache for XBZRLE, Protected by lock. */
97 PageCache *cache;
98 QemuMutex lock;
99 /* it will store a page full of zeros */
100 uint8_t *zero_target_page;
101 /* buffer used for XBZRLE decoding */
102 uint8_t *decoded_buf;
103 } XBZRLE;
105 static void XBZRLE_cache_lock(void)
107 if (migrate_use_xbzrle()) {
108 qemu_mutex_lock(&XBZRLE.lock);
112 static void XBZRLE_cache_unlock(void)
114 if (migrate_use_xbzrle()) {
115 qemu_mutex_unlock(&XBZRLE.lock);
120 * xbzrle_cache_resize: resize the xbzrle cache
122 * This function is called from migrate_params_apply in main
123 * thread, possibly while a migration is in progress. A running
124 * migration may be using the cache and might finish during this call,
125 * hence changes to the cache are protected by XBZRLE.lock().
127 * Returns 0 for success or -1 for error
129 * @new_size: new cache size
130 * @errp: set *errp if the check failed, with reason
132 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
134 PageCache *new_cache;
135 int64_t ret = 0;
137 /* Check for truncation */
138 if (new_size != (size_t)new_size) {
139 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
140 "exceeding address space");
141 return -1;
144 if (new_size == migrate_xbzrle_cache_size()) {
145 /* nothing to do */
146 return 0;
149 XBZRLE_cache_lock();
151 if (XBZRLE.cache != NULL) {
152 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
153 if (!new_cache) {
154 ret = -1;
155 goto out;
158 cache_fini(XBZRLE.cache);
159 XBZRLE.cache = new_cache;
161 out:
162 XBZRLE_cache_unlock();
163 return ret;
166 bool ramblock_is_ignored(RAMBlock *block)
168 return !qemu_ram_is_migratable(block) ||
169 (migrate_ignore_shared() && qemu_ram_is_shared(block));
172 #undef RAMBLOCK_FOREACH
174 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
176 RAMBlock *block;
177 int ret = 0;
179 RCU_READ_LOCK_GUARD();
181 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
182 ret = func(block, opaque);
183 if (ret) {
184 break;
187 return ret;
190 static void ramblock_recv_map_init(void)
192 RAMBlock *rb;
194 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
195 assert(!rb->receivedmap);
196 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
200 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
202 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
203 rb->receivedmap);
206 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
208 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
211 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
213 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
216 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
217 size_t nr)
219 bitmap_set_atomic(rb->receivedmap,
220 ramblock_recv_bitmap_offset(host_addr, rb),
221 nr);
224 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
227 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
229 * Returns >0 if success with sent bytes, or <0 if error.
231 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
232 const char *block_name)
234 RAMBlock *block = qemu_ram_block_by_name(block_name);
235 unsigned long *le_bitmap, nbits;
236 uint64_t size;
238 if (!block) {
239 error_report("%s: invalid block name: %s", __func__, block_name);
240 return -1;
243 nbits = block->postcopy_length >> TARGET_PAGE_BITS;
246 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
247 * machines we may need 4 more bytes for padding (see below
248 * comment). So extend it a bit before hand.
250 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
253 * Always use little endian when sending the bitmap. This is
254 * required that when source and destination VMs are not using the
255 * same endianness. (Note: big endian won't work.)
257 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
259 /* Size of the bitmap, in bytes */
260 size = DIV_ROUND_UP(nbits, 8);
263 * size is always aligned to 8 bytes for 64bit machines, but it
264 * may not be true for 32bit machines. We need this padding to
265 * make sure the migration can survive even between 32bit and
266 * 64bit machines.
268 size = ROUND_UP(size, 8);
270 qemu_put_be64(file, size);
271 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
273 * Mark as an end, in case the middle part is screwed up due to
274 * some "mysterious" reason.
276 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
277 qemu_fflush(file);
279 g_free(le_bitmap);
281 if (qemu_file_get_error(file)) {
282 return qemu_file_get_error(file);
285 return size + sizeof(size);
289 * An outstanding page request, on the source, having been received
290 * and queued
292 struct RAMSrcPageRequest {
293 RAMBlock *rb;
294 hwaddr offset;
295 hwaddr len;
297 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
300 /* State of RAM for migration */
301 struct RAMState {
302 /* QEMUFile used for this migration */
303 QEMUFile *f;
304 /* UFFD file descriptor, used in 'write-tracking' migration */
305 int uffdio_fd;
306 /* Last block that we have visited searching for dirty pages */
307 RAMBlock *last_seen_block;
308 /* Last block from where we have sent data */
309 RAMBlock *last_sent_block;
310 /* Last dirty target page we have sent */
311 ram_addr_t last_page;
312 /* last ram version we have seen */
313 uint32_t last_version;
314 /* How many times we have dirty too many pages */
315 int dirty_rate_high_cnt;
316 /* these variables are used for bitmap sync */
317 /* last time we did a full bitmap_sync */
318 int64_t time_last_bitmap_sync;
319 /* bytes transferred at start_time */
320 uint64_t bytes_xfer_prev;
321 /* number of dirty pages since start_time */
322 uint64_t num_dirty_pages_period;
323 /* xbzrle misses since the beginning of the period */
324 uint64_t xbzrle_cache_miss_prev;
325 /* Amount of xbzrle pages since the beginning of the period */
326 uint64_t xbzrle_pages_prev;
327 /* Amount of xbzrle encoded bytes since the beginning of the period */
328 uint64_t xbzrle_bytes_prev;
329 /* Start using XBZRLE (e.g., after the first round). */
330 bool xbzrle_enabled;
332 /* compression statistics since the beginning of the period */
333 /* amount of count that no free thread to compress data */
334 uint64_t compress_thread_busy_prev;
335 /* amount bytes after compression */
336 uint64_t compressed_size_prev;
337 /* amount of compressed pages */
338 uint64_t compress_pages_prev;
340 /* total handled target pages at the beginning of period */
341 uint64_t target_page_count_prev;
342 /* total handled target pages since start */
343 uint64_t target_page_count;
344 /* number of dirty bits in the bitmap */
345 uint64_t migration_dirty_pages;
346 /* Protects modification of the bitmap and migration dirty pages */
347 QemuMutex bitmap_mutex;
348 /* The RAMBlock used in the last src_page_requests */
349 RAMBlock *last_req_rb;
350 /* Queue of outstanding page requests from the destination */
351 QemuMutex src_page_req_mutex;
352 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
354 typedef struct RAMState RAMState;
356 static RAMState *ram_state;
358 static NotifierWithReturnList precopy_notifier_list;
360 void precopy_infrastructure_init(void)
362 notifier_with_return_list_init(&precopy_notifier_list);
365 void precopy_add_notifier(NotifierWithReturn *n)
367 notifier_with_return_list_add(&precopy_notifier_list, n);
370 void precopy_remove_notifier(NotifierWithReturn *n)
372 notifier_with_return_remove(n);
375 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
377 PrecopyNotifyData pnd;
378 pnd.reason = reason;
379 pnd.errp = errp;
381 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
384 uint64_t ram_bytes_remaining(void)
386 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
390 MigrationStats ram_counters;
392 /* used by the search for pages to send */
393 struct PageSearchStatus {
394 /* Current block being searched */
395 RAMBlock *block;
396 /* Current page to search from */
397 unsigned long page;
398 /* Set once we wrap around */
399 bool complete_round;
401 typedef struct PageSearchStatus PageSearchStatus;
403 CompressionStats compression_counters;
405 struct CompressParam {
406 bool done;
407 bool quit;
408 bool zero_page;
409 QEMUFile *file;
410 QemuMutex mutex;
411 QemuCond cond;
412 RAMBlock *block;
413 ram_addr_t offset;
415 /* internally used fields */
416 z_stream stream;
417 uint8_t *originbuf;
419 typedef struct CompressParam CompressParam;
421 struct DecompressParam {
422 bool done;
423 bool quit;
424 QemuMutex mutex;
425 QemuCond cond;
426 void *des;
427 uint8_t *compbuf;
428 int len;
429 z_stream stream;
431 typedef struct DecompressParam DecompressParam;
433 static CompressParam *comp_param;
434 static QemuThread *compress_threads;
435 /* comp_done_cond is used to wake up the migration thread when
436 * one of the compression threads has finished the compression.
437 * comp_done_lock is used to co-work with comp_done_cond.
439 static QemuMutex comp_done_lock;
440 static QemuCond comp_done_cond;
441 /* The empty QEMUFileOps will be used by file in CompressParam */
442 static const QEMUFileOps empty_ops = { };
444 static QEMUFile *decomp_file;
445 static DecompressParam *decomp_param;
446 static QemuThread *decompress_threads;
447 static QemuMutex decomp_done_lock;
448 static QemuCond decomp_done_cond;
450 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
451 ram_addr_t offset, uint8_t *source_buf);
453 static void *do_data_compress(void *opaque)
455 CompressParam *param = opaque;
456 RAMBlock *block;
457 ram_addr_t offset;
458 bool zero_page;
460 qemu_mutex_lock(&param->mutex);
461 while (!param->quit) {
462 if (param->block) {
463 block = param->block;
464 offset = param->offset;
465 param->block = NULL;
466 qemu_mutex_unlock(&param->mutex);
468 zero_page = do_compress_ram_page(param->file, &param->stream,
469 block, offset, param->originbuf);
471 qemu_mutex_lock(&comp_done_lock);
472 param->done = true;
473 param->zero_page = zero_page;
474 qemu_cond_signal(&comp_done_cond);
475 qemu_mutex_unlock(&comp_done_lock);
477 qemu_mutex_lock(&param->mutex);
478 } else {
479 qemu_cond_wait(&param->cond, &param->mutex);
482 qemu_mutex_unlock(&param->mutex);
484 return NULL;
487 static void compress_threads_save_cleanup(void)
489 int i, thread_count;
491 if (!migrate_use_compression() || !comp_param) {
492 return;
495 thread_count = migrate_compress_threads();
496 for (i = 0; i < thread_count; i++) {
498 * we use it as a indicator which shows if the thread is
499 * properly init'd or not
501 if (!comp_param[i].file) {
502 break;
505 qemu_mutex_lock(&comp_param[i].mutex);
506 comp_param[i].quit = true;
507 qemu_cond_signal(&comp_param[i].cond);
508 qemu_mutex_unlock(&comp_param[i].mutex);
510 qemu_thread_join(compress_threads + i);
511 qemu_mutex_destroy(&comp_param[i].mutex);
512 qemu_cond_destroy(&comp_param[i].cond);
513 deflateEnd(&comp_param[i].stream);
514 g_free(comp_param[i].originbuf);
515 qemu_fclose(comp_param[i].file);
516 comp_param[i].file = NULL;
518 qemu_mutex_destroy(&comp_done_lock);
519 qemu_cond_destroy(&comp_done_cond);
520 g_free(compress_threads);
521 g_free(comp_param);
522 compress_threads = NULL;
523 comp_param = NULL;
526 static int compress_threads_save_setup(void)
528 int i, thread_count;
530 if (!migrate_use_compression()) {
531 return 0;
533 thread_count = migrate_compress_threads();
534 compress_threads = g_new0(QemuThread, thread_count);
535 comp_param = g_new0(CompressParam, thread_count);
536 qemu_cond_init(&comp_done_cond);
537 qemu_mutex_init(&comp_done_lock);
538 for (i = 0; i < thread_count; i++) {
539 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
540 if (!comp_param[i].originbuf) {
541 goto exit;
544 if (deflateInit(&comp_param[i].stream,
545 migrate_compress_level()) != Z_OK) {
546 g_free(comp_param[i].originbuf);
547 goto exit;
550 /* comp_param[i].file is just used as a dummy buffer to save data,
551 * set its ops to empty.
553 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops, false);
554 comp_param[i].done = true;
555 comp_param[i].quit = false;
556 qemu_mutex_init(&comp_param[i].mutex);
557 qemu_cond_init(&comp_param[i].cond);
558 qemu_thread_create(compress_threads + i, "compress",
559 do_data_compress, comp_param + i,
560 QEMU_THREAD_JOINABLE);
562 return 0;
564 exit:
565 compress_threads_save_cleanup();
566 return -1;
570 * save_page_header: write page header to wire
572 * If this is the 1st block, it also writes the block identification
574 * Returns the number of bytes written
576 * @f: QEMUFile where to send the data
577 * @block: block that contains the page we want to send
578 * @offset: offset inside the block for the page
579 * in the lower bits, it contains flags
581 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
582 ram_addr_t offset)
584 size_t size, len;
586 if (block == rs->last_sent_block) {
587 offset |= RAM_SAVE_FLAG_CONTINUE;
589 qemu_put_be64(f, offset);
590 size = 8;
592 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
593 len = strlen(block->idstr);
594 qemu_put_byte(f, len);
595 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
596 size += 1 + len;
597 rs->last_sent_block = block;
599 return size;
603 * mig_throttle_guest_down: throttle down the guest
605 * Reduce amount of guest cpu execution to hopefully slow down memory
606 * writes. If guest dirty memory rate is reduced below the rate at
607 * which we can transfer pages to the destination then we should be
608 * able to complete migration. Some workloads dirty memory way too
609 * fast and will not effectively converge, even with auto-converge.
611 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
612 uint64_t bytes_dirty_threshold)
614 MigrationState *s = migrate_get_current();
615 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
616 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
617 bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
618 int pct_max = s->parameters.max_cpu_throttle;
620 uint64_t throttle_now = cpu_throttle_get_percentage();
621 uint64_t cpu_now, cpu_ideal, throttle_inc;
623 /* We have not started throttling yet. Let's start it. */
624 if (!cpu_throttle_active()) {
625 cpu_throttle_set(pct_initial);
626 } else {
627 /* Throttling already on, just increase the rate */
628 if (!pct_tailslow) {
629 throttle_inc = pct_increment;
630 } else {
631 /* Compute the ideal CPU percentage used by Guest, which may
632 * make the dirty rate match the dirty rate threshold. */
633 cpu_now = 100 - throttle_now;
634 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
635 bytes_dirty_period);
636 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
638 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
643 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
645 * @rs: current RAM state
646 * @current_addr: address for the zero page
648 * Update the xbzrle cache to reflect a page that's been sent as all 0.
649 * The important thing is that a stale (not-yet-0'd) page be replaced
650 * by the new data.
651 * As a bonus, if the page wasn't in the cache it gets added so that
652 * when a small write is made into the 0'd page it gets XBZRLE sent.
654 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
656 if (!rs->xbzrle_enabled) {
657 return;
660 /* We don't care if this fails to allocate a new cache page
661 * as long as it updated an old one */
662 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
663 ram_counters.dirty_sync_count);
666 #define ENCODING_FLAG_XBZRLE 0x1
669 * save_xbzrle_page: compress and send current page
671 * Returns: 1 means that we wrote the page
672 * 0 means that page is identical to the one already sent
673 * -1 means that xbzrle would be longer than normal
675 * @rs: current RAM state
676 * @current_data: pointer to the address of the page contents
677 * @current_addr: addr of the page
678 * @block: block that contains the page we want to send
679 * @offset: offset inside the block for the page
680 * @last_stage: if we are at the completion stage
682 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
683 ram_addr_t current_addr, RAMBlock *block,
684 ram_addr_t offset, bool last_stage)
686 int encoded_len = 0, bytes_xbzrle;
687 uint8_t *prev_cached_page;
689 if (!cache_is_cached(XBZRLE.cache, current_addr,
690 ram_counters.dirty_sync_count)) {
691 xbzrle_counters.cache_miss++;
692 if (!last_stage) {
693 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
694 ram_counters.dirty_sync_count) == -1) {
695 return -1;
696 } else {
697 /* update *current_data when the page has been
698 inserted into cache */
699 *current_data = get_cached_data(XBZRLE.cache, current_addr);
702 return -1;
706 * Reaching here means the page has hit the xbzrle cache, no matter what
707 * encoding result it is (normal encoding, overflow or skipping the page),
708 * count the page as encoded. This is used to calculate the encoding rate.
710 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
711 * 2nd page turns out to be skipped (i.e. no new bytes written to the
712 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
713 * skipped page included. In this way, the encoding rate can tell if the
714 * guest page is good for xbzrle encoding.
716 xbzrle_counters.pages++;
717 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
719 /* save current buffer into memory */
720 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
722 /* XBZRLE encoding (if there is no overflow) */
723 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
724 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
725 TARGET_PAGE_SIZE);
728 * Update the cache contents, so that it corresponds to the data
729 * sent, in all cases except where we skip the page.
731 if (!last_stage && encoded_len != 0) {
732 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
734 * In the case where we couldn't compress, ensure that the caller
735 * sends the data from the cache, since the guest might have
736 * changed the RAM since we copied it.
738 *current_data = prev_cached_page;
741 if (encoded_len == 0) {
742 trace_save_xbzrle_page_skipping();
743 return 0;
744 } else if (encoded_len == -1) {
745 trace_save_xbzrle_page_overflow();
746 xbzrle_counters.overflow++;
747 xbzrle_counters.bytes += TARGET_PAGE_SIZE;
748 return -1;
751 /* Send XBZRLE based compressed page */
752 bytes_xbzrle = save_page_header(rs, rs->f, block,
753 offset | RAM_SAVE_FLAG_XBZRLE);
754 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
755 qemu_put_be16(rs->f, encoded_len);
756 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
757 bytes_xbzrle += encoded_len + 1 + 2;
759 * Like compressed_size (please see update_compress_thread_counts),
760 * the xbzrle encoded bytes don't count the 8 byte header with
761 * RAM_SAVE_FLAG_CONTINUE.
763 xbzrle_counters.bytes += bytes_xbzrle - 8;
764 ram_counters.transferred += bytes_xbzrle;
766 return 1;
770 * migration_bitmap_find_dirty: find the next dirty page from start
772 * Returns the page offset within memory region of the start of a dirty page
774 * @rs: current RAM state
775 * @rb: RAMBlock where to search for dirty pages
776 * @start: page where we start the search
778 static inline
779 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
780 unsigned long start)
782 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
783 unsigned long *bitmap = rb->bmap;
785 if (ramblock_is_ignored(rb)) {
786 return size;
789 return find_next_bit(bitmap, size, start);
792 static void migration_clear_memory_region_dirty_bitmap(RAMState *rs,
793 RAMBlock *rb,
794 unsigned long page)
796 uint8_t shift;
797 hwaddr size, start;
799 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
800 return;
803 shift = rb->clear_bmap_shift;
805 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
806 * can make things easier sometimes since then start address
807 * of the small chunk will always be 64 pages aligned so the
808 * bitmap will always be aligned to unsigned long. We should
809 * even be able to remove this restriction but I'm simply
810 * keeping it.
812 assert(shift >= 6);
814 size = 1ULL << (TARGET_PAGE_BITS + shift);
815 start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
816 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
817 memory_region_clear_dirty_bitmap(rb->mr, start, size);
820 static void
821 migration_clear_memory_region_dirty_bitmap_range(RAMState *rs,
822 RAMBlock *rb,
823 unsigned long start,
824 unsigned long npages)
826 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
827 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
828 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
831 * Clear pages from start to start + npages - 1, so the end boundary is
832 * exclusive.
834 for (i = chunk_start; i < chunk_end; i += chunk_pages) {
835 migration_clear_memory_region_dirty_bitmap(rs, rb, i);
839 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
840 RAMBlock *rb,
841 unsigned long page)
843 bool ret;
846 * Clear dirty bitmap if needed. This _must_ be called before we
847 * send any of the page in the chunk because we need to make sure
848 * we can capture further page content changes when we sync dirty
849 * log the next time. So as long as we are going to send any of
850 * the page in the chunk we clear the remote dirty bitmap for all.
851 * Clearing it earlier won't be a problem, but too late will.
853 migration_clear_memory_region_dirty_bitmap(rs, rb, page);
855 ret = test_and_clear_bit(page, rb->bmap);
856 if (ret) {
857 rs->migration_dirty_pages--;
860 return ret;
863 /* Called with RCU critical section */
864 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
866 uint64_t new_dirty_pages =
867 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
869 rs->migration_dirty_pages += new_dirty_pages;
870 rs->num_dirty_pages_period += new_dirty_pages;
874 * ram_pagesize_summary: calculate all the pagesizes of a VM
876 * Returns a summary bitmap of the page sizes of all RAMBlocks
878 * For VMs with just normal pages this is equivalent to the host page
879 * size. If it's got some huge pages then it's the OR of all the
880 * different page sizes.
882 uint64_t ram_pagesize_summary(void)
884 RAMBlock *block;
885 uint64_t summary = 0;
887 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
888 summary |= block->page_size;
891 return summary;
894 uint64_t ram_get_total_transferred_pages(void)
896 return ram_counters.normal + ram_counters.duplicate +
897 compression_counters.pages + xbzrle_counters.pages;
900 static void migration_update_rates(RAMState *rs, int64_t end_time)
902 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
903 double compressed_size;
905 /* calculate period counters */
906 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
907 / (end_time - rs->time_last_bitmap_sync);
909 if (!page_count) {
910 return;
913 if (migrate_use_xbzrle()) {
914 double encoded_size, unencoded_size;
916 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
917 rs->xbzrle_cache_miss_prev) / page_count;
918 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
919 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
920 TARGET_PAGE_SIZE;
921 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
922 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
923 xbzrle_counters.encoding_rate = 0;
924 } else {
925 xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
927 rs->xbzrle_pages_prev = xbzrle_counters.pages;
928 rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
931 if (migrate_use_compression()) {
932 compression_counters.busy_rate = (double)(compression_counters.busy -
933 rs->compress_thread_busy_prev) / page_count;
934 rs->compress_thread_busy_prev = compression_counters.busy;
936 compressed_size = compression_counters.compressed_size -
937 rs->compressed_size_prev;
938 if (compressed_size) {
939 double uncompressed_size = (compression_counters.pages -
940 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
942 /* Compression-Ratio = Uncompressed-size / Compressed-size */
943 compression_counters.compression_rate =
944 uncompressed_size / compressed_size;
946 rs->compress_pages_prev = compression_counters.pages;
947 rs->compressed_size_prev = compression_counters.compressed_size;
952 static void migration_trigger_throttle(RAMState *rs)
954 MigrationState *s = migrate_get_current();
955 uint64_t threshold = s->parameters.throttle_trigger_threshold;
957 uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
958 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
959 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
961 /* During block migration the auto-converge logic incorrectly detects
962 * that ram migration makes no progress. Avoid this by disabling the
963 * throttling logic during the bulk phase of block migration. */
964 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
965 /* The following detection logic can be refined later. For now:
966 Check to see if the ratio between dirtied bytes and the approx.
967 amount of bytes that just got transferred since the last time
968 we were in this routine reaches the threshold. If that happens
969 twice, start or increase throttling. */
971 if ((bytes_dirty_period > bytes_dirty_threshold) &&
972 (++rs->dirty_rate_high_cnt >= 2)) {
973 trace_migration_throttle();
974 rs->dirty_rate_high_cnt = 0;
975 mig_throttle_guest_down(bytes_dirty_period,
976 bytes_dirty_threshold);
981 static void migration_bitmap_sync(RAMState *rs)
983 RAMBlock *block;
984 int64_t end_time;
986 ram_counters.dirty_sync_count++;
988 if (!rs->time_last_bitmap_sync) {
989 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
992 trace_migration_bitmap_sync_start();
993 memory_global_dirty_log_sync();
995 qemu_mutex_lock(&rs->bitmap_mutex);
996 WITH_RCU_READ_LOCK_GUARD() {
997 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
998 ramblock_sync_dirty_bitmap(rs, block);
1000 ram_counters.remaining = ram_bytes_remaining();
1002 qemu_mutex_unlock(&rs->bitmap_mutex);
1004 memory_global_after_dirty_log_sync();
1005 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1007 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1009 /* more than 1 second = 1000 millisecons */
1010 if (end_time > rs->time_last_bitmap_sync + 1000) {
1011 migration_trigger_throttle(rs);
1013 migration_update_rates(rs, end_time);
1015 rs->target_page_count_prev = rs->target_page_count;
1017 /* reset period counters */
1018 rs->time_last_bitmap_sync = end_time;
1019 rs->num_dirty_pages_period = 0;
1020 rs->bytes_xfer_prev = ram_counters.transferred;
1022 if (migrate_use_events()) {
1023 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1027 static void migration_bitmap_sync_precopy(RAMState *rs)
1029 Error *local_err = NULL;
1032 * The current notifier usage is just an optimization to migration, so we
1033 * don't stop the normal migration process in the error case.
1035 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1036 error_report_err(local_err);
1037 local_err = NULL;
1040 migration_bitmap_sync(rs);
1042 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1043 error_report_err(local_err);
1048 * save_zero_page_to_file: send the zero page to the file
1050 * Returns the size of data written to the file, 0 means the page is not
1051 * a zero page
1053 * @rs: current RAM state
1054 * @file: the file where the data is saved
1055 * @block: block that contains the page we want to send
1056 * @offset: offset inside the block for the page
1058 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1059 RAMBlock *block, ram_addr_t offset)
1061 uint8_t *p = block->host + offset;
1062 int len = 0;
1064 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1065 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1066 qemu_put_byte(file, 0);
1067 len += 1;
1069 return len;
1073 * save_zero_page: send the zero page to the stream
1075 * Returns the number of pages written.
1077 * @rs: current RAM state
1078 * @block: block that contains the page we want to send
1079 * @offset: offset inside the block for the page
1081 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1083 int len = save_zero_page_to_file(rs, rs->f, block, offset);
1085 if (len) {
1086 ram_counters.duplicate++;
1087 ram_counters.transferred += len;
1088 return 1;
1090 return -1;
1093 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1095 if (!migrate_release_ram() || !migration_in_postcopy()) {
1096 return;
1099 ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1103 * @pages: the number of pages written by the control path,
1104 * < 0 - error
1105 * > 0 - number of pages written
1107 * Return true if the pages has been saved, otherwise false is returned.
1109 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1110 int *pages)
1112 uint64_t bytes_xmit = 0;
1113 int ret;
1115 *pages = -1;
1116 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1117 &bytes_xmit);
1118 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1119 return false;
1122 if (bytes_xmit) {
1123 ram_counters.transferred += bytes_xmit;
1124 *pages = 1;
1127 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1128 return true;
1131 if (bytes_xmit > 0) {
1132 ram_counters.normal++;
1133 } else if (bytes_xmit == 0) {
1134 ram_counters.duplicate++;
1137 return true;
1141 * directly send the page to the stream
1143 * Returns the number of pages written.
1145 * @rs: current RAM state
1146 * @block: block that contains the page we want to send
1147 * @offset: offset inside the block for the page
1148 * @buf: the page to be sent
1149 * @async: send to page asyncly
1151 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1152 uint8_t *buf, bool async)
1154 ram_counters.transferred += save_page_header(rs, rs->f, block,
1155 offset | RAM_SAVE_FLAG_PAGE);
1156 if (async) {
1157 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1158 migrate_release_ram() &
1159 migration_in_postcopy());
1160 } else {
1161 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1163 ram_counters.transferred += TARGET_PAGE_SIZE;
1164 ram_counters.normal++;
1165 return 1;
1169 * ram_save_page: send the given page to the stream
1171 * Returns the number of pages written.
1172 * < 0 - error
1173 * >=0 - Number of pages written - this might legally be 0
1174 * if xbzrle noticed the page was the same.
1176 * @rs: current RAM state
1177 * @block: block that contains the page we want to send
1178 * @offset: offset inside the block for the page
1179 * @last_stage: if we are at the completion stage
1181 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1183 int pages = -1;
1184 uint8_t *p;
1185 bool send_async = true;
1186 RAMBlock *block = pss->block;
1187 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1188 ram_addr_t current_addr = block->offset + offset;
1190 p = block->host + offset;
1191 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1193 XBZRLE_cache_lock();
1194 if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1195 pages = save_xbzrle_page(rs, &p, current_addr, block,
1196 offset, last_stage);
1197 if (!last_stage) {
1198 /* Can't send this cached data async, since the cache page
1199 * might get updated before it gets to the wire
1201 send_async = false;
1205 /* XBZRLE overflow or normal page */
1206 if (pages == -1) {
1207 pages = save_normal_page(rs, block, offset, p, send_async);
1210 XBZRLE_cache_unlock();
1212 return pages;
1215 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1216 ram_addr_t offset)
1218 if (multifd_queue_page(rs->f, block, offset) < 0) {
1219 return -1;
1221 ram_counters.normal++;
1223 return 1;
1226 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1227 ram_addr_t offset, uint8_t *source_buf)
1229 RAMState *rs = ram_state;
1230 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1231 bool zero_page = false;
1232 int ret;
1234 if (save_zero_page_to_file(rs, f, block, offset)) {
1235 zero_page = true;
1236 goto exit;
1239 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1242 * copy it to a internal buffer to avoid it being modified by VM
1243 * so that we can catch up the error during compression and
1244 * decompression
1246 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1247 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1248 if (ret < 0) {
1249 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1250 error_report("compressed data failed!");
1251 return false;
1254 exit:
1255 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1256 return zero_page;
1259 static void
1260 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1262 ram_counters.transferred += bytes_xmit;
1264 if (param->zero_page) {
1265 ram_counters.duplicate++;
1266 return;
1269 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1270 compression_counters.compressed_size += bytes_xmit - 8;
1271 compression_counters.pages++;
1274 static bool save_page_use_compression(RAMState *rs);
1276 static void flush_compressed_data(RAMState *rs)
1278 int idx, len, thread_count;
1280 if (!save_page_use_compression(rs)) {
1281 return;
1283 thread_count = migrate_compress_threads();
1285 qemu_mutex_lock(&comp_done_lock);
1286 for (idx = 0; idx < thread_count; idx++) {
1287 while (!comp_param[idx].done) {
1288 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1291 qemu_mutex_unlock(&comp_done_lock);
1293 for (idx = 0; idx < thread_count; idx++) {
1294 qemu_mutex_lock(&comp_param[idx].mutex);
1295 if (!comp_param[idx].quit) {
1296 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1298 * it's safe to fetch zero_page without holding comp_done_lock
1299 * as there is no further request submitted to the thread,
1300 * i.e, the thread should be waiting for a request at this point.
1302 update_compress_thread_counts(&comp_param[idx], len);
1304 qemu_mutex_unlock(&comp_param[idx].mutex);
1308 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1309 ram_addr_t offset)
1311 param->block = block;
1312 param->offset = offset;
1315 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1316 ram_addr_t offset)
1318 int idx, thread_count, bytes_xmit = -1, pages = -1;
1319 bool wait = migrate_compress_wait_thread();
1321 thread_count = migrate_compress_threads();
1322 qemu_mutex_lock(&comp_done_lock);
1323 retry:
1324 for (idx = 0; idx < thread_count; idx++) {
1325 if (comp_param[idx].done) {
1326 comp_param[idx].done = false;
1327 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1328 qemu_mutex_lock(&comp_param[idx].mutex);
1329 set_compress_params(&comp_param[idx], block, offset);
1330 qemu_cond_signal(&comp_param[idx].cond);
1331 qemu_mutex_unlock(&comp_param[idx].mutex);
1332 pages = 1;
1333 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1334 break;
1339 * wait for the free thread if the user specifies 'compress-wait-thread',
1340 * otherwise we will post the page out in the main thread as normal page.
1342 if (pages < 0 && wait) {
1343 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1344 goto retry;
1346 qemu_mutex_unlock(&comp_done_lock);
1348 return pages;
1352 * find_dirty_block: find the next dirty page and update any state
1353 * associated with the search process.
1355 * Returns true if a page is found
1357 * @rs: current RAM state
1358 * @pss: data about the state of the current dirty page scan
1359 * @again: set to false if the search has scanned the whole of RAM
1361 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1363 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1364 if (pss->complete_round && pss->block == rs->last_seen_block &&
1365 pss->page >= rs->last_page) {
1367 * We've been once around the RAM and haven't found anything.
1368 * Give up.
1370 *again = false;
1371 return false;
1373 if (!offset_in_ramblock(pss->block,
1374 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1375 /* Didn't find anything in this RAM Block */
1376 pss->page = 0;
1377 pss->block = QLIST_NEXT_RCU(pss->block, next);
1378 if (!pss->block) {
1380 * If memory migration starts over, we will meet a dirtied page
1381 * which may still exists in compression threads's ring, so we
1382 * should flush the compressed data to make sure the new page
1383 * is not overwritten by the old one in the destination.
1385 * Also If xbzrle is on, stop using the data compression at this
1386 * point. In theory, xbzrle can do better than compression.
1388 flush_compressed_data(rs);
1390 /* Hit the end of the list */
1391 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1392 /* Flag that we've looped */
1393 pss->complete_round = true;
1394 /* After the first round, enable XBZRLE. */
1395 if (migrate_use_xbzrle()) {
1396 rs->xbzrle_enabled = true;
1399 /* Didn't find anything this time, but try again on the new block */
1400 *again = true;
1401 return false;
1402 } else {
1403 /* Can go around again, but... */
1404 *again = true;
1405 /* We've found something so probably don't need to */
1406 return true;
1411 * unqueue_page: gets a page of the queue
1413 * Helper for 'get_queued_page' - gets a page off the queue
1415 * Returns the block of the page (or NULL if none available)
1417 * @rs: current RAM state
1418 * @offset: used to return the offset within the RAMBlock
1420 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1422 RAMBlock *block = NULL;
1424 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1425 return NULL;
1428 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1429 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1430 struct RAMSrcPageRequest *entry =
1431 QSIMPLEQ_FIRST(&rs->src_page_requests);
1432 block = entry->rb;
1433 *offset = entry->offset;
1435 if (entry->len > TARGET_PAGE_SIZE) {
1436 entry->len -= TARGET_PAGE_SIZE;
1437 entry->offset += TARGET_PAGE_SIZE;
1438 } else {
1439 memory_region_unref(block->mr);
1440 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1441 g_free(entry);
1442 migration_consume_urgent_request();
1446 return block;
1449 #if defined(__linux__)
1451 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1452 * is found, return RAM block pointer and page offset
1454 * Returns pointer to the RAMBlock containing faulting page,
1455 * NULL if no write faults are pending
1457 * @rs: current RAM state
1458 * @offset: page offset from the beginning of the block
1460 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1462 struct uffd_msg uffd_msg;
1463 void *page_address;
1464 RAMBlock *block;
1465 int res;
1467 if (!migrate_background_snapshot()) {
1468 return NULL;
1471 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1472 if (res <= 0) {
1473 return NULL;
1476 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1477 block = qemu_ram_block_from_host(page_address, false, offset);
1478 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1479 return block;
1483 * ram_save_release_protection: release UFFD write protection after
1484 * a range of pages has been saved
1486 * @rs: current RAM state
1487 * @pss: page-search-status structure
1488 * @start_page: index of the first page in the range relative to pss->block
1490 * Returns 0 on success, negative value in case of an error
1492 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1493 unsigned long start_page)
1495 int res = 0;
1497 /* Check if page is from UFFD-managed region. */
1498 if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1499 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1500 uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS;
1502 /* Flush async buffers before un-protect. */
1503 qemu_fflush(rs->f);
1504 /* Un-protect memory range. */
1505 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1506 false, false);
1509 return res;
1512 /* ram_write_tracking_available: check if kernel supports required UFFD features
1514 * Returns true if supports, false otherwise
1516 bool ram_write_tracking_available(void)
1518 uint64_t uffd_features;
1519 int res;
1521 res = uffd_query_features(&uffd_features);
1522 return (res == 0 &&
1523 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1526 /* ram_write_tracking_compatible: check if guest configuration is
1527 * compatible with 'write-tracking'
1529 * Returns true if compatible, false otherwise
1531 bool ram_write_tracking_compatible(void)
1533 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1534 int uffd_fd;
1535 RAMBlock *block;
1536 bool ret = false;
1538 /* Open UFFD file descriptor */
1539 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1540 if (uffd_fd < 0) {
1541 return false;
1544 RCU_READ_LOCK_GUARD();
1546 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1547 uint64_t uffd_ioctls;
1549 /* Nothing to do with read-only and MMIO-writable regions */
1550 if (block->mr->readonly || block->mr->rom_device) {
1551 continue;
1553 /* Try to register block memory via UFFD-IO to track writes */
1554 if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1555 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1556 goto out;
1558 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1559 goto out;
1562 ret = true;
1564 out:
1565 uffd_close_fd(uffd_fd);
1566 return ret;
1570 * ram_block_populate_pages: populate memory in the RAM block by reading
1571 * an integer from the beginning of each page.
1573 * Since it's solely used for userfault_fd WP feature, here we just
1574 * hardcode page size to qemu_real_host_page_size.
1576 * @block: RAM block to populate
1578 static void ram_block_populate_pages(RAMBlock *block)
1580 char *ptr = (char *) block->host;
1582 for (ram_addr_t offset = 0; offset < block->used_length;
1583 offset += qemu_real_host_page_size) {
1584 char tmp = *(ptr + offset);
1586 /* Don't optimize the read out */
1587 asm volatile("" : "+r" (tmp));
1592 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1594 void ram_write_tracking_prepare(void)
1596 RAMBlock *block;
1598 RCU_READ_LOCK_GUARD();
1600 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1601 /* Nothing to do with read-only and MMIO-writable regions */
1602 if (block->mr->readonly || block->mr->rom_device) {
1603 continue;
1607 * Populate pages of the RAM block before enabling userfault_fd
1608 * write protection.
1610 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1611 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1612 * pages with pte_none() entries in page table.
1614 ram_block_populate_pages(block);
1619 * ram_write_tracking_start: start UFFD-WP memory tracking
1621 * Returns 0 for success or negative value in case of error
1623 int ram_write_tracking_start(void)
1625 int uffd_fd;
1626 RAMState *rs = ram_state;
1627 RAMBlock *block;
1629 /* Open UFFD file descriptor */
1630 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1631 if (uffd_fd < 0) {
1632 return uffd_fd;
1634 rs->uffdio_fd = uffd_fd;
1636 RCU_READ_LOCK_GUARD();
1638 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1639 /* Nothing to do with read-only and MMIO-writable regions */
1640 if (block->mr->readonly || block->mr->rom_device) {
1641 continue;
1644 /* Register block memory with UFFD to track writes */
1645 if (uffd_register_memory(rs->uffdio_fd, block->host,
1646 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1647 goto fail;
1649 /* Apply UFFD write protection to the block memory range */
1650 if (uffd_change_protection(rs->uffdio_fd, block->host,
1651 block->max_length, true, false)) {
1652 goto fail;
1654 block->flags |= RAM_UF_WRITEPROTECT;
1655 memory_region_ref(block->mr);
1657 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1658 block->host, block->max_length);
1661 return 0;
1663 fail:
1664 error_report("ram_write_tracking_start() failed: restoring initial memory state");
1666 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1667 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1668 continue;
1671 * In case some memory block failed to be write-protected
1672 * remove protection and unregister all succeeded RAM blocks
1674 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1675 false, false);
1676 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1677 /* Cleanup flags and remove reference */
1678 block->flags &= ~RAM_UF_WRITEPROTECT;
1679 memory_region_unref(block->mr);
1682 uffd_close_fd(uffd_fd);
1683 rs->uffdio_fd = -1;
1684 return -1;
1688 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1690 void ram_write_tracking_stop(void)
1692 RAMState *rs = ram_state;
1693 RAMBlock *block;
1695 RCU_READ_LOCK_GUARD();
1697 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1698 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1699 continue;
1701 /* Remove protection and unregister all affected RAM blocks */
1702 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1703 false, false);
1704 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1706 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1707 block->host, block->max_length);
1709 /* Cleanup flags and remove reference */
1710 block->flags &= ~RAM_UF_WRITEPROTECT;
1711 memory_region_unref(block->mr);
1714 /* Finally close UFFD file descriptor */
1715 uffd_close_fd(rs->uffdio_fd);
1716 rs->uffdio_fd = -1;
1719 #else
1720 /* No target OS support, stubs just fail or ignore */
1722 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1724 (void) rs;
1725 (void) offset;
1727 return NULL;
1730 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1731 unsigned long start_page)
1733 (void) rs;
1734 (void) pss;
1735 (void) start_page;
1737 return 0;
/*
 * ram_write_tracking_available: stub for hosts without write-fault tracking
 *
 * Always returns false.
 */
bool ram_write_tracking_available(void)
{
    return false;
}
/*
 * ram_write_tracking_compatible: stub for hosts without write-fault tracking
 *
 * Must not be called: it asserts. The return only exists for builds
 * where the assertion is compiled out.
 */
bool ram_write_tracking_compatible(void)
{
    assert(0);
    return false;
}
/*
 * ram_write_tracking_start: stub for hosts without write-fault tracking
 *
 * Must not be called: it asserts. The -1 return only exists for builds
 * where the assertion is compiled out.
 */
int ram_write_tracking_start(void)
{
    assert(0);
    return -1;
}
/*
 * ram_write_tracking_stop: stub for hosts without write-fault tracking
 *
 * Must not be called: it asserts.
 */
void ram_write_tracking_stop(void)
{
    assert(0);
}
1761 #endif /* defined(__linux__) */
1764 * get_queued_page: unqueue a page from the postcopy requests
1766 * Skips pages that are already sent (!dirty)
1768 * Returns true if a queued page is found
1770 * @rs: current RAM state
1771 * @pss: data about the state of the current dirty page scan
1773 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1775 RAMBlock *block;
1776 ram_addr_t offset;
1777 bool dirty;
1779 do {
1780 block = unqueue_page(rs, &offset);
1782 * We're sending this page, and since it's postcopy nothing else
1783 * will dirty it, and we must make sure it doesn't get sent again
1784 * even if this queue request was received after the background
1785 * search already sent it.
1787 if (block) {
1788 unsigned long page;
1790 page = offset >> TARGET_PAGE_BITS;
1791 dirty = test_bit(page, block->bmap);
1792 if (!dirty) {
1793 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1794 page);
1795 } else {
1796 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1800 } while (block && !dirty);
1802 if (!block) {
1804 * Poll write faults too if background snapshot is enabled; that's
1805 * when we have vcpus got blocked by the write protected pages.
1807 block = poll_fault_page(rs, &offset);
1810 if (block) {
1812 * We want the background search to continue from the queued page
1813 * since the guest is likely to want other pages near to the page
1814 * it just requested.
1816 pss->block = block;
1817 pss->page = offset >> TARGET_PAGE_BITS;
1820 * This unqueued page would break the "one round" check, even is
1821 * really rare.
1823 pss->complete_round = false;
1826 return !!block;
1830 * migration_page_queue_free: drop any remaining pages in the ram
1831 * request queue
1833 * It should be empty at the end anyway, but in error cases there may
1834 * be some left. in case that there is any page left, we drop it.
1837 static void migration_page_queue_free(RAMState *rs)
1839 struct RAMSrcPageRequest *mspr, *next_mspr;
1840 /* This queue generally should be empty - but in the case of a failed
1841 * migration might have some droppings in.
1843 RCU_READ_LOCK_GUARD();
1844 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1845 memory_region_unref(mspr->rb->mr);
1846 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1847 g_free(mspr);
1852 * ram_save_queue_pages: queue the page for transmission
1854 * A request from postcopy destination for example.
1856 * Returns zero on success or negative on error
1858 * @rbname: Name of the RAMBLock of the request. NULL means the
1859 * same that last one.
1860 * @start: starting address from the start of the RAMBlock
1861 * @len: length (in bytes) to send
1863 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1865 RAMBlock *ramblock;
1866 RAMState *rs = ram_state;
1868 ram_counters.postcopy_requests++;
1869 RCU_READ_LOCK_GUARD();
1871 if (!rbname) {
1872 /* Reuse last RAMBlock */
1873 ramblock = rs->last_req_rb;
1875 if (!ramblock) {
1877 * Shouldn't happen, we can't reuse the last RAMBlock if
1878 * it's the 1st request.
1880 error_report("ram_save_queue_pages no previous block");
1881 return -1;
1883 } else {
1884 ramblock = qemu_ram_block_by_name(rbname);
1886 if (!ramblock) {
1887 /* We shouldn't be asked for a non-existent RAMBlock */
1888 error_report("ram_save_queue_pages no block '%s'", rbname);
1889 return -1;
1891 rs->last_req_rb = ramblock;
1893 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1894 if (!offset_in_ramblock(ramblock, start + len - 1)) {
1895 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1896 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1897 __func__, start, len, ramblock->used_length);
1898 return -1;
1901 struct RAMSrcPageRequest *new_entry =
1902 g_malloc0(sizeof(struct RAMSrcPageRequest));
1903 new_entry->rb = ramblock;
1904 new_entry->offset = start;
1905 new_entry->len = len;
1907 memory_region_ref(ramblock->mr);
1908 qemu_mutex_lock(&rs->src_page_req_mutex);
1909 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1910 migration_make_urgent_request();
1911 qemu_mutex_unlock(&rs->src_page_req_mutex);
1913 return 0;
1916 static bool save_page_use_compression(RAMState *rs)
1918 if (!migrate_use_compression()) {
1919 return false;
1923 * If xbzrle is enabled (e.g., after first round of migration), stop
1924 * using the data compression. In theory, xbzrle can do better than
1925 * compression.
1927 if (rs->xbzrle_enabled) {
1928 return false;
1931 return true;
1935 * try to compress the page before posting it out, return true if the page
1936 * has been properly handled by compression, otherwise needs other
1937 * paths to handle it
1939 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1941 if (!save_page_use_compression(rs)) {
1942 return false;
1946 * When starting the process of a new block, the first page of
1947 * the block should be sent out before other pages in the same
1948 * block, and all the pages in last block should have been sent
1949 * out, keeping this order is important, because the 'cont' flag
1950 * is used to avoid resending the block name.
1952 * We post the fist page as normal page as compression will take
1953 * much CPU resource.
1955 if (block != rs->last_sent_block) {
1956 flush_compressed_data(rs);
1957 return false;
1960 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1961 return true;
1964 compression_counters.busy++;
1965 return false;
1969 * ram_save_target_page: save one target page
1971 * Returns the number of pages written
1973 * @rs: current RAM state
1974 * @pss: data about the page we want to send
1975 * @last_stage: if we are at the completion stage
1977 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1978 bool last_stage)
1980 RAMBlock *block = pss->block;
1981 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1982 int res;
1984 if (control_save_page(rs, block, offset, &res)) {
1985 return res;
1988 if (save_compress_page(rs, block, offset)) {
1989 return 1;
1992 res = save_zero_page(rs, block, offset);
1993 if (res > 0) {
1994 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1995 * page would be stale
1997 if (!save_page_use_compression(rs)) {
1998 XBZRLE_cache_lock();
1999 xbzrle_cache_zero_page(rs, block->offset + offset);
2000 XBZRLE_cache_unlock();
2002 ram_release_pages(block->idstr, offset, res);
2003 return res;
2007 * Do not use multifd for:
2008 * 1. Compression as the first page in the new block should be posted out
2009 * before sending the compressed page
2010 * 2. In postcopy as one whole host page should be placed
2012 if (!save_page_use_compression(rs) && migrate_use_multifd()
2013 && !migration_in_postcopy()) {
2014 return ram_save_multifd_page(rs, block, offset);
2017 return ram_save_page(rs, pss, last_stage);
2021 * ram_save_host_page: save a whole host page
2023 * Starting at *offset send pages up to the end of the current host
2024 * page. It's valid for the initial offset to point into the middle of
2025 * a host page in which case the remainder of the hostpage is sent.
2026 * Only dirty target pages are sent. Note that the host page size may
2027 * be a huge page for this block.
2028 * The saving stops at the boundary of the used_length of the block
2029 * if the RAMBlock isn't a multiple of the host page size.
2031 * Returns the number of pages written or negative on error
2033 * @rs: current RAM state
2034 * @ms: current migration state
2035 * @pss: data about the page we want to send
2036 * @last_stage: if we are at the completion stage
2038 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2039 bool last_stage)
2041 int tmppages, pages = 0;
2042 size_t pagesize_bits =
2043 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2044 unsigned long hostpage_boundary =
2045 QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
2046 unsigned long start_page = pss->page;
2047 int res;
2049 if (ramblock_is_ignored(pss->block)) {
2050 error_report("block %s should not be migrated !", pss->block->idstr);
2051 return 0;
2054 do {
2055 /* Check the pages is dirty and if it is send it */
2056 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2057 tmppages = ram_save_target_page(rs, pss, last_stage);
2058 if (tmppages < 0) {
2059 return tmppages;
2062 pages += tmppages;
2064 * Allow rate limiting to happen in the middle of huge pages if
2065 * something is sent in the current iteration.
2067 if (pagesize_bits > 1 && tmppages > 0) {
2068 migration_rate_limit();
2071 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2072 } while ((pss->page < hostpage_boundary) &&
2073 offset_in_ramblock(pss->block,
2074 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2075 /* The offset we leave with is the min boundary of host page and block */
2076 pss->page = MIN(pss->page, hostpage_boundary) - 1;
2078 res = ram_save_release_protection(rs, pss, start_page);
2079 return (res < 0 ? res : pages);
2083 * ram_find_and_save_block: finds a dirty page and sends it to f
2085 * Called within an RCU critical section.
2087 * Returns the number of pages written where zero means no dirty pages,
2088 * or negative on error
2090 * @rs: current RAM state
2091 * @last_stage: if we are at the completion stage
2093 * On systems where host-page-size > target-page-size it will send all the
2094 * pages in a host page that are dirty.
2097 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2099 PageSearchStatus pss;
2100 int pages = 0;
2101 bool again, found;
2103 /* No dirty page as there is zero RAM */
2104 if (!ram_bytes_total()) {
2105 return pages;
2108 pss.block = rs->last_seen_block;
2109 pss.page = rs->last_page;
2110 pss.complete_round = false;
2112 if (!pss.block) {
2113 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2116 do {
2117 again = true;
2118 found = get_queued_page(rs, &pss);
2120 if (!found) {
2121 /* priority queue empty, so just search for something dirty */
2122 found = find_dirty_block(rs, &pss, &again);
2125 if (found) {
2126 pages = ram_save_host_page(rs, &pss, last_stage);
2128 } while (!pages && again);
2130 rs->last_seen_block = pss.block;
2131 rs->last_page = pss.page;
2133 return pages;
2136 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2138 uint64_t pages = size / TARGET_PAGE_SIZE;
2140 if (zero) {
2141 ram_counters.duplicate += pages;
2142 } else {
2143 ram_counters.normal += pages;
2144 ram_counters.transferred += size;
2145 qemu_update_position(f, size);
2149 static uint64_t ram_bytes_total_common(bool count_ignored)
2151 RAMBlock *block;
2152 uint64_t total = 0;
2154 RCU_READ_LOCK_GUARD();
2156 if (count_ignored) {
2157 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2158 total += block->used_length;
2160 } else {
2161 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2162 total += block->used_length;
2165 return total;
/*
 * ram_bytes_total: total used length of the RAM blocks, not counting
 * ignored ones
 */
uint64_t ram_bytes_total(void)
{
    const bool count_ignored = false;

    return ram_bytes_total_common(count_ignored);
}
2173 static void xbzrle_load_setup(void)
2175 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2178 static void xbzrle_load_cleanup(void)
2180 g_free(XBZRLE.decoded_buf);
2181 XBZRLE.decoded_buf = NULL;
2184 static void ram_state_cleanup(RAMState **rsp)
2186 if (*rsp) {
2187 migration_page_queue_free(*rsp);
2188 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2189 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2190 g_free(*rsp);
2191 *rsp = NULL;
2195 static void xbzrle_cleanup(void)
2197 XBZRLE_cache_lock();
2198 if (XBZRLE.cache) {
2199 cache_fini(XBZRLE.cache);
2200 g_free(XBZRLE.encoded_buf);
2201 g_free(XBZRLE.current_buf);
2202 g_free(XBZRLE.zero_target_page);
2203 XBZRLE.cache = NULL;
2204 XBZRLE.encoded_buf = NULL;
2205 XBZRLE.current_buf = NULL;
2206 XBZRLE.zero_target_page = NULL;
2208 XBZRLE_cache_unlock();
2211 static void ram_save_cleanup(void *opaque)
2213 RAMState **rsp = opaque;
2214 RAMBlock *block;
2216 /* We don't use dirty log with background snapshots */
2217 if (!migrate_background_snapshot()) {
2218 /* caller have hold iothread lock or is in a bh, so there is
2219 * no writing race against the migration bitmap
2221 memory_global_dirty_log_stop();
2224 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2225 g_free(block->clear_bmap);
2226 block->clear_bmap = NULL;
2227 g_free(block->bmap);
2228 block->bmap = NULL;
2231 xbzrle_cleanup();
2232 compress_threads_save_cleanup();
2233 ram_state_cleanup(rsp);
2236 static void ram_state_reset(RAMState *rs)
2238 rs->last_seen_block = NULL;
2239 rs->last_sent_block = NULL;
2240 rs->last_page = 0;
2241 rs->last_version = ram_list.version;
2242 rs->xbzrle_enabled = false;
2245 #define MAX_WAIT 50 /* ms, half buffered_file limit */
/*
 * ram_debug_dump_bitmap: print a bitmap to stderr, 128 bits per line
 *
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 *
 * @todump: bitmap to dump; a NULL bitmap is silently ignored
 * @expected: the bit value most of the bitmap is expected to hold
 * @pages: number of bits in @todump
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

    /* There is no fallback bitmap to dump, so NULL means nothing to do */
    if (!todump) {
        return;
    }

    for (cur = 0; cur < pages; cur += linelen) {
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > pages) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            linebuf[curb] = '\0';
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}
2281 /* **** functions for postcopy ***** */
2283 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2285 struct RAMBlock *block;
2287 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2288 unsigned long *bitmap = block->bmap;
2289 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2290 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2292 while (run_start < range) {
2293 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2294 ram_discard_range(block->idstr,
2295 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2296 ((ram_addr_t)(run_end - run_start))
2297 << TARGET_PAGE_BITS);
2298 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2304 * postcopy_send_discard_bm_ram: discard a RAMBlock
2306 * Returns zero on success
2308 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2310 * @ms: current migration state
2311 * @block: RAMBlock to discard
2313 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2315 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2316 unsigned long current;
2317 unsigned long *bitmap = block->bmap;
2319 for (current = 0; current < end; ) {
2320 unsigned long one = find_next_bit(bitmap, end, current);
2321 unsigned long zero, discard_length;
2323 if (one >= end) {
2324 break;
2327 zero = find_next_zero_bit(bitmap, end, one + 1);
2329 if (zero >= end) {
2330 discard_length = end - one;
2331 } else {
2332 discard_length = zero - one;
2334 postcopy_discard_send_range(ms, one, discard_length);
2335 current = one + discard_length;
2338 return 0;
2342 * postcopy_each_ram_send_discard: discard all RAMBlocks
2344 * Returns 0 for success or negative for error
2346 * Utility for the outgoing postcopy code.
2347 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2348 * passing it bitmap indexes and name.
2349 * (qemu_ram_foreach_block ends up passing unscaled lengths
2350 * which would mean postcopy code would have to deal with target page)
2352 * @ms: current migration state
2354 static int postcopy_each_ram_send_discard(MigrationState *ms)
2356 struct RAMBlock *block;
2357 int ret;
2359 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2360 postcopy_discard_send_init(ms, block->idstr);
2363 * Postcopy sends chunks of bitmap over the wire, but it
2364 * just needs indexes at this point, avoids it having
2365 * target page specific code.
2367 ret = postcopy_send_discard_bm_ram(ms, block);
2368 postcopy_discard_send_finish(ms);
2369 if (ret) {
2370 return ret;
2374 return 0;
2378 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2380 * Helper for postcopy_chunk_hostpages; it's called twice to
2381 * canonicalize the two bitmaps, that are similar, but one is
2382 * inverted.
2384 * Postcopy requires that all target pages in a hostpage are dirty or
2385 * clean, not a mix. This function canonicalizes the bitmaps.
2387 * @ms: current migration state
2388 * @block: block that contains the page we want to canonicalize
2390 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2392 RAMState *rs = ram_state;
2393 unsigned long *bitmap = block->bmap;
2394 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2395 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2396 unsigned long run_start;
2398 if (block->page_size == TARGET_PAGE_SIZE) {
2399 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2400 return;
2403 /* Find a dirty page */
2404 run_start = find_next_bit(bitmap, pages, 0);
2406 while (run_start < pages) {
2409 * If the start of this run of pages is in the middle of a host
2410 * page, then we need to fixup this host page.
2412 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2413 /* Find the end of this run */
2414 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2416 * If the end isn't at the start of a host page, then the
2417 * run doesn't finish at the end of a host page
2418 * and we need to discard.
2422 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2423 unsigned long page;
2424 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2425 host_ratio);
2426 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2428 /* Clean up the bitmap */
2429 for (page = fixup_start_addr;
2430 page < fixup_start_addr + host_ratio; page++) {
2432 * Remark them as dirty, updating the count for any pages
2433 * that weren't previously dirty.
2435 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2439 /* Find the next dirty page for the next iteration */
2440 run_start = find_next_bit(bitmap, pages, run_start);
2445 * postcopy_chunk_hostpages: discard any partially sent host page
2447 * Utility for the outgoing postcopy code.
2449 * Discard any partially sent host-page size chunks, mark any partially
2450 * dirty host-page size chunks as all dirty. In this case the host-page
2451 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2453 * Returns zero on success
2455 * @ms: current migration state
2456 * @block: block we want to work with
2458 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2460 postcopy_discard_send_init(ms, block->idstr);
2463 * Ensure that all partially dirty host pages are made fully dirty.
2465 postcopy_chunk_hostpages_pass(ms, block);
2467 postcopy_discard_send_finish(ms);
2468 return 0;
2472 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2474 * Returns zero on success
2476 * Transmit the set of pages to be discarded after precopy to the target
2477 * these are pages that:
2478 * a) Have been previously transmitted but are now dirty again
2479 * b) Pages that have never been transmitted, this ensures that
2480 * any pages on the destination that have been mapped by background
2481 * tasks get discarded (transparent huge pages is the specific concern)
2482 * Hopefully this is pretty sparse
2484 * @ms: current migration state
2486 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2488 RAMState *rs = ram_state;
2489 RAMBlock *block;
2490 int ret;
2492 RCU_READ_LOCK_GUARD();
2494 /* This should be our last sync, the src is now paused */
2495 migration_bitmap_sync(rs);
2497 /* Easiest way to make sure we don't resume in the middle of a host-page */
2498 rs->last_seen_block = NULL;
2499 rs->last_sent_block = NULL;
2500 rs->last_page = 0;
2502 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2503 /* Deal with TPS != HPS and huge pages */
2504 ret = postcopy_chunk_hostpages(ms, block);
2505 if (ret) {
2506 return ret;
2509 #ifdef DEBUG_POSTCOPY
2510 ram_debug_dump_bitmap(block->bmap, true,
2511 block->used_length >> TARGET_PAGE_BITS);
2512 #endif
2514 trace_ram_postcopy_send_discard_bitmap();
2516 return postcopy_each_ram_send_discard(ms);
2520 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2522 * Returns zero on success
2524 * @rbname: name of the RAMBlock of the request. NULL means the
2525 * same that last one.
2526 * @start: RAMBlock starting page
2527 * @length: RAMBlock size
2529 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2531 trace_ram_discard_range(rbname, start, length);
2533 RCU_READ_LOCK_GUARD();
2534 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2536 if (!rb) {
2537 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2538 return -1;
2542 * On source VM, we don't need to update the received bitmap since
2543 * we don't even have one.
2545 if (rb->receivedmap) {
2546 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2547 length >> qemu_target_page_bits());
2550 return ram_block_discard_range(rb, start, length);
2554 * For every allocation, we will try not to crash the VM if the
2555 * allocation failed.
2557 static int xbzrle_init(void)
2559 Error *local_err = NULL;
2561 if (!migrate_use_xbzrle()) {
2562 return 0;
2565 XBZRLE_cache_lock();
2567 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2568 if (!XBZRLE.zero_target_page) {
2569 error_report("%s: Error allocating zero page", __func__);
2570 goto err_out;
2573 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2574 TARGET_PAGE_SIZE, &local_err);
2575 if (!XBZRLE.cache) {
2576 error_report_err(local_err);
2577 goto free_zero_page;
2580 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2581 if (!XBZRLE.encoded_buf) {
2582 error_report("%s: Error allocating encoded_buf", __func__);
2583 goto free_cache;
2586 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2587 if (!XBZRLE.current_buf) {
2588 error_report("%s: Error allocating current_buf", __func__);
2589 goto free_encoded_buf;
2592 /* We are all good */
2593 XBZRLE_cache_unlock();
2594 return 0;
2596 free_encoded_buf:
2597 g_free(XBZRLE.encoded_buf);
2598 XBZRLE.encoded_buf = NULL;
2599 free_cache:
2600 cache_fini(XBZRLE.cache);
2601 XBZRLE.cache = NULL;
2602 free_zero_page:
2603 g_free(XBZRLE.zero_target_page);
2604 XBZRLE.zero_target_page = NULL;
2605 err_out:
2606 XBZRLE_cache_unlock();
2607 return -ENOMEM;
2610 static int ram_state_init(RAMState **rsp)
2612 *rsp = g_try_new0(RAMState, 1);
2614 if (!*rsp) {
2615 error_report("%s: Init ramstate fail", __func__);
2616 return -1;
2619 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2620 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2621 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2624 * Count the total number of pages used by ram blocks not including any
2625 * gaps due to alignment or unplugs.
2626 * This must match with the initial values of dirty bitmap.
2628 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2629 ram_state_reset(*rsp);
2631 return 0;
2634 static void ram_list_init_bitmaps(void)
2636 MigrationState *ms = migrate_get_current();
2637 RAMBlock *block;
2638 unsigned long pages;
2639 uint8_t shift;
2641 /* Skip setting bitmap if there is no RAM */
2642 if (ram_bytes_total()) {
2643 shift = ms->clear_bitmap_shift;
2644 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2645 error_report("clear_bitmap_shift (%u) too big, using "
2646 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2647 shift = CLEAR_BITMAP_SHIFT_MAX;
2648 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2649 error_report("clear_bitmap_shift (%u) too small, using "
2650 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2651 shift = CLEAR_BITMAP_SHIFT_MIN;
2654 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2655 pages = block->max_length >> TARGET_PAGE_BITS;
2657 * The initial dirty bitmap for migration must be set with all
2658 * ones to make sure we'll migrate every guest RAM page to
2659 * destination.
2660 * Here we set RAMBlock.bmap all to 1 because when rebegin a
2661 * new migration after a failed migration, ram_list.
2662 * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
2663 * guest memory.
2665 block->bmap = bitmap_new(pages);
2666 bitmap_set(block->bmap, 0, pages);
2667 block->clear_bmap_shift = shift;
2668 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2673 static void ram_init_bitmaps(RAMState *rs)
2675 /* For memory_global_dirty_log_start below. */
2676 qemu_mutex_lock_iothread();
2677 qemu_mutex_lock_ramlist();
2679 WITH_RCU_READ_LOCK_GUARD() {
2680 ram_list_init_bitmaps();
2681 /* We don't use dirty log with background snapshots */
2682 if (!migrate_background_snapshot()) {
2683 memory_global_dirty_log_start();
2684 migration_bitmap_sync_precopy(rs);
2687 qemu_mutex_unlock_ramlist();
2688 qemu_mutex_unlock_iothread();
2691 static int ram_init_all(RAMState **rsp)
2693 if (ram_state_init(rsp)) {
2694 return -1;
2697 if (xbzrle_init()) {
2698 ram_state_cleanup(rsp);
2699 return -1;
2702 ram_init_bitmaps(*rsp);
2704 return 0;
2707 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2709 RAMBlock *block;
2710 uint64_t pages = 0;
2713 * Postcopy is not using xbzrle/compression, so no need for that.
2714 * Also, since source are already halted, we don't need to care
2715 * about dirty page logging as well.
2718 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2719 pages += bitmap_count_one(block->bmap,
2720 block->used_length >> TARGET_PAGE_BITS);
2723 /* This may not be aligned with current bitmaps. Recalculate. */
2724 rs->migration_dirty_pages = pages;
2726 ram_state_reset(rs);
2728 /* Update RAMState cache of output QEMUFile */
2729 rs->f = out;
2731 trace_ram_state_resume_prepare(pages);
2735 * This function clears bits of the free pages reported by the caller from the
2736 * migration dirty bitmap. @addr is the host address corresponding to the
2737 * start of the continuous guest free pages, and @len is the total bytes of
2738 * those pages.
2740 void qemu_guest_free_page_hint(void *addr, size_t len)
2742 RAMBlock *block;
2743 ram_addr_t offset;
2744 size_t used_len, start, npages;
2745 MigrationState *s = migrate_get_current();
2747 /* This function is currently expected to be used during live migration */
2748 if (!migration_is_setup_or_active(s->state)) {
2749 return;
2752 for (; len > 0; len -= used_len, addr += used_len) {
2753 block = qemu_ram_block_from_host(addr, false, &offset);
2754 if (unlikely(!block || offset >= block->used_length)) {
2756 * The implementation might not support RAMBlock resize during
2757 * live migration, but it could happen in theory with future
2758 * updates. So we add a check here to capture that case.
2760 error_report_once("%s unexpected error", __func__);
2761 return;
2764 if (len <= block->used_length - offset) {
2765 used_len = len;
2766 } else {
2767 used_len = block->used_length - offset;
2770 start = offset >> TARGET_PAGE_BITS;
2771 npages = used_len >> TARGET_PAGE_BITS;
2773 qemu_mutex_lock(&ram_state->bitmap_mutex);
2775 * The skipped free pages are equavalent to be sent from clear_bmap's
2776 * perspective, so clear the bits from the memory region bitmap which
2777 * are initially set. Otherwise those skipped pages will be sent in
2778 * the next round after syncing from the memory region bitmap.
2780 migration_clear_memory_region_dirty_bitmap_range(ram_state, block,
2781 start, npages);
2782 ram_state->migration_dirty_pages -=
2783 bitmap_count_one_with_offset(block->bmap, start, npages);
2784 bitmap_clear(block->bmap, start, npages);
2785 qemu_mutex_unlock(&ram_state->bitmap_mutex);
2790 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2791 * long-running RCU critical section. When rcu-reclaims in the code
2792 * start to become numerous it will be necessary to reduce the
2793 * granularity of these critical sections.
2797 * ram_save_setup: Setup RAM for migration
2799 * Returns zero to indicate success and negative for error
2801 * @f: QEMUFile where to send the data
2802 * @opaque: RAMState pointer
2804 static int ram_save_setup(QEMUFile *f, void *opaque)
2806 RAMState **rsp = opaque;
2807 RAMBlock *block;
2809 if (compress_threads_save_setup()) {
2810 return -1;
2813 /* migration has already setup the bitmap, reuse it. */
2814 if (!migration_in_colo_state()) {
2815 if (ram_init_all(rsp) != 0) {
2816 compress_threads_save_cleanup();
2817 return -1;
2820 (*rsp)->f = f;
2822 WITH_RCU_READ_LOCK_GUARD() {
2823 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2825 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2826 qemu_put_byte(f, strlen(block->idstr));
2827 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2828 qemu_put_be64(f, block->used_length);
2829 if (migrate_postcopy_ram() && block->page_size !=
2830 qemu_host_page_size) {
2831 qemu_put_be64(f, block->page_size);
2833 if (migrate_ignore_shared()) {
2834 qemu_put_be64(f, block->mr->addr);
2839 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2840 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2842 multifd_send_sync_main(f);
2843 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2844 qemu_fflush(f);
2846 return 0;
2850 * ram_save_iterate: iterative stage for migration
2852 * Returns zero to indicate success and negative for error
2854 * @f: QEMUFile where to send the data
2855 * @opaque: RAMState pointer
2857 static int ram_save_iterate(QEMUFile *f, void *opaque)
2859 RAMState **temp = opaque;
2860 RAMState *rs = *temp;
2861 int ret = 0;
2862 int i;
2863 int64_t t0;
2864 int done = 0;
2866 if (blk_mig_bulk_active()) {
2867 /* Avoid transferring ram during bulk phase of block migration as
2868 * the bulk phase will usually take a long time and transferring
2869 * ram updates during that time is pointless. */
2870 goto out;
2874 * We'll take this lock a little bit long, but it's okay for two reasons.
2875 * Firstly, the only possible other thread to take it is who calls
2876 * qemu_guest_free_page_hint(), which should be rare; secondly, see
2877 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
2878 * guarantees that we'll at least released it in a regular basis.
2880 qemu_mutex_lock(&rs->bitmap_mutex);
2881 WITH_RCU_READ_LOCK_GUARD() {
2882 if (ram_list.version != rs->last_version) {
2883 ram_state_reset(rs);
2886 /* Read version before ram_list.blocks */
2887 smp_rmb();
2889 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2891 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2892 i = 0;
2893 while ((ret = qemu_file_rate_limit(f)) == 0 ||
2894 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2895 int pages;
2897 if (qemu_file_get_error(f)) {
2898 break;
2901 pages = ram_find_and_save_block(rs, false);
2902 /* no more pages to sent */
2903 if (pages == 0) {
2904 done = 1;
2905 break;
2908 if (pages < 0) {
2909 qemu_file_set_error(f, pages);
2910 break;
2913 rs->target_page_count += pages;
2916 * During postcopy, it is necessary to make sure one whole host
2917 * page is sent in one chunk.
2919 if (migrate_postcopy_ram()) {
2920 flush_compressed_data(rs);
2924 * we want to check in the 1st loop, just in case it was the 1st
2925 * time and we had to sync the dirty bitmap.
2926 * qemu_clock_get_ns() is a bit expensive, so we only check each
2927 * some iterations
2929 if ((i & 63) == 0) {
2930 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
2931 1000000;
2932 if (t1 > MAX_WAIT) {
2933 trace_ram_save_iterate_big_wait(t1, i);
2934 break;
2937 i++;
2940 qemu_mutex_unlock(&rs->bitmap_mutex);
2943 * Must occur before EOS (or any QEMUFile operation)
2944 * because of RDMA protocol.
2946 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2948 out:
2949 if (ret >= 0
2950 && migration_is_setup_or_active(migrate_get_current()->state)) {
2951 multifd_send_sync_main(rs->f);
2952 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2953 qemu_fflush(f);
2954 ram_counters.transferred += 8;
2956 ret = qemu_file_get_error(f);
2958 if (ret < 0) {
2959 return ret;
2962 return done;
2966 * ram_save_complete: function called to send the remaining amount of ram
2968 * Returns zero to indicate success or negative on error
2970 * Called with iothread lock
2972 * @f: QEMUFile where to send the data
2973 * @opaque: RAMState pointer
2975 static int ram_save_complete(QEMUFile *f, void *opaque)
2977 RAMState **temp = opaque;
2978 RAMState *rs = *temp;
2979 int ret = 0;
2981 WITH_RCU_READ_LOCK_GUARD() {
2982 if (!migration_in_postcopy()) {
2983 migration_bitmap_sync_precopy(rs);
2986 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2988 /* try transferring iterative blocks of memory */
2990 /* flush all remaining blocks regardless of rate limiting */
2991 while (true) {
2992 int pages;
2994 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2995 /* no more blocks to sent */
2996 if (pages == 0) {
2997 break;
2999 if (pages < 0) {
3000 ret = pages;
3001 break;
3005 flush_compressed_data(rs);
3006 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3009 if (ret >= 0) {
3010 multifd_send_sync_main(rs->f);
3011 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3012 qemu_fflush(f);
3015 return ret;
3018 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3019 uint64_t *res_precopy_only,
3020 uint64_t *res_compatible,
3021 uint64_t *res_postcopy_only)
3023 RAMState **temp = opaque;
3024 RAMState *rs = *temp;
3025 uint64_t remaining_size;
3027 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3029 if (!migration_in_postcopy() &&
3030 remaining_size < max_size) {
3031 qemu_mutex_lock_iothread();
3032 WITH_RCU_READ_LOCK_GUARD() {
3033 migration_bitmap_sync_precopy(rs);
3035 qemu_mutex_unlock_iothread();
3036 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3039 if (migrate_postcopy_ram()) {
3040 /* We can do postcopy, and all the data is postcopiable */
3041 *res_compatible += remaining_size;
3042 } else {
3043 *res_precopy_only += remaining_size;
3047 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3049 unsigned int xh_len;
3050 int xh_flags;
3051 uint8_t *loaded_data;
3053 /* extract RLE header */
3054 xh_flags = qemu_get_byte(f);
3055 xh_len = qemu_get_be16(f);
3057 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3058 error_report("Failed to load XBZRLE page - wrong compression!");
3059 return -1;
3062 if (xh_len > TARGET_PAGE_SIZE) {
3063 error_report("Failed to load XBZRLE page - len overflow!");
3064 return -1;
3066 loaded_data = XBZRLE.decoded_buf;
3067 /* load data and decode */
3068 /* it can change loaded_data to point to an internal buffer */
3069 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3071 /* decode RLE */
3072 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3073 TARGET_PAGE_SIZE) == -1) {
3074 error_report("Failed to load XBZRLE page - decode error!");
3075 return -1;
3078 return 0;
3082 * ram_block_from_stream: read a RAMBlock id from the migration stream
3084 * Must be called from within a rcu critical section.
3086 * Returns a pointer from within the RCU-protected ram_list.
3088 * @f: QEMUFile where to read the data from
3089 * @flags: Page flags (mostly to see if it's a continuation of previous block)
3091 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3093 static RAMBlock *block;
3094 char id[256];
3095 uint8_t len;
3097 if (flags & RAM_SAVE_FLAG_CONTINUE) {
3098 if (!block) {
3099 error_report("Ack, bad migration stream!");
3100 return NULL;
3102 return block;
3105 len = qemu_get_byte(f);
3106 qemu_get_buffer(f, (uint8_t *)id, len);
3107 id[len] = 0;
3109 block = qemu_ram_block_by_name(id);
3110 if (!block) {
3111 error_report("Can't find block %s", id);
3112 return NULL;
3115 if (ramblock_is_ignored(block)) {
3116 error_report("block %s should not be migrated !", id);
3117 return NULL;
3120 return block;
3123 static inline void *host_from_ram_block_offset(RAMBlock *block,
3124 ram_addr_t offset)
3126 if (!offset_in_ramblock(block, offset)) {
3127 return NULL;
3130 return block->host + offset;
3133 static void *host_page_from_ram_block_offset(RAMBlock *block,
3134 ram_addr_t offset)
3136 /* Note: Explicitly no check against offset_in_ramblock(). */
3137 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3138 block->page_size);
3141 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3142 ram_addr_t offset)
3144 return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3147 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3148 ram_addr_t offset, bool record_bitmap)
3150 if (!offset_in_ramblock(block, offset)) {
3151 return NULL;
3153 if (!block->colo_cache) {
3154 error_report("%s: colo_cache is NULL in block :%s",
3155 __func__, block->idstr);
3156 return NULL;
3160 * During colo checkpoint, we need bitmap of these migrated pages.
3161 * It help us to decide which pages in ram cache should be flushed
3162 * into VM's RAM later.
3164 if (record_bitmap &&
3165 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3166 ram_state->migration_dirty_pages++;
3168 return block->colo_cache + offset;
/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.  The memory is left untouched
 * when @ch is zero and the range already reads as zero.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch == 0 && is_zero_range(host, size)) {
        /* Already all zeroes: avoid writing to the page */
        return;
    }

    memset(host, ch, size);
}
3188 /* return the size after decompression, or negative value on error */
3189 static int
3190 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3191 const uint8_t *source, size_t source_len)
3193 int err;
3195 err = inflateReset(stream);
3196 if (err != Z_OK) {
3197 return -1;
3200 stream->avail_in = source_len;
3201 stream->next_in = (uint8_t *)source;
3202 stream->avail_out = dest_len;
3203 stream->next_out = dest;
3205 err = inflate(stream, Z_NO_FLUSH);
3206 if (err != Z_STREAM_END) {
3207 return -1;
3210 return stream->total_out;
3213 static void *do_data_decompress(void *opaque)
3215 DecompressParam *param = opaque;
3216 unsigned long pagesize;
3217 uint8_t *des;
3218 int len, ret;
3220 qemu_mutex_lock(&param->mutex);
3221 while (!param->quit) {
3222 if (param->des) {
3223 des = param->des;
3224 len = param->len;
3225 param->des = 0;
3226 qemu_mutex_unlock(&param->mutex);
3228 pagesize = TARGET_PAGE_SIZE;
3230 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3231 param->compbuf, len);
3232 if (ret < 0 && migrate_get_current()->decompress_error_check) {
3233 error_report("decompress data failed");
3234 qemu_file_set_error(decomp_file, ret);
3237 qemu_mutex_lock(&decomp_done_lock);
3238 param->done = true;
3239 qemu_cond_signal(&decomp_done_cond);
3240 qemu_mutex_unlock(&decomp_done_lock);
3242 qemu_mutex_lock(&param->mutex);
3243 } else {
3244 qemu_cond_wait(&param->cond, &param->mutex);
3247 qemu_mutex_unlock(&param->mutex);
3249 return NULL;
3252 static int wait_for_decompress_done(void)
3254 int idx, thread_count;
3256 if (!migrate_use_compression()) {
3257 return 0;
3260 thread_count = migrate_decompress_threads();
3261 qemu_mutex_lock(&decomp_done_lock);
3262 for (idx = 0; idx < thread_count; idx++) {
3263 while (!decomp_param[idx].done) {
3264 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3267 qemu_mutex_unlock(&decomp_done_lock);
3268 return qemu_file_get_error(decomp_file);
3271 static void compress_threads_load_cleanup(void)
3273 int i, thread_count;
3275 if (!migrate_use_compression()) {
3276 return;
3278 thread_count = migrate_decompress_threads();
3279 for (i = 0; i < thread_count; i++) {
3281 * we use it as a indicator which shows if the thread is
3282 * properly init'd or not
3284 if (!decomp_param[i].compbuf) {
3285 break;
3288 qemu_mutex_lock(&decomp_param[i].mutex);
3289 decomp_param[i].quit = true;
3290 qemu_cond_signal(&decomp_param[i].cond);
3291 qemu_mutex_unlock(&decomp_param[i].mutex);
3293 for (i = 0; i < thread_count; i++) {
3294 if (!decomp_param[i].compbuf) {
3295 break;
3298 qemu_thread_join(decompress_threads + i);
3299 qemu_mutex_destroy(&decomp_param[i].mutex);
3300 qemu_cond_destroy(&decomp_param[i].cond);
3301 inflateEnd(&decomp_param[i].stream);
3302 g_free(decomp_param[i].compbuf);
3303 decomp_param[i].compbuf = NULL;
3305 g_free(decompress_threads);
3306 g_free(decomp_param);
3307 decompress_threads = NULL;
3308 decomp_param = NULL;
3309 decomp_file = NULL;
3312 static int compress_threads_load_setup(QEMUFile *f)
3314 int i, thread_count;
3316 if (!migrate_use_compression()) {
3317 return 0;
3320 thread_count = migrate_decompress_threads();
3321 decompress_threads = g_new0(QemuThread, thread_count);
3322 decomp_param = g_new0(DecompressParam, thread_count);
3323 qemu_mutex_init(&decomp_done_lock);
3324 qemu_cond_init(&decomp_done_cond);
3325 decomp_file = f;
3326 for (i = 0; i < thread_count; i++) {
3327 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3328 goto exit;
3331 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3332 qemu_mutex_init(&decomp_param[i].mutex);
3333 qemu_cond_init(&decomp_param[i].cond);
3334 decomp_param[i].done = true;
3335 decomp_param[i].quit = false;
3336 qemu_thread_create(decompress_threads + i, "decompress",
3337 do_data_decompress, decomp_param + i,
3338 QEMU_THREAD_JOINABLE);
3340 return 0;
3341 exit:
3342 compress_threads_load_cleanup();
3343 return -1;
3346 static void decompress_data_with_multi_threads(QEMUFile *f,
3347 void *host, int len)
3349 int idx, thread_count;
3351 thread_count = migrate_decompress_threads();
3352 QEMU_LOCK_GUARD(&decomp_done_lock);
3353 while (true) {
3354 for (idx = 0; idx < thread_count; idx++) {
3355 if (decomp_param[idx].done) {
3356 decomp_param[idx].done = false;
3357 qemu_mutex_lock(&decomp_param[idx].mutex);
3358 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3359 decomp_param[idx].des = host;
3360 decomp_param[idx].len = len;
3361 qemu_cond_signal(&decomp_param[idx].cond);
3362 qemu_mutex_unlock(&decomp_param[idx].mutex);
3363 break;
3366 if (idx < thread_count) {
3367 break;
3368 } else {
3369 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3374 static void colo_init_ram_state(void)
3376 ram_state_init(&ram_state);
3380 * colo cache: this is for secondary VM, we cache the whole
3381 * memory of the secondary VM, it is need to hold the global lock
3382 * to call this helper.
3384 int colo_init_ram_cache(void)
3386 RAMBlock *block;
3388 WITH_RCU_READ_LOCK_GUARD() {
3389 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3390 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3391 NULL, false, false);
3392 if (!block->colo_cache) {
3393 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3394 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3395 block->used_length);
3396 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3397 if (block->colo_cache) {
3398 qemu_anon_ram_free(block->colo_cache, block->used_length);
3399 block->colo_cache = NULL;
3402 return -errno;
3408 * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3409 * with to decide which page in cache should be flushed into SVM's RAM. Here
3410 * we use the same name 'ram_bitmap' as for migration.
3412 if (ram_bytes_total()) {
3413 RAMBlock *block;
3415 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3416 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3417 block->bmap = bitmap_new(pages);
3421 colo_init_ram_state();
3422 return 0;
3425 /* TODO: duplicated with ram_init_bitmaps */
3426 void colo_incoming_start_dirty_log(void)
3428 RAMBlock *block = NULL;
3429 /* For memory_global_dirty_log_start below. */
3430 qemu_mutex_lock_iothread();
3431 qemu_mutex_lock_ramlist();
3433 memory_global_dirty_log_sync();
3434 WITH_RCU_READ_LOCK_GUARD() {
3435 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3436 ramblock_sync_dirty_bitmap(ram_state, block);
3437 /* Discard this dirty bitmap record */
3438 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3440 memory_global_dirty_log_start();
3442 ram_state->migration_dirty_pages = 0;
3443 qemu_mutex_unlock_ramlist();
3444 qemu_mutex_unlock_iothread();
3447 /* It is need to hold the global lock to call this helper */
3448 void colo_release_ram_cache(void)
3450 RAMBlock *block;
3452 memory_global_dirty_log_stop();
3453 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3454 g_free(block->bmap);
3455 block->bmap = NULL;
3458 WITH_RCU_READ_LOCK_GUARD() {
3459 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3460 if (block->colo_cache) {
3461 qemu_anon_ram_free(block->colo_cache, block->used_length);
3462 block->colo_cache = NULL;
3466 ram_state_cleanup(&ram_state);
3470 * ram_load_setup: Setup RAM for migration incoming side
3472 * Returns zero to indicate success and negative for error
3474 * @f: QEMUFile where to receive the data
3475 * @opaque: RAMState pointer
3477 static int ram_load_setup(QEMUFile *f, void *opaque)
3479 if (compress_threads_load_setup(f)) {
3480 return -1;
3483 xbzrle_load_setup();
3484 ramblock_recv_map_init();
3486 return 0;
3489 static int ram_load_cleanup(void *opaque)
3491 RAMBlock *rb;
3493 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3494 qemu_ram_block_writeback(rb);
3497 xbzrle_load_cleanup();
3498 compress_threads_load_cleanup();
3500 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3501 g_free(rb->receivedmap);
3502 rb->receivedmap = NULL;
3505 return 0;
3509 * ram_postcopy_incoming_init: allocate postcopy data structures
3511 * Returns 0 for success and negative if there was one error
3513 * @mis: current migration incoming state
3515 * Allocate data structures etc needed by incoming migration with
3516 * postcopy-ram. postcopy-ram's similarly names
3517 * postcopy_ram_incoming_init does the work.
3519 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3521 return postcopy_ram_incoming_init(mis);
3525 * ram_load_postcopy: load a page in postcopy case
3527 * Returns 0 for success or -errno in case of error
3529 * Called in postcopy mode by ram_load().
3530 * rcu_read_lock is taken prior to this being called.
3532 * @f: QEMUFile where to send the data
3534 static int ram_load_postcopy(QEMUFile *f)
3536 int flags = 0, ret = 0;
3537 bool place_needed = false;
3538 bool matches_target_page_size = false;
3539 MigrationIncomingState *mis = migration_incoming_get_current();
3540 /* Temporary page that is later 'placed' */
3541 void *postcopy_host_page = mis->postcopy_tmp_page;
3542 void *host_page = NULL;
3543 bool all_zero = true;
3544 int target_pages = 0;
3546 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3547 ram_addr_t addr;
3548 void *page_buffer = NULL;
3549 void *place_source = NULL;
3550 RAMBlock *block = NULL;
3551 uint8_t ch;
3552 int len;
3554 addr = qemu_get_be64(f);
3557 * If qemu file error, we should stop here, and then "addr"
3558 * may be invalid
3560 ret = qemu_file_get_error(f);
3561 if (ret) {
3562 break;
3565 flags = addr & ~TARGET_PAGE_MASK;
3566 addr &= TARGET_PAGE_MASK;
3568 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3569 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3570 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3571 block = ram_block_from_stream(f, flags);
3572 if (!block) {
3573 ret = -EINVAL;
3574 break;
3578 * Relying on used_length is racy and can result in false positives.
3579 * We might place pages beyond used_length in case RAM was shrunk
3580 * while in postcopy, which is fine - trying to place via
3581 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
3583 if (!block->host || addr >= block->postcopy_length) {
3584 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3585 ret = -EINVAL;
3586 break;
3588 target_pages++;
3589 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3591 * Postcopy requires that we place whole host pages atomically;
3592 * these may be huge pages for RAMBlocks that are backed by
3593 * hugetlbfs.
3594 * To make it atomic, the data is read into a temporary page
3595 * that's moved into place later.
3596 * The migration protocol uses, possibly smaller, target-pages
3597 * however the source ensures it always sends all the components
3598 * of a host page in one chunk.
3600 page_buffer = postcopy_host_page +
3601 host_page_offset_from_ram_block_offset(block, addr);
3602 /* If all TP are zero then we can optimise the place */
3603 if (target_pages == 1) {
3604 host_page = host_page_from_ram_block_offset(block, addr);
3605 } else if (host_page != host_page_from_ram_block_offset(block,
3606 addr)) {
3607 /* not the 1st TP within the HP */
3608 error_report("Non-same host page %p/%p", host_page,
3609 host_page_from_ram_block_offset(block, addr));
3610 ret = -EINVAL;
3611 break;
3615 * If it's the last part of a host page then we place the host
3616 * page
3618 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3619 place_needed = true;
3621 place_source = postcopy_host_page;
3624 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3625 case RAM_SAVE_FLAG_ZERO:
3626 ch = qemu_get_byte(f);
3628 * Can skip to set page_buffer when
3629 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3631 if (ch || !matches_target_page_size) {
3632 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3634 if (ch) {
3635 all_zero = false;
3637 break;
3639 case RAM_SAVE_FLAG_PAGE:
3640 all_zero = false;
3641 if (!matches_target_page_size) {
3642 /* For huge pages, we always use temporary buffer */
3643 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3644 } else {
3646 * For small pages that matches target page size, we
3647 * avoid the qemu_file copy. Instead we directly use
3648 * the buffer of QEMUFile to place the page. Note: we
3649 * cannot do any QEMUFile operation before using that
3650 * buffer to make sure the buffer is valid when
3651 * placing the page.
3653 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3654 TARGET_PAGE_SIZE);
3656 break;
3657 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3658 all_zero = false;
3659 len = qemu_get_be32(f);
3660 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3661 error_report("Invalid compressed data length: %d", len);
3662 ret = -EINVAL;
3663 break;
3665 decompress_data_with_multi_threads(f, page_buffer, len);
3666 break;
3668 case RAM_SAVE_FLAG_EOS:
3669 /* normal exit */
3670 multifd_recv_sync_main();
3671 break;
3672 default:
3673 error_report("Unknown combination of migration flags: 0x%x"
3674 " (postcopy mode)", flags);
3675 ret = -EINVAL;
3676 break;
3679 /* Got the whole host page, wait for decompress before placing. */
3680 if (place_needed) {
3681 ret |= wait_for_decompress_done();
3684 /* Detect for any possible file errors */
3685 if (!ret && qemu_file_get_error(f)) {
3686 ret = qemu_file_get_error(f);
3689 if (!ret && place_needed) {
3690 if (all_zero) {
3691 ret = postcopy_place_page_zero(mis, host_page, block);
3692 } else {
3693 ret = postcopy_place_page(mis, host_page, place_source,
3694 block);
3696 place_needed = false;
3697 target_pages = 0;
3698 /* Assume we have a zero page until we detect something different */
3699 all_zero = true;
3703 return ret;
3706 static bool postcopy_is_advised(void)
3708 PostcopyState ps = postcopy_state_get();
3709 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3712 static bool postcopy_is_running(void)
3714 PostcopyState ps = postcopy_state_get();
3715 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3719 * Flush content of RAM cache into SVM's memory.
3720 * Only flush the pages that be dirtied by PVM or SVM or both.
3722 void colo_flush_ram_cache(void)
3724 RAMBlock *block = NULL;
3725 void *dst_host;
3726 void *src_host;
3727 unsigned long offset = 0;
3729 memory_global_dirty_log_sync();
3730 qemu_mutex_lock(&ram_state->bitmap_mutex);
3731 WITH_RCU_READ_LOCK_GUARD() {
3732 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3733 ramblock_sync_dirty_bitmap(ram_state, block);
3737 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3738 WITH_RCU_READ_LOCK_GUARD() {
3739 block = QLIST_FIRST_RCU(&ram_list.blocks);
3741 while (block) {
3742 offset = migration_bitmap_find_dirty(ram_state, block, offset);
3744 if (!offset_in_ramblock(block,
3745 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
3746 offset = 0;
3747 block = QLIST_NEXT_RCU(block, next);
3748 } else {
3749 migration_bitmap_clear_dirty(ram_state, block, offset);
3750 dst_host = block->host
3751 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3752 src_host = block->colo_cache
3753 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3754 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3758 trace_colo_flush_ram_cache_end();
3759 qemu_mutex_unlock(&ram_state->bitmap_mutex);
3763 * ram_load_precopy: load pages in precopy case
3765 * Returns 0 for success or -errno in case of error
3767 * Called in precopy mode by ram_load().
3768 * rcu_read_lock is taken prior to this being called.
3770 * @f: QEMUFile where to send the data
3772 static int ram_load_precopy(QEMUFile *f)
3774 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3775 /* ADVISE is earlier, it shows the source has the postcopy capability on */
3776 bool postcopy_advised = postcopy_is_advised();
3777 if (!migrate_use_compression()) {
3778 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3781 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3782 ram_addr_t addr, total_ram_bytes;
3783 void *host = NULL, *host_bak = NULL;
3784 uint8_t ch;
3787 * Yield periodically to let main loop run, but an iteration of
3788 * the main loop is expensive, so do it each some iterations
3790 if ((i & 32767) == 0 && qemu_in_coroutine()) {
3791 aio_co_schedule(qemu_get_current_aio_context(),
3792 qemu_coroutine_self());
3793 qemu_coroutine_yield();
3795 i++;
3797 addr = qemu_get_be64(f);
3798 flags = addr & ~TARGET_PAGE_MASK;
3799 addr &= TARGET_PAGE_MASK;
3801 if (flags & invalid_flags) {
3802 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3803 error_report("Received an unexpected compressed page");
3806 ret = -EINVAL;
3807 break;
3810 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3811 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3812 RAMBlock *block = ram_block_from_stream(f, flags);
3814 host = host_from_ram_block_offset(block, addr);
3816 * After going into COLO stage, we should not load the page
3817 * into SVM's memory directly, we put them into colo_cache firstly.
3818 * NOTE: We need to keep a copy of SVM's ram in colo_cache.
3819 * Previously, we copied all these memory in preparing stage of COLO
3820 * while we need to stop VM, which is a time-consuming process.
3821 * Here we optimize it by a trick, back-up every page while in
3822 * migration process while COLO is enabled, though it affects the
3823 * speed of the migration, but it obviously reduce the downtime of
3824 * back-up all SVM'S memory in COLO preparing stage.
3826 if (migration_incoming_colo_enabled()) {
3827 if (migration_incoming_in_colo_state()) {
3828 /* In COLO stage, put all pages into cache temporarily */
3829 host = colo_cache_from_block_offset(block, addr, true);
3830 } else {
3832 * In migration stage but before COLO stage,
3833 * Put all pages into both cache and SVM's memory.
3835 host_bak = colo_cache_from_block_offset(block, addr, false);
3838 if (!host) {
3839 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3840 ret = -EINVAL;
3841 break;
3843 if (!migration_incoming_in_colo_state()) {
3844 ramblock_recv_bitmap_set(block, host);
3847 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3850 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3851 case RAM_SAVE_FLAG_MEM_SIZE:
3852 /* Synchronize RAM block list */
3853 total_ram_bytes = addr;
3854 while (!ret && total_ram_bytes) {
3855 RAMBlock *block;
3856 char id[256];
3857 ram_addr_t length;
3859 len = qemu_get_byte(f);
3860 qemu_get_buffer(f, (uint8_t *)id, len);
3861 id[len] = 0;
3862 length = qemu_get_be64(f);
3864 block = qemu_ram_block_by_name(id);
3865 if (block && !qemu_ram_is_migratable(block)) {
3866 error_report("block %s should not be migrated !", id);
3867 ret = -EINVAL;
3868 } else if (block) {
3869 if (length != block->used_length) {
3870 Error *local_err = NULL;
3872 ret = qemu_ram_resize(block, length,
3873 &local_err);
3874 if (local_err) {
3875 error_report_err(local_err);
3878 /* For postcopy we need to check hugepage sizes match */
3879 if (postcopy_advised && migrate_postcopy_ram() &&
3880 block->page_size != qemu_host_page_size) {
3881 uint64_t remote_page_size = qemu_get_be64(f);
3882 if (remote_page_size != block->page_size) {
3883 error_report("Mismatched RAM page size %s "
3884 "(local) %zd != %" PRId64,
3885 id, block->page_size,
3886 remote_page_size);
3887 ret = -EINVAL;
3890 if (migrate_ignore_shared()) {
3891 hwaddr addr = qemu_get_be64(f);
3892 if (ramblock_is_ignored(block) &&
3893 block->mr->addr != addr) {
3894 error_report("Mismatched GPAs for block %s "
3895 "%" PRId64 "!= %" PRId64,
3896 id, (uint64_t)addr,
3897 (uint64_t)block->mr->addr);
3898 ret = -EINVAL;
3901 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3902 block->idstr);
3903 } else {
3904 error_report("Unknown ramblock \"%s\", cannot "
3905 "accept migration", id);
3906 ret = -EINVAL;
3909 total_ram_bytes -= length;
3911 break;
3913 case RAM_SAVE_FLAG_ZERO:
3914 ch = qemu_get_byte(f);
3915 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3916 break;
3918 case RAM_SAVE_FLAG_PAGE:
3919 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3920 break;
3922 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3923 len = qemu_get_be32(f);
3924 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3925 error_report("Invalid compressed data length: %d", len);
3926 ret = -EINVAL;
3927 break;
3929 decompress_data_with_multi_threads(f, host, len);
3930 break;
3932 case RAM_SAVE_FLAG_XBZRLE:
3933 if (load_xbzrle(f, addr, host) < 0) {
3934 error_report("Failed to decompress XBZRLE page at "
3935 RAM_ADDR_FMT, addr);
3936 ret = -EINVAL;
3937 break;
3939 break;
3940 case RAM_SAVE_FLAG_EOS:
3941 /* normal exit */
3942 multifd_recv_sync_main();
3943 break;
3944 default:
3945 if (flags & RAM_SAVE_FLAG_HOOK) {
3946 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3947 } else {
3948 error_report("Unknown combination of migration flags: 0x%x",
3949 flags);
3950 ret = -EINVAL;
3953 if (!ret) {
3954 ret = qemu_file_get_error(f);
3956 if (!ret && host_bak) {
3957 memcpy(host_bak, host, TARGET_PAGE_SIZE);
3961 ret |= wait_for_decompress_done();
3962 return ret;
3965 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3967 int ret = 0;
3968 static uint64_t seq_iter;
3970 * If system is running in postcopy mode, page inserts to host memory must
3971 * be atomic
3973 bool postcopy_running = postcopy_is_running();
3975 seq_iter++;
3977 if (version_id != 4) {
3978 return -EINVAL;
3982 * This RCU critical section can be very long running.
3983 * When RCU reclaims in the code start to become numerous,
3984 * it will be necessary to reduce the granularity of this
3985 * critical section.
3987 WITH_RCU_READ_LOCK_GUARD() {
3988 if (postcopy_running) {
3989 ret = ram_load_postcopy(f);
3990 } else {
3991 ret = ram_load_precopy(f);
3994 trace_ram_load_complete(ret, seq_iter);
3996 return ret;
3999 static bool ram_has_postcopy(void *opaque)
4001 RAMBlock *rb;
4002 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4003 if (ramblock_is_pmem(rb)) {
4004 info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4005 "is not supported now!", rb->idstr, rb->host);
4006 return false;
4010 return migrate_postcopy_ram();
4013 /* Sync all the dirty bitmap with destination VM. */
4014 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4016 RAMBlock *block;
4017 QEMUFile *file = s->to_dst_file;
4018 int ramblock_count = 0;
4020 trace_ram_dirty_bitmap_sync_start();
4022 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4023 qemu_savevm_send_recv_bitmap(file, block->idstr);
4024 trace_ram_dirty_bitmap_request(block->idstr);
4025 ramblock_count++;
4028 trace_ram_dirty_bitmap_sync_wait();
4030 /* Wait until all the ramblocks' dirty bitmap synced */
4031 while (ramblock_count--) {
4032 qemu_sem_wait(&s->rp_state.rp_sem);
4035 trace_ram_dirty_bitmap_sync_complete();
4037 return 0;
4040 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4042 qemu_sem_post(&s->rp_state.rp_sem);
4046 * Read the received bitmap, revert it as the initial dirty bitmap.
4047 * This is only used when the postcopy migration is paused but wants
4048 * to resume from a middle point.
4050 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4052 int ret = -EINVAL;
4053 /* from_dst_file is always valid because we're within rp_thread */
4054 QEMUFile *file = s->rp_state.from_dst_file;
4055 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4056 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4057 uint64_t size, end_mark;
4059 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4061 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4062 error_report("%s: incorrect state %s", __func__,
4063 MigrationStatus_str(s->state));
4064 return -EINVAL;
4068 * Note: see comments in ramblock_recv_bitmap_send() on why we
4069 * need the endianness conversion, and the paddings.
4071 local_size = ROUND_UP(local_size, 8);
4073 /* Add paddings */
4074 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4076 size = qemu_get_be64(file);
4078 /* The size of the bitmap should match with our ramblock */
4079 if (size != local_size) {
4080 error_report("%s: ramblock '%s' bitmap size mismatch "
4081 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4082 block->idstr, size, local_size);
4083 ret = -EINVAL;
4084 goto out;
4087 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4088 end_mark = qemu_get_be64(file);
4090 ret = qemu_file_get_error(file);
4091 if (ret || size != local_size) {
4092 error_report("%s: read bitmap failed for ramblock '%s': %d"
4093 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4094 __func__, block->idstr, ret, local_size, size);
4095 ret = -EIO;
4096 goto out;
4099 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4100 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4101 __func__, block->idstr, end_mark);
4102 ret = -EINVAL;
4103 goto out;
4107 * Endianness conversion. We are during postcopy (though paused).
4108 * The dirty bitmap won't change. We can directly modify it.
4110 bitmap_from_le(block->bmap, le_bitmap, nbits);
4113 * What we received is "received bitmap". Revert it as the initial
4114 * dirty bitmap for this ramblock.
4116 bitmap_complement(block->bmap, block->bmap, nbits);
4118 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4121 * We succeeded to sync bitmap for current ramblock. If this is
4122 * the last one to sync, we need to notify the main send thread.
4124 ram_dirty_bitmap_reload_notify(s);
4126 ret = 0;
4127 out:
4128 g_free(le_bitmap);
4129 return ret;
4132 static int ram_resume_prepare(MigrationState *s, void *opaque)
4134 RAMState *rs = *(RAMState **)opaque;
4135 int ret;
4137 ret = ram_dirty_bitmap_sync_all(s, rs);
4138 if (ret) {
4139 return ret;
4142 ram_state_resume_prepare(rs, s->to_dst_file);
4144 return 0;
4147 static SaveVMHandlers savevm_ram_handlers = {
4148 .save_setup = ram_save_setup,
4149 .save_live_iterate = ram_save_iterate,
4150 .save_live_complete_postcopy = ram_save_complete,
4151 .save_live_complete_precopy = ram_save_complete,
4152 .has_postcopy = ram_has_postcopy,
4153 .save_live_pending = ram_save_pending,
4154 .load_state = ram_load,
4155 .save_cleanup = ram_save_cleanup,
4156 .load_setup = ram_load_setup,
4157 .load_cleanup = ram_load_cleanup,
4158 .resume_prepare = ram_resume_prepare,
4161 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4162 size_t old_size, size_t new_size)
4164 PostcopyState ps = postcopy_state_get();
4165 ram_addr_t offset;
4166 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4167 Error *err = NULL;
4169 if (ramblock_is_ignored(rb)) {
4170 return;
4173 if (!migration_is_idle()) {
4175 * Precopy code on the source cannot deal with the size of RAM blocks
4176 * changing at random points in time - especially after sending the
4177 * RAM block sizes in the migration stream, they must no longer change.
4178 * Abort and indicate a proper reason.
4180 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4181 migrate_set_error(migrate_get_current(), err);
4182 error_free(err);
4183 migration_cancel();
4186 switch (ps) {
4187 case POSTCOPY_INCOMING_ADVISE:
4189 * Update what ram_postcopy_incoming_init()->init_range() does at the
4190 * time postcopy was advised. Syncing RAM blocks with the source will
4191 * result in RAM resizes.
4193 if (old_size < new_size) {
4194 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4195 error_report("RAM block '%s' discard of resized RAM failed",
4196 rb->idstr);
4199 rb->postcopy_length = new_size;
4200 break;
4201 case POSTCOPY_INCOMING_NONE:
4202 case POSTCOPY_INCOMING_RUNNING:
4203 case POSTCOPY_INCOMING_END:
4205 * Once our guest is running, postcopy does no longer care about
4206 * resizes. When growing, the new memory was not available on the
4207 * source, no handler needed.
4209 break;
4210 default:
4211 error_report("RAM block '%s' resized during postcopy state: %d",
4212 rb->idstr, ps);
4213 exit(-1);
4217 static RAMBlockNotifier ram_mig_ram_notifier = {
4218 .ram_block_resized = ram_mig_ram_block_resized,
4221 void ram_mig_init(void)
4223 qemu_mutex_init(&XBZRLE.lock);
4224 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4225 ram_block_notifier_add(&ram_mig_ram_notifier);