Juan Quintela56e93d22015-05-07 19:33:31 +02001/*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
Juan Quintela76cc7b52015-05-08 13:20:21 +02005 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
Juan Quintela56e93d22015-05-07 19:33:31 +02009 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
Markus Armbrustere688df62018-02-01 12:18:31 +010028
Peter Maydell1393a482016-01-26 18:16:54 +000029#include "qemu/osdep.h"
Veronia Bahaaf348b6d2016-03-20 19:16:19 +020030#include "qemu/cutils.h"
Juan Quintela56e93d22015-05-07 19:33:31 +020031#include "qemu/bitops.h"
32#include "qemu/bitmap.h"
Peter Maydellb85ea5f2022-02-08 20:08:52 +000033#include "qemu/madvise.h"
Juan Quintela7205c9e2015-05-08 13:54:36 +020034#include "qemu/main-loop.h"
Daniel P. Berrangéc0e08252022-06-20 12:01:46 +010035#include "io/channel-null.h"
Juan Quintela709e3fe2017-04-05 21:47:50 +020036#include "xbzrle.h"
Juan Quintela7b1e1a22017-04-17 20:26:27 +020037#include "ram.h"
Juan Quintela6666c962017-04-24 20:07:27 +020038#include "migration.h"
Juan Quintelaf2a8f0a2017-04-24 13:42:55 +020039#include "migration/register.h"
Juan Quintela7b1e1a22017-04-17 20:26:27 +020040#include "migration/misc.h"
Juan Quintela08a0aee2017-04-20 18:52:18 +020041#include "qemu-file.h"
Juan Quintelabe07b0a2017-04-20 13:12:24 +020042#include "postcopy-ram.h"
Michael S. Tsirkin53d37d32018-05-03 22:50:51 +030043#include "page_cache.h"
Juan Quintela56e93d22015-05-07 19:33:31 +020044#include "qemu/error-report.h"
Markus Armbrustere688df62018-02-01 12:18:31 +010045#include "qapi/error.h"
Juan Quintelaab7cbb02019-05-15 13:37:46 +020046#include "qapi/qapi-types-migration.h"
Markus Armbruster9af23982018-02-11 10:36:01 +010047#include "qapi/qapi-events-migration.h"
Juan Quintela8acabf62017-10-05 22:00:31 +020048#include "qapi/qmp/qerror.h"
Juan Quintela56e93d22015-05-07 19:33:31 +020049#include "trace.h"
Juan Quintela56e93d22015-05-07 19:33:31 +020050#include "exec/ram_addr.h"
Alexey Perevalovf9494612017-10-05 14:13:20 +030051#include "exec/target_page.h"
Juan Quintela56e93d22015-05-07 19:33:31 +020052#include "qemu/rcu_queue.h"
zhanghailianga91246c2016-10-27 14:42:59 +080053#include "migration/colo.h"
Michael S. Tsirkin53d37d32018-05-03 22:50:51 +030054#include "block.h"
Claudio Fontanab0c3cf92020-06-29 11:35:03 +020055#include "sysemu/cpu-throttle.h"
Peter Xuedd090c2018-05-02 18:47:32 +080056#include "savevm.h"
Juan Quintelab9ee2f72016-01-15 11:40:13 +010057#include "qemu/iov.h"
Juan Quintelad32ca5a2020-01-22 16:16:07 +010058#include "multifd.h"
Andrey Gruzdev278e2f52021-01-29 13:14:05 +030059#include "sysemu/runstate.h"
60
Lukas Straube5fdf922021-07-04 18:14:44 +020061#include "hw/boards.h" /* for machine_dump_guest_core() */
62
Andrey Gruzdev278e2f52021-01-29 13:14:05 +030063#if defined(__linux__)
64#include "qemu/userfaultfd.h"
65#endif /* defined(__linux__) */
Juan Quintela56e93d22015-05-07 19:33:31 +020066
Juan Quintela56e93d22015-05-07 19:33:31 +020067/***********************************************************/
68/* ram save/restore */
69
Juan Quintelabb890ed2017-04-28 09:39:55 +020070/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
71 * worked for pages that were filled with the same char. We switched
72 * it to only search for the zero value, and renamed it to avoid
73 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
74 */
75
Juan Quintela56e93d22015-05-07 19:33:31 +020076#define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
Juan Quintelabb890ed2017-04-28 09:39:55 +020077#define RAM_SAVE_FLAG_ZERO 0x02
Juan Quintela56e93d22015-05-07 19:33:31 +020078#define RAM_SAVE_FLAG_MEM_SIZE 0x04
79#define RAM_SAVE_FLAG_PAGE 0x08
80#define RAM_SAVE_FLAG_EOS 0x10
81#define RAM_SAVE_FLAG_CONTINUE 0x20
82#define RAM_SAVE_FLAG_XBZRLE 0x40
83/* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
84#define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
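
/*
 * Illustration (a sketch; it assumes 4K target pages, so the exact bit
 * positions are illustrative): the flags above are OR'ed into the low bits
 * of the page-aligned offset and sent as one be64 word, as done later in
 * save_page_header():
 *
 *     offset |= RAM_SAVE_FLAG_CONTINUE;   (same block as the last page)
 *     qemu_put_be64(f, offset);           (bits 63..12 offset, 11..0 flags)
 *
 * Any bit below TARGET_PAGE_BITS is therefore free to carry a flag.
 */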
85
Juan Quintela93604472017-06-06 19:49:03 +020086XBZRLECacheStats xbzrle_counters;
87
Peter Xuf1668762022-10-11 17:55:55 -040088/* used by the search for pages to send */
89struct PageSearchStatus {
90 /* The migration channel used for a specific host page */
91 QEMUFile *pss_channel;
Peter Xuec6f3ab2022-10-11 17:55:56 -040092 /* Last block from where we have sent data */
93 RAMBlock *last_sent_block;
Peter Xuf1668762022-10-11 17:55:55 -040094 /* Current block being searched */
95 RAMBlock *block;
96 /* Current page to search from */
97 unsigned long page;
98 /* Set once we wrap around */
99 bool complete_round;
Peter Xuf1668762022-10-11 17:55:55 -0400100 /* Whether we're sending a host page */
101 bool host_page_sending;
102 /* The start/end of current host page. Invalid if host_page_sending==false */
103 unsigned long host_page_start;
104 unsigned long host_page_end;
105};
106typedef struct PageSearchStatus PageSearchStatus;
107
Juan Quintela56e93d22015-05-07 19:33:31 +0200108/* This struct contains the XBZRLE cache and a static page
109 used by the compression */
110static struct {
111 /* buffer used for XBZRLE encoding */
112 uint8_t *encoded_buf;
113 /* buffer for storing page content */
114 uint8_t *current_buf;
115 /* Cache for XBZRLE, Protected by lock. */
116 PageCache *cache;
117 QemuMutex lock;
Juan Quintelac00e0922017-05-09 16:22:01 +0200118 /* it will store a page full of zeros */
119 uint8_t *zero_target_page;
Juan Quintelaf265e0e2017-06-28 11:52:27 +0200120 /* buffer used for XBZRLE decoding */
121 uint8_t *decoded_buf;
Juan Quintela56e93d22015-05-07 19:33:31 +0200122} XBZRLE;
123
Juan Quintela56e93d22015-05-07 19:33:31 +0200124static void XBZRLE_cache_lock(void)
125{
Bihong Yuf4c51a62020-10-20 11:10:45 +0800126 if (migrate_use_xbzrle()) {
Juan Quintela56e93d22015-05-07 19:33:31 +0200127 qemu_mutex_lock(&XBZRLE.lock);
Bihong Yuf4c51a62020-10-20 11:10:45 +0800128 }
Juan Quintela56e93d22015-05-07 19:33:31 +0200129}
130
131static void XBZRLE_cache_unlock(void)
132{
Bihong Yuf4c51a62020-10-20 11:10:45 +0800133 if (migrate_use_xbzrle()) {
Juan Quintela56e93d22015-05-07 19:33:31 +0200134 qemu_mutex_unlock(&XBZRLE.lock);
Bihong Yuf4c51a62020-10-20 11:10:45 +0800135 }
Juan Quintela56e93d22015-05-07 19:33:31 +0200136}
137
Juan Quintela3d0684b2017-03-23 15:06:39 +0100138/**
139 * xbzrle_cache_resize: resize the xbzrle cache
140 *
Daniel P. Berrangécbde7be2021-02-19 18:40:12 +0000141 * This function is called from migrate_params_apply in the main
Juan Quintela3d0684b2017-03-23 15:06:39 +0100142 * thread, possibly while a migration is in progress. A running
143 * migration may be using the cache and might finish during this call,
144 * hence changes to the cache are protected by the XBZRLE.lock mutex.
145 *
Juan Quintelac9dede22017-10-06 23:03:55 +0200146 * Returns 0 for success or -1 for error
Juan Quintela3d0684b2017-03-23 15:06:39 +0100147 *
148 * @new_size: new cache size
Juan Quintela8acabf62017-10-05 22:00:31 +0200149 * @errp: set *errp with the reason if the resize fails
Juan Quintela56e93d22015-05-07 19:33:31 +0200150 */
Markus Armbruster8b9407a2021-02-02 15:17:32 +0100151int xbzrle_cache_resize(uint64_t new_size, Error **errp)
Juan Quintela56e93d22015-05-07 19:33:31 +0200152{
153 PageCache *new_cache;
Juan Quintelac9dede22017-10-06 23:03:55 +0200154 int64_t ret = 0;
Juan Quintela56e93d22015-05-07 19:33:31 +0200155
Juan Quintela8acabf62017-10-05 22:00:31 +0200156 /* Check for truncation */
157 if (new_size != (size_t)new_size) {
158 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
159 "exceeding address space");
160 return -1;
161 }
162
Juan Quintela2a313e52017-10-06 23:00:12 +0200163 if (new_size == migrate_xbzrle_cache_size()) {
164 /* nothing to do */
Juan Quintelac9dede22017-10-06 23:03:55 +0200165 return 0;
Juan Quintela2a313e52017-10-06 23:00:12 +0200166 }
167
Juan Quintela56e93d22015-05-07 19:33:31 +0200168 XBZRLE_cache_lock();
169
170 if (XBZRLE.cache != NULL) {
Juan Quintela80f8dfd2017-10-06 22:30:45 +0200171 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
Juan Quintela56e93d22015-05-07 19:33:31 +0200172 if (!new_cache) {
Juan Quintela56e93d22015-05-07 19:33:31 +0200173 ret = -1;
174 goto out;
175 }
176
177 cache_fini(XBZRLE.cache);
178 XBZRLE.cache = new_cache;
179 }
Juan Quintela56e93d22015-05-07 19:33:31 +0200180out:
181 XBZRLE_cache_unlock();
182 return ret;
183}
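
/*
 * A minimal caller sketch (the 256MB value is illustrative only):
 *
 *     Error *err = NULL;
 *
 *     if (xbzrle_cache_resize(256 * 1024 * 1024, &err) < 0) {
 *         error_report_err(err);
 *     }
 */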
184
Peter Xu20123ee2022-10-11 17:55:46 -0400185static bool postcopy_preempt_active(void)
186{
187 return migrate_postcopy_preempt() && migration_in_postcopy();
188}
189
Chuan Zheng3ded54b2020-09-16 14:22:00 +0800190bool ramblock_is_ignored(RAMBlock *block)
Yury Kotovfbd162e2019-02-15 20:45:46 +0300191{
192 return !qemu_ram_is_migratable(block) ||
193 (migrate_ignore_shared() && qemu_ram_is_shared(block));
194}
195
Dr. David Alan Gilbert343f6322018-06-05 17:25:45 +0100196#undef RAMBLOCK_FOREACH
197
Yury Kotovfbd162e2019-02-15 20:45:46 +0300198int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
199{
200 RAMBlock *block;
201 int ret = 0;
202
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +0100203 RCU_READ_LOCK_GUARD();
204
Yury Kotovfbd162e2019-02-15 20:45:46 +0300205 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
206 ret = func(block, opaque);
207 if (ret) {
208 break;
209 }
210 }
Yury Kotovfbd162e2019-02-15 20:45:46 +0300211 return ret;
212}
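
/*
 * A minimal usage sketch (the "count_used" callback is hypothetical and
 * only illustrates the RAMBlockIterFunc contract): accumulate the used
 * length of every migratable, non-ignored block.
 *
 *     static int count_used(RAMBlock *rb, void *opaque)
 *     {
 *         *(uint64_t *)opaque += rb->used_length;
 *         return 0;   (non-zero would stop the iteration)
 *     }
 *
 *     uint64_t total = 0;
 *     foreach_not_ignored_block(count_used, &total);
 */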
213
Alexey Perevalovf9494612017-10-05 14:13:20 +0300214static void ramblock_recv_map_init(void)
215{
216 RAMBlock *rb;
217
Yury Kotovfbd162e2019-02-15 20:45:46 +0300218 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
Alexey Perevalovf9494612017-10-05 14:13:20 +0300219 assert(!rb->receivedmap);
220 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
221 }
222}
223
224int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
225{
226 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
227 rb->receivedmap);
228}
229
Dr. David Alan Gilbert1cba9f62018-03-12 17:21:08 +0000230bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
231{
232 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
233}
234
Alexey Perevalovf9494612017-10-05 14:13:20 +0300235void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
236{
237 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
238}
239
240void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
241 size_t nr)
242{
243 bitmap_set_atomic(rb->receivedmap,
244 ramblock_recv_bitmap_offset(host_addr, rb),
245 nr);
246}
247
Peter Xua335deb2018-05-02 18:47:28 +0800248#define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
249
250/*
251 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
252 *
253 * Returns >0 if success with sent bytes, or <0 if error.
254 */
255int64_t ramblock_recv_bitmap_send(QEMUFile *file,
256 const char *block_name)
257{
258 RAMBlock *block = qemu_ram_block_by_name(block_name);
259 unsigned long *le_bitmap, nbits;
260 uint64_t size;
261
262 if (!block) {
263 error_report("%s: invalid block name: %s", __func__, block_name);
264 return -1;
265 }
266
David Hildenbrand898ba902021-04-29 13:27:06 +0200267 nbits = block->postcopy_length >> TARGET_PAGE_BITS;
Peter Xua335deb2018-05-02 18:47:28 +0800268
269 /*
270 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
271 * machines we may need 4 more bytes for padding (see below
272 * comment). So extend it a bit beforehand.
273 */
274 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
275
276 /*
277 * Always use little endian when sending the bitmap. This is
278 * required when source and destination VMs are not using the
zhaolichang3a4452d2020-09-17 15:50:21 +0800279 * same endianness. (Note: big endian won't work.)
Peter Xua335deb2018-05-02 18:47:28 +0800280 */
281 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
282
283 /* Size of the bitmap, in bytes */
Peter Xua725ef92018-07-10 17:18:55 +0800284 size = DIV_ROUND_UP(nbits, 8);
Peter Xua335deb2018-05-02 18:47:28 +0800285
286 /*
287 * size is always aligned to 8 bytes for 64bit machines, but it
288 * may not be true for 32bit machines. We need this padding to
289 * make sure the migration can survive even between 32bit and
290 * 64bit machines.
291 */
292 size = ROUND_UP(size, 8);
293
294 qemu_put_be64(file, size);
295 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
296 /*
297 * Mark as an end, in case the middle part is screwed up due to
zhaolichang3a4452d2020-09-17 15:50:21 +0800298 * some "mysterious" reason.
Peter Xua335deb2018-05-02 18:47:28 +0800299 */
300 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
301 qemu_fflush(file);
302
Peter Xubf269902018-05-25 09:50:42 +0800303 g_free(le_bitmap);
Peter Xua335deb2018-05-02 18:47:28 +0800304
305 if (qemu_file_get_error(file)) {
306 return qemu_file_get_error(file);
307 }
308
309 return size + sizeof(size);
310}
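
/*
 * For reference, the stream produced by ramblock_recv_bitmap_send() is:
 *
 *     be64   size                        (bitmap bytes, rounded up to 8)
 *     u8     le_bitmap[size]             (receivedmap in little endian)
 *     be64   RAMBLOCK_RECV_BITMAP_ENDING
 *
 * and the function returns size + 8 on success.
 */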
311
Juan Quintelaec481c62017-03-20 22:12:40 +0100312/*
313 * An outstanding page request, on the source, having been received
314 * and queued
315 */
316struct RAMSrcPageRequest {
317 RAMBlock *rb;
318 hwaddr offset;
319 hwaddr len;
320
321 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
322};
323
Juan Quintela6f37bb82017-03-13 19:26:29 +0100324/* State of RAM for migration */
325struct RAMState {
Peter Xuf1668762022-10-11 17:55:55 -0400326 /*
327 * PageSearchStatus structures for the channels when send pages.
328 * Protected by the bitmap_mutex.
329 */
330 PageSearchStatus pss[RAM_CHANNEL_MAX];
Andrey Gruzdev278e2f52021-01-29 13:14:05 +0300331 /* UFFD file descriptor, used in 'write-tracking' migration */
332 int uffdio_fd;
Juan Quintela6f37bb82017-03-13 19:26:29 +0100333 /* Last block that we have visited searching for dirty pages */
334 RAMBlock *last_seen_block;
Juan Quintela269ace22017-03-21 15:23:31 +0100335 /* Last dirty target page we have sent */
336 ram_addr_t last_page;
Juan Quintela6f37bb82017-03-13 19:26:29 +0100337 /* last ram version we have seen */
338 uint32_t last_version;
Juan Quintela8d820d62017-03-13 19:35:50 +0100339 /* How many times we have dirty too many pages */
340 int dirty_rate_high_cnt;
Juan Quintelaf664da82017-03-13 19:44:57 +0100341 /* these variables are used for bitmap sync */
342 /* last time we did a full bitmap_sync */
343 int64_t time_last_bitmap_sync;
Juan Quintelaeac74152017-03-28 14:59:01 +0200344 /* bytes transferred at start_time */
Juan Quintelac4bdf0c2017-03-28 14:59:54 +0200345 uint64_t bytes_xfer_prev;
Juan Quintelaa66cd902017-03-28 15:02:43 +0200346 /* number of dirty pages since start_time */
Juan Quintela68908ed2017-03-28 15:05:53 +0200347 uint64_t num_dirty_pages_period;
Juan Quintelab5833fd2017-03-13 19:49:19 +0100348 /* xbzrle misses since the beginning of the period */
349 uint64_t xbzrle_cache_miss_prev;
Wei Wange460a4b2020-04-30 08:59:35 +0800350 /* Amount of xbzrle pages since the beginning of the period */
351 uint64_t xbzrle_pages_prev;
352 /* Amount of xbzrle encoded bytes since the beginning of the period */
353 uint64_t xbzrle_bytes_prev;
David Hildenbrand1a373522021-02-16 11:50:39 +0100354 /* Start using XBZRLE (e.g., after the first round). */
355 bool xbzrle_enabled;
Juan Quintela05931ec2021-12-15 19:01:21 +0100356 /* Are we on the last stage of migration */
357 bool last_stage;
Xiao Guangrong76e03002018-09-06 15:01:00 +0800358 /* compression statistics since the beginning of the period */
359 /* amount of count that no free thread to compress data */
360 uint64_t compress_thread_busy_prev;
361 /* amount bytes after compression */
362 uint64_t compressed_size_prev;
363 /* amount of compressed pages */
364 uint64_t compress_pages_prev;
365
Xiao Guangrongbe8b02e2018-09-03 17:26:42 +0800366 /* total handled target pages at the beginning of period */
367 uint64_t target_page_count_prev;
368 /* total handled target pages since start */
369 uint64_t target_page_count;
Juan Quintela93604472017-06-06 19:49:03 +0200370 /* number of dirty bits in the bitmap */
Peter Xu2dfaf122017-08-02 17:41:19 +0800371 uint64_t migration_dirty_pages;
Peter Xuf1668762022-10-11 17:55:55 -0400372 /*
373 * Protects:
374 * - dirty/clear bitmap
375 * - migration_dirty_pages
376 * - pss structures
377 */
Juan Quintela108cfae2017-03-13 21:38:09 +0100378 QemuMutex bitmap_mutex;
Juan Quintela68a098f2017-03-14 13:48:42 +0100379 /* The RAMBlock used in the last src_page_requests */
380 RAMBlock *last_req_rb;
Juan Quintelaec481c62017-03-20 22:12:40 +0100381 /* Queue of outstanding page requests from the destination */
382 QemuMutex src_page_req_mutex;
Paolo Bonzinib58deb32018-12-06 11:58:10 +0100383 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
Juan Quintela6f37bb82017-03-13 19:26:29 +0100384};
385typedef struct RAMState RAMState;
386
Juan Quintela53518d92017-05-04 11:46:24 +0200387static RAMState *ram_state;
Juan Quintela6f37bb82017-03-13 19:26:29 +0100388
Wei Wangbd227062018-12-11 16:24:51 +0800389static NotifierWithReturnList precopy_notifier_list;
390
Peter Xua1fe28d2022-01-19 16:09:18 +0800391/* Whether postcopy has queued requests? */
392static bool postcopy_has_request(RAMState *rs)
393{
394 return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
395}
396
Wei Wangbd227062018-12-11 16:24:51 +0800397void precopy_infrastructure_init(void)
398{
399 notifier_with_return_list_init(&precopy_notifier_list);
400}
401
402void precopy_add_notifier(NotifierWithReturn *n)
403{
404 notifier_with_return_list_add(&precopy_notifier_list, n);
405}
406
407void precopy_remove_notifier(NotifierWithReturn *n)
408{
409 notifier_with_return_remove(n);
410}
411
412int precopy_notify(PrecopyNotifyReason reason, Error **errp)
413{
414 PrecopyNotifyData pnd;
415 pnd.reason = reason;
416 pnd.errp = errp;
417
418 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
419}
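
/*
 * A minimal sketch of a notifier user; the "my_precopy_notify" callback and
 * its registration below are hypothetical and only illustrate the
 * PrecopyNotifyData contract:
 *
 *     static int my_precopy_notify(NotifierWithReturn *n, void *data)
 *     {
 *         PrecopyNotifyData *pnd = data;
 *
 *         if (pnd->reason == PRECOPY_NOTIFY_SETUP) {
 *             ... prepare resources ...
 *         }
 *         return 0;   (return non-zero and set an error via pnd->errp
 *                      to report a problem)
 *     }
 *
 *     static NotifierWithReturn my_notifier = { .notify = my_precopy_notify };
 *     precopy_add_notifier(&my_notifier);
 */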
420
Juan Quintela9edabd42017-03-14 12:02:16 +0100421uint64_t ram_bytes_remaining(void)
422{
Dr. David Alan Gilbertbae416e2017-12-15 11:51:23 +0000423 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
424 0;
Juan Quintela9edabd42017-03-14 12:02:16 +0100425}
426
Peter Xu23b75762022-10-11 17:55:51 -0400427/*
428 * NOTE: not all stats in ram_counters are used in reality. See comments
429 * for struct MigrationAtomicStats. The ultimate result of ram migration
430 * counters will be a merged version with both ram_counters and the atomic
431 * fields in ram_atomic_counters.
432 */
Juan Quintela93604472017-06-06 19:49:03 +0200433MigrationStats ram_counters;
Peter Xu23b75762022-10-11 17:55:51 -0400434MigrationAtomicStats ram_atomic_counters;
Juan Quintela96506892017-03-14 18:41:03 +0100435
Juan Quintela26a26062022-02-22 21:02:03 +0100436void ram_transferred_add(uint64_t bytes)
David Edmondson4c2d0f62021-12-21 09:34:40 +0000437{
David Edmondsonae680662021-12-21 09:34:41 +0000438 if (runstate_is_running()) {
439 ram_counters.precopy_bytes += bytes;
440 } else if (migration_in_postcopy()) {
Peter Xu23b75762022-10-11 17:55:51 -0400441 stat64_add(&ram_atomic_counters.postcopy_bytes, bytes);
David Edmondsonae680662021-12-21 09:34:41 +0000442 } else {
443 ram_counters.downtime_bytes += bytes;
444 }
Peter Xu23b75762022-10-11 17:55:51 -0400445 stat64_add(&ram_atomic_counters.transferred, bytes);
David Edmondson4c2d0f62021-12-21 09:34:40 +0000446}
447
Leonardo Brasd59c40c2022-07-11 18:11:13 -0300448void dirty_sync_missed_zero_copy(void)
449{
450 ram_counters.dirty_sync_missed_zero_copy++;
451}
452
Xiao Guangrong76e03002018-09-06 15:01:00 +0800453CompressionStats compression_counters;
454
Juan Quintela56e93d22015-05-07 19:33:31 +0200455struct CompressParam {
Juan Quintela56e93d22015-05-07 19:33:31 +0200456 bool done;
Liang Li90e56fb2016-05-05 15:32:56 +0800457 bool quit;
Xiao Guangrong5e5fdcf2018-08-21 16:10:24 +0800458 bool zero_page;
Juan Quintela56e93d22015-05-07 19:33:31 +0200459 QEMUFile *file;
460 QemuMutex mutex;
461 QemuCond cond;
462 RAMBlock *block;
463 ram_addr_t offset;
Xiao Guangrong34ab9e92018-03-30 15:51:22 +0800464
465 /* internally used fields */
Xiao Guangrongdcaf4462018-03-30 15:51:20 +0800466 z_stream stream;
Xiao Guangrong34ab9e92018-03-30 15:51:22 +0800467 uint8_t *originbuf;
Juan Quintela56e93d22015-05-07 19:33:31 +0200468};
469typedef struct CompressParam CompressParam;
470
471struct DecompressParam {
Liang Li73a89122016-05-05 15:32:51 +0800472 bool done;
Liang Li90e56fb2016-05-05 15:32:56 +0800473 bool quit;
Juan Quintela56e93d22015-05-07 19:33:31 +0200474 QemuMutex mutex;
475 QemuCond cond;
476 void *des;
Peter Maydelld341d9f2016-01-22 15:09:21 +0000477 uint8_t *compbuf;
Juan Quintela56e93d22015-05-07 19:33:31 +0200478 int len;
Xiao Guangrong797ca152018-03-30 15:51:21 +0800479 z_stream stream;
Juan Quintela56e93d22015-05-07 19:33:31 +0200480};
481typedef struct DecompressParam DecompressParam;
482
483static CompressParam *comp_param;
484static QemuThread *compress_threads;
485/* comp_done_cond is used to wake up the migration thread when
486 * one of the compression threads has finished the compression.
487 * comp_done_lock is used together with comp_done_cond.
488 */
Liang Li0d9f9a52016-05-05 15:32:59 +0800489static QemuMutex comp_done_lock;
490static QemuCond comp_done_cond;
Juan Quintela56e93d22015-05-07 19:33:31 +0200491
Xiao Guangrong34ab9e92018-03-30 15:51:22 +0800492static QEMUFile *decomp_file;
Juan Quintela56e93d22015-05-07 19:33:31 +0200493static DecompressParam *decomp_param;
494static QemuThread *decompress_threads;
Liang Li73a89122016-05-05 15:32:51 +0800495static QemuMutex decomp_done_lock;
496static QemuCond decomp_done_cond;
Juan Quintela56e93d22015-05-07 19:33:31 +0200497
Peter Xu93589822022-10-11 17:55:57 -0400498static int ram_save_host_page_urgent(PageSearchStatus *pss);
499
Xiao Guangrong5e5fdcf2018-08-21 16:10:24 +0800500static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
Xiao Guangrong6ef37712018-08-21 16:10:23 +0800501 ram_addr_t offset, uint8_t *source_buf);
Juan Quintela56e93d22015-05-07 19:33:31 +0200502
Peter Xuebd88a42022-10-11 17:55:54 -0400503/* NOTE: page is the PFN, not the real ram_addr_t. */
504static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
505{
506 pss->block = rb;
507 pss->page = page;
508 pss->complete_round = false;
509}
510
Peter Xu93589822022-10-11 17:55:57 -0400511/*
512 * Check whether two PSSs are actively sending the same page. Return true
513 * if it is, false otherwise.
514 */
515static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
516{
517 return pss1->host_page_sending && pss2->host_page_sending &&
518 (pss1->host_page_start == pss2->host_page_start);
519}
520
Juan Quintela56e93d22015-05-07 19:33:31 +0200521static void *do_data_compress(void *opaque)
522{
523 CompressParam *param = opaque;
Liang Lia7a9a882016-05-05 15:32:57 +0800524 RAMBlock *block;
525 ram_addr_t offset;
Xiao Guangrong5e5fdcf2018-08-21 16:10:24 +0800526 bool zero_page;
Juan Quintela56e93d22015-05-07 19:33:31 +0200527
Liang Lia7a9a882016-05-05 15:32:57 +0800528 qemu_mutex_lock(&param->mutex);
Liang Li90e56fb2016-05-05 15:32:56 +0800529 while (!param->quit) {
Liang Lia7a9a882016-05-05 15:32:57 +0800530 if (param->block) {
531 block = param->block;
532 offset = param->offset;
533 param->block = NULL;
534 qemu_mutex_unlock(&param->mutex);
535
Xiao Guangrong5e5fdcf2018-08-21 16:10:24 +0800536 zero_page = do_compress_ram_page(param->file, &param->stream,
537 block, offset, param->originbuf);
Liang Lia7a9a882016-05-05 15:32:57 +0800538
Liang Li0d9f9a52016-05-05 15:32:59 +0800539 qemu_mutex_lock(&comp_done_lock);
Liang Lia7a9a882016-05-05 15:32:57 +0800540 param->done = true;
Xiao Guangrong5e5fdcf2018-08-21 16:10:24 +0800541 param->zero_page = zero_page;
Liang Li0d9f9a52016-05-05 15:32:59 +0800542 qemu_cond_signal(&comp_done_cond);
543 qemu_mutex_unlock(&comp_done_lock);
Liang Lia7a9a882016-05-05 15:32:57 +0800544
545 qemu_mutex_lock(&param->mutex);
546 } else {
Juan Quintela56e93d22015-05-07 19:33:31 +0200547 qemu_cond_wait(&param->cond, &param->mutex);
548 }
Juan Quintela56e93d22015-05-07 19:33:31 +0200549 }
Liang Lia7a9a882016-05-05 15:32:57 +0800550 qemu_mutex_unlock(&param->mutex);
Juan Quintela56e93d22015-05-07 19:33:31 +0200551
552 return NULL;
553}
554
Juan Quintelaf0afa332017-06-28 11:52:28 +0200555static void compress_threads_save_cleanup(void)
Juan Quintela56e93d22015-05-07 19:33:31 +0200556{
557 int i, thread_count;
558
Fei Li05306932018-09-25 17:14:40 +0800559 if (!migrate_use_compression() || !comp_param) {
Juan Quintela56e93d22015-05-07 19:33:31 +0200560 return;
561 }
Fei Li05306932018-09-25 17:14:40 +0800562
Juan Quintela56e93d22015-05-07 19:33:31 +0200563 thread_count = migrate_compress_threads();
564 for (i = 0; i < thread_count; i++) {
Xiao Guangrongdcaf4462018-03-30 15:51:20 +0800565 /*
566 * we use it as an indicator of whether the thread is
567 * properly initialized or not
568 */
569 if (!comp_param[i].file) {
570 break;
571 }
Fei Li05306932018-09-25 17:14:40 +0800572
573 qemu_mutex_lock(&comp_param[i].mutex);
574 comp_param[i].quit = true;
575 qemu_cond_signal(&comp_param[i].cond);
576 qemu_mutex_unlock(&comp_param[i].mutex);
577
Juan Quintela56e93d22015-05-07 19:33:31 +0200578 qemu_thread_join(compress_threads + i);
Juan Quintela56e93d22015-05-07 19:33:31 +0200579 qemu_mutex_destroy(&comp_param[i].mutex);
580 qemu_cond_destroy(&comp_param[i].cond);
Xiao Guangrongdcaf4462018-03-30 15:51:20 +0800581 deflateEnd(&comp_param[i].stream);
Xiao Guangrong34ab9e92018-03-30 15:51:22 +0800582 g_free(comp_param[i].originbuf);
Xiao Guangrongdcaf4462018-03-30 15:51:20 +0800583 qemu_fclose(comp_param[i].file);
584 comp_param[i].file = NULL;
Juan Quintela56e93d22015-05-07 19:33:31 +0200585 }
Liang Li0d9f9a52016-05-05 15:32:59 +0800586 qemu_mutex_destroy(&comp_done_lock);
587 qemu_cond_destroy(&comp_done_cond);
Juan Quintela56e93d22015-05-07 19:33:31 +0200588 g_free(compress_threads);
589 g_free(comp_param);
Juan Quintela56e93d22015-05-07 19:33:31 +0200590 compress_threads = NULL;
591 comp_param = NULL;
Juan Quintela56e93d22015-05-07 19:33:31 +0200592}
593
Xiao Guangrongdcaf4462018-03-30 15:51:20 +0800594static int compress_threads_save_setup(void)
Juan Quintela56e93d22015-05-07 19:33:31 +0200595{
596 int i, thread_count;
597
598 if (!migrate_use_compression()) {
Xiao Guangrongdcaf4462018-03-30 15:51:20 +0800599 return 0;
Juan Quintela56e93d22015-05-07 19:33:31 +0200600 }
Juan Quintela56e93d22015-05-07 19:33:31 +0200601 thread_count = migrate_compress_threads();
602 compress_threads = g_new0(QemuThread, thread_count);
603 comp_param = g_new0(CompressParam, thread_count);
Liang Li0d9f9a52016-05-05 15:32:59 +0800604 qemu_cond_init(&comp_done_cond);
605 qemu_mutex_init(&comp_done_lock);
Juan Quintela56e93d22015-05-07 19:33:31 +0200606 for (i = 0; i < thread_count; i++) {
Xiao Guangrong34ab9e92018-03-30 15:51:22 +0800607 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
608 if (!comp_param[i].originbuf) {
609 goto exit;
610 }
611
Xiao Guangrongdcaf4462018-03-30 15:51:20 +0800612 if (deflateInit(&comp_param[i].stream,
613 migrate_compress_level()) != Z_OK) {
Xiao Guangrong34ab9e92018-03-30 15:51:22 +0800614 g_free(comp_param[i].originbuf);
Xiao Guangrongdcaf4462018-03-30 15:51:20 +0800615 goto exit;
616 }
617
Cao jine110aa92016-07-29 15:10:31 +0800618 /* comp_param[i].file is just used as a dummy buffer to save data,
619 * so back it with a null channel that discards all output.
Juan Quintela56e93d22015-05-07 19:33:31 +0200620 */
Daniel P. Berrangé77ef2dc2022-06-20 12:02:05 +0100621 comp_param[i].file = qemu_file_new_output(
Daniel P. Berrangéc0e08252022-06-20 12:01:46 +0100622 QIO_CHANNEL(qio_channel_null_new()));
Juan Quintela56e93d22015-05-07 19:33:31 +0200623 comp_param[i].done = true;
Liang Li90e56fb2016-05-05 15:32:56 +0800624 comp_param[i].quit = false;
Juan Quintela56e93d22015-05-07 19:33:31 +0200625 qemu_mutex_init(&comp_param[i].mutex);
626 qemu_cond_init(&comp_param[i].cond);
627 qemu_thread_create(compress_threads + i, "compress",
628 do_data_compress, comp_param + i,
629 QEMU_THREAD_JOINABLE);
630 }
Xiao Guangrongdcaf4462018-03-30 15:51:20 +0800631 return 0;
632
633exit:
634 compress_threads_save_cleanup();
635 return -1;
Juan Quintela56e93d22015-05-07 19:33:31 +0200636}
637
638/**
Juan Quintela3d0684b2017-03-23 15:06:39 +0100639 * save_page_header: write page header to wire
Juan Quintela56e93d22015-05-07 19:33:31 +0200640 *
641 * If the block differs from the last block sent, it also writes the block identification
642 *
Juan Quintela3d0684b2017-03-23 15:06:39 +0100643 * Returns the number of bytes written
Juan Quintela56e93d22015-05-07 19:33:31 +0200644 *
Peter Xuec6f3ab2022-10-11 17:55:56 -0400645 * @pss: current PSS channel status
Juan Quintela56e93d22015-05-07 19:33:31 +0200646 * @block: block that contains the page we want to send
647 * @offset: offset inside the block for the page
648 * in the lower bits, it contains flags
649 */
Peter Xuec6f3ab2022-10-11 17:55:56 -0400650static size_t save_page_header(PageSearchStatus *pss, RAMBlock *block,
Juan Quintela2bf3aa82017-05-10 13:28:13 +0200651 ram_addr_t offset)
Juan Quintela56e93d22015-05-07 19:33:31 +0200652{
Liang Li9f5f3802015-07-13 17:34:10 +0800653 size_t size, len;
Peter Xuec6f3ab2022-10-11 17:55:56 -0400654 bool same_block = (block == pss->last_sent_block);
655 QEMUFile *f = pss->pss_channel;
Juan Quintela56e93d22015-05-07 19:33:31 +0200656
Peter Xu10661f12022-10-11 17:55:48 -0400657 if (same_block) {
Juan Quintela24795692017-03-21 11:45:01 +0100658 offset |= RAM_SAVE_FLAG_CONTINUE;
659 }
Juan Quintela2bf3aa82017-05-10 13:28:13 +0200660 qemu_put_be64(f, offset);
Juan Quintela56e93d22015-05-07 19:33:31 +0200661 size = 8;
662
Peter Xu10661f12022-10-11 17:55:48 -0400663 if (!same_block) {
Liang Li9f5f3802015-07-13 17:34:10 +0800664 len = strlen(block->idstr);
Juan Quintela2bf3aa82017-05-10 13:28:13 +0200665 qemu_put_byte(f, len);
666 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
Liang Li9f5f3802015-07-13 17:34:10 +0800667 size += 1 + len;
Peter Xuec6f3ab2022-10-11 17:55:56 -0400668 pss->last_sent_block = block;
Juan Quintela56e93d22015-05-07 19:33:31 +0200669 }
670 return size;
671}
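
/*
 * For reference, the header written by save_page_header() is:
 *
 *     be64   offset | flags                 (always)
 *     u8     len = strlen(block->idstr)     (only when the block changes)
 *     u8     idstr[len]                     (only when the block changes)
 *
 * i.e. the block name is only re-sent when the page belongs to a different
 * block than the last page sent on this channel (no RAM_SAVE_FLAG_CONTINUE).
 */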
672
Juan Quintela3d0684b2017-03-23 15:06:39 +0100673/**
Olaf Hering179a8082021-07-08 18:21:59 +0200674 * mig_throttle_guest_down: throttle down the guest
Juan Quintela3d0684b2017-03-23 15:06:39 +0100675 *
676 * Reduce the amount of guest CPU execution to hopefully slow down memory
677 * writes. If guest dirty memory rate is reduced below the rate at
678 * which we can transfer pages to the destination then we should be
679 * able to complete migration. Some workloads dirty memory way too
680 * fast and will not effectively converge, even with auto-converge.
Jason J. Herne070afca2015-09-08 13:12:35 -0400681 */
Keqian Zhucbbf8182020-04-13 18:15:08 +0800682static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
683 uint64_t bytes_dirty_threshold)
Jason J. Herne070afca2015-09-08 13:12:35 -0400684{
685 MigrationState *s = migrate_get_current();
Daniel P. Berrange2594f562016-04-27 11:05:14 +0100686 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
Keqian Zhucbbf8182020-04-13 18:15:08 +0800687 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
688 bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
Li Qiang4cbc9c72018-08-01 06:00:20 -0700689 int pct_max = s->parameters.max_cpu_throttle;
Jason J. Herne070afca2015-09-08 13:12:35 -0400690
Keqian Zhucbbf8182020-04-13 18:15:08 +0800691 uint64_t throttle_now = cpu_throttle_get_percentage();
692 uint64_t cpu_now, cpu_ideal, throttle_inc;
693
Jason J. Herne070afca2015-09-08 13:12:35 -0400694 /* We have not started throttling yet. Let's start it. */
695 if (!cpu_throttle_active()) {
696 cpu_throttle_set(pct_initial);
697 } else {
698 /* Throttling already on, just increase the rate */
Keqian Zhucbbf8182020-04-13 18:15:08 +0800699 if (!pct_tailslow) {
700 throttle_inc = pct_increment;
701 } else {
702 /* Compute the ideal CPU percentage used by Guest, which may
703 * make the dirty rate match the dirty rate threshold. */
704 cpu_now = 100 - throttle_now;
705 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
706 bytes_dirty_period);
707 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
708 }
709 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
Jason J. Herne070afca2015-09-08 13:12:35 -0400710 }
711}
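
/*
 * Worked example for the tailslow path above (numbers are illustrative
 * only): with throttle_now = 60 the guest keeps cpu_now = 40. If the
 * period dirtied twice what could be transferred, then
 * bytes_dirty_threshold / bytes_dirty_period = 0.5, cpu_ideal = 40 * 0.5
 * = 20 and throttle_inc = MIN(40 - 20, pct_increment), so the throttle is
 * raised towards 80, still capped by pct_increment and pct_max.
 */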
712
Rao, Lei91fe9a82021-11-09 11:04:54 +0800713void mig_throttle_counter_reset(void)
714{
715 RAMState *rs = ram_state;
716
717 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
718 rs->num_dirty_pages_period = 0;
Peter Xu23b75762022-10-11 17:55:51 -0400719 rs->bytes_xfer_prev = stat64_get(&ram_atomic_counters.transferred);
Rao, Lei91fe9a82021-11-09 11:04:54 +0800720}
721
Juan Quintela3d0684b2017-03-23 15:06:39 +0100722/**
723 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
724 *
Juan Quintela6f37bb82017-03-13 19:26:29 +0100725 * @rs: current RAM state
Juan Quintela3d0684b2017-03-23 15:06:39 +0100726 * @current_addr: address for the zero page
727 *
728 * Update the xbzrle cache to reflect a page that's been sent as all 0.
Juan Quintela56e93d22015-05-07 19:33:31 +0200729 * The important thing is that a stale (not-yet-0'd) page be replaced
730 * by the new data.
731 * As a bonus, if the page wasn't in the cache it gets added so that
Juan Quintela3d0684b2017-03-23 15:06:39 +0100732 * when a small write is made into the 0'd page it gets XBZRLE sent.
Juan Quintela56e93d22015-05-07 19:33:31 +0200733 */
Juan Quintela6f37bb82017-03-13 19:26:29 +0100734static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
Juan Quintela56e93d22015-05-07 19:33:31 +0200735{
Juan Quintela56e93d22015-05-07 19:33:31 +0200736 /* We don't care if this fails to allocate a new cache page
737 * as long as it updated an old one */
Juan Quintelac00e0922017-05-09 16:22:01 +0200738 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
Juan Quintela93604472017-06-06 19:49:03 +0200739 ram_counters.dirty_sync_count);
Juan Quintela56e93d22015-05-07 19:33:31 +0200740}
741
742#define ENCODING_FLAG_XBZRLE 0x1
743
744/**
745 * save_xbzrle_page: compress and send current page
746 *
747 * Returns: 1 means that we wrote the page
748 * 0 means that page is identical to the one already sent
749 * -1 means that xbzrle would be longer than normal
750 *
Juan Quintela5a987732017-03-13 19:39:02 +0100751 * @rs: current RAM state
Peter Xuec6f3ab2022-10-11 17:55:56 -0400752 * @pss: current PSS channel
Juan Quintela3d0684b2017-03-23 15:06:39 +0100753 * @current_data: pointer to the address of the page contents
754 * @current_addr: addr of the page
Juan Quintela56e93d22015-05-07 19:33:31 +0200755 * @block: block that contains the page we want to send
756 * @offset: offset inside the block for the page
Juan Quintela56e93d22015-05-07 19:33:31 +0200757 */
Peter Xuec6f3ab2022-10-11 17:55:56 -0400758static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
Peter Xu61717ea2022-10-11 17:55:53 -0400759 uint8_t **current_data, ram_addr_t current_addr,
760 RAMBlock *block, ram_addr_t offset)
Juan Quintela56e93d22015-05-07 19:33:31 +0200761{
762 int encoded_len = 0, bytes_xbzrle;
763 uint8_t *prev_cached_page;
Peter Xuec6f3ab2022-10-11 17:55:56 -0400764 QEMUFile *file = pss->pss_channel;
Juan Quintela56e93d22015-05-07 19:33:31 +0200765
Juan Quintela93604472017-06-06 19:49:03 +0200766 if (!cache_is_cached(XBZRLE.cache, current_addr,
767 ram_counters.dirty_sync_count)) {
768 xbzrle_counters.cache_miss++;
Juan Quintela05931ec2021-12-15 19:01:21 +0100769 if (!rs->last_stage) {
Juan Quintela56e93d22015-05-07 19:33:31 +0200770 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
Juan Quintela93604472017-06-06 19:49:03 +0200771 ram_counters.dirty_sync_count) == -1) {
Juan Quintela56e93d22015-05-07 19:33:31 +0200772 return -1;
773 } else {
774 /* update *current_data when the page has been
775 inserted into cache */
776 *current_data = get_cached_data(XBZRLE.cache, current_addr);
777 }
778 }
779 return -1;
780 }
781
Wei Wange460a4b2020-04-30 08:59:35 +0800782 /*
783 * Reaching here means the page has hit the xbzrle cache, no matter what
784 * encoding result it is (normal encoding, overflow or skipping the page),
zhaolichang3a4452d2020-09-17 15:50:21 +0800785 * count the page as encoded. This is used to calculate the encoding rate.
Wei Wange460a4b2020-04-30 08:59:35 +0800786 *
787 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
788 * 2nd page turns out to be skipped (i.e. no new bytes written to the
789 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
790 * skipped page included. In this way, the encoding rate can tell if the
791 * guest page is good for xbzrle encoding.
792 */
793 xbzrle_counters.pages++;
Juan Quintela56e93d22015-05-07 19:33:31 +0200794 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
795
796 /* save current buffer into memory */
797 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
798
799 /* XBZRLE encoding (if there is no overflow) */
800 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
801 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
802 TARGET_PAGE_SIZE);
Wei Yangca353802019-06-10 08:41:59 +0800803
804 /*
805 * Update the cache contents, so that it corresponds to the data
806 * sent, in all cases except where we skip the page.
807 */
Juan Quintela05931ec2021-12-15 19:01:21 +0100808 if (!rs->last_stage && encoded_len != 0) {
Wei Yangca353802019-06-10 08:41:59 +0800809 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
810 /*
811 * In the case where we couldn't compress, ensure that the caller
812 * sends the data from the cache, since the guest might have
813 * changed the RAM since we copied it.
814 */
815 *current_data = prev_cached_page;
816 }
817
Juan Quintela56e93d22015-05-07 19:33:31 +0200818 if (encoded_len == 0) {
Juan Quintela55c44462017-01-23 22:32:05 +0100819 trace_save_xbzrle_page_skipping();
Juan Quintela56e93d22015-05-07 19:33:31 +0200820 return 0;
821 } else if (encoded_len == -1) {
Juan Quintela55c44462017-01-23 22:32:05 +0100822 trace_save_xbzrle_page_overflow();
Juan Quintela93604472017-06-06 19:49:03 +0200823 xbzrle_counters.overflow++;
Wei Wange460a4b2020-04-30 08:59:35 +0800824 xbzrle_counters.bytes += TARGET_PAGE_SIZE;
Juan Quintela56e93d22015-05-07 19:33:31 +0200825 return -1;
826 }
827
Juan Quintela56e93d22015-05-07 19:33:31 +0200828 /* Send XBZRLE based compressed page */
Peter Xuec6f3ab2022-10-11 17:55:56 -0400829 bytes_xbzrle = save_page_header(pss, block,
Juan Quintela204b88b2017-03-15 09:16:57 +0100830 offset | RAM_SAVE_FLAG_XBZRLE);
Peter Xu61717ea2022-10-11 17:55:53 -0400831 qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
832 qemu_put_be16(file, encoded_len);
833 qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
Juan Quintela56e93d22015-05-07 19:33:31 +0200834 bytes_xbzrle += encoded_len + 1 + 2;
Wei Wange460a4b2020-04-30 08:59:35 +0800835 /*
836 * Like compressed_size (please see update_compress_thread_counts),
837 * the xbzrle encoded bytes don't count the 8 byte header with
838 * RAM_SAVE_FLAG_CONTINUE.
839 */
840 xbzrle_counters.bytes += bytes_xbzrle - 8;
David Edmondson4c2d0f62021-12-21 09:34:40 +0000841 ram_transferred_add(bytes_xbzrle);
Juan Quintela56e93d22015-05-07 19:33:31 +0200842
843 return 1;
844}
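
/*
 * For reference, an XBZRLE page on the wire (as produced above) is:
 *
 *     save_page_header()  with RAM_SAVE_FLAG_XBZRLE set
 *     u8     ENCODING_FLAG_XBZRLE
 *     be16   encoded_len
 *     u8     encoded_buf[encoded_len]
 */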
845
Juan Quintela3d0684b2017-03-23 15:06:39 +0100846/**
Peter Xud9e474e2022-10-11 17:55:52 -0400847 * pss_find_next_dirty: find the next dirty page of current ramblock
Dr. David Alan Gilbertf3f491f2015-11-05 18:11:01 +0000848 *
Peter Xud9e474e2022-10-11 17:55:52 -0400849 * This function updates pss->page to point to the next dirty page index
850 * within the ramblock to migrate, or the end of ramblock when nothing
851 * found. Note that when pss->host_page_sending==true it means we're
852 * during sending a host page, so we won't look for dirty page that is
853 * outside the host page boundary.
Juan Quintela3d0684b2017-03-23 15:06:39 +0100854 *
Peter Xud9e474e2022-10-11 17:55:52 -0400855 * @pss: the current page search status
Dr. David Alan Gilbertf3f491f2015-11-05 18:11:01 +0000856 */
Peter Xud9e474e2022-10-11 17:55:52 -0400857static void pss_find_next_dirty(PageSearchStatus *pss)
Juan Quintela56e93d22015-05-07 19:33:31 +0200858{
Peter Xud9e474e2022-10-11 17:55:52 -0400859 RAMBlock *rb = pss->block;
Juan Quintela6b6712e2017-03-22 15:18:04 +0100860 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
861 unsigned long *bitmap = rb->bmap;
Juan Quintela56e93d22015-05-07 19:33:31 +0200862
Yury Kotovfbd162e2019-02-15 20:45:46 +0300863 if (ramblock_is_ignored(rb)) {
Peter Xud9e474e2022-10-11 17:55:52 -0400864 /* Points directly to the end, so we know no dirty page */
865 pss->page = size;
866 return;
CĂ©dric Le Goaterb895de52018-05-14 08:57:00 +0200867 }
868
Peter Xud9e474e2022-10-11 17:55:52 -0400869 /*
870 * If we are in the middle of sending a host page, only look for dirty
871 * pages within the current host page being sent.
872 */
873 if (pss->host_page_sending) {
874 assert(pss->host_page_end);
875 size = MIN(size, pss->host_page_end);
876 }
877
878 pss->page = find_next_bit(bitmap, size, pss->page);
Juan Quintela56e93d22015-05-07 19:33:31 +0200879}
880
David Hildenbrand1230a252021-09-04 18:09:07 +0200881static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
Wei Wang3143577d2021-07-22 04:30:55 -0400882 unsigned long page)
883{
884 uint8_t shift;
885 hwaddr size, start;
886
887 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
888 return;
889 }
890
891 shift = rb->clear_bmap_shift;
892 /*
893 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
894 * can make things easier sometimes since then start address
895 * of the small chunk will always be 64 pages aligned so the
896 * bitmap will always be aligned to unsigned long. We should
897 * even be able to remove this restriction but I'm simply
898 * keeping it.
899 */
900 assert(shift >= 6);
901
902 size = 1ULL << (TARGET_PAGE_BITS + shift);
David Hildenbrand76482972021-10-11 19:53:44 +0200903 start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
Wei Wang3143577d2021-07-22 04:30:55 -0400904 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
905 memory_region_clear_dirty_bitmap(rb->mr, start, size);
906}
907
908static void
David Hildenbrand1230a252021-09-04 18:09:07 +0200909migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
Wei Wang3143577d2021-07-22 04:30:55 -0400910 unsigned long start,
911 unsigned long npages)
912{
913 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
914 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
915 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
916
917 /*
918 * Clear pages from start to start + npages - 1, so the end boundary is
919 * exclusive.
920 */
921 for (i = chunk_start; i < chunk_end; i += chunk_pages) {
David Hildenbrand1230a252021-09-04 18:09:07 +0200922 migration_clear_memory_region_dirty_bitmap(rb, i);
Wei Wang3143577d2021-07-22 04:30:55 -0400923 }
924}
925
Rao, Leia6a83ce2021-11-09 11:04:55 +0800926/*
927 * colo_bitmap_find_dirty: find contiguous dirty pages from start
928 *
929 * Returns the page offset within the memory region of the start of the
930 * contiguous dirty page run
931 *
932 * @rs: current RAM state
933 * @rb: RAMBlock where to search for dirty pages
934 * @start: page where we start the search
935 * @num: the number of contiguous dirty pages
936 */
937static inline
938unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
939 unsigned long start, unsigned long *num)
940{
941 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
942 unsigned long *bitmap = rb->bmap;
943 unsigned long first, next;
944
945 *num = 0;
946
947 if (ramblock_is_ignored(rb)) {
948 return size;
949 }
950
951 first = find_next_bit(bitmap, size, start);
952 if (first >= size) {
953 return first;
954 }
955 next = find_next_zero_bit(bitmap, size, first + 1);
956 assert(next >= first);
957 *num = next - first;
958 return first;
959}
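
/*
 * A minimal usage sketch (loop variables are illustrative only): walk every
 * contiguous dirty range of a block.
 *
 *     unsigned long num, offset = 0;
 *     unsigned long pages = rb->used_length >> TARGET_PAGE_BITS;
 *
 *     while ((offset = colo_bitmap_find_dirty(rs, rb, offset, &num)) < pages) {
 *         ... process pages [offset, offset + num) ...
 *         offset += num;
 *     }
 */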
960
Juan Quintela06b10682017-03-21 15:18:05 +0100961static inline bool migration_bitmap_clear_dirty(RAMState *rs,
Juan Quintelaf20e2862017-03-21 16:19:05 +0100962 RAMBlock *rb,
963 unsigned long page)
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +0000964{
965 bool ret;
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +0000966
Peter Xu002cad62019-06-03 14:50:56 +0800967 /*
968 * Clear dirty bitmap if needed. This _must_ be called before we
969 * send any of the page in the chunk because we need to make sure
970 * we can capture further page content changes when we sync dirty
971 * log the next time. So as long as we are going to send any of
972 * the page in the chunk we clear the remote dirty bitmap for all.
973 * Clearing it earlier won't be a problem, but too late will.
974 */
David Hildenbrand1230a252021-09-04 18:09:07 +0200975 migration_clear_memory_region_dirty_bitmap(rb, page);
Peter Xu002cad62019-06-03 14:50:56 +0800976
Juan Quintela6b6712e2017-03-22 15:18:04 +0100977 ret = test_and_clear_bit(page, rb->bmap);
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +0000978 if (ret) {
Juan Quintela0d8ec882017-03-13 21:21:41 +0100979 rs->migration_dirty_pages--;
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +0000980 }
Wei Wang386a9072018-12-11 16:24:49 +0800981
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +0000982 return ret;
983}
984
David Hildenbrandbe39b4c2021-10-11 19:53:41 +0200985static void dirty_bitmap_clear_section(MemoryRegionSection *section,
986 void *opaque)
987{
988 const hwaddr offset = section->offset_within_region;
989 const hwaddr size = int128_get64(section->size);
990 const unsigned long start = offset >> TARGET_PAGE_BITS;
991 const unsigned long npages = size >> TARGET_PAGE_BITS;
992 RAMBlock *rb = section->mr->ram_block;
993 uint64_t *cleared_bits = opaque;
994
995 /*
996 * We don't grab ram_state->bitmap_mutex because we expect to run
997 * only when starting migration or during postcopy recovery where
998 * we don't have concurrent access.
999 */
1000 if (!migration_in_postcopy() && !migrate_background_snapshot()) {
1001 migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
1002 }
1003 *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
1004 bitmap_clear(rb->bmap, start, npages);
1005}
1006
1007/*
1008 * Exclude all dirty pages from migration that fall into a discarded range as
1009 * managed by a RamDiscardManager responsible for the mapped memory region of
1010 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
1011 *
1012 * Discarded pages ("logically unplugged") have undefined content and must
1013 * not get migrated, because even reading these pages for migration might
1014 * result in undesired behavior.
1015 *
1016 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
1017 *
1018 * Note: The result is only stable while migrating (precopy/postcopy).
1019 */
1020static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
1021{
1022 uint64_t cleared_bits = 0;
1023
1024 if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
1025 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1026 MemoryRegionSection section = {
1027 .mr = rb->mr,
1028 .offset_within_region = 0,
1029 .size = int128_make64(qemu_ram_get_used_length(rb)),
1030 };
1031
1032 ram_discard_manager_replay_discarded(rdm, &section,
1033 dirty_bitmap_clear_section,
1034 &cleared_bits);
1035 }
1036 return cleared_bits;
1037}
1038
David Hildenbrand9470c5e2021-10-11 19:53:43 +02001039/*
1040 * Check if a host-page aligned page falls into a discarded range as managed by
1041 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
1042 *
1043 * Note: The result is only stable while migrating (precopy/postcopy).
1044 */
1045bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
1046{
1047 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1048 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1049 MemoryRegionSection section = {
1050 .mr = rb->mr,
1051 .offset_within_region = start,
1052 .size = int128_make64(qemu_ram_pagesize(rb)),
1053 };
1054
1055 return !ram_discard_manager_is_populated(rdm, &section);
1056 }
1057 return false;
1058}
1059
Peter Xu267691b2019-06-03 14:50:46 +08001060/* Called with RCU critical section */
Wei Yang7a3e9572019-08-08 11:31:55 +08001061static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
Juan Quintela56e93d22015-05-07 19:33:31 +02001062{
Keqian Zhufb613582020-06-22 11:20:37 +08001063 uint64_t new_dirty_pages =
1064 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
1065
1066 rs->migration_dirty_pages += new_dirty_pages;
1067 rs->num_dirty_pages_period += new_dirty_pages;
Juan Quintela56e93d22015-05-07 19:33:31 +02001068}
1069
Juan Quintela3d0684b2017-03-23 15:06:39 +01001070/**
1071 * ram_pagesize_summary: calculate all the pagesizes of a VM
1072 *
1073 * Returns a summary bitmap of the page sizes of all RAMBlocks
1074 *
1075 * For VMs with just normal pages this is equivalent to the host page
1076 * size. If it's got some huge pages then it's the OR of all the
1077 * different page sizes.
Dr. David Alan Gilberte8ca1db2017-02-24 18:28:29 +00001078 */
1079uint64_t ram_pagesize_summary(void)
1080{
1081 RAMBlock *block;
1082 uint64_t summary = 0;
1083
Yury Kotovfbd162e2019-02-15 20:45:46 +03001084 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
Dr. David Alan Gilberte8ca1db2017-02-24 18:28:29 +00001085 summary |= block->page_size;
1086 }
1087
1088 return summary;
1089}
1090
Xiao Guangrongaecbfe92019-01-11 14:37:30 +08001091uint64_t ram_get_total_transferred_pages(void)
1092{
Peter Xu23b75762022-10-11 17:55:51 -04001093 return stat64_get(&ram_atomic_counters.normal) +
1094 stat64_get(&ram_atomic_counters.duplicate) +
1095 compression_counters.pages + xbzrle_counters.pages;
Xiao Guangrongaecbfe92019-01-11 14:37:30 +08001096}
1097
Xiao Guangrongb7340352018-06-04 17:55:12 +08001098static void migration_update_rates(RAMState *rs, int64_t end_time)
1099{
Xiao Guangrongbe8b02e2018-09-03 17:26:42 +08001100 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
Xiao Guangrong76e03002018-09-06 15:01:00 +08001101 double compressed_size;
Xiao Guangrongb7340352018-06-04 17:55:12 +08001102
1103 /* calculate period counters */
1104 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1105 / (end_time - rs->time_last_bitmap_sync);
1106
Xiao Guangrongbe8b02e2018-09-03 17:26:42 +08001107 if (!page_count) {
Xiao Guangrongb7340352018-06-04 17:55:12 +08001108 return;
1109 }
1110
1111 if (migrate_use_xbzrle()) {
Wei Wange460a4b2020-04-30 08:59:35 +08001112 double encoded_size, unencoded_size;
1113
Xiao Guangrongb7340352018-06-04 17:55:12 +08001114 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
Xiao Guangrongbe8b02e2018-09-03 17:26:42 +08001115 rs->xbzrle_cache_miss_prev) / page_count;
Xiao Guangrongb7340352018-06-04 17:55:12 +08001116 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
Wei Wange460a4b2020-04-30 08:59:35 +08001117 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1118 TARGET_PAGE_SIZE;
1119 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
Wei Wang92271402020-06-17 13:13:05 -07001120 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
Wei Wange460a4b2020-04-30 08:59:35 +08001121 xbzrle_counters.encoding_rate = 0;
Wei Wange460a4b2020-04-30 08:59:35 +08001122 } else {
1123 xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1124 }
1125 rs->xbzrle_pages_prev = xbzrle_counters.pages;
1126 rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
Xiao Guangrongb7340352018-06-04 17:55:12 +08001127 }
Xiao Guangrong76e03002018-09-06 15:01:00 +08001128
1129 if (migrate_use_compression()) {
1130 compression_counters.busy_rate = (double)(compression_counters.busy -
1131 rs->compress_thread_busy_prev) / page_count;
1132 rs->compress_thread_busy_prev = compression_counters.busy;
1133
1134 compressed_size = compression_counters.compressed_size -
1135 rs->compressed_size_prev;
1136 if (compressed_size) {
1137 double uncompressed_size = (compression_counters.pages -
1138 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1139
1140 /* Compression-Ratio = Uncompressed-size / Compressed-size */
1141 compression_counters.compression_rate =
1142 uncompressed_size / compressed_size;
1143
1144 rs->compress_pages_prev = compression_counters.pages;
1145 rs->compressed_size_prev = compression_counters.compressed_size;
1146 }
1147 }
Xiao Guangrongb7340352018-06-04 17:55:12 +08001148}
1149
Keqian Zhudc14a472020-02-24 10:31:42 +08001150static void migration_trigger_throttle(RAMState *rs)
1151{
1152 MigrationState *s = migrate_get_current();
1153 uint64_t threshold = s->parameters.throttle_trigger_threshold;
Peter Xu23b75762022-10-11 17:55:51 -04001154 uint64_t bytes_xfer_period =
1155 stat64_get(&ram_atomic_counters.transferred) - rs->bytes_xfer_prev;
Keqian Zhudc14a472020-02-24 10:31:42 +08001156 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1157 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1158
1159 /* During block migration the auto-converge logic incorrectly detects
1160 * that ram migration makes no progress. Avoid this by disabling the
1161 * throttling logic during the bulk phase of block migration. */
1162 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1163 /* The following detection logic can be refined later. For now:
1164 Check to see if the ratio between dirtied bytes and the approx.
1165 amount of bytes that just got transferred since the last time
1166 we were in this routine reaches the threshold. If that happens
1167 twice, start or increase throttling. */
1168
1169 if ((bytes_dirty_period > bytes_dirty_threshold) &&
1170 (++rs->dirty_rate_high_cnt >= 2)) {
1171 trace_migration_throttle();
1172 rs->dirty_rate_high_cnt = 0;
Keqian Zhucbbf8182020-04-13 18:15:08 +08001173 mig_throttle_guest_down(bytes_dirty_period,
1174 bytes_dirty_threshold);
Keqian Zhudc14a472020-02-24 10:31:42 +08001175 }
1176 }
1177}
1178
Juan Quintela8d820d62017-03-13 19:35:50 +01001179static void migration_bitmap_sync(RAMState *rs)
Juan Quintela56e93d22015-05-07 19:33:31 +02001180{
1181 RAMBlock *block;
Juan Quintela56e93d22015-05-07 19:33:31 +02001182 int64_t end_time;
Juan Quintela56e93d22015-05-07 19:33:31 +02001183
Juan Quintela93604472017-06-06 19:49:03 +02001184 ram_counters.dirty_sync_count++;
Juan Quintela56e93d22015-05-07 19:33:31 +02001185
Juan Quintelaf664da82017-03-13 19:44:57 +01001186 if (!rs->time_last_bitmap_sync) {
1187 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
Juan Quintela56e93d22015-05-07 19:33:31 +02001188 }
1189
1190 trace_migration_bitmap_sync_start();
Paolo Bonzini9c1f8f42016-09-22 16:08:31 +02001191 memory_global_dirty_log_sync();
Juan Quintela56e93d22015-05-07 19:33:31 +02001192
Juan Quintela108cfae2017-03-13 21:38:09 +01001193 qemu_mutex_lock(&rs->bitmap_mutex);
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01001194 WITH_RCU_READ_LOCK_GUARD() {
1195 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1196 ramblock_sync_dirty_bitmap(rs, block);
1197 }
1198 ram_counters.remaining = ram_bytes_remaining();
Juan Quintela56e93d22015-05-07 19:33:31 +02001199 }
Juan Quintela108cfae2017-03-13 21:38:09 +01001200 qemu_mutex_unlock(&rs->bitmap_mutex);
Juan Quintela56e93d22015-05-07 19:33:31 +02001201
Paolo Bonzini9458a9a2018-02-06 18:37:39 +01001202 memory_global_after_dirty_log_sync();
Juan Quintelaa66cd902017-03-28 15:02:43 +02001203 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
Chao Fan1ffb5df2017-03-14 09:55:07 +08001204
Juan Quintela56e93d22015-05-07 19:33:31 +02001205 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1206
1207 /* more than 1 second = 1000 milliseconds */
Juan Quintelaf664da82017-03-13 19:44:57 +01001208 if (end_time > rs->time_last_bitmap_sync + 1000) {
Keqian Zhudc14a472020-02-24 10:31:42 +08001209 migration_trigger_throttle(rs);
Jason J. Herne070afca2015-09-08 13:12:35 -04001210
Xiao Guangrongb7340352018-06-04 17:55:12 +08001211 migration_update_rates(rs, end_time);
1212
Xiao Guangrongbe8b02e2018-09-03 17:26:42 +08001213 rs->target_page_count_prev = rs->target_page_count;
Felipe Franciosid693c6f2017-05-24 17:10:01 +01001214
1215 /* reset period counters */
Juan Quintelaf664da82017-03-13 19:44:57 +01001216 rs->time_last_bitmap_sync = end_time;
Juan Quintelaa66cd902017-03-28 15:02:43 +02001217 rs->num_dirty_pages_period = 0;
Peter Xu23b75762022-10-11 17:55:51 -04001218 rs->bytes_xfer_prev = stat64_get(&ram_atomic_counters.transferred);
Juan Quintela56e93d22015-05-07 19:33:31 +02001219 }
Dr. David Alan Gilbert4addcd42015-12-16 11:47:36 +00001220 if (migrate_use_events()) {
Peter Xu3ab72382018-08-15 21:37:37 +08001221 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
Dr. David Alan Gilbert4addcd42015-12-16 11:47:36 +00001222 }
Juan Quintela56e93d22015-05-07 19:33:31 +02001223}
1224
Wei Wangbd227062018-12-11 16:24:51 +08001225static void migration_bitmap_sync_precopy(RAMState *rs)
1226{
1227 Error *local_err = NULL;
1228
1229 /*
1230 * The current notifier usage is just an optimization to migration, so we
1231 * don't stop the normal migration process in the error case.
1232 */
1233 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1234 error_report_err(local_err);
Vladimir Sementsov-Ogievskiyb4a17332020-03-24 18:36:29 +03001235 local_err = NULL;
Wei Wangbd227062018-12-11 16:24:51 +08001236 }
1237
1238 migration_bitmap_sync(rs);
1239
1240 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1241 error_report_err(local_err);
1242 }
1243}
1244
Juan Quintelaa4dbaf82021-12-16 10:19:38 +01001245void ram_release_page(const char *rbname, uint64_t offset)
Juan Quintela47fe16f2021-12-16 09:58:49 +01001246{
1247 if (!migrate_release_ram() || !migration_in_postcopy()) {
1248 return;
1249 }
1250
1251 ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1252}
1253
Juan Quintela56e93d22015-05-07 19:33:31 +02001254/**
Xiao Guangrong6c97ec52018-08-21 16:10:22 +08001255 * save_zero_page_to_file: send the zero page to the file
1256 *
1257 * Returns the size of the data written to the file; 0 means the page is
1258 * not a zero page
1259 *
Peter Xuec6f3ab2022-10-11 17:55:56 -04001260 * @pss: current PSS channel
Xiao Guangrong6c97ec52018-08-21 16:10:22 +08001261 * @block: block that contains the page we want to send
1262 * @offset: offset inside the block for the page
1263 */
Peter Xuec6f3ab2022-10-11 17:55:56 -04001264static int save_zero_page_to_file(PageSearchStatus *pss,
Xiao Guangrong6c97ec52018-08-21 16:10:22 +08001265 RAMBlock *block, ram_addr_t offset)
1266{
1267 uint8_t *p = block->host + offset;
Peter Xuec6f3ab2022-10-11 17:55:56 -04001268 QEMUFile *file = pss->pss_channel;
Xiao Guangrong6c97ec52018-08-21 16:10:22 +08001269 int len = 0;
1270
Juan Quintelabad452a2021-11-18 15:56:38 +01001271 if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
Peter Xuec6f3ab2022-10-11 17:55:56 -04001272 len += save_page_header(pss, block, offset | RAM_SAVE_FLAG_ZERO);
Xiao Guangrong6c97ec52018-08-21 16:10:22 +08001273 qemu_put_byte(file, 0);
1274 len += 1;
Juan Quintela47fe16f2021-12-16 09:58:49 +01001275 ram_release_page(block->idstr, offset);
Xiao Guangrong6c97ec52018-08-21 16:10:22 +08001276 }
1277 return len;
1278}
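/*
 * On-the-wire sketch for a zero page (assuming save_page_header() writes the
 * usual be64 "offset | flags" word, plus the block idstr when
 * RAM_SAVE_FLAG_CONTINUE is not set):
 *
 *   [ be64: offset | RAM_SAVE_FLAG_ZERO ][ idstr, first page of a block only ][ 0x00 ]
 *
 * The returned len is that header size plus the single trailing zero byte;
 * for a non-zero page nothing is written and 0 is returned.
 */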
1279
1280/**
Juan Quintela3d0684b2017-03-23 15:06:39 +01001281 * save_zero_page: send the zero page to the stream
Juan Quintela56e93d22015-05-07 19:33:31 +02001282 *
Juan Quintela3d0684b2017-03-23 15:06:39 +01001283 * Returns the number of pages written.
Juan Quintela56e93d22015-05-07 19:33:31 +02001284 *
Peter Xuec6f3ab2022-10-11 17:55:56 -04001285 * @pss: current PSS channel
Juan Quintela56e93d22015-05-07 19:33:31 +02001286 * @block: block that contains the page we want to send
1287 * @offset: offset inside the block for the page
Juan Quintela56e93d22015-05-07 19:33:31 +02001288 */
Peter Xuec6f3ab2022-10-11 17:55:56 -04001289static int save_zero_page(PageSearchStatus *pss, RAMBlock *block,
Peter Xu61717ea2022-10-11 17:55:53 -04001290 ram_addr_t offset)
Juan Quintela56e93d22015-05-07 19:33:31 +02001291{
Peter Xuec6f3ab2022-10-11 17:55:56 -04001292 int len = save_zero_page_to_file(pss, block, offset);
Juan Quintela56e93d22015-05-07 19:33:31 +02001293
Xiao Guangrong6c97ec52018-08-21 16:10:22 +08001294 if (len) {
Peter Xu23b75762022-10-11 17:55:51 -04001295 stat64_add(&ram_atomic_counters.duplicate, 1);
David Edmondson4c2d0f62021-12-21 09:34:40 +00001296 ram_transferred_add(len);
Xiao Guangrong6c97ec52018-08-21 16:10:22 +08001297 return 1;
Juan Quintela56e93d22015-05-07 19:33:31 +02001298 }
Xiao Guangrong6c97ec52018-08-21 16:10:22 +08001299 return -1;
Juan Quintela56e93d22015-05-07 19:33:31 +02001300}
1301
Xiao Guangrong059ff0f2018-03-30 15:51:23 +08001302/*
1303 * @pages: the number of pages written by the control path,
1304 * < 0 - error
1305 * > 0 - number of pages written
1306 *
1307 * Return true if the page has been saved, otherwise false is returned.
1308 */
Peter Xu61717ea2022-10-11 17:55:53 -04001309static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
1310 ram_addr_t offset, int *pages)
Xiao Guangrong059ff0f2018-03-30 15:51:23 +08001311{
1312 uint64_t bytes_xmit = 0;
1313 int ret;
1314
1315 *pages = -1;
Peter Xu61717ea2022-10-11 17:55:53 -04001316 ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
1317 TARGET_PAGE_SIZE, &bytes_xmit);
Xiao Guangrong059ff0f2018-03-30 15:51:23 +08001318 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1319 return false;
1320 }
1321
1322 if (bytes_xmit) {
David Edmondson4c2d0f62021-12-21 09:34:40 +00001323 ram_transferred_add(bytes_xmit);
Xiao Guangrong059ff0f2018-03-30 15:51:23 +08001324 *pages = 1;
1325 }
1326
1327 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1328 return true;
1329 }
1330
1331 if (bytes_xmit > 0) {
Peter Xu23b75762022-10-11 17:55:51 -04001332 stat64_add(&ram_atomic_counters.normal, 1);
Xiao Guangrong059ff0f2018-03-30 15:51:23 +08001333 } else if (bytes_xmit == 0) {
Peter Xu23b75762022-10-11 17:55:51 -04001334 stat64_add(&ram_atomic_counters.duplicate, 1);
Xiao Guangrong059ff0f2018-03-30 15:51:23 +08001335 }
1336
1337 return true;
1338}
1339
Xiao Guangrong65dacaa2018-03-30 15:51:27 +08001340/*
1341 * directly send the page to the stream
1342 *
1343 * Returns the number of pages written.
1344 *
Peter Xuec6f3ab2022-10-11 17:55:56 -04001345 * @pss: current PSS channel
Xiao Guangrong65dacaa2018-03-30 15:51:27 +08001346 * @block: block that contains the page we want to send
1347 * @offset: offset inside the block for the page
1348 * @buf: the page to be sent
1349 * @async: send the page asynchronously
1350 */
Peter Xuec6f3ab2022-10-11 17:55:56 -04001351static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
Peter Xu61717ea2022-10-11 17:55:53 -04001352 ram_addr_t offset, uint8_t *buf, bool async)
Xiao Guangrong65dacaa2018-03-30 15:51:27 +08001353{
Peter Xuec6f3ab2022-10-11 17:55:56 -04001354 QEMUFile *file = pss->pss_channel;
1355
1356 ram_transferred_add(save_page_header(pss, block,
David Edmondson4c2d0f62021-12-21 09:34:40 +00001357 offset | RAM_SAVE_FLAG_PAGE));
Xiao Guangrong65dacaa2018-03-30 15:51:27 +08001358 if (async) {
Peter Xu61717ea2022-10-11 17:55:53 -04001359 qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
Dr. David Alan Gilbertf912ec52022-04-06 11:25:15 +01001360 migrate_release_ram() &&
Xiao Guangrong65dacaa2018-03-30 15:51:27 +08001361 migration_in_postcopy());
1362 } else {
Peter Xu61717ea2022-10-11 17:55:53 -04001363 qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
Xiao Guangrong65dacaa2018-03-30 15:51:27 +08001364 }
David Edmondson4c2d0f62021-12-21 09:34:40 +00001365 ram_transferred_add(TARGET_PAGE_SIZE);
Peter Xu23b75762022-10-11 17:55:51 -04001366 stat64_add(&ram_atomic_counters.normal, 1);
Xiao Guangrong65dacaa2018-03-30 15:51:27 +08001367 return 1;
1368}
1369
Juan Quintela56e93d22015-05-07 19:33:31 +02001370/**
Juan Quintela3d0684b2017-03-23 15:06:39 +01001371 * ram_save_page: send the given page to the stream
Juan Quintela56e93d22015-05-07 19:33:31 +02001372 *
Juan Quintela3d0684b2017-03-23 15:06:39 +01001373 * Returns the number of pages written.
Dr. David Alan Gilbert3fd3c4b2015-12-10 16:31:46 +00001374 * < 0 - error
1375 * >=0 - Number of pages written - this might legally be 0
1376 * if xbzrle noticed the page was the same.
Juan Quintela56e93d22015-05-07 19:33:31 +02001377 *
Juan Quintela6f37bb82017-03-13 19:26:29 +01001378 * @rs: current RAM state
Juan Quintela56e93d22015-05-07 19:33:31 +02001379 * @block: block that contains the page we want to send
1380 * @offset: offset inside the block for the page
Juan Quintela56e93d22015-05-07 19:33:31 +02001381 */
Juan Quintela05931ec2021-12-15 19:01:21 +01001382static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
Juan Quintela56e93d22015-05-07 19:33:31 +02001383{
1384 int pages = -1;
Juan Quintela56e93d22015-05-07 19:33:31 +02001385 uint8_t *p;
Juan Quintela56e93d22015-05-07 19:33:31 +02001386 bool send_async = true;
zhanghailianga08f6892016-01-15 11:37:44 +08001387 RAMBlock *block = pss->block;
Alexey Romko8bba0042020-01-10 14:51:34 +01001388 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
Xiao Guangrong059ff0f2018-03-30 15:51:23 +08001389 ram_addr_t current_addr = block->offset + offset;
Juan Quintela56e93d22015-05-07 19:33:31 +02001390
Dr. David Alan Gilbert2f68e392015-08-13 11:51:30 +01001391 p = block->host + offset;
Dr. David Alan Gilbert1db9d8e2017-04-26 19:37:21 +01001392 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
Juan Quintela56e93d22015-05-07 19:33:31 +02001393
Juan Quintela56e93d22015-05-07 19:33:31 +02001394 XBZRLE_cache_lock();
David Hildenbrand1a373522021-02-16 11:50:39 +01001395 if (rs->xbzrle_enabled && !migration_in_postcopy()) {
Peter Xuec6f3ab2022-10-11 17:55:56 -04001396 pages = save_xbzrle_page(rs, pss, &p, current_addr,
Peter Xu61717ea2022-10-11 17:55:53 -04001397 block, offset);
Juan Quintela05931ec2021-12-15 19:01:21 +01001398 if (!rs->last_stage) {
Xiao Guangrong059ff0f2018-03-30 15:51:23 +08001399 /* Can't send this cached data async, since the cache page
1400 * might get updated before it gets to the wire
Juan Quintela56e93d22015-05-07 19:33:31 +02001401 */
Xiao Guangrong059ff0f2018-03-30 15:51:23 +08001402 send_async = false;
Juan Quintela56e93d22015-05-07 19:33:31 +02001403 }
1404 }
1405
1406 /* XBZRLE overflow or normal page */
1407 if (pages == -1) {
Peter Xuec6f3ab2022-10-11 17:55:56 -04001408 pages = save_normal_page(pss, block, offset, p, send_async);
Juan Quintela56e93d22015-05-07 19:33:31 +02001409 }
1410
1411 XBZRLE_cache_unlock();
1412
1413 return pages;
1414}
1415
Peter Xu61717ea2022-10-11 17:55:53 -04001416static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block,
Juan Quintelab9ee2f72016-01-15 11:40:13 +01001417 ram_addr_t offset)
1418{
Peter Xu61717ea2022-10-11 17:55:53 -04001419 if (multifd_queue_page(file, block, offset) < 0) {
Ivan Ren713f7622019-06-25 21:18:17 +08001420 return -1;
1421 }
Peter Xu23b75762022-10-11 17:55:51 -04001422 stat64_add(&ram_atomic_counters.normal, 1);
Juan Quintelab9ee2f72016-01-15 11:40:13 +01001423
1424 return 1;
1425}
1426
Xiao Guangrong5e5fdcf2018-08-21 16:10:24 +08001427static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
Xiao Guangrong6ef37712018-08-21 16:10:23 +08001428 ram_addr_t offset, uint8_t *source_buf)
Juan Quintela56e93d22015-05-07 19:33:31 +02001429{
Juan Quintela53518d92017-05-04 11:46:24 +02001430 RAMState *rs = ram_state;
Peter Xuec6f3ab2022-10-11 17:55:56 -04001431 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
Juan Quintela20d549c2021-12-21 10:28:16 +01001432 uint8_t *p = block->host + offset;
Xiao Guangrong6ef37712018-08-21 16:10:23 +08001433 int ret;
Juan Quintela56e93d22015-05-07 19:33:31 +02001434
Peter Xuec6f3ab2022-10-11 17:55:56 -04001435 if (save_zero_page_to_file(pss, block, offset)) {
Juan Quintelae7f2e192021-12-16 09:39:49 +01001436 return true;
Xiao Guangrong5e5fdcf2018-08-21 16:10:24 +08001437 }
1438
Peter Xuec6f3ab2022-10-11 17:55:56 -04001439 save_page_header(pss, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
Xiao Guangrong34ab9e92018-03-30 15:51:22 +08001440
1441 /*
1442 * copy it to an internal buffer to avoid it being modified by the VM
1443 * so that we can catch errors during compression and
1444 * decompression
1445 */
1446 memcpy(source_buf, p, TARGET_PAGE_SIZE);
Xiao Guangrong6ef37712018-08-21 16:10:23 +08001447 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1448 if (ret < 0) {
1449 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
Liang Lib3be2892016-05-05 15:32:54 +08001450 error_report("compressed data failed!");
Liang Lib3be2892016-05-05 15:32:54 +08001451 }
Juan Quintelae7f2e192021-12-16 09:39:49 +01001452 return false;
Xiao Guangrong5e5fdcf2018-08-21 16:10:24 +08001453}
1454
1455static void
1456update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1457{
David Edmondson4c2d0f62021-12-21 09:34:40 +00001458 ram_transferred_add(bytes_xmit);
Xiao Guangrong76e03002018-09-06 15:01:00 +08001459
Xiao Guangrong5e5fdcf2018-08-21 16:10:24 +08001460 if (param->zero_page) {
Peter Xu23b75762022-10-11 17:55:51 -04001461 stat64_add(&ram_atomic_counters.duplicate, 1);
Xiao Guangrong76e03002018-09-06 15:01:00 +08001462 return;
Xiao Guangrong5e5fdcf2018-08-21 16:10:24 +08001463 }
Xiao Guangrong76e03002018-09-06 15:01:00 +08001464
1465 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1466 compression_counters.compressed_size += bytes_xmit - 8;
1467 compression_counters.pages++;
Juan Quintela56e93d22015-05-07 19:33:31 +02001468}
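/*
 * Accounting example for the "- 8" above (illustrative numbers): if a
 * compressed page goes out as the 8 byte page header (the be64
 * offset | flags word sent with RAM_SAVE_FLAG_CONTINUE) followed by N bytes
 * of compressed stream, then bytes_xmit is N + 8 and only N is added to
 * compression_counters.compressed_size, so the compression-rate math in
 * migration_update_rates() sees payload bytes only.
 */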
1469
Xiao Guangrong32b05492018-09-06 15:01:01 +08001470static bool save_page_use_compression(RAMState *rs);
1471
Juan Quintelace25d332017-03-15 11:00:51 +01001472static void flush_compressed_data(RAMState *rs)
Juan Quintela56e93d22015-05-07 19:33:31 +02001473{
Peter Xueaa238a2022-10-11 17:55:49 -04001474 MigrationState *ms = migrate_get_current();
Juan Quintela56e93d22015-05-07 19:33:31 +02001475 int idx, len, thread_count;
1476
Xiao Guangrong32b05492018-09-06 15:01:01 +08001477 if (!save_page_use_compression(rs)) {
Juan Quintela56e93d22015-05-07 19:33:31 +02001478 return;
1479 }
1480 thread_count = migrate_compress_threads();
Liang Lia7a9a882016-05-05 15:32:57 +08001481
Liang Li0d9f9a52016-05-05 15:32:59 +08001482 qemu_mutex_lock(&comp_done_lock);
Juan Quintela56e93d22015-05-07 19:33:31 +02001483 for (idx = 0; idx < thread_count; idx++) {
Liang Lia7a9a882016-05-05 15:32:57 +08001484 while (!comp_param[idx].done) {
Liang Li0d9f9a52016-05-05 15:32:59 +08001485 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
Juan Quintela56e93d22015-05-07 19:33:31 +02001486 }
Liang Lia7a9a882016-05-05 15:32:57 +08001487 }
Liang Li0d9f9a52016-05-05 15:32:59 +08001488 qemu_mutex_unlock(&comp_done_lock);
Liang Lia7a9a882016-05-05 15:32:57 +08001489
1490 for (idx = 0; idx < thread_count; idx++) {
1491 qemu_mutex_lock(&comp_param[idx].mutex);
Liang Li90e56fb2016-05-05 15:32:56 +08001492 if (!comp_param[idx].quit) {
Peter Xueaa238a2022-10-11 17:55:49 -04001493 len = qemu_put_qemu_file(ms->to_dst_file, comp_param[idx].file);
Xiao Guangrong5e5fdcf2018-08-21 16:10:24 +08001494 /*
1495 * it's safe to fetch zero_page without holding comp_done_lock
1496 * as there is no further request submitted to the thread,
1497 * i.e., the thread should be waiting for a request at this point.
1498 */
1499 update_compress_thread_counts(&comp_param[idx], len);
Juan Quintela56e93d22015-05-07 19:33:31 +02001500 }
Liang Lia7a9a882016-05-05 15:32:57 +08001501 qemu_mutex_unlock(&comp_param[idx].mutex);
Juan Quintela56e93d22015-05-07 19:33:31 +02001502 }
1503}
1504
1505static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1506 ram_addr_t offset)
1507{
1508 param->block = block;
1509 param->offset = offset;
1510}
1511
Peter Xueaa238a2022-10-11 17:55:49 -04001512static int compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset)
Juan Quintela56e93d22015-05-07 19:33:31 +02001513{
1514 int idx, thread_count, bytes_xmit = -1, pages = -1;
Xiao Guangrong1d588722018-08-21 16:10:20 +08001515 bool wait = migrate_compress_wait_thread();
Peter Xueaa238a2022-10-11 17:55:49 -04001516 MigrationState *ms = migrate_get_current();
Juan Quintela56e93d22015-05-07 19:33:31 +02001517
1518 thread_count = migrate_compress_threads();
Liang Li0d9f9a52016-05-05 15:32:59 +08001519 qemu_mutex_lock(&comp_done_lock);
Xiao Guangrong1d588722018-08-21 16:10:20 +08001520retry:
1521 for (idx = 0; idx < thread_count; idx++) {
1522 if (comp_param[idx].done) {
1523 comp_param[idx].done = false;
Peter Xueaa238a2022-10-11 17:55:49 -04001524 bytes_xmit = qemu_put_qemu_file(ms->to_dst_file,
1525 comp_param[idx].file);
Xiao Guangrong1d588722018-08-21 16:10:20 +08001526 qemu_mutex_lock(&comp_param[idx].mutex);
1527 set_compress_params(&comp_param[idx], block, offset);
1528 qemu_cond_signal(&comp_param[idx].cond);
1529 qemu_mutex_unlock(&comp_param[idx].mutex);
1530 pages = 1;
Xiao Guangrong5e5fdcf2018-08-21 16:10:24 +08001531 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
Juan Quintela56e93d22015-05-07 19:33:31 +02001532 break;
Juan Quintela56e93d22015-05-07 19:33:31 +02001533 }
1534 }
Xiao Guangrong1d588722018-08-21 16:10:20 +08001535
1536 /*
1537 * wait for a free thread if the user specifies 'compress-wait-thread',
1538 * otherwise we will post the page out in the main thread as a normal page.
1539 */
1540 if (pages < 0 && wait) {
1541 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1542 goto retry;
1543 }
Liang Li0d9f9a52016-05-05 15:32:59 +08001544 qemu_mutex_unlock(&comp_done_lock);
Juan Quintela56e93d22015-05-07 19:33:31 +02001545
1546 return pages;
1547}
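/*
 * Caller-side sketch (this mirrors save_compress_page() further down; it is
 * illustration only, not new logic): when no compression thread is idle and
 * compress-wait-thread is off, the page simply falls through to the normal
 * path:
 *
 *   if (compress_page_with_multi_thread(block, offset) > 0) {
 *       return true;                  // handed to a compression thread
 *   }
 *   compression_counters.busy++;      // all threads busy, don't wait
 *   return false;                     // caller sends it as a normal page
 */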
1548
1549/**
Juan Quintela3d0684b2017-03-23 15:06:39 +01001550 * find_dirty_block: find the next dirty page and update any state
1551 * associated with the search process.
Dr. David Alan Gilbertb9e60922015-09-23 15:27:11 +01001552 *
Wei Yanga5f7b1a2019-05-11 07:37:29 +08001553 * Returns true if a page is found
Dr. David Alan Gilbertb9e60922015-09-23 15:27:11 +01001554 *
Juan Quintela6f37bb82017-03-13 19:26:29 +01001555 * @rs: current RAM state
Juan Quintela3d0684b2017-03-23 15:06:39 +01001556 * @pss: data about the state of the current dirty page scan
1557 * @again: set to false if the search has scanned the whole of RAM
Dr. David Alan Gilbertb9e60922015-09-23 15:27:11 +01001558 */
Juan Quintelaf20e2862017-03-21 16:19:05 +01001559static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
Dr. David Alan Gilbertb9e60922015-09-23 15:27:11 +01001560{
Peter Xud9e474e2022-10-11 17:55:52 -04001561 /* Update pss->page for the next dirty bit in ramblock */
1562 pss_find_next_dirty(pss);
1563
Juan Quintela6f37bb82017-03-13 19:26:29 +01001564 if (pss->complete_round && pss->block == rs->last_seen_block &&
Juan Quintelaa935e302017-03-21 15:36:51 +01001565 pss->page >= rs->last_page) {
Dr. David Alan Gilbertb9e60922015-09-23 15:27:11 +01001566 /*
1567 * We've been once around the RAM and haven't found anything.
1568 * Give up.
1569 */
1570 *again = false;
1571 return false;
1572 }
David Hildenbrand542147f2021-04-29 13:27:08 +02001573 if (!offset_in_ramblock(pss->block,
1574 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
Dr. David Alan Gilbertb9e60922015-09-23 15:27:11 +01001575 /* Didn't find anything in this RAM Block */
Juan Quintelaa935e302017-03-21 15:36:51 +01001576 pss->page = 0;
Dr. David Alan Gilbertb9e60922015-09-23 15:27:11 +01001577 pss->block = QLIST_NEXT_RCU(pss->block, next);
1578 if (!pss->block) {
Xiao Guangrong48df9d82018-09-06 15:00:59 +08001579 /*
1580 * If memory migration starts over, we will meet a dirtied page
1581 * which may still exist in the compression threads' ring, so we
1582 * should flush the compressed data to make sure the new page
1583 * is not overwritten by the old one in the destination.
1584 *
1585 * Also, if xbzrle is on, stop using data compression at this
1586 * point. In theory, xbzrle can do better than compression.
1587 */
1588 flush_compressed_data(rs);
1589
Dr. David Alan Gilbertb9e60922015-09-23 15:27:11 +01001590 /* Hit the end of the list */
1591 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1592 /* Flag that we've looped */
1593 pss->complete_round = true;
David Hildenbrand1a373522021-02-16 11:50:39 +01001594 /* After the first round, enable XBZRLE. */
1595 if (migrate_use_xbzrle()) {
1596 rs->xbzrle_enabled = true;
1597 }
Dr. David Alan Gilbertb9e60922015-09-23 15:27:11 +01001598 }
1599 /* Didn't find anything this time, but try again on the new block */
1600 *again = true;
1601 return false;
1602 } else {
1603 /* Can go around again, but... */
1604 *again = true;
1605 /* We've found something so probably don't need to */
1606 return true;
1607 }
1608}
1609
Juan Quintela3d0684b2017-03-23 15:06:39 +01001610/**
1611 * unqueue_page: gets a page off the queue
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00001612 *
Juan Quintela3d0684b2017-03-23 15:06:39 +01001613 * Helper for 'get_queued_page' - gets a page off the queue
1614 *
1615 * Returns the block of the page (or NULL if none available)
1616 *
Juan Quintelaec481c62017-03-20 22:12:40 +01001617 * @rs: current RAM state
Juan Quintela3d0684b2017-03-23 15:06:39 +01001618 * @offset: used to return the offset within the RAMBlock
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00001619 */
Juan Quintelaf20e2862017-03-21 16:19:05 +01001620static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00001621{
Peter Xua1fe28d2022-01-19 16:09:18 +08001622 struct RAMSrcPageRequest *entry;
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00001623 RAMBlock *block = NULL;
1624
Peter Xua1fe28d2022-01-19 16:09:18 +08001625 if (!postcopy_has_request(rs)) {
Xiao Guangrongae526e32018-08-21 16:10:25 +08001626 return NULL;
1627 }
1628
Daniel Brodsky6e8a3552020-04-03 21:21:08 -07001629 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00001630
Peter Xua1fe28d2022-01-19 16:09:18 +08001631 /*
1632 * This should _never_ change even after we take the lock, because no one
1633 * should be taking anything off the request list other than us.
1634 */
1635 assert(postcopy_has_request(rs));
1636
1637 entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1638 block = entry->rb;
1639 *offset = entry->offset;
1640
Thomas Huth777f53c2022-08-02 08:19:49 +02001641 if (entry->len > TARGET_PAGE_SIZE) {
1642 entry->len -= TARGET_PAGE_SIZE;
1643 entry->offset += TARGET_PAGE_SIZE;
Peter Xua1fe28d2022-01-19 16:09:18 +08001644 } else {
1645 memory_region_unref(block->mr);
1646 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1647 g_free(entry);
1648 migration_consume_urgent_request();
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00001649 }
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00001650
1651 return block;
1652}
1653
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001654#if defined(__linux__)
1655/**
1656 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1657 * is found, return RAM block pointer and page offset
1658 *
1659 * Returns pointer to the RAMBlock containing faulting page,
1660 * NULL if no write faults are pending
1661 *
1662 * @rs: current RAM state
1663 * @offset: page offset from the beginning of the block
1664 */
1665static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1666{
1667 struct uffd_msg uffd_msg;
1668 void *page_address;
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001669 RAMBlock *block;
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001670 int res;
1671
1672 if (!migrate_background_snapshot()) {
1673 return NULL;
1674 }
1675
1676 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1677 if (res <= 0) {
1678 return NULL;
1679 }
1680
1681 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001682 block = qemu_ram_block_from_host(page_address, false, offset);
1683 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1684 return block;
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001685}
1686
1687/**
1688 * ram_save_release_protection: release UFFD write protection after
1689 * a range of pages has been saved
1690 *
1691 * @rs: current RAM state
1692 * @pss: page-search-status structure
1693 * @start_page: index of the first page in the range relative to pss->block
1694 *
1695 * Returns 0 on success, negative value in case of an error
1696 */
1697static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1698 unsigned long start_page)
1699{
1700 int res = 0;
1701
1702 /* Check if page is from UFFD-managed region. */
1703 if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1704 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
Peter Xu258f5c982022-01-19 16:09:15 +08001705 uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001706
1707 /* Flush async buffers before un-protect. */
Peter Xu61717ea2022-10-11 17:55:53 -04001708 qemu_fflush(pss->pss_channel);
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001709 /* Un-protect memory range. */
1710 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1711 false, false);
1712 }
1713
1714 return res;
1715}
1716
1717/* ram_write_tracking_available: check if kernel supports required UFFD features
1718 *
1719 * Returns true if the kernel supports them, false otherwise
1720 */
1721bool ram_write_tracking_available(void)
1722{
1723 uint64_t uffd_features;
1724 int res;
1725
1726 res = uffd_query_features(&uffd_features);
1727 return (res == 0 &&
1728 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1729}
1730
1731/* ram_write_tracking_compatible: check if guest configuration is
1732 * compatible with 'write-tracking'
1733 *
1734 * Returns true if compatible, false otherwise
1735 */
1736bool ram_write_tracking_compatible(void)
1737{
1738 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1739 int uffd_fd;
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001740 RAMBlock *block;
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001741 bool ret = false;
1742
1743 /* Open UFFD file descriptor */
1744 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1745 if (uffd_fd < 0) {
1746 return false;
1747 }
1748
1749 RCU_READ_LOCK_GUARD();
1750
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001751 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001752 uint64_t uffd_ioctls;
1753
1754 /* Nothing to do with read-only and MMIO-writable regions */
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001755 if (block->mr->readonly || block->mr->rom_device) {
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001756 continue;
1757 }
1758 /* Try to register block memory via UFFD-IO to track writes */
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001759 if (uffd_register_memory(uffd_fd, block->host, block->max_length,
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001760 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1761 goto out;
1762 }
1763 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1764 goto out;
1765 }
1766 }
1767 ret = true;
1768
1769out:
1770 uffd_close_fd(uffd_fd);
1771 return ret;
1772}
1773
David Hildenbrandf7b9dcf2021-10-11 19:53:45 +02001774static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1775 ram_addr_t size)
1776{
David Hildenbrand5f19a442023-01-05 13:45:24 +01001777 const ram_addr_t end = offset + size;
1778
David Hildenbrandf7b9dcf2021-10-11 19:53:45 +02001779 /*
1780 * We read one byte of each page; this will preallocate page tables if
1781 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1782 * where no page was populated yet. This might require adaptation when
1783 * supporting other mappings, like shmem.
1784 */
David Hildenbrand5f19a442023-01-05 13:45:24 +01001785 for (; offset < end; offset += block->page_size) {
David Hildenbrandf7b9dcf2021-10-11 19:53:45 +02001786 char tmp = *((char *)block->host + offset);
1787
1788 /* Don't optimize the read out */
1789 asm volatile("" : "+r" (tmp));
1790 }
1791}
1792
David Hildenbrand6fee3a12021-10-11 19:53:46 +02001793static inline int populate_read_section(MemoryRegionSection *section,
1794 void *opaque)
1795{
1796 const hwaddr size = int128_get64(section->size);
1797 hwaddr offset = section->offset_within_region;
1798 RAMBlock *block = section->mr->ram_block;
1799
1800 populate_read_range(block, offset, size);
1801 return 0;
1802}
1803
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001804/*
David Hildenbrandf7b9dcf2021-10-11 19:53:45 +02001805 * ram_block_populate_read: preallocate page tables and populate pages in the
1806 * RAM block by reading a byte of each page.
Andrey Gruzdeveeccb992021-04-01 12:22:25 +03001807 *
1808 * Since it's solely used for userfault_fd WP feature, here we just
1809 * hardcode page size to qemu_real_host_page_size.
1810 *
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001811 * @block: RAM block to populate
Andrey Gruzdeveeccb992021-04-01 12:22:25 +03001812 */
David Hildenbrand6fee3a12021-10-11 19:53:46 +02001813static void ram_block_populate_read(RAMBlock *rb)
Andrey Gruzdeveeccb992021-04-01 12:22:25 +03001814{
David Hildenbrand6fee3a12021-10-11 19:53:46 +02001815 /*
1816 * Skip populating all pages that fall into a discarded range as managed by
1817 * a RamDiscardManager responsible for the mapped memory region of the
1818 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1819 * must not get populated automatically. We don't have to track
1820 * modifications via userfaultfd WP reliably, because these pages will
1821 * not be part of the migration stream either way -- see
1822 * ramblock_dirty_bitmap_exclude_discarded_pages().
1823 *
1824 * Note: The result is only stable while migrating (precopy/postcopy).
1825 */
1826 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1827 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1828 MemoryRegionSection section = {
1829 .mr = rb->mr,
1830 .offset_within_region = 0,
1831 .size = rb->mr->size,
1832 };
1833
1834 ram_discard_manager_replay_populated(rdm, &section,
1835 populate_read_section, NULL);
1836 } else {
1837 populate_read_range(rb, 0, rb->used_length);
1838 }
Andrey Gruzdeveeccb992021-04-01 12:22:25 +03001839}
1840
1841/*
1842 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1843 */
1844void ram_write_tracking_prepare(void)
1845{
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001846 RAMBlock *block;
Andrey Gruzdeveeccb992021-04-01 12:22:25 +03001847
1848 RCU_READ_LOCK_GUARD();
1849
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001850 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
Andrey Gruzdeveeccb992021-04-01 12:22:25 +03001851 /* Nothing to do with read-only and MMIO-writable regions */
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001852 if (block->mr->readonly || block->mr->rom_device) {
Andrey Gruzdeveeccb992021-04-01 12:22:25 +03001853 continue;
1854 }
1855
1856 /*
1857 * Populate pages of the RAM block before enabling userfault_fd
1858 * write protection.
1859 *
1860 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1861 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1862 * pages with pte_none() entries in page table.
1863 */
David Hildenbrandf7b9dcf2021-10-11 19:53:45 +02001864 ram_block_populate_read(block);
Andrey Gruzdeveeccb992021-04-01 12:22:25 +03001865 }
1866}
1867
1868/*
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001869 * ram_write_tracking_start: start UFFD-WP memory tracking
1870 *
1871 * Returns 0 for success or negative value in case of error
1872 */
1873int ram_write_tracking_start(void)
1874{
1875 int uffd_fd;
1876 RAMState *rs = ram_state;
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001877 RAMBlock *block;
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001878
1879 /* Open UFFD file descriptor */
1880 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1881 if (uffd_fd < 0) {
1882 return uffd_fd;
1883 }
1884 rs->uffdio_fd = uffd_fd;
1885
1886 RCU_READ_LOCK_GUARD();
1887
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001888 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001889 /* Nothing to do with read-only and MMIO-writable regions */
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001890 if (block->mr->readonly || block->mr->rom_device) {
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001891 continue;
1892 }
1893
1894 /* Register block memory with UFFD to track writes */
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001895 if (uffd_register_memory(rs->uffdio_fd, block->host,
1896 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001897 goto fail;
1898 }
1899 /* Apply UFFD write protection to the block memory range */
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001900 if (uffd_change_protection(rs->uffdio_fd, block->host,
1901 block->max_length, true, false)) {
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001902 goto fail;
1903 }
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001904 block->flags |= RAM_UF_WRITEPROTECT;
1905 memory_region_ref(block->mr);
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001906
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001907 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1908 block->host, block->max_length);
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001909 }
1910
1911 return 0;
1912
1913fail:
1914 error_report("ram_write_tracking_start() failed: restoring initial memory state");
1915
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001916 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1917 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001918 continue;
1919 }
1920 /*
1921 * In case some memory block failed to be write-protected
1922 * remove protection and unregister all succeeded RAM blocks
1923 */
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001924 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1925 false, false);
1926 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001927 /* Cleanup flags and remove reference */
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001928 block->flags &= ~RAM_UF_WRITEPROTECT;
1929 memory_region_unref(block->mr);
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001930 }
1931
1932 uffd_close_fd(uffd_fd);
1933 rs->uffdio_fd = -1;
1934 return -1;
1935}
1936
1937/**
1938 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1939 */
1940void ram_write_tracking_stop(void)
1941{
1942 RAMState *rs = ram_state;
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001943 RAMBlock *block;
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001944
1945 RCU_READ_LOCK_GUARD();
1946
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001947 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1948 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001949 continue;
1950 }
1951 /* Remove protection and unregister all affected RAM blocks */
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001952 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1953 false, false);
1954 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001955
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001956 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1957 block->host, block->max_length);
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001958
1959 /* Cleanup flags and remove reference */
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001960 block->flags &= ~RAM_UF_WRITEPROTECT;
1961 memory_region_unref(block->mr);
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001962 }
1963
1964 /* Finally close UFFD file descriptor */
1965 uffd_close_fd(rs->uffdio_fd);
1966 rs->uffdio_fd = -1;
1967}
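/*
 * Putting the pieces above together, a background-snapshot run is expected
 * to drive them roughly like this (a sketch of the call order, not a
 * verbatim copy of the migration-thread code):
 *
 *   if (ram_write_tracking_available() &&
 *       ram_write_tracking_compatible()) {
 *       ram_write_tracking_prepare();   // pre-fault pages before protecting
 *       ram_write_tracking_start();     // uffd register + write-protect RAM
 *       // ... save RAM; poll_fault_page() picks up write-fault events so
 *       // faulting pages are saved first, and ram_save_release_protection()
 *       // removes protection from ranges that were already sent ...
 *       ram_write_tracking_stop();      // unprotect, unregister, close uffd
 *   }
 */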
1968
1969#else
1970/* No target OS support, stubs just fail or ignore */
1971
1972static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1973{
1974 (void) rs;
1975 (void) offset;
1976
1977 return NULL;
1978}
1979
1980static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1981 unsigned long start_page)
1982{
1983 (void) rs;
1984 (void) pss;
1985 (void) start_page;
1986
1987 return 0;
1988}
1989
1990bool ram_write_tracking_available(void)
1991{
1992 return false;
1993}
1994
1995bool ram_write_tracking_compatible(void)
1996{
1997 assert(0);
1998 return false;
1999}
2000
2001int ram_write_tracking_start(void)
2002{
2003 assert(0);
2004 return -1;
2005}
2006
2007void ram_write_tracking_stop(void)
2008{
2009 assert(0);
2010}
2011#endif /* defined(__linux__) */
2012
Juan Quintela3d0684b2017-03-23 15:06:39 +01002013/**
Li Qiangff1543a2019-05-24 23:28:32 -07002014 * get_queued_page: unqueue a page from the postcopy requests
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002015 *
Juan Quintela3d0684b2017-03-23 15:06:39 +01002016 * Skips pages that are already sent (!dirty)
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002017 *
Wei Yanga5f7b1a2019-05-11 07:37:29 +08002018 * Returns true if a queued page is found
Juan Quintela3d0684b2017-03-23 15:06:39 +01002019 *
Juan Quintela6f37bb82017-03-13 19:26:29 +01002020 * @rs: current RAM state
Juan Quintela3d0684b2017-03-23 15:06:39 +01002021 * @pss: data about the state of the current dirty page scan
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002022 */
Juan Quintelaf20e2862017-03-21 16:19:05 +01002023static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002024{
2025 RAMBlock *block;
2026 ram_addr_t offset;
Thomas Huth777f53c2022-08-02 08:19:49 +02002027 bool dirty;
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002028
Thomas Huth777f53c2022-08-02 08:19:49 +02002029 do {
2030 block = unqueue_page(rs, &offset);
2031 /*
2032 * We're sending this page, and since it's postcopy nothing else
2033 * will dirty it, and we must make sure it doesn't get sent again
2034 * even if this queue request was received after the background
2035 * search already sent it.
2036 */
2037 if (block) {
2038 unsigned long page;
2039
2040 page = offset >> TARGET_PAGE_BITS;
2041 dirty = test_bit(page, block->bmap);
2042 if (!dirty) {
2043 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2044 page);
2045 } else {
2046 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2047 }
2048 }
2049
2050 } while (block && !dirty);
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002051
Peter Xub0621062022-10-11 17:55:58 -04002052 if (!block) {
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03002053 /*
2054 * Poll write faults too if background snapshot is enabled; that's
2055 * when we have vcpus got blocked by the write protected pages.
2056 */
2057 block = poll_fault_page(rs, &offset);
2058 }
2059
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002060 if (block) {
2061 /*
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002062 * We want the background search to continue from the queued page
2063 * since the guest is likely to want other pages near to the page
2064 * it just requested.
2065 */
2066 pss->block = block;
Juan Quintelaa935e302017-03-21 15:36:51 +01002067 pss->page = offset >> TARGET_PAGE_BITS;
Wei Yang422314e2019-06-05 09:08:28 +08002068
2069 /*
2070 * This unqueued page would break the "one round" check, even if
2071 * it is really rare.
2072 */
2073 pss->complete_round = false;
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002074 }
2075
2076 return !!block;
2077}
2078
Juan Quintela56e93d22015-05-07 19:33:31 +02002079/**
Juan Quintela5e58f962017-04-03 22:06:54 +02002080 * migration_page_queue_free: drop any remaining pages in the ram
2081 * request queue
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00002082 *
Juan Quintela3d0684b2017-03-23 15:06:39 +01002083 * It should be empty at the end anyway, but in error cases there may
2084 * be some left. In case any pages are left, we drop them.
2085 *
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00002086 */
Juan Quintela83c13382017-05-04 11:45:01 +02002087static void migration_page_queue_free(RAMState *rs)
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00002088{
Juan Quintelaec481c62017-03-20 22:12:40 +01002089 struct RAMSrcPageRequest *mspr, *next_mspr;
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00002090 /* This queue generally should be empty - but in the case of a failed
2091 * migration it might have some leftovers in it.
2092 */
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01002093 RCU_READ_LOCK_GUARD();
Juan Quintelaec481c62017-03-20 22:12:40 +01002094 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00002095 memory_region_unref(mspr->rb->mr);
Juan Quintelaec481c62017-03-20 22:12:40 +01002096 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00002097 g_free(mspr);
2098 }
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00002099}
2100
2101/**
Juan Quintela3d0684b2017-03-23 15:06:39 +01002102 * ram_save_queue_pages: queue the page for transmission
2103 *
2104 * A request from postcopy destination for example.
2105 *
2106 * Returns zero on success or negative on error
2107 *
Juan Quintela3d0684b2017-03-23 15:06:39 +01002108 * @rbname: Name of the RAMBLock of the request. NULL means the
2109 * same that last one.
2110 * @start: starting address from the start of the RAMBlock
2111 * @len: length (in bytes) to send
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00002112 */
Juan Quintela96506892017-03-14 18:41:03 +01002113int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00002114{
2115 RAMBlock *ramblock;
Juan Quintela53518d92017-05-04 11:46:24 +02002116 RAMState *rs = ram_state;
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00002117
Juan Quintela93604472017-06-06 19:49:03 +02002118 ram_counters.postcopy_requests++;
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01002119 RCU_READ_LOCK_GUARD();
2120
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00002121 if (!rbname) {
2122 /* Reuse last RAMBlock */
Juan Quintela68a098f2017-03-14 13:48:42 +01002123 ramblock = rs->last_req_rb;
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00002124
2125 if (!ramblock) {
2126 /*
2127 * Shouldn't happen, we can't reuse the last RAMBlock if
2128 * it's the 1st request.
2129 */
2130 error_report("ram_save_queue_pages no previous block");
Daniel Henrique Barboza03acb4e2020-01-06 15:23:31 -03002131 return -1;
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00002132 }
2133 } else {
2134 ramblock = qemu_ram_block_by_name(rbname);
2135
2136 if (!ramblock) {
2137 /* We shouldn't be asked for a non-existent RAMBlock */
2138 error_report("ram_save_queue_pages no block '%s'", rbname);
Daniel Henrique Barboza03acb4e2020-01-06 15:23:31 -03002139 return -1;
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00002140 }
Juan Quintela68a098f2017-03-14 13:48:42 +01002141 rs->last_req_rb = ramblock;
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00002142 }
2143 trace_ram_save_queue_pages(ramblock->idstr, start, len);
David Hildenbrand542147f2021-04-29 13:27:08 +02002144 if (!offset_in_ramblock(ramblock, start + len - 1)) {
Juan Quintela9458ad62015-11-10 17:42:05 +01002145 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2146 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00002147 __func__, start, len, ramblock->used_length);
Daniel Henrique Barboza03acb4e2020-01-06 15:23:31 -03002148 return -1;
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00002149 }
2150
Peter Xu93589822022-10-11 17:55:57 -04002151 /*
2152 * When with postcopy preempt, we send back the page directly in the
2153 * rp-return thread.
2154 */
2155 if (postcopy_preempt_active()) {
2156 ram_addr_t page_start = start >> TARGET_PAGE_BITS;
2157 size_t page_size = qemu_ram_pagesize(ramblock);
2158 PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY];
2159 int ret = 0;
2160
2161 qemu_mutex_lock(&rs->bitmap_mutex);
2162
2163 pss_init(pss, ramblock, page_start);
2164 /*
2165 * Always use the preempt channel, and make sure it's there. It's
2166 * safe to access without lock, because when rp-thread is running
2167 * we should be the only one who operates on the qemufile
2168 */
2169 pss->pss_channel = migrate_get_current()->postcopy_qemufile_src;
Peter Xu93589822022-10-11 17:55:57 -04002170 assert(pss->pss_channel);
2171
2172 /*
2173 * It must be either one or multiple of host page size. Just
2174 * assert; if something wrong we're mostly split brain anyway.
2175 */
2176 assert(len % page_size == 0);
2177 while (len) {
2178 if (ram_save_host_page_urgent(pss)) {
2179 error_report("%s: ram_save_host_page_urgent() failed: "
2180 "ramblock=%s, start_addr=0x"RAM_ADDR_FMT,
2181 __func__, ramblock->idstr, start);
2182 ret = -1;
2183 break;
2184 }
2185 /*
2186 * NOTE: after ram_save_host_page_urgent() succeeded, pss->page
2187 * will automatically be moved and point to the next host page
2188 * we're going to send, so no need to update here.
2189 *
2190 * Normally QEMU never sends >1 host page in requests, so
2191 * logically we don't even need that as the loop should only
2192 * run once, but just to be consistent.
2193 */
2194 len -= page_size;
2195 };
2196 qemu_mutex_unlock(&rs->bitmap_mutex);
2197
2198 return ret;
2199 }
2200
Juan Quintelaec481c62017-03-20 22:12:40 +01002201 struct RAMSrcPageRequest *new_entry =
Markus Armbrusterb21e2382022-03-15 15:41:56 +01002202 g_new0(struct RAMSrcPageRequest, 1);
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00002203 new_entry->rb = ramblock;
2204 new_entry->offset = start;
2205 new_entry->len = len;
2206
2207 memory_region_ref(ramblock->mr);
Juan Quintelaec481c62017-03-20 22:12:40 +01002208 qemu_mutex_lock(&rs->src_page_req_mutex);
2209 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
Dr. David Alan Gilberte03a34f2018-06-13 11:26:42 +01002210 migration_make_urgent_request();
Juan Quintelaec481c62017-03-20 22:12:40 +01002211 qemu_mutex_unlock(&rs->src_page_req_mutex);
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00002212
2213 return 0;
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00002214}
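/*
 * Hypothetical caller sketch (the block name, offset and length are made
 * up): when the destination faults on a page during postcopy, the
 * return-path thread ends up doing something equivalent to
 *
 *     ram_save_queue_pages("pc.ram", 0x200000, TARGET_PAGE_SIZE);
 *
 * which either sends the page immediately on the preempt channel (when
 * postcopy preemption is active) or queues it on rs->src_page_requests and
 * flags an urgent request for the migration thread.
 */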
2215
Xiao Guangrongd7400a32018-03-30 15:51:26 +08002216static bool save_page_use_compression(RAMState *rs)
2217{
2218 if (!migrate_use_compression()) {
2219 return false;
2220 }
2221
2222 /*
David Hildenbrand1a373522021-02-16 11:50:39 +01002223 * If xbzrle is enabled (e.g., after first round of migration), stop
2224 * using the data compression. In theory, xbzrle can do better than
2225 * compression.
Xiao Guangrongd7400a32018-03-30 15:51:26 +08002226 */
David Hildenbrand1a373522021-02-16 11:50:39 +01002227 if (rs->xbzrle_enabled) {
2228 return false;
Xiao Guangrongd7400a32018-03-30 15:51:26 +08002229 }
2230
David Hildenbrand1a373522021-02-16 11:50:39 +01002231 return true;
Xiao Guangrongd7400a32018-03-30 15:51:26 +08002232}
2233
Xiao Guangrong5e5fdcf2018-08-21 16:10:24 +08002234/*
2235 * try to compress the page before posting it out, return true if the page
2236 * has been properly handled by compression, otherwise needs other
2237 * paths to handle it
2238 */
Peter Xuec6f3ab2022-10-11 17:55:56 -04002239static bool save_compress_page(RAMState *rs, PageSearchStatus *pss,
2240 RAMBlock *block, ram_addr_t offset)
Xiao Guangrong5e5fdcf2018-08-21 16:10:24 +08002241{
2242 if (!save_page_use_compression(rs)) {
2243 return false;
2244 }
2245
2246 /*
2247 * When starting the process of a new block, the first page of
2248 * the block should be sent out before other pages in the same
2249 * block, and all the pages in the last block should have been sent
2250 * out. Keeping this order is important, because the 'cont' flag
2251 * is used to avoid resending the block name.
2252 *
2253 * We post the first page as a normal page because compression will
2254 * take a lot of CPU resources.
2255 */
Peter Xuec6f3ab2022-10-11 17:55:56 -04002256 if (block != pss->last_sent_block) {
Xiao Guangrong5e5fdcf2018-08-21 16:10:24 +08002257 flush_compressed_data(rs);
2258 return false;
2259 }
2260
Peter Xueaa238a2022-10-11 17:55:49 -04002261 if (compress_page_with_multi_thread(block, offset) > 0) {
Xiao Guangrong5e5fdcf2018-08-21 16:10:24 +08002262 return true;
2263 }
2264
Xiao Guangrong76e03002018-09-06 15:01:00 +08002265 compression_counters.busy++;
Xiao Guangrong5e5fdcf2018-08-21 16:10:24 +08002266 return false;
2267}
2268
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002269/**
Juan Quintela3d0684b2017-03-23 15:06:39 +01002270 * ram_save_target_page: save one target page
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002271 *
Juan Quintela3d0684b2017-03-23 15:06:39 +01002272 * Returns the number of pages written
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002273 *
Juan Quintela6f37bb82017-03-13 19:26:29 +01002274 * @rs: current RAM state
Juan Quintela3d0684b2017-03-23 15:06:39 +01002275 * @pss: data about the page we want to send
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002276 */
Juan Quintela05931ec2021-12-15 19:01:21 +01002277static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002278{
Xiao Guangronga8ec91f2018-03-30 15:51:25 +08002279 RAMBlock *block = pss->block;
Alexey Romko8bba0042020-01-10 14:51:34 +01002280 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
Xiao Guangronga8ec91f2018-03-30 15:51:25 +08002281 int res;
2282
Peter Xu61717ea2022-10-11 17:55:53 -04002283 if (control_save_page(pss, block, offset, &res)) {
Xiao Guangronga8ec91f2018-03-30 15:51:25 +08002284 return res;
2285 }
2286
Peter Xuec6f3ab2022-10-11 17:55:56 -04002287 if (save_compress_page(rs, pss, block, offset)) {
Xiao Guangrong5e5fdcf2018-08-21 16:10:24 +08002288 return 1;
Xiao Guangrongd7400a32018-03-30 15:51:26 +08002289 }
2290
Peter Xuec6f3ab2022-10-11 17:55:56 -04002291 res = save_zero_page(pss, block, offset);
Xiao Guangrongd7400a32018-03-30 15:51:26 +08002292 if (res > 0) {
2293 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2294 * page would be stale
2295 */
Peter Xuef5c3d12022-10-11 17:55:47 -04002296 if (rs->xbzrle_enabled) {
Xiao Guangrongd7400a32018-03-30 15:51:26 +08002297 XBZRLE_cache_lock();
2298 xbzrle_cache_zero_page(rs, block->offset + offset);
2299 XBZRLE_cache_unlock();
2300 }
Xiao Guangrongd7400a32018-03-30 15:51:26 +08002301 return res;
2302 }
2303
Xiao Guangrongda3f56c2018-03-30 15:51:28 +08002304 /*
Peter Xu6f39c902022-10-04 14:24:30 -04002305 * Do not use multifd in postcopy as one whole host page should be
2306 * placed. Meanwhile postcopy requires atomic update of pages, so even
2307 * if host page size == guest page size the destination guest may still
2308 * see partially copied pages at runtime, which is data corruption.
Xiao Guangrongda3f56c2018-03-30 15:51:28 +08002309 */
Peter Xu6f39c902022-10-04 14:24:30 -04002310 if (migrate_use_multifd() && !migration_in_postcopy()) {
Peter Xu61717ea2022-10-11 17:55:53 -04002311 return ram_save_multifd_page(pss->pss_channel, block, offset);
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002312 }
2313
Juan Quintela05931ec2021-12-15 19:01:21 +01002314 return ram_save_page(rs, pss);
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002315}
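/*
 * Dispatch-order sketch for a single target page (a summary of the function
 * above, not new behaviour):
 *
 *   control_save_page()      - e.g. RDMA takes the page, we're done
 *   save_compress_page()     - multi-threaded compression, when usable
 *   save_zero_page()         - short-circuit for all-zero pages
 *   ram_save_multifd_page()  - multifd, precopy only
 *   ram_save_page()          - plain (optionally XBZRLE) fallback
 */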
2316
Peter Xud9e474e2022-10-11 17:55:52 -04002317/* Should be called before sending a host page */
2318static void pss_host_page_prepare(PageSearchStatus *pss)
2319{
2320 /* How many guest pages are there in one host page? */
2321 size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2322
2323 pss->host_page_sending = true;
Peter Xu301d7ff2023-01-20 11:31:47 -05002324 if (guest_pfns <= 1) {
2325 /*
2326 * This covers both when guest psize == host psize, or when guest
2327 * has larger psize than the host (guest_pfns==0).
2328 *
2329 * For the latter, we always send one whole guest page per
2330 * iteration of the host page (example: an Alpha VM on x86 host
2331 * will have guest psize 8K while host psize 4K).
2332 */
2333 pss->host_page_start = pss->page;
2334 pss->host_page_end = pss->page + 1;
2335 } else {
2336 /*
2337 * The host page spans over multiple guest pages, we send them
2338 * within the same host page iteration.
2339 */
2340 pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
2341 pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
2342 }
Peter Xud9e474e2022-10-11 17:55:52 -04002343}
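/*
 * Worked example for the boundaries above (numbers are illustrative): with
 * 4 KiB target pages and a 2 MiB hugepage-backed block, guest_pfns is 512.
 * If pss->page is 1000, then
 *
 *   host_page_start = ROUND_DOWN(1000, 512) = 512
 *   host_page_end   = ROUND_UP(1001, 512)   = 1024
 *
 * so one host-page iteration covers target pages [512, 1024). With 4 KiB
 * host pages instead, guest_pfns is 1 and the window is just [1000, 1001).
 */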
2344
2345/*
2346 * Whether the page pointed by PSS is within the host page being sent.
2347 * Must be called after a previous pss_host_page_prepare().
2348 */
2349static bool pss_within_range(PageSearchStatus *pss)
2350{
2351 ram_addr_t ram_addr;
2352
2353 assert(pss->host_page_sending);
2354
2355 /* Over host-page boundary? */
2356 if (pss->page >= pss->host_page_end) {
2357 return false;
2358 }
2359
2360 ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2361
2362 return offset_in_ramblock(pss->block, ram_addr);
2363}
2364
2365static void pss_host_page_finish(PageSearchStatus *pss)
2366{
2367 pss->host_page_sending = false;
2368 /* This is not needed, but just to reset it */
2369 pss->host_page_start = pss->host_page_end = 0;
2370}
2371
Peter Xu93589822022-10-11 17:55:57 -04002372/*
2373 * Send an urgent host page specified by `pss'. Needs to be called with
2374 * bitmap_mutex held.
2375 *
2376 * Returns 0 if saving the host page succeeded, negative otherwise.
2377 */
2378static int ram_save_host_page_urgent(PageSearchStatus *pss)
2379{
2380 bool page_dirty, sent = false;
2381 RAMState *rs = ram_state;
2382 int ret = 0;
2383
2384 trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
2385 pss_host_page_prepare(pss);
2386
2387 /*
2388 * If precopy is sending the same page, let it be done in precopy, or
2389 * we could send the same page in two channels and none of them will
2390 * receive the whole page.
2391 */
2392 if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) {
2393 trace_postcopy_preempt_hit(pss->block->idstr,
2394 pss->page << TARGET_PAGE_BITS);
2395 return 0;
2396 }
2397
2398 do {
2399 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2400
2401 if (page_dirty) {
2402 /* Be strict to return code; it must be 1, or what else? */
2403 if (ram_save_target_page(rs, pss) != 1) {
2404 error_report_once("%s: ram_save_target_page failed", __func__);
2405 ret = -1;
2406 goto out;
2407 }
2408 sent = true;
2409 }
2410 pss_find_next_dirty(pss);
2411 } while (pss_within_range(pss));
2412out:
2413 pss_host_page_finish(pss);
2414 /* For urgent requests, flush immediately if sent */
2415 if (sent) {
2416 qemu_fflush(pss->pss_channel);
2417 }
2418 return ret;
2419}
2420
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002421/**
Juan Quintela3d0684b2017-03-23 15:06:39 +01002422 * ram_save_host_page: save a whole host page
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002423 *
Juan Quintela3d0684b2017-03-23 15:06:39 +01002424 * Starting at *offset, send pages up to the end of the current host
2425 * page. It's valid for the initial offset to point into the middle of
2426 * a host page, in which case the remainder of the host page is sent.
2427 * Only dirty target pages are sent. Note that the host page size may
2428 * be a huge page for this block.
Peter Xuf3321552022-10-11 17:55:50 -04002429 *
Dr. David Alan Gilbert1eb3fc02017-05-17 17:58:09 +01002430 * The saving stops at the boundary of the used_length of the block
2431 * if the RAMBlock isn't a multiple of the host page size.
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002432 *
Peter Xuf3321552022-10-11 17:55:50 -04002433 * The caller must hold ram_state.bitmap_mutex when calling this
2434 * function. Note that this function can temporarily release the lock, but
2435 * it will make sure the lock is held again before it returns.
2436 *
Juan Quintela3d0684b2017-03-23 15:06:39 +01002437 * Returns the number of pages written or negative on error
2438 *
Juan Quintela6f37bb82017-03-13 19:26:29 +01002439 * @rs: current RAM state
Juan Quintela3d0684b2017-03-23 15:06:39 +01002440 * @pss: data about the page we want to send
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002441 */
Juan Quintela05931ec2021-12-15 19:01:21 +01002442static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002443{
Peter Xuf3321552022-10-11 17:55:50 -04002444 bool page_dirty, preempt_active = postcopy_preempt_active();
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002445 int tmppages, pages = 0;
Juan Quintelaa935e302017-03-21 15:36:51 +01002446 size_t pagesize_bits =
2447 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03002448 unsigned long start_page = pss->page;
2449 int res;
Dr. David Alan Gilbert4c011c32017-02-24 18:28:39 +00002450
Yury Kotovfbd162e2019-02-15 20:45:46 +03002451 if (ramblock_is_ignored(pss->block)) {
Cédric Le Goaterb895de52018-05-14 08:57:00 +02002452 error_report("block %s should not be migrated !", pss->block->idstr);
2453 return 0;
2454 }
2455
Peter Xud9e474e2022-10-11 17:55:52 -04002456 /* Update host page boundary information */
2457 pss_host_page_prepare(pss);
2458
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002459 do {
Peter Xuf3321552022-10-11 17:55:50 -04002460 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
Xiao Guangrong1faa5662018-03-30 15:51:24 +08002461
Peter Xuf3321552022-10-11 17:55:50 -04002462 /* Check whether the page is dirty and, if so, send it */
2463 if (page_dirty) {
Kunkun Jiangba1b7c82021-03-16 20:57:16 +08002464 /*
Peter Xuf3321552022-10-11 17:55:50 -04002465 * Properly yield the lock only in postcopy preempt mode
2466 * because both the migration thread and the rp-return thread can
2467 * operate on the bitmaps.
Kunkun Jiangba1b7c82021-03-16 20:57:16 +08002468 */
Peter Xuf3321552022-10-11 17:55:50 -04002469 if (preempt_active) {
2470 qemu_mutex_unlock(&rs->bitmap_mutex);
Kunkun Jiangba1b7c82021-03-16 20:57:16 +08002471 }
Peter Xuf3321552022-10-11 17:55:50 -04002472 tmppages = ram_save_target_page(rs, pss);
2473 if (tmppages >= 0) {
2474 pages += tmppages;
2475 /*
2476 * Allow rate limiting to happen in the middle of huge pages if
2477 * something is sent in the current iteration.
2478 */
2479 if (pagesize_bits > 1 && tmppages > 0) {
2480 migration_rate_limit();
2481 }
2482 }
2483 if (preempt_active) {
2484 qemu_mutex_lock(&rs->bitmap_mutex);
2485 }
2486 } else {
2487 tmppages = 0;
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002488 }
Peter Xuf3321552022-10-11 17:55:50 -04002489
2490 if (tmppages < 0) {
Peter Xud9e474e2022-10-11 17:55:52 -04002491 pss_host_page_finish(pss);
Peter Xuf3321552022-10-11 17:55:50 -04002492 return tmppages;
2493 }
2494
Peter Xud9e474e2022-10-11 17:55:52 -04002495 pss_find_next_dirty(pss);
2496 } while (pss_within_range(pss));
2497
2498 pss_host_page_finish(pss);
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03002499
2500 res = ram_save_release_protection(rs, pss, start_page);
2501 return (res < 0 ? res : pages);
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002502}
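/*
 * Note that pagesize_bits above is really a page count despite its name:
 * e.g. a 2MB host page with 4KB target pages gives 2MB >> 12 == 512, which
 * is why migration_rate_limit() may run between target pages of one huge
 * page.
 */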
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00002503
2504/**
Juan Quintela3d0684b2017-03-23 15:06:39 +01002505 * ram_find_and_save_block: finds a dirty page and sends it to f
Juan Quintela56e93d22015-05-07 19:33:31 +02002506 *
2507 * Called within an RCU critical section.
2508 *
Xiao Guangronge8f37352018-09-03 17:26:44 +08002509 * Returns the number of pages written where zero means no dirty pages,
2510 * or negative on error
Juan Quintela56e93d22015-05-07 19:33:31 +02002511 *
Juan Quintela6f37bb82017-03-13 19:26:29 +01002512 * @rs: current RAM state
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002513 *
2514 * On systems where host-page-size > target-page-size it will send all the
2515 * pages in a host page that are dirty.
Juan Quintela56e93d22015-05-07 19:33:31 +02002516 */
Juan Quintela05931ec2021-12-15 19:01:21 +01002517static int ram_find_and_save_block(RAMState *rs)
Juan Quintela56e93d22015-05-07 19:33:31 +02002518{
Peter Xuf1668762022-10-11 17:55:55 -04002519 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
Juan Quintela56e93d22015-05-07 19:33:31 +02002520 int pages = 0;
Dr. David Alan Gilbertb9e60922015-09-23 15:27:11 +01002521 bool again, found;
Juan Quintela56e93d22015-05-07 19:33:31 +02002522
Ashijeet Acharya0827b9e2017-02-08 19:58:45 +05302523 /* No dirty pages as there is zero RAM */
2524 if (!ram_bytes_total()) {
2525 return pages;
2526 }
2527
Peter Xu4934a5d2022-10-04 14:24:26 -04002528 /*
2529 * Always keep last_seen_block/last_page valid during this procedure,
2530 * because find_dirty_block() relies on these values (e.g., we compare
2531 * last_seen_block with pss.block to see whether we searched all the
2532 * ramblocks) to detect the completion of migration. Having NULL value
2533 * of last_seen_block can conditionally cause below loop to run forever.
2534 */
2535 if (!rs->last_seen_block) {
2536 rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks);
2537 rs->last_page = 0;
2538 }
2539
Peter Xuf1668762022-10-11 17:55:55 -04002540 pss_init(pss, rs->last_seen_block, rs->last_page);
Dr. David Alan Gilbertb8fb8cb2015-09-23 15:27:10 +01002541
Dr. David Alan Gilbertb9e60922015-09-23 15:27:11 +01002542 do {
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002543 again = true;
Peter Xuf1668762022-10-11 17:55:55 -04002544 found = get_queued_page(rs, pss);
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002545
2546 if (!found) {
Peter Xub0621062022-10-11 17:55:58 -04002547 /* priority queue empty, so just search for something dirty */
2548 found = find_dirty_block(rs, pss, &again);
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002549 }
Dr. David Alan Gilbertb9e60922015-09-23 15:27:11 +01002550
2551 if (found) {
Peter Xuf1668762022-10-11 17:55:55 -04002552 pages = ram_save_host_page(rs, pss);
Juan Quintela56e93d22015-05-07 19:33:31 +02002553 }
Dr. David Alan Gilbertb9e60922015-09-23 15:27:11 +01002554 } while (!pages && again);
Juan Quintela56e93d22015-05-07 19:33:31 +02002555
Peter Xuf1668762022-10-11 17:55:55 -04002556 rs->last_seen_block = pss->block;
2557 rs->last_page = pss->page;
Juan Quintela56e93d22015-05-07 19:33:31 +02002558
2559 return pages;
2560}
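/*
 * In short, the search order above is: pages explicitly requested by the
 * destination (the postcopy priority queue) first, then a linear scan of
 * the dirty bitmap resuming from where the previous call left off.
 */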
2561
2562void acct_update_position(QEMUFile *f, size_t size, bool zero)
2563{
2564 uint64_t pages = size / TARGET_PAGE_SIZE;
Juan Quintelaf7ccd612017-03-13 20:30:21 +01002565
Juan Quintela56e93d22015-05-07 19:33:31 +02002566 if (zero) {
Peter Xu23b75762022-10-11 17:55:51 -04002567 stat64_add(&ram_atomic_counters.duplicate, pages);
Juan Quintela56e93d22015-05-07 19:33:31 +02002568 } else {
Peter Xu23b75762022-10-11 17:55:51 -04002569 stat64_add(&ram_atomic_counters.normal, pages);
David Edmondson4c2d0f62021-12-21 09:34:40 +00002570 ram_transferred_add(size);
Daniel P. Berrangé1a93bd22022-06-20 12:01:51 +01002571 qemu_file_credit_transfer(f, size);
Juan Quintela56e93d22015-05-07 19:33:31 +02002572 }
2573}
2574
Yury Kotovfbd162e2019-02-15 20:45:46 +03002575static uint64_t ram_bytes_total_common(bool count_ignored)
Juan Quintela56e93d22015-05-07 19:33:31 +02002576{
2577 RAMBlock *block;
2578 uint64_t total = 0;
2579
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01002580 RCU_READ_LOCK_GUARD();
2581
Yury Kotovfbd162e2019-02-15 20:45:46 +03002582 if (count_ignored) {
2583 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2584 total += block->used_length;
2585 }
2586 } else {
2587 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2588 total += block->used_length;
2589 }
Peter Xu99e15582017-05-12 12:17:39 +08002590 }
Juan Quintela56e93d22015-05-07 19:33:31 +02002591 return total;
2592}
2593
Yury Kotovfbd162e2019-02-15 20:45:46 +03002594uint64_t ram_bytes_total(void)
2595{
2596 return ram_bytes_total_common(false);
2597}
2598
Juan Quintelaf265e0e2017-06-28 11:52:27 +02002599static void xbzrle_load_setup(void)
Juan Quintela56e93d22015-05-07 19:33:31 +02002600{
Juan Quintelaf265e0e2017-06-28 11:52:27 +02002601 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
Juan Quintela56e93d22015-05-07 19:33:31 +02002602}
2603
Juan Quintelaf265e0e2017-06-28 11:52:27 +02002604static void xbzrle_load_cleanup(void)
2605{
2606 g_free(XBZRLE.decoded_buf);
2607 XBZRLE.decoded_buf = NULL;
2608}
2609
Peter Xu7d7c96b2017-10-19 14:31:58 +08002610static void ram_state_cleanup(RAMState **rsp)
2611{
Dr. David Alan Gilbertb9ccaf62018-02-12 16:03:39 +00002612 if (*rsp) {
2613 migration_page_queue_free(*rsp);
2614 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2615 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2616 g_free(*rsp);
2617 *rsp = NULL;
2618 }
Peter Xu7d7c96b2017-10-19 14:31:58 +08002619}
2620
Peter Xu84593a02017-10-19 14:31:59 +08002621static void xbzrle_cleanup(void)
2622{
2623 XBZRLE_cache_lock();
2624 if (XBZRLE.cache) {
2625 cache_fini(XBZRLE.cache);
2626 g_free(XBZRLE.encoded_buf);
2627 g_free(XBZRLE.current_buf);
2628 g_free(XBZRLE.zero_target_page);
2629 XBZRLE.cache = NULL;
2630 XBZRLE.encoded_buf = NULL;
2631 XBZRLE.current_buf = NULL;
2632 XBZRLE.zero_target_page = NULL;
2633 }
2634 XBZRLE_cache_unlock();
2635}
2636
Juan Quintelaf265e0e2017-06-28 11:52:27 +02002637static void ram_save_cleanup(void *opaque)
Juan Quintela56e93d22015-05-07 19:33:31 +02002638{
Juan Quintela53518d92017-05-04 11:46:24 +02002639 RAMState **rsp = opaque;
Juan Quintela6b6712e2017-03-22 15:18:04 +01002640 RAMBlock *block;
Juan Quintelaeb859c52017-03-13 21:51:55 +01002641
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03002642 /* We don't use dirty log with background snapshots */
2643 if (!migrate_background_snapshot()) {
2644 /* The caller holds the iothread lock or is in a bottom half, so there
2645 * is no write race against the migration bitmap
2646 */
Hyman Huang(黄勇)63b41db2021-06-29 16:01:19 +00002647 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2648 /*
2649 * do not stop dirty logging without having started it, since
2650 * memory_global_dirty_log_stop will assert that
2651 * memory_global_dirty_log_start/stop are used in pairs
2652 */
2653 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2654 }
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03002655 }
Juan Quintela6b6712e2017-03-22 15:18:04 +01002656
Yury Kotovfbd162e2019-02-15 20:45:46 +03002657 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
Peter Xu002cad62019-06-03 14:50:56 +08002658 g_free(block->clear_bmap);
2659 block->clear_bmap = NULL;
Juan Quintela6b6712e2017-03-22 15:18:04 +01002660 g_free(block->bmap);
2661 block->bmap = NULL;
Juan Quintela56e93d22015-05-07 19:33:31 +02002662 }
2663
Peter Xu84593a02017-10-19 14:31:59 +08002664 xbzrle_cleanup();
Juan Quintelaf0afa332017-06-28 11:52:28 +02002665 compress_threads_save_cleanup();
Peter Xu7d7c96b2017-10-19 14:31:58 +08002666 ram_state_cleanup(rsp);
Juan Quintela56e93d22015-05-07 19:33:31 +02002667}
2668
Juan Quintela6f37bb82017-03-13 19:26:29 +01002669static void ram_state_reset(RAMState *rs)
Juan Quintela56e93d22015-05-07 19:33:31 +02002670{
Peter Xuec6f3ab2022-10-11 17:55:56 -04002671 int i;
2672
2673 for (i = 0; i < RAM_CHANNEL_MAX; i++) {
2674 rs->pss[i].last_sent_block = NULL;
2675 }
2676
Juan Quintela6f37bb82017-03-13 19:26:29 +01002677 rs->last_seen_block = NULL;
Juan Quintela269ace22017-03-21 15:23:31 +01002678 rs->last_page = 0;
Juan Quintela6f37bb82017-03-13 19:26:29 +01002679 rs->last_version = ram_list.version;
David Hildenbrand1a373522021-02-16 11:50:39 +01002680 rs->xbzrle_enabled = false;
Juan Quintela56e93d22015-05-07 19:33:31 +02002681}
2682
2683#define MAX_WAIT 50 /* ms, half buffered_file limit */
2684
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002685/* **** functions for postcopy ***** */
2686
Pavel Butsykinced1c612017-02-03 18:23:21 +03002687void ram_postcopy_migrated_memory_release(MigrationState *ms)
2688{
2689 struct RAMBlock *block;
Pavel Butsykinced1c612017-02-03 18:23:21 +03002690
Yury Kotovfbd162e2019-02-15 20:45:46 +03002691 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
Juan Quintela6b6712e2017-03-22 15:18:04 +01002692 unsigned long *bitmap = block->bmap;
2693 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2694 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
Pavel Butsykinced1c612017-02-03 18:23:21 +03002695
2696 while (run_start < range) {
2697 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
Alexey Romko8bba0042020-01-10 14:51:34 +01002698 ram_discard_range(block->idstr,
2699 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2700 ((ram_addr_t)(run_end - run_start))
2701 << TARGET_PAGE_BITS);
Pavel Butsykinced1c612017-02-03 18:23:21 +03002702 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2703 }
2704 }
2705}
2706
Juan Quintela3d0684b2017-03-23 15:06:39 +01002707/**
2708 * postcopy_send_discard_bm_ram: discard a RAMBlock
2709 *
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002710 * Callback from postcopy_each_ram_send_discard for each RAMBlock
Juan Quintela3d0684b2017-03-23 15:06:39 +01002711 *
2712 * @ms: current migration state
Wei Yang89dab312019-07-15 10:05:49 +08002713 * @block: RAMBlock to discard
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002714 */
Philippe Mathieu-Daudé9e7d1222021-12-30 17:05:25 +01002715static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002716{
Juan Quintela6b6712e2017-03-22 15:18:04 +01002717 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002718 unsigned long current;
Wei Yang1e7cf8c2019-08-19 14:18:42 +08002719 unsigned long *bitmap = block->bmap;
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002720
Juan Quintela6b6712e2017-03-22 15:18:04 +01002721 for (current = 0; current < end; ) {
Wei Yang1e7cf8c2019-08-19 14:18:42 +08002722 unsigned long one = find_next_bit(bitmap, end, current);
Wei Yang33a5cb622019-06-27 10:08:21 +08002723 unsigned long zero, discard_length;
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002724
Wei Yang33a5cb622019-06-27 10:08:21 +08002725 if (one >= end) {
2726 break;
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002727 }
Wei Yang33a5cb622019-06-27 10:08:21 +08002728
Wei Yang1e7cf8c2019-08-19 14:18:42 +08002729 zero = find_next_zero_bit(bitmap, end, one + 1);
Wei Yang33a5cb622019-06-27 10:08:21 +08002730
2731 if (zero >= end) {
2732 discard_length = end - one;
2733 } else {
2734 discard_length = zero - one;
2735 }
Wei Yang810cf2b2019-07-24 09:07:21 +08002736 postcopy_discard_send_range(ms, one, discard_length);
Wei Yang33a5cb622019-06-27 10:08:21 +08002737 current = one + discard_length;
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002738 }
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002739}
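/*
 * Example of the run encoding above: if target pages 2-4 of the block are
 * dirty and the rest clean, the loop emits a single
 * postcopy_discard_send_range(ms, 2, 3).
 */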
2740
Peter Xuf30c2e52021-12-07 19:50:13 +08002741static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2742
Juan Quintela3d0684b2017-03-23 15:06:39 +01002743/**
2744 * postcopy_each_ram_send_discard: discard all RAMBlocks
2745 *
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002746 * Utility for the outgoing postcopy code.
2747 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2748 * passing it bitmap indexes and name.
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002749 * (qemu_ram_foreach_block ends up passing unscaled lengths
2750 * which would mean postcopy code would have to deal with target page)
Juan Quintela3d0684b2017-03-23 15:06:39 +01002751 *
2752 * @ms: current migration state
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002753 */
Peter Xu739fcc12021-12-07 19:50:14 +08002754static void postcopy_each_ram_send_discard(MigrationState *ms)
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002755{
2756 struct RAMBlock *block;
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002757
Yury Kotovfbd162e2019-02-15 20:45:46 +03002758 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
Wei Yang810cf2b2019-07-24 09:07:21 +08002759 postcopy_discard_send_init(ms, block->idstr);
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002760
2761 /*
Peter Xuf30c2e52021-12-07 19:50:13 +08002762 * Deal with TPS != HPS and huge pages. It discards any partially sent
2763 * host-page size chunks and marks any partially dirty host-page size
2764 * chunks as all dirty. In this case the host page is the host page
2765 * for the particular RAMBlock, i.e. it might be a huge page.
2766 */
2767 postcopy_chunk_hostpages_pass(ms, block);
2768
2769 /*
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002770 * Postcopy sends chunks of bitmap over the wire, but it
2771 * just needs indexes at this point, which avoids it having
2772 * target-page-specific code.
2773 */
Peter Xu739fcc12021-12-07 19:50:14 +08002774 postcopy_send_discard_bm_ram(ms, block);
Wei Yang810cf2b2019-07-24 09:07:21 +08002775 postcopy_discard_send_finish(ms);
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002776 }
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002777}
2778
Juan Quintela3d0684b2017-03-23 15:06:39 +01002779/**
Wei Yang8324ef82019-08-19 14:18:41 +08002780 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
Dr. David Alan Gilbert99e314e2015-11-05 18:11:15 +00002781 *
Juan Quintela3d0684b2017-03-23 15:06:39 +01002782 * Helper for postcopy_chunk_hostpages; it's called twice to
2783 * canonicalize the two bitmaps, which are similar, but one is
2784 * inverted.
Dr. David Alan Gilbert99e314e2015-11-05 18:11:15 +00002785 *
Juan Quintela3d0684b2017-03-23 15:06:39 +01002786 * Postcopy requires that all target pages in a host page are dirty or
2787 * clean, not a mix. This function canonicalizes the bitmaps.
2788 *
2789 * @ms: current migration state
Juan Quintela3d0684b2017-03-23 15:06:39 +01002790 * @block: block that contains the page we want to canonicalize
Dr. David Alan Gilbert99e314e2015-11-05 18:11:15 +00002791 */
Wei Yang1e7cf8c2019-08-19 14:18:42 +08002792static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
Dr. David Alan Gilbert99e314e2015-11-05 18:11:15 +00002793{
Juan Quintela53518d92017-05-04 11:46:24 +02002794 RAMState *rs = ram_state;
Juan Quintela6b6712e2017-03-22 15:18:04 +01002795 unsigned long *bitmap = block->bmap;
Dr. David Alan Gilbert29c59172017-02-24 18:28:31 +00002796 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
Juan Quintela6b6712e2017-03-22 15:18:04 +01002797 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
Dr. David Alan Gilbert99e314e2015-11-05 18:11:15 +00002798 unsigned long run_start;
2799
Dr. David Alan Gilbert29c59172017-02-24 18:28:31 +00002800 if (block->page_size == TARGET_PAGE_SIZE) {
2801 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2802 return;
2803 }
2804
Wei Yang1e7cf8c2019-08-19 14:18:42 +08002805 /* Find a dirty page */
2806 run_start = find_next_bit(bitmap, pages, 0);
Dr. David Alan Gilbert99e314e2015-11-05 18:11:15 +00002807
Juan Quintela6b6712e2017-03-22 15:18:04 +01002808 while (run_start < pages) {
Dr. David Alan Gilbert99e314e2015-11-05 18:11:15 +00002809
2810 /*
2811 * If the start of this run of pages is in the middle of a host
2812 * page, then we need to fixup this host page.
2813 */
Wei Yang9dec3cc2019-08-06 08:46:48 +08002814 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
Dr. David Alan Gilbert99e314e2015-11-05 18:11:15 +00002815 /* Find the end of this run */
Wei Yang1e7cf8c2019-08-19 14:18:42 +08002816 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
Dr. David Alan Gilbert99e314e2015-11-05 18:11:15 +00002817 /*
2818 * If the end isn't at the start of a host page, then the
2819 * run doesn't finish at the end of a host page
2820 * and we need to discard.
2821 */
Dr. David Alan Gilbert99e314e2015-11-05 18:11:15 +00002822 }
2823
Wei Yang9dec3cc2019-08-06 08:46:48 +08002824 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
Dr. David Alan Gilbert99e314e2015-11-05 18:11:15 +00002825 unsigned long page;
Wei Yangdad45ab2019-08-06 08:46:47 +08002826 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2827 host_ratio);
2828 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
Dr. David Alan Gilbert99e314e2015-11-05 18:11:15 +00002829
Dr. David Alan Gilbert99e314e2015-11-05 18:11:15 +00002830 /* Clean up the bitmap */
2831 for (page = fixup_start_addr;
2832 page < fixup_start_addr + host_ratio; page++) {
Dr. David Alan Gilbert99e314e2015-11-05 18:11:15 +00002833 /*
2834 * Remark them as dirty, updating the count for any pages
2835 * that weren't previously dirty.
2836 */
Juan Quintela0d8ec882017-03-13 21:21:41 +01002837 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
Dr. David Alan Gilbert99e314e2015-11-05 18:11:15 +00002838 }
2839 }
2840
Wei Yang1e7cf8c2019-08-19 14:18:42 +08002841 /* Find the next dirty page for the next iteration */
2842 run_start = find_next_bit(bitmap, pages, run_start);
Dr. David Alan Gilbert99e314e2015-11-05 18:11:15 +00002843 }
2844}
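/*
 * Example with host_ratio == 4 (e.g. 16KB host pages, 4KB target pages):
 * if only target pages 5-6 are dirty, the pass above re-marks pages 4-7 as
 * dirty so that a whole host page is resent rather than a partial one.
 */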
2845
Juan Quintela3d0684b2017-03-23 15:06:39 +01002846/**
Juan Quintela3d0684b2017-03-23 15:06:39 +01002847 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2848 *
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002849 * Transmit the set of pages to be discarded after precopy to the target;
2850 * these are pages that:
2851 * a) Have been previously transmitted but are now dirty again
2852 * b) Pages that have never been transmitted; this ensures that
2853 * any pages on the destination that have been mapped by background
2854 * tasks get discarded (transparent huge pages is the specific concern)
2855 * Hopefully this is pretty sparse
Juan Quintela3d0684b2017-03-23 15:06:39 +01002856 *
2857 * @ms: current migration state
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002858 */
Peter Xu739fcc12021-12-07 19:50:14 +08002859void ram_postcopy_send_discard_bitmap(MigrationState *ms)
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002860{
Juan Quintela53518d92017-05-04 11:46:24 +02002861 RAMState *rs = ram_state;
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002862
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01002863 RCU_READ_LOCK_GUARD();
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002864
2865 /* This should be our last sync, the src is now paused */
Juan Quintelaeb859c52017-03-13 21:51:55 +01002866 migration_bitmap_sync(rs);
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002867
Juan Quintela6b6712e2017-03-22 15:18:04 +01002868 /* Easiest way to make sure we don't resume in the middle of a host-page */
Peter Xuec6f3ab2022-10-11 17:55:56 -04002869 rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL;
Juan Quintela6b6712e2017-03-22 15:18:04 +01002870 rs->last_seen_block = NULL;
Juan Quintela6b6712e2017-03-22 15:18:04 +01002871 rs->last_page = 0;
2872
Peter Xu739fcc12021-12-07 19:50:14 +08002873 postcopy_each_ram_send_discard(ms);
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002874
Peter Xu739fcc12021-12-07 19:50:14 +08002875 trace_ram_postcopy_send_discard_bitmap();
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002876}
2877
Juan Quintela3d0684b2017-03-23 15:06:39 +01002878/**
2879 * ram_discard_range: discard dirtied pages at the beginning of postcopy
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002880 *
Juan Quintela3d0684b2017-03-23 15:06:39 +01002881 * Returns zero on success
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002882 *
Juan Quintela36449152017-03-23 15:11:59 +01002883 * @rbname: name of the RAMBlock of the request. NULL means the
2884 * same as the last one.
Juan Quintela3d0684b2017-03-23 15:06:39 +01002885 * @start: RAMBlock starting page
2886 * @length: RAMBlock size
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002887 */
Juan Quintelaaaa20642017-03-21 11:35:24 +01002888int ram_discard_range(const char *rbname, uint64_t start, size_t length)
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002889{
Juan Quintela36449152017-03-23 15:11:59 +01002890 trace_ram_discard_range(rbname, start, length);
Dr. David Alan Gilbertd3a50382017-02-24 18:28:32 +00002891
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01002892 RCU_READ_LOCK_GUARD();
Juan Quintela36449152017-03-23 15:11:59 +01002893 RAMBlock *rb = qemu_ram_block_by_name(rbname);
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002894
2895 if (!rb) {
Juan Quintela36449152017-03-23 15:11:59 +01002896 error_report("ram_discard_range: Failed to find block '%s'", rbname);
Daniel Henrique Barboza03acb4e2020-01-06 15:23:31 -03002897 return -1;
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002898 }
2899
Peter Xu814bb082018-07-23 20:33:02 +08002900 /*
2901 * On source VM, we don't need to update the received bitmap since
2902 * we don't even have one.
2903 */
2904 if (rb->receivedmap) {
2905 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2906 length >> qemu_target_page_bits());
2907 }
2908
Daniel Henrique Barboza03acb4e2020-01-06 15:23:31 -03002909 return ram_block_discard_range(rb, start, length);
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002910}
2911
Peter Xu84593a02017-10-19 14:31:59 +08002912/*
2913 * For every allocation, we will try not to crash the VM if the
2914 * allocation fails.
2915 */
2916static int xbzrle_init(void)
2917{
2918 Error *local_err = NULL;
2919
2920 if (!migrate_use_xbzrle()) {
2921 return 0;
2922 }
2923
2924 XBZRLE_cache_lock();
2925
2926 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2927 if (!XBZRLE.zero_target_page) {
2928 error_report("%s: Error allocating zero page", __func__);
2929 goto err_out;
2930 }
2931
2932 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2933 TARGET_PAGE_SIZE, &local_err);
2934 if (!XBZRLE.cache) {
2935 error_report_err(local_err);
2936 goto free_zero_page;
2937 }
2938
2939 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2940 if (!XBZRLE.encoded_buf) {
2941 error_report("%s: Error allocating encoded_buf", __func__);
2942 goto free_cache;
2943 }
2944
2945 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2946 if (!XBZRLE.current_buf) {
2947 error_report("%s: Error allocating current_buf", __func__);
2948 goto free_encoded_buf;
2949 }
2950
2951 /* We are all good */
2952 XBZRLE_cache_unlock();
2953 return 0;
2954
2955free_encoded_buf:
2956 g_free(XBZRLE.encoded_buf);
2957 XBZRLE.encoded_buf = NULL;
2958free_cache:
2959 cache_fini(XBZRLE.cache);
2960 XBZRLE.cache = NULL;
2961free_zero_page:
2962 g_free(XBZRLE.zero_target_page);
2963 XBZRLE.zero_target_page = NULL;
2964err_out:
2965 XBZRLE_cache_unlock();
2966 return -ENOMEM;
2967}
2968
Juan Quintela53518d92017-05-04 11:46:24 +02002969static int ram_state_init(RAMState **rsp)
Juan Quintela56e93d22015-05-07 19:33:31 +02002970{
Peter Xu7d00ee62017-10-19 14:31:57 +08002971 *rsp = g_try_new0(RAMState, 1);
2972
2973 if (!*rsp) {
2974 error_report("%s: Init ramstate fail", __func__);
2975 return -1;
2976 }
Juan Quintela53518d92017-05-04 11:46:24 +02002977
2978 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2979 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2980 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
Juan Quintela56e93d22015-05-07 19:33:31 +02002981
Peter Xu7d00ee62017-10-19 14:31:57 +08002982 /*
Ivan Ren40c4d4a2019-07-14 22:51:19 +08002983 * Count the total number of pages used by ram blocks not including any
2984 * gaps due to alignment or unplugs.
Wei Yang03158512019-06-04 14:17:27 +08002985 * This must match the initial value of the dirty bitmap.
Peter Xu7d00ee62017-10-19 14:31:57 +08002986 */
Ivan Ren40c4d4a2019-07-14 22:51:19 +08002987 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
Peter Xu7d00ee62017-10-19 14:31:57 +08002988 ram_state_reset(*rsp);
2989
2990 return 0;
2991}
2992
Peter Xud6eff5d2017-10-19 14:32:00 +08002993static void ram_list_init_bitmaps(void)
2994{
Peter Xu002cad62019-06-03 14:50:56 +08002995 MigrationState *ms = migrate_get_current();
Peter Xud6eff5d2017-10-19 14:32:00 +08002996 RAMBlock *block;
2997 unsigned long pages;
Peter Xu002cad62019-06-03 14:50:56 +08002998 uint8_t shift;
Peter Xud6eff5d2017-10-19 14:32:00 +08002999
3000 /* Skip setting bitmap if there is no RAM */
3001 if (ram_bytes_total()) {
Peter Xu002cad62019-06-03 14:50:56 +08003002 shift = ms->clear_bitmap_shift;
3003 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3004 error_report("clear_bitmap_shift (%u) too big, using "
3005 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3006 shift = CLEAR_BITMAP_SHIFT_MAX;
3007 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3008 error_report("clear_bitmap_shift (%u) too small, using "
3009 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3010 shift = CLEAR_BITMAP_SHIFT_MIN;
3011 }
3012
Yury Kotovfbd162e2019-02-15 20:45:46 +03003013 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
Peter Xud6eff5d2017-10-19 14:32:00 +08003014 pages = block->max_length >> TARGET_PAGE_BITS;
Wei Yang03158512019-06-04 14:17:27 +08003015 /*
3016 * The initial dirty bitmap for migration must be set with all
3017 * ones to make sure we'll migrate every guest RAM page to
3018 * destination.
Ivan Ren40c4d4a2019-07-14 22:51:19 +08003019 * Here we set RAMBlock.bmap all to 1 because when we restart a
3020 * new migration after a failed one, ram_list.
3021 * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't include the whole
3022 * guest memory.
Wei Yang03158512019-06-04 14:17:27 +08003023 */
Peter Xud6eff5d2017-10-19 14:32:00 +08003024 block->bmap = bitmap_new(pages);
Ivan Ren40c4d4a2019-07-14 22:51:19 +08003025 bitmap_set(block->bmap, 0, pages);
Peter Xu002cad62019-06-03 14:50:56 +08003026 block->clear_bmap_shift = shift;
3027 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
Peter Xud6eff5d2017-10-19 14:32:00 +08003028 }
3029 }
3030}
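/*
 * For scale: with clear_bmap_shift == 18 and 4KB target pages, one
 * clear_bmap bit covers 2^18 target pages, i.e. 1GB of guest memory.
 */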
3031
David Hildenbrandbe39b4c2021-10-11 19:53:41 +02003032static void migration_bitmap_clear_discarded_pages(RAMState *rs)
3033{
3034 unsigned long pages;
3035 RAMBlock *rb;
3036
3037 RCU_READ_LOCK_GUARD();
3038
3039 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3040 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
3041 rs->migration_dirty_pages -= pages;
3042 }
3043}
3044
Peter Xud6eff5d2017-10-19 14:32:00 +08003045static void ram_init_bitmaps(RAMState *rs)
3046{
3047 /* For memory_global_dirty_log_start below. */
3048 qemu_mutex_lock_iothread();
3049 qemu_mutex_lock_ramlist();
Peter Xud6eff5d2017-10-19 14:32:00 +08003050
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003051 WITH_RCU_READ_LOCK_GUARD() {
3052 ram_list_init_bitmaps();
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03003053 /* We don't use dirty log with background snapshots */
3054 if (!migrate_background_snapshot()) {
Hyman Huang(黄勇)63b41db2021-06-29 16:01:19 +00003055 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03003056 migration_bitmap_sync_precopy(rs);
3057 }
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003058 }
Peter Xud6eff5d2017-10-19 14:32:00 +08003059 qemu_mutex_unlock_ramlist();
3060 qemu_mutex_unlock_iothread();
David Hildenbrandbe39b4c2021-10-11 19:53:41 +02003061
3062 /*
3063 * After an eventual first bitmap sync, fixup the initial bitmap
3064 * containing all 1s to exclude any discarded pages from migration.
3065 */
3066 migration_bitmap_clear_discarded_pages(rs);
Peter Xud6eff5d2017-10-19 14:32:00 +08003067}
3068
Peter Xu7d00ee62017-10-19 14:31:57 +08003069static int ram_init_all(RAMState **rsp)
3070{
Peter Xu7d00ee62017-10-19 14:31:57 +08003071 if (ram_state_init(rsp)) {
3072 return -1;
3073 }
3074
Peter Xu84593a02017-10-19 14:31:59 +08003075 if (xbzrle_init()) {
3076 ram_state_cleanup(rsp);
3077 return -1;
Juan Quintela56e93d22015-05-07 19:33:31 +02003078 }
3079
Peter Xud6eff5d2017-10-19 14:32:00 +08003080 ram_init_bitmaps(*rsp);
zhanghailianga91246c2016-10-27 14:42:59 +08003081
3082 return 0;
3083}
3084
Peter Xu08614f32018-05-02 18:47:33 +08003085static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3086{
3087 RAMBlock *block;
3088 uint64_t pages = 0;
3089
3090 /*
3091 * Postcopy is not using xbzrle/compression, so no need for that.
3092 * Also, since the source is already halted, we don't need to care
3093 * about dirty page logging as well.
3094 */
3095
Yury Kotovfbd162e2019-02-15 20:45:46 +03003096 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
Peter Xu08614f32018-05-02 18:47:33 +08003097 pages += bitmap_count_one(block->bmap,
3098 block->used_length >> TARGET_PAGE_BITS);
3099 }
3100
3101 /* This may not be aligned with current bitmaps. Recalculate. */
3102 rs->migration_dirty_pages = pages;
3103
David Hildenbrand1a373522021-02-16 11:50:39 +01003104 ram_state_reset(rs);
Peter Xu08614f32018-05-02 18:47:33 +08003105
3106 /* Update RAMState cache of output QEMUFile */
Peter Xu7f401b82022-10-11 17:55:59 -04003107 rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out;
Peter Xu08614f32018-05-02 18:47:33 +08003108
3109 trace_ram_state_resume_prepare(pages);
3110}
3111
Juan Quintela3d0684b2017-03-23 15:06:39 +01003112/*
Wei Wang6bcb05f2018-12-11 16:24:50 +08003113 * This function clears bits of the free pages reported by the caller from the
3114 * migration dirty bitmap. @addr is the host address corresponding to the
3115 * start of the continuous guest free pages, and @len is the total bytes of
3116 * those pages.
3117 */
3118void qemu_guest_free_page_hint(void *addr, size_t len)
3119{
3120 RAMBlock *block;
3121 ram_addr_t offset;
3122 size_t used_len, start, npages;
3123 MigrationState *s = migrate_get_current();
3124
3125 /* This function is currently expected to be used during live migration */
3126 if (!migration_is_setup_or_active(s->state)) {
3127 return;
3128 }
3129
3130 for (; len > 0; len -= used_len, addr += used_len) {
3131 block = qemu_ram_block_from_host(addr, false, &offset);
3132 if (unlikely(!block || offset >= block->used_length)) {
3133 /*
3134 * The implementation might not support RAMBlock resize during
3135 * live migration, but it could happen in theory with future
3136 * updates. So we add a check here to capture that case.
3137 */
3138 error_report_once("%s unexpected error", __func__);
3139 return;
3140 }
3141
3142 if (len <= block->used_length - offset) {
3143 used_len = len;
3144 } else {
3145 used_len = block->used_length - offset;
3146 }
3147
3148 start = offset >> TARGET_PAGE_BITS;
3149 npages = used_len >> TARGET_PAGE_BITS;
3150
3151 qemu_mutex_lock(&ram_state->bitmap_mutex);
Wei Wang3143577d2021-07-22 04:30:55 -04003152 /*
3153 * The skipped free pages are equivalent to sent pages from clear_bmap's
3154 * perspective, so clear the bits from the memory region bitmap which
3155 * are initially set. Otherwise those skipped pages will be sent in
3156 * the next round after syncing from the memory region bitmap.
3157 */
David Hildenbrand1230a252021-09-04 18:09:07 +02003158 migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
Wei Wang6bcb05f2018-12-11 16:24:50 +08003159 ram_state->migration_dirty_pages -=
3160 bitmap_count_one_with_offset(block->bmap, start, npages);
3161 bitmap_clear(block->bmap, start, npages);
3162 qemu_mutex_unlock(&ram_state->bitmap_mutex);
3163 }
3164}
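/*
 * Example: a 2MB free range reported by the guest (e.g. via virtio-balloon
 * free page hinting) on a 4KB-page RAMBlock clears 512 bits in block->bmap
 * and drops migration_dirty_pages by however many of them were still set.
 */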
3165
3166/*
Juan Quintela3d0684b2017-03-23 15:06:39 +01003167 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
zhanghailianga91246c2016-10-27 14:42:59 +08003168 * a long-running RCU critical section. When RCU reclaims in the code
3169 * start to become numerous it will be necessary to reduce the
3170 * granularity of these critical sections.
3171 */
3172
Juan Quintela3d0684b2017-03-23 15:06:39 +01003173/**
3174 * ram_save_setup: Setup RAM for migration
3175 *
3176 * Returns zero to indicate success and negative for error
3177 *
3178 * @f: QEMUFile where to send the data
3179 * @opaque: RAMState pointer
3180 */
zhanghailianga91246c2016-10-27 14:42:59 +08003181static int ram_save_setup(QEMUFile *f, void *opaque)
3182{
Juan Quintela53518d92017-05-04 11:46:24 +02003183 RAMState **rsp = opaque;
zhanghailianga91246c2016-10-27 14:42:59 +08003184 RAMBlock *block;
Leonardo Bras33d70972022-05-13 03:28:35 -03003185 int ret;
zhanghailianga91246c2016-10-27 14:42:59 +08003186
Xiao Guangrongdcaf4462018-03-30 15:51:20 +08003187 if (compress_threads_save_setup()) {
3188 return -1;
3189 }
3190
zhanghailianga91246c2016-10-27 14:42:59 +08003191 /* migration has already setup the bitmap, reuse it. */
3192 if (!migration_in_colo_state()) {
Peter Xu7d00ee62017-10-19 14:31:57 +08003193 if (ram_init_all(rsp) != 0) {
Xiao Guangrongdcaf4462018-03-30 15:51:20 +08003194 compress_threads_save_cleanup();
zhanghailianga91246c2016-10-27 14:42:59 +08003195 return -1;
Juan Quintela53518d92017-05-04 11:46:24 +02003196 }
zhanghailianga91246c2016-10-27 14:42:59 +08003197 }
Peter Xu7f401b82022-10-11 17:55:59 -04003198 (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f;
zhanghailianga91246c2016-10-27 14:42:59 +08003199
Dr. David Alan Gilbert0e6ebd42019-10-07 15:36:38 +01003200 WITH_RCU_READ_LOCK_GUARD() {
3201 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
Juan Quintela56e93d22015-05-07 19:33:31 +02003202
Dr. David Alan Gilbert0e6ebd42019-10-07 15:36:38 +01003203 RAMBLOCK_FOREACH_MIGRATABLE(block) {
3204 qemu_put_byte(f, strlen(block->idstr));
3205 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3206 qemu_put_be64(f, block->used_length);
3207 if (migrate_postcopy_ram() && block->page_size !=
3208 qemu_host_page_size) {
3209 qemu_put_be64(f, block->page_size);
3210 }
3211 if (migrate_ignore_shared()) {
3212 qemu_put_be64(f, block->mr->addr);
3213 }
Yury Kotovfbd162e2019-02-15 20:45:46 +03003214 }
Juan Quintela56e93d22015-05-07 19:33:31 +02003215 }
3216
Juan Quintela56e93d22015-05-07 19:33:31 +02003217 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3218 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3219
Leonardo Bras33d70972022-05-13 03:28:35 -03003220 ret = multifd_send_sync_main(f);
3221 if (ret < 0) {
3222 return ret;
3223 }
3224
Juan Quintela56e93d22015-05-07 19:33:31 +02003225 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
Juan Quintela35374cb2018-04-18 10:13:21 +02003226 qemu_fflush(f);
Juan Quintela56e93d22015-05-07 19:33:31 +02003227
3228 return 0;
3229}
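/*
 * Roughly, the setup stage above puts on the wire: the total RAM size
 * tagged with RAM_SAVE_FLAG_MEM_SIZE, then for each migratable block its
 * idstr length, idstr and used_length (plus page_size and mr->addr when
 * the corresponding features are enabled), and finally RAM_SAVE_FLAG_EOS.
 */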
3230
Juan Quintela3d0684b2017-03-23 15:06:39 +01003231/**
3232 * ram_save_iterate: iterative stage for migration
3233 *
3234 * Returns zero to indicate success and negative for error
3235 *
3236 * @f: QEMUFile where to send the data
3237 * @opaque: RAMState pointer
3238 */
Juan Quintela56e93d22015-05-07 19:33:31 +02003239static int ram_save_iterate(QEMUFile *f, void *opaque)
3240{
Juan Quintela53518d92017-05-04 11:46:24 +02003241 RAMState **temp = opaque;
3242 RAMState *rs = *temp;
Juan Quintela3d4095b2019-12-18 05:12:36 +01003243 int ret = 0;
Juan Quintela56e93d22015-05-07 19:33:31 +02003244 int i;
3245 int64_t t0;
Thomas Huth5c903082016-11-04 14:10:17 +01003246 int done = 0;
Juan Quintela56e93d22015-05-07 19:33:31 +02003247
Peter Lievenb2557342018-03-08 12:18:24 +01003248 if (blk_mig_bulk_active()) {
3249 /* Avoid transferring ram during bulk phase of block migration as
3250 * the bulk phase will usually take a long time and transferring
3251 * ram updates during that time is pointless. */
3252 goto out;
3253 }
3254
Peter Xu63268c42021-06-30 16:08:05 -04003255 /*
3256 * We'll take this lock a little bit long, but it's okay for two reasons.
3257 * Firstly, the only other thread that can take it is the one calling
3258 * qemu_guest_free_page_hint(), which should be rare; secondly, see
3259 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
3260 * guarantees that we'll release it on a regular basis.
3261 */
3262 qemu_mutex_lock(&rs->bitmap_mutex);
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003263 WITH_RCU_READ_LOCK_GUARD() {
3264 if (ram_list.version != rs->last_version) {
3265 ram_state_reset(rs);
Dr. David Alan Gilberte03a34f2018-06-13 11:26:42 +01003266 }
3267
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003268 /* Read version before ram_list.blocks */
3269 smp_rmb();
Xiao Guangronge8f37352018-09-03 17:26:44 +08003270
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003271 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
Xiao Guangronge8f37352018-09-03 17:26:44 +08003272
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003273 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3274 i = 0;
3275 while ((ret = qemu_file_rate_limit(f)) == 0 ||
Peter Xua1fe28d2022-01-19 16:09:18 +08003276 postcopy_has_request(rs)) {
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003277 int pages;
Jason J. Herne070afca2015-09-08 13:12:35 -04003278
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003279 if (qemu_file_get_error(f)) {
Juan Quintela56e93d22015-05-07 19:33:31 +02003280 break;
3281 }
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003282
Juan Quintela05931ec2021-12-15 19:01:21 +01003283 pages = ram_find_and_save_block(rs);
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003284 /* no more pages to send */
3285 if (pages == 0) {
3286 done = 1;
3287 break;
3288 }
3289
3290 if (pages < 0) {
3291 qemu_file_set_error(f, pages);
3292 break;
3293 }
3294
3295 rs->target_page_count += pages;
3296
3297 /*
Wei Yang644acf92019-11-07 20:39:07 +08003298 * During postcopy, it is necessary to make sure one whole host
3299 * page is sent in one chunk.
3300 */
3301 if (migrate_postcopy_ram()) {
3302 flush_compressed_data(rs);
3303 }
3304
3305 /*
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003306 * we want to check in the 1st loop, just in case it was the 1st
3307 * time and we had to sync the dirty bitmap.
3308 * qemu_clock_get_ns() is a bit expensive, so we only check once
3309 * every few iterations
3310 */
3311 if ((i & 63) == 0) {
3312 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3313 1000000;
3314 if (t1 > MAX_WAIT) {
3315 trace_ram_save_iterate_big_wait(t1, i);
3316 break;
3317 }
3318 }
3319 i++;
Juan Quintela56e93d22015-05-07 19:33:31 +02003320 }
Juan Quintela56e93d22015-05-07 19:33:31 +02003321 }
Peter Xu63268c42021-06-30 16:08:05 -04003322 qemu_mutex_unlock(&rs->bitmap_mutex);
Juan Quintela56e93d22015-05-07 19:33:31 +02003323
3324 /*
3325 * Must occur before EOS (or any QEMUFile operation)
3326 * because of RDMA protocol.
3327 */
3328 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3329
Peter Lievenb2557342018-03-08 12:18:24 +01003330out:
Juan Quintelab69a0222020-01-22 11:36:12 +01003331 if (ret >= 0
3332 && migration_is_setup_or_active(migrate_get_current()->state)) {
Peter Xu7f401b82022-10-11 17:55:59 -04003333 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
Leonardo Bras33d70972022-05-13 03:28:35 -03003334 if (ret < 0) {
3335 return ret;
3336 }
3337
Juan Quintela3d4095b2019-12-18 05:12:36 +01003338 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3339 qemu_fflush(f);
David Edmondson4c2d0f62021-12-21 09:34:40 +00003340 ram_transferred_add(8);
Juan Quintela56e93d22015-05-07 19:33:31 +02003341
Juan Quintela3d4095b2019-12-18 05:12:36 +01003342 ret = qemu_file_get_error(f);
3343 }
Juan Quintela56e93d22015-05-07 19:33:31 +02003344 if (ret < 0) {
3345 return ret;
3346 }
3347
Thomas Huth5c903082016-11-04 14:10:17 +01003348 return done;
Juan Quintela56e93d22015-05-07 19:33:31 +02003349}
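/*
 * So ram_save_iterate() returns 1 once no dirty pages were found (done),
 * 0 when rate limiting stopped it with work still pending, and a negative
 * value on stream errors.
 */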
3350
Juan Quintela3d0684b2017-03-23 15:06:39 +01003351/**
3352 * ram_save_complete: function called to send the remaining amount of ram
3353 *
Xiao Guangronge8f37352018-09-03 17:26:44 +08003354 * Returns zero to indicate success or negative on error
Juan Quintela3d0684b2017-03-23 15:06:39 +01003355 *
3356 * Called with iothread lock
3357 *
3358 * @f: QEMUFile where to send the data
3359 * @opaque: RAMState pointer
3360 */
Juan Quintela56e93d22015-05-07 19:33:31 +02003361static int ram_save_complete(QEMUFile *f, void *opaque)
3362{
Juan Quintela53518d92017-05-04 11:46:24 +02003363 RAMState **temp = opaque;
3364 RAMState *rs = *temp;
Xiao Guangronge8f37352018-09-03 17:26:44 +08003365 int ret = 0;
Juan Quintela6f37bb82017-03-13 19:26:29 +01003366
Juan Quintela05931ec2021-12-15 19:01:21 +01003367 rs->last_stage = !migration_in_colo_state();
3368
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003369 WITH_RCU_READ_LOCK_GUARD() {
3370 if (!migration_in_postcopy()) {
3371 migration_bitmap_sync_precopy(rs);
Juan Quintela56e93d22015-05-07 19:33:31 +02003372 }
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003373
3374 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3375
3376 /* try transferring iterative blocks of memory */
3377
3378 /* flush all remaining blocks regardless of rate limiting */
Peter Xuc13221b2022-10-11 17:55:45 -04003379 qemu_mutex_lock(&rs->bitmap_mutex);
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003380 while (true) {
3381 int pages;
3382
Juan Quintela05931ec2021-12-15 19:01:21 +01003383 pages = ram_find_and_save_block(rs);
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003384 /* no more blocks to send */
3385 if (pages == 0) {
3386 break;
3387 }
3388 if (pages < 0) {
3389 ret = pages;
3390 break;
3391 }
Xiao Guangronge8f37352018-09-03 17:26:44 +08003392 }
Peter Xuc13221b2022-10-11 17:55:45 -04003393 qemu_mutex_unlock(&rs->bitmap_mutex);
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003394
3395 flush_compressed_data(rs);
3396 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
Juan Quintela56e93d22015-05-07 19:33:31 +02003397 }
3398
Leonardo Bras33d70972022-05-13 03:28:35 -03003399 if (ret < 0) {
3400 return ret;
Juan Quintela3d4095b2019-12-18 05:12:36 +01003401 }
Juan Quintela56e93d22015-05-07 19:33:31 +02003402
Peter Xu7f401b82022-10-11 17:55:59 -04003403 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
Leonardo Bras33d70972022-05-13 03:28:35 -03003404 if (ret < 0) {
3405 return ret;
3406 }
3407
3408 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3409 qemu_fflush(f);
3410
3411 return 0;
Juan Quintela56e93d22015-05-07 19:33:31 +02003412}
3413
Juan Quintelafd703852022-10-03 02:50:42 +02003414static void ram_state_pending_estimate(void *opaque,
Juan Quintelac8df4a72022-10-03 02:00:03 +02003415 uint64_t *res_precopy_only,
3416 uint64_t *res_compatible,
3417 uint64_t *res_postcopy_only)
Juan Quintela56e93d22015-05-07 19:33:31 +02003418{
Juan Quintela53518d92017-05-04 11:46:24 +02003419 RAMState **temp = opaque;
3420 RAMState *rs = *temp;
Juan Quintela56e93d22015-05-07 19:33:31 +02003421
Juan Quintelac8df4a72022-10-03 02:00:03 +02003422 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
Juan Quintela56e93d22015-05-07 19:33:31 +02003423
Juan Quintelac8df4a72022-10-03 02:00:03 +02003424 if (migrate_postcopy_ram()) {
3425 /* We can do postcopy, and all the data is postcopiable */
3426 *res_postcopy_only += remaining_size;
3427 } else {
3428 *res_precopy_only += remaining_size;
3429 }
3430}
3431
Juan Quintelafd703852022-10-03 02:50:42 +02003432static void ram_state_pending_exact(void *opaque,
Juan Quintelac8df4a72022-10-03 02:00:03 +02003433 uint64_t *res_precopy_only,
3434 uint64_t *res_compatible,
3435 uint64_t *res_postcopy_only)
3436{
3437 RAMState **temp = opaque;
3438 RAMState *rs = *temp;
3439
3440 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3441
3442 if (!migration_in_postcopy()) {
Juan Quintela56e93d22015-05-07 19:33:31 +02003443 qemu_mutex_lock_iothread();
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003444 WITH_RCU_READ_LOCK_GUARD() {
3445 migration_bitmap_sync_precopy(rs);
3446 }
Juan Quintela56e93d22015-05-07 19:33:31 +02003447 qemu_mutex_unlock_iothread();
Juan Quintela9edabd42017-03-14 12:02:16 +01003448 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
Juan Quintela56e93d22015-05-07 19:33:31 +02003449 }
Dr. David Alan Gilbertc31b0982015-11-05 18:10:54 +00003450
Vladimir Sementsov-Ogievskiy86e11672017-07-10 19:30:15 +03003451 if (migrate_postcopy_ram()) {
3452 /* We can do postcopy, and all the data is postcopiable */
Vladimir Sementsov-Ogievskiy47995022018-03-13 15:34:00 -04003453 *res_compatible += remaining_size;
Vladimir Sementsov-Ogievskiy86e11672017-07-10 19:30:15 +03003454 } else {
Vladimir Sementsov-Ogievskiy47995022018-03-13 15:34:00 -04003455 *res_precopy_only += remaining_size;
Vladimir Sementsov-Ogievskiy86e11672017-07-10 19:30:15 +03003456 }
Juan Quintela56e93d22015-05-07 19:33:31 +02003457}
3458
3459static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3460{
3461 unsigned int xh_len;
3462 int xh_flags;
Dr. David Alan Gilbert063e7602015-12-16 11:47:37 +00003463 uint8_t *loaded_data;
Juan Quintela56e93d22015-05-07 19:33:31 +02003464
Juan Quintela56e93d22015-05-07 19:33:31 +02003465 /* extract RLE header */
3466 xh_flags = qemu_get_byte(f);
3467 xh_len = qemu_get_be16(f);
3468
3469 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3470 error_report("Failed to load XBZRLE page - wrong compression!");
3471 return -1;
3472 }
3473
3474 if (xh_len > TARGET_PAGE_SIZE) {
3475 error_report("Failed to load XBZRLE page - len overflow!");
3476 return -1;
3477 }
Juan Quintelaf265e0e2017-06-28 11:52:27 +02003478 loaded_data = XBZRLE.decoded_buf;
Juan Quintela56e93d22015-05-07 19:33:31 +02003479 /* load data and decode */
Juan Quintelaf265e0e2017-06-28 11:52:27 +02003480 /* it can change loaded_data to point to an internal buffer */
Dr. David Alan Gilbert063e7602015-12-16 11:47:37 +00003481 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
Juan Quintela56e93d22015-05-07 19:33:31 +02003482
3483 /* decode RLE */
Dr. David Alan Gilbert063e7602015-12-16 11:47:37 +00003484 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
Juan Quintela56e93d22015-05-07 19:33:31 +02003485 TARGET_PAGE_SIZE) == -1) {
3486 error_report("Failed to load XBZRLE page - decode error!");
3487 return -1;
3488 }
3489
3490 return 0;
3491}
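/*
 * Wire format consumed above: one flags byte (must be ENCODING_FLAG_XBZRLE),
 * a big-endian 16-bit encoded length, then the XBZRLE delta, which is
 * decoded on top of the current contents of the destination page.
 */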
3492
Juan Quintela3d0684b2017-03-23 15:06:39 +01003493/**
3494 * ram_block_from_stream: read a RAMBlock id from the migration stream
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003495 *
Juan Quintela3d0684b2017-03-23 15:06:39 +01003496 * Must be called from within an RCU critical section.
3497 *
3498 * Returns a pointer from within the RCU-protected ram_list.
3499 *
Peter Xu755e8d72022-03-01 16:39:07 +08003500 * @mis: the migration incoming state pointer
Juan Quintela3d0684b2017-03-23 15:06:39 +01003501 * @f: QEMUFile where to read the data from
3502 * @flags: Page flags (mostly to see if it's a continuation of previous block)
Peter Xuc01b16e2022-07-07 14:55:04 -04003503 * @channel: the channel we're using
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003504 */
Peter Xu755e8d72022-03-01 16:39:07 +08003505static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
Peter Xuc01b16e2022-07-07 14:55:04 -04003506 QEMUFile *f, int flags,
3507 int channel)
Juan Quintela56e93d22015-05-07 19:33:31 +02003508{
Peter Xuc01b16e2022-07-07 14:55:04 -04003509 RAMBlock *block = mis->last_recv_block[channel];
Juan Quintela56e93d22015-05-07 19:33:31 +02003510 char id[256];
3511 uint8_t len;
3512
3513 if (flags & RAM_SAVE_FLAG_CONTINUE) {
zhanghailiang4c4bad42016-01-15 11:37:41 +08003514 if (!block) {
Juan Quintela56e93d22015-05-07 19:33:31 +02003515 error_report("Ack, bad migration stream!");
3516 return NULL;
3517 }
zhanghailiang4c4bad42016-01-15 11:37:41 +08003518 return block;
Juan Quintela56e93d22015-05-07 19:33:31 +02003519 }
3520
3521 len = qemu_get_byte(f);
3522 qemu_get_buffer(f, (uint8_t *)id, len);
3523 id[len] = 0;
3524
Dr. David Alan Gilberte3dd7492015-11-05 18:10:33 +00003525 block = qemu_ram_block_by_name(id);
zhanghailiang4c4bad42016-01-15 11:37:41 +08003526 if (!block) {
3527 error_report("Can't find block %s", id);
3528 return NULL;
Juan Quintela56e93d22015-05-07 19:33:31 +02003529 }
3530
Yury Kotovfbd162e2019-02-15 20:45:46 +03003531 if (ramblock_is_ignored(block)) {
Cédric Le Goaterb895de52018-05-14 08:57:00 +02003532 error_report("block %s should not be migrated !", id);
3533 return NULL;
3534 }
3535
Peter Xuc01b16e2022-07-07 14:55:04 -04003536 mis->last_recv_block[channel] = block;
Peter Xu755e8d72022-03-01 16:39:07 +08003537
zhanghailiang4c4bad42016-01-15 11:37:41 +08003538 return block;
3539}
3540
3541static inline void *host_from_ram_block_offset(RAMBlock *block,
3542 ram_addr_t offset)
3543{
3544 if (!offset_in_ramblock(block, offset)) {
3545 return NULL;
3546 }
3547
3548 return block->host + offset;
Juan Quintela56e93d22015-05-07 19:33:31 +02003549}
3550
David Hildenbrand6a23f632021-04-29 13:27:05 +02003551static void *host_page_from_ram_block_offset(RAMBlock *block,
3552 ram_addr_t offset)
3553{
3554 /* Note: Explicitly no check against offset_in_ramblock(). */
3555 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3556 block->page_size);
3557}
3558
3559static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3560 ram_addr_t offset)
3561{
3562 return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3563}
3564
Zhang Chen13af18f2018-09-03 12:38:48 +08003565static inline void *colo_cache_from_block_offset(RAMBlock *block,
zhanghailiang8af66372020-02-24 14:54:11 +08003566 ram_addr_t offset, bool record_bitmap)
Zhang Chen13af18f2018-09-03 12:38:48 +08003567{
3568 if (!offset_in_ramblock(block, offset)) {
3569 return NULL;
3570 }
3571 if (!block->colo_cache) {
3572 error_report("%s: colo_cache is NULL in block :%s",
3573 __func__, block->idstr);
3574 return NULL;
3575 }
Zhang Chen7d9acaf2018-09-03 12:38:49 +08003576
3577 /*
3578 * During a COLO checkpoint, we need a bitmap of these migrated pages.
3579 * It helps us decide which pages in the ram cache should be flushed
3580 * into the VM's RAM later.
3581 */
zhanghailiang8af66372020-02-24 14:54:11 +08003582 if (record_bitmap &&
3583 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
Zhang Chen7d9acaf2018-09-03 12:38:49 +08003584 ram_state->migration_dirty_pages++;
3585 }
Zhang Chen13af18f2018-09-03 12:38:48 +08003586 return block->colo_cache + offset;
3587}
3588
Juan Quintela3d0684b2017-03-23 15:06:39 +01003589/**
3590 * ram_handle_compressed: handle the zero page case
3591 *
Juan Quintela56e93d22015-05-07 19:33:31 +02003592 * If a page (or a whole RDMA chunk) has been
3593 * determined to be zero, then zap it.
Juan Quintela3d0684b2017-03-23 15:06:39 +01003594 *
3595 * @host: host address for the zero page
3596 * @ch: what the page is filled from. We only support zero
3597 * @size: size of the zero page
Juan Quintela56e93d22015-05-07 19:33:31 +02003598 */
3599void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3600{
Juan Quintelabad452a2021-11-18 15:56:38 +01003601 if (ch != 0 || !buffer_is_zero(host, size)) {
Juan Quintela56e93d22015-05-07 19:33:31 +02003602 memset(host, ch, size);
3603 }
3604}
3605
Xiao Guangrong797ca152018-03-30 15:51:21 +08003606/* return the size after decompression, or a negative value on error */
3607static int
3608qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3609 const uint8_t *source, size_t source_len)
3610{
3611 int err;
3612
3613 err = inflateReset(stream);
3614 if (err != Z_OK) {
3615 return -1;
3616 }
3617
3618 stream->avail_in = source_len;
3619 stream->next_in = (uint8_t *)source;
3620 stream->avail_out = dest_len;
3621 stream->next_out = dest;
3622
3623 err = inflate(stream, Z_NO_FLUSH);
3624 if (err != Z_STREAM_END) {
3625 return -1;
3626 }
3627
3628 return stream->total_out;
3629}
3630
Juan Quintela56e93d22015-05-07 19:33:31 +02003631static void *do_data_decompress(void *opaque)
3632{
3633 DecompressParam *param = opaque;
3634 unsigned long pagesize;
Liang Li33d151f2016-05-05 15:32:58 +08003635 uint8_t *des;
Xiao Guangrong34ab9e92018-03-30 15:51:22 +08003636 int len, ret;
Juan Quintela56e93d22015-05-07 19:33:31 +02003637
Liang Li33d151f2016-05-05 15:32:58 +08003638 qemu_mutex_lock(&param->mutex);
Liang Li90e56fb2016-05-05 15:32:56 +08003639 while (!param->quit) {
Liang Li33d151f2016-05-05 15:32:58 +08003640 if (param->des) {
3641 des = param->des;
3642 len = param->len;
3643 param->des = 0;
3644 qemu_mutex_unlock(&param->mutex);
3645
Liang Li73a89122016-05-05 15:32:51 +08003646 pagesize = TARGET_PAGE_SIZE;
Xiao Guangrong34ab9e92018-03-30 15:51:22 +08003647
3648 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3649 param->compbuf, len);
Xiao Guangrongf5482222018-05-03 16:06:11 +08003650 if (ret < 0 && migrate_get_current()->decompress_error_check) {
Xiao Guangrong34ab9e92018-03-30 15:51:22 +08003651 error_report("decompress data failed");
3652 qemu_file_set_error(decomp_file, ret);
3653 }
Liang Li73a89122016-05-05 15:32:51 +08003654
Liang Li33d151f2016-05-05 15:32:58 +08003655 qemu_mutex_lock(&decomp_done_lock);
3656 param->done = true;
3657 qemu_cond_signal(&decomp_done_cond);
3658 qemu_mutex_unlock(&decomp_done_lock);
3659
3660 qemu_mutex_lock(&param->mutex);
3661 } else {
3662 qemu_cond_wait(&param->cond, &param->mutex);
3663 }
Juan Quintela56e93d22015-05-07 19:33:31 +02003664 }
Liang Li33d151f2016-05-05 15:32:58 +08003665 qemu_mutex_unlock(&param->mutex);
Juan Quintela56e93d22015-05-07 19:33:31 +02003666
3667 return NULL;
3668}
3669
Xiao Guangrong34ab9e92018-03-30 15:51:22 +08003670static int wait_for_decompress_done(void)
Liang Li5533b2e2016-05-05 15:32:52 +08003671{
3672 int idx, thread_count;
3673
3674 if (!migrate_use_compression()) {
Xiao Guangrong34ab9e92018-03-30 15:51:22 +08003675 return 0;
Liang Li5533b2e2016-05-05 15:32:52 +08003676 }
3677
3678 thread_count = migrate_decompress_threads();
3679 qemu_mutex_lock(&decomp_done_lock);
3680 for (idx = 0; idx < thread_count; idx++) {
3681 while (!decomp_param[idx].done) {
3682 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3683 }
3684 }
3685 qemu_mutex_unlock(&decomp_done_lock);
Xiao Guangrong34ab9e92018-03-30 15:51:22 +08003686 return qemu_file_get_error(decomp_file);
Liang Li5533b2e2016-05-05 15:32:52 +08003687}
3688
Juan Quintelaf0afa332017-06-28 11:52:28 +02003689static void compress_threads_load_cleanup(void)
Juan Quintela56e93d22015-05-07 19:33:31 +02003690{
3691 int i, thread_count;
3692
Juan Quintela3416ab52016-04-20 11:56:01 +02003693 if (!migrate_use_compression()) {
3694 return;
3695 }
Juan Quintela56e93d22015-05-07 19:33:31 +02003696 thread_count = migrate_decompress_threads();
3697 for (i = 0; i < thread_count; i++) {
Xiao Guangrong797ca152018-03-30 15:51:21 +08003698 /*
3699 * we use it as an indicator of whether the thread has been
3700 * properly initialized
3701 */
3702 if (!decomp_param[i].compbuf) {
3703 break;
3704 }
3705
Juan Quintela56e93d22015-05-07 19:33:31 +02003706 qemu_mutex_lock(&decomp_param[i].mutex);
Liang Li90e56fb2016-05-05 15:32:56 +08003707 decomp_param[i].quit = true;
Juan Quintela56e93d22015-05-07 19:33:31 +02003708 qemu_cond_signal(&decomp_param[i].cond);
3709 qemu_mutex_unlock(&decomp_param[i].mutex);
3710 }
3711 for (i = 0; i < thread_count; i++) {
Xiao Guangrong797ca152018-03-30 15:51:21 +08003712 if (!decomp_param[i].compbuf) {
3713 break;
3714 }
3715
Juan Quintela56e93d22015-05-07 19:33:31 +02003716 qemu_thread_join(decompress_threads + i);
3717 qemu_mutex_destroy(&decomp_param[i].mutex);
3718 qemu_cond_destroy(&decomp_param[i].cond);
Xiao Guangrong797ca152018-03-30 15:51:21 +08003719 inflateEnd(&decomp_param[i].stream);
Juan Quintela56e93d22015-05-07 19:33:31 +02003720 g_free(decomp_param[i].compbuf);
Xiao Guangrong797ca152018-03-30 15:51:21 +08003721 decomp_param[i].compbuf = NULL;
Juan Quintela56e93d22015-05-07 19:33:31 +02003722 }
3723 g_free(decompress_threads);
3724 g_free(decomp_param);
Juan Quintela56e93d22015-05-07 19:33:31 +02003725 decompress_threads = NULL;
3726 decomp_param = NULL;
Xiao Guangrong34ab9e92018-03-30 15:51:22 +08003727 decomp_file = NULL;
Juan Quintela56e93d22015-05-07 19:33:31 +02003728}
3729
Xiao Guangrong34ab9e92018-03-30 15:51:22 +08003730static int compress_threads_load_setup(QEMUFile *f)
Xiao Guangrong797ca152018-03-30 15:51:21 +08003731{
3732 int i, thread_count;
3733
3734 if (!migrate_use_compression()) {
3735 return 0;
3736 }
3737
3738 thread_count = migrate_decompress_threads();
3739 decompress_threads = g_new0(QemuThread, thread_count);
3740 decomp_param = g_new0(DecompressParam, thread_count);
3741 qemu_mutex_init(&decomp_done_lock);
3742 qemu_cond_init(&decomp_done_cond);
Xiao Guangrong34ab9e92018-03-30 15:51:22 +08003743 decomp_file = f;
Xiao Guangrong797ca152018-03-30 15:51:21 +08003744 for (i = 0; i < thread_count; i++) {
3745 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3746 goto exit;
3747 }
3748
3749 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3750 qemu_mutex_init(&decomp_param[i].mutex);
3751 qemu_cond_init(&decomp_param[i].cond);
3752 decomp_param[i].done = true;
3753 decomp_param[i].quit = false;
3754 qemu_thread_create(decompress_threads + i, "decompress",
3755 do_data_decompress, decomp_param + i,
3756 QEMU_THREAD_JOINABLE);
3757 }
3758 return 0;
3759exit:
3760 compress_threads_load_cleanup();
3761 return -1;
3762}
3763
Dr. David Alan Gilbertc1bc6622015-12-16 11:47:38 +00003764static void decompress_data_with_multi_threads(QEMUFile *f,
Juan Quintela56e93d22015-05-07 19:33:31 +02003765 void *host, int len)
3766{
3767 int idx, thread_count;
3768
3769 thread_count = migrate_decompress_threads();
Mahmoud Mandour37396952021-03-11 05:15:35 +02003770 QEMU_LOCK_GUARD(&decomp_done_lock);
Juan Quintela56e93d22015-05-07 19:33:31 +02003771 while (true) {
3772 for (idx = 0; idx < thread_count; idx++) {
Liang Li73a89122016-05-05 15:32:51 +08003773 if (decomp_param[idx].done) {
Liang Li33d151f2016-05-05 15:32:58 +08003774 decomp_param[idx].done = false;
3775 qemu_mutex_lock(&decomp_param[idx].mutex);
Dr. David Alan Gilbertc1bc6622015-12-16 11:47:38 +00003776 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
Juan Quintela56e93d22015-05-07 19:33:31 +02003777 decomp_param[idx].des = host;
3778 decomp_param[idx].len = len;
Liang Li33d151f2016-05-05 15:32:58 +08003779 qemu_cond_signal(&decomp_param[idx].cond);
3780 qemu_mutex_unlock(&decomp_param[idx].mutex);
Juan Quintela56e93d22015-05-07 19:33:31 +02003781 break;
3782 }
3783 }
3784 if (idx < thread_count) {
3785 break;
Liang Li73a89122016-05-05 15:32:51 +08003786 } else {
3787 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
Juan Quintela56e93d22015-05-07 19:33:31 +02003788 }
3789 }
3790}
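/*
 * Illustrative sketch (hypothetical helper, not called anywhere): the
 * lifecycle of the decompression thread pool as the load path below
 * uses it - one setup per incoming stream, one dispatch per compressed
 * page, then a final drain and teardown.
 */
static int example_load_compressed_pages(QEMUFile *f, void **hosts,
                                         int *lens, int npages)
{
    int i, ret;

    if (compress_threads_load_setup(f)) {
        return -1;
    }
    for (i = 0; i < npages; i++) {
        /* hands the page to an idle worker; waits until one is free */
        decompress_data_with_multi_threads(f, hosts[i], lens[i]);
    }
    /* wait for every worker to finish and collect any stream error */
    ret = wait_for_decompress_done();
    compress_threads_load_cleanup();
    return ret;
}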
3791
Rao, Leib70cb3b2020-10-16 13:52:01 +08003792static void colo_init_ram_state(void)
3793{
3794 ram_state_init(&ram_state);
Rao, Leib70cb3b2020-10-16 13:52:01 +08003795}
3796
Zhang Chen13af18f2018-09-03 12:38:48 +08003797/*
3798 * colo cache: this is for the secondary VM; we cache the whole
3799 * memory of the secondary VM. The global lock must be held when
3800 * calling this helper.
3801 */
3802int colo_init_ram_cache(void)
3803{
3804 RAMBlock *block;
3805
Paolo Bonzini44901b52019-12-13 15:07:22 +01003806 WITH_RCU_READ_LOCK_GUARD() {
3807 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3808 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
David Hildenbrand8dbe22c2021-05-10 13:43:21 +02003809 NULL, false, false);
Paolo Bonzini44901b52019-12-13 15:07:22 +01003810 if (!block->colo_cache) {
3811 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3812 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3813 block->used_length);
3814 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3815 if (block->colo_cache) {
3816 qemu_anon_ram_free(block->colo_cache, block->used_length);
3817 block->colo_cache = NULL;
3818 }
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003819 }
Paolo Bonzini44901b52019-12-13 15:07:22 +01003820 return -errno;
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003821 }
Lukas Straube5fdf922021-07-04 18:14:44 +02003822 if (!machine_dump_guest_core(current_machine)) {
3823 qemu_madvise(block->colo_cache, block->used_length,
3824 QEMU_MADV_DONTDUMP);
3825 }
Zhang Chen13af18f2018-09-03 12:38:48 +08003826 }
Zhang Chen13af18f2018-09-03 12:38:48 +08003827 }
Paolo Bonzini44901b52019-12-13 15:07:22 +01003828
Zhang Chen7d9acaf2018-09-03 12:38:49 +08003829 /*
3830 * Record the dirty pages sent by the PVM; we use this dirty bitmap to
3831 * decide which pages in the cache should be flushed into the SVM's RAM. Here
3832 * we use the same name 'ram_bitmap' as for migration.
3833 */
3834 if (ram_bytes_total()) {
3835 RAMBlock *block;
3836
Yury Kotovfbd162e2019-02-15 20:45:46 +03003837 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
Zhang Chen7d9acaf2018-09-03 12:38:49 +08003838 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
Zhang Chen7d9acaf2018-09-03 12:38:49 +08003839 block->bmap = bitmap_new(pages);
Zhang Chen7d9acaf2018-09-03 12:38:49 +08003840 }
3841 }
Zhang Chen7d9acaf2018-09-03 12:38:49 +08003842
Rao, Leib70cb3b2020-10-16 13:52:01 +08003843 colo_init_ram_state();
Zhang Chen13af18f2018-09-03 12:38:48 +08003844 return 0;
Zhang Chen13af18f2018-09-03 12:38:48 +08003845}
3846
zhanghailiang03930312020-02-24 14:54:10 +08003847/* TODO: duplicated with ram_init_bitmaps */
3848void colo_incoming_start_dirty_log(void)
3849{
3850 RAMBlock *block = NULL;
3851 /* For memory_global_dirty_log_start below. */
3852 qemu_mutex_lock_iothread();
3853 qemu_mutex_lock_ramlist();
3854
3855 memory_global_dirty_log_sync();
3856 WITH_RCU_READ_LOCK_GUARD() {
3857 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3858 ramblock_sync_dirty_bitmap(ram_state, block);
3859 /* Discard this dirty bitmap record */
3860 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3861 }
Hyman Huang(黄勇)63b41db2021-06-29 16:01:19 +00003862 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
zhanghailiang03930312020-02-24 14:54:10 +08003863 }
3864 ram_state->migration_dirty_pages = 0;
3865 qemu_mutex_unlock_ramlist();
3866 qemu_mutex_unlock_iothread();
3867}
3868
Zhang Chen13af18f2018-09-03 12:38:48 +08003869/* The global lock must be held to call this helper */
3870void colo_release_ram_cache(void)
3871{
3872 RAMBlock *block;
3873
Hyman Huang(黄勇)63b41db2021-06-29 16:01:19 +00003874 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
Yury Kotovfbd162e2019-02-15 20:45:46 +03003875 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
Zhang Chen7d9acaf2018-09-03 12:38:49 +08003876 g_free(block->bmap);
3877 block->bmap = NULL;
3878 }
3879
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003880 WITH_RCU_READ_LOCK_GUARD() {
3881 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3882 if (block->colo_cache) {
3883 qemu_anon_ram_free(block->colo_cache, block->used_length);
3884 block->colo_cache = NULL;
3885 }
Zhang Chen13af18f2018-09-03 12:38:48 +08003886 }
3887 }
zhanghailiang03930312020-02-24 14:54:10 +08003888 ram_state_cleanup(&ram_state);
Zhang Chen13af18f2018-09-03 12:38:48 +08003889}
3890
Juan Quintela3d0684b2017-03-23 15:06:39 +01003891/**
Juan Quintelaf265e0e2017-06-28 11:52:27 +02003892 * ram_load_setup: Setup RAM for migration incoming side
3893 *
3894 * Returns zero to indicate success and a negative value on error
3895 *
3896 * @f: QEMUFile where to receive the data
3897 * @opaque: RAMState pointer
3898 */
3899static int ram_load_setup(QEMUFile *f, void *opaque)
3900{
Xiao Guangrong34ab9e92018-03-30 15:51:22 +08003901 if (compress_threads_load_setup(f)) {
Xiao Guangrong797ca152018-03-30 15:51:21 +08003902 return -1;
3903 }
3904
Juan Quintelaf265e0e2017-06-28 11:52:27 +02003905 xbzrle_load_setup();
Alexey Perevalovf9494612017-10-05 14:13:20 +03003906 ramblock_recv_map_init();
Zhang Chen13af18f2018-09-03 12:38:48 +08003907
Juan Quintelaf265e0e2017-06-28 11:52:27 +02003908 return 0;
3909}
3910
3911static int ram_load_cleanup(void *opaque)
3912{
Alexey Perevalovf9494612017-10-05 14:13:20 +03003913 RAMBlock *rb;
Junyan He56eb90a2018-07-18 15:48:03 +08003914
Yury Kotovfbd162e2019-02-15 20:45:46 +03003915 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
Beata Michalskabd108a42019-11-21 00:08:42 +00003916 qemu_ram_block_writeback(rb);
Junyan He56eb90a2018-07-18 15:48:03 +08003917 }
3918
Juan Quintelaf265e0e2017-06-28 11:52:27 +02003919 xbzrle_load_cleanup();
Juan Quintelaf0afa332017-06-28 11:52:28 +02003920 compress_threads_load_cleanup();
Alexey Perevalovf9494612017-10-05 14:13:20 +03003921
Yury Kotovfbd162e2019-02-15 20:45:46 +03003922 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
Alexey Perevalovf9494612017-10-05 14:13:20 +03003923 g_free(rb->receivedmap);
3924 rb->receivedmap = NULL;
3925 }
Zhang Chen13af18f2018-09-03 12:38:48 +08003926
Juan Quintelaf265e0e2017-06-28 11:52:27 +02003927 return 0;
3928}
3929
3930/**
Juan Quintela3d0684b2017-03-23 15:06:39 +01003931 * ram_postcopy_incoming_init: allocate postcopy data structures
3932 *
3933 * Returns 0 on success and a negative value on error
3934 *
3935 * @mis: current migration incoming state
3936 *
3937 * Allocate the data structures etc. needed by incoming migration with
3938 * postcopy-ram. postcopy-ram's similarly named
3939 * postcopy_ram_incoming_init does the work.
Dr. David Alan Gilbert1caddf82015-11-05 18:11:03 +00003940 */
3941int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3942{
David Hildenbrandc1361802018-06-20 22:27:36 +02003943 return postcopy_ram_incoming_init(mis);
Dr. David Alan Gilbert1caddf82015-11-05 18:11:03 +00003944}
3945
Juan Quintela3d0684b2017-03-23 15:06:39 +01003946/**
3947 * ram_load_postcopy: load a page in postcopy case
3948 *
3949 * Returns 0 for success or -errno in case of error
3950 *
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003951 * Called in postcopy mode by ram_load().
3952 * rcu_read_lock is taken prior to this being called.
Juan Quintela3d0684b2017-03-23 15:06:39 +01003953 *
3954 * @f: QEMUFile where to receive the data
Peter Xu36f62f12022-07-07 14:55:02 -04003955 * @channel: the channel to use for loading
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003956 */
Peter Xu36f62f12022-07-07 14:55:02 -04003957int ram_load_postcopy(QEMUFile *f, int channel)
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003958{
3959 int flags = 0, ret = 0;
3960 bool place_needed = false;
Peter Xu1aa83672018-07-10 17:18:53 +08003961 bool matches_target_page_size = false;
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003962 MigrationIncomingState *mis = migration_incoming_get_current();
Peter Xu36f62f12022-07-07 14:55:02 -04003963 PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003964
3965 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3966 ram_addr_t addr;
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003967 void *page_buffer = NULL;
3968 void *place_source = NULL;
Dr. David Alan Gilbertdf9ff5e2017-02-24 18:28:35 +00003969 RAMBlock *block = NULL;
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003970 uint8_t ch;
Wei Yang644acf92019-11-07 20:39:07 +08003971 int len;
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003972
3973 addr = qemu_get_be64(f);
Peter Xu7a9ddfb2018-02-08 18:31:05 +08003974
3975 /*
3976 * If qemu file error, we should stop here, and then "addr"
3977 * may be invalid
3978 */
3979 ret = qemu_file_get_error(f);
3980 if (ret) {
3981 break;
3982 }
3983
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003984 flags = addr & ~TARGET_PAGE_MASK;
3985 addr &= TARGET_PAGE_MASK;
3986
Peter Xu36f62f12022-07-07 14:55:02 -04003987 trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
Wei Yang644acf92019-11-07 20:39:07 +08003988 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3989 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
Peter Xuc01b16e2022-07-07 14:55:04 -04003990 block = ram_block_from_stream(mis, f, flags, channel);
David Hildenbrand6a23f632021-04-29 13:27:05 +02003991 if (!block) {
3992 ret = -EINVAL;
3993 break;
3994 }
zhanghailiang4c4bad42016-01-15 11:37:41 +08003995
David Hildenbrand898ba902021-04-29 13:27:06 +02003996 /*
3997 * Relying on used_length is racy and can result in false positives.
3998 * We might place pages beyond used_length in case RAM was shrunk
3999 * while in postcopy, which is fine - trying to place via
4000 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
4001 */
4002 if (!block->host || addr >= block->postcopy_length) {
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00004003 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4004 ret = -EINVAL;
4005 break;
4006 }
Peter Xu77dadc32022-03-01 16:39:04 +08004007 tmp_page->target_pages++;
Peter Xu1aa83672018-07-10 17:18:53 +08004008 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00004009 /*
Dr. David Alan Gilbert28abd202017-02-24 18:28:37 +00004010 * Postcopy requires that we place whole host pages atomically;
4011 * these may be huge pages for RAMBlocks that are backed by
4012 * hugetlbfs.
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00004013 * To make it atomic, the data is read into a temporary page
4014 * that's moved into place later.
4015 * The migration protocol uses, possibly smaller, target pages;
4016 * however, the source ensures it always sends all the components
Wei Yang91ba4422019-11-07 20:39:06 +08004017 * of a host page in one chunk.
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00004018 */
Peter Xu77dadc32022-03-01 16:39:04 +08004019 page_buffer = tmp_page->tmp_huge_page +
David Hildenbrand6a23f632021-04-29 13:27:05 +02004020 host_page_offset_from_ram_block_offset(block, addr);
4021 /* If all target pages are zero then we can optimise the placement */
Peter Xu77dadc32022-03-01 16:39:04 +08004022 if (tmp_page->target_pages == 1) {
4023 tmp_page->host_addr =
4024 host_page_from_ram_block_offset(block, addr);
4025 } else if (tmp_page->host_addr !=
4026 host_page_from_ram_block_offset(block, addr)) {
Dr. David Alan Gilbertc53b7dd2015-11-05 18:11:12 +00004027 /* not the first target page within the host page */
Peter Xu36f62f12022-07-07 14:55:02 -04004028 error_report("Non-same host page detected on channel %d: "
Peter Xucfc7dc82022-03-01 16:39:05 +08004029 "Target host page %p, received host page %p "
4030 "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
Peter Xu36f62f12022-07-07 14:55:02 -04004031 channel, tmp_page->host_addr,
Peter Xucfc7dc82022-03-01 16:39:05 +08004032 host_page_from_ram_block_offset(block, addr),
4033 block->idstr, addr, tmp_page->target_pages);
David Hildenbrand6a23f632021-04-29 13:27:05 +02004034 ret = -EINVAL;
4035 break;
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00004036 }
4037
4038 /*
4039 * If it's the last part of a host page then we place the host
4040 * page
4041 */
Peter Xu77dadc32022-03-01 16:39:04 +08004042 if (tmp_page->target_pages ==
4043 (block->page_size / TARGET_PAGE_SIZE)) {
Wei Yang4cbb3c62019-11-07 20:39:04 +08004044 place_needed = true;
Wei Yang4cbb3c62019-11-07 20:39:04 +08004045 }
Peter Xu77dadc32022-03-01 16:39:04 +08004046 place_source = tmp_page->tmp_huge_page;
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00004047 }
4048
4049 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
Juan Quintelabb890ed2017-04-28 09:39:55 +02004050 case RAM_SAVE_FLAG_ZERO:
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00004051 ch = qemu_get_byte(f);
Wei Yang2e36bc12019-11-07 20:39:02 +08004052 /*
4053 * We can skip setting page_buffer when
4054 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
4055 */
4056 if (ch || !matches_target_page_size) {
4057 memset(page_buffer, ch, TARGET_PAGE_SIZE);
4058 }
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00004059 if (ch) {
Peter Xu77dadc32022-03-01 16:39:04 +08004060 tmp_page->all_zero = false;
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00004061 }
4062 break;
4063
4064 case RAM_SAVE_FLAG_PAGE:
Peter Xu77dadc32022-03-01 16:39:04 +08004065 tmp_page->all_zero = false;
Peter Xu1aa83672018-07-10 17:18:53 +08004066 if (!matches_target_page_size) {
4067 /* For huge pages, we always use temporary buffer */
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00004068 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4069 } else {
Peter Xu1aa83672018-07-10 17:18:53 +08004070 /*
4071 * For small pages that matches target page size, we
4072 * avoid the qemu_file copy. Instead we directly use
4073 * the buffer of QEMUFile to place the page. Note: we
4074 * cannot do any QEMUFile operation before using that
4075 * buffer to make sure the buffer is valid when
4076 * placing the page.
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00004077 */
4078 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4079 TARGET_PAGE_SIZE);
4080 }
4081 break;
Wei Yang644acf92019-11-07 20:39:07 +08004082 case RAM_SAVE_FLAG_COMPRESS_PAGE:
Peter Xu77dadc32022-03-01 16:39:04 +08004083 tmp_page->all_zero = false;
Wei Yang644acf92019-11-07 20:39:07 +08004084 len = qemu_get_be32(f);
4085 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4086 error_report("Invalid compressed data length: %d", len);
4087 ret = -EINVAL;
4088 break;
4089 }
4090 decompress_data_with_multi_threads(f, page_buffer, len);
4091 break;
4092
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00004093 case RAM_SAVE_FLAG_EOS:
4094 /* normal exit */
Juan Quintela6df264a2018-02-28 09:10:07 +01004095 multifd_recv_sync_main();
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00004096 break;
4097 default:
Bihong Yu29fccad2020-10-20 11:10:42 +08004098 error_report("Unknown combination of migration flags: 0x%x"
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00004099 " (postcopy mode)", flags);
4100 ret = -EINVAL;
Peter Xu7a9ddfb2018-02-08 18:31:05 +08004101 break;
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00004102 }
4103
Wei Yang644acf92019-11-07 20:39:07 +08004104 /* Got the whole host page, wait for decompression before placing. */
4105 if (place_needed) {
4106 ret |= wait_for_decompress_done();
4107 }
4108
Peter Xu7a9ddfb2018-02-08 18:31:05 +08004109 /* Detect for any possible file errors */
4110 if (!ret && qemu_file_get_error(f)) {
4111 ret = qemu_file_get_error(f);
4112 }
4113
4114 if (!ret && place_needed) {
Peter Xu77dadc32022-03-01 16:39:04 +08004115 if (tmp_page->all_zero) {
4116 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00004117 } else {
Peter Xu77dadc32022-03-01 16:39:04 +08004118 ret = postcopy_place_page(mis, tmp_page->host_addr,
4119 place_source, block);
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00004120 }
David Hildenbrandddf35bd2020-04-21 10:52:56 +02004121 place_needed = false;
Peter Xu77dadc32022-03-01 16:39:04 +08004122 postcopy_temp_page_reset(tmp_page);
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00004123 }
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00004124 }
4125
4126 return ret;
4127}
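/*
 * Worked example (assumed sizes, added for illustration): with a
 * 2 MiB hugetlbfs-backed RAMBlock and 4 KiB target pages, the host
 * page is placed only after block->page_size / TARGET_PAGE_SIZE =
 * 2097152 / 4096 = 512 target pages have been copied into the
 * temporary huge page; a single atomic placement then installs all
 * of them at once.
 */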
4128
Daniel Henrique Barbozaacab30b2017-11-16 20:35:26 -02004129static bool postcopy_is_advised(void)
4130{
4131 PostcopyState ps = postcopy_state_get();
4132 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
4133}
4134
4135static bool postcopy_is_running(void)
4136{
4137 PostcopyState ps = postcopy_state_get();
4138 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4139}
4140
Zhang Chene6f4aa12018-09-03 12:38:50 +08004141/*
4142 * Flush the contents of the RAM cache into the SVM's memory.
4143 * Only flush pages that were dirtied by the PVM, the SVM, or both.
4144 */
Lukas Straub24fa16f2020-05-11 13:10:51 +02004145void colo_flush_ram_cache(void)
Zhang Chene6f4aa12018-09-03 12:38:50 +08004146{
4147 RAMBlock *block = NULL;
4148 void *dst_host;
4149 void *src_host;
4150 unsigned long offset = 0;
4151
zhanghailiangd1955d22018-09-03 12:38:55 +08004152 memory_global_dirty_log_sync();
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01004153 WITH_RCU_READ_LOCK_GUARD() {
4154 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4155 ramblock_sync_dirty_bitmap(ram_state, block);
Zhang Chene6f4aa12018-09-03 12:38:50 +08004156 }
4157 }
4158
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01004159 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4160 WITH_RCU_READ_LOCK_GUARD() {
4161 block = QLIST_FIRST_RCU(&ram_list.blocks);
4162
4163 while (block) {
Rao, Leia6a83ce2021-11-09 11:04:55 +08004164 unsigned long num = 0;
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01004165
Rao, Leia6a83ce2021-11-09 11:04:55 +08004166 offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
David Hildenbrand542147f2021-04-29 13:27:08 +02004167 if (!offset_in_ramblock(block,
4168 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01004169 offset = 0;
Rao, Leia6a83ce2021-11-09 11:04:55 +08004170 num = 0;
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01004171 block = QLIST_NEXT_RCU(block, next);
4172 } else {
Rao, Leia6a83ce2021-11-09 11:04:55 +08004173 unsigned long i = 0;
4174
4175 for (i = 0; i < num; i++) {
4176 migration_bitmap_clear_dirty(ram_state, block, offset + i);
4177 }
Alexey Romko8bba0042020-01-10 14:51:34 +01004178 dst_host = block->host
4179 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4180 src_host = block->colo_cache
4181 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
Rao, Leia6a83ce2021-11-09 11:04:55 +08004182 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
4183 offset += num;
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01004184 }
4185 }
4186 }
Zhang Chene6f4aa12018-09-03 12:38:50 +08004187 trace_colo_flush_ram_cache_end();
4188}
4189
Wei Yang10da4a32019-07-25 08:20:23 +08004190/**
4191 * ram_load_precopy: load pages in precopy case
4192 *
4193 * Returns 0 for success or -errno in case of error
4194 *
4195 * Called in precopy mode by ram_load().
4196 * rcu_read_lock is taken prior to this being called.
4197 *
4198 * @f: QEMUFile where to receive the data
4199 */
4200static int ram_load_precopy(QEMUFile *f)
Juan Quintela56e93d22015-05-07 19:33:31 +02004201{
Peter Xu755e8d72022-03-01 16:39:07 +08004202 MigrationIncomingState *mis = migration_incoming_get_current();
Yury Kotove65cec52019-11-25 16:36:32 +03004203 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
Dr. David Alan Gilbertef08fb32017-02-24 18:28:30 +00004204 /* ADVISE is earlier, it shows the source has the postcopy capability on */
Daniel Henrique Barbozaacab30b2017-11-16 20:35:26 -02004205 bool postcopy_advised = postcopy_is_advised();
Juan Quintelaedc60122016-11-02 12:40:46 +01004206 if (!migrate_use_compression()) {
4207 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4208 }
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00004209
Wei Yang10da4a32019-07-25 08:20:23 +08004210 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
Juan Quintela56e93d22015-05-07 19:33:31 +02004211 ram_addr_t addr, total_ram_bytes;
zhanghailiang03930312020-02-24 14:54:10 +08004212 void *host = NULL, *host_bak = NULL;
Juan Quintela56e93d22015-05-07 19:33:31 +02004213 uint8_t ch;
4214
Yury Kotove65cec52019-11-25 16:36:32 +03004215 /*
4216 * Yield periodically to let the main loop run, but an iteration of
4217 * the main loop is expensive, so only do it once every several iterations
4218 */
4219 if ((i & 32767) == 0 && qemu_in_coroutine()) {
4220 aio_co_schedule(qemu_get_current_aio_context(),
4221 qemu_coroutine_self());
4222 qemu_coroutine_yield();
4223 }
4224 i++;
4225
Juan Quintela56e93d22015-05-07 19:33:31 +02004226 addr = qemu_get_be64(f);
4227 flags = addr & ~TARGET_PAGE_MASK;
4228 addr &= TARGET_PAGE_MASK;
4229
Juan Quintelaedc60122016-11-02 12:40:46 +01004230 if (flags & invalid_flags) {
4231 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4232 error_report("Received an unexpected compressed page");
4233 }
4234
4235 ret = -EINVAL;
4236 break;
4237 }
4238
Juan Quintelabb890ed2017-04-28 09:39:55 +02004239 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
Dr. David Alan Gilberta776aa12015-11-05 18:10:39 +00004240 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
Peter Xuc01b16e2022-07-07 14:55:04 -04004241 RAMBlock *block = ram_block_from_stream(mis, f, flags,
4242 RAM_CHANNEL_PRECOPY);
zhanghailiang4c4bad42016-01-15 11:37:41 +08004243
zhanghailiang03930312020-02-24 14:54:10 +08004244 host = host_from_ram_block_offset(block, addr);
Zhang Chen13af18f2018-09-03 12:38:48 +08004245 /*
zhanghailiang03930312020-02-24 14:54:10 +08004246 * After entering the COLO stage, we should not load pages
4247 * into the SVM's memory directly; we put them into colo_cache first.
4248 * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
4249 * Previously, we copied all of this memory during the COLO preparation
4250 * stage, which required stopping the VM and was time-consuming.
4251 * Here we optimize it with a trick: back up every page during the
4252 * migration process while COLO is enabled. Although this affects the
4253 * speed of the migration, it clearly reduces the downtime of backing
4254 * up all of the SVM's memory in the COLO preparation stage.
Zhang Chen13af18f2018-09-03 12:38:48 +08004255 */
zhanghailiang03930312020-02-24 14:54:10 +08004256 if (migration_incoming_colo_enabled()) {
4257 if (migration_incoming_in_colo_state()) {
4258 /* In COLO stage, put all pages into cache temporarily */
zhanghailiang8af66372020-02-24 14:54:11 +08004259 host = colo_cache_from_block_offset(block, addr, true);
zhanghailiang03930312020-02-24 14:54:10 +08004260 } else {
4261 /*
4262 * In migration stage but before COLO stage,
4263 * Put all pages into both cache and SVM's memory.
4264 */
zhanghailiang8af66372020-02-24 14:54:11 +08004265 host_bak = colo_cache_from_block_offset(block, addr, false);
zhanghailiang03930312020-02-24 14:54:10 +08004266 }
Zhang Chen13af18f2018-09-03 12:38:48 +08004267 }
Dr. David Alan Gilberta776aa12015-11-05 18:10:39 +00004268 if (!host) {
4269 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4270 ret = -EINVAL;
4271 break;
4272 }
Zhang Chen13af18f2018-09-03 12:38:48 +08004273 if (!migration_incoming_in_colo_state()) {
4274 ramblock_recv_bitmap_set(block, host);
4275 }
4276
Dr. David Alan Gilbert1db9d8e2017-04-26 19:37:21 +01004277 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
Dr. David Alan Gilberta776aa12015-11-05 18:10:39 +00004278 }
4279
Juan Quintela56e93d22015-05-07 19:33:31 +02004280 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4281 case RAM_SAVE_FLAG_MEM_SIZE:
4282 /* Synchronize RAM block list */
4283 total_ram_bytes = addr;
4284 while (!ret && total_ram_bytes) {
4285 RAMBlock *block;
Juan Quintela56e93d22015-05-07 19:33:31 +02004286 char id[256];
4287 ram_addr_t length;
4288
4289 len = qemu_get_byte(f);
4290 qemu_get_buffer(f, (uint8_t *)id, len);
4291 id[len] = 0;
4292 length = qemu_get_be64(f);
4293
Dr. David Alan Gilberte3dd7492015-11-05 18:10:33 +00004294 block = qemu_ram_block_by_name(id);
CĂ©dric Le Goaterb895de52018-05-14 08:57:00 +02004295 if (block && !qemu_ram_is_migratable(block)) {
4296 error_report("block %s should not be migrated !", id);
4297 ret = -EINVAL;
4298 } else if (block) {
Dr. David Alan Gilberte3dd7492015-11-05 18:10:33 +00004299 if (length != block->used_length) {
4300 Error *local_err = NULL;
Juan Quintela56e93d22015-05-07 19:33:31 +02004301
Gongleifa53a0e2016-05-10 10:04:59 +08004302 ret = qemu_ram_resize(block, length,
Dr. David Alan Gilberte3dd7492015-11-05 18:10:33 +00004303 &local_err);
4304 if (local_err) {
4305 error_report_err(local_err);
Juan Quintela56e93d22015-05-07 19:33:31 +02004306 }
Juan Quintela56e93d22015-05-07 19:33:31 +02004307 }
Dr. David Alan Gilbertef08fb32017-02-24 18:28:30 +00004308 /* For postcopy we need to check hugepage sizes match */
Stefan Reitere846b742021-02-04 17:35:22 +01004309 if (postcopy_advised && migrate_postcopy_ram() &&
Dr. David Alan Gilbertef08fb32017-02-24 18:28:30 +00004310 block->page_size != qemu_host_page_size) {
4311 uint64_t remote_page_size = qemu_get_be64(f);
4312 if (remote_page_size != block->page_size) {
4313 error_report("Mismatched RAM page size %s "
4314 "(local) %zd != %" PRId64,
4315 id, block->page_size,
4316 remote_page_size);
4317 ret = -EINVAL;
4318 }
4319 }
Yury Kotovfbd162e2019-02-15 20:45:46 +03004320 if (migrate_ignore_shared()) {
4321 hwaddr addr = qemu_get_be64(f);
Yury Kotovfbd162e2019-02-15 20:45:46 +03004322 if (ramblock_is_ignored(block) &&
4323 block->mr->addr != addr) {
4324 error_report("Mismatched GPAs for block %s "
4325 "%" PRId64 "!= %" PRId64,
4326 id, (uint64_t)addr,
4327 (uint64_t)block->mr->addr);
4328 ret = -EINVAL;
4329 }
4330 }
Dr. David Alan Gilberte3dd7492015-11-05 18:10:33 +00004331 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4332 block->idstr);
4333 } else {
Juan Quintela56e93d22015-05-07 19:33:31 +02004334 error_report("Unknown ramblock \"%s\", cannot "
4335 "accept migration", id);
4336 ret = -EINVAL;
4337 }
4338
4339 total_ram_bytes -= length;
4340 }
4341 break;
Dr. David Alan Gilberta776aa12015-11-05 18:10:39 +00004342
Juan Quintelabb890ed2017-04-28 09:39:55 +02004343 case RAM_SAVE_FLAG_ZERO:
Juan Quintela56e93d22015-05-07 19:33:31 +02004344 ch = qemu_get_byte(f);
4345 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4346 break;
Dr. David Alan Gilberta776aa12015-11-05 18:10:39 +00004347
Juan Quintela56e93d22015-05-07 19:33:31 +02004348 case RAM_SAVE_FLAG_PAGE:
Juan Quintela56e93d22015-05-07 19:33:31 +02004349 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4350 break;
Juan Quintela56e93d22015-05-07 19:33:31 +02004351
Dr. David Alan Gilberta776aa12015-11-05 18:10:39 +00004352 case RAM_SAVE_FLAG_COMPRESS_PAGE:
Juan Quintela56e93d22015-05-07 19:33:31 +02004353 len = qemu_get_be32(f);
4354 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4355 error_report("Invalid compressed data length: %d", len);
4356 ret = -EINVAL;
4357 break;
4358 }
Dr. David Alan Gilbertc1bc6622015-12-16 11:47:38 +00004359 decompress_data_with_multi_threads(f, host, len);
Juan Quintela56e93d22015-05-07 19:33:31 +02004360 break;
Dr. David Alan Gilberta776aa12015-11-05 18:10:39 +00004361
Juan Quintela56e93d22015-05-07 19:33:31 +02004362 case RAM_SAVE_FLAG_XBZRLE:
Juan Quintela56e93d22015-05-07 19:33:31 +02004363 if (load_xbzrle(f, addr, host) < 0) {
4364 error_report("Failed to decompress XBZRLE page at "
4365 RAM_ADDR_FMT, addr);
4366 ret = -EINVAL;
4367 break;
4368 }
4369 break;
4370 case RAM_SAVE_FLAG_EOS:
4371 /* normal exit */
Juan Quintela6df264a2018-02-28 09:10:07 +01004372 multifd_recv_sync_main();
Juan Quintela56e93d22015-05-07 19:33:31 +02004373 break;
4374 default:
4375 if (flags & RAM_SAVE_FLAG_HOOK) {
Dr. David Alan Gilbert632e3a52015-06-11 18:17:23 +01004376 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
Juan Quintela56e93d22015-05-07 19:33:31 +02004377 } else {
Bihong Yu29fccad2020-10-20 11:10:42 +08004378 error_report("Unknown combination of migration flags: 0x%x",
Juan Quintela56e93d22015-05-07 19:33:31 +02004379 flags);
4380 ret = -EINVAL;
4381 }
4382 }
4383 if (!ret) {
4384 ret = qemu_file_get_error(f);
4385 }
zhanghailiang03930312020-02-24 14:54:10 +08004386 if (!ret && host_bak) {
4387 memcpy(host_bak, host, TARGET_PAGE_SIZE);
4388 }
Juan Quintela56e93d22015-05-07 19:33:31 +02004389 }
4390
Wei Yangca1a6b72019-11-07 20:39:03 +08004391 ret |= wait_for_decompress_done();
Wei Yang10da4a32019-07-25 08:20:23 +08004392 return ret;
4393}
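/*
 * Informal summary of the precopy record layout parsed above, derived
 * from ram_load_precopy() itself and kept here purely as a reading aid:
 *
 *   be64  addr | flags                  (flags live in the low bits of addr)
 *   RAM_SAVE_FLAG_MEM_SIZE:  repeated { u8 len, idstr, be64 block length,
 *                            optional be64 page size / GPA, ... }
 *   RAM_SAVE_FLAG_ZERO:      u8 fill byte
 *   RAM_SAVE_FLAG_PAGE:      TARGET_PAGE_SIZE raw bytes
 *   RAM_SAVE_FLAG_COMPRESS_PAGE: be32 len, then len bytes of zlib data
 *   RAM_SAVE_FLAG_XBZRLE:    xbzrle-encoded page
 *   RAM_SAVE_FLAG_EOS:       end of section
 */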
4394
4395static int ram_load(QEMUFile *f, void *opaque, int version_id)
4396{
4397 int ret = 0;
4398 static uint64_t seq_iter;
4399 /*
4400 * If system is running in postcopy mode, page inserts to host memory must
4401 * be atomic
4402 */
4403 bool postcopy_running = postcopy_is_running();
4404
4405 seq_iter++;
4406
4407 if (version_id != 4) {
4408 return -EINVAL;
4409 }
4410
4411 /*
4412 * This RCU critical section can be very long running.
4413 * When RCU reclaims in the code start to become numerous,
4414 * it will be necessary to reduce the granularity of this
4415 * critical section.
4416 */
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01004417 WITH_RCU_READ_LOCK_GUARD() {
4418 if (postcopy_running) {
Peter Xu36f62f12022-07-07 14:55:02 -04004419 /*
4420 * Note! Here RAM_CHANNEL_PRECOPY is the precopy channel of
4421 * postcopy migration; we have another RAM_CHANNEL_POSTCOPY to
4422 * service fast page faults.
4423 */
4424 ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01004425 } else {
4426 ret = ram_load_precopy(f);
4427 }
Wei Yang10da4a32019-07-25 08:20:23 +08004428 }
Juan Quintela55c44462017-01-23 22:32:05 +01004429 trace_ram_load_complete(ret, seq_iter);
Zhang Chene6f4aa12018-09-03 12:38:50 +08004430
Juan Quintela56e93d22015-05-07 19:33:31 +02004431 return ret;
4432}
4433
Vladimir Sementsov-Ogievskiyc6467622017-07-10 19:30:14 +03004434static bool ram_has_postcopy(void *opaque)
4435{
Junyan He469dd512018-07-18 15:48:02 +08004436 RAMBlock *rb;
Yury Kotovfbd162e2019-02-15 20:45:46 +03004437 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
Junyan He469dd512018-07-18 15:48:02 +08004438 if (ramblock_is_pmem(rb)) {
4439 info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4440 "is not supported now!", rb->idstr, rb->host);
4441 return false;
4442 }
4443 }
4444
Vladimir Sementsov-Ogievskiyc6467622017-07-10 19:30:14 +03004445 return migrate_postcopy_ram();
4446}
4447
Peter Xuedd090c2018-05-02 18:47:32 +08004448/* Sync all the dirty bitmaps with the destination VM. */
4449static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4450{
4451 RAMBlock *block;
4452 QEMUFile *file = s->to_dst_file;
4453 int ramblock_count = 0;
4454
4455 trace_ram_dirty_bitmap_sync_start();
4456
Yury Kotovfbd162e2019-02-15 20:45:46 +03004457 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
Peter Xuedd090c2018-05-02 18:47:32 +08004458 qemu_savevm_send_recv_bitmap(file, block->idstr);
4459 trace_ram_dirty_bitmap_request(block->idstr);
4460 ramblock_count++;
4461 }
4462
4463 trace_ram_dirty_bitmap_sync_wait();
4464
4465 /* Wait until all the ramblocks' dirty bitmaps are synced */
4466 while (ramblock_count--) {
4467 qemu_sem_wait(&s->rp_state.rp_sem);
4468 }
4469
4470 trace_ram_dirty_bitmap_sync_complete();
4471
4472 return 0;
4473}
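/*
 * Reading aid (added note): each qemu_savevm_send_recv_bitmap() above
 * asks the destination for one block's received bitmap; the reply is
 * parsed on the source's return path by ram_dirty_bitmap_reload()
 * below, which posts rp_sem once per block, so the wait loop above
 * wakes exactly ramblock_count times.
 */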
4474
4475static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4476{
4477 qemu_sem_post(&s->rp_state.rp_sem);
4478}
4479
Peter Xua335deb2018-05-02 18:47:28 +08004480/*
4481 * Read the received bitmap, revert it as the initial dirty bitmap.
4482 * This is only used when the postcopy migration is paused but wants
4483 * to resume from a middle point.
4484 */
4485int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4486{
4487 int ret = -EINVAL;
Peter Xu43044ac2021-07-22 13:58:38 -04004488 /* from_dst_file is always valid because we're within rp_thread */
Peter Xua335deb2018-05-02 18:47:28 +08004489 QEMUFile *file = s->rp_state.from_dst_file;
4490 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
Peter Xua725ef92018-07-10 17:18:55 +08004491 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
Peter Xua335deb2018-05-02 18:47:28 +08004492 uint64_t size, end_mark;
4493
4494 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4495
4496 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4497 error_report("%s: incorrect state %s", __func__,
4498 MigrationStatus_str(s->state));
4499 return -EINVAL;
4500 }
4501
4502 /*
4503 * Note: see comments in ramblock_recv_bitmap_send() on why we
zhaolichang3a4452d2020-09-17 15:50:21 +08004504 * need the endianness conversion, and the paddings.
Peter Xua335deb2018-05-02 18:47:28 +08004505 */
4506 local_size = ROUND_UP(local_size, 8);
4507
4508 /* Add paddings */
4509 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4510
4511 size = qemu_get_be64(file);
4512
4513 /* The size of the bitmap should match with our ramblock */
4514 if (size != local_size) {
4515 error_report("%s: ramblock '%s' bitmap size mismatch "
4516 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4517 block->idstr, size, local_size);
4518 ret = -EINVAL;
4519 goto out;
4520 }
4521
4522 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4523 end_mark = qemu_get_be64(file);
4524
4525 ret = qemu_file_get_error(file);
4526 if (ret || size != local_size) {
4527 error_report("%s: read bitmap failed for ramblock '%s': %d"
4528 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4529 __func__, block->idstr, ret, local_size, size);
4530 ret = -EIO;
4531 goto out;
4532 }
4533
4534 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
Philippe Mathieu-Daudéaf3bbbe2020-11-03 12:25:58 +01004535 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
Peter Xua335deb2018-05-02 18:47:28 +08004536 __func__, block->idstr, end_mark);
4537 ret = -EINVAL;
4538 goto out;
4539 }
4540
4541 /*
zhaolichang3a4452d2020-09-17 15:50:21 +08004542 * Endianness conversion. We are during postcopy (though paused).
Peter Xua335deb2018-05-02 18:47:28 +08004543 * The dirty bitmap won't change. We can directly modify it.
4544 */
4545 bitmap_from_le(block->bmap, le_bitmap, nbits);
4546
4547 /*
4548 * What we received is "received bitmap". Revert it as the initial
4549 * dirty bitmap for this ramblock.
4550 */
4551 bitmap_complement(block->bmap, block->bmap, nbits);
4552
David Hildenbrandbe39b4c2021-10-11 19:53:41 +02004553 /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4554 ramblock_dirty_bitmap_clear_discarded_pages(block);
4555
4556 /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
Peter Xua335deb2018-05-02 18:47:28 +08004557 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4558
Peter Xuedd090c2018-05-02 18:47:32 +08004559 /*
4560 * We succeeded to sync bitmap for current ramblock. If this is
4561 * the last one to sync, we need to notify the main send thread.
4562 */
4563 ram_dirty_bitmap_reload_notify(s);
4564
Peter Xua335deb2018-05-02 18:47:28 +08004565 ret = 0;
4566out:
Peter Xubf269902018-05-25 09:50:42 +08004567 g_free(le_bitmap);
Peter Xua335deb2018-05-02 18:47:28 +08004568 return ret;
4569}
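/*
 * Worked example (assumed sizes, added for illustration) of the bitmap
 * size checked above: a 1 GiB RAMBlock with 4 KiB target pages has
 * nbits = 2^30 >> 12 = 262144 bits, so local_size = 262144 / 8 =
 * 32768 bytes, which is already a multiple of 8; the extra
 * BITS_PER_LONG bits allocated for le_bitmap are just padding headroom
 * for the 8-byte-aligned transfer.
 */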
4570
Peter Xuedd090c2018-05-02 18:47:32 +08004571static int ram_resume_prepare(MigrationState *s, void *opaque)
4572{
4573 RAMState *rs = *(RAMState **)opaque;
Peter Xu08614f32018-05-02 18:47:33 +08004574 int ret;
Peter Xuedd090c2018-05-02 18:47:32 +08004575
Peter Xu08614f32018-05-02 18:47:33 +08004576 ret = ram_dirty_bitmap_sync_all(s, rs);
4577 if (ret) {
4578 return ret;
4579 }
4580
4581 ram_state_resume_prepare(rs, s->to_dst_file);
4582
4583 return 0;
Peter Xuedd090c2018-05-02 18:47:32 +08004584}
4585
Peter Xu36f62f12022-07-07 14:55:02 -04004586void postcopy_preempt_shutdown_file(MigrationState *s)
4587{
4588 qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
4589 qemu_fflush(s->postcopy_qemufile_src);
4590}
4591
Juan Quintela56e93d22015-05-07 19:33:31 +02004592static SaveVMHandlers savevm_ram_handlers = {
Juan Quintela9907e842017-06-28 11:52:24 +02004593 .save_setup = ram_save_setup,
Juan Quintela56e93d22015-05-07 19:33:31 +02004594 .save_live_iterate = ram_save_iterate,
Dr. David Alan Gilbert763c9062015-11-05 18:11:00 +00004595 .save_live_complete_postcopy = ram_save_complete,
Dr. David Alan Gilberta3e06c32015-11-05 18:10:41 +00004596 .save_live_complete_precopy = ram_save_complete,
Vladimir Sementsov-Ogievskiyc6467622017-07-10 19:30:14 +03004597 .has_postcopy = ram_has_postcopy,
Juan Quintelac8df4a72022-10-03 02:00:03 +02004598 .state_pending_exact = ram_state_pending_exact,
4599 .state_pending_estimate = ram_state_pending_estimate,
Juan Quintela56e93d22015-05-07 19:33:31 +02004600 .load_state = ram_load,
Juan Quintelaf265e0e2017-06-28 11:52:27 +02004601 .save_cleanup = ram_save_cleanup,
4602 .load_setup = ram_load_setup,
4603 .load_cleanup = ram_load_cleanup,
Peter Xuedd090c2018-05-02 18:47:32 +08004604 .resume_prepare = ram_resume_prepare,
Juan Quintela56e93d22015-05-07 19:33:31 +02004605};
4606
David Hildenbrandc7c0e722021-04-29 13:27:02 +02004607static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4608 size_t old_size, size_t new_size)
4609{
David Hildenbrandcc61c702021-04-29 13:27:04 +02004610 PostcopyState ps = postcopy_state_get();
David Hildenbrandc7c0e722021-04-29 13:27:02 +02004611 ram_addr_t offset;
4612 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4613 Error *err = NULL;
4614
4615 if (ramblock_is_ignored(rb)) {
4616 return;
4617 }
4618
4619 if (!migration_is_idle()) {
4620 /*
4621 * Precopy code on the source cannot deal with the size of RAM blocks
4622 * changing at random points in time - especially after sending the
4623 * RAM block sizes in the migration stream, they must no longer change.
4624 * Abort and indicate a proper reason.
4625 */
4626 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
Laurent Vivier458fecc2021-09-29 16:43:10 +02004627 migration_cancel(err);
David Hildenbrandc7c0e722021-04-29 13:27:02 +02004628 error_free(err);
David Hildenbrandc7c0e722021-04-29 13:27:02 +02004629 }
David Hildenbrandcc61c702021-04-29 13:27:04 +02004630
4631 switch (ps) {
4632 case POSTCOPY_INCOMING_ADVISE:
4633 /*
4634 * Update what ram_postcopy_incoming_init()->init_range() does at the
4635 * time postcopy was advised. Syncing RAM blocks with the source will
4636 * result in RAM resizes.
4637 */
4638 if (old_size < new_size) {
4639 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4640 error_report("RAM block '%s' discard of resized RAM failed",
4641 rb->idstr);
4642 }
4643 }
David Hildenbrand898ba902021-04-29 13:27:06 +02004644 rb->postcopy_length = new_size;
David Hildenbrandcc61c702021-04-29 13:27:04 +02004645 break;
4646 case POSTCOPY_INCOMING_NONE:
4647 case POSTCOPY_INCOMING_RUNNING:
4648 case POSTCOPY_INCOMING_END:
4649 /*
4650 * Once our guest is running, postcopy no longer cares about
4651 * resizes. When growing, the new memory was not available on the
4652 * source, so no handler is needed.
4653 */
4654 break;
4655 default:
4656 error_report("RAM block '%s' resized during postcopy state: %d",
4657 rb->idstr, ps);
4658 exit(-1);
4659 }
David Hildenbrandc7c0e722021-04-29 13:27:02 +02004660}
4661
4662static RAMBlockNotifier ram_mig_ram_notifier = {
4663 .ram_block_resized = ram_mig_ram_block_resized,
4664};
4665
Juan Quintela56e93d22015-05-07 19:33:31 +02004666void ram_mig_init(void)
4667{
4668 qemu_mutex_init(&XBZRLE.lock);
Dr. David Alan Gilbertce62df52019-08-22 12:54:33 +01004669 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
David Hildenbrandc7c0e722021-04-29 13:27:02 +02004670 ram_block_notifier_add(&ram_mig_ram_notifier);
Juan Quintela56e93d22015-05-07 19:33:31 +02004671}