blob: 0ada6477e8e2cc7605034eb748341527b915461e
Juan Quintela56e93d22015-05-07 19:33:31 +02001/*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
Juan Quintela76cc7b52015-05-08 13:20:21 +02005 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
Juan Quintela56e93d22015-05-07 19:33:31 +02009 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
Markus Armbrustere688df62018-02-01 12:18:31 +010028
Peter Maydell1393a482016-01-26 18:16:54 +000029#include "qemu/osdep.h"
Veronia Bahaaf348b6d2016-03-20 19:16:19 +020030#include "qemu/cutils.h"
Juan Quintela56e93d22015-05-07 19:33:31 +020031#include "qemu/bitops.h"
32#include "qemu/bitmap.h"
Peter Maydellb85ea5f2022-02-08 20:08:52 +000033#include "qemu/madvise.h"
Juan Quintela7205c9e2015-05-08 13:54:36 +020034#include "qemu/main-loop.h"
Juan Quintela709e3fe2017-04-05 21:47:50 +020035#include "xbzrle.h"
Lukas Straubb5ca3362023-04-20 11:48:20 +020036#include "ram-compress.h"
Juan Quintela7b1e1a22017-04-17 20:26:27 +020037#include "ram.h"
Juan Quintela6666c962017-04-24 20:07:27 +020038#include "migration.h"
Juan Quintela947701c2023-04-26 19:04:06 +020039#include "migration-stats.h"
Juan Quintelaf2a8f0a2017-04-24 13:42:55 +020040#include "migration/register.h"
Juan Quintela7b1e1a22017-04-17 20:26:27 +020041#include "migration/misc.h"
Juan Quintela08a0aee2017-04-20 18:52:18 +020042#include "qemu-file.h"
Juan Quintelabe07b0a2017-04-20 13:12:24 +020043#include "postcopy-ram.h"
Michael S. Tsirkin53d37d32018-05-03 22:50:51 +030044#include "page_cache.h"
Juan Quintela56e93d22015-05-07 19:33:31 +020045#include "qemu/error-report.h"
Markus Armbrustere688df62018-02-01 12:18:31 +010046#include "qapi/error.h"
Juan Quintelaab7cbb02019-05-15 13:37:46 +020047#include "qapi/qapi-types-migration.h"
Markus Armbruster9af23982018-02-11 10:36:01 +010048#include "qapi/qapi-events-migration.h"
Juan Quintela8acabf62017-10-05 22:00:31 +020049#include "qapi/qmp/qerror.h"
Juan Quintela56e93d22015-05-07 19:33:31 +020050#include "trace.h"
Juan Quintela56e93d22015-05-07 19:33:31 +020051#include "exec/ram_addr.h"
Alexey Perevalovf9494612017-10-05 14:13:20 +030052#include "exec/target_page.h"
Juan Quintela56e93d22015-05-07 19:33:31 +020053#include "qemu/rcu_queue.h"
zhanghailianga91246c2016-10-27 14:42:59 +080054#include "migration/colo.h"
Michael S. Tsirkin53d37d32018-05-03 22:50:51 +030055#include "block.h"
Claudio Fontanab0c3cf92020-06-29 11:35:03 +020056#include "sysemu/cpu-throttle.h"
Peter Xuedd090c2018-05-02 18:47:32 +080057#include "savevm.h"
Juan Quintelab9ee2f72016-01-15 11:40:13 +010058#include "qemu/iov.h"
Juan Quintelad32ca5a2020-01-22 16:16:07 +010059#include "multifd.h"
Andrey Gruzdev278e2f52021-01-29 13:14:05 +030060#include "sysemu/runstate.h"
Juan Quintela1f0776f2023-03-01 21:18:45 +010061#include "options.h"
Andrey Gruzdev278e2f52021-01-29 13:14:05 +030062
Lukas Straube5fdf922021-07-04 18:14:44 +020063#include "hw/boards.h" /* for machine_dump_guest_core() */
64
Andrey Gruzdev278e2f52021-01-29 13:14:05 +030065#if defined(__linux__)
66#include "qemu/userfaultfd.h"
67#endif /* defined(__linux__) */
Juan Quintela56e93d22015-05-07 19:33:31 +020068
Juan Quintela56e93d22015-05-07 19:33:31 +020069/***********************************************************/
70/* ram save/restore */
71
Juan Quintela7b548762022-07-28 10:14:42 +020072/*
73 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
74 * worked for pages that were filled with the same char. We switched
Juan Quintelabb890ed2017-04-28 09:39:55 +020075 * it to only search for the zero value, and renamed it to avoid
Juan Quintela7b548762022-07-28 10:14:42 +020076 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
Juan Quintelabb890ed2017-04-28 09:39:55 +020077 */
Juan Quintela7b548762022-07-28 10:14:42 +020078/*
79 * RAM_SAVE_FLAG_FULL was obsoleted in 2009, it can be reused now
80 */
81#define RAM_SAVE_FLAG_FULL 0x01
Juan Quintelabb890ed2017-04-28 09:39:55 +020082#define RAM_SAVE_FLAG_ZERO 0x02
Juan Quintela56e93d22015-05-07 19:33:31 +020083#define RAM_SAVE_FLAG_MEM_SIZE 0x04
84#define RAM_SAVE_FLAG_PAGE 0x08
85#define RAM_SAVE_FLAG_EOS 0x10
86#define RAM_SAVE_FLAG_CONTINUE 0x20
87#define RAM_SAVE_FLAG_XBZRLE 0x40
Juan Quintela7b548762022-07-28 10:14:42 +020088/* 0x80 is reserved in qemu-file.h for RAM_SAVE_FLAG_HOOK */
Juan Quintela56e93d22015-05-07 19:33:31 +020089#define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
Juan Quintela294e5a42022-06-21 13:36:11 +020090#define RAM_SAVE_FLAG_MULTIFD_FLUSH 0x200
Juan Quintela7b548762022-07-28 10:14:42 +020091/* We can't use any flag that is bigger than 0x200 */
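/*
 * Illustrative sketch (assuming the usual TARGET_PAGE_MASK definition):
 * the flags above travel in the low bits of the page offset word written
 * by save_page_header(), which works because offsets are page aligned.
 * A receiver could split the two roughly like this:
 *
 *     uint64_t addr  = qemu_get_be64(f);
 *     uint64_t flags = addr & ~TARGET_PAGE_MASK;    -> RAM_SAVE_FLAG_* bits
 *     addr          &= TARGET_PAGE_MASK;            -> page-aligned offset
 */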
Juan Quintela56e93d22015-05-07 19:33:31 +020092
Juan Quintela93604472017-06-06 19:49:03 +020093XBZRLECacheStats xbzrle_counters;
94
Peter Xuf1668762022-10-11 17:55:55 -040095/* used by the search for pages to send */
96struct PageSearchStatus {
97 /* The migration channel used for a specific host page */
98 QEMUFile *pss_channel;
Peter Xuec6f3ab2022-10-11 17:55:56 -040099 /* Last block from where we have sent data */
100 RAMBlock *last_sent_block;
Peter Xuf1668762022-10-11 17:55:55 -0400101 /* Current block being searched */
102 RAMBlock *block;
103 /* Current page to search from */
104 unsigned long page;
105 /* Set once we wrap around */
106 bool complete_round;
Peter Xuf1668762022-10-11 17:55:55 -0400107 /* Whether we're sending a host page */
108 bool host_page_sending;
109 /* The start/end of current host page. Invalid if host_page_sending==false */
110 unsigned long host_page_start;
111 unsigned long host_page_end;
112};
113typedef struct PageSearchStatus PageSearchStatus;
114
Juan Quintela56e93d22015-05-07 19:33:31 +0200115/* struct containing the XBZRLE cache and a static page
116 used by the compression */
117static struct {
118 /* buffer used for XBZRLE encoding */
119 uint8_t *encoded_buf;
120 /* buffer for storing page content */
121 uint8_t *current_buf;
122 /* Cache for XBZRLE, Protected by lock. */
123 PageCache *cache;
124 QemuMutex lock;
Juan Quintelac00e0922017-05-09 16:22:01 +0200125 /* it will store a page full of zeros */
126 uint8_t *zero_target_page;
Juan Quintelaf265e0e2017-06-28 11:52:27 +0200127 /* buffer used for XBZRLE decoding */
128 uint8_t *decoded_buf;
Juan Quintela56e93d22015-05-07 19:33:31 +0200129} XBZRLE;
130
Juan Quintela56e93d22015-05-07 19:33:31 +0200131static void XBZRLE_cache_lock(void)
132{
Juan Quintela87dca0c2023-03-01 22:20:13 +0100133 if (migrate_xbzrle()) {
Juan Quintela56e93d22015-05-07 19:33:31 +0200134 qemu_mutex_lock(&XBZRLE.lock);
Bihong Yuf4c51a62020-10-20 11:10:45 +0800135 }
Juan Quintela56e93d22015-05-07 19:33:31 +0200136}
137
138static void XBZRLE_cache_unlock(void)
139{
Juan Quintela87dca0c2023-03-01 22:20:13 +0100140 if (migrate_xbzrle()) {
Juan Quintela56e93d22015-05-07 19:33:31 +0200141 qemu_mutex_unlock(&XBZRLE.lock);
Bihong Yuf4c51a62020-10-20 11:10:45 +0800142 }
Juan Quintela56e93d22015-05-07 19:33:31 +0200143}
144
Juan Quintela3d0684b2017-03-23 15:06:39 +0100145/**
146 * xbzrle_cache_resize: resize the xbzrle cache
147 *
Daniel P. Berrangécbde7be2021-02-19 18:40:12 +0000148 * This function is called from migrate_params_apply in the main
Juan Quintela3d0684b2017-03-23 15:06:39 +0100149 * thread, possibly while a migration is in progress. A running
150 * migration may be using the cache and might finish during this call,
151 * hence changes to the cache are protected by XBZRLE.lock().
152 *
Juan Quintelac9dede22017-10-06 23:03:55 +0200153 * Returns 0 for success or -1 for error
Juan Quintela3d0684b2017-03-23 15:06:39 +0100154 *
155 * @new_size: new cache size
Juan Quintela8acabf62017-10-05 22:00:31 +0200156 * @errp: set *errp with the reason if the check fails
Juan Quintela56e93d22015-05-07 19:33:31 +0200157 */
Markus Armbruster8b9407a2021-02-02 15:17:32 +0100158int xbzrle_cache_resize(uint64_t new_size, Error **errp)
Juan Quintela56e93d22015-05-07 19:33:31 +0200159{
160 PageCache *new_cache;
Juan Quintelac9dede22017-10-06 23:03:55 +0200161 int64_t ret = 0;
Juan Quintela56e93d22015-05-07 19:33:31 +0200162
Juan Quintela8acabf62017-10-05 22:00:31 +0200163 /* Check for truncation */
164 if (new_size != (size_t)new_size) {
165 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
166 "exceeding address space");
167 return -1;
168 }
169
Juan Quintela2a313e52017-10-06 23:00:12 +0200170 if (new_size == migrate_xbzrle_cache_size()) {
171 /* nothing to do */
Juan Quintelac9dede22017-10-06 23:03:55 +0200172 return 0;
Juan Quintela2a313e52017-10-06 23:00:12 +0200173 }
174
Juan Quintela56e93d22015-05-07 19:33:31 +0200175 XBZRLE_cache_lock();
176
177 if (XBZRLE.cache != NULL) {
Juan Quintela80f8dfd2017-10-06 22:30:45 +0200178 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
Juan Quintela56e93d22015-05-07 19:33:31 +0200179 if (!new_cache) {
Juan Quintela56e93d22015-05-07 19:33:31 +0200180 ret = -1;
181 goto out;
182 }
183
184 cache_fini(XBZRLE.cache);
185 XBZRLE.cache = new_cache;
186 }
Juan Quintela56e93d22015-05-07 19:33:31 +0200187out:
188 XBZRLE_cache_unlock();
189 return ret;
190}
191
Peter Xu20123ee2022-10-11 17:55:46 -0400192static bool postcopy_preempt_active(void)
193{
194 return migrate_postcopy_preempt() && migration_in_postcopy();
195}
196
David Hildenbrandf161c882023-07-06 09:56:08 +0200197bool migrate_ram_is_ignored(RAMBlock *block)
Yury Kotovfbd162e2019-02-15 20:45:46 +0300198{
199 return !qemu_ram_is_migratable(block) ||
Steve Sistareb0182e52023-06-07 08:18:36 -0700200 (migrate_ignore_shared() && qemu_ram_is_shared(block)
201 && qemu_ram_is_named_file(block));
Yury Kotovfbd162e2019-02-15 20:45:46 +0300202}
203
Dr. David Alan Gilbert343f6322018-06-05 17:25:45 +0100204#undef RAMBLOCK_FOREACH
205
Yury Kotovfbd162e2019-02-15 20:45:46 +0300206int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
207{
208 RAMBlock *block;
209 int ret = 0;
210
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +0100211 RCU_READ_LOCK_GUARD();
212
Yury Kotovfbd162e2019-02-15 20:45:46 +0300213 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
214 ret = func(block, opaque);
215 if (ret) {
216 break;
217 }
218 }
Yury Kotovfbd162e2019-02-15 20:45:46 +0300219 return ret;
220}
221
Alexey Perevalovf9494612017-10-05 14:13:20 +0300222static void ramblock_recv_map_init(void)
223{
224 RAMBlock *rb;
225
Yury Kotovfbd162e2019-02-15 20:45:46 +0300226 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
Alexey Perevalovf9494612017-10-05 14:13:20 +0300227 assert(!rb->receivedmap);
228 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
229 }
230}
231
232int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
233{
234 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
235 rb->receivedmap);
236}
237
Dr. David Alan Gilbert1cba9f62018-03-12 17:21:08 +0000238bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
239{
240 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
241}
242
Alexey Perevalovf9494612017-10-05 14:13:20 +0300243void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
244{
245 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
246}
247
248void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
249 size_t nr)
250{
251 bitmap_set_atomic(rb->receivedmap,
252 ramblock_recv_bitmap_offset(host_addr, rb),
253 nr);
254}
255
Peter Xua335deb2018-05-02 18:47:28 +0800256#define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
257
258/*
259 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
260 *
261 * Returns >0 if success with sent bytes, or <0 if error.
262 */
263int64_t ramblock_recv_bitmap_send(QEMUFile *file,
264 const char *block_name)
265{
266 RAMBlock *block = qemu_ram_block_by_name(block_name);
267 unsigned long *le_bitmap, nbits;
268 uint64_t size;
269
270 if (!block) {
271 error_report("%s: invalid block name: %s", __func__, block_name);
272 return -1;
273 }
274
David Hildenbrand898ba902021-04-29 13:27:06 +0200275 nbits = block->postcopy_length >> TARGET_PAGE_BITS;
Peter Xua335deb2018-05-02 18:47:28 +0800276
277 /*
278 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
279 * machines we may need 4 more bytes for padding (see below
280 * comment). So extend it a bit beforehand.
281 */
282 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
283
284 /*
285 * Always use little endian when sending the bitmap. This is
286 * required when the source and destination VMs are not using the
zhaolichang3a4452d2020-09-17 15:50:21 +0800287 * same endianness. (Note: big endian won't work.)
Peter Xua335deb2018-05-02 18:47:28 +0800288 */
289 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
290
291 /* Size of the bitmap, in bytes */
Peter Xua725ef92018-07-10 17:18:55 +0800292 size = DIV_ROUND_UP(nbits, 8);
Peter Xua335deb2018-05-02 18:47:28 +0800293
294 /*
295 * size is always aligned to 8 bytes for 64bit machines, but it
296 * may not be true for 32bit machines. We need this padding to
297 * make sure the migration can survive even between 32bit and
298 * 64bit machines.
299 */
300 size = ROUND_UP(size, 8);
301
302 qemu_put_be64(file, size);
303 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
304 /*
305 * Mark as an end, in case the middle part is screwed up due to
zhaolichang3a4452d2020-09-17 15:50:21 +0800306 * some "mysterious" reason.
Peter Xua335deb2018-05-02 18:47:28 +0800307 */
308 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
309 qemu_fflush(file);
310
Peter Xubf269902018-05-25 09:50:42 +0800311 g_free(le_bitmap);
Peter Xua335deb2018-05-02 18:47:28 +0800312
313 if (qemu_file_get_error(file)) {
314 return qemu_file_get_error(file);
315 }
316
317 return size + sizeof(size);
318}
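/*
 * Rough sketch of the stream layout produced by the function above,
 * derived only from the qemu_put_* calls it makes:
 *
 *     be64   size of the bitmap in bytes, rounded up to 8
 *     bytes  the receivedmap in little endian, padded to 'size' bytes
 *     be64   RAMBLOCK_RECV_BITMAP_ENDING sanity marker
 *
 * The return value (size + sizeof(size)) does not include the trailing
 * ending marker.
 */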
319
Juan Quintelaec481c62017-03-20 22:12:40 +0100320/*
321 * An outstanding page request, on the source, having been received
322 * and queued
323 */
324struct RAMSrcPageRequest {
325 RAMBlock *rb;
326 hwaddr offset;
327 hwaddr len;
328
329 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
330};
331
Juan Quintela6f37bb82017-03-13 19:26:29 +0100332/* State of RAM for migration */
333struct RAMState {
Peter Xuf1668762022-10-11 17:55:55 -0400334 /*
335 * PageSearchStatus structures for the channels used when sending pages.
336 * Protected by the bitmap_mutex.
337 */
338 PageSearchStatus pss[RAM_CHANNEL_MAX];
Andrey Gruzdev278e2f52021-01-29 13:14:05 +0300339 /* UFFD file descriptor, used in 'write-tracking' migration */
340 int uffdio_fd;
Juan Quintela8d80e192022-05-10 19:37:36 +0200341 /* total ram size in bytes */
342 uint64_t ram_bytes_total;
Juan Quintela6f37bb82017-03-13 19:26:29 +0100343 /* Last block that we have visited searching for dirty pages */
344 RAMBlock *last_seen_block;
Juan Quintela269ace22017-03-21 15:23:31 +0100345 /* Last dirty target page we have sent */
346 ram_addr_t last_page;
Juan Quintela6f37bb82017-03-13 19:26:29 +0100347 /* last ram version we have seen */
348 uint32_t last_version;
Juan Quintela8d820d62017-03-13 19:35:50 +0100349 /* How many times we have dirty too many pages */
350 int dirty_rate_high_cnt;
Juan Quintelaf664da82017-03-13 19:44:57 +0100351 /* these variables are used for bitmap sync */
352 /* last time we did a full bitmap_sync */
353 int64_t time_last_bitmap_sync;
Juan Quintelaeac74152017-03-28 14:59:01 +0200354 /* bytes transferred at start_time */
Juan Quintelac4bdf0c2017-03-28 14:59:54 +0200355 uint64_t bytes_xfer_prev;
Juan Quintelaa66cd902017-03-28 15:02:43 +0200356 /* number of dirty pages since start_time */
Juan Quintela68908ed2017-03-28 15:05:53 +0200357 uint64_t num_dirty_pages_period;
Juan Quintelab5833fd2017-03-13 19:49:19 +0100358 /* xbzrle misses since the beginning of the period */
359 uint64_t xbzrle_cache_miss_prev;
Wei Wange460a4b2020-04-30 08:59:35 +0800360 /* Amount of xbzrle pages since the beginning of the period */
361 uint64_t xbzrle_pages_prev;
362 /* Amount of xbzrle encoded bytes since the beginning of the period */
363 uint64_t xbzrle_bytes_prev;
Juan Quintelaf3095cc2023-05-04 13:53:23 +0200364 /* Are we really using XBZRLE (e.g., after the first round). */
365 bool xbzrle_started;
Juan Quintela05931ec2021-12-15 19:01:21 +0100366 /* Are we on the last stage of migration */
367 bool last_stage;
Xiao Guangrong76e03002018-09-06 15:01:00 +0800368 /* compression statistics since the beginning of the period */
369 /* number of times there was no free thread to compress data */
370 uint64_t compress_thread_busy_prev;
371 /* amount of bytes after compression */
372 uint64_t compressed_size_prev;
373 /* amount of compressed pages */
374 uint64_t compress_pages_prev;
375
Xiao Guangrongbe8b02e2018-09-03 17:26:42 +0800376 /* total handled target pages at the beginning of period */
377 uint64_t target_page_count_prev;
378 /* total handled target pages since start */
379 uint64_t target_page_count;
Juan Quintela93604472017-06-06 19:49:03 +0200380 /* number of dirty bits in the bitmap */
Peter Xu2dfaf122017-08-02 17:41:19 +0800381 uint64_t migration_dirty_pages;
Peter Xuf1668762022-10-11 17:55:55 -0400382 /*
383 * Protects:
384 * - dirty/clear bitmap
385 * - migration_dirty_pages
386 * - pss structures
387 */
Juan Quintela108cfae2017-03-13 21:38:09 +0100388 QemuMutex bitmap_mutex;
Juan Quintela68a098f2017-03-14 13:48:42 +0100389 /* The RAMBlock used in the last src_page_requests */
390 RAMBlock *last_req_rb;
Juan Quintelaec481c62017-03-20 22:12:40 +0100391 /* Queue of outstanding page requests from the destination */
392 QemuMutex src_page_req_mutex;
Paolo Bonzinib58deb32018-12-06 11:58:10 +0100393 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
Juan Quintela6f37bb82017-03-13 19:26:29 +0100394};
395typedef struct RAMState RAMState;
396
Juan Quintela53518d92017-05-04 11:46:24 +0200397static RAMState *ram_state;
Juan Quintela6f37bb82017-03-13 19:26:29 +0100398
Wei Wangbd227062018-12-11 16:24:51 +0800399static NotifierWithReturnList precopy_notifier_list;
400
Peter Xua1fe28d2022-01-19 16:09:18 +0800401/* Whether postcopy has queued requests? */
402static bool postcopy_has_request(RAMState *rs)
403{
404 return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
405}
406
Wei Wangbd227062018-12-11 16:24:51 +0800407void precopy_infrastructure_init(void)
408{
409 notifier_with_return_list_init(&precopy_notifier_list);
410}
411
412void precopy_add_notifier(NotifierWithReturn *n)
413{
414 notifier_with_return_list_add(&precopy_notifier_list, n);
415}
416
417void precopy_remove_notifier(NotifierWithReturn *n)
418{
419 notifier_with_return_remove(n);
420}
421
422int precopy_notify(PrecopyNotifyReason reason, Error **errp)
423{
424 PrecopyNotifyData pnd;
425 pnd.reason = reason;
426 pnd.errp = errp;
427
428 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
429}
430
Juan Quintela9edabd42017-03-14 12:02:16 +0100431uint64_t ram_bytes_remaining(void)
432{
Dr. David Alan Gilbertbae416e2017-12-15 11:51:23 +0000433 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
434 0;
Juan Quintela9edabd42017-03-14 12:02:16 +0100435}
436
Juan Quintela26a26062022-02-22 21:02:03 +0100437void ram_transferred_add(uint64_t bytes)
David Edmondson4c2d0f62021-12-21 09:34:40 +0000438{
David Edmondsonae680662021-12-21 09:34:41 +0000439 if (runstate_is_running()) {
Juan Quintelaaff3f662023-04-26 19:37:19 +0200440 stat64_add(&mig_stats.precopy_bytes, bytes);
David Edmondsonae680662021-12-21 09:34:41 +0000441 } else if (migration_in_postcopy()) {
Juan Quintelaaff3f662023-04-26 19:37:19 +0200442 stat64_add(&mig_stats.postcopy_bytes, bytes);
David Edmondsonae680662021-12-21 09:34:41 +0000443 } else {
Juan Quintelaaff3f662023-04-26 19:37:19 +0200444 stat64_add(&mig_stats.downtime_bytes, bytes);
David Edmondsonae680662021-12-21 09:34:41 +0000445 }
Juan Quintelaaff3f662023-04-26 19:37:19 +0200446 stat64_add(&mig_stats.transferred, bytes);
David Edmondson4c2d0f62021-12-21 09:34:40 +0000447}
448
Juan Quintela4010ba32021-12-15 20:10:39 +0100449struct MigrationOps {
450 int (*ram_save_target_page)(RAMState *rs, PageSearchStatus *pss);
451};
452typedef struct MigrationOps MigrationOps;
453
454MigrationOps *migration_ops;
455
Peter Xu93589822022-10-11 17:55:57 -0400456static int ram_save_host_page_urgent(PageSearchStatus *pss);
457
Peter Xuebd88a42022-10-11 17:55:54 -0400458/* NOTE: page is the PFN not real ram_addr_t. */
459static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
460{
461 pss->block = rb;
462 pss->page = page;
463 pss->complete_round = false;
464}
465
Peter Xu93589822022-10-11 17:55:57 -0400466/*
467 * Check whether two PSSs are actively sending the same page. Return true
468 * if it is, false otherwise.
469 */
470static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
471{
472 return pss1->host_page_sending && pss2->host_page_sending &&
473 (pss1->host_page_start == pss2->host_page_start);
474}
475
Juan Quintela56e93d22015-05-07 19:33:31 +0200476/**
Juan Quintela3d0684b2017-03-23 15:06:39 +0100477 * save_page_header: write page header to wire
Juan Quintela56e93d22015-05-07 19:33:31 +0200478 *
479 * If this is the 1st block, it also writes the block identification
480 *
Juan Quintela3d0684b2017-03-23 15:06:39 +0100481 * Returns the number of bytes written
Juan Quintela56e93d22015-05-07 19:33:31 +0200482 *
Peter Xuec6f3ab2022-10-11 17:55:56 -0400483 * @pss: current PSS channel status
Juan Quintela56e93d22015-05-07 19:33:31 +0200484 * @block: block that contains the page we want to send
485 * @offset: offset inside the block for the page
486 * in the lower bits, it contains flags
487 */
Lukas Straub37502df2023-04-02 17:06:32 +0000488static size_t save_page_header(PageSearchStatus *pss, QEMUFile *f,
489 RAMBlock *block, ram_addr_t offset)
Juan Quintela56e93d22015-05-07 19:33:31 +0200490{
Liang Li9f5f3802015-07-13 17:34:10 +0800491 size_t size, len;
Peter Xuec6f3ab2022-10-11 17:55:56 -0400492 bool same_block = (block == pss->last_sent_block);
Juan Quintela56e93d22015-05-07 19:33:31 +0200493
Peter Xu10661f12022-10-11 17:55:48 -0400494 if (same_block) {
Juan Quintela24795692017-03-21 11:45:01 +0100495 offset |= RAM_SAVE_FLAG_CONTINUE;
496 }
Juan Quintela2bf3aa82017-05-10 13:28:13 +0200497 qemu_put_be64(f, offset);
Juan Quintela56e93d22015-05-07 19:33:31 +0200498 size = 8;
499
Peter Xu10661f12022-10-11 17:55:48 -0400500 if (!same_block) {
Liang Li9f5f3802015-07-13 17:34:10 +0800501 len = strlen(block->idstr);
Juan Quintela2bf3aa82017-05-10 13:28:13 +0200502 qemu_put_byte(f, len);
503 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
Liang Li9f5f3802015-07-13 17:34:10 +0800504 size += 1 + len;
Peter Xuec6f3ab2022-10-11 17:55:56 -0400505 pss->last_sent_block = block;
Juan Quintela56e93d22015-05-07 19:33:31 +0200506 }
507 return size;
508}
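/*
 * Sketch of what save_page_header() puts on the wire, taken from the
 * calls above rather than from any separate format description:
 *
 *     be64   page offset with RAM_SAVE_FLAG_* in the low bits
 *            (RAM_SAVE_FLAG_CONTINUE set when the block repeats)
 *     u8     strlen(block->idstr)    only when the block differs from
 *     bytes  block->idstr            the last one sent on this channel
 *
 * which matches the returned size of 8, or 8 + 1 + len, bytes.
 */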
509
Juan Quintela3d0684b2017-03-23 15:06:39 +0100510/**
Olaf Hering179a8082021-07-08 18:21:59 +0200511 * mig_throttle_guest_down: throttle down the guest
Juan Quintela3d0684b2017-03-23 15:06:39 +0100512 *
513 * Reduce amount of guest cpu execution to hopefully slow down memory
514 * writes. If guest dirty memory rate is reduced below the rate at
515 * which we can transfer pages to the destination then we should be
516 * able to complete migration. Some workloads dirty memory way too
517 * fast and will not effectively converge, even with auto-converge.
Jason J. Herne070afca2015-09-08 13:12:35 -0400518 */
Keqian Zhucbbf8182020-04-13 18:15:08 +0800519static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
520 uint64_t bytes_dirty_threshold)
Jason J. Herne070afca2015-09-08 13:12:35 -0400521{
Juan Quintela2a8ec382023-03-02 01:22:44 +0100522 uint64_t pct_initial = migrate_cpu_throttle_initial();
Juan Quintela9605c2a2023-03-02 10:20:49 +0100523 uint64_t pct_increment = migrate_cpu_throttle_increment();
Juan Quintela873f6742023-03-02 10:29:51 +0100524 bool pct_tailslow = migrate_cpu_throttle_tailslow();
Juan Quintela24155bd2023-03-02 01:13:01 +0100525 int pct_max = migrate_max_cpu_throttle();
Jason J. Herne070afca2015-09-08 13:12:35 -0400526
Keqian Zhucbbf8182020-04-13 18:15:08 +0800527 uint64_t throttle_now = cpu_throttle_get_percentage();
528 uint64_t cpu_now, cpu_ideal, throttle_inc;
529
Jason J. Herne070afca2015-09-08 13:12:35 -0400530 /* We have not started throttling yet. Let's start it. */
531 if (!cpu_throttle_active()) {
532 cpu_throttle_set(pct_initial);
533 } else {
534 /* Throttling already on, just increase the rate */
Keqian Zhucbbf8182020-04-13 18:15:08 +0800535 if (!pct_tailslow) {
536 throttle_inc = pct_increment;
537 } else {
538 /* Compute the ideal CPU percentage used by Guest, which may
539 * make the dirty rate match the dirty rate threshold. */
540 cpu_now = 100 - throttle_now;
541 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
542 bytes_dirty_period);
543 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
544 }
545 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
Jason J. Herne070afca2015-09-08 13:12:35 -0400546 }
547}
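/*
 * Worked example for the tailslow branch above (numbers purely for
 * illustration): with the guest currently throttled at 20%
 * (cpu_now = 80), pct_increment = 10, bytes_dirty_period = 220MB and
 * bytes_dirty_threshold = 200MB:
 *
 *     cpu_ideal    = 80 * (200.0 / 220)  ~= 72.7
 *     throttle_inc = MIN(80 - 72.7, 10)  ~= 7.3
 *
 * so close to convergence the throttle grows by less than
 * pct_increment, which is what the "tailslow" option is for.
 */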
548
Rao, Lei91fe9a82021-11-09 11:04:54 +0800549void mig_throttle_counter_reset(void)
550{
551 RAMState *rs = ram_state;
552
553 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
554 rs->num_dirty_pages_period = 0;
Juan Quintelaaff3f662023-04-26 19:37:19 +0200555 rs->bytes_xfer_prev = stat64_get(&mig_stats.transferred);
Rao, Lei91fe9a82021-11-09 11:04:54 +0800556}
557
Juan Quintela3d0684b2017-03-23 15:06:39 +0100558/**
559 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
560 *
Juan Quintela6f37bb82017-03-13 19:26:29 +0100561 * @rs: current RAM state
Juan Quintela3d0684b2017-03-23 15:06:39 +0100562 * @current_addr: address for the zero page
563 *
564 * Update the xbzrle cache to reflect a page that's been sent as all 0.
Juan Quintela56e93d22015-05-07 19:33:31 +0200565 * The important thing is that a stale (not-yet-0'd) page be replaced
566 * by the new data.
567 * As a bonus, if the page wasn't in the cache it gets added so that
Juan Quintela3d0684b2017-03-23 15:06:39 +0100568 * when a small write is made into the 0'd page it gets XBZRLE sent.
Juan Quintela56e93d22015-05-07 19:33:31 +0200569 */
Juan Quintela6f37bb82017-03-13 19:26:29 +0100570static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
Juan Quintela56e93d22015-05-07 19:33:31 +0200571{
Juan Quintela56e93d22015-05-07 19:33:31 +0200572 /* We don't care if this fails to allocate a new cache page
573 * as long as it updated an old one */
Juan Quintelac00e0922017-05-09 16:22:01 +0200574 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
Juan Quintelaaff3f662023-04-26 19:37:19 +0200575 stat64_get(&mig_stats.dirty_sync_count));
Juan Quintela56e93d22015-05-07 19:33:31 +0200576}
577
578#define ENCODING_FLAG_XBZRLE 0x1
579
580/**
581 * save_xbzrle_page: compress and send current page
582 *
583 * Returns: 1 means that we wrote the page
584 * 0 means that page is identical to the one already sent
585 * -1 means that xbzrle would be longer than normal
586 *
Juan Quintela5a987732017-03-13 19:39:02 +0100587 * @rs: current RAM state
Peter Xuec6f3ab2022-10-11 17:55:56 -0400588 * @pss: current PSS channel
Juan Quintela3d0684b2017-03-23 15:06:39 +0100589 * @current_data: pointer to the address of the page contents
590 * @current_addr: addr of the page
Juan Quintela56e93d22015-05-07 19:33:31 +0200591 * @block: block that contains the page we want to send
592 * @offset: offset inside the block for the page
Juan Quintela56e93d22015-05-07 19:33:31 +0200593 */
Peter Xuec6f3ab2022-10-11 17:55:56 -0400594static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
Peter Xu61717ea2022-10-11 17:55:53 -0400595 uint8_t **current_data, ram_addr_t current_addr,
596 RAMBlock *block, ram_addr_t offset)
Juan Quintela56e93d22015-05-07 19:33:31 +0200597{
598 int encoded_len = 0, bytes_xbzrle;
599 uint8_t *prev_cached_page;
Peter Xuec6f3ab2022-10-11 17:55:56 -0400600 QEMUFile *file = pss->pss_channel;
Juan Quintelaaff3f662023-04-26 19:37:19 +0200601 uint64_t generation = stat64_get(&mig_stats.dirty_sync_count);
Juan Quintela56e93d22015-05-07 19:33:31 +0200602
Juan Quintela536b5a42023-04-11 18:02:34 +0200603 if (!cache_is_cached(XBZRLE.cache, current_addr, generation)) {
Juan Quintela93604472017-06-06 19:49:03 +0200604 xbzrle_counters.cache_miss++;
Juan Quintela05931ec2021-12-15 19:01:21 +0100605 if (!rs->last_stage) {
Juan Quintela56e93d22015-05-07 19:33:31 +0200606 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
Juan Quintela536b5a42023-04-11 18:02:34 +0200607 generation) == -1) {
Juan Quintela56e93d22015-05-07 19:33:31 +0200608 return -1;
609 } else {
610 /* update *current_data when the page has been
611 inserted into cache */
612 *current_data = get_cached_data(XBZRLE.cache, current_addr);
613 }
614 }
615 return -1;
616 }
617
Wei Wange460a4b2020-04-30 08:59:35 +0800618 /*
619 * Reaching here means the page has hit the xbzrle cache, no matter what
620 * encoding result it is (normal encoding, overflow or skipping the page),
zhaolichang3a4452d2020-09-17 15:50:21 +0800621 * count the page as encoded. This is used to calculate the encoding rate.
Wei Wange460a4b2020-04-30 08:59:35 +0800622 *
623 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
624 * 2nd page turns out to be skipped (i.e. no new bytes written to the
625 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
626 * skipped page included. In this way, the encoding rate can tell if the
627 * guest page is good for xbzrle encoding.
628 */
629 xbzrle_counters.pages++;
Juan Quintela56e93d22015-05-07 19:33:31 +0200630 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
631
632 /* save current buffer into memory */
633 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
634
635 /* XBZRLE encoding (if there is no overflow) */
Richard Henderson7ba7db92023-05-17 20:00:30 -0700636 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
637 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
638 TARGET_PAGE_SIZE);
Wei Yangca353802019-06-10 08:41:59 +0800639
640 /*
641 * Update the cache contents, so that it corresponds to the data
642 * sent, in all cases except where we skip the page.
643 */
Juan Quintela05931ec2021-12-15 19:01:21 +0100644 if (!rs->last_stage && encoded_len != 0) {
Wei Yangca353802019-06-10 08:41:59 +0800645 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
646 /*
647 * In the case where we couldn't compress, ensure that the caller
648 * sends the data from the cache, since the guest might have
649 * changed the RAM since we copied it.
650 */
651 *current_data = prev_cached_page;
652 }
653
Juan Quintela56e93d22015-05-07 19:33:31 +0200654 if (encoded_len == 0) {
Juan Quintela55c44462017-01-23 22:32:05 +0100655 trace_save_xbzrle_page_skipping();
Juan Quintela56e93d22015-05-07 19:33:31 +0200656 return 0;
657 } else if (encoded_len == -1) {
Juan Quintela55c44462017-01-23 22:32:05 +0100658 trace_save_xbzrle_page_overflow();
Juan Quintela93604472017-06-06 19:49:03 +0200659 xbzrle_counters.overflow++;
Wei Wange460a4b2020-04-30 08:59:35 +0800660 xbzrle_counters.bytes += TARGET_PAGE_SIZE;
Juan Quintela56e93d22015-05-07 19:33:31 +0200661 return -1;
662 }
663
Juan Quintela56e93d22015-05-07 19:33:31 +0200664 /* Send XBZRLE based compressed page */
Lukas Straub37502df2023-04-02 17:06:32 +0000665 bytes_xbzrle = save_page_header(pss, pss->pss_channel, block,
Juan Quintela204b88b2017-03-15 09:16:57 +0100666 offset | RAM_SAVE_FLAG_XBZRLE);
Peter Xu61717ea2022-10-11 17:55:53 -0400667 qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
668 qemu_put_be16(file, encoded_len);
669 qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
Juan Quintela56e93d22015-05-07 19:33:31 +0200670 bytes_xbzrle += encoded_len + 1 + 2;
Wei Wange460a4b2020-04-30 08:59:35 +0800671 /*
672 * Like compressed_size (please see update_compress_thread_counts),
673 * the xbzrle encoded bytes don't count the 8 byte header with
674 * RAM_SAVE_FLAG_CONTINUE.
675 */
676 xbzrle_counters.bytes += bytes_xbzrle - 8;
David Edmondson4c2d0f62021-12-21 09:34:40 +0000677 ram_transferred_add(bytes_xbzrle);
Juan Quintela56e93d22015-05-07 19:33:31 +0200678
679 return 1;
680}
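/*
 * Sketch of an XBZRLE page on the wire, as emitted by the code above:
 *
 *     page header with offset | RAM_SAVE_FLAG_XBZRLE (plus block id
 *         when the block changed)
 *     u8     ENCODING_FLAG_XBZRLE
 *     be16   encoded_len
 *     bytes  encoded_len bytes of XBZRLE-encoded delta
 *
 * which is where the "encoded_len + 1 + 2" added to bytes_xbzrle
 * comes from.
 */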
681
Juan Quintela3d0684b2017-03-23 15:06:39 +0100682/**
Peter Xud9e474e2022-10-11 17:55:52 -0400683 * pss_find_next_dirty: find the next dirty page of current ramblock
Dr. David Alan Gilbertf3f491f2015-11-05 18:11:01 +0000684 *
Peter Xud9e474e2022-10-11 17:55:52 -0400685 * This function updates pss->page to point to the next dirty page index
686 * within the ramblock to migrate, or the end of the ramblock when nothing is
687 * found. Note that when pss->host_page_sending==true it means we're in the
688 * middle of sending a host page, so we won't look for a dirty page that is
689 * outside the host page boundary.
Juan Quintela3d0684b2017-03-23 15:06:39 +0100690 *
Peter Xud9e474e2022-10-11 17:55:52 -0400691 * @pss: the current page search status
Dr. David Alan Gilbertf3f491f2015-11-05 18:11:01 +0000692 */
Peter Xud9e474e2022-10-11 17:55:52 -0400693static void pss_find_next_dirty(PageSearchStatus *pss)
Juan Quintela56e93d22015-05-07 19:33:31 +0200694{
Peter Xud9e474e2022-10-11 17:55:52 -0400695 RAMBlock *rb = pss->block;
Juan Quintela6b6712e2017-03-22 15:18:04 +0100696 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
697 unsigned long *bitmap = rb->bmap;
Juan Quintela56e93d22015-05-07 19:33:31 +0200698
David Hildenbrandf161c882023-07-06 09:56:08 +0200699 if (migrate_ram_is_ignored(rb)) {
Peter Xud9e474e2022-10-11 17:55:52 -0400700 /* Points directly to the end, so we know no dirty page */
701 pss->page = size;
702 return;
Cédric Le Goaterb895de52018-05-14 08:57:00 +0200703 }
704
Peter Xud9e474e2022-10-11 17:55:52 -0400705 /*
706 * If during sending a host page, only look for dirty pages within the
707 * current host page being send.
708 */
709 if (pss->host_page_sending) {
710 assert(pss->host_page_end);
711 size = MIN(size, pss->host_page_end);
712 }
713
714 pss->page = find_next_bit(bitmap, size, pss->page);
Juan Quintela56e93d22015-05-07 19:33:31 +0200715}
716
David Hildenbrand1230a252021-09-04 18:09:07 +0200717static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
Wei Wang3143577d2021-07-22 04:30:55 -0400718 unsigned long page)
719{
720 uint8_t shift;
721 hwaddr size, start;
722
723 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
724 return;
725 }
726
727 shift = rb->clear_bmap_shift;
728 /*
729 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
730 * can make things easier sometimes since then start address
731 * of the small chunk will always be 64 pages aligned so the
732 * bitmap will always be aligned to unsigned long. We should
733 * even be able to remove this restriction but I'm simply
734 * keeping it.
735 */
736 assert(shift >= 6);
737
738 size = 1ULL << (TARGET_PAGE_BITS + shift);
David Hildenbrand76482972021-10-11 19:53:44 +0200739 start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
Wei Wang3143577d2021-07-22 04:30:55 -0400740 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
741 memory_region_clear_dirty_bitmap(rb->mr, start, size);
742}
743
744static void
David Hildenbrand1230a252021-09-04 18:09:07 +0200745migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
Wei Wang3143577d2021-07-22 04:30:55 -0400746 unsigned long start,
747 unsigned long npages)
748{
749 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
750 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
751 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
752
753 /*
754 * Clear pages from start to start + npages - 1, so the end boundary is
755 * exclusive.
756 */
757 for (i = chunk_start; i < chunk_end; i += chunk_pages) {
David Hildenbrand1230a252021-09-04 18:09:07 +0200758 migration_clear_memory_region_dirty_bitmap(rb, i);
Wei Wang3143577d2021-07-22 04:30:55 -0400759 }
760}
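/*
 * Small worked example with made-up numbers: if clear_bmap_shift is 6
 * then chunk_pages is 64.  Clearing start = 100, npages = 200 gives
 * chunk_start = 64 and chunk_end = 320, so the loop above clears the
 * chunks starting at pages 64, 128, 192 and 256, i.e. every chunk that
 * overlaps the requested [100, 300) page range.
 */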
761
Rao, Leia6a83ce2021-11-09 11:04:55 +0800762/*
763 * colo_bitmap_find_dirty: find contiguous dirty pages from start
764 *
765 * Returns the page offset within the memory region of the start of the
766 * contiguous dirty pages
767 *
768 * @rs: current RAM state
769 * @rb: RAMBlock where to search for dirty pages
770 * @start: page where we start the search
771 * @num: the number of contiguous dirty pages
772 */
773static inline
774unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
775 unsigned long start, unsigned long *num)
776{
777 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
778 unsigned long *bitmap = rb->bmap;
779 unsigned long first, next;
780
781 *num = 0;
782
David Hildenbrandf161c882023-07-06 09:56:08 +0200783 if (migrate_ram_is_ignored(rb)) {
Rao, Leia6a83ce2021-11-09 11:04:55 +0800784 return size;
785 }
786
787 first = find_next_bit(bitmap, size, start);
788 if (first >= size) {
789 return first;
790 }
791 next = find_next_zero_bit(bitmap, size, first + 1);
792 assert(next >= first);
793 *num = next - first;
794 return first;
795}
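/*
 * Example of the lookup above with invented numbers: for a bitmap in
 * which only bits 5..8 are set and start = 0, find_next_bit() returns
 * first = 5 and find_next_zero_bit() returns 9, so the function reports
 * first = 5 with *num = 4 contiguous dirty pages.
 */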
796
Juan Quintela06b10682017-03-21 15:18:05 +0100797static inline bool migration_bitmap_clear_dirty(RAMState *rs,
Juan Quintelaf20e2862017-03-21 16:19:05 +0100798 RAMBlock *rb,
799 unsigned long page)
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +0000800{
801 bool ret;
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +0000802
Peter Xu002cad62019-06-03 14:50:56 +0800803 /*
804 * Clear dirty bitmap if needed. This _must_ be called before we
805 * send any of the page in the chunk because we need to make sure
806 * we can capture further page content changes when we sync dirty
807 * log the next time. So as long as we are going to send any of
808 * the page in the chunk we clear the remote dirty bitmap for all.
809 * Clearing it earlier won't be a problem, but too late will.
810 */
David Hildenbrand1230a252021-09-04 18:09:07 +0200811 migration_clear_memory_region_dirty_bitmap(rb, page);
Peter Xu002cad62019-06-03 14:50:56 +0800812
Juan Quintela6b6712e2017-03-22 15:18:04 +0100813 ret = test_and_clear_bit(page, rb->bmap);
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +0000814 if (ret) {
Juan Quintela0d8ec882017-03-13 21:21:41 +0100815 rs->migration_dirty_pages--;
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +0000816 }
Wei Wang386a9072018-12-11 16:24:49 +0800817
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +0000818 return ret;
819}
820
David Hildenbrandbe39b4c2021-10-11 19:53:41 +0200821static void dirty_bitmap_clear_section(MemoryRegionSection *section,
822 void *opaque)
823{
824 const hwaddr offset = section->offset_within_region;
825 const hwaddr size = int128_get64(section->size);
826 const unsigned long start = offset >> TARGET_PAGE_BITS;
827 const unsigned long npages = size >> TARGET_PAGE_BITS;
828 RAMBlock *rb = section->mr->ram_block;
829 uint64_t *cleared_bits = opaque;
830
831 /*
832 * We don't grab ram_state->bitmap_mutex because we expect to run
833 * only when starting migration or during postcopy recovery where
834 * we don't have concurrent access.
835 */
836 if (!migration_in_postcopy() && !migrate_background_snapshot()) {
837 migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
838 }
839 *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
840 bitmap_clear(rb->bmap, start, npages);
841}
842
843/*
844 * Exclude all dirty pages from migration that fall into a discarded range as
845 * managed by a RamDiscardManager responsible for the mapped memory region of
846 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
847 *
848 * Discarded pages ("logically unplugged") have undefined content and must
849 * not get migrated, because even reading these pages for migration might
850 * result in undesired behavior.
851 *
852 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
853 *
854 * Note: The result is only stable while migrating (precopy/postcopy).
855 */
856static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
857{
858 uint64_t cleared_bits = 0;
859
860 if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
861 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
862 MemoryRegionSection section = {
863 .mr = rb->mr,
864 .offset_within_region = 0,
865 .size = int128_make64(qemu_ram_get_used_length(rb)),
866 };
867
868 ram_discard_manager_replay_discarded(rdm, &section,
869 dirty_bitmap_clear_section,
870 &cleared_bits);
871 }
872 return cleared_bits;
873}
874
David Hildenbrand9470c5e2021-10-11 19:53:43 +0200875/*
876 * Check if a host-page aligned page falls into a discarded range as managed by
877 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
878 *
879 * Note: The result is only stable while migrating (precopy/postcopy).
880 */
881bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
882{
883 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
884 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
885 MemoryRegionSection section = {
886 .mr = rb->mr,
887 .offset_within_region = start,
888 .size = int128_make64(qemu_ram_pagesize(rb)),
889 };
890
891 return !ram_discard_manager_is_populated(rdm, &section);
892 }
893 return false;
894}
895
Peter Xu267691b2019-06-03 14:50:46 +0800896/* Called with RCU critical section */
Wei Yang7a3e9572019-08-08 11:31:55 +0800897static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
Juan Quintela56e93d22015-05-07 19:33:31 +0200898{
Keqian Zhufb613582020-06-22 11:20:37 +0800899 uint64_t new_dirty_pages =
900 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
901
902 rs->migration_dirty_pages += new_dirty_pages;
903 rs->num_dirty_pages_period += new_dirty_pages;
Juan Quintela56e93d22015-05-07 19:33:31 +0200904}
905
Juan Quintela3d0684b2017-03-23 15:06:39 +0100906/**
907 * ram_pagesize_summary: calculate all the pagesizes of a VM
908 *
909 * Returns a summary bitmap of the page sizes of all RAMBlocks
910 *
911 * For VMs with just normal pages this is equivalent to the host page
912 * size. If it's got some huge pages then it's the OR of all the
913 * different page sizes.
Dr. David Alan Gilberte8ca1db2017-02-24 18:28:29 +0000914 */
915uint64_t ram_pagesize_summary(void)
916{
917 RAMBlock *block;
918 uint64_t summary = 0;
919
Yury Kotovfbd162e2019-02-15 20:45:46 +0300920 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
Dr. David Alan Gilberte8ca1db2017-02-24 18:28:29 +0000921 summary |= block->page_size;
922 }
923
924 return summary;
925}
926
Xiao Guangrongaecbfe92019-01-11 14:37:30 +0800927uint64_t ram_get_total_transferred_pages(void)
928{
Juan Quintelaaff3f662023-04-26 19:37:19 +0200929 return stat64_get(&mig_stats.normal_pages) +
930 stat64_get(&mig_stats.zero_pages) +
Peter Xu23b75762022-10-11 17:55:51 -0400931 compression_counters.pages + xbzrle_counters.pages;
Xiao Guangrongaecbfe92019-01-11 14:37:30 +0800932}
933
Xiao Guangrongb7340352018-06-04 17:55:12 +0800934static void migration_update_rates(RAMState *rs, int64_t end_time)
935{
Xiao Guangrongbe8b02e2018-09-03 17:26:42 +0800936 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
Xiao Guangrong76e03002018-09-06 15:01:00 +0800937 double compressed_size;
Xiao Guangrongb7340352018-06-04 17:55:12 +0800938
939 /* calculate period counters */
Juan Quintelaaff3f662023-04-26 19:37:19 +0200940 stat64_set(&mig_stats.dirty_pages_rate,
Juan Quintela72f8e582023-04-11 18:19:05 +0200941 rs->num_dirty_pages_period * 1000 /
942 (end_time - rs->time_last_bitmap_sync));
Xiao Guangrongb7340352018-06-04 17:55:12 +0800943
Xiao Guangrongbe8b02e2018-09-03 17:26:42 +0800944 if (!page_count) {
Xiao Guangrongb7340352018-06-04 17:55:12 +0800945 return;
946 }
947
Juan Quintela87dca0c2023-03-01 22:20:13 +0100948 if (migrate_xbzrle()) {
Wei Wange460a4b2020-04-30 08:59:35 +0800949 double encoded_size, unencoded_size;
950
Xiao Guangrongb7340352018-06-04 17:55:12 +0800951 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
Xiao Guangrongbe8b02e2018-09-03 17:26:42 +0800952 rs->xbzrle_cache_miss_prev) / page_count;
Xiao Guangrongb7340352018-06-04 17:55:12 +0800953 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
Wei Wange460a4b2020-04-30 08:59:35 +0800954 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
955 TARGET_PAGE_SIZE;
956 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
Wei Wang92271402020-06-17 13:13:05 -0700957 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
Wei Wange460a4b2020-04-30 08:59:35 +0800958 xbzrle_counters.encoding_rate = 0;
Wei Wange460a4b2020-04-30 08:59:35 +0800959 } else {
960 xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
961 }
962 rs->xbzrle_pages_prev = xbzrle_counters.pages;
963 rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
Xiao Guangrongb7340352018-06-04 17:55:12 +0800964 }
Xiao Guangrong76e03002018-09-06 15:01:00 +0800965
Juan Quintelaa7a94d12023-03-01 22:03:48 +0100966 if (migrate_compress()) {
Xiao Guangrong76e03002018-09-06 15:01:00 +0800967 compression_counters.busy_rate = (double)(compression_counters.busy -
968 rs->compress_thread_busy_prev) / page_count;
969 rs->compress_thread_busy_prev = compression_counters.busy;
970
971 compressed_size = compression_counters.compressed_size -
972 rs->compressed_size_prev;
973 if (compressed_size) {
974 double uncompressed_size = (compression_counters.pages -
975 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
976
977 /* Compression-Ratio = Uncompressed-size / Compressed-size */
978 compression_counters.compression_rate =
979 uncompressed_size / compressed_size;
980
981 rs->compress_pages_prev = compression_counters.pages;
982 rs->compressed_size_prev = compression_counters.compressed_size;
983 }
984 }
Xiao Guangrongb7340352018-06-04 17:55:12 +0800985}
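/*
 * Example of the period statistics above (numbers for illustration
 * only): if 1000 target pages of 4KiB each were compressed down to
 * 1MiB during the period, compression_rate ends up as
 * (1000 * 4096) / (1 * 1024 * 1024), roughly 3.9, while busy_rate is
 * the fraction of handled pages for which no free compression thread
 * was available.
 */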
986
Keqian Zhudc14a472020-02-24 10:31:42 +0800987static void migration_trigger_throttle(RAMState *rs)
988{
Juan Quintela6499efd2023-03-02 00:59:13 +0100989 uint64_t threshold = migrate_throttle_trigger_threshold();
Peter Xu23b75762022-10-11 17:55:51 -0400990 uint64_t bytes_xfer_period =
Juan Quintelaaff3f662023-04-26 19:37:19 +0200991 stat64_get(&mig_stats.transferred) - rs->bytes_xfer_prev;
Keqian Zhudc14a472020-02-24 10:31:42 +0800992 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
993 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
994
995 /* During block migration the auto-converge logic incorrectly detects
996 * that ram migration makes no progress. Avoid this by disabling the
997 * throttling logic during the bulk phase of block migration. */
998 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
999 /* The following detection logic can be refined later. For now:
1000 Check to see if the ratio between dirtied bytes and the approx.
1001 amount of bytes that just got transferred since the last time
1002 we were in this routine reaches the threshold. If that happens
1003 twice, start or increase throttling. */
1004
1005 if ((bytes_dirty_period > bytes_dirty_threshold) &&
1006 (++rs->dirty_rate_high_cnt >= 2)) {
1007 trace_migration_throttle();
1008 rs->dirty_rate_high_cnt = 0;
Keqian Zhucbbf8182020-04-13 18:15:08 +08001009 mig_throttle_guest_down(bytes_dirty_period,
1010 bytes_dirty_threshold);
Keqian Zhudc14a472020-02-24 10:31:42 +08001011 }
1012 }
1013}
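/*
 * Worked example for the trigger above (illustrative numbers): with a
 * 50% threshold, transferring 100MB during the period makes
 * bytes_dirty_threshold = 50MB.  If the guest dirtied 60MB in that same
 * period, bytes_dirty_period exceeds the threshold and
 * dirty_rate_high_cnt is bumped; on the second consecutive hit the
 * guest gets throttled (or throttled harder) via
 * mig_throttle_guest_down().
 */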
1014
Gavin Shan1e493be2023-05-09 12:21:19 +10001015static void migration_bitmap_sync(RAMState *rs, bool last_stage)
Juan Quintela56e93d22015-05-07 19:33:31 +02001016{
1017 RAMBlock *block;
Juan Quintela56e93d22015-05-07 19:33:31 +02001018 int64_t end_time;
Juan Quintela56e93d22015-05-07 19:33:31 +02001019
Juan Quintelaaff3f662023-04-26 19:37:19 +02001020 stat64_add(&mig_stats.dirty_sync_count, 1);
Juan Quintela56e93d22015-05-07 19:33:31 +02001021
Juan Quintelaf664da82017-03-13 19:44:57 +01001022 if (!rs->time_last_bitmap_sync) {
1023 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
Juan Quintela56e93d22015-05-07 19:33:31 +02001024 }
1025
1026 trace_migration_bitmap_sync_start();
Gavin Shan1e493be2023-05-09 12:21:19 +10001027 memory_global_dirty_log_sync(last_stage);
Juan Quintela56e93d22015-05-07 19:33:31 +02001028
Juan Quintela108cfae2017-03-13 21:38:09 +01001029 qemu_mutex_lock(&rs->bitmap_mutex);
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01001030 WITH_RCU_READ_LOCK_GUARD() {
1031 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1032 ramblock_sync_dirty_bitmap(rs, block);
1033 }
Juan Quintelaaff3f662023-04-26 19:37:19 +02001034 stat64_set(&mig_stats.dirty_bytes_last_sync, ram_bytes_remaining());
Juan Quintela56e93d22015-05-07 19:33:31 +02001035 }
Juan Quintela108cfae2017-03-13 21:38:09 +01001036 qemu_mutex_unlock(&rs->bitmap_mutex);
Juan Quintela56e93d22015-05-07 19:33:31 +02001037
Paolo Bonzini9458a9a2018-02-06 18:37:39 +01001038 memory_global_after_dirty_log_sync();
Juan Quintelaa66cd902017-03-28 15:02:43 +02001039 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
Chao Fan1ffb5df2017-03-14 09:55:07 +08001040
Juan Quintela56e93d22015-05-07 19:33:31 +02001041 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1042
1043 /* more than 1 second = 1000 milliseconds */
Juan Quintelaf664da82017-03-13 19:44:57 +01001044 if (end_time > rs->time_last_bitmap_sync + 1000) {
Keqian Zhudc14a472020-02-24 10:31:42 +08001045 migration_trigger_throttle(rs);
Jason J. Herne070afca2015-09-08 13:12:35 -04001046
Xiao Guangrongb7340352018-06-04 17:55:12 +08001047 migration_update_rates(rs, end_time);
1048
Xiao Guangrongbe8b02e2018-09-03 17:26:42 +08001049 rs->target_page_count_prev = rs->target_page_count;
Felipe Franciosid693c6f2017-05-24 17:10:01 +01001050
1051 /* reset period counters */
Juan Quintelaf664da82017-03-13 19:44:57 +01001052 rs->time_last_bitmap_sync = end_time;
Juan Quintelaa66cd902017-03-28 15:02:43 +02001053 rs->num_dirty_pages_period = 0;
Juan Quintelaaff3f662023-04-26 19:37:19 +02001054 rs->bytes_xfer_prev = stat64_get(&mig_stats.transferred);
Juan Quintela56e93d22015-05-07 19:33:31 +02001055 }
Juan Quintelab8909022023-03-01 22:08:09 +01001056 if (migrate_events()) {
Juan Quintelaaff3f662023-04-26 19:37:19 +02001057 uint64_t generation = stat64_get(&mig_stats.dirty_sync_count);
Juan Quintela536b5a42023-04-11 18:02:34 +02001058 qapi_event_send_migration_pass(generation);
Dr. David Alan Gilbert4addcd42015-12-16 11:47:36 +00001059 }
Juan Quintela56e93d22015-05-07 19:33:31 +02001060}
1061
Gavin Shan1e493be2023-05-09 12:21:19 +10001062static void migration_bitmap_sync_precopy(RAMState *rs, bool last_stage)
Wei Wangbd227062018-12-11 16:24:51 +08001063{
1064 Error *local_err = NULL;
1065
1066 /*
1067 * The current notifier usage is just an optimization to migration, so we
1068 * don't stop the normal migration process in the error case.
1069 */
1070 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1071 error_report_err(local_err);
Vladimir Sementsov-Ogievskiyb4a17332020-03-24 18:36:29 +03001072 local_err = NULL;
Wei Wangbd227062018-12-11 16:24:51 +08001073 }
1074
Gavin Shan1e493be2023-05-09 12:21:19 +10001075 migration_bitmap_sync(rs, last_stage);
Wei Wangbd227062018-12-11 16:24:51 +08001076
1077 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1078 error_report_err(local_err);
1079 }
1080}
1081
Juan Quintelaa4dbaf82021-12-16 10:19:38 +01001082void ram_release_page(const char *rbname, uint64_t offset)
Juan Quintela47fe16f2021-12-16 09:58:49 +01001083{
1084 if (!migrate_release_ram() || !migration_in_postcopy()) {
1085 return;
1086 }
1087
1088 ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1089}
1090
Juan Quintela56e93d22015-05-07 19:33:31 +02001091/**
Xiao Guangrong6c97ec52018-08-21 16:10:22 +08001092 * save_zero_page_to_file: send the zero page to the file
1093 *
1094 * Returns the size of data written to the file, 0 means the page is not
1095 * a zero page
1096 *
Peter Xuec6f3ab2022-10-11 17:55:56 -04001097 * @pss: current PSS channel
Xiao Guangrong6c97ec52018-08-21 16:10:22 +08001098 * @block: block that contains the page we want to send
1099 * @offset: offset inside the block for the page
1100 */
Lukas Straub37502df2023-04-02 17:06:32 +00001101static int save_zero_page_to_file(PageSearchStatus *pss, QEMUFile *file,
Xiao Guangrong6c97ec52018-08-21 16:10:22 +08001102 RAMBlock *block, ram_addr_t offset)
1103{
1104 uint8_t *p = block->host + offset;
1105 int len = 0;
1106
Juan Quintelabad452a2021-11-18 15:56:38 +01001107 if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
Lukas Straub37502df2023-04-02 17:06:32 +00001108 len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO);
Xiao Guangrong6c97ec52018-08-21 16:10:22 +08001109 qemu_put_byte(file, 0);
1110 len += 1;
Juan Quintela47fe16f2021-12-16 09:58:49 +01001111 ram_release_page(block->idstr, offset);
Xiao Guangrong6c97ec52018-08-21 16:10:22 +08001112 }
1113 return len;
1114}
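/*
 * On the wire a zero page is therefore just the page header with
 * RAM_SAVE_FLAG_ZERO set plus a single filler byte, which is why the
 * function above returns header length + 1 (or 0 when the page is not
 * actually zero).
 */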
1115
1116/**
Juan Quintela3d0684b2017-03-23 15:06:39 +01001117 * save_zero_page: send the zero page to the stream
Juan Quintela56e93d22015-05-07 19:33:31 +02001118 *
Juan Quintela3d0684b2017-03-23 15:06:39 +01001119 * Returns the number of pages written.
Juan Quintela56e93d22015-05-07 19:33:31 +02001120 *
Peter Xuec6f3ab2022-10-11 17:55:56 -04001121 * @pss: current PSS channel
Juan Quintela56e93d22015-05-07 19:33:31 +02001122 * @block: block that contains the page we want to send
1123 * @offset: offset inside the block for the page
Juan Quintela56e93d22015-05-07 19:33:31 +02001124 */
Lukas Straub37502df2023-04-02 17:06:32 +00001125static int save_zero_page(PageSearchStatus *pss, QEMUFile *f, RAMBlock *block,
Peter Xu61717ea2022-10-11 17:55:53 -04001126 ram_addr_t offset)
Juan Quintela56e93d22015-05-07 19:33:31 +02001127{
Lukas Straub37502df2023-04-02 17:06:32 +00001128 int len = save_zero_page_to_file(pss, f, block, offset);
Juan Quintela56e93d22015-05-07 19:33:31 +02001129
Xiao Guangrong6c97ec52018-08-21 16:10:22 +08001130 if (len) {
Juan Quintelaaff3f662023-04-26 19:37:19 +02001131 stat64_add(&mig_stats.zero_pages, 1);
David Edmondson4c2d0f62021-12-21 09:34:40 +00001132 ram_transferred_add(len);
Xiao Guangrong6c97ec52018-08-21 16:10:22 +08001133 return 1;
Juan Quintela56e93d22015-05-07 19:33:31 +02001134 }
Xiao Guangrong6c97ec52018-08-21 16:10:22 +08001135 return -1;
Juan Quintela56e93d22015-05-07 19:33:31 +02001136}
1137
Xiao Guangrong059ff0f2018-03-30 15:51:23 +08001138/*
1139 * @pages: the number of pages written by the control path,
1140 * < 0 - error
1141 * > 0 - number of pages written
1142 *
1143 * Return true if the page has been saved, otherwise false is returned.
1144 */
Peter Xu61717ea2022-10-11 17:55:53 -04001145static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
1146 ram_addr_t offset, int *pages)
Xiao Guangrong059ff0f2018-03-30 15:51:23 +08001147{
1148 uint64_t bytes_xmit = 0;
1149 int ret;
1150
1151 *pages = -1;
Peter Xu61717ea2022-10-11 17:55:53 -04001152 ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
1153 TARGET_PAGE_SIZE, &bytes_xmit);
Xiao Guangrong059ff0f2018-03-30 15:51:23 +08001154 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1155 return false;
1156 }
1157
1158 if (bytes_xmit) {
David Edmondson4c2d0f62021-12-21 09:34:40 +00001159 ram_transferred_add(bytes_xmit);
Xiao Guangrong059ff0f2018-03-30 15:51:23 +08001160 *pages = 1;
1161 }
1162
1163 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1164 return true;
1165 }
1166
1167 if (bytes_xmit > 0) {
Juan Quintelaaff3f662023-04-26 19:37:19 +02001168 stat64_add(&mig_stats.normal_pages, 1);
Xiao Guangrong059ff0f2018-03-30 15:51:23 +08001169 } else if (bytes_xmit == 0) {
Juan Quintelaaff3f662023-04-26 19:37:19 +02001170 stat64_add(&mig_stats.zero_pages, 1);
Xiao Guangrong059ff0f2018-03-30 15:51:23 +08001171 }
1172
1173 return true;
1174}
1175
Xiao Guangrong65dacaa2018-03-30 15:51:27 +08001176/*
1177 * directly send the page to the stream
1178 *
1179 * Returns the number of pages written.
1180 *
Peter Xuec6f3ab2022-10-11 17:55:56 -04001181 * @pss: current PSS channel
Xiao Guangrong65dacaa2018-03-30 15:51:27 +08001182 * @block: block that contains the page we want to send
1183 * @offset: offset inside the block for the page
1184 * @buf: the page to be sent
1185 * @async: send the page asynchronously
1186 */
Peter Xuec6f3ab2022-10-11 17:55:56 -04001187static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
Peter Xu61717ea2022-10-11 17:55:53 -04001188 ram_addr_t offset, uint8_t *buf, bool async)
Xiao Guangrong65dacaa2018-03-30 15:51:27 +08001189{
Peter Xuec6f3ab2022-10-11 17:55:56 -04001190 QEMUFile *file = pss->pss_channel;
1191
Lukas Straub37502df2023-04-02 17:06:32 +00001192 ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
David Edmondson4c2d0f62021-12-21 09:34:40 +00001193 offset | RAM_SAVE_FLAG_PAGE));
Xiao Guangrong65dacaa2018-03-30 15:51:27 +08001194 if (async) {
Peter Xu61717ea2022-10-11 17:55:53 -04001195 qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
Dr. David Alan Gilbertf912ec52022-04-06 11:25:15 +01001196 migrate_release_ram() &&
Xiao Guangrong65dacaa2018-03-30 15:51:27 +08001197 migration_in_postcopy());
1198 } else {
Peter Xu61717ea2022-10-11 17:55:53 -04001199 qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
Xiao Guangrong65dacaa2018-03-30 15:51:27 +08001200 }
David Edmondson4c2d0f62021-12-21 09:34:40 +00001201 ram_transferred_add(TARGET_PAGE_SIZE);
Juan Quintelaaff3f662023-04-26 19:37:19 +02001202 stat64_add(&mig_stats.normal_pages, 1);
Xiao Guangrong65dacaa2018-03-30 15:51:27 +08001203 return 1;
1204}
1205
Juan Quintela56e93d22015-05-07 19:33:31 +02001206/**
Juan Quintela3d0684b2017-03-23 15:06:39 +01001207 * ram_save_page: send the given page to the stream
Juan Quintela56e93d22015-05-07 19:33:31 +02001208 *
Juan Quintela3d0684b2017-03-23 15:06:39 +01001209 * Returns the number of pages written.
Dr. David Alan Gilbert3fd3c4b2015-12-10 16:31:46 +00001210 * < 0 - error
1211 * >=0 - Number of pages written - this might legally be 0
1212 * if xbzrle noticed the page was the same.
Juan Quintela56e93d22015-05-07 19:33:31 +02001213 *
Juan Quintela6f37bb82017-03-13 19:26:29 +01001214 * @rs: current RAM state
Juan Quintela56e93d22015-05-07 19:33:31 +02001215 * @block: block that contains the page we want to send
1216 * @offset: offset inside the block for the page
Juan Quintela56e93d22015-05-07 19:33:31 +02001217 */
Juan Quintela05931ec2021-12-15 19:01:21 +01001218static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
Juan Quintela56e93d22015-05-07 19:33:31 +02001219{
1220 int pages = -1;
Juan Quintela56e93d22015-05-07 19:33:31 +02001221 uint8_t *p;
Juan Quintela56e93d22015-05-07 19:33:31 +02001222 bool send_async = true;
zhanghailianga08f6892016-01-15 11:37:44 +08001223 RAMBlock *block = pss->block;
Alexey Romko8bba0042020-01-10 14:51:34 +01001224 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
Xiao Guangrong059ff0f2018-03-30 15:51:23 +08001225 ram_addr_t current_addr = block->offset + offset;
Juan Quintela56e93d22015-05-07 19:33:31 +02001226
Dr. David Alan Gilbert2f68e392015-08-13 11:51:30 +01001227 p = block->host + offset;
Dr. David Alan Gilbert1db9d8e2017-04-26 19:37:21 +01001228 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
Juan Quintela56e93d22015-05-07 19:33:31 +02001229
Juan Quintela56e93d22015-05-07 19:33:31 +02001230 XBZRLE_cache_lock();
Juan Quintelaf3095cc2023-05-04 13:53:23 +02001231 if (rs->xbzrle_started && !migration_in_postcopy()) {
Peter Xuec6f3ab2022-10-11 17:55:56 -04001232 pages = save_xbzrle_page(rs, pss, &p, current_addr,
Peter Xu61717ea2022-10-11 17:55:53 -04001233 block, offset);
Juan Quintela05931ec2021-12-15 19:01:21 +01001234 if (!rs->last_stage) {
Xiao Guangrong059ff0f2018-03-30 15:51:23 +08001235 /* Can't send this cached data async, since the cache page
1236 * might get updated before it gets to the wire
Juan Quintela56e93d22015-05-07 19:33:31 +02001237 */
Xiao Guangrong059ff0f2018-03-30 15:51:23 +08001238 send_async = false;
Juan Quintela56e93d22015-05-07 19:33:31 +02001239 }
1240 }
1241
1242 /* XBZRLE overflow or normal page */
1243 if (pages == -1) {
Peter Xuec6f3ab2022-10-11 17:55:56 -04001244 pages = save_normal_page(pss, block, offset, p, send_async);
Juan Quintela56e93d22015-05-07 19:33:31 +02001245 }
1246
1247 XBZRLE_cache_unlock();
1248
1249 return pages;
1250}
1251
Peter Xu61717ea2022-10-11 17:55:53 -04001252static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block,
Juan Quintelab9ee2f72016-01-15 11:40:13 +01001253 ram_addr_t offset)
1254{
Peter Xu61717ea2022-10-11 17:55:53 -04001255 if (multifd_queue_page(file, block, offset) < 0) {
Ivan Ren713f7622019-06-25 21:18:17 +08001256 return -1;
1257 }
Juan Quintelaaff3f662023-04-26 19:37:19 +02001258 stat64_add(&mig_stats.normal_pages, 1);
Juan Quintelab9ee2f72016-01-15 11:40:13 +01001259
1260 return 1;
1261}
1262
Xiao Guangrong5e5fdcf2018-08-21 16:10:24 +08001263static void
1264update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1265{
David Edmondson4c2d0f62021-12-21 09:34:40 +00001266 ram_transferred_add(bytes_xmit);
Xiao Guangrong76e03002018-09-06 15:01:00 +08001267
Lukas Straub97274a82023-04-20 11:47:56 +02001268 if (param->result == RES_ZEROPAGE) {
Juan Quintelaaff3f662023-04-26 19:37:19 +02001269 stat64_add(&mig_stats.zero_pages, 1);
Xiao Guangrong76e03002018-09-06 15:01:00 +08001270 return;
Xiao Guangrong5e5fdcf2018-08-21 16:10:24 +08001271 }
Xiao Guangrong76e03002018-09-06 15:01:00 +08001272
1273 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1274 compression_counters.compressed_size += bytes_xmit - 8;
1275 compression_counters.pages++;
Juan Quintela56e93d22015-05-07 19:33:31 +02001276}
1277
Xiao Guangrong32b05492018-09-06 15:01:01 +08001278static bool save_page_use_compression(RAMState *rs);
1279
Lukas Straub3e817632023-04-20 11:48:06 +02001280static int send_queued_data(CompressParam *param)
1281{
1282 PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_PRECOPY];
1283 MigrationState *ms = migrate_get_current();
1284 QEMUFile *file = ms->to_dst_file;
1285 int len = 0;
1286
1287 RAMBlock *block = param->block;
1288 ram_addr_t offset = param->offset;
1289
1290 if (param->result == RES_NONE) {
1291 return 0;
1292 }
1293
1294 assert(block == pss->last_sent_block);
1295
1296 if (param->result == RES_ZEROPAGE) {
Lukas Straub4024cc82023-04-20 11:48:31 +02001297 assert(qemu_file_buffer_empty(param->file));
Lukas Straub3e817632023-04-20 11:48:06 +02001298 len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO);
1299 qemu_put_byte(file, 0);
1300 len += 1;
1301 ram_release_page(block->idstr, offset);
1302 } else if (param->result == RES_COMPRESS) {
Lukas Straub4024cc82023-04-20 11:48:31 +02001303 assert(!qemu_file_buffer_empty(param->file));
Lukas Straub3e817632023-04-20 11:48:06 +02001304 len += save_page_header(pss, file, block,
1305 offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1306 len += qemu_put_qemu_file(file, param->file);
1307 } else {
1308 abort();
1309 }
1310
Lukas Straub680628d2023-04-20 11:48:10 +02001311 update_compress_thread_counts(param, len);
1312
Lukas Straub3e817632023-04-20 11:48:06 +02001313 return len;
1314}
1315
Lukas Straubef4f5f52023-04-20 11:48:13 +02001316static void ram_flush_compressed_data(RAMState *rs)
1317{
1318 if (!save_page_use_compression(rs)) {
1319 return;
1320 }
1321
1322 flush_compressed_data(send_queued_data);
1323}
1324
Juan Quintela31e2ac72022-06-21 13:29:36 +02001325#define PAGE_ALL_CLEAN 0
1326#define PAGE_TRY_AGAIN 1
1327#define PAGE_DIRTY_FOUND 2
Juan Quintela56e93d22015-05-07 19:33:31 +02001328/**
Juan Quintela3d0684b2017-03-23 15:06:39 +01001329 * find_dirty_block: find the next dirty page and update any state
1330 * associated with the search process.
Dr. David Alan Gilbertb9e60922015-09-23 15:27:11 +01001331 *
Juan Quintela31e2ac72022-06-21 13:29:36 +02001332 * Returns:
Juan Quintela294e5a42022-06-21 13:36:11 +02001333 * <0: An error happened
Juan Quintela31e2ac72022-06-21 13:29:36 +02001334 * PAGE_ALL_CLEAN: no dirty page found, give up
1335 * PAGE_TRY_AGAIN: no dirty page found, retry for next block
1336 * PAGE_DIRTY_FOUND: dirty page found
Dr. David Alan Gilbertb9e60922015-09-23 15:27:11 +01001337 *
Juan Quintela6f37bb82017-03-13 19:26:29 +01001338 * @rs: current RAM state
Juan Quintela3d0684b2017-03-23 15:06:39 +01001339 * @pss: data about the state of the current dirty page scan
1340 *       (pss is updated to point at the next dirty page when one is found)
Dr. David Alan Gilbertb9e60922015-09-23 15:27:11 +01001341 */
Juan Quintela31e2ac72022-06-21 13:29:36 +02001342static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
Dr. David Alan Gilbertb9e60922015-09-23 15:27:11 +01001343{
Peter Xud9e474e2022-10-11 17:55:52 -04001344 /* Update pss->page for the next dirty bit in ramblock */
1345 pss_find_next_dirty(pss);
1346
Juan Quintela6f37bb82017-03-13 19:26:29 +01001347 if (pss->complete_round && pss->block == rs->last_seen_block &&
Juan Quintelaa935e302017-03-21 15:36:51 +01001348 pss->page >= rs->last_page) {
Dr. David Alan Gilbertb9e60922015-09-23 15:27:11 +01001349 /*
1350 * We've been once around the RAM and haven't found anything.
1351 * Give up.
1352 */
Juan Quintela31e2ac72022-06-21 13:29:36 +02001353 return PAGE_ALL_CLEAN;
Dr. David Alan Gilbertb9e60922015-09-23 15:27:11 +01001354 }
David Hildenbrand542147f2021-04-29 13:27:08 +02001355 if (!offset_in_ramblock(pss->block,
1356 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
Dr. David Alan Gilbertb9e60922015-09-23 15:27:11 +01001357 /* Didn't find anything in this RAM Block */
Juan Quintelaa935e302017-03-21 15:36:51 +01001358 pss->page = 0;
Dr. David Alan Gilbertb9e60922015-09-23 15:27:11 +01001359 pss->block = QLIST_NEXT_RCU(pss->block, next);
1360 if (!pss->block) {
Juan Quintela294e5a42022-06-21 13:36:11 +02001361 if (!migrate_multifd_flush_after_each_section()) {
1362 QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel;
1363 int ret = multifd_send_sync_main(f);
1364 if (ret < 0) {
1365 return ret;
1366 }
1367 qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
1368 qemu_fflush(f);
1369 }
Xiao Guangrong48df9d82018-09-06 15:00:59 +08001370 /*
1371 * If memory migration starts over, we will meet a dirtied page
1372 * which may still exist in the compression threads' ring, so we
1373 * should flush the compressed data to make sure the new page
1374 * is not overwritten by the old one in the destination.
1375 *
1376 * Also, if xbzrle is on, stop using the data compression at this
1377 * point. In theory, xbzrle can do better than compression.
1378 */
Lukas Straubef4f5f52023-04-20 11:48:13 +02001379 ram_flush_compressed_data(rs);
Xiao Guangrong48df9d82018-09-06 15:00:59 +08001380
Dr. David Alan Gilbertb9e60922015-09-23 15:27:11 +01001381 /* Hit the end of the list */
1382 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1383 /* Flag that we've looped */
1384 pss->complete_round = true;
David Hildenbrand1a373522021-02-16 11:50:39 +01001385 /* After the first round, enable XBZRLE. */
Juan Quintela87dca0c2023-03-01 22:20:13 +01001386 if (migrate_xbzrle()) {
Juan Quintelaf3095cc2023-05-04 13:53:23 +02001387 rs->xbzrle_started = true;
David Hildenbrand1a373522021-02-16 11:50:39 +01001388 }
Dr. David Alan Gilbertb9e60922015-09-23 15:27:11 +01001389 }
1390 /* Didn't find anything this time, but try again on the new block */
Juan Quintela31e2ac72022-06-21 13:29:36 +02001391 return PAGE_TRY_AGAIN;
Dr. David Alan Gilbertb9e60922015-09-23 15:27:11 +01001392 } else {
Juan Quintela31e2ac72022-06-21 13:29:36 +02001393 /* We've found something */
1394 return PAGE_DIRTY_FOUND;
Dr. David Alan Gilbertb9e60922015-09-23 15:27:11 +01001395 }
1396}
1397
Juan Quintela3d0684b2017-03-23 15:06:39 +01001398/**
1399 * unqueue_page: gets a page off the queue
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00001400 *
Juan Quintela3d0684b2017-03-23 15:06:39 +01001401 * Helper for 'get_queued_page' - gets a page off the queue
1402 *
1403 * Returns the block of the page (or NULL if none available)
1404 *
Juan Quintelaec481c62017-03-20 22:12:40 +01001405 * @rs: current RAM state
Juan Quintela3d0684b2017-03-23 15:06:39 +01001406 * @offset: used to return the offset within the RAMBlock
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00001407 */
Juan Quintelaf20e2862017-03-21 16:19:05 +01001408static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00001409{
Peter Xua1fe28d2022-01-19 16:09:18 +08001410 struct RAMSrcPageRequest *entry;
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00001411 RAMBlock *block = NULL;
1412
Peter Xua1fe28d2022-01-19 16:09:18 +08001413 if (!postcopy_has_request(rs)) {
Xiao Guangrongae526e32018-08-21 16:10:25 +08001414 return NULL;
1415 }
1416
Daniel Brodsky6e8a3552020-04-03 21:21:08 -07001417 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00001418
Peter Xua1fe28d2022-01-19 16:09:18 +08001419 /*
1420 * This should _never_ change even after we take the lock, because no one
1421 * should be taking anything off the request list other than us.
1422 */
1423 assert(postcopy_has_request(rs));
1424
1425 entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1426 block = entry->rb;
1427 *offset = entry->offset;
1428
Thomas Huth777f53c2022-08-02 08:19:49 +02001429 if (entry->len > TARGET_PAGE_SIZE) {
1430 entry->len -= TARGET_PAGE_SIZE;
1431 entry->offset += TARGET_PAGE_SIZE;
Peter Xua1fe28d2022-01-19 16:09:18 +08001432 } else {
1433 memory_region_unref(block->mr);
1434 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1435 g_free(entry);
1436 migration_consume_urgent_request();
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00001437 }
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00001438
1439 return block;
1440}
1441
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001442#if defined(__linux__)
1443/**
1444 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1445 * is found, return RAM block pointer and page offset
1446 *
1447 * Returns pointer to the RAMBlock containing faulting page,
1448 * NULL if no write faults are pending
1449 *
1450 * @rs: current RAM state
1451 * @offset: page offset from the beginning of the block
1452 */
1453static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1454{
1455 struct uffd_msg uffd_msg;
1456 void *page_address;
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001457 RAMBlock *block;
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001458 int res;
1459
1460 if (!migrate_background_snapshot()) {
1461 return NULL;
1462 }
1463
1464 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1465 if (res <= 0) {
1466 return NULL;
1467 }
1468
1469 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001470 block = qemu_ram_block_from_host(page_address, false, offset);
1471 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1472 return block;
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001473}
1474
1475/**
1476 * ram_save_release_protection: release UFFD write protection after
1477 * a range of pages has been saved
1478 *
1479 * @rs: current RAM state
1480 * @pss: page-search-status structure
1481 * @start_page: index of the first page in the range relative to pss->block
1482 *
1483 * Returns 0 on success, negative value in case of an error
1484*/
1485static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1486 unsigned long start_page)
1487{
1488 int res = 0;
1489
1490 /* Check if page is from UFFD-managed region. */
1491 if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1492 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
Peter Xu258f5c982022-01-19 16:09:15 +08001493 uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001494
1495 /* Flush async buffers before un-protect. */
Peter Xu61717ea2022-10-11 17:55:53 -04001496 qemu_fflush(pss->pss_channel);
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001497 /* Un-protect memory range. */
1498 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1499 false, false);
1500 }
1501
1502 return res;
1503}
1504
1505/* ram_write_tracking_available: check if kernel supports required UFFD features
1506 *
1507 * Returns true if supported, false otherwise
1508 */
1509bool ram_write_tracking_available(void)
1510{
1511 uint64_t uffd_features;
1512 int res;
1513
1514 res = uffd_query_features(&uffd_features);
1515 return (res == 0 &&
1516 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1517}
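
/*
 * Illustrative sketch (not part of this file): what the feature check above
 * boils down to with the raw Linux userfaultfd UAPI. QEMU's
 * uffd_query_features() helper is assumed to wrap roughly this UFFDIO_API
 * handshake; the sketch is standalone and Linux-only.
 */
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <stdbool.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

static bool probe_uffd_wp_support(void)
{
    int fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    struct uffdio_api api = { .api = UFFD_API, .features = 0 };
    bool supported;

    if (fd < 0) {
        return false;   /* no userfaultfd support at all */
    }
    /* The UFFDIO_API handshake reports the supported feature bits. */
    supported = (ioctl(fd, UFFDIO_API, &api) == 0) &&
                (api.features & UFFD_FEATURE_PAGEFAULT_FLAG_WP);
    close(fd);
    return supported;
}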
1518
1519/* ram_write_tracking_compatible: check if guest configuration is
1520 * compatible with 'write-tracking'
1521 *
1522 * Returns true if compatible, false otherwise
1523 */
1524bool ram_write_tracking_compatible(void)
1525{
1526 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1527 int uffd_fd;
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001528 RAMBlock *block;
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001529 bool ret = false;
1530
1531 /* Open UFFD file descriptor */
1532 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1533 if (uffd_fd < 0) {
1534 return false;
1535 }
1536
1537 RCU_READ_LOCK_GUARD();
1538
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001539 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001540 uint64_t uffd_ioctls;
1541
1542 /* Nothing to do with read-only and MMIO-writable regions */
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001543 if (block->mr->readonly || block->mr->rom_device) {
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001544 continue;
1545 }
1546 /* Try to register block memory via UFFD-IO to track writes */
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001547 if (uffd_register_memory(uffd_fd, block->host, block->max_length,
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001548 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1549 goto out;
1550 }
1551 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1552 goto out;
1553 }
1554 }
1555 ret = true;
1556
1557out:
1558 uffd_close_fd(uffd_fd);
1559 return ret;
1560}
1561
David Hildenbrandf7b9dcf2021-10-11 19:53:45 +02001562static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1563 ram_addr_t size)
1564{
David Hildenbrand5f19a442023-01-05 13:45:24 +01001565 const ram_addr_t end = offset + size;
1566
David Hildenbrandf7b9dcf2021-10-11 19:53:45 +02001567 /*
1568 * We read one byte of each page; this will preallocate page tables if
1569 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1570 * where no page was populated yet. This might require adaptation when
1571 * supporting other mappings, like shmem.
1572 */
David Hildenbrand5f19a442023-01-05 13:45:24 +01001573 for (; offset < end; offset += block->page_size) {
David Hildenbrandf7b9dcf2021-10-11 19:53:45 +02001574 char tmp = *((char *)block->host + offset);
1575
1576 /* Don't optimize the read out */
1577 asm volatile("" : "+r" (tmp));
1578 }
1579}
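
/*
 * Illustrative sketch (not part of this file): the same "touch one byte per
 * page" prefault trick as populate_read_range(), applied to a freshly created
 * private anonymous mapping. Standalone and Linux-only; the sizes passed in
 * are up to the caller.
 */
#include <stddef.h>
#include <sys/mman.h>

static void *map_and_prefault(size_t size, size_t page_size)
{
    char *p = mmap(NULL, size, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    if (p == MAP_FAILED) {
        return NULL;
    }
    for (size_t off = 0; off < size; off += page_size) {
        char tmp = p[off];              /* read one byte of each page */

        /* Keep the otherwise-dead read from being optimized out. */
        asm volatile("" : "+r" (tmp));
    }
    return p;
}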
1580
David Hildenbrand6fee3a12021-10-11 19:53:46 +02001581static inline int populate_read_section(MemoryRegionSection *section,
1582 void *opaque)
1583{
1584 const hwaddr size = int128_get64(section->size);
1585 hwaddr offset = section->offset_within_region;
1586 RAMBlock *block = section->mr->ram_block;
1587
1588 populate_read_range(block, offset, size);
1589 return 0;
1590}
1591
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001592/*
David Hildenbrandf7b9dcf2021-10-11 19:53:45 +02001593 * ram_block_populate_read: preallocate page tables and populate pages in the
1594 * RAM block by reading a byte of each page.
Andrey Gruzdeveeccb992021-04-01 12:22:25 +03001595 *
1596 * Since it's solely used for the userfault_fd WP feature, here we just
1597 * hardcode page size to qemu_real_host_page_size.
1598 *
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001599 * @rb: RAM block to populate
Andrey Gruzdeveeccb992021-04-01 12:22:25 +03001600 */
David Hildenbrand6fee3a12021-10-11 19:53:46 +02001601static void ram_block_populate_read(RAMBlock *rb)
Andrey Gruzdeveeccb992021-04-01 12:22:25 +03001602{
David Hildenbrand6fee3a12021-10-11 19:53:46 +02001603 /*
1604 * Skip populating all pages that fall into a discarded range as managed by
1605 * a RamDiscardManager responsible for the mapped memory region of the
1606 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1607 * must not get populated automatically. We don't have to track
1608 * modifications via userfaultfd WP reliably, because these pages will
1609 * not be part of the migration stream either way -- see
1610 * ramblock_dirty_bitmap_exclude_discarded_pages().
1611 *
1612 * Note: The result is only stable while migrating (precopy/postcopy).
1613 */
1614 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1615 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1616 MemoryRegionSection section = {
1617 .mr = rb->mr,
1618 .offset_within_region = 0,
1619 .size = rb->mr->size,
1620 };
1621
1622 ram_discard_manager_replay_populated(rdm, &section,
1623 populate_read_section, NULL);
1624 } else {
1625 populate_read_range(rb, 0, rb->used_length);
1626 }
Andrey Gruzdeveeccb992021-04-01 12:22:25 +03001627}
1628
1629/*
1630 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1631 */
1632void ram_write_tracking_prepare(void)
1633{
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001634 RAMBlock *block;
Andrey Gruzdeveeccb992021-04-01 12:22:25 +03001635
1636 RCU_READ_LOCK_GUARD();
1637
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001638 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
Andrey Gruzdeveeccb992021-04-01 12:22:25 +03001639 /* Nothing to do with read-only and MMIO-writable regions */
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001640 if (block->mr->readonly || block->mr->rom_device) {
Andrey Gruzdeveeccb992021-04-01 12:22:25 +03001641 continue;
1642 }
1643
1644 /*
1645 * Populate pages of the RAM block before enabling userfault_fd
1646 * write protection.
1647 *
1648 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1649 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1650 * pages with pte_none() entries in the page table.
1651 */
David Hildenbrandf7b9dcf2021-10-11 19:53:45 +02001652 ram_block_populate_read(block);
Andrey Gruzdeveeccb992021-04-01 12:22:25 +03001653 }
1654}
1655
David Hildenbrande41c5772023-01-05 13:45:28 +01001656static inline int uffd_protect_section(MemoryRegionSection *section,
1657 void *opaque)
1658{
1659 const hwaddr size = int128_get64(section->size);
1660 const hwaddr offset = section->offset_within_region;
1661 RAMBlock *rb = section->mr->ram_block;
1662 int uffd_fd = (uintptr_t)opaque;
1663
1664 return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
1665 false);
1666}
1667
1668static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
1669{
1670 assert(rb->flags & RAM_UF_WRITEPROTECT);
1671
1672 /* See ram_block_populate_read() */
1673 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1674 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1675 MemoryRegionSection section = {
1676 .mr = rb->mr,
1677 .offset_within_region = 0,
1678 .size = rb->mr->size,
1679 };
1680
1681 return ram_discard_manager_replay_populated(rdm, &section,
1682 uffd_protect_section,
1683 (void *)(uintptr_t)uffd_fd);
1684 }
1685 return uffd_change_protection(uffd_fd, rb->host,
1686 rb->used_length, true, false);
1687}
1688
Andrey Gruzdeveeccb992021-04-01 12:22:25 +03001689/*
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001690 * ram_write_tracking_start: start UFFD-WP memory tracking
1691 *
1692 * Returns 0 for success or negative value in case of error
1693 */
1694int ram_write_tracking_start(void)
1695{
1696 int uffd_fd;
1697 RAMState *rs = ram_state;
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001698 RAMBlock *block;
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001699
1700 /* Open UFFD file descriptor */
1701 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1702 if (uffd_fd < 0) {
1703 return uffd_fd;
1704 }
1705 rs->uffdio_fd = uffd_fd;
1706
1707 RCU_READ_LOCK_GUARD();
1708
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001709 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001710 /* Nothing to do with read-only and MMIO-writable regions */
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001711 if (block->mr->readonly || block->mr->rom_device) {
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001712 continue;
1713 }
1714
1715 /* Register block memory with UFFD to track writes */
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001716 if (uffd_register_memory(rs->uffdio_fd, block->host,
1717 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001718 goto fail;
1719 }
David Hildenbrand72ef3a32023-01-05 13:45:25 +01001720 block->flags |= RAM_UF_WRITEPROTECT;
1721 memory_region_ref(block->mr);
1722
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001723 /* Apply UFFD write protection to the block memory range */
David Hildenbrande41c5772023-01-05 13:45:28 +01001724 if (ram_block_uffd_protect(block, uffd_fd)) {
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001725 goto fail;
1726 }
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001727
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001728 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1729 block->host, block->max_length);
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001730 }
1731
1732 return 0;
1733
1734fail:
1735 error_report("ram_write_tracking_start() failed: restoring initial memory state");
1736
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001737 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1738 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001739 continue;
1740 }
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001741 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001742 /* Cleanup flags and remove reference */
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001743 block->flags &= ~RAM_UF_WRITEPROTECT;
1744 memory_region_unref(block->mr);
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001745 }
1746
1747 uffd_close_fd(uffd_fd);
1748 rs->uffdio_fd = -1;
1749 return -1;
1750}
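
/*
 * Illustrative sketch (not part of this file): the per-RAMBlock sequence in
 * ram_write_tracking_start() expressed with the raw Linux userfaultfd UAPI.
 * QEMU's uffd_register_memory() and uffd_change_protection() helpers are
 * assumed to wrap roughly these two ioctls.
 */
#include <linux/userfaultfd.h>
#include <stdint.h>
#include <sys/ioctl.h>

static int wp_protect_range(int uffd_fd, void *host, uint64_t len)
{
    struct uffdio_register reg = {
        .range = { .start = (uintptr_t)host, .len = len },
        .mode  = UFFDIO_REGISTER_MODE_WP,
    };
    struct uffdio_writeprotect wp = {
        .range = { .start = (uintptr_t)host, .len = len },
        .mode  = UFFDIO_WRITEPROTECT_MODE_WP,
    };

    /* Register the range for write-protect tracking... */
    if (ioctl(uffd_fd, UFFDIO_REGISTER, &reg) < 0) {
        return -1;
    }
    /* ...then actually turn write protection on for it. */
    if (ioctl(uffd_fd, UFFDIO_WRITEPROTECT, &wp) < 0) {
        return -1;
    }
    return 0;
}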
1751
1752/**
1753 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1754 */
1755void ram_write_tracking_stop(void)
1756{
1757 RAMState *rs = ram_state;
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001758 RAMBlock *block;
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001759
1760 RCU_READ_LOCK_GUARD();
1761
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001762 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1763 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001764 continue;
1765 }
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001766 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001767
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001768 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1769 block->host, block->max_length);
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001770
1771 /* Cleanup flags and remove reference */
Andrey Gruzdev82ea3e32021-04-01 12:22:26 +03001772 block->flags &= ~RAM_UF_WRITEPROTECT;
1773 memory_region_unref(block->mr);
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001774 }
1775
1776 /* Finally close UFFD file descriptor */
1777 uffd_close_fd(rs->uffdio_fd);
1778 rs->uffdio_fd = -1;
1779}
1780
1781#else
1782/* No target OS support, stubs just fail or ignore */
1783
1784static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1785{
1786 (void) rs;
1787 (void) offset;
1788
1789 return NULL;
1790}
1791
1792static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1793 unsigned long start_page)
1794{
1795 (void) rs;
1796 (void) pss;
1797 (void) start_page;
1798
1799 return 0;
1800}
1801
1802bool ram_write_tracking_available(void)
1803{
1804 return false;
1805}
1806
1807bool ram_write_tracking_compatible(void)
1808{
1809 assert(0);
1810 return false;
1811}
1812
1813int ram_write_tracking_start(void)
1814{
1815 assert(0);
1816 return -1;
1817}
1818
1819void ram_write_tracking_stop(void)
1820{
1821 assert(0);
1822}
1823#endif /* defined(__linux__) */
1824
Juan Quintela3d0684b2017-03-23 15:06:39 +01001825/**
Li Qiangff1543a2019-05-24 23:28:32 -07001826 * get_queued_page: unqueue a page from the postcopy requests
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00001827 *
Juan Quintela3d0684b2017-03-23 15:06:39 +01001828 * Skips pages that are already sent (!dirty)
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00001829 *
Wei Yanga5f7b1a2019-05-11 07:37:29 +08001830 * Returns true if a queued page is found
Juan Quintela3d0684b2017-03-23 15:06:39 +01001831 *
Juan Quintela6f37bb82017-03-13 19:26:29 +01001832 * @rs: current RAM state
Juan Quintela3d0684b2017-03-23 15:06:39 +01001833 * @pss: data about the state of the current dirty page scan
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00001834 */
Juan Quintelaf20e2862017-03-21 16:19:05 +01001835static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00001836{
1837 RAMBlock *block;
1838 ram_addr_t offset;
Thomas Huth777f53c2022-08-02 08:19:49 +02001839 bool dirty;
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00001840
Thomas Huth777f53c2022-08-02 08:19:49 +02001841 do {
1842 block = unqueue_page(rs, &offset);
1843 /*
1844 * We're sending this page, and since it's postcopy nothing else
1845 * will dirty it, and we must make sure it doesn't get sent again
1846 * even if this queue request was received after the background
1847 * search already sent it.
1848 */
1849 if (block) {
1850 unsigned long page;
1851
1852 page = offset >> TARGET_PAGE_BITS;
1853 dirty = test_bit(page, block->bmap);
1854 if (!dirty) {
1855 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1856 page);
1857 } else {
1858 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1859 }
1860 }
1861
1862 } while (block && !dirty);
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00001863
Peter Xub0621062022-10-11 17:55:58 -04001864 if (!block) {
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03001865 /*
1866 * Poll write faults too if background snapshot is enabled; that's
1867 * when vcpus can get blocked by the write-protected pages.
1868 */
1869 block = poll_fault_page(rs, &offset);
1870 }
1871
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00001872 if (block) {
1873 /*
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00001874 * We want the background search to continue from the queued page
1875 * since the guest is likely to want other pages near to the page
1876 * it just requested.
1877 */
1878 pss->block = block;
Juan Quintelaa935e302017-03-21 15:36:51 +01001879 pss->page = offset >> TARGET_PAGE_BITS;
Wei Yang422314e2019-06-05 09:08:28 +08001880
1881 /*
1882 * This unqueued page would break the "one round" check, even if
1883 * this is really rare.
1884 */
1885 pss->complete_round = false;
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00001886 }
1887
1888 return !!block;
1889}
1890
Juan Quintela56e93d22015-05-07 19:33:31 +02001891/**
Juan Quintela5e58f962017-04-03 22:06:54 +02001892 * migration_page_queue_free: drop any remaining pages in the ram
1893 * request queue
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00001894 *
Juan Quintela3d0684b2017-03-23 15:06:39 +01001895 * It should be empty at the end anyway, but in error cases there may
1896 * be some left. If there are any pages left, we drop them.
1897 *
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00001898 */
Juan Quintela83c13382017-05-04 11:45:01 +02001899static void migration_page_queue_free(RAMState *rs)
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00001900{
Juan Quintelaec481c62017-03-20 22:12:40 +01001901 struct RAMSrcPageRequest *mspr, *next_mspr;
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00001902 /* This queue generally should be empty - but in the case of a failed
1903 * migration it might have some droppings in it.
1904 */
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01001905 RCU_READ_LOCK_GUARD();
Juan Quintelaec481c62017-03-20 22:12:40 +01001906 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00001907 memory_region_unref(mspr->rb->mr);
Juan Quintelaec481c62017-03-20 22:12:40 +01001908 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00001909 g_free(mspr);
1910 }
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00001911}
1912
1913/**
Juan Quintela3d0684b2017-03-23 15:06:39 +01001914 * ram_save_queue_pages: queue the page for transmission
1915 *
1916 * A request from postcopy destination for example.
1917 *
1918 * Returns zero on success or negative on error
1919 *
Juan Quintela3d0684b2017-03-23 15:06:39 +01001920 * @rbname: Name of the RAMBlock of the request. NULL means the
1921 * same as the last one.
1922 * @start: starting address from the start of the RAMBlock
1923 * @len: length (in bytes) to send
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00001924 */
Juan Quintela96506892017-03-14 18:41:03 +01001925int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00001926{
1927 RAMBlock *ramblock;
Juan Quintela53518d92017-05-04 11:46:24 +02001928 RAMState *rs = ram_state;
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00001929
Juan Quintelaaff3f662023-04-26 19:37:19 +02001930 stat64_add(&mig_stats.postcopy_requests, 1);
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01001931 RCU_READ_LOCK_GUARD();
1932
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00001933 if (!rbname) {
1934 /* Reuse last RAMBlock */
Juan Quintela68a098f2017-03-14 13:48:42 +01001935 ramblock = rs->last_req_rb;
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00001936
1937 if (!ramblock) {
1938 /*
1939 * Shouldn't happen, we can't reuse the last RAMBlock if
1940 * it's the 1st request.
1941 */
1942 error_report("ram_save_queue_pages no previous block");
Daniel Henrique Barboza03acb4e2020-01-06 15:23:31 -03001943 return -1;
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00001944 }
1945 } else {
1946 ramblock = qemu_ram_block_by_name(rbname);
1947
1948 if (!ramblock) {
1949 /* We shouldn't be asked for a non-existent RAMBlock */
1950 error_report("ram_save_queue_pages no block '%s'", rbname);
Daniel Henrique Barboza03acb4e2020-01-06 15:23:31 -03001951 return -1;
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00001952 }
Juan Quintela68a098f2017-03-14 13:48:42 +01001953 rs->last_req_rb = ramblock;
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00001954 }
1955 trace_ram_save_queue_pages(ramblock->idstr, start, len);
David Hildenbrand542147f2021-04-29 13:27:08 +02001956 if (!offset_in_ramblock(ramblock, start + len - 1)) {
Juan Quintela9458ad62015-11-10 17:42:05 +01001957 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1958 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00001959 __func__, start, len, ramblock->used_length);
Daniel Henrique Barboza03acb4e2020-01-06 15:23:31 -03001960 return -1;
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00001961 }
1962
Peter Xu93589822022-10-11 17:55:57 -04001963 /*
1964 * When with postcopy preempt, we send back the page directly in the
1965 * rp-return thread.
1966 */
1967 if (postcopy_preempt_active()) {
1968 ram_addr_t page_start = start >> TARGET_PAGE_BITS;
1969 size_t page_size = qemu_ram_pagesize(ramblock);
1970 PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY];
1971 int ret = 0;
1972
1973 qemu_mutex_lock(&rs->bitmap_mutex);
1974
1975 pss_init(pss, ramblock, page_start);
1976 /*
1977 * Always use the preempt channel, and make sure it's there. It's
1978 * safe to access without the lock, because when the rp-thread is
1979 * running we should be the only one operating on the qemufile
1980 */
1981 pss->pss_channel = migrate_get_current()->postcopy_qemufile_src;
Peter Xu93589822022-10-11 17:55:57 -04001982 assert(pss->pss_channel);
1983
1984 /*
1985 * It must be either one host page or a multiple of the host page size. Just
1986 * assert; if something wrong we're mostly split brain anyway.
1987 */
1988 assert(len % page_size == 0);
1989 while (len) {
1990 if (ram_save_host_page_urgent(pss)) {
1991 error_report("%s: ram_save_host_page_urgent() failed: "
1992 "ramblock=%s, start_addr=0x"RAM_ADDR_FMT,
1993 __func__, ramblock->idstr, start);
1994 ret = -1;
1995 break;
1996 }
1997 /*
1998 * NOTE: after ram_save_host_page_urgent() succeeded, pss->page
1999 * will automatically be moved and point to the next host page
2000 * we're going to send, so no need to update here.
2001 *
2002 * Normally QEMU never sends >1 host page in requests, so
2003 * logically we don't even need that as the loop should only
2004 * run once, but just to be consistent.
2005 */
2006 len -= page_size;
2007 }
2008 qemu_mutex_unlock(&rs->bitmap_mutex);
2009
2010 return ret;
2011 }
2012
Juan Quintelaec481c62017-03-20 22:12:40 +01002013 struct RAMSrcPageRequest *new_entry =
Markus Armbrusterb21e2382022-03-15 15:41:56 +01002014 g_new0(struct RAMSrcPageRequest, 1);
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00002015 new_entry->rb = ramblock;
2016 new_entry->offset = start;
2017 new_entry->len = len;
2018
2019 memory_region_ref(ramblock->mr);
Juan Quintelaec481c62017-03-20 22:12:40 +01002020 qemu_mutex_lock(&rs->src_page_req_mutex);
2021 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
Dr. David Alan Gilberte03a34f2018-06-13 11:26:42 +01002022 migration_make_urgent_request();
Juan Quintelaec481c62017-03-20 22:12:40 +01002023 qemu_mutex_unlock(&rs->src_page_req_mutex);
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00002024
2025 return 0;
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00002026}
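
/*
 * Hypothetical usage sketch (not part of this file): queueing a single
 * faulting guest page on the source side. The RAMBlock name and offset are
 * made-up values chosen only for illustration.
 */
static void example_queue_one_page(void)
{
    const char *rbname = "pc.ram";   /* hypothetical RAMBlock name */
    ram_addr_t start = 0x200000;     /* hypothetical page-aligned offset */

    if (ram_save_queue_pages(rbname, start, TARGET_PAGE_SIZE) < 0) {
        /* The error has already been reported by ram_save_queue_pages(). */
    }
}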
2027
Xiao Guangrongd7400a32018-03-30 15:51:26 +08002028static bool save_page_use_compression(RAMState *rs)
2029{
Juan Quintelaa7a94d12023-03-01 22:03:48 +01002030 if (!migrate_compress()) {
Xiao Guangrongd7400a32018-03-30 15:51:26 +08002031 return false;
2032 }
2033
2034 /*
David Hildenbrand1a373522021-02-16 11:50:39 +01002035 * If xbzrle is enabled (e.g., after the first round of migration), stop
2036 * using the data compression. In theory, xbzrle can do better than
2037 * compression.
Xiao Guangrongd7400a32018-03-30 15:51:26 +08002038 */
Juan Quintelaf3095cc2023-05-04 13:53:23 +02002039 if (rs->xbzrle_started) {
David Hildenbrand1a373522021-02-16 11:50:39 +01002040 return false;
Xiao Guangrongd7400a32018-03-30 15:51:26 +08002041 }
2042
David Hildenbrand1a373522021-02-16 11:50:39 +01002043 return true;
Xiao Guangrongd7400a32018-03-30 15:51:26 +08002044}
2045
Xiao Guangrong5e5fdcf2018-08-21 16:10:24 +08002046/*
2047 * try to compress the page before posting it out, return true if the page
2048 * has been properly handled by compression, otherwise needs other
2049 * paths to handle it
2050 */
Peter Xuec6f3ab2022-10-11 17:55:56 -04002051static bool save_compress_page(RAMState *rs, PageSearchStatus *pss,
2052 RAMBlock *block, ram_addr_t offset)
Xiao Guangrong5e5fdcf2018-08-21 16:10:24 +08002053{
2054 if (!save_page_use_compression(rs)) {
2055 return false;
2056 }
2057
2058 /*
2059 * When starting the process of a new block, the first page of
2060 * the block should be sent out before other pages in the same
2061 * block, and all the pages in the last block should have been sent
2062 * out; keeping this order is important, because the 'cont' flag
2063 * is used to avoid resending the block name.
2064 *
2065 * We post the first page as a normal page because compression will take
2066 * a lot of CPU resources.
2067 */
Peter Xuec6f3ab2022-10-11 17:55:56 -04002068 if (block != pss->last_sent_block) {
Lukas Straubef4f5f52023-04-20 11:48:13 +02002069 ram_flush_compressed_data(rs);
Xiao Guangrong5e5fdcf2018-08-21 16:10:24 +08002070 return false;
2071 }
2072
Lukas Straubef4f5f52023-04-20 11:48:13 +02002073 if (compress_page_with_multi_thread(block, offset, send_queued_data) > 0) {
Xiao Guangrong5e5fdcf2018-08-21 16:10:24 +08002074 return true;
2075 }
2076
Xiao Guangrong76e03002018-09-06 15:01:00 +08002077 compression_counters.busy++;
Xiao Guangrong5e5fdcf2018-08-21 16:10:24 +08002078 return false;
2079}
2080
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002081/**
Juan Quintela4010ba32021-12-15 20:10:39 +01002082 * ram_save_target_page_legacy: save one target page
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002083 *
Juan Quintela3d0684b2017-03-23 15:06:39 +01002084 * Returns the number of pages written
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002085 *
Juan Quintela6f37bb82017-03-13 19:26:29 +01002086 * @rs: current RAM state
Juan Quintela3d0684b2017-03-23 15:06:39 +01002087 * @pss: data about the page we want to send
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002088 */
Juan Quintela4010ba32021-12-15 20:10:39 +01002089static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss)
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002090{
Xiao Guangronga8ec91f2018-03-30 15:51:25 +08002091 RAMBlock *block = pss->block;
Alexey Romko8bba0042020-01-10 14:51:34 +01002092 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
Xiao Guangronga8ec91f2018-03-30 15:51:25 +08002093 int res;
2094
Peter Xu61717ea2022-10-11 17:55:53 -04002095 if (control_save_page(pss, block, offset, &res)) {
Xiao Guangronga8ec91f2018-03-30 15:51:25 +08002096 return res;
2097 }
2098
Peter Xuec6f3ab2022-10-11 17:55:56 -04002099 if (save_compress_page(rs, pss, block, offset)) {
Xiao Guangrong5e5fdcf2018-08-21 16:10:24 +08002100 return 1;
Xiao Guangrongd7400a32018-03-30 15:51:26 +08002101 }
2102
Lukas Straub37502df2023-04-02 17:06:32 +00002103 res = save_zero_page(pss, pss->pss_channel, block, offset);
Xiao Guangrongd7400a32018-03-30 15:51:26 +08002104 if (res > 0) {
2105 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2106 * page would be stale
2107 */
Juan Quintelaf3095cc2023-05-04 13:53:23 +02002108 if (rs->xbzrle_started) {
Xiao Guangrongd7400a32018-03-30 15:51:26 +08002109 XBZRLE_cache_lock();
2110 xbzrle_cache_zero_page(rs, block->offset + offset);
2111 XBZRLE_cache_unlock();
2112 }
Xiao Guangrongd7400a32018-03-30 15:51:26 +08002113 return res;
2114 }
2115
Xiao Guangrongda3f56c2018-03-30 15:51:28 +08002116 /*
Peter Xu6f39c902022-10-04 14:24:30 -04002117 * Do not use multifd in postcopy, as one whole host page should be
2118 * placed at once. Postcopy also requires atomic updates of pages, so
2119 * even if host page size == guest page size, the running destination
2120 * guest may still see partially copied pages, which is data corruption.
Xiao Guangrongda3f56c2018-03-30 15:51:28 +08002121 */
Juan Quintela51b07542023-03-01 22:10:29 +01002122 if (migrate_multifd() && !migration_in_postcopy()) {
Peter Xu61717ea2022-10-11 17:55:53 -04002123 return ram_save_multifd_page(pss->pss_channel, block, offset);
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002124 }
2125
Juan Quintela05931ec2021-12-15 19:01:21 +01002126 return ram_save_page(rs, pss);
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002127}
2128
Peter Xud9e474e2022-10-11 17:55:52 -04002129/* Should be called before sending a host page */
2130static void pss_host_page_prepare(PageSearchStatus *pss)
2131{
2132 /* How many guest pages are there in one host page? */
2133 size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2134
2135 pss->host_page_sending = true;
Peter Xu301d7ff2023-01-20 11:31:47 -05002136 if (guest_pfns <= 1) {
2137 /*
2138 * This covers both when guest psize == host psize, or when guest
2139 * has larger psize than the host (guest_pfns==0).
2140 *
2141 * For the latter, we always send one whole guest page per
2142 * iteration of the host page (example: an Alpha VM on an x86 host
2143 * will have guest psize 8K while host psize 4K).
2144 */
2145 pss->host_page_start = pss->page;
2146 pss->host_page_end = pss->page + 1;
2147 } else {
2148 /*
2149 * The host page spans over multiple guest pages, we send them
2150 * within the same host page iteration.
2151 */
2152 pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
2153 pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
2154 }
Peter Xud9e474e2022-10-11 17:55:52 -04002155}
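
/*
 * Worked example for the arithmetic above (illustrative numbers, not taken
 * from this file): with a 2 MiB hugepage-backed RAMBlock and 4 KiB target
 * pages, guest_pfns = 2 MiB / 4 KiB = 512. If pss->page == 1000, then
 *     host_page_start = ROUND_DOWN(1000, 512)   = 512
 *     host_page_end   = ROUND_UP(1000 + 1, 512) = 1024
 * so this host-page iteration covers target pages [512, 1024).
 */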
2156
2157/*
2158 * Whether the page pointed by PSS is within the host page being sent.
2159 * Must be called after a previous pss_host_page_prepare().
2160 */
2161static bool pss_within_range(PageSearchStatus *pss)
2162{
2163 ram_addr_t ram_addr;
2164
2165 assert(pss->host_page_sending);
2166
2167 /* Over host-page boundary? */
2168 if (pss->page >= pss->host_page_end) {
2169 return false;
2170 }
2171
2172 ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2173
2174 return offset_in_ramblock(pss->block, ram_addr);
2175}
2176
2177static void pss_host_page_finish(PageSearchStatus *pss)
2178{
2179 pss->host_page_sending = false;
2180 /* This is not needed, but just to reset it */
2181 pss->host_page_start = pss->host_page_end = 0;
2182}
2183
Peter Xu93589822022-10-11 17:55:57 -04002184/*
2185 * Send an urgent host page specified by `pss'. Needs to be called with
2186 * bitmap_mutex held.
2187 *
2188 * Returns 0 if saving the host page succeeded, negative otherwise.
2189 */
2190static int ram_save_host_page_urgent(PageSearchStatus *pss)
2191{
2192 bool page_dirty, sent = false;
2193 RAMState *rs = ram_state;
2194 int ret = 0;
2195
2196 trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
2197 pss_host_page_prepare(pss);
2198
2199 /*
2200 * If precopy is sending the same page, let it be done in precopy, or
2201 * we could send the same page in two channels and none of them will
2202 * receive the whole page.
2203 */
2204 if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) {
2205 trace_postcopy_preempt_hit(pss->block->idstr,
2206 pss->page << TARGET_PAGE_BITS);
2207 return 0;
2208 }
2209
2210 do {
2211 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2212
2213 if (page_dirty) {
2214 /* Be strict about the return code; it must be 1 (one page sent). */
Juan Quintela4010ba32021-12-15 20:10:39 +01002215 if (migration_ops->ram_save_target_page(rs, pss) != 1) {
Peter Xu93589822022-10-11 17:55:57 -04002216 error_report_once("%s: ram_save_target_page failed", __func__);
2217 ret = -1;
2218 goto out;
2219 }
2220 sent = true;
2221 }
2222 pss_find_next_dirty(pss);
2223 } while (pss_within_range(pss));
2224out:
2225 pss_host_page_finish(pss);
2226 /* For urgent requests, flush immediately if sent */
2227 if (sent) {
2228 qemu_fflush(pss->pss_channel);
2229 }
2230 return ret;
2231}
2232
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002233/**
Juan Quintela3d0684b2017-03-23 15:06:39 +01002234 * ram_save_host_page: save a whole host page
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002235 *
Juan Quintela3d0684b2017-03-23 15:06:39 +01002236 * Starting at the page pointed to by @pss, send pages up to the end of the
2237 * current host page. It's valid for the starting page to point into the
2238 * middle of a host page, in which case the remainder of the host page is sent.
2239 * Only dirty target pages are sent. Note that the host page size may
2240 * be a huge page for this block.
Peter Xuf3321552022-10-11 17:55:50 -04002241 *
Dr. David Alan Gilbert1eb3fc02017-05-17 17:58:09 +01002242 * The saving stops at the boundary of the used_length of the block
2243 * if the RAMBlock isn't a multiple of the host page size.
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002244 *
Peter Xuf3321552022-10-11 17:55:50 -04002245 * The caller must hold ram_state.bitmap_mutex when calling this
2246 * function. Note that this function can temporarily release the lock, but
2247 * when the function is returned it'll make sure the lock is still held.
2248 *
Juan Quintela3d0684b2017-03-23 15:06:39 +01002249 * Returns the number of pages written or negative on error
2250 *
Juan Quintela6f37bb82017-03-13 19:26:29 +01002251 * @rs: current RAM state
Juan Quintela3d0684b2017-03-23 15:06:39 +01002252 * @pss: data about the page we want to send
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002253 */
Juan Quintela05931ec2021-12-15 19:01:21 +01002254static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002255{
Peter Xuf3321552022-10-11 17:55:50 -04002256 bool page_dirty, preempt_active = postcopy_preempt_active();
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002257 int tmppages, pages = 0;
Juan Quintelaa935e302017-03-21 15:36:51 +01002258 size_t pagesize_bits =
2259 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03002260 unsigned long start_page = pss->page;
2261 int res;
Dr. David Alan Gilbert4c011c32017-02-24 18:28:39 +00002262
David Hildenbrandf161c882023-07-06 09:56:08 +02002263 if (migrate_ram_is_ignored(pss->block)) {
Cédric Le Goaterb895de52018-05-14 08:57:00 +02002264 error_report("block %s should not be migrated !", pss->block->idstr);
2265 return 0;
2266 }
2267
Peter Xud9e474e2022-10-11 17:55:52 -04002268 /* Update host page boundary information */
2269 pss_host_page_prepare(pss);
2270
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002271 do {
Peter Xuf3321552022-10-11 17:55:50 -04002272 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
Xiao Guangrong1faa5662018-03-30 15:51:24 +08002273
Peter Xuf3321552022-10-11 17:55:50 -04002274 /* Check whether the page is dirty and, if it is, send it */
2275 if (page_dirty) {
Kunkun Jiangba1b7c82021-03-16 20:57:16 +08002276 /*
Peter Xuf3321552022-10-11 17:55:50 -04002277 * Properly yield the lock only in postcopy preempt mode
2278 * because both migration thread and rp-return thread can
2279 * operate on the bitmaps.
Kunkun Jiangba1b7c82021-03-16 20:57:16 +08002280 */
Peter Xuf3321552022-10-11 17:55:50 -04002281 if (preempt_active) {
2282 qemu_mutex_unlock(&rs->bitmap_mutex);
Kunkun Jiangba1b7c82021-03-16 20:57:16 +08002283 }
Juan Quintela4010ba32021-12-15 20:10:39 +01002284 tmppages = migration_ops->ram_save_target_page(rs, pss);
Peter Xuf3321552022-10-11 17:55:50 -04002285 if (tmppages >= 0) {
2286 pages += tmppages;
2287 /*
2288 * Allow rate limiting to happen in the middle of huge pages if
2289 * something is sent in the current iteration.
2290 */
2291 if (pagesize_bits > 1 && tmppages > 0) {
2292 migration_rate_limit();
2293 }
2294 }
2295 if (preempt_active) {
2296 qemu_mutex_lock(&rs->bitmap_mutex);
2297 }
2298 } else {
2299 tmppages = 0;
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002300 }
Peter Xuf3321552022-10-11 17:55:50 -04002301
2302 if (tmppages < 0) {
Peter Xud9e474e2022-10-11 17:55:52 -04002303 pss_host_page_finish(pss);
Peter Xuf3321552022-10-11 17:55:50 -04002304 return tmppages;
2305 }
2306
Peter Xud9e474e2022-10-11 17:55:52 -04002307 pss_find_next_dirty(pss);
2308 } while (pss_within_range(pss));
2309
2310 pss_host_page_finish(pss);
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03002311
2312 res = ram_save_release_protection(rs, pss, start_page);
2313 return (res < 0 ? res : pages);
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002314}
Dr. David Alan Gilbert6c595cd2015-11-05 18:11:08 +00002315
2316/**
Juan Quintela3d0684b2017-03-23 15:06:39 +01002317 * ram_find_and_save_block: finds a dirty page and sends it to f
Juan Quintela56e93d22015-05-07 19:33:31 +02002318 *
2319 * Called within an RCU critical section.
2320 *
Xiao Guangronge8f37352018-09-03 17:26:44 +08002321 * Returns the number of pages written where zero means no dirty pages,
2322 * or negative on error
Juan Quintela56e93d22015-05-07 19:33:31 +02002323 *
Juan Quintela6f37bb82017-03-13 19:26:29 +01002324 * @rs: current RAM state
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002325 *
2326 * On systems where host-page-size > target-page-size it will send all the
2327 * pages in a host page that are dirty.
Juan Quintela56e93d22015-05-07 19:33:31 +02002328 */
Juan Quintela05931ec2021-12-15 19:01:21 +01002329static int ram_find_and_save_block(RAMState *rs)
Juan Quintela56e93d22015-05-07 19:33:31 +02002330{
Peter Xuf1668762022-10-11 17:55:55 -04002331 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
Juan Quintela56e93d22015-05-07 19:33:31 +02002332 int pages = 0;
Juan Quintela56e93d22015-05-07 19:33:31 +02002333
Ashijeet Acharya0827b9e2017-02-08 19:58:45 +05302334 /* No dirty page as there is zero RAM */
Juan Quintela8d80e192022-05-10 19:37:36 +02002335 if (!rs->ram_bytes_total) {
Ashijeet Acharya0827b9e2017-02-08 19:58:45 +05302336 return pages;
2337 }
2338
Peter Xu4934a5d2022-10-04 14:24:26 -04002339 /*
2340 * Always keep last_seen_block/last_page valid during this procedure,
2341 * because find_dirty_block() relies on these values (e.g., we compare
2342 * last_seen_block with pss.block to see whether we searched all the
2343 * ramblocks) to detect the completion of migration. Having NULL value
2344 * of last_seen_block can conditionally cause below loop to run forever.
2345 */
2346 if (!rs->last_seen_block) {
2347 rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks);
2348 rs->last_page = 0;
2349 }
2350
Peter Xuf1668762022-10-11 17:55:55 -04002351 pss_init(pss, rs->last_seen_block, rs->last_page);
Dr. David Alan Gilbertb8fb8cb2015-09-23 15:27:10 +01002352
Juan Quintela31e2ac72022-06-21 13:29:36 +02002353 while (true) {
Juan Quintela51efd362022-06-21 13:20:35 +02002354 if (!get_queued_page(rs, pss)) {
Peter Xub0621062022-10-11 17:55:58 -04002355 /* priority queue empty, so just search for something dirty */
Juan Quintela31e2ac72022-06-21 13:29:36 +02002356 int res = find_dirty_block(rs, pss);
2357 if (res != PAGE_DIRTY_FOUND) {
2358 if (res == PAGE_ALL_CLEAN) {
Juan Quintela51efd362022-06-21 13:20:35 +02002359 break;
Juan Quintela31e2ac72022-06-21 13:29:36 +02002360 } else if (res == PAGE_TRY_AGAIN) {
2361 continue;
Juan Quintela294e5a42022-06-21 13:36:11 +02002362 } else if (res < 0) {
2363 pages = res;
2364 break;
Juan Quintela51efd362022-06-21 13:20:35 +02002365 }
2366 }
Dr. David Alan Gilberta82d5932015-11-05 18:11:09 +00002367 }
Juan Quintela51efd362022-06-21 13:20:35 +02002368 pages = ram_save_host_page(rs, pss);
Juan Quintela31e2ac72022-06-21 13:29:36 +02002369 if (pages) {
2370 break;
2371 }
2372 }
Juan Quintela56e93d22015-05-07 19:33:31 +02002373
Peter Xuf1668762022-10-11 17:55:55 -04002374 rs->last_seen_block = pss->block;
2375 rs->last_page = pss->page;
Juan Quintela56e93d22015-05-07 19:33:31 +02002376
2377 return pages;
2378}
2379
Juan Quintela8008a272022-05-10 19:18:19 +02002380static uint64_t ram_bytes_total_with_ignored(void)
Juan Quintela56e93d22015-05-07 19:33:31 +02002381{
2382 RAMBlock *block;
2383 uint64_t total = 0;
2384
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01002385 RCU_READ_LOCK_GUARD();
2386
Juan Quintela8008a272022-05-10 19:18:19 +02002387 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2388 total += block->used_length;
Peter Xu99e15582017-05-12 12:17:39 +08002389 }
Juan Quintela56e93d22015-05-07 19:33:31 +02002390 return total;
2391}
2392
Yury Kotovfbd162e2019-02-15 20:45:46 +03002393uint64_t ram_bytes_total(void)
2394{
Juan Quintela8008a272022-05-10 19:18:19 +02002395 RAMBlock *block;
2396 uint64_t total = 0;
2397
2398 RCU_READ_LOCK_GUARD();
2399
2400 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2401 total += block->used_length;
2402 }
2403 return total;
Yury Kotovfbd162e2019-02-15 20:45:46 +03002404}
2405
Juan Quintelaf265e0e2017-06-28 11:52:27 +02002406static void xbzrle_load_setup(void)
Juan Quintela56e93d22015-05-07 19:33:31 +02002407{
Juan Quintelaf265e0e2017-06-28 11:52:27 +02002408 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
Juan Quintela56e93d22015-05-07 19:33:31 +02002409}
2410
Juan Quintelaf265e0e2017-06-28 11:52:27 +02002411static void xbzrle_load_cleanup(void)
2412{
2413 g_free(XBZRLE.decoded_buf);
2414 XBZRLE.decoded_buf = NULL;
2415}
2416
Peter Xu7d7c96b2017-10-19 14:31:58 +08002417static void ram_state_cleanup(RAMState **rsp)
2418{
Dr. David Alan Gilbertb9ccaf62018-02-12 16:03:39 +00002419 if (*rsp) {
2420 migration_page_queue_free(*rsp);
2421 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2422 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2423 g_free(*rsp);
2424 *rsp = NULL;
2425 }
Peter Xu7d7c96b2017-10-19 14:31:58 +08002426}
2427
Peter Xu84593a02017-10-19 14:31:59 +08002428static void xbzrle_cleanup(void)
2429{
2430 XBZRLE_cache_lock();
2431 if (XBZRLE.cache) {
2432 cache_fini(XBZRLE.cache);
2433 g_free(XBZRLE.encoded_buf);
2434 g_free(XBZRLE.current_buf);
2435 g_free(XBZRLE.zero_target_page);
2436 XBZRLE.cache = NULL;
2437 XBZRLE.encoded_buf = NULL;
2438 XBZRLE.current_buf = NULL;
2439 XBZRLE.zero_target_page = NULL;
2440 }
2441 XBZRLE_cache_unlock();
2442}
2443
Juan Quintelaf265e0e2017-06-28 11:52:27 +02002444static void ram_save_cleanup(void *opaque)
Juan Quintela56e93d22015-05-07 19:33:31 +02002445{
Juan Quintela53518d92017-05-04 11:46:24 +02002446 RAMState **rsp = opaque;
Juan Quintela6b6712e2017-03-22 15:18:04 +01002447 RAMBlock *block;
Juan Quintelaeb859c52017-03-13 21:51:55 +01002448
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03002449 /* We don't use dirty log with background snapshots */
2450 if (!migrate_background_snapshot()) {
2451 /* the caller holds the iothread lock or is in a bh, so there is
2452 * no writing race against the migration bitmap
2453 */
Hyman Huang(黄勇)63b41db2021-06-29 16:01:19 +00002454 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2455 /*
2456 * do not stop dirty log without starting it, since
2457 * memory_global_dirty_log_stop will assert that
2458 * memory_global_dirty_log_start/stop are used in pairs
2459 */
2460 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2461 }
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03002462 }
Juan Quintela6b6712e2017-03-22 15:18:04 +01002463
Yury Kotovfbd162e2019-02-15 20:45:46 +03002464 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
Peter Xu002cad62019-06-03 14:50:56 +08002465 g_free(block->clear_bmap);
2466 block->clear_bmap = NULL;
Juan Quintela6b6712e2017-03-22 15:18:04 +01002467 g_free(block->bmap);
2468 block->bmap = NULL;
Juan Quintela56e93d22015-05-07 19:33:31 +02002469 }
2470
Peter Xu84593a02017-10-19 14:31:59 +08002471 xbzrle_cleanup();
Juan Quintelaf0afa332017-06-28 11:52:28 +02002472 compress_threads_save_cleanup();
Peter Xu7d7c96b2017-10-19 14:31:58 +08002473 ram_state_cleanup(rsp);
Juan Quintela4010ba32021-12-15 20:10:39 +01002474 g_free(migration_ops);
2475 migration_ops = NULL;
Juan Quintela56e93d22015-05-07 19:33:31 +02002476}
2477
Juan Quintela6f37bb82017-03-13 19:26:29 +01002478static void ram_state_reset(RAMState *rs)
Juan Quintela56e93d22015-05-07 19:33:31 +02002479{
Peter Xuec6f3ab2022-10-11 17:55:56 -04002480 int i;
2481
2482 for (i = 0; i < RAM_CHANNEL_MAX; i++) {
2483 rs->pss[i].last_sent_block = NULL;
2484 }
2485
Juan Quintela6f37bb82017-03-13 19:26:29 +01002486 rs->last_seen_block = NULL;
Juan Quintela269ace22017-03-21 15:23:31 +01002487 rs->last_page = 0;
Juan Quintela6f37bb82017-03-13 19:26:29 +01002488 rs->last_version = ram_list.version;
Juan Quintelaf3095cc2023-05-04 13:53:23 +02002489 rs->xbzrle_started = false;
Juan Quintela56e93d22015-05-07 19:33:31 +02002490}
2491
2492#define MAX_WAIT 50 /* ms, half buffered_file limit */
2493
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002494/* **** functions for postcopy ***** */
2495
Pavel Butsykinced1c612017-02-03 18:23:21 +03002496void ram_postcopy_migrated_memory_release(MigrationState *ms)
2497{
2498 struct RAMBlock *block;
Pavel Butsykinced1c612017-02-03 18:23:21 +03002499
Yury Kotovfbd162e2019-02-15 20:45:46 +03002500 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
Juan Quintela6b6712e2017-03-22 15:18:04 +01002501 unsigned long *bitmap = block->bmap;
2502 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2503 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
Pavel Butsykinced1c612017-02-03 18:23:21 +03002504
2505 while (run_start < range) {
2506 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
Alexey Romko8bba0042020-01-10 14:51:34 +01002507 ram_discard_range(block->idstr,
2508 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2509 ((ram_addr_t)(run_end - run_start))
2510 << TARGET_PAGE_BITS);
Pavel Butsykinced1c612017-02-03 18:23:21 +03002511 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2512 }
2513 }
2514}
2515
Juan Quintela3d0684b2017-03-23 15:06:39 +01002516/**
2517 * postcopy_send_discard_bm_ram: discard a RAMBlock
2518 *
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002519 * Callback from postcopy_each_ram_send_discard for each RAMBlock
Juan Quintela3d0684b2017-03-23 15:06:39 +01002520 *
2521 * @ms: current migration state
Wei Yang89dab312019-07-15 10:05:49 +08002522 * @block: RAMBlock to discard
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002523 */
Philippe Mathieu-Daudé9e7d1222021-12-30 17:05:25 +01002524static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002525{
Juan Quintela6b6712e2017-03-22 15:18:04 +01002526 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002527 unsigned long current;
Wei Yang1e7cf8c2019-08-19 14:18:42 +08002528 unsigned long *bitmap = block->bmap;
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002529
Juan Quintela6b6712e2017-03-22 15:18:04 +01002530 for (current = 0; current < end; ) {
Wei Yang1e7cf8c2019-08-19 14:18:42 +08002531 unsigned long one = find_next_bit(bitmap, end, current);
Wei Yang33a5cb622019-06-27 10:08:21 +08002532 unsigned long zero, discard_length;
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002533
Wei Yang33a5cb622019-06-27 10:08:21 +08002534 if (one >= end) {
2535 break;
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002536 }
Wei Yang33a5cb622019-06-27 10:08:21 +08002537
Wei Yang1e7cf8c2019-08-19 14:18:42 +08002538 zero = find_next_zero_bit(bitmap, end, one + 1);
Wei Yang33a5cb622019-06-27 10:08:21 +08002539
2540 if (zero >= end) {
2541 discard_length = end - one;
2542 } else {
2543 discard_length = zero - one;
2544 }
Wei Yang810cf2b2019-07-24 09:07:21 +08002545 postcopy_discard_send_range(ms, one, discard_length);
Wei Yang33a5cb622019-06-27 10:08:21 +08002546 current = one + discard_length;
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002547 }
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002548}
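/*
 * For example, with a hypothetical 8-page block whose dirty bitmap is
 * 0b00111001 (bit 0 and bits 3..5 set), the loop above emits
 * postcopy_discard_send_range(ms, 0, 1) and
 * postcopy_discard_send_range(ms, 3, 3): one call per contiguous run
 * of dirty bits, expressed as page indexes within the block.
 */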
2549
Peter Xuf30c2e52021-12-07 19:50:13 +08002550static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2551
Juan Quintela3d0684b2017-03-23 15:06:39 +01002552/**
2553 * postcopy_each_ram_send_discard: discard all RAMBlocks
2554 *
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002555 * Utility for the outgoing postcopy code.
2556 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2557 * passing it bitmap indexes and name.
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002558 * (qemu_ram_foreach_block ends up passing unscaled lengths
2559 * which would mean postcopy code would have to deal with target page)
Juan Quintela3d0684b2017-03-23 15:06:39 +01002560 *
2561 * @ms: current migration state
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002562 */
Peter Xu739fcc12021-12-07 19:50:14 +08002563static void postcopy_each_ram_send_discard(MigrationState *ms)
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002564{
2565 struct RAMBlock *block;
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002566
Yury Kotovfbd162e2019-02-15 20:45:46 +03002567 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
Wei Yang810cf2b2019-07-24 09:07:21 +08002568 postcopy_discard_send_init(ms, block->idstr);
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002569
2570 /*
Peter Xuf30c2e52021-12-07 19:50:13 +08002571 * Deal with TPS != HPS and huge pages. It discards any partially sent
2572 * host-page size chunks and marks any partially dirty host-page size
2573 * chunks as all dirty. In this case the host page is the host page
2574 * for the particular RAMBlock, i.e. it might be a huge page.
2575 */
2576 postcopy_chunk_hostpages_pass(ms, block);
2577
2578 /*
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002579 * Postcopy sends chunks of bitmap over the wire, but it
2580 * just needs indexes at this point, which avoids it having
2581 * target-page-specific code.
2582 */
Peter Xu739fcc12021-12-07 19:50:14 +08002583 postcopy_send_discard_bm_ram(ms, block);
Wei Yang810cf2b2019-07-24 09:07:21 +08002584 postcopy_discard_send_finish(ms);
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002585 }
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002586}
2587
Juan Quintela3d0684b2017-03-23 15:06:39 +01002588/**
Wei Yang8324ef82019-08-19 14:18:41 +08002589 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
Dr. David Alan Gilbert99e314e2015-11-05 18:11:15 +00002590 *
Juan Quintela3d0684b2017-03-23 15:06:39 +01002591 * Helper for postcopy_chunk_hostpages; it's called twice to
2592 * canonicalize the two bitmaps, which are similar, but one is
2593 * inverted.
Dr. David Alan Gilbert99e314e2015-11-05 18:11:15 +00002594 *
Juan Quintela3d0684b2017-03-23 15:06:39 +01002595 * Postcopy requires that all target pages in a hostpage are dirty or
2596 * clean, not a mix. This function canonicalizes the bitmaps.
2597 *
2598 * @ms: current migration state
Juan Quintela3d0684b2017-03-23 15:06:39 +01002599 * @block: block that contains the page we want to canonicalize
Dr. David Alan Gilbert99e314e2015-11-05 18:11:15 +00002600 */
Wei Yang1e7cf8c2019-08-19 14:18:42 +08002601static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
Dr. David Alan Gilbert99e314e2015-11-05 18:11:15 +00002602{
Juan Quintela53518d92017-05-04 11:46:24 +02002603 RAMState *rs = ram_state;
Juan Quintela6b6712e2017-03-22 15:18:04 +01002604 unsigned long *bitmap = block->bmap;
Dr. David Alan Gilbert29c59172017-02-24 18:28:31 +00002605 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
Juan Quintela6b6712e2017-03-22 15:18:04 +01002606 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
Dr. David Alan Gilbert99e314e2015-11-05 18:11:15 +00002607 unsigned long run_start;
2608
Dr. David Alan Gilbert29c59172017-02-24 18:28:31 +00002609 if (block->page_size == TARGET_PAGE_SIZE) {
2610 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2611 return;
2612 }
2613
Wei Yang1e7cf8c2019-08-19 14:18:42 +08002614 /* Find a dirty page */
2615 run_start = find_next_bit(bitmap, pages, 0);
Dr. David Alan Gilbert99e314e2015-11-05 18:11:15 +00002616
Juan Quintela6b6712e2017-03-22 15:18:04 +01002617 while (run_start < pages) {
Dr. David Alan Gilbert99e314e2015-11-05 18:11:15 +00002618
2619 /*
2620 * If the start of this run of pages is in the middle of a host
2621 * page, then we need to fix up this host page.
2622 */
Wei Yang9dec3cc2019-08-06 08:46:48 +08002623 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
Dr. David Alan Gilbert99e314e2015-11-05 18:11:15 +00002624 /* Find the end of this run */
Wei Yang1e7cf8c2019-08-19 14:18:42 +08002625 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
Dr. David Alan Gilbert99e314e2015-11-05 18:11:15 +00002626 /*
2627 * If the end isn't at the start of a host page, then the
2628 * run doesn't finish at the end of a host page
2629 * and we need to discard.
2630 */
Dr. David Alan Gilbert99e314e2015-11-05 18:11:15 +00002631 }
2632
Wei Yang9dec3cc2019-08-06 08:46:48 +08002633 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
Dr. David Alan Gilbert99e314e2015-11-05 18:11:15 +00002634 unsigned long page;
Wei Yangdad45ab2019-08-06 08:46:47 +08002635 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2636 host_ratio);
2637 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
Dr. David Alan Gilbert99e314e2015-11-05 18:11:15 +00002638
Dr. David Alan Gilbert99e314e2015-11-05 18:11:15 +00002639 /* Clean up the bitmap */
2640 for (page = fixup_start_addr;
2641 page < fixup_start_addr + host_ratio; page++) {
Dr. David Alan Gilbert99e314e2015-11-05 18:11:15 +00002642 /*
2643 * Remark them as dirty, updating the count for any pages
2644 * that weren't previously dirty.
2645 */
Juan Quintela0d8ec882017-03-13 21:21:41 +01002646 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
Dr. David Alan Gilbert99e314e2015-11-05 18:11:15 +00002647 }
2648 }
2649
Wei Yang1e7cf8c2019-08-19 14:18:42 +08002650 /* Find the next dirty page for the next iteration */
2651 run_start = find_next_bit(bitmap, pages, run_start);
Dr. David Alan Gilbert99e314e2015-11-05 18:11:15 +00002652 }
2653}
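/*
 * As an illustration, assume 4 KiB target pages on 2 MiB hugepages,
 * i.e. host_ratio == 512.  If the first dirty target page in a block
 * is page 700, it sits in the middle of the hugepage covering pages
 * 512..1023, so that whole range is re-marked dirty above (bumping
 * migration_dirty_pages for every bit that was not already set) and
 * the scan resumes at page 1024.
 */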
2654
Juan Quintela3d0684b2017-03-23 15:06:39 +01002655/**
Juan Quintela3d0684b2017-03-23 15:06:39 +01002656 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2657 *
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002658 * Transmit the set of pages to be discarded after precopy to the target;
2659 * these are pages that:
2660 * a) Have been previously transmitted but are now dirty again
2661 * b) Pages that have never been transmitted, this ensures that
2662 * any pages on the destination that have been mapped by background
2663 * tasks get discarded (transparent huge pages is the specific concern)
2664 * Hopefully this is pretty sparse
Juan Quintela3d0684b2017-03-23 15:06:39 +01002665 *
2666 * @ms: current migration state
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002667 */
Peter Xu739fcc12021-12-07 19:50:14 +08002668void ram_postcopy_send_discard_bitmap(MigrationState *ms)
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002669{
Juan Quintela53518d92017-05-04 11:46:24 +02002670 RAMState *rs = ram_state;
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002671
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01002672 RCU_READ_LOCK_GUARD();
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002673
2674 /* This should be our last sync, the src is now paused */
Gavin Shan1e493be2023-05-09 12:21:19 +10002675 migration_bitmap_sync(rs, false);
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002676
Juan Quintela6b6712e2017-03-22 15:18:04 +01002677 /* Easiest way to make sure we don't resume in the middle of a host-page */
Peter Xuec6f3ab2022-10-11 17:55:56 -04002678 rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL;
Juan Quintela6b6712e2017-03-22 15:18:04 +01002679 rs->last_seen_block = NULL;
Juan Quintela6b6712e2017-03-22 15:18:04 +01002680 rs->last_page = 0;
2681
Peter Xu739fcc12021-12-07 19:50:14 +08002682 postcopy_each_ram_send_discard(ms);
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002683
Peter Xu739fcc12021-12-07 19:50:14 +08002684 trace_ram_postcopy_send_discard_bitmap();
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002685}
2686
Juan Quintela3d0684b2017-03-23 15:06:39 +01002687/**
2688 * ram_discard_range: discard dirtied pages at the beginning of postcopy
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002689 *
Juan Quintela3d0684b2017-03-23 15:06:39 +01002690 * Returns zero on success
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002691 *
Juan Quintela36449152017-03-23 15:11:59 +01002692 * @rbname: name of the RAMBlock of the request. NULL means the
2693 * same as the last one.
Juan Quintela3d0684b2017-03-23 15:06:39 +01002694 * @start: starting offset within the RAMBlock, in bytes
2695 * @length: number of bytes to discard
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002696 */
Juan Quintelaaaa20642017-03-21 11:35:24 +01002697int ram_discard_range(const char *rbname, uint64_t start, size_t length)
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002698{
Juan Quintela36449152017-03-23 15:11:59 +01002699 trace_ram_discard_range(rbname, start, length);
Dr. David Alan Gilbertd3a50382017-02-24 18:28:32 +00002700
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01002701 RCU_READ_LOCK_GUARD();
Juan Quintela36449152017-03-23 15:11:59 +01002702 RAMBlock *rb = qemu_ram_block_by_name(rbname);
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002703
2704 if (!rb) {
Juan Quintela36449152017-03-23 15:11:59 +01002705 error_report("ram_discard_range: Failed to find block '%s'", rbname);
Daniel Henrique Barboza03acb4e2020-01-06 15:23:31 -03002706 return -1;
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002707 }
2708
Peter Xu814bb082018-07-23 20:33:02 +08002709 /*
2710 * On the source VM, we don't need to update the received bitmap since
2711 * we don't even have one.
2712 */
2713 if (rb->receivedmap) {
2714 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2715 length >> qemu_target_page_bits());
2716 }
2717
Daniel Henrique Barboza03acb4e2020-01-06 15:23:31 -03002718 return ram_block_discard_range(rb, start, length);
Dr. David Alan Gilberte0b266f2015-11-05 18:11:02 +00002719}
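/*
 * For example, ram_discard_range("pc.ram", 0x200000, 0x200000) (block
 * name and offsets purely illustrative) discards 2 MiB of that block
 * starting at offset 2 MiB and, when a receivedmap exists (i.e. on the
 * destination), clears the matching receivedmap bits first.
 */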
2720
Peter Xu84593a02017-10-19 14:31:59 +08002721/*
2722 * For every allocation, we will try not to crash the VM if the
2723 * allocation fails.
2724 */
2725static int xbzrle_init(void)
2726{
2727 Error *local_err = NULL;
2728
Juan Quintela87dca0c2023-03-01 22:20:13 +01002729 if (!migrate_xbzrle()) {
Peter Xu84593a02017-10-19 14:31:59 +08002730 return 0;
2731 }
2732
2733 XBZRLE_cache_lock();
2734
2735 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2736 if (!XBZRLE.zero_target_page) {
2737 error_report("%s: Error allocating zero page", __func__);
2738 goto err_out;
2739 }
2740
2741 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2742 TARGET_PAGE_SIZE, &local_err);
2743 if (!XBZRLE.cache) {
2744 error_report_err(local_err);
2745 goto free_zero_page;
2746 }
2747
2748 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2749 if (!XBZRLE.encoded_buf) {
2750 error_report("%s: Error allocating encoded_buf", __func__);
2751 goto free_cache;
2752 }
2753
2754 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2755 if (!XBZRLE.current_buf) {
2756 error_report("%s: Error allocating current_buf", __func__);
2757 goto free_encoded_buf;
2758 }
2759
2760 /* We are all good */
2761 XBZRLE_cache_unlock();
2762 return 0;
2763
2764free_encoded_buf:
2765 g_free(XBZRLE.encoded_buf);
2766 XBZRLE.encoded_buf = NULL;
2767free_cache:
2768 cache_fini(XBZRLE.cache);
2769 XBZRLE.cache = NULL;
2770free_zero_page:
2771 g_free(XBZRLE.zero_target_page);
2772 XBZRLE.zero_target_page = NULL;
2773err_out:
2774 XBZRLE_cache_unlock();
2775 return -ENOMEM;
2776}
2777
Juan Quintela53518d92017-05-04 11:46:24 +02002778static int ram_state_init(RAMState **rsp)
Juan Quintela56e93d22015-05-07 19:33:31 +02002779{
Peter Xu7d00ee62017-10-19 14:31:57 +08002780 *rsp = g_try_new0(RAMState, 1);
2781
2782 if (!*rsp) {
2783 error_report("%s: Init ramstate fail", __func__);
2784 return -1;
2785 }
Juan Quintela53518d92017-05-04 11:46:24 +02002786
2787 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2788 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2789 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
Juan Quintela8d80e192022-05-10 19:37:36 +02002790 (*rsp)->ram_bytes_total = ram_bytes_total();
Juan Quintela56e93d22015-05-07 19:33:31 +02002791
Peter Xu7d00ee62017-10-19 14:31:57 +08002792 /*
Ivan Ren40c4d4a2019-07-14 22:51:19 +08002793 * Count the total number of pages used by ram blocks not including any
2794 * gaps due to alignment or unplugs.
Wei Yang03158512019-06-04 14:17:27 +08002795 * This must match the initial value of the dirty bitmap.
Peter Xu7d00ee62017-10-19 14:31:57 +08002796 */
Juan Quintela8d80e192022-05-10 19:37:36 +02002797 (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS;
Peter Xu7d00ee62017-10-19 14:31:57 +08002798 ram_state_reset(*rsp);
2799
2800 return 0;
2801}
2802
Peter Xud6eff5d2017-10-19 14:32:00 +08002803static void ram_list_init_bitmaps(void)
2804{
Peter Xu002cad62019-06-03 14:50:56 +08002805 MigrationState *ms = migrate_get_current();
Peter Xud6eff5d2017-10-19 14:32:00 +08002806 RAMBlock *block;
2807 unsigned long pages;
Peter Xu002cad62019-06-03 14:50:56 +08002808 uint8_t shift;
Peter Xud6eff5d2017-10-19 14:32:00 +08002809
2810 /* Skip setting bitmap if there is no RAM */
2811 if (ram_bytes_total()) {
Peter Xu002cad62019-06-03 14:50:56 +08002812 shift = ms->clear_bitmap_shift;
2813 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2814 error_report("clear_bitmap_shift (%u) too big, using "
2815 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2816 shift = CLEAR_BITMAP_SHIFT_MAX;
2817 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2818 error_report("clear_bitmap_shift (%u) too small, using "
2819 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2820 shift = CLEAR_BITMAP_SHIFT_MIN;
2821 }
2822
Yury Kotovfbd162e2019-02-15 20:45:46 +03002823 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
Peter Xud6eff5d2017-10-19 14:32:00 +08002824 pages = block->max_length >> TARGET_PAGE_BITS;
Wei Yang03158512019-06-04 14:17:27 +08002825 /*
2826 * The initial dirty bitmap for migration must be set with all
2827 * ones to make sure we'll migrate every guest RAM page to
2828 * the destination.
Ivan Ren40c4d4a2019-07-14 22:51:19 +08002829 * Here we set RAMBlock.bmap all to 1 because when we restart a
2830 * migration after a failed one, ram_list.
2831 * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't cover the whole
2832 * guest memory.
Wei Yang03158512019-06-04 14:17:27 +08002833 */
Peter Xud6eff5d2017-10-19 14:32:00 +08002834 block->bmap = bitmap_new(pages);
Ivan Ren40c4d4a2019-07-14 22:51:19 +08002835 bitmap_set(block->bmap, 0, pages);
Peter Xu002cad62019-06-03 14:50:56 +08002836 block->clear_bmap_shift = shift;
2837 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
Peter Xud6eff5d2017-10-19 14:32:00 +08002838 }
2839 }
2840}
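/*
 * A rough sizing sketch, assuming clear_bmap_size() keeps its usual
 * DIV_ROUND_UP(pages, 1 << shift) meaning: with 4 KiB target pages and
 * shift == 18, one clear_bmap bit covers 2^18 pages, i.e. 1 GiB of
 * guest memory, so a 16 GiB RAMBlock only needs 16 clear_bmap bits.
 */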
2841
David Hildenbrandbe39b4c2021-10-11 19:53:41 +02002842static void migration_bitmap_clear_discarded_pages(RAMState *rs)
2843{
2844 unsigned long pages;
2845 RAMBlock *rb;
2846
2847 RCU_READ_LOCK_GUARD();
2848
2849 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
2850 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
2851 rs->migration_dirty_pages -= pages;
2852 }
2853}
2854
Peter Xud6eff5d2017-10-19 14:32:00 +08002855static void ram_init_bitmaps(RAMState *rs)
2856{
2857 /* For memory_global_dirty_log_start below. */
2858 qemu_mutex_lock_iothread();
2859 qemu_mutex_lock_ramlist();
Peter Xud6eff5d2017-10-19 14:32:00 +08002860
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01002861 WITH_RCU_READ_LOCK_GUARD() {
2862 ram_list_init_bitmaps();
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03002863 /* We don't use dirty log with background snapshots */
2864 if (!migrate_background_snapshot()) {
Hyman Huang(黄勇)63b41db2021-06-29 16:01:19 +00002865 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
Gavin Shan1e493be2023-05-09 12:21:19 +10002866 migration_bitmap_sync_precopy(rs, false);
Andrey Gruzdev278e2f52021-01-29 13:14:05 +03002867 }
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01002868 }
Peter Xud6eff5d2017-10-19 14:32:00 +08002869 qemu_mutex_unlock_ramlist();
2870 qemu_mutex_unlock_iothread();
David Hildenbrandbe39b4c2021-10-11 19:53:41 +02002871
2872 /*
2873 * After a possible first bitmap sync, fix up the initial bitmap
2874 * containing all 1s to exclude any discarded pages from migration.
2875 */
2876 migration_bitmap_clear_discarded_pages(rs);
Peter Xud6eff5d2017-10-19 14:32:00 +08002877}
2878
Peter Xu7d00ee62017-10-19 14:31:57 +08002879static int ram_init_all(RAMState **rsp)
2880{
Peter Xu7d00ee62017-10-19 14:31:57 +08002881 if (ram_state_init(rsp)) {
2882 return -1;
2883 }
2884
Peter Xu84593a02017-10-19 14:31:59 +08002885 if (xbzrle_init()) {
2886 ram_state_cleanup(rsp);
2887 return -1;
Juan Quintela56e93d22015-05-07 19:33:31 +02002888 }
2889
Peter Xud6eff5d2017-10-19 14:32:00 +08002890 ram_init_bitmaps(*rsp);
zhanghailianga91246c2016-10-27 14:42:59 +08002891
2892 return 0;
2893}
2894
Peter Xu08614f32018-05-02 18:47:33 +08002895static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2896{
2897 RAMBlock *block;
2898 uint64_t pages = 0;
2899
2900 /*
2901 * Postcopy is not using xbzrle/compression, so no need for that.
2902 * Also, since the source is already halted, we don't need to care
2903 * about dirty page logging either.
2904 */
2905
Yury Kotovfbd162e2019-02-15 20:45:46 +03002906 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
Peter Xu08614f32018-05-02 18:47:33 +08002907 pages += bitmap_count_one(block->bmap,
2908 block->used_length >> TARGET_PAGE_BITS);
2909 }
2910
2911 /* This may not be aligned with current bitmaps. Recalculate. */
2912 rs->migration_dirty_pages = pages;
2913
David Hildenbrand1a373522021-02-16 11:50:39 +01002914 ram_state_reset(rs);
Peter Xu08614f32018-05-02 18:47:33 +08002915
2916 /* Update RAMState cache of output QEMUFile */
Peter Xu7f401b82022-10-11 17:55:59 -04002917 rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out;
Peter Xu08614f32018-05-02 18:47:33 +08002918
2919 trace_ram_state_resume_prepare(pages);
2920}
2921
Juan Quintela3d0684b2017-03-23 15:06:39 +01002922/*
Wei Wang6bcb05f2018-12-11 16:24:50 +08002923 * This function clears bits of the free pages reported by the caller from the
2924 * migration dirty bitmap. @addr is the host address corresponding to the
2925 * start of the contiguous guest free pages, and @len is the total bytes of
2926 * those pages.
2927 */
2928void qemu_guest_free_page_hint(void *addr, size_t len)
2929{
2930 RAMBlock *block;
2931 ram_addr_t offset;
2932 size_t used_len, start, npages;
2933 MigrationState *s = migrate_get_current();
2934
2935 /* This function is currently expected to be used during live migration */
2936 if (!migration_is_setup_or_active(s->state)) {
2937 return;
2938 }
2939
2940 for (; len > 0; len -= used_len, addr += used_len) {
2941 block = qemu_ram_block_from_host(addr, false, &offset);
2942 if (unlikely(!block || offset >= block->used_length)) {
2943 /*
2944 * The implementation might not support RAMBlock resize during
2945 * live migration, but it could happen in theory with future
2946 * updates. So we add a check here to capture that case.
2947 */
2948 error_report_once("%s unexpected error", __func__);
2949 return;
2950 }
2951
2952 if (len <= block->used_length - offset) {
2953 used_len = len;
2954 } else {
2955 used_len = block->used_length - offset;
2956 }
2957
2958 start = offset >> TARGET_PAGE_BITS;
2959 npages = used_len >> TARGET_PAGE_BITS;
2960
2961 qemu_mutex_lock(&ram_state->bitmap_mutex);
Wei Wang3143577d2021-07-22 04:30:55 -04002962 /*
2963 * The skipped free pages are equivalent to having been sent from clear_bmap's
2964 * perspective, so clear the bits from the memory region bitmap which
2965 * are initially set. Otherwise those skipped pages will be sent in
2966 * the next round after syncing from the memory region bitmap.
2967 */
David Hildenbrand1230a252021-09-04 18:09:07 +02002968 migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
Wei Wang6bcb05f2018-12-11 16:24:50 +08002969 ram_state->migration_dirty_pages -=
2970 bitmap_count_one_with_offset(block->bmap, start, npages);
2971 bitmap_clear(block->bmap, start, npages);
2972 qemu_mutex_unlock(&ram_state->bitmap_mutex);
2973 }
2974}
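/*
 * For instance, a hint covering a 4 MiB free range that lies entirely
 * inside one RAMBlock clears npages == 1024 bits (with 4 KiB target
 * pages) starting at offset >> TARGET_PAGE_BITS, and
 * migration_dirty_pages drops by however many of those bits were set.
 */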
2975
2976/*
Juan Quintela3d0684b2017-03-23 15:06:39 +01002977 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
zhanghailianga91246c2016-10-27 14:42:59 +08002978 * a long-running RCU critical section. When RCU reclaims in the code
2979 * start to become numerous it will be necessary to reduce the
2980 * granularity of these critical sections.
2981 */
2982
Juan Quintela3d0684b2017-03-23 15:06:39 +01002983/**
2984 * ram_save_setup: Setup RAM for migration
2985 *
2986 * Returns zero to indicate success and negative for error
2987 *
2988 * @f: QEMUFile where to send the data
2989 * @opaque: RAMState pointer
2990 */
zhanghailianga91246c2016-10-27 14:42:59 +08002991static int ram_save_setup(QEMUFile *f, void *opaque)
2992{
Juan Quintela53518d92017-05-04 11:46:24 +02002993 RAMState **rsp = opaque;
zhanghailianga91246c2016-10-27 14:42:59 +08002994 RAMBlock *block;
Leonardo Bras33d70972022-05-13 03:28:35 -03002995 int ret;
zhanghailianga91246c2016-10-27 14:42:59 +08002996
Xiao Guangrongdcaf4462018-03-30 15:51:20 +08002997 if (compress_threads_save_setup()) {
2998 return -1;
2999 }
3000
zhanghailianga91246c2016-10-27 14:42:59 +08003001 /* migration has already setup the bitmap, reuse it. */
3002 if (!migration_in_colo_state()) {
Peter Xu7d00ee62017-10-19 14:31:57 +08003003 if (ram_init_all(rsp) != 0) {
Xiao Guangrongdcaf4462018-03-30 15:51:20 +08003004 compress_threads_save_cleanup();
zhanghailianga91246c2016-10-27 14:42:59 +08003005 return -1;
Juan Quintela53518d92017-05-04 11:46:24 +02003006 }
zhanghailianga91246c2016-10-27 14:42:59 +08003007 }
Peter Xu7f401b82022-10-11 17:55:59 -04003008 (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f;
zhanghailianga91246c2016-10-27 14:42:59 +08003009
Dr. David Alan Gilbert0e6ebd42019-10-07 15:36:38 +01003010 WITH_RCU_READ_LOCK_GUARD() {
Juan Quintela8008a272022-05-10 19:18:19 +02003011 qemu_put_be64(f, ram_bytes_total_with_ignored()
3012 | RAM_SAVE_FLAG_MEM_SIZE);
Juan Quintela56e93d22015-05-07 19:33:31 +02003013
Dr. David Alan Gilbert0e6ebd42019-10-07 15:36:38 +01003014 RAMBLOCK_FOREACH_MIGRATABLE(block) {
3015 qemu_put_byte(f, strlen(block->idstr));
3016 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3017 qemu_put_be64(f, block->used_length);
3018 if (migrate_postcopy_ram() && block->page_size !=
3019 qemu_host_page_size) {
3020 qemu_put_be64(f, block->page_size);
3021 }
3022 if (migrate_ignore_shared()) {
3023 qemu_put_be64(f, block->mr->addr);
3024 }
Yury Kotovfbd162e2019-02-15 20:45:46 +03003025 }
Juan Quintela56e93d22015-05-07 19:33:31 +02003026 }
3027
Juan Quintela56e93d22015-05-07 19:33:31 +02003028 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3029 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3030
Juan Quintela4010ba32021-12-15 20:10:39 +01003031 migration_ops = g_malloc0(sizeof(MigrationOps));
3032 migration_ops->ram_save_target_page = ram_save_target_page_legacy;
李皆俊8ebb6ec2023-03-17 09:57:13 +00003033 ret = multifd_send_sync_main(f);
Leonardo Bras33d70972022-05-13 03:28:35 -03003034 if (ret < 0) {
3035 return ret;
3036 }
3037
Juan Quintela294e5a42022-06-21 13:36:11 +02003038 if (!migrate_multifd_flush_after_each_section()) {
3039 qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
3040 }
3041
Juan Quintela56e93d22015-05-07 19:33:31 +02003042 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
Juan Quintela35374cb2018-04-18 10:13:21 +02003043 qemu_fflush(f);
Juan Quintela56e93d22015-05-07 19:33:31 +02003044
3045 return 0;
3046}
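/*
 * Roughly, the setup stage above emits:
 *   be64: ram_bytes_total_with_ignored() | RAM_SAVE_FLAG_MEM_SIZE
 *   per migratable block: byte strlen(idstr), idstr, be64 used_length,
 *     optionally be64 page_size (postcopy with non-host page size) and
 *     be64 mr->addr (ignore-shared)
 *   optionally be64 RAM_SAVE_FLAG_MULTIFD_FLUSH (when not flushing
 *     after each section)
 *   be64: RAM_SAVE_FLAG_EOS
 */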
3047
Juan Quintela3d0684b2017-03-23 15:06:39 +01003048/**
3049 * ram_save_iterate: iterative stage for migration
3050 *
3051 * Returns zero to indicate success and negative for error
3052 *
3053 * @f: QEMUFile where to send the data
3054 * @opaque: RAMState pointer
3055 */
Juan Quintela56e93d22015-05-07 19:33:31 +02003056static int ram_save_iterate(QEMUFile *f, void *opaque)
3057{
Juan Quintela53518d92017-05-04 11:46:24 +02003058 RAMState **temp = opaque;
3059 RAMState *rs = *temp;
Juan Quintela3d4095b2019-12-18 05:12:36 +01003060 int ret = 0;
Juan Quintela56e93d22015-05-07 19:33:31 +02003061 int i;
3062 int64_t t0;
Thomas Huth5c903082016-11-04 14:10:17 +01003063 int done = 0;
Juan Quintela56e93d22015-05-07 19:33:31 +02003064
Peter Lievenb2557342018-03-08 12:18:24 +01003065 if (blk_mig_bulk_active()) {
3066 /* Avoid transferring RAM during the bulk phase of block migration, as
3067 * the bulk phase will usually take a long time and transferring
3068 * RAM updates during that time is pointless. */
3069 goto out;
3070 }
3071
Peter Xu63268c42021-06-30 16:08:05 -04003072 /*
3073 * We'll hold this lock for a while, but that's okay for two reasons.
3074 * Firstly, the only other thread that may take it is the one that calls
3075 * qemu_guest_free_page_hint(), which should be rare; secondly, see
3076 * MAX_WAIT below (if curious, see also commit 4508bd9ed8053ce), which
3077 * guarantees that we release it on a regular basis.
3078 */
3079 qemu_mutex_lock(&rs->bitmap_mutex);
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003080 WITH_RCU_READ_LOCK_GUARD() {
3081 if (ram_list.version != rs->last_version) {
3082 ram_state_reset(rs);
Dr. David Alan Gilberte03a34f2018-06-13 11:26:42 +01003083 }
3084
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003085 /* Read version before ram_list.blocks */
3086 smp_rmb();
Xiao Guangronge8f37352018-09-03 17:26:44 +08003087
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003088 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
Xiao Guangronge8f37352018-09-03 17:26:44 +08003089
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003090 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3091 i = 0;
Juan Quintelae1fde0e2023-05-15 21:56:58 +02003092 while ((ret = migration_rate_exceeded(f)) == 0 ||
Peter Xua1fe28d2022-01-19 16:09:18 +08003093 postcopy_has_request(rs)) {
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003094 int pages;
Jason J. Herne070afca2015-09-08 13:12:35 -04003095
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003096 if (qemu_file_get_error(f)) {
Juan Quintela56e93d22015-05-07 19:33:31 +02003097 break;
3098 }
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003099
Juan Quintela05931ec2021-12-15 19:01:21 +01003100 pages = ram_find_and_save_block(rs);
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003101 /* no more pages to send */
3102 if (pages == 0) {
3103 done = 1;
3104 break;
3105 }
3106
3107 if (pages < 0) {
3108 qemu_file_set_error(f, pages);
3109 break;
3110 }
3111
3112 rs->target_page_count += pages;
3113
3114 /*
Wei Yang644acf92019-11-07 20:39:07 +08003115 * During postcopy, it is necessary to make sure one whole host
3116 * page is sent in one chunk.
3117 */
3118 if (migrate_postcopy_ram()) {
Lukas Straubef4f5f52023-04-20 11:48:13 +02003119 ram_flush_compressed_data(rs);
Wei Yang644acf92019-11-07 20:39:07 +08003120 }
3121
3122 /*
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003123 * we want to check in the 1st loop, just in case it was the 1st
3124 * time and we had to sync the dirty bitmap.
3125 * qemu_clock_get_ns() is a bit expensive, so we only check once
3126 * every few iterations
3127 */
3128 if ((i & 63) == 0) {
3129 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3130 1000000;
3131 if (t1 > MAX_WAIT) {
3132 trace_ram_save_iterate_big_wait(t1, i);
3133 break;
3134 }
3135 }
3136 i++;
Juan Quintela56e93d22015-05-07 19:33:31 +02003137 }
Juan Quintela56e93d22015-05-07 19:33:31 +02003138 }
Peter Xu63268c42021-06-30 16:08:05 -04003139 qemu_mutex_unlock(&rs->bitmap_mutex);
Juan Quintela56e93d22015-05-07 19:33:31 +02003140
3141 /*
3142 * Must occur before EOS (or any QEMUFile operation)
3143 * because of RDMA protocol.
3144 */
3145 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3146
Peter Lievenb2557342018-03-08 12:18:24 +01003147out:
Juan Quintelab69a0222020-01-22 11:36:12 +01003148 if (ret >= 0
3149 && migration_is_setup_or_active(migrate_get_current()->state)) {
Juan Quintelab05292c2022-06-21 12:21:32 +02003150 if (migrate_multifd_flush_after_each_section()) {
3151 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
3152 if (ret < 0) {
3153 return ret;
3154 }
Leonardo Bras33d70972022-05-13 03:28:35 -03003155 }
3156
Juan Quintela3d4095b2019-12-18 05:12:36 +01003157 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3158 qemu_fflush(f);
David Edmondson4c2d0f62021-12-21 09:34:40 +00003159 ram_transferred_add(8);
Juan Quintela56e93d22015-05-07 19:33:31 +02003160
Juan Quintela3d4095b2019-12-18 05:12:36 +01003161 ret = qemu_file_get_error(f);
3162 }
Juan Quintela56e93d22015-05-07 19:33:31 +02003163 if (ret < 0) {
3164 return ret;
3165 }
3166
Thomas Huth5c903082016-11-04 14:10:17 +01003167 return done;
Juan Quintela56e93d22015-05-07 19:33:31 +02003168}
3169
Juan Quintela3d0684b2017-03-23 15:06:39 +01003170/**
3171 * ram_save_complete: function called to send the remaining amount of ram
3172 *
Xiao Guangronge8f37352018-09-03 17:26:44 +08003173 * Returns zero to indicate success or negative on error
Juan Quintela3d0684b2017-03-23 15:06:39 +01003174 *
3175 * Called with iothread lock
3176 *
3177 * @f: QEMUFile where to send the data
3178 * @opaque: RAMState pointer
3179 */
Juan Quintela56e93d22015-05-07 19:33:31 +02003180static int ram_save_complete(QEMUFile *f, void *opaque)
3181{
Juan Quintela53518d92017-05-04 11:46:24 +02003182 RAMState **temp = opaque;
3183 RAMState *rs = *temp;
Xiao Guangronge8f37352018-09-03 17:26:44 +08003184 int ret = 0;
Juan Quintela6f37bb82017-03-13 19:26:29 +01003185
Juan Quintela05931ec2021-12-15 19:01:21 +01003186 rs->last_stage = !migration_in_colo_state();
3187
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003188 WITH_RCU_READ_LOCK_GUARD() {
3189 if (!migration_in_postcopy()) {
Gavin Shan1e493be2023-05-09 12:21:19 +10003190 migration_bitmap_sync_precopy(rs, true);
Juan Quintela56e93d22015-05-07 19:33:31 +02003191 }
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003192
3193 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3194
3195 /* try transferring iterative blocks of memory */
3196
3197 /* flush all remaining blocks regardless of rate limiting */
Peter Xuc13221b2022-10-11 17:55:45 -04003198 qemu_mutex_lock(&rs->bitmap_mutex);
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003199 while (true) {
3200 int pages;
3201
Juan Quintela05931ec2021-12-15 19:01:21 +01003202 pages = ram_find_and_save_block(rs);
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003203 /* no more blocks to send */
3204 if (pages == 0) {
3205 break;
3206 }
3207 if (pages < 0) {
3208 ret = pages;
3209 break;
3210 }
Xiao Guangronge8f37352018-09-03 17:26:44 +08003211 }
Peter Xuc13221b2022-10-11 17:55:45 -04003212 qemu_mutex_unlock(&rs->bitmap_mutex);
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003213
Lukas Straubef4f5f52023-04-20 11:48:13 +02003214 ram_flush_compressed_data(rs);
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003215 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
Juan Quintela56e93d22015-05-07 19:33:31 +02003216 }
3217
Leonardo Bras33d70972022-05-13 03:28:35 -03003218 if (ret < 0) {
3219 return ret;
Juan Quintela3d4095b2019-12-18 05:12:36 +01003220 }
Juan Quintela56e93d22015-05-07 19:33:31 +02003221
Peter Xu7f401b82022-10-11 17:55:59 -04003222 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
Leonardo Bras33d70972022-05-13 03:28:35 -03003223 if (ret < 0) {
3224 return ret;
3225 }
3226
Juan Quintela294e5a42022-06-21 13:36:11 +02003227 if (!migrate_multifd_flush_after_each_section()) {
3228 qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
3229 }
Leonardo Bras33d70972022-05-13 03:28:35 -03003230 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3231 qemu_fflush(f);
3232
3233 return 0;
Juan Quintela56e93d22015-05-07 19:33:31 +02003234}
3235
Juan Quintela24beea42023-02-08 14:48:02 +01003236static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy,
3237 uint64_t *can_postcopy)
Juan Quintela56e93d22015-05-07 19:33:31 +02003238{
Juan Quintela53518d92017-05-04 11:46:24 +02003239 RAMState **temp = opaque;
3240 RAMState *rs = *temp;
Juan Quintela56e93d22015-05-07 19:33:31 +02003241
Juan Quintelac8df4a72022-10-03 02:00:03 +02003242 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
Juan Quintela56e93d22015-05-07 19:33:31 +02003243
Juan Quintelac8df4a72022-10-03 02:00:03 +02003244 if (migrate_postcopy_ram()) {
3245 /* We can do postcopy, and all the data is postcopiable */
Juan Quintela24beea42023-02-08 14:48:02 +01003246 *can_postcopy += remaining_size;
Juan Quintelac8df4a72022-10-03 02:00:03 +02003247 } else {
Juan Quintela24beea42023-02-08 14:48:02 +01003248 *must_precopy += remaining_size;
Juan Quintelac8df4a72022-10-03 02:00:03 +02003249 }
3250}
3251
Juan Quintela24beea42023-02-08 14:48:02 +01003252static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy,
3253 uint64_t *can_postcopy)
Juan Quintelac8df4a72022-10-03 02:00:03 +02003254{
Juan Quintela28ef5332023-04-12 22:30:20 +02003255 MigrationState *s = migrate_get_current();
Juan Quintelac8df4a72022-10-03 02:00:03 +02003256 RAMState **temp = opaque;
3257 RAMState *rs = *temp;
3258
3259 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3260
Juan Quintela28ef5332023-04-12 22:30:20 +02003261 if (!migration_in_postcopy() && remaining_size < s->threshold_size) {
Juan Quintela56e93d22015-05-07 19:33:31 +02003262 qemu_mutex_lock_iothread();
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003263 WITH_RCU_READ_LOCK_GUARD() {
Gavin Shan1e493be2023-05-09 12:21:19 +10003264 migration_bitmap_sync_precopy(rs, false);
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003265 }
Juan Quintela56e93d22015-05-07 19:33:31 +02003266 qemu_mutex_unlock_iothread();
Juan Quintela9edabd42017-03-14 12:02:16 +01003267 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
Juan Quintela56e93d22015-05-07 19:33:31 +02003268 }
Dr. David Alan Gilbertc31b0982015-11-05 18:10:54 +00003269
Vladimir Sementsov-Ogievskiy86e11672017-07-10 19:30:15 +03003270 if (migrate_postcopy_ram()) {
3271 /* We can do postcopy, and all the data is postcopiable */
Juan Quintela24beea42023-02-08 14:48:02 +01003272 *can_postcopy += remaining_size;
Vladimir Sementsov-Ogievskiy86e11672017-07-10 19:30:15 +03003273 } else {
Juan Quintela24beea42023-02-08 14:48:02 +01003274 *must_precopy += remaining_size;
Vladimir Sementsov-Ogievskiy86e11672017-07-10 19:30:15 +03003275 }
Juan Quintela56e93d22015-05-07 19:33:31 +02003276}
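/*
 * In short: the _estimate variant above reuses the cached
 * migration_dirty_pages count, while the _exact variant re-syncs the
 * dirty bitmap under the iothread lock once the remaining size drops
 * below the threshold (and we are not in postcopy), so the switch to
 * the final stage is decided on fresh numbers.
 */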
3277
3278static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3279{
3280 unsigned int xh_len;
3281 int xh_flags;
Dr. David Alan Gilbert063e7602015-12-16 11:47:37 +00003282 uint8_t *loaded_data;
Juan Quintela56e93d22015-05-07 19:33:31 +02003283
Juan Quintela56e93d22015-05-07 19:33:31 +02003284 /* extract RLE header */
3285 xh_flags = qemu_get_byte(f);
3286 xh_len = qemu_get_be16(f);
3287
3288 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3289 error_report("Failed to load XBZRLE page - wrong compression!");
3290 return -1;
3291 }
3292
3293 if (xh_len > TARGET_PAGE_SIZE) {
3294 error_report("Failed to load XBZRLE page - len overflow!");
3295 return -1;
3296 }
Juan Quintelaf265e0e2017-06-28 11:52:27 +02003297 loaded_data = XBZRLE.decoded_buf;
Juan Quintela56e93d22015-05-07 19:33:31 +02003298 /* load data and decode */
Juan Quintelaf265e0e2017-06-28 11:52:27 +02003299 /* it can change loaded_data to point to an internal buffer */
Dr. David Alan Gilbert063e7602015-12-16 11:47:37 +00003300 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
Juan Quintela56e93d22015-05-07 19:33:31 +02003301
3302 /* decode RLE */
Dr. David Alan Gilbert063e7602015-12-16 11:47:37 +00003303 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
Juan Quintela56e93d22015-05-07 19:33:31 +02003304 TARGET_PAGE_SIZE) == -1) {
3305 error_report("Failed to load XBZRLE page - decode error!");
3306 return -1;
3307 }
3308
3309 return 0;
3310}
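/*
 * The XBZRLE payload read above is simply:
 *   byte  xh_flags (must be ENCODING_FLAG_XBZRLE)
 *   be16  xh_len   (encoded length, at most TARGET_PAGE_SIZE)
 *   xh_len bytes of encoded data, decoded in place on top of 'host'
 */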
3311
Juan Quintela3d0684b2017-03-23 15:06:39 +01003312/**
3313 * ram_block_from_stream: read a RAMBlock id from the migration stream
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003314 *
Juan Quintela3d0684b2017-03-23 15:06:39 +01003315 * Must be called from within a rcu critical section.
3316 *
3317 * Returns a pointer from within the RCU-protected ram_list.
3318 *
Peter Xu755e8d72022-03-01 16:39:07 +08003319 * @mis: the migration incoming state pointer
Juan Quintela3d0684b2017-03-23 15:06:39 +01003320 * @f: QEMUFile where to read the data from
3321 * @flags: Page flags (mostly to see if it's a continuation of previous block)
Peter Xuc01b16e2022-07-07 14:55:04 -04003322 * @channel: the channel we're using
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003323 */
Peter Xu755e8d72022-03-01 16:39:07 +08003324static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
Peter Xuc01b16e2022-07-07 14:55:04 -04003325 QEMUFile *f, int flags,
3326 int channel)
Juan Quintela56e93d22015-05-07 19:33:31 +02003327{
Peter Xuc01b16e2022-07-07 14:55:04 -04003328 RAMBlock *block = mis->last_recv_block[channel];
Juan Quintela56e93d22015-05-07 19:33:31 +02003329 char id[256];
3330 uint8_t len;
3331
3332 if (flags & RAM_SAVE_FLAG_CONTINUE) {
zhanghailiang4c4bad42016-01-15 11:37:41 +08003333 if (!block) {
Juan Quintela56e93d22015-05-07 19:33:31 +02003334 error_report("Ack, bad migration stream!");
3335 return NULL;
3336 }
zhanghailiang4c4bad42016-01-15 11:37:41 +08003337 return block;
Juan Quintela56e93d22015-05-07 19:33:31 +02003338 }
3339
3340 len = qemu_get_byte(f);
3341 qemu_get_buffer(f, (uint8_t *)id, len);
3342 id[len] = 0;
3343
Dr. David Alan Gilberte3dd7492015-11-05 18:10:33 +00003344 block = qemu_ram_block_by_name(id);
zhanghailiang4c4bad42016-01-15 11:37:41 +08003345 if (!block) {
3346 error_report("Can't find block %s", id);
3347 return NULL;
Juan Quintela56e93d22015-05-07 19:33:31 +02003348 }
3349
David Hildenbrandf161c882023-07-06 09:56:08 +02003350 if (migrate_ram_is_ignored(block)) {
Cédric Le Goaterb895de52018-05-14 08:57:00 +02003351 error_report("block %s should not be migrated !", id);
3352 return NULL;
3353 }
3354
Peter Xuc01b16e2022-07-07 14:55:04 -04003355 mis->last_recv_block[channel] = block;
Peter Xu755e8d72022-03-01 16:39:07 +08003356
zhanghailiang4c4bad42016-01-15 11:37:41 +08003357 return block;
3358}
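/*
 * On the wire a page header either carries RAM_SAVE_FLAG_CONTINUE,
 * meaning "reuse the last block received on this channel", or is
 * followed by a length-prefixed block id: one byte of length plus that
 * many bytes of idstr.
 */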
3359
3360static inline void *host_from_ram_block_offset(RAMBlock *block,
3361 ram_addr_t offset)
3362{
3363 if (!offset_in_ramblock(block, offset)) {
3364 return NULL;
3365 }
3366
3367 return block->host + offset;
Juan Quintela56e93d22015-05-07 19:33:31 +02003368}
3369
David Hildenbrand6a23f632021-04-29 13:27:05 +02003370static void *host_page_from_ram_block_offset(RAMBlock *block,
3371 ram_addr_t offset)
3372{
3373 /* Note: Explicitly no check against offset_in_ramblock(). */
3374 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3375 block->page_size);
3376}
3377
3378static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3379 ram_addr_t offset)
3380{
3381 return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3382}
3383
Lukas Straub871cfc52023-05-08 21:10:52 +02003384void colo_record_bitmap(RAMBlock *block, ram_addr_t *normal, uint32_t pages)
3385{
3386 qemu_mutex_lock(&ram_state->bitmap_mutex);
3387 for (int i = 0; i < pages; i++) {
3388 ram_addr_t offset = normal[i];
3389 ram_state->migration_dirty_pages += !test_and_set_bit(
3390 offset >> TARGET_PAGE_BITS,
3391 block->bmap);
3392 }
3393 qemu_mutex_unlock(&ram_state->bitmap_mutex);
3394}
3395
Zhang Chen13af18f2018-09-03 12:38:48 +08003396static inline void *colo_cache_from_block_offset(RAMBlock *block,
zhanghailiang8af66372020-02-24 14:54:11 +08003397 ram_addr_t offset, bool record_bitmap)
Zhang Chen13af18f2018-09-03 12:38:48 +08003398{
3399 if (!offset_in_ramblock(block, offset)) {
3400 return NULL;
3401 }
3402 if (!block->colo_cache) {
3403 error_report("%s: colo_cache is NULL in block :%s",
3404 __func__, block->idstr);
3405 return NULL;
3406 }
Zhang Chen7d9acaf2018-09-03 12:38:49 +08003407
3408 /*
3409 * During a COLO checkpoint, we need a bitmap of these migrated pages.
3410 * It helps us decide which pages in the RAM cache should be flushed
3411 * into the VM's RAM later.
3412 */
Lukas Straub871cfc52023-05-08 21:10:52 +02003413 if (record_bitmap) {
3414 colo_record_bitmap(block, &offset, 1);
Zhang Chen7d9acaf2018-09-03 12:38:49 +08003415 }
Zhang Chen13af18f2018-09-03 12:38:48 +08003416 return block->colo_cache + offset;
3417}
3418
Juan Quintela3d0684b2017-03-23 15:06:39 +01003419/**
3420 * ram_handle_compressed: handle the zero page case
3421 *
Juan Quintela56e93d22015-05-07 19:33:31 +02003422 * If a page (or a whole RDMA chunk) has been
3423 * determined to be zero, then zap it.
Juan Quintela3d0684b2017-03-23 15:06:39 +01003424 *
3425 * @host: host address for the zero page
3426 * @ch: what the page is filled from. We only support zero
3427 * @size: size of the zero page
Juan Quintela56e93d22015-05-07 19:33:31 +02003428 */
3429void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3430{
Juan Quintelabad452a2021-11-18 15:56:38 +01003431 if (ch != 0 || !buffer_is_zero(host, size)) {
Juan Quintela56e93d22015-05-07 19:33:31 +02003432 memset(host, ch, size);
3433 }
3434}
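/*
 * Note that when ch == 0 and the destination page already reads as
 * zero, the memset above is skipped entirely, which avoids touching
 * (and hence allocating) pages that never need to be written.
 */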
3435
Rao, Leib70cb3b2020-10-16 13:52:01 +08003436static void colo_init_ram_state(void)
3437{
3438 ram_state_init(&ram_state);
Rao, Leib70cb3b2020-10-16 13:52:01 +08003439}
3440
Zhang Chen13af18f2018-09-03 12:38:48 +08003441/*
3442 * colo cache: this is for the secondary VM; we cache the whole
3443 * memory of the secondary VM, and it is necessary to hold the global lock
3444 * to call this helper.
3445 */
3446int colo_init_ram_cache(void)
3447{
3448 RAMBlock *block;
3449
Paolo Bonzini44901b52019-12-13 15:07:22 +01003450 WITH_RCU_READ_LOCK_GUARD() {
3451 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3452 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
David Hildenbrand8dbe22c2021-05-10 13:43:21 +02003453 NULL, false, false);
Paolo Bonzini44901b52019-12-13 15:07:22 +01003454 if (!block->colo_cache) {
3455 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3456 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3457 block->used_length);
3458 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3459 if (block->colo_cache) {
3460 qemu_anon_ram_free(block->colo_cache, block->used_length);
3461 block->colo_cache = NULL;
3462 }
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003463 }
Paolo Bonzini44901b52019-12-13 15:07:22 +01003464 return -errno;
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003465 }
Lukas Straube5fdf922021-07-04 18:14:44 +02003466 if (!machine_dump_guest_core(current_machine)) {
3467 qemu_madvise(block->colo_cache, block->used_length,
3468 QEMU_MADV_DONTDUMP);
3469 }
Zhang Chen13af18f2018-09-03 12:38:48 +08003470 }
Zhang Chen13af18f2018-09-03 12:38:48 +08003471 }
Paolo Bonzini44901b52019-12-13 15:07:22 +01003472
Zhang Chen7d9acaf2018-09-03 12:38:49 +08003473 /*
3474 * Record the dirty pages sent by the PVM; we use this dirty bitmap
3475 * to decide which pages in the cache should be flushed into the SVM's RAM. Here
3476 * we use the same name 'ram_bitmap' as for migration.
3477 */
3478 if (ram_bytes_total()) {
3479 RAMBlock *block;
3480
Yury Kotovfbd162e2019-02-15 20:45:46 +03003481 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
Zhang Chen7d9acaf2018-09-03 12:38:49 +08003482 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
Zhang Chen7d9acaf2018-09-03 12:38:49 +08003483 block->bmap = bitmap_new(pages);
Zhang Chen7d9acaf2018-09-03 12:38:49 +08003484 }
3485 }
Zhang Chen7d9acaf2018-09-03 12:38:49 +08003486
Rao, Leib70cb3b2020-10-16 13:52:01 +08003487 colo_init_ram_state();
Zhang Chen13af18f2018-09-03 12:38:48 +08003488 return 0;
Zhang Chen13af18f2018-09-03 12:38:48 +08003489}
3490
zhanghailiang03930312020-02-24 14:54:10 +08003491/* TODO: duplicated with ram_init_bitmaps */
3492void colo_incoming_start_dirty_log(void)
3493{
3494 RAMBlock *block = NULL;
3495 /* For memory_global_dirty_log_start below. */
3496 qemu_mutex_lock_iothread();
3497 qemu_mutex_lock_ramlist();
3498
Gavin Shan1e493be2023-05-09 12:21:19 +10003499 memory_global_dirty_log_sync(false);
zhanghailiang03930312020-02-24 14:54:10 +08003500 WITH_RCU_READ_LOCK_GUARD() {
3501 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3502 ramblock_sync_dirty_bitmap(ram_state, block);
3503 /* Discard this dirty bitmap record */
3504 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3505 }
Hyman Huang(黄勇)63b41db2021-06-29 16:01:19 +00003506 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
zhanghailiang03930312020-02-24 14:54:10 +08003507 }
3508 ram_state->migration_dirty_pages = 0;
3509 qemu_mutex_unlock_ramlist();
3510 qemu_mutex_unlock_iothread();
3511}
3512
Zhang Chen13af18f2018-09-03 12:38:48 +08003513/* It is necessary to hold the global lock to call this helper */
3514void colo_release_ram_cache(void)
3515{
3516 RAMBlock *block;
3517
Hyman Huang(黄勇)63b41db2021-06-29 16:01:19 +00003518 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
Yury Kotovfbd162e2019-02-15 20:45:46 +03003519 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
Zhang Chen7d9acaf2018-09-03 12:38:49 +08003520 g_free(block->bmap);
3521 block->bmap = NULL;
3522 }
3523
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003524 WITH_RCU_READ_LOCK_GUARD() {
3525 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3526 if (block->colo_cache) {
3527 qemu_anon_ram_free(block->colo_cache, block->used_length);
3528 block->colo_cache = NULL;
3529 }
Zhang Chen13af18f2018-09-03 12:38:48 +08003530 }
3531 }
zhanghailiang03930312020-02-24 14:54:10 +08003532 ram_state_cleanup(&ram_state);
Zhang Chen13af18f2018-09-03 12:38:48 +08003533}
3534
Juan Quintela3d0684b2017-03-23 15:06:39 +01003535/**
Juan Quintelaf265e0e2017-06-28 11:52:27 +02003536 * ram_load_setup: Setup RAM for migration incoming side
3537 *
3538 * Returns zero to indicate success and negative for error
3539 *
3540 * @f: QEMUFile where to receive the data
3541 * @opaque: RAMState pointer
3542 */
3543static int ram_load_setup(QEMUFile *f, void *opaque)
3544{
3545 xbzrle_load_setup();
Alexey Perevalovf9494612017-10-05 14:13:20 +03003546 ramblock_recv_map_init();
Zhang Chen13af18f2018-09-03 12:38:48 +08003547
Juan Quintelaf265e0e2017-06-28 11:52:27 +02003548 return 0;
3549}
3550
3551static int ram_load_cleanup(void *opaque)
3552{
Alexey Perevalovf9494612017-10-05 14:13:20 +03003553 RAMBlock *rb;
Junyan He56eb90a2018-07-18 15:48:03 +08003554
Yury Kotovfbd162e2019-02-15 20:45:46 +03003555 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
Beata Michalskabd108a42019-11-21 00:08:42 +00003556 qemu_ram_block_writeback(rb);
Junyan He56eb90a2018-07-18 15:48:03 +08003557 }
3558
Juan Quintelaf265e0e2017-06-28 11:52:27 +02003559 xbzrle_load_cleanup();
Alexey Perevalovf9494612017-10-05 14:13:20 +03003560
Yury Kotovfbd162e2019-02-15 20:45:46 +03003561 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
Alexey Perevalovf9494612017-10-05 14:13:20 +03003562 g_free(rb->receivedmap);
3563 rb->receivedmap = NULL;
3564 }
Zhang Chen13af18f2018-09-03 12:38:48 +08003565
Juan Quintelaf265e0e2017-06-28 11:52:27 +02003566 return 0;
3567}
3568
3569/**
Juan Quintela3d0684b2017-03-23 15:06:39 +01003570 * ram_postcopy_incoming_init: allocate postcopy data structures
3571 *
3572 * Returns 0 for success and negative if there was one error
3573 *
3574 * @mis: current migration incoming state
3575 *
3576 * Allocate data structures etc needed by incoming migration with
3577 * postcopy-ram. postcopy-ram's similarly named
3578 * postcopy_ram_incoming_init does the work.
Dr. David Alan Gilbert1caddf82015-11-05 18:11:03 +00003579 */
3580int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3581{
David Hildenbrandc1361802018-06-20 22:27:36 +02003582 return postcopy_ram_incoming_init(mis);
Dr. David Alan Gilbert1caddf82015-11-05 18:11:03 +00003583}
3584
Juan Quintela3d0684b2017-03-23 15:06:39 +01003585/**
3586 * ram_load_postcopy: load a page in postcopy case
3587 *
3588 * Returns 0 for success or -errno in case of error
3589 *
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003590 * Called in postcopy mode by ram_load().
3591 * rcu_read_lock is taken prior to this being called.
Juan Quintela3d0684b2017-03-23 15:06:39 +01003592 *
3593 * @f: QEMUFile where to send the data
Peter Xu36f62f12022-07-07 14:55:02 -04003594 * @channel: the channel to use for loading
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003595 */
Peter Xu36f62f12022-07-07 14:55:02 -04003596int ram_load_postcopy(QEMUFile *f, int channel)
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003597{
3598 int flags = 0, ret = 0;
3599 bool place_needed = false;
Peter Xu1aa83672018-07-10 17:18:53 +08003600 bool matches_target_page_size = false;
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003601 MigrationIncomingState *mis = migration_incoming_get_current();
Peter Xu36f62f12022-07-07 14:55:02 -04003602 PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003603
3604 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3605 ram_addr_t addr;
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003606 void *page_buffer = NULL;
3607 void *place_source = NULL;
Dr. David Alan Gilbertdf9ff5e2017-02-24 18:28:35 +00003608 RAMBlock *block = NULL;
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003609 uint8_t ch;
Wei Yang644acf92019-11-07 20:39:07 +08003610 int len;
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003611
3612 addr = qemu_get_be64(f);
Peter Xu7a9ddfb2018-02-08 18:31:05 +08003613
3614 /*
3615 * If qemu file error, we should stop here, and then "addr"
3616 * may be invalid
3617 */
3618 ret = qemu_file_get_error(f);
3619 if (ret) {
3620 break;
3621 }
3622
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003623 flags = addr & ~TARGET_PAGE_MASK;
3624 addr &= TARGET_PAGE_MASK;
3625
Peter Xu36f62f12022-07-07 14:55:02 -04003626 trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
Wei Yang644acf92019-11-07 20:39:07 +08003627 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3628 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
Peter Xuc01b16e2022-07-07 14:55:04 -04003629 block = ram_block_from_stream(mis, f, flags, channel);
David Hildenbrand6a23f632021-04-29 13:27:05 +02003630 if (!block) {
3631 ret = -EINVAL;
3632 break;
3633 }
zhanghailiang4c4bad42016-01-15 11:37:41 +08003634
David Hildenbrand898ba902021-04-29 13:27:06 +02003635 /*
3636 * Relying on used_length is racy and can result in false positives.
3637 * We might place pages beyond used_length in case RAM was shrunk
3638 * while in postcopy, which is fine - trying to place via
3639 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
3640 */
3641 if (!block->host || addr >= block->postcopy_length) {
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003642 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3643 ret = -EINVAL;
3644 break;
3645 }
Peter Xu77dadc32022-03-01 16:39:04 +08003646 tmp_page->target_pages++;
Peter Xu1aa83672018-07-10 17:18:53 +08003647 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003648 /*
Dr. David Alan Gilbert28abd202017-02-24 18:28:37 +00003649 * Postcopy requires that we place whole host pages atomically;
3650 * these may be huge pages for RAMBlocks that are backed by
3651 * hugetlbfs.
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003652 * To make it atomic, the data is read into a temporary page
3653 * that's moved into place later.
 3654 * The migration protocol uses, possibly smaller, target pages;
 3655 * however, the source ensures it always sends all the components
Wei Yang91ba4422019-11-07 20:39:06 +08003656 * of a host page in one chunk.
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003657 */
Peter Xu77dadc32022-03-01 16:39:04 +08003658 page_buffer = tmp_page->tmp_huge_page +
David Hildenbrand6a23f632021-04-29 13:27:05 +02003659 host_page_offset_from_ram_block_offset(block, addr);
 3660 /* If all target pages are zero then we can optimise the place */
Peter Xu77dadc32022-03-01 16:39:04 +08003661 if (tmp_page->target_pages == 1) {
3662 tmp_page->host_addr =
3663 host_page_from_ram_block_offset(block, addr);
3664 } else if (tmp_page->host_addr !=
3665 host_page_from_ram_block_offset(block, addr)) {
Dr. David Alan Gilbertc53b7dd2015-11-05 18:11:12 +00003666 /* not the 1st target page within the host page */
Peter Xu36f62f12022-07-07 14:55:02 -04003667 error_report("Non-same host page detected on channel %d: "
Peter Xucfc7dc82022-03-01 16:39:05 +08003668 "Target host page %p, received host page %p "
3669 "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
Peter Xu36f62f12022-07-07 14:55:02 -04003670 channel, tmp_page->host_addr,
Peter Xucfc7dc82022-03-01 16:39:05 +08003671 host_page_from_ram_block_offset(block, addr),
3672 block->idstr, addr, tmp_page->target_pages);
David Hildenbrand6a23f632021-04-29 13:27:05 +02003673 ret = -EINVAL;
3674 break;
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003675 }
3676
3677 /*
3678 * If it's the last part of a host page then we place the host
3679 * page
3680 */
Peter Xu77dadc32022-03-01 16:39:04 +08003681 if (tmp_page->target_pages ==
3682 (block->page_size / TARGET_PAGE_SIZE)) {
Wei Yang4cbb3c62019-11-07 20:39:04 +08003683 place_needed = true;
Wei Yang4cbb3c62019-11-07 20:39:04 +08003684 }
Peter Xu77dadc32022-03-01 16:39:04 +08003685 place_source = tmp_page->tmp_huge_page;
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003686 }
3687
3688 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
Juan Quintelabb890ed2017-04-28 09:39:55 +02003689 case RAM_SAVE_FLAG_ZERO:
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003690 ch = qemu_get_byte(f);
Wei Yang2e36bc12019-11-07 20:39:02 +08003691 /*
 3692 * We can skip setting page_buffer when this is a zero page
 3693 * and (block->page_size == TARGET_PAGE_SIZE).
3694 */
3695 if (ch || !matches_target_page_size) {
3696 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3697 }
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003698 if (ch) {
Peter Xu77dadc32022-03-01 16:39:04 +08003699 tmp_page->all_zero = false;
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003700 }
3701 break;
3702
3703 case RAM_SAVE_FLAG_PAGE:
Peter Xu77dadc32022-03-01 16:39:04 +08003704 tmp_page->all_zero = false;
Peter Xu1aa83672018-07-10 17:18:53 +08003705 if (!matches_target_page_size) {
3706 /* For huge pages, we always use temporary buffer */
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003707 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3708 } else {
Peter Xu1aa83672018-07-10 17:18:53 +08003709 /*
 3710 * For small pages that match the target page size, we
3711 * avoid the qemu_file copy. Instead we directly use
3712 * the buffer of QEMUFile to place the page. Note: we
3713 * cannot do any QEMUFile operation before using that
 3714 * buffer, to make sure the buffer is valid when
3715 * placing the page.
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003716 */
3717 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3718 TARGET_PAGE_SIZE);
3719 }
3720 break;
Wei Yang644acf92019-11-07 20:39:07 +08003721 case RAM_SAVE_FLAG_COMPRESS_PAGE:
Peter Xu77dadc32022-03-01 16:39:04 +08003722 tmp_page->all_zero = false;
Wei Yang644acf92019-11-07 20:39:07 +08003723 len = qemu_get_be32(f);
3724 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3725 error_report("Invalid compressed data length: %d", len);
3726 ret = -EINVAL;
3727 break;
3728 }
3729 decompress_data_with_multi_threads(f, page_buffer, len);
3730 break;
Juan Quintela294e5a42022-06-21 13:36:11 +02003731 case RAM_SAVE_FLAG_MULTIFD_FLUSH:
3732 multifd_recv_sync_main();
3733 break;
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003734 case RAM_SAVE_FLAG_EOS:
3735 /* normal exit */
Juan Quintelab05292c2022-06-21 12:21:32 +02003736 if (migrate_multifd_flush_after_each_section()) {
3737 multifd_recv_sync_main();
3738 }
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003739 break;
3740 default:
Bihong Yu29fccad2020-10-20 11:10:42 +08003741 error_report("Unknown combination of migration flags: 0x%x"
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003742 " (postcopy mode)", flags);
3743 ret = -EINVAL;
Peter Xu7a9ddfb2018-02-08 18:31:05 +08003744 break;
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003745 }
3746
Wei Yang644acf92019-11-07 20:39:07 +08003747 /* Got the whole host page, wait for decompress before placing. */
3748 if (place_needed) {
3749 ret |= wait_for_decompress_done();
3750 }
3751
Peter Xu7a9ddfb2018-02-08 18:31:05 +08003752 /* Check for any possible file errors */
3753 if (!ret && qemu_file_get_error(f)) {
3754 ret = qemu_file_get_error(f);
3755 }
3756
3757 if (!ret && place_needed) {
Peter Xu77dadc32022-03-01 16:39:04 +08003758 if (tmp_page->all_zero) {
3759 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003760 } else {
Peter Xu77dadc32022-03-01 16:39:04 +08003761 ret = postcopy_place_page(mis, tmp_page->host_addr,
3762 place_source, block);
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003763 }
David Hildenbrandddf35bd2020-04-21 10:52:56 +02003764 place_needed = false;
Peter Xu77dadc32022-03-01 16:39:04 +08003765 postcopy_temp_page_reset(tmp_page);
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003766 }
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003767 }
3768
3769 return ret;
3770}
3771
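/*
 * The temporary-page handling above can be reduced to a small model:
 * incoming target pages are staged into a buffer the size of one host
 * page, at their offset within that host page, and the buffer is only
 * placed (atomically, via the postcopy placement calls) once every
 * component target page has arrived.  The sketch below is illustrative
 * only: the struct and function names are hypothetical and are not the
 * PostcopyTmpPage structure used by the code above.
 */
typedef struct ExampleTmpHugePage {
    uint8_t *staging;          /* staging buffer, one host page long */
    size_t host_page_size;     /* e.g. 2M for hugetlbfs-backed blocks */
    size_t target_pages;       /* target pages staged so far */
} ExampleTmpHugePage;

/* Returns true once the host page is complete and may be placed atomically. */
static bool G_GNUC_UNUSED
example_stage_target_page(ExampleTmpHugePage *t, size_t offset_in_host_page,
                          const uint8_t *data)
{
    memcpy(t->staging + offset_in_host_page, data, TARGET_PAGE_SIZE);
    t->target_pages++;
    return t->target_pages == t->host_page_size / TARGET_PAGE_SIZE;
}
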
Daniel Henrique Barbozaacab30b2017-11-16 20:35:26 -02003772static bool postcopy_is_running(void)
3773{
3774 PostcopyState ps = postcopy_state_get();
3775 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3776}
3777
Zhang Chene6f4aa12018-09-03 12:38:50 +08003778/*
3779 * Flush content of RAM cache into SVM's memory.
 3780 * Only flush the pages that have been dirtied by the PVM or SVM or both.
3781 */
Lukas Straub24fa16f2020-05-11 13:10:51 +02003782void colo_flush_ram_cache(void)
Zhang Chene6f4aa12018-09-03 12:38:50 +08003783{
3784 RAMBlock *block = NULL;
3785 void *dst_host;
3786 void *src_host;
3787 unsigned long offset = 0;
3788
Gavin Shan1e493be2023-05-09 12:21:19 +10003789 memory_global_dirty_log_sync(false);
Lukas Straub9d638402023-05-08 21:10:55 +02003790 qemu_mutex_lock(&ram_state->bitmap_mutex);
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003791 WITH_RCU_READ_LOCK_GUARD() {
3792 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3793 ramblock_sync_dirty_bitmap(ram_state, block);
Zhang Chene6f4aa12018-09-03 12:38:50 +08003794 }
3795 }
3796
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003797 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3798 WITH_RCU_READ_LOCK_GUARD() {
3799 block = QLIST_FIRST_RCU(&ram_list.blocks);
3800
3801 while (block) {
Rao, Leia6a83ce2021-11-09 11:04:55 +08003802 unsigned long num = 0;
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003803
Rao, Leia6a83ce2021-11-09 11:04:55 +08003804 offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
David Hildenbrand542147f2021-04-29 13:27:08 +02003805 if (!offset_in_ramblock(block,
3806 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003807 offset = 0;
Rao, Leia6a83ce2021-11-09 11:04:55 +08003808 num = 0;
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003809 block = QLIST_NEXT_RCU(block, next);
3810 } else {
Rao, Leia6a83ce2021-11-09 11:04:55 +08003811 unsigned long i = 0;
3812
3813 for (i = 0; i < num; i++) {
3814 migration_bitmap_clear_dirty(ram_state, block, offset + i);
3815 }
Alexey Romko8bba0042020-01-10 14:51:34 +01003816 dst_host = block->host
3817 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3818 src_host = block->colo_cache
3819 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
Rao, Leia6a83ce2021-11-09 11:04:55 +08003820 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
3821 offset += num;
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01003822 }
3823 }
3824 }
Lukas Straub9d638402023-05-08 21:10:55 +02003825 qemu_mutex_unlock(&ram_state->bitmap_mutex);
Zhang Chene6f4aa12018-09-03 12:38:50 +08003826 trace_colo_flush_ram_cache_end();
3827}
3828
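/*
 * The flush loop above walks runs of contiguous dirty pages and copies
 * each run from the COLO cache into the SVM's memory in a single pass.
 * A reduced sketch of the per-run copy follows, assuming the caller has
 * already found the run start (in pages) and its length; the helper
 * name and parameters are illustrative only.
 */
static void G_GNUC_UNUSED
example_copy_dirty_run(void *host_base, const void *cache_base,
                       unsigned long first_page, unsigned long num_pages)
{
    size_t byte_off = (size_t)first_page << TARGET_PAGE_BITS;

    /* One memcpy covers the whole run of contiguous dirty pages. */
    memcpy((uint8_t *)host_base + byte_off,
           (const uint8_t *)cache_base + byte_off,
           (size_t)num_pages << TARGET_PAGE_BITS);
}
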
Wei Yang10da4a32019-07-25 08:20:23 +08003829/**
3830 * ram_load_precopy: load pages in precopy case
3831 *
3832 * Returns 0 for success or -errno in case of error
3833 *
3834 * Called in precopy mode by ram_load().
3835 * rcu_read_lock is taken prior to this being called.
3836 *
 3837 * @f: QEMUFile to load the data from
3838 */
3839static int ram_load_precopy(QEMUFile *f)
Juan Quintela56e93d22015-05-07 19:33:31 +02003840{
Peter Xu755e8d72022-03-01 16:39:07 +08003841 MigrationIncomingState *mis = migration_incoming_get_current();
Yury Kotove65cec52019-11-25 16:36:32 +03003842 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
Dr. David Alan Gilbertef08fb32017-02-24 18:28:30 +00003843 /* ADVISE comes earlier; it shows the source has the postcopy capability on */
David Hildenbrand80fe3152023-01-17 12:22:46 +01003844 bool postcopy_advised = migration_incoming_postcopy_advised();
Juan Quintelaa7a94d12023-03-01 22:03:48 +01003845 if (!migrate_compress()) {
Juan Quintelaedc60122016-11-02 12:40:46 +01003846 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3847 }
Dr. David Alan Gilberta7180872015-11-05 18:11:11 +00003848
Wei Yang10da4a32019-07-25 08:20:23 +08003849 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
Juan Quintela56e93d22015-05-07 19:33:31 +02003850 ram_addr_t addr, total_ram_bytes;
zhanghailiang03930312020-02-24 14:54:10 +08003851 void *host = NULL, *host_bak = NULL;
Juan Quintela56e93d22015-05-07 19:33:31 +02003852 uint8_t ch;
3853
Yury Kotove65cec52019-11-25 16:36:32 +03003854 /*
 3855 * Yield periodically to let the main loop run, but an iteration of
 3856 * the main loop is expensive, so only yield every so many iterations
3857 */
3858 if ((i & 32767) == 0 && qemu_in_coroutine()) {
3859 aio_co_schedule(qemu_get_current_aio_context(),
3860 qemu_coroutine_self());
3861 qemu_coroutine_yield();
3862 }
3863 i++;
3864
Juan Quintela56e93d22015-05-07 19:33:31 +02003865 addr = qemu_get_be64(f);
3866 flags = addr & ~TARGET_PAGE_MASK;
3867 addr &= TARGET_PAGE_MASK;
3868
Juan Quintelaedc60122016-11-02 12:40:46 +01003869 if (flags & invalid_flags) {
3870 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3871 error_report("Received an unexpected compressed page");
3872 }
3873
3874 ret = -EINVAL;
3875 break;
3876 }
3877
Juan Quintelabb890ed2017-04-28 09:39:55 +02003878 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
Dr. David Alan Gilberta776aa12015-11-05 18:10:39 +00003879 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
Peter Xuc01b16e2022-07-07 14:55:04 -04003880 RAMBlock *block = ram_block_from_stream(mis, f, flags,
3881 RAM_CHANNEL_PRECOPY);
zhanghailiang4c4bad42016-01-15 11:37:41 +08003882
zhanghailiang03930312020-02-24 14:54:10 +08003883 host = host_from_ram_block_offset(block, addr);
Zhang Chen13af18f2018-09-03 12:38:48 +08003884 /*
zhanghailiang03930312020-02-24 14:54:10 +08003885 * After entering the COLO stage, we should not load pages
 3886 * into the SVM's memory directly; we put them into colo_cache first.
 3887 * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
 3888 * Previously, we copied all of this memory in the COLO preparation
 3889 * stage, during which the VM had to be stopped, which is time-consuming.
 3890 * Here we optimize it with a trick: back up every page during the
 3891 * migration process while COLO is enabled. Although this affects the
 3892 * speed of the migration, it clearly reduces the downtime compared to
 3893 * backing up all of the SVM's memory in the COLO preparation stage.
Zhang Chen13af18f2018-09-03 12:38:48 +08003894 */
zhanghailiang03930312020-02-24 14:54:10 +08003895 if (migration_incoming_colo_enabled()) {
3896 if (migration_incoming_in_colo_state()) {
3897 /* In COLO stage, put all pages into cache temporarily */
zhanghailiang8af66372020-02-24 14:54:11 +08003898 host = colo_cache_from_block_offset(block, addr, true);
zhanghailiang03930312020-02-24 14:54:10 +08003899 } else {
3900 /*
3901 * In migration stage but before COLO stage,
3902 * Put all pages into both cache and SVM's memory.
3903 */
zhanghailiang8af66372020-02-24 14:54:11 +08003904 host_bak = colo_cache_from_block_offset(block, addr, false);
zhanghailiang03930312020-02-24 14:54:10 +08003905 }
Zhang Chen13af18f2018-09-03 12:38:48 +08003906 }
Dr. David Alan Gilberta776aa12015-11-05 18:10:39 +00003907 if (!host) {
3908 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3909 ret = -EINVAL;
3910 break;
3911 }
Zhang Chen13af18f2018-09-03 12:38:48 +08003912 if (!migration_incoming_in_colo_state()) {
3913 ramblock_recv_bitmap_set(block, host);
3914 }
3915
Dr. David Alan Gilbert1db9d8e2017-04-26 19:37:21 +01003916 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
Dr. David Alan Gilberta776aa12015-11-05 18:10:39 +00003917 }
3918
Juan Quintela56e93d22015-05-07 19:33:31 +02003919 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3920 case RAM_SAVE_FLAG_MEM_SIZE:
3921 /* Synchronize RAM block list */
3922 total_ram_bytes = addr;
3923 while (!ret && total_ram_bytes) {
3924 RAMBlock *block;
Juan Quintela56e93d22015-05-07 19:33:31 +02003925 char id[256];
3926 ram_addr_t length;
3927
3928 len = qemu_get_byte(f);
3929 qemu_get_buffer(f, (uint8_t *)id, len);
3930 id[len] = 0;
3931 length = qemu_get_be64(f);
3932
Dr. David Alan Gilberte3dd7492015-11-05 18:10:33 +00003933 block = qemu_ram_block_by_name(id);
Cédric Le Goaterb895de52018-05-14 08:57:00 +02003934 if (block && !qemu_ram_is_migratable(block)) {
3935 error_report("block %s should not be migrated !", id);
3936 ret = -EINVAL;
3937 } else if (block) {
Dr. David Alan Gilberte3dd7492015-11-05 18:10:33 +00003938 if (length != block->used_length) {
3939 Error *local_err = NULL;
Juan Quintela56e93d22015-05-07 19:33:31 +02003940
Gongleifa53a0e2016-05-10 10:04:59 +08003941 ret = qemu_ram_resize(block, length,
Dr. David Alan Gilberte3dd7492015-11-05 18:10:33 +00003942 &local_err);
3943 if (local_err) {
3944 error_report_err(local_err);
Juan Quintela56e93d22015-05-07 19:33:31 +02003945 }
Juan Quintela56e93d22015-05-07 19:33:31 +02003946 }
Dr. David Alan Gilbertef08fb32017-02-24 18:28:30 +00003947 /* For postcopy we need to check hugepage sizes match */
Stefan Reitere846b742021-02-04 17:35:22 +01003948 if (postcopy_advised && migrate_postcopy_ram() &&
Dr. David Alan Gilbertef08fb32017-02-24 18:28:30 +00003949 block->page_size != qemu_host_page_size) {
3950 uint64_t remote_page_size = qemu_get_be64(f);
3951 if (remote_page_size != block->page_size) {
3952 error_report("Mismatched RAM page size %s "
3953 "(local) %zd != %" PRId64,
3954 id, block->page_size,
3955 remote_page_size);
3956 ret = -EINVAL;
3957 }
3958 }
Yury Kotovfbd162e2019-02-15 20:45:46 +03003959 if (migrate_ignore_shared()) {
3960 hwaddr addr = qemu_get_be64(f);
David Hildenbrandf161c882023-07-06 09:56:08 +02003961 if (migrate_ram_is_ignored(block) &&
Yury Kotovfbd162e2019-02-15 20:45:46 +03003962 block->mr->addr != addr) {
3963 error_report("Mismatched GPAs for block %s "
3964 "%" PRId64 "!= %" PRId64,
3965 id, (uint64_t)addr,
3966 (uint64_t)block->mr->addr);
3967 ret = -EINVAL;
3968 }
3969 }
Dr. David Alan Gilberte3dd7492015-11-05 18:10:33 +00003970 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3971 block->idstr);
3972 } else {
Juan Quintela56e93d22015-05-07 19:33:31 +02003973 error_report("Unknown ramblock \"%s\", cannot "
3974 "accept migration", id);
3975 ret = -EINVAL;
3976 }
3977
3978 total_ram_bytes -= length;
3979 }
3980 break;
Dr. David Alan Gilberta776aa12015-11-05 18:10:39 +00003981
Juan Quintelabb890ed2017-04-28 09:39:55 +02003982 case RAM_SAVE_FLAG_ZERO:
Juan Quintela56e93d22015-05-07 19:33:31 +02003983 ch = qemu_get_byte(f);
3984 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3985 break;
Dr. David Alan Gilberta776aa12015-11-05 18:10:39 +00003986
Juan Quintela56e93d22015-05-07 19:33:31 +02003987 case RAM_SAVE_FLAG_PAGE:
Juan Quintela56e93d22015-05-07 19:33:31 +02003988 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3989 break;
Juan Quintela56e93d22015-05-07 19:33:31 +02003990
Dr. David Alan Gilberta776aa12015-11-05 18:10:39 +00003991 case RAM_SAVE_FLAG_COMPRESS_PAGE:
Juan Quintela56e93d22015-05-07 19:33:31 +02003992 len = qemu_get_be32(f);
3993 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3994 error_report("Invalid compressed data length: %d", len);
3995 ret = -EINVAL;
3996 break;
3997 }
Dr. David Alan Gilbertc1bc6622015-12-16 11:47:38 +00003998 decompress_data_with_multi_threads(f, host, len);
Juan Quintela56e93d22015-05-07 19:33:31 +02003999 break;
Dr. David Alan Gilberta776aa12015-11-05 18:10:39 +00004000
Juan Quintela56e93d22015-05-07 19:33:31 +02004001 case RAM_SAVE_FLAG_XBZRLE:
Juan Quintela56e93d22015-05-07 19:33:31 +02004002 if (load_xbzrle(f, addr, host) < 0) {
4003 error_report("Failed to decompress XBZRLE page at "
4004 RAM_ADDR_FMT, addr);
4005 ret = -EINVAL;
4006 break;
4007 }
4008 break;
Juan Quintela294e5a42022-06-21 13:36:11 +02004009 case RAM_SAVE_FLAG_MULTIFD_FLUSH:
4010 multifd_recv_sync_main();
4011 break;
Juan Quintela56e93d22015-05-07 19:33:31 +02004012 case RAM_SAVE_FLAG_EOS:
4013 /* normal exit */
Juan Quintelab05292c2022-06-21 12:21:32 +02004014 if (migrate_multifd_flush_after_each_section()) {
4015 multifd_recv_sync_main();
4016 }
Juan Quintela56e93d22015-05-07 19:33:31 +02004017 break;
Juan Quintela5f1e7542023-05-04 13:44:39 +02004018 case RAM_SAVE_FLAG_HOOK:
4019 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4020 break;
Juan Quintela56e93d22015-05-07 19:33:31 +02004021 default:
Juan Quintela5f1e7542023-05-04 13:44:39 +02004022 error_report("Unknown combination of migration flags: 0x%x", flags);
4023 ret = -EINVAL;
Juan Quintela56e93d22015-05-07 19:33:31 +02004024 }
4025 if (!ret) {
4026 ret = qemu_file_get_error(f);
4027 }
zhanghailiang03930312020-02-24 14:54:10 +08004028 if (!ret && host_bak) {
4029 memcpy(host_bak, host, TARGET_PAGE_SIZE);
4030 }
Juan Quintela56e93d22015-05-07 19:33:31 +02004031 }
4032
Wei Yangca1a6b72019-11-07 20:39:03 +08004033 ret |= wait_for_decompress_done();
Wei Yang10da4a32019-07-25 08:20:23 +08004034 return ret;
4035}
4036
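/*
 * The RAM_SAVE_FLAG_MEM_SIZE case above consumes a sequence of
 * (id length, id string, block length) tuples until total_ram_bytes is
 * exhausted.  Below is a minimal sketch of reading one such tuple,
 * using only QEMUFile getters that already appear in this file; the
 * helper name is hypothetical and only restates the inline parsing.
 */
static void G_GNUC_UNUSED
example_read_block_desc(QEMUFile *f, char id[256], ram_addr_t *length)
{
    int len = qemu_get_byte(f);          /* length of the id string */

    qemu_get_buffer(f, (uint8_t *)id, len);
    id[len] = 0;                         /* NUL-terminate the id */
    *length = qemu_get_be64(f);          /* used_length of the block */
}
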
4037static int ram_load(QEMUFile *f, void *opaque, int version_id)
4038{
4039 int ret = 0;
4040 static uint64_t seq_iter;
4041 /*
 4042 * If the system is running in postcopy mode, page inserts into host memory
 4043 * must be atomic
4044 */
4045 bool postcopy_running = postcopy_is_running();
4046
4047 seq_iter++;
4048
4049 if (version_id != 4) {
4050 return -EINVAL;
4051 }
4052
4053 /*
4054 * This RCU critical section can be very long running.
 4055 * When RCU reclaim operations in the code become numerous,
4056 * it will be necessary to reduce the granularity of this
4057 * critical section.
4058 */
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01004059 WITH_RCU_READ_LOCK_GUARD() {
4060 if (postcopy_running) {
Peter Xu36f62f12022-07-07 14:55:02 -04004061 /*
4062 * Note! Here RAM_CHANNEL_PRECOPY is the precopy channel of
 4063 * postcopy migration; we have another RAM_CHANNEL_POSTCOPY to
4064 * service fast page faults.
4065 */
4066 ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
Dr. David Alan Gilbert89ac5a12019-10-07 15:36:39 +01004067 } else {
4068 ret = ram_load_precopy(f);
4069 }
Wei Yang10da4a32019-07-25 08:20:23 +08004070 }
Juan Quintela55c44462017-01-23 22:32:05 +01004071 trace_ram_load_complete(ret, seq_iter);
Zhang Chene6f4aa12018-09-03 12:38:50 +08004072
Juan Quintela56e93d22015-05-07 19:33:31 +02004073 return ret;
4074}
4075
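/*
 * Both loaders dispatched above decode each record from a single
 * 64-bit big-endian header word: the bits below TARGET_PAGE_MASK carry
 * the RAM_SAVE_FLAG_* values and the remaining bits carry the
 * page-aligned RAM offset.  A minimal sketch of that split; the helper
 * name is hypothetical and only restates the decoding done inline.
 */
static void G_GNUC_UNUSED
example_split_header(uint64_t header, ram_addr_t *addr, int *flags)
{
    *flags = header & ~TARGET_PAGE_MASK;   /* flag bits, e.g. RAM_SAVE_FLAG_EOS */
    *addr = header & TARGET_PAGE_MASK;     /* page-aligned offset */
}
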
Vladimir Sementsov-Ogievskiyc6467622017-07-10 19:30:14 +03004076static bool ram_has_postcopy(void *opaque)
4077{
Junyan He469dd512018-07-18 15:48:02 +08004078 RAMBlock *rb;
Yury Kotovfbd162e2019-02-15 20:45:46 +03004079 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
Junyan He469dd512018-07-18 15:48:02 +08004080 if (ramblock_is_pmem(rb)) {
 4081             info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
 4082                         "is not supported now!", rb->idstr, rb->host);
4083 return false;
4084 }
4085 }
4086
Vladimir Sementsov-Ogievskiyc6467622017-07-10 19:30:14 +03004087 return migrate_postcopy_ram();
4088}
4089
Peter Xuedd090c2018-05-02 18:47:32 +08004090/* Sync all the dirty bitmap with destination VM. */
4091static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4092{
4093 RAMBlock *block;
4094 QEMUFile *file = s->to_dst_file;
4095 int ramblock_count = 0;
4096
4097 trace_ram_dirty_bitmap_sync_start();
4098
Yury Kotovfbd162e2019-02-15 20:45:46 +03004099 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
Peter Xuedd090c2018-05-02 18:47:32 +08004100 qemu_savevm_send_recv_bitmap(file, block->idstr);
4101 trace_ram_dirty_bitmap_request(block->idstr);
4102 ramblock_count++;
4103 }
4104
4105 trace_ram_dirty_bitmap_sync_wait();
4106
 4107     /* Wait until all the ramblocks' dirty bitmaps are synced */
4108 while (ramblock_count--) {
4109 qemu_sem_wait(&s->rp_state.rp_sem);
4110 }
4111
4112 trace_ram_dirty_bitmap_sync_complete();
4113
4114 return 0;
4115}
4116
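/*
 * ram_dirty_bitmap_sync_all() above follows a "fan out, then count
 * down" pattern: one recv-bitmap request is sent per ramblock, then
 * rp_sem is waited on once per request, and every completed reload
 * posts it (see ram_dirty_bitmap_reload_notify() below).  A generic
 * sketch of the count-down side, independent of the migration
 * structures; the helper name is illustrative only.
 */
static void G_GNUC_UNUSED
example_wait_for_n_replies(QemuSemaphore *sem, int pending)
{
    /* Each reply posts @sem exactly once; block until all have arrived. */
    while (pending--) {
        qemu_sem_wait(sem);
    }
}
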
4117static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4118{
4119 qemu_sem_post(&s->rp_state.rp_sem);
4120}
4121
Peter Xua335deb2018-05-02 18:47:28 +08004122/*
4123 * Read the received bitmap, revert it as the initial dirty bitmap.
4124 * This is only used when the postcopy migration is paused but wants
4125 * to resume from a middle point.
4126 */
4127int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4128{
4129 int ret = -EINVAL;
Peter Xu43044ac2021-07-22 13:58:38 -04004130 /* from_dst_file is always valid because we're within rp_thread */
Peter Xua335deb2018-05-02 18:47:28 +08004131 QEMUFile *file = s->rp_state.from_dst_file;
4132 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
Peter Xua725ef92018-07-10 17:18:55 +08004133 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
Peter Xua335deb2018-05-02 18:47:28 +08004134 uint64_t size, end_mark;
4135
4136 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4137
4138 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4139 error_report("%s: incorrect state %s", __func__,
4140 MigrationStatus_str(s->state));
4141 return -EINVAL;
4142 }
4143
4144 /*
4145 * Note: see comments in ramblock_recv_bitmap_send() on why we
zhaolichang3a4452d2020-09-17 15:50:21 +08004146 * need the endianness conversion, and the padding.
Peter Xua335deb2018-05-02 18:47:28 +08004147 */
4148 local_size = ROUND_UP(local_size, 8);
4149
 4150     /* Add padding */
4151 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4152
4153 size = qemu_get_be64(file);
4154
4155 /* The size of the bitmap should match with our ramblock */
4156 if (size != local_size) {
4157 error_report("%s: ramblock '%s' bitmap size mismatch "
4158 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4159 block->idstr, size, local_size);
4160 ret = -EINVAL;
4161 goto out;
4162 }
4163
4164 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4165 end_mark = qemu_get_be64(file);
4166
4167 ret = qemu_file_get_error(file);
4168 if (ret || size != local_size) {
4169 error_report("%s: read bitmap failed for ramblock '%s': %d"
4170 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4171 __func__, block->idstr, ret, local_size, size);
4172 ret = -EIO;
4173 goto out;
4174 }
4175
4176 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
Philippe Mathieu-Daudéaf3bbbe2020-11-03 12:25:58 +01004177 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
Peter Xua335deb2018-05-02 18:47:28 +08004178 __func__, block->idstr, end_mark);
4179 ret = -EINVAL;
4180 goto out;
4181 }
4182
4183 /*
zhaolichang3a4452d2020-09-17 15:50:21 +08004184 * Endianness conversion. We are in postcopy (though paused).
Peter Xua335deb2018-05-02 18:47:28 +08004185 * The dirty bitmap won't change. We can directly modify it.
4186 */
4187 bitmap_from_le(block->bmap, le_bitmap, nbits);
4188
4189 /*
4190 * What we received is "received bitmap". Revert it as the initial
4191 * dirty bitmap for this ramblock.
4192 */
4193 bitmap_complement(block->bmap, block->bmap, nbits);
4194
David Hildenbrandbe39b4c2021-10-11 19:53:41 +02004195 /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4196 ramblock_dirty_bitmap_clear_discarded_pages(block);
4197
4198 /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
Peter Xua335deb2018-05-02 18:47:28 +08004199 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4200
Peter Xuedd090c2018-05-02 18:47:32 +08004201 /*
 4202 * We succeeded in syncing the bitmap for the current ramblock. If this is
4203 * the last one to sync, we need to notify the main send thread.
4204 */
4205 ram_dirty_bitmap_reload_notify(s);
4206
Peter Xua335deb2018-05-02 18:47:28 +08004207 ret = 0;
4208out:
Peter Xubf269902018-05-25 09:50:42 +08004209 g_free(le_bitmap);
Peter Xua335deb2018-05-02 18:47:28 +08004210 return ret;
4211}
4212
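/*
 * The reload path above turns the destination's "received" bitmap into
 * the dirty bitmap to resume from: pages already received are clean,
 * everything else still has to be sent, hence the complement after the
 * little-endian conversion.  A condensed sketch of just that
 * transformation, with hypothetical parameter names.
 */
static void G_GNUC_UNUSED
example_recv_bitmap_to_dirty(unsigned long *dirty, unsigned long *le_received,
                             unsigned long nbits)
{
    /* The wire format is little-endian; convert to host layout first. */
    bitmap_from_le(dirty, le_received, nbits);
    /* Received => clean, not yet received => dirty. */
    bitmap_complement(dirty, dirty, nbits);
}
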
Peter Xuedd090c2018-05-02 18:47:32 +08004213static int ram_resume_prepare(MigrationState *s, void *opaque)
4214{
4215 RAMState *rs = *(RAMState **)opaque;
Peter Xu08614f32018-05-02 18:47:33 +08004216 int ret;
Peter Xuedd090c2018-05-02 18:47:32 +08004217
Peter Xu08614f32018-05-02 18:47:33 +08004218 ret = ram_dirty_bitmap_sync_all(s, rs);
4219 if (ret) {
4220 return ret;
4221 }
4222
4223 ram_state_resume_prepare(rs, s->to_dst_file);
4224
4225 return 0;
Peter Xuedd090c2018-05-02 18:47:32 +08004226}
4227
Peter Xu36f62f12022-07-07 14:55:02 -04004228void postcopy_preempt_shutdown_file(MigrationState *s)
4229{
4230 qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
4231 qemu_fflush(s->postcopy_qemufile_src);
4232}
4233
Juan Quintela56e93d22015-05-07 19:33:31 +02004234static SaveVMHandlers savevm_ram_handlers = {
Juan Quintela9907e842017-06-28 11:52:24 +02004235 .save_setup = ram_save_setup,
Juan Quintela56e93d22015-05-07 19:33:31 +02004236 .save_live_iterate = ram_save_iterate,
Dr. David Alan Gilbert763c9062015-11-05 18:11:00 +00004237 .save_live_complete_postcopy = ram_save_complete,
Dr. David Alan Gilberta3e06c32015-11-05 18:10:41 +00004238 .save_live_complete_precopy = ram_save_complete,
Vladimir Sementsov-Ogievskiyc6467622017-07-10 19:30:14 +03004239 .has_postcopy = ram_has_postcopy,
Juan Quintelac8df4a72022-10-03 02:00:03 +02004240 .state_pending_exact = ram_state_pending_exact,
4241 .state_pending_estimate = ram_state_pending_estimate,
Juan Quintela56e93d22015-05-07 19:33:31 +02004242 .load_state = ram_load,
Juan Quintelaf265e0e2017-06-28 11:52:27 +02004243 .save_cleanup = ram_save_cleanup,
4244 .load_setup = ram_load_setup,
4245 .load_cleanup = ram_load_cleanup,
Peter Xuedd090c2018-05-02 18:47:32 +08004246 .resume_prepare = ram_resume_prepare,
Juan Quintela56e93d22015-05-07 19:33:31 +02004247};
4248
David Hildenbrandc7c0e722021-04-29 13:27:02 +02004249static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4250 size_t old_size, size_t new_size)
4251{
David Hildenbrandcc61c702021-04-29 13:27:04 +02004252 PostcopyState ps = postcopy_state_get();
David Hildenbrandc7c0e722021-04-29 13:27:02 +02004253 ram_addr_t offset;
4254 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4255 Error *err = NULL;
4256
David Hildenbrandf161c882023-07-06 09:56:08 +02004257 if (migrate_ram_is_ignored(rb)) {
David Hildenbrandc7c0e722021-04-29 13:27:02 +02004258 return;
4259 }
4260
4261 if (!migration_is_idle()) {
4262 /*
4263 * Precopy code on the source cannot deal with the size of RAM blocks
4264 * changing at random points in time - especially after sending the
4265 * RAM block sizes in the migration stream, they must no longer change.
4266 * Abort and indicate a proper reason.
4267 */
4268 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
Laurent Vivier458fecc2021-09-29 16:43:10 +02004269 migration_cancel(err);
David Hildenbrandc7c0e722021-04-29 13:27:02 +02004270 error_free(err);
David Hildenbrandc7c0e722021-04-29 13:27:02 +02004271 }
David Hildenbrandcc61c702021-04-29 13:27:04 +02004272
4273 switch (ps) {
4274 case POSTCOPY_INCOMING_ADVISE:
4275 /*
4276 * Update what ram_postcopy_incoming_init()->init_range() does at the
4277 * time postcopy was advised. Syncing RAM blocks with the source will
4278 * result in RAM resizes.
4279 */
4280 if (old_size < new_size) {
4281 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4282 error_report("RAM block '%s' discard of resized RAM failed",
4283 rb->idstr);
4284 }
4285 }
David Hildenbrand898ba902021-04-29 13:27:06 +02004286 rb->postcopy_length = new_size;
David Hildenbrandcc61c702021-04-29 13:27:04 +02004287 break;
4288 case POSTCOPY_INCOMING_NONE:
4289 case POSTCOPY_INCOMING_RUNNING:
4290 case POSTCOPY_INCOMING_END:
4291 /*
 4292 * Once our guest is running, postcopy no longer cares about
 4293 * resizes. When growing, the new memory was not available on the
 4294 * source, so no handler is needed.
4295 */
4296 break;
4297 default:
4298 error_report("RAM block '%s' resized during postcopy state: %d",
4299 rb->idstr, ps);
4300 exit(-1);
4301 }
David Hildenbrandc7c0e722021-04-29 13:27:02 +02004302}
4303
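/*
 * In the POSTCOPY_INCOMING_ADVISE case above, growth is handled by
 * discarding the newly added tail so that it behaves like the rest of
 * the not-yet-received RAM.  A reduced sketch of that branch alone,
 * reusing the same ram_discard_range() call as above; the wrapper name
 * is illustrative and not part of the migration code.
 */
static void G_GNUC_UNUSED
example_discard_grown_tail(RAMBlock *rb, size_t old_size, size_t new_size)
{
    if (old_size < new_size &&
        ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
        error_report("RAM block '%s' discard of resized RAM failed",
                     rb->idstr);
    }
}
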
4304static RAMBlockNotifier ram_mig_ram_notifier = {
4305 .ram_block_resized = ram_mig_ram_block_resized,
4306};
4307
Juan Quintela56e93d22015-05-07 19:33:31 +02004308void ram_mig_init(void)
4309{
4310 qemu_mutex_init(&XBZRLE.lock);
Dr. David Alan Gilbertce62df52019-08-22 12:54:33 +01004311 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
David Hildenbrandc7c0e722021-04-29 13:27:02 +02004312 ram_block_notifier_add(&ram_mig_ram_notifier);
Juan Quintela56e93d22015-05-07 19:33:31 +02004313}