Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 1 | /* |
| 2 | * Write logging blk driver based on blkverify and blkdebug. |
| 3 | * |
| 4 | * Copyright (c) 2017 Tuomas Tynkkynen <tuomas@tuxera.com> |
| 5 | * Copyright (c) 2018 Aapo Vienamo <aapo@tuxera.com> |
Ari Sundholm | a26d018 | 2024-01-19 18:29:13 +0200 | [diff] [blame] | 6 | * Copyright (c) 2018-2024 Ari Sundholm <ari@tuxera.com> |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 7 | * |
| 8 | * This work is licensed under the terms of the GNU GPL, version 2 or later. |
| 9 | * See the COPYING file in the top-level directory. |
| 10 | */ |
| 11 | |
| 12 | #include "qemu/osdep.h" |
| 13 | #include "qapi/error.h" |
| 14 | #include "qemu/sockets.h" /* for EINPROGRESS on Windows */ |
Markus Armbruster | e2c1c34 | 2022-12-21 14:35:49 +0100 | [diff] [blame] | 15 | #include "block/block-io.h" |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 16 | #include "block/block_int.h" |
| 17 | #include "qapi/qmp/qdict.h" |
| 18 | #include "qapi/qmp/qstring.h" |
| 19 | #include "qemu/cutils.h" |
Markus Armbruster | 0b8fa32 | 2019-05-23 16:35:07 +0200 | [diff] [blame] | 20 | #include "qemu/module.h" |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 21 | #include "qemu/option.h" |
| 22 | |
| 23 | /* Disk format stuff - taken from Linux drivers/md/dm-log-writes.c */ |
| 24 | |
/* Flag bits carried in the flags field of a log entry header. */
#define LOG_FLUSH_FLAG (1 << 0)
#define LOG_FUA_FLAG (1 << 1)
#define LOG_DISCARD_FLAG (1 << 2)
#define LOG_MARK_FLAG (1 << 3)
/*
 * All flag bits known to this driver. An entry with any other bit set is
 * rejected when an existing log is scanned.
 */
#define LOG_FLAG_MASK (LOG_FLUSH_FLAG \
                       | LOG_FUA_FLAG \
                       | LOG_DISCARD_FLAG \
                       | LOG_MARK_FLAG)

/* Values stored in, and expected from, the superblock version/magic fields. */
#define WRITE_LOG_VERSION 1ULL
#define WRITE_LOG_MAGIC 0x6a736677736872ULL
| 36 | |
| 37 | /* All fields are little-endian. */ |
/*
 * Log superblock. It is read from and written to offset 0 of the log file;
 * when written, it is padded with zeroes to a full log sector.
 */
struct log_write_super {
    uint64_t magic;         /* WRITE_LOG_MAGIC */
    uint64_t version;       /* WRITE_LOG_VERSION */
    uint64_t nr_entries;    /* entry count at the time of the update */
    uint32_t sectorsize;    /* log sector size in bytes */
} QEMU_PACKED;
| 44 | |
/*
 * Header of a single log entry. This driver fills the fields in
 * little-endian byte order and pads the header with zeroes to a full log
 * sector before writing it.
 */
struct log_write_entry {
    uint64_t sector;        /* request offset, in units of log sectors */
    uint64_t nr_sectors;    /* request length, in units of log sectors */
    uint64_t flags;         /* combination of LOG_*_FLAG bits */
    uint64_t data_len;      /* always written as 0 by this driver */
} QEMU_PACKED;
| 51 | |
| 52 | /* End of disk format structures. */ |
| 53 | |
/* Per-node driver state (BlockDriverState.opaque). */
typedef struct {
    /* Child opened from the "log" option; receives entries and superblock. */
    BdrvChild *log_file;
    /* Log sector size in bytes; also used as the node's request alignment. */
    uint32_t sectorsize;
    /* log2(sectorsize), used to convert between bytes and log sectors. */
    uint32_t sectorbits;
    /*
     * The superblock is rewritten whenever the sequence number of a log entry
     * is a multiple of this value. Never zero once the node is open.
     */
    uint64_t update_interval;

    /*
     * The mutable state of the driver, consisting of the current log sector
     * and the number of log entries.
     *
     * May be read and/or written from multiple threads, and the mutex must be
     * held when accessing these fields.
     */
    uint64_t cur_log_sector;
    uint64_t nr_entries;
    QemuMutex mutex;

    /*
     * The super block sequence number. Non-zero if a super block update is in
     * progress.
     *
     * The mutex must be held when accessing this field.
     */
    uint64_t super_update_seq;

    /*
     * A coroutine-aware queue to serialize super block updates.
     *
     * Used with the mutex to ensure that only one thread be updating the super
     * block at a time.
     */
    CoQueue super_update_queue;
} BDRVBlkLogWritesState;
| 87 | |
/*
 * Options consumed by blk_log_writes_open():
 *  - log-append: continue an existing log (defaults to false);
 *  - log-sector-size: sector size of a new log (defaults to
 *    BDRV_SECTOR_SIZE); rejected together with log-append;
 *  - log-super-update-interval: number of log entries between periodic
 *    superblock updates (defaults to 4096, must not be 0).
 */
static QemuOptsList runtime_opts = {
    .name = "blklogwrites",
    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
    .desc = {
        {
            .name = "log-append",
            .type = QEMU_OPT_BOOL,
            .help = "Append to an existing log",
        },
        {
            .name = "log-sector-size",
            .type = QEMU_OPT_SIZE,
            .help = "Log sector size",
        },
        {
            .name = "log-super-update-interval",
            .type = QEMU_OPT_NUMBER,
            .help = "Log superblock update interval (# of write requests)",
        },
        { /* end of list */ }
    },
};
| 110 | |
/*
 * Return the index of the highest set bit of @value, i.e. floor(log2(value)).
 * @value must be non-zero.
 */
static inline uint32_t blk_log_writes_log2(uint32_t value)
{
    int leading_zeroes;

    assert(value > 0);

    leading_zeroes = clz32(value);
    return 31 - leading_zeroes;
}
| 116 | |
Ari Sundholm | 0878b3c | 2018-07-04 17:59:35 +0300 | [diff] [blame] | 117 | static inline bool blk_log_writes_sector_size_valid(uint32_t sector_size) |
| 118 | { |
Ari Sundholm | ba814c8 | 2018-07-06 15:00:38 +0300 | [diff] [blame] | 119 | return is_power_of_2(sector_size) && |
| 120 | sector_size >= sizeof(struct log_write_super) && |
| 121 | sector_size >= sizeof(struct log_write_entry) && |
| 122 | sector_size < (1ull << 24); |
Ari Sundholm | 0878b3c | 2018-07-04 17:59:35 +0300 | [diff] [blame] | 123 | } |
| 124 | |
| 125 | static uint64_t blk_log_writes_find_cur_log_sector(BdrvChild *log, |
| 126 | uint32_t sector_size, |
| 127 | uint64_t nr_entries, |
| 128 | Error **errp) |
| 129 | { |
| 130 | uint64_t cur_sector = 1; |
| 131 | uint64_t cur_idx = 0; |
| 132 | uint32_t sector_bits = blk_log_writes_log2(sector_size); |
| 133 | struct log_write_entry cur_entry; |
| 134 | |
| 135 | while (cur_idx < nr_entries) { |
Alberto Faria | 32cc71d | 2022-06-09 16:27:36 +0100 | [diff] [blame] | 136 | int read_ret = bdrv_pread(log, cur_sector << sector_bits, |
| 137 | sizeof(cur_entry), &cur_entry, 0); |
Ari Sundholm | 0878b3c | 2018-07-04 17:59:35 +0300 | [diff] [blame] | 138 | if (read_ret < 0) { |
| 139 | error_setg_errno(errp, -read_ret, |
| 140 | "Failed to read log entry %"PRIu64, cur_idx); |
| 141 | return (uint64_t)-1ull; |
| 142 | } |
| 143 | |
| 144 | if (cur_entry.flags & ~cpu_to_le64(LOG_FLAG_MASK)) { |
| 145 | error_setg(errp, "Invalid flags 0x%"PRIx64" in log entry %"PRIu64, |
| 146 | le64_to_cpu(cur_entry.flags), cur_idx); |
| 147 | return (uint64_t)-1ull; |
| 148 | } |
| 149 | |
| 150 | /* Account for the sector of the entry itself */ |
| 151 | ++cur_sector; |
| 152 | |
| 153 | /* |
| 154 | * Account for the data of the write. |
| 155 | * For discards, this data is not present. |
| 156 | */ |
| 157 | if (!(cur_entry.flags & cpu_to_le64(LOG_DISCARD_FLAG))) { |
| 158 | cur_sector += le64_to_cpu(cur_entry.nr_sectors); |
| 159 | } |
| 160 | |
| 161 | ++cur_idx; |
| 162 | } |
| 163 | |
| 164 | return cur_sector; |
| 165 | } |
| 166 | |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 167 | static int blk_log_writes_open(BlockDriverState *bs, QDict *options, int flags, |
| 168 | Error **errp) |
| 169 | { |
| 170 | BDRVBlkLogWritesState *s = bs->opaque; |
| 171 | QemuOpts *opts; |
| 172 | Error *local_err = NULL; |
| 173 | int ret; |
Ari Sundholm | 2dacaf7 | 2018-07-04 17:59:34 +0300 | [diff] [blame] | 174 | uint64_t log_sector_size; |
Ari Sundholm | 0878b3c | 2018-07-04 17:59:35 +0300 | [diff] [blame] | 175 | bool log_append; |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 176 | |
| 177 | opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); |
Markus Armbruster | af175e8 | 2020-07-07 18:06:03 +0200 | [diff] [blame] | 178 | if (!qemu_opts_absorb_qdict(opts, options, errp)) { |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 179 | ret = -EINVAL; |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 180 | goto fail; |
| 181 | } |
| 182 | |
| 183 | /* Open the file */ |
Vladimir Sementsov-Ogievskiy | 8393078 | 2022-07-26 23:11:21 +0300 | [diff] [blame] | 184 | ret = bdrv_open_file_child(NULL, options, "file", bs, errp); |
| 185 | if (ret < 0) { |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 186 | goto fail; |
| 187 | } |
| 188 | |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 189 | /* Open the log file */ |
Max Reitz | 5894440 | 2020-05-13 13:05:37 +0200 | [diff] [blame] | 190 | s->log_file = bdrv_open_child(NULL, options, "log", bs, &child_of_bds, |
Vladimir Sementsov-Ogievskiy | bc52024 | 2021-02-02 15:49:45 +0300 | [diff] [blame] | 191 | BDRV_CHILD_METADATA, false, errp); |
| 192 | if (!s->log_file) { |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 193 | ret = -EINVAL; |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 194 | goto fail; |
| 195 | } |
| 196 | |
Ari Sundholm | a26d018 | 2024-01-19 18:29:13 +0200 | [diff] [blame] | 197 | qemu_mutex_init(&s->mutex); |
| 198 | qemu_co_queue_init(&s->super_update_queue); |
| 199 | |
Ari Sundholm | 0878b3c | 2018-07-04 17:59:35 +0300 | [diff] [blame] | 200 | log_append = qemu_opt_get_bool(opts, "log-append", false); |
| 201 | |
| 202 | if (log_append) { |
| 203 | struct log_write_super log_sb = { 0, 0, 0, 0 }; |
| 204 | |
| 205 | if (qemu_opt_find(opts, "log-sector-size")) { |
| 206 | ret = -EINVAL; |
| 207 | error_setg(errp, "log-append and log-sector-size are mutually " |
| 208 | "exclusive"); |
| 209 | goto fail_log; |
| 210 | } |
| 211 | |
| 212 | /* Read log superblock or fake one for an empty log */ |
| 213 | if (!bdrv_getlength(s->log_file->bs)) { |
| 214 | log_sb.magic = cpu_to_le64(WRITE_LOG_MAGIC); |
| 215 | log_sb.version = cpu_to_le64(WRITE_LOG_VERSION); |
| 216 | log_sb.nr_entries = cpu_to_le64(0); |
| 217 | log_sb.sectorsize = cpu_to_le32(BDRV_SECTOR_SIZE); |
| 218 | } else { |
Alberto Faria | 32cc71d | 2022-06-09 16:27:36 +0100 | [diff] [blame] | 219 | ret = bdrv_pread(s->log_file, 0, sizeof(log_sb), &log_sb, 0); |
Ari Sundholm | 0878b3c | 2018-07-04 17:59:35 +0300 | [diff] [blame] | 220 | if (ret < 0) { |
| 221 | error_setg_errno(errp, -ret, "Could not read log superblock"); |
| 222 | goto fail_log; |
| 223 | } |
| 224 | } |
| 225 | |
| 226 | if (log_sb.magic != cpu_to_le64(WRITE_LOG_MAGIC)) { |
| 227 | ret = -EINVAL; |
| 228 | error_setg(errp, "Invalid log superblock magic"); |
| 229 | goto fail_log; |
| 230 | } |
| 231 | |
| 232 | if (log_sb.version != cpu_to_le64(WRITE_LOG_VERSION)) { |
| 233 | ret = -EINVAL; |
| 234 | error_setg(errp, "Unsupported log version %"PRIu64, |
| 235 | le64_to_cpu(log_sb.version)); |
| 236 | goto fail_log; |
| 237 | } |
| 238 | |
| 239 | log_sector_size = le32_to_cpu(log_sb.sectorsize); |
| 240 | s->cur_log_sector = 1; |
| 241 | s->nr_entries = 0; |
| 242 | |
| 243 | if (blk_log_writes_sector_size_valid(log_sector_size)) { |
| 244 | s->cur_log_sector = |
| 245 | blk_log_writes_find_cur_log_sector(s->log_file, log_sector_size, |
| 246 | le64_to_cpu(log_sb.nr_entries), &local_err); |
| 247 | if (local_err) { |
| 248 | ret = -EINVAL; |
| 249 | error_propagate(errp, local_err); |
| 250 | goto fail_log; |
| 251 | } |
| 252 | |
| 253 | s->nr_entries = le64_to_cpu(log_sb.nr_entries); |
| 254 | } |
| 255 | } else { |
| 256 | log_sector_size = qemu_opt_get_size(opts, "log-sector-size", |
| 257 | BDRV_SECTOR_SIZE); |
| 258 | s->cur_log_sector = 1; |
| 259 | s->nr_entries = 0; |
| 260 | } |
| 261 | |
Ari Sundholm | a26d018 | 2024-01-19 18:29:13 +0200 | [diff] [blame] | 262 | s->super_update_seq = 0; |
| 263 | |
Ari Sundholm | 0878b3c | 2018-07-04 17:59:35 +0300 | [diff] [blame] | 264 | if (!blk_log_writes_sector_size_valid(log_sector_size)) { |
| 265 | ret = -EINVAL; |
| 266 | error_setg(errp, "Invalid log sector size %"PRIu64, log_sector_size); |
| 267 | goto fail_log; |
| 268 | } |
| 269 | |
| 270 | s->sectorsize = log_sector_size; |
| 271 | s->sectorbits = blk_log_writes_log2(log_sector_size); |
Ari Sundholm | 1dce698 | 2018-07-04 17:59:36 +0300 | [diff] [blame] | 272 | s->update_interval = qemu_opt_get_number(opts, "log-super-update-interval", |
| 273 | 4096); |
| 274 | if (!s->update_interval) { |
| 275 | ret = -EINVAL; |
| 276 | error_setg(errp, "Invalid log superblock update interval %"PRIu64, |
| 277 | s->update_interval); |
| 278 | goto fail_log; |
| 279 | } |
Ari Sundholm | 0878b3c | 2018-07-04 17:59:35 +0300 | [diff] [blame] | 280 | |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 281 | ret = 0; |
Ari Sundholm | 0878b3c | 2018-07-04 17:59:35 +0300 | [diff] [blame] | 282 | fail_log: |
| 283 | if (ret < 0) { |
Stefan Hajnoczi | 6bc30f1 | 2023-12-05 13:20:02 -0500 | [diff] [blame] | 284 | bdrv_graph_wrlock(); |
Ari Sundholm | 0878b3c | 2018-07-04 17:59:35 +0300 | [diff] [blame] | 285 | bdrv_unref_child(bs, s->log_file); |
Stefan Hajnoczi | 6bc30f1 | 2023-12-05 13:20:02 -0500 | [diff] [blame] | 286 | bdrv_graph_wrunlock(); |
Ari Sundholm | 0878b3c | 2018-07-04 17:59:35 +0300 | [diff] [blame] | 287 | s->log_file = NULL; |
Ari Sundholm | a26d018 | 2024-01-19 18:29:13 +0200 | [diff] [blame] | 288 | qemu_mutex_destroy(&s->mutex); |
Ari Sundholm | 0878b3c | 2018-07-04 17:59:35 +0300 | [diff] [blame] | 289 | } |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 290 | fail: |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 291 | qemu_opts_del(opts); |
| 292 | return ret; |
| 293 | } |
| 294 | |
| 295 | static void blk_log_writes_close(BlockDriverState *bs) |
| 296 | { |
| 297 | BDRVBlkLogWritesState *s = bs->opaque; |
| 298 | |
Stefan Hajnoczi | 6bc30f1 | 2023-12-05 13:20:02 -0500 | [diff] [blame] | 299 | bdrv_graph_wrlock(); |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 300 | bdrv_unref_child(bs, s->log_file); |
| 301 | s->log_file = NULL; |
Stefan Hajnoczi | 6bc30f1 | 2023-12-05 13:20:02 -0500 | [diff] [blame] | 302 | bdrv_graph_wrunlock(); |
Ari Sundholm | a26d018 | 2024-01-19 18:29:13 +0200 | [diff] [blame] | 303 | qemu_mutex_destroy(&s->mutex); |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 304 | } |
| 305 | |
Kevin Wolf | 8ab8140 | 2023-02-03 16:22:02 +0100 | [diff] [blame] | 306 | static int64_t coroutine_fn GRAPH_RDLOCK |
| 307 | blk_log_writes_co_getlength(BlockDriverState *bs) |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 308 | { |
Emanuele Giuseppe Esposito | c86422c | 2023-01-13 21:42:04 +0100 | [diff] [blame] | 309 | return bdrv_co_getlength(bs->file->bs); |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 310 | } |
| 311 | |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 312 | static void blk_log_writes_child_perm(BlockDriverState *bs, BdrvChild *c, |
Max Reitz | bf8e925 | 2020-05-13 13:05:16 +0200 | [diff] [blame] | 313 | BdrvChildRole role, |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 314 | BlockReopenQueue *ro_q, |
| 315 | uint64_t perm, uint64_t shrd, |
| 316 | uint64_t *nperm, uint64_t *nshrd) |
| 317 | { |
| 318 | if (!c) { |
| 319 | *nperm = perm & DEFAULT_PERM_PASSTHROUGH; |
| 320 | *nshrd = (shrd & DEFAULT_PERM_PASSTHROUGH) | DEFAULT_PERM_UNCHANGED; |
| 321 | return; |
| 322 | } |
| 323 | |
Max Reitz | e5d8a40 | 2020-05-13 13:05:44 +0200 | [diff] [blame] | 324 | bdrv_default_perms(bs, c, role, ro_q, perm, shrd, |
Max Reitz | 69dca43 | 2020-05-13 13:05:39 +0200 | [diff] [blame] | 325 | nperm, nshrd); |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 326 | } |
| 327 | |
| 328 | static void blk_log_writes_refresh_limits(BlockDriverState *bs, Error **errp) |
| 329 | { |
Ari Sundholm | a26d018 | 2024-01-19 18:29:13 +0200 | [diff] [blame] | 330 | const BDRVBlkLogWritesState *s = bs->opaque; |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 331 | bs->bl.request_alignment = s->sectorsize; |
| 332 | } |
| 333 | |
Kevin Wolf | b9b10c3 | 2023-02-03 16:21:50 +0100 | [diff] [blame] | 334 | static int coroutine_fn GRAPH_RDLOCK |
Vladimir Sementsov-Ogievskiy | f7ef38d | 2021-09-03 13:27:59 +0300 | [diff] [blame] | 335 | blk_log_writes_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes, |
| 336 | QEMUIOVector *qiov, BdrvRequestFlags flags) |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 337 | { |
| 338 | return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags); |
| 339 | } |
| 340 | |
/*
 * Description of the operation to perform on the "file" child for one logged
 * request, together with its result.
 */
typedef struct BlkLogWritesFileReq {
    BlockDriverState *bs;   /* this node; the operation targets bs->file */
    uint64_t offset;        /* request offset in bytes */
    uint64_t bytes;         /* request length in bytes */
    int file_flags;         /* request flags handed on to the child */
    QEMUIOVector *qiov;     /* data to write; NULL when there is none */
    /* Callback that carries out the actual operation on the child. */
    int GRAPH_RDLOCK_PTR (*func)(struct BlkLogWritesFileReq *r);
    int file_ret;           /* return value of func */
} BlkLogWritesFileReq;
| 350 | |
/*
 * Description of the log entry to write for one logged request, together
 * with the result of writing it.
 */
typedef struct {
    BlockDriverState *bs;           /* this node */
    /* Entry header, padding to a full log sector, then any request data. */
    QEMUIOVector *qiov;
    struct log_write_entry entry;   /* entry header, fields little-endian */
    /* For "write zeroes": bytes to zero in the log after qiov; else 0. */
    uint64_t zero_size;
    int log_ret;                    /* result of writing to the log */
} BlkLogWritesLogReq;
| 358 | |
Emanuele Giuseppe Esposito | 8809534 | 2023-02-03 16:21:46 +0100 | [diff] [blame] | 359 | static void coroutine_fn GRAPH_RDLOCK |
| 360 | blk_log_writes_co_do_log(BlkLogWritesLogReq *lr) |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 361 | { |
| 362 | BDRVBlkLogWritesState *s = lr->bs->opaque; |
Ari Sundholm | a9c8ea9 | 2024-01-09 20:46:46 +0200 | [diff] [blame] | 363 | |
| 364 | /* |
| 365 | * Determine the offsets and sizes of different parts of the entry, and |
| 366 | * update the state of the driver. |
| 367 | * |
| 368 | * This needs to be done in one go, before any actual I/O is done, as the |
| 369 | * log entry may have to be written in two parts, and the state of the |
| 370 | * driver may be modified by other driver operations while waiting for the |
| 371 | * I/O to complete. |
| 372 | */ |
Ari Sundholm | a26d018 | 2024-01-19 18:29:13 +0200 | [diff] [blame] | 373 | qemu_mutex_lock(&s->mutex); |
Ari Sundholm | a9c8ea9 | 2024-01-09 20:46:46 +0200 | [diff] [blame] | 374 | const uint64_t entry_start_sector = s->cur_log_sector; |
| 375 | const uint64_t entry_offset = entry_start_sector << s->sectorbits; |
| 376 | const uint64_t qiov_aligned_size = ROUND_UP(lr->qiov->size, s->sectorsize); |
| 377 | const uint64_t entry_aligned_size = qiov_aligned_size + |
| 378 | ROUND_UP(lr->zero_size, s->sectorsize); |
| 379 | const uint64_t entry_nr_sectors = entry_aligned_size >> s->sectorbits; |
Ari Sundholm | a26d018 | 2024-01-19 18:29:13 +0200 | [diff] [blame] | 380 | const uint64_t entry_seq = s->nr_entries + 1; |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 381 | |
Ari Sundholm | a26d018 | 2024-01-19 18:29:13 +0200 | [diff] [blame] | 382 | s->nr_entries = entry_seq; |
Ari Sundholm | a9c8ea9 | 2024-01-09 20:46:46 +0200 | [diff] [blame] | 383 | s->cur_log_sector += entry_nr_sectors; |
Ari Sundholm | a26d018 | 2024-01-19 18:29:13 +0200 | [diff] [blame] | 384 | qemu_mutex_unlock(&s->mutex); |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 385 | |
Ari Sundholm | a9c8ea9 | 2024-01-09 20:46:46 +0200 | [diff] [blame] | 386 | /* |
| 387 | * Write the log entry. Note that if this is a "write zeroes" operation, |
| 388 | * only the entry header is written here, with the zeroing being done |
| 389 | * separately below. |
| 390 | */ |
| 391 | lr->log_ret = bdrv_co_pwritev(s->log_file, entry_offset, lr->qiov->size, |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 392 | lr->qiov, 0); |
| 393 | |
| 394 | /* Logging for the "write zeroes" operation */ |
| 395 | if (lr->log_ret == 0 && lr->zero_size) { |
Ari Sundholm | a9c8ea9 | 2024-01-09 20:46:46 +0200 | [diff] [blame] | 396 | const uint64_t zeroes_offset = entry_offset + qiov_aligned_size; |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 397 | |
Ari Sundholm | a9c8ea9 | 2024-01-09 20:46:46 +0200 | [diff] [blame] | 398 | lr->log_ret = bdrv_co_pwrite_zeroes(s->log_file, zeroes_offset, |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 399 | lr->zero_size, 0); |
| 400 | } |
| 401 | |
Ari Sundholm | 1dce698 | 2018-07-04 17:59:36 +0300 | [diff] [blame] | 402 | /* Update super block on flush or every update interval */ |
| 403 | if (lr->log_ret == 0 && ((lr->entry.flags & LOG_FLUSH_FLAG) |
Ari Sundholm | a26d018 | 2024-01-19 18:29:13 +0200 | [diff] [blame] | 404 | || (entry_seq % s->update_interval == 0))) |
Ari Sundholm | 1dce698 | 2018-07-04 17:59:36 +0300 | [diff] [blame] | 405 | { |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 406 | struct log_write_super super = { |
| 407 | .magic = cpu_to_le64(WRITE_LOG_MAGIC), |
| 408 | .version = cpu_to_le64(WRITE_LOG_VERSION), |
Ari Sundholm | a26d018 | 2024-01-19 18:29:13 +0200 | [diff] [blame] | 409 | .nr_entries = 0, /* updated below */ |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 410 | .sectorsize = cpu_to_le32(s->sectorsize), |
| 411 | }; |
Ari Sundholm | a26d018 | 2024-01-19 18:29:13 +0200 | [diff] [blame] | 412 | void *zeroes; |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 413 | QEMUIOVector qiov; |
| 414 | |
Ari Sundholm | a26d018 | 2024-01-19 18:29:13 +0200 | [diff] [blame] | 415 | /* |
| 416 | * Wait if a super block update is already in progress. |
| 417 | * Bail out if a newer update got its turn before us. |
| 418 | */ |
| 419 | WITH_QEMU_LOCK_GUARD(&s->mutex) { |
| 420 | CoQueueWaitFlags wait_flags = 0; |
| 421 | while (s->super_update_seq) { |
| 422 | if (entry_seq < s->super_update_seq) { |
| 423 | return; |
| 424 | } |
| 425 | qemu_co_queue_wait_flags(&s->super_update_queue, |
| 426 | &s->mutex, wait_flags); |
| 427 | |
| 428 | /* |
| 429 | * In case the wait condition remains true after wakeup, |
| 430 | * to avoid starvation, make sure that this request is |
| 431 | * scheduled to rerun next by pushing it to the front of the |
| 432 | * queue. |
| 433 | */ |
| 434 | wait_flags = CO_QUEUE_WAIT_FRONT; |
| 435 | } |
| 436 | s->super_update_seq = entry_seq; |
| 437 | super.nr_entries = cpu_to_le64(s->nr_entries); |
| 438 | } |
| 439 | |
| 440 | zeroes = g_malloc0(s->sectorsize - sizeof(super)); |
| 441 | |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 442 | qemu_iovec_init(&qiov, 2); |
| 443 | qemu_iovec_add(&qiov, &super, sizeof(super)); |
| 444 | qemu_iovec_add(&qiov, zeroes, s->sectorsize - sizeof(super)); |
| 445 | |
| 446 | lr->log_ret = |
| 447 | bdrv_co_pwritev(s->log_file, 0, s->sectorsize, &qiov, 0); |
| 448 | if (lr->log_ret == 0) { |
| 449 | lr->log_ret = bdrv_co_flush(s->log_file->bs); |
| 450 | } |
Ari Sundholm | a26d018 | 2024-01-19 18:29:13 +0200 | [diff] [blame] | 451 | |
| 452 | /* The super block has been updated. Let another request have a go. */ |
| 453 | qemu_mutex_lock(&s->mutex); |
| 454 | s->super_update_seq = 0; |
| 455 | (void) qemu_co_queue_next(&s->super_update_queue); |
| 456 | qemu_mutex_unlock(&s->mutex); |
| 457 | |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 458 | qemu_iovec_destroy(&qiov); |
| 459 | g_free(zeroes); |
| 460 | } |
| 461 | } |
| 462 | |
Emanuele Giuseppe Esposito | 8809534 | 2023-02-03 16:21:46 +0100 | [diff] [blame] | 463 | static void coroutine_fn GRAPH_RDLOCK |
| 464 | blk_log_writes_co_do_file(BlkLogWritesFileReq *fr) |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 465 | { |
| 466 | fr->file_ret = fr->func(fr); |
| 467 | } |
| 468 | |
Emanuele Giuseppe Esposito | 8809534 | 2023-02-03 16:21:46 +0100 | [diff] [blame] | 469 | static int coroutine_fn GRAPH_RDLOCK |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 470 | blk_log_writes_co_log(BlockDriverState *bs, uint64_t offset, uint64_t bytes, |
| 471 | QEMUIOVector *qiov, int flags, |
Emanuele Giuseppe Esposito | 8809534 | 2023-02-03 16:21:46 +0100 | [diff] [blame] | 472 | int /*GRAPH_RDLOCK*/ (*file_func)(BlkLogWritesFileReq *r), |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 473 | uint64_t entry_flags, bool is_zero_write) |
| 474 | { |
| 475 | QEMUIOVector log_qiov; |
| 476 | size_t niov = qiov ? qiov->niov : 0; |
Ari Sundholm | a26d018 | 2024-01-19 18:29:13 +0200 | [diff] [blame] | 477 | const BDRVBlkLogWritesState *s = bs->opaque; |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 478 | BlkLogWritesFileReq fr = { |
| 479 | .bs = bs, |
| 480 | .offset = offset, |
| 481 | .bytes = bytes, |
| 482 | .file_flags = flags, |
| 483 | .qiov = qiov, |
| 484 | .func = file_func, |
| 485 | }; |
| 486 | BlkLogWritesLogReq lr = { |
| 487 | .bs = bs, |
| 488 | .qiov = &log_qiov, |
| 489 | .entry = { |
| 490 | .sector = cpu_to_le64(offset >> s->sectorbits), |
| 491 | .nr_sectors = cpu_to_le64(bytes >> s->sectorbits), |
| 492 | .flags = cpu_to_le64(entry_flags), |
| 493 | .data_len = 0, |
| 494 | }, |
| 495 | .zero_size = is_zero_write ? bytes : 0, |
| 496 | }; |
| 497 | void *zeroes = g_malloc0(s->sectorsize - sizeof(lr.entry)); |
| 498 | |
| 499 | assert((1 << s->sectorbits) == s->sectorsize); |
| 500 | assert(bs->bl.request_alignment == s->sectorsize); |
| 501 | assert(QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)); |
| 502 | assert(QEMU_IS_ALIGNED(bytes, bs->bl.request_alignment)); |
| 503 | |
| 504 | qemu_iovec_init(&log_qiov, niov + 2); |
| 505 | qemu_iovec_add(&log_qiov, &lr.entry, sizeof(lr.entry)); |
| 506 | qemu_iovec_add(&log_qiov, zeroes, s->sectorsize - sizeof(lr.entry)); |
| 507 | if (qiov) { |
| 508 | qemu_iovec_concat(&log_qiov, qiov, 0, qiov->size); |
| 509 | } |
| 510 | |
| 511 | blk_log_writes_co_do_file(&fr); |
| 512 | blk_log_writes_co_do_log(&lr); |
| 513 | |
| 514 | qemu_iovec_destroy(&log_qiov); |
| 515 | g_free(zeroes); |
| 516 | |
| 517 | if (lr.log_ret < 0) { |
| 518 | return lr.log_ret; |
| 519 | } |
| 520 | |
| 521 | return fr.file_ret; |
| 522 | } |
| 523 | |
Kevin Wolf | b9b10c3 | 2023-02-03 16:21:50 +0100 | [diff] [blame] | 524 | static int coroutine_fn GRAPH_RDLOCK |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 525 | blk_log_writes_co_do_file_pwritev(BlkLogWritesFileReq *fr) |
| 526 | { |
| 527 | return bdrv_co_pwritev(fr->bs->file, fr->offset, fr->bytes, |
| 528 | fr->qiov, fr->file_flags); |
| 529 | } |
| 530 | |
Kevin Wolf | abaf8b7 | 2023-02-03 16:21:48 +0100 | [diff] [blame] | 531 | static int coroutine_fn GRAPH_RDLOCK |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 532 | blk_log_writes_co_do_file_pwrite_zeroes(BlkLogWritesFileReq *fr) |
| 533 | { |
| 534 | return bdrv_co_pwrite_zeroes(fr->bs->file, fr->offset, fr->bytes, |
| 535 | fr->file_flags); |
| 536 | } |
| 537 | |
Emanuele Giuseppe Esposito | 8809534 | 2023-02-03 16:21:46 +0100 | [diff] [blame] | 538 | static int coroutine_fn GRAPH_RDLOCK |
| 539 | blk_log_writes_co_do_file_flush(BlkLogWritesFileReq *fr) |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 540 | { |
| 541 | return bdrv_co_flush(fr->bs->file->bs); |
| 542 | } |
| 543 | |
Emanuele Giuseppe Esposito | 9a5a1c6 | 2023-02-03 16:21:47 +0100 | [diff] [blame] | 544 | static int coroutine_fn GRAPH_RDLOCK |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 545 | blk_log_writes_co_do_file_pdiscard(BlkLogWritesFileReq *fr) |
| 546 | { |
Fam Zheng | 0b9fd3f | 2018-07-10 14:31:17 +0800 | [diff] [blame] | 547 | return bdrv_co_pdiscard(fr->bs->file, fr->offset, fr->bytes); |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 548 | } |
| 549 | |
Kevin Wolf | b9b10c3 | 2023-02-03 16:21:50 +0100 | [diff] [blame] | 550 | static int coroutine_fn GRAPH_RDLOCK |
Vladimir Sementsov-Ogievskiy | e75abed | 2021-09-03 13:28:00 +0300 | [diff] [blame] | 551 | blk_log_writes_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes, |
| 552 | QEMUIOVector *qiov, BdrvRequestFlags flags) |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 553 | { |
| 554 | return blk_log_writes_co_log(bs, offset, bytes, qiov, flags, |
| 555 | blk_log_writes_co_do_file_pwritev, 0, false); |
| 556 | } |
| 557 | |
Kevin Wolf | abaf8b7 | 2023-02-03 16:21:48 +0100 | [diff] [blame] | 558 | static int coroutine_fn GRAPH_RDLOCK |
Vladimir Sementsov-Ogievskiy | f34b2bc | 2021-09-03 13:28:03 +0300 | [diff] [blame] | 559 | blk_log_writes_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, |
| 560 | int64_t bytes, BdrvRequestFlags flags) |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 561 | { |
| 562 | return blk_log_writes_co_log(bs, offset, bytes, NULL, flags, |
| 563 | blk_log_writes_co_do_file_pwrite_zeroes, 0, |
| 564 | true); |
| 565 | } |
| 566 | |
Emanuele Giuseppe Esposito | 8809534 | 2023-02-03 16:21:46 +0100 | [diff] [blame] | 567 | static int coroutine_fn GRAPH_RDLOCK |
| 568 | blk_log_writes_co_flush_to_disk(BlockDriverState *bs) |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 569 | { |
| 570 | return blk_log_writes_co_log(bs, 0, 0, NULL, 0, |
| 571 | blk_log_writes_co_do_file_flush, |
| 572 | LOG_FLUSH_FLAG, false); |
| 573 | } |
| 574 | |
Emanuele Giuseppe Esposito | 9a5a1c6 | 2023-02-03 16:21:47 +0100 | [diff] [blame] | 575 | static int coroutine_fn GRAPH_RDLOCK |
Vladimir Sementsov-Ogievskiy | 0c80228 | 2021-09-03 13:28:06 +0300 | [diff] [blame] | 576 | blk_log_writes_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes) |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 577 | { |
Vladimir Sementsov-Ogievskiy | 0c80228 | 2021-09-03 13:28:06 +0300 | [diff] [blame] | 578 | return blk_log_writes_co_log(bs, offset, bytes, NULL, 0, |
Aapo Vienamo | bfcc224 | 2018-07-03 17:48:48 +0300 | [diff] [blame] | 579 | blk_log_writes_co_do_file_pdiscard, |
| 580 | LOG_DISCARD_FLAG, false); |
| 581 | } |
| 582 | |
/*
 * NULL-terminated list of option names published through the driver's
 * .strong_runtime_opts field. Note that log-super-update-interval is
 * deliberately or otherwise not part of it.
 * NOTE(review): presumably these are the options that influence the data
 * the node exposes or produces -- confirm against the block layer docs.
 */
static const char *const blk_log_writes_strong_runtime_opts[] = {
    "log-append",
    "log-sector-size",

    NULL
};
| 589 | |
/* Driver description handed to bdrv_register() at module init time. */
static BlockDriver bdrv_blk_log_writes = {
    .format_name = "blklogwrites",
    .instance_size = sizeof(BDRVBlkLogWritesState),

    /* Node life cycle and properties */
    .bdrv_open = blk_log_writes_open,
    .bdrv_close = blk_log_writes_close,
    .bdrv_co_getlength = blk_log_writes_co_getlength,
    .bdrv_child_perm = blk_log_writes_child_perm,
    .bdrv_refresh_limits = blk_log_writes_refresh_limits,

    /* I/O: reads are passed through, everything else is also logged */
    .bdrv_co_preadv = blk_log_writes_co_preadv,
    .bdrv_co_pwritev = blk_log_writes_co_pwritev,
    .bdrv_co_pwrite_zeroes = blk_log_writes_co_pwrite_zeroes,
    .bdrv_co_flush_to_disk = blk_log_writes_co_flush_to_disk,
    .bdrv_co_pdiscard = blk_log_writes_co_pdiscard,

    .is_filter = true,
    .strong_runtime_opts = blk_log_writes_strong_runtime_opts,
};
| 609 | |
/* Module init hook: register the blklogwrites driver with the block layer. */
static void bdrv_blk_log_writes_init(void)
{
    bdrv_register(&bdrv_blk_log_writes);
}

block_init(bdrv_blk_log_writes_init);