| /* |
| * preallocate filter driver |
| * |
 * The driver performs preallocation: it is inserted above some node, and
 * before each write beyond EOF it issues an additional preallocating
 * write-zeroes request to the underlying node.
| * |
| * Copyright (c) 2020 Virtuozzo International GmbH. |
| * |
| * Author: |
| * Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com> |
| * |
| * This program is free software; you can redistribute it and/or modify |
| * it under the terms of the GNU General Public License as published by |
| * the Free Software Foundation; either version 2 of the License, or |
| * (at your option) any later version. |
| * |
| * This program is distributed in the hope that it will be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| * GNU General Public License for more details. |
| * |
| * You should have received a copy of the GNU General Public License |
| * along with this program. If not, see <http://www.gnu.org/licenses/>. |
| */ |
| |
| #include "qemu/osdep.h" |
| |
| #include "qapi/error.h" |
| #include "qemu/module.h" |
| #include "qemu/option.h" |
| #include "qemu/units.h" |
| #include "block/block-io.h" |
| #include "block/block_int.h" |
| |
| |
| typedef struct PreallocateOpts { |
| int64_t prealloc_size; |
| int64_t prealloc_align; |
| } PreallocateOpts; |
| |
| typedef struct BDRVPreallocateState { |
| PreallocateOpts opts; |
| |
| /* |
| * Track real data end, to crop preallocation on close. If < 0 the status is |
| * unknown. |
| * |
| * @data_end is a maximum of file size on open (or when we get write/resize |
| * permissions) and all write request ends after it. So it's safe to |
| * truncate to data_end if it is valid. |
| */ |
| int64_t data_end; |
| |
| /* |
| * Start of trailing preallocated area which reads as zero. May be smaller |
| * than data_end, if user does over-EOF write zero operation. If < 0 the |
| * status is unknown. |
| * |
| * If both @zero_start and @file_end are valid, the region |
| * [@zero_start, @file_end) is known to be preallocated zeroes. If @file_end |
| * is not valid, @zero_start doesn't make much sense. |
| */ |
| int64_t zero_start; |
| |
| /* |
| * Real end of file. Actually the cache for bdrv_getlength(bs->file->bs), |
| * to avoid extra lseek() calls on each write operation. If < 0 the status |
| * is unknown. |
| */ |
| int64_t file_end; |
| |
| /* |
| * All three states @data_end, @zero_start and @file_end are guaranteed to |
| * be invalid (< 0) when we don't have both exclusive BLK_PERM_RESIZE and |
| * BLK_PERM_WRITE permissions on file child. |
| */ |
| |
| /* Gives up the resize permission on children when parents don't need it */ |
| QEMUBH *drop_resize_bh; |
| } BDRVPreallocateState; |
| |
| static int preallocate_drop_resize(BlockDriverState *bs, Error **errp); |
| static void preallocate_drop_resize_bh(void *opaque); |
| |
| #define PREALLOCATE_OPT_PREALLOC_ALIGN "prealloc-align" |
| #define PREALLOCATE_OPT_PREALLOC_SIZE "prealloc-size" |
| static QemuOptsList runtime_opts = { |
| .name = "preallocate", |
| .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), |
| .desc = { |
| { |
| .name = PREALLOCATE_OPT_PREALLOC_ALIGN, |
| .type = QEMU_OPT_SIZE, |
| .help = "on preallocation, align file length to this number, " |
| "default 1M", |
| }, |
| { |
| .name = PREALLOCATE_OPT_PREALLOC_SIZE, |
| .type = QEMU_OPT_SIZE, |
| .help = "how much to preallocate, default 128M", |
| }, |
| { /* end of list */ } |
| }, |
| }; |
| |
| static bool preallocate_absorb_opts(PreallocateOpts *dest, QDict *options, |
| BlockDriverState *child_bs, Error **errp) |
| { |
| QemuOpts *opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); |
| |
    if (!qemu_opts_absorb_qdict(opts, options, errp)) {
        qemu_opts_del(opts);
        return false;
    }
| |
| dest->prealloc_align = |
| qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_ALIGN, 1 * MiB); |
| dest->prealloc_size = |
| qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_SIZE, 128 * MiB); |
| |
| qemu_opts_del(opts); |
| |
| if (!QEMU_IS_ALIGNED(dest->prealloc_align, BDRV_SECTOR_SIZE)) { |
| error_setg(errp, "prealloc-align parameter of preallocate filter " |
| "is not aligned to %llu", BDRV_SECTOR_SIZE); |
| return false; |
| } |
| |
| if (!QEMU_IS_ALIGNED(dest->prealloc_align, |
| child_bs->bl.request_alignment)) { |
| error_setg(errp, "prealloc-align parameter of preallocate filter " |
| "is not aligned to underlying node request alignment " |
| "(%" PRIi32 ")", child_bs->bl.request_alignment); |
| return false; |
| } |
| |
| return true; |
| } |
| |
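/*
 * Open the filter: mark the cached state invalid (it is initialized on the
 * next permission update), create the drop-resize BH, open the file child
 * and absorb the prealloc-align / prealloc-size options.
 */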
| static int preallocate_open(BlockDriverState *bs, QDict *options, int flags, |
| Error **errp) |
| { |
| BDRVPreallocateState *s = bs->opaque; |
| int ret; |
| |
| GLOBAL_STATE_CODE(); |
| |
| /* |
| * s->data_end and friends should be initialized on permission update. |
| * For this to work, mark them invalid. |
| */ |
| s->file_end = s->zero_start = s->data_end = -EINVAL; |
| s->drop_resize_bh = qemu_bh_new(preallocate_drop_resize_bh, bs); |
| |
| ret = bdrv_open_file_child(NULL, options, "file", bs, errp); |
| if (ret < 0) { |
| return ret; |
| } |
| |
| GRAPH_RDLOCK_GUARD_MAINLOOP(); |
| |
| if (!preallocate_absorb_opts(&s->opts, options, bs->file->bs, errp)) { |
| return -EINVAL; |
| } |
| |
| bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED | |
| (BDRV_REQ_FUA & bs->file->bs->supported_write_flags); |
| |
| bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED | |
| ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) & |
| bs->file->bs->supported_zero_flags); |
| |
| return 0; |
| } |
| |
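/*
 * Truncate the file child down to s->data_end, dropping the trailing
 * preallocated area. On success the cached s->file_end is updated to match.
 */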
| static int GRAPH_RDLOCK |
| preallocate_truncate_to_real_size(BlockDriverState *bs, Error **errp) |
| { |
| BDRVPreallocateState *s = bs->opaque; |
| int ret; |
| |
| if (s->file_end < 0) { |
| s->file_end = bdrv_getlength(bs->file->bs); |
| if (s->file_end < 0) { |
| error_setg_errno(errp, -s->file_end, "Failed to get file length"); |
| return s->file_end; |
| } |
| } |
| |
| if (s->data_end < s->file_end) { |
| ret = bdrv_truncate(bs->file, s->data_end, true, PREALLOC_MODE_OFF, 0, |
| NULL); |
| if (ret < 0) { |
| error_setg_errno(errp, -ret, "Failed to drop preallocation"); |
| s->file_end = ret; |
| return ret; |
| } |
| s->file_end = s->data_end; |
| } |
| |
| return 0; |
| } |
| |
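/*
 * On close, cancel and delete the drop-resize BH and, if the cached state is
 * valid, truncate away the preallocated tail.
 */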
| static void preallocate_close(BlockDriverState *bs) |
| { |
| BDRVPreallocateState *s = bs->opaque; |
| |
| GLOBAL_STATE_CODE(); |
| GRAPH_RDLOCK_GUARD_MAINLOOP(); |
| |
| qemu_bh_cancel(s->drop_resize_bh); |
| qemu_bh_delete(s->drop_resize_bh); |
| |
| if (s->data_end >= 0) { |
| preallocate_truncate_to_real_size(bs, NULL); |
| } |
| } |
| |
| |
| /* |
| * Handle reopen. |
| * |
| * We must implement reopen handlers, otherwise reopen just don't work. Handle |
| * new options and don't care about preallocation state, as it is handled in |
| * set/check permission handlers. |
| */ |
| |
| static int preallocate_reopen_prepare(BDRVReopenState *reopen_state, |
| BlockReopenQueue *queue, Error **errp) |
| { |
| PreallocateOpts *opts = g_new0(PreallocateOpts, 1); |
| int ret; |
| |
| GLOBAL_STATE_CODE(); |
| GRAPH_RDLOCK_GUARD_MAINLOOP(); |
| |
| if (!preallocate_absorb_opts(opts, reopen_state->options, |
| reopen_state->bs->file->bs, errp)) { |
| g_free(opts); |
| return -EINVAL; |
| } |
| |
| /* |
| * Drop the preallocation already here if reopening read-only. The child |
| * might also be reopened read-only and then scheduling a BH during the |
| * permission update is too late. |
| */ |
| if ((reopen_state->flags & BDRV_O_RDWR) == 0) { |
| ret = preallocate_drop_resize(reopen_state->bs, errp); |
| if (ret < 0) { |
| g_free(opts); |
| return ret; |
| } |
| } |
| |
| reopen_state->opaque = opts; |
| |
| return 0; |
| } |
| |
| static void preallocate_reopen_commit(BDRVReopenState *state) |
| { |
| BDRVPreallocateState *s = state->bs->opaque; |
| |
| s->opts = *(PreallocateOpts *)state->opaque; |
| |
| g_free(state->opaque); |
| state->opaque = NULL; |
| } |
| |
| static void preallocate_reopen_abort(BDRVReopenState *state) |
| { |
| g_free(state->opaque); |
| state->opaque = NULL; |
| } |
| |
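/* Reads are simply forwarded to the file child. */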
| static int coroutine_fn GRAPH_RDLOCK |
| preallocate_co_preadv_part(BlockDriverState *bs, int64_t offset, int64_t bytes, |
| QEMUIOVector *qiov, size_t qiov_offset, |
| BdrvRequestFlags flags) |
| { |
| return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset, |
| flags); |
| } |
| |
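/* Discards are forwarded to the file child; they don't change file length. */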
| static int coroutine_fn GRAPH_RDLOCK |
| preallocate_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes) |
| { |
| return bdrv_co_pdiscard(bs->file, offset, bytes); |
| } |
| |
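/* Check whether @perm contains both the write and the resize permission. */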
| static bool can_write_resize(uint64_t perm) |
| { |
| return (perm & BLK_PERM_WRITE) && (perm & BLK_PERM_RESIZE); |
| } |
| |
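/*
 * Whether we hold exclusive write and resize permissions on the file child.
 * If we don't, all cached state must already be invalid.
 */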
| static bool GRAPH_RDLOCK has_prealloc_perms(BlockDriverState *bs) |
| { |
| BDRVPreallocateState *s = bs->opaque; |
| |
| if (can_write_resize(bs->file->perm)) { |
| assert(!(bs->file->shared_perm & BLK_PERM_WRITE)); |
| assert(!(bs->file->shared_perm & BLK_PERM_RESIZE)); |
| return true; |
| } |
| |
| assert(s->data_end < 0); |
| assert(s->zero_start < 0); |
| assert(s->file_end < 0); |
| return false; |
| } |
| |
| /* |
| * Call on each write. Returns true if @want_merge_zero is true and the region |
| * [offset, offset + bytes) is zeroed (as a result of this call or earlier |
| * preallocation). |
| * |
| * want_merge_zero is used to merge write-zero request with preallocation in |
| * one bdrv_co_pwrite_zeroes() call. |
| */ |
| static bool coroutine_fn GRAPH_RDLOCK |
| handle_write(BlockDriverState *bs, int64_t offset, int64_t bytes, |
| bool want_merge_zero) |
| { |
| BDRVPreallocateState *s = bs->opaque; |
| int64_t end = offset + bytes; |
| int64_t prealloc_start, prealloc_end; |
| int ret; |
| uint32_t file_align = bs->file->bs->bl.request_alignment; |
| uint32_t prealloc_align = MAX(s->opts.prealloc_align, file_align); |
| |
| assert(QEMU_IS_ALIGNED(prealloc_align, file_align)); |
| |
| if (!has_prealloc_perms(bs)) { |
        /* We don't have the state, nor should we try to recover it */
| return false; |
| } |
| |
| if (s->data_end < 0) { |
| s->data_end = bdrv_co_getlength(bs->file->bs); |
| if (s->data_end < 0) { |
| return false; |
| } |
| |
| if (s->file_end < 0) { |
| s->file_end = s->data_end; |
| } |
| } |
| |
| if (end <= s->data_end) { |
| return false; |
| } |
| |
    /* We have a valid s->data_end, and the request writes beyond it. */
| |
| s->data_end = end; |
| if (s->zero_start < 0 || !want_merge_zero) { |
| s->zero_start = end; |
| } |
| |
| if (s->file_end < 0) { |
| s->file_end = bdrv_co_getlength(bs->file->bs); |
| if (s->file_end < 0) { |
| return false; |
| } |
| } |
| |
| /* Now s->data_end, s->zero_start and s->file_end are valid. */ |
| |
| if (end <= s->file_end) { |
| /* No preallocation needed. */ |
| return want_merge_zero && offset >= s->zero_start; |
| } |
| |
    /* Now we want new preallocation: the request writes beyond s->file_end. */
| |
| prealloc_start = QEMU_ALIGN_UP( |
| want_merge_zero ? MIN(offset, s->file_end) : s->file_end, |
| file_align); |
| prealloc_end = QEMU_ALIGN_UP( |
| MAX(prealloc_start, end) + s->opts.prealloc_size, |
| prealloc_align); |
| |
| want_merge_zero = want_merge_zero && (prealloc_start <= offset); |
| |
| ret = bdrv_co_pwrite_zeroes( |
| bs->file, prealloc_start, prealloc_end - prealloc_start, |
| BDRV_REQ_NO_FALLBACK | BDRV_REQ_SERIALISING | BDRV_REQ_NO_WAIT); |
| if (ret < 0) { |
| s->file_end = ret; |
| return false; |
| } |
| |
| s->file_end = prealloc_end; |
| return want_merge_zero; |
| } |
| |
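/*
 * Write-zeroes: if handle_write() has already zeroed the region as part of
 * the preallocation, the request is complete; otherwise forward it to the
 * file child.
 */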
| static int coroutine_fn GRAPH_RDLOCK |
| preallocate_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, |
| int64_t bytes, BdrvRequestFlags flags) |
| { |
| bool want_merge_zero = |
| !(flags & ~(BDRV_REQ_ZERO_WRITE | BDRV_REQ_NO_FALLBACK)); |
| if (handle_write(bs, offset, bytes, want_merge_zero)) { |
| return 0; |
| } |
| |
| return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags); |
| } |
| |
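/*
 * Writes: preallocate beyond EOF if needed, then forward the request to the
 * file child.
 */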
| static int coroutine_fn GRAPH_RDLOCK |
| preallocate_co_pwritev_part(BlockDriverState *bs, int64_t offset, int64_t bytes, |
| QEMUIOVector *qiov, size_t qiov_offset, |
| BdrvRequestFlags flags) |
| { |
| handle_write(bs, offset, bytes, false); |
| |
| return bdrv_co_pwritev_part(bs->file, offset, bytes, qiov, qiov_offset, |
| flags); |
| } |
| |
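/*
 * Truncate: reuse or drop our preallocation as appropriate for the requested
 * preallocation mode, then forward the truncate to the file child and update
 * the cached state.
 */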
| static int coroutine_fn GRAPH_RDLOCK |
| preallocate_co_truncate(BlockDriverState *bs, int64_t offset, |
| bool exact, PreallocMode prealloc, |
| BdrvRequestFlags flags, Error **errp) |
| { |
| ERRP_GUARD(); |
| BDRVPreallocateState *s = bs->opaque; |
| int ret; |
| |
| if (s->data_end >= 0 && offset > s->data_end) { |
| if (s->file_end < 0) { |
| s->file_end = bdrv_co_getlength(bs->file->bs); |
| if (s->file_end < 0) { |
| error_setg(errp, "failed to get file length"); |
| return s->file_end; |
| } |
| } |
| |
| if (prealloc == PREALLOC_MODE_FALLOC) { |
| /* |
| * If offset <= s->file_end, the task is already done, just |
| * update s->data_end, to move part of "filter preallocation" |
| * to "preallocation requested by user". |
| * Otherwise just proceed to preallocate missing part. |
| */ |
| if (offset <= s->file_end) { |
| s->data_end = offset; |
| return 0; |
| } |
| } else { |
| /* |
| * We have to drop our preallocation, to |
| * - avoid "Cannot use preallocation for shrinking files" in |
| * case of offset < file_end |
| * - give PREALLOC_MODE_OFF a chance to keep small disk |
| * usage |
| * - give PREALLOC_MODE_FULL a chance to actually write the |
| * whole region as user expects |
| */ |
| if (s->file_end > s->data_end) { |
| ret = bdrv_co_truncate(bs->file, s->data_end, true, |
| PREALLOC_MODE_OFF, 0, errp); |
| if (ret < 0) { |
| s->file_end = ret; |
| error_prepend(errp, "preallocate-filter: failed to drop " |
| "write-zero preallocation: "); |
| return ret; |
| } |
| s->file_end = s->data_end; |
| } |
| } |
| |
| s->data_end = offset; |
| } |
| |
| ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, errp); |
| if (ret < 0) { |
| s->file_end = s->zero_start = s->data_end = ret; |
| return ret; |
| } |
| |
| if (has_prealloc_perms(bs)) { |
| s->file_end = s->zero_start = s->data_end = offset; |
| } |
| return 0; |
| } |
| |
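/* Flushes are simply forwarded to the file child. */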
| static int coroutine_fn GRAPH_RDLOCK preallocate_co_flush(BlockDriverState *bs) |
| { |
| return bdrv_co_flush(bs->file->bs); |
| } |
| |
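/*
 * Report the guest-visible length: the cached s->data_end if valid, not the
 * (possibly larger) preallocated file length.
 */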
| static int64_t coroutine_fn GRAPH_RDLOCK |
| preallocate_co_getlength(BlockDriverState *bs) |
| { |
| int64_t ret; |
| BDRVPreallocateState *s = bs->opaque; |
| |
| if (s->data_end >= 0) { |
| return s->data_end; |
| } |
| |
| ret = bdrv_co_getlength(bs->file->bs); |
| |
| if (has_prealloc_perms(bs)) { |
| s->file_end = s->zero_start = s->data_end = ret; |
| } |
| |
| return ret; |
| } |
| |
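/*
 * Give up the exclusive write/resize permissions on the file child: truncate
 * away the preallocated tail, invalidate the cached state and refresh the
 * child permissions.
 */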
| static int GRAPH_RDLOCK |
| preallocate_drop_resize(BlockDriverState *bs, Error **errp) |
| { |
| BDRVPreallocateState *s = bs->opaque; |
| int ret; |
| |
| if (s->data_end < 0) { |
| return 0; |
| } |
| |
| /* |
| * Before switching children to be read-only, truncate them to remove |
| * the preallocation and let them have the real size. |
| */ |
| ret = preallocate_truncate_to_real_size(bs, errp); |
| if (ret < 0) { |
| return ret; |
| } |
| |
| /* |
| * We'll drop our permissions and will allow other users to take write and |
| * resize permissions (see preallocate_child_perm). Anyone will be able to |
| * change the child, so mark all states invalid. We'll regain control if a |
| * parent requests write access again. |
| */ |
| s->data_end = s->file_end = s->zero_start = -EINVAL; |
| |
| bdrv_child_refresh_perms(bs, bs->file, NULL); |
| |
| return 0; |
| } |
| |
| static void preallocate_drop_resize_bh(void *opaque) |
| { |
| GLOBAL_STATE_CODE(); |
| GRAPH_RDLOCK_GUARD_MAINLOOP(); |
| |
| /* |
| * In case of errors, we'll simply keep the exclusive lock on the image |
| * indefinitely. |
| */ |
| preallocate_drop_resize(opaque, NULL); |
| } |
| |
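/*
 * On permission update: when we get write/resize permissions, initialize the
 * cached state from the current file size; when we lose them, schedule the
 * BH that drops our exclusive permissions on the file child.
 */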
| static void GRAPH_RDLOCK |
| preallocate_set_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared) |
| { |
| BDRVPreallocateState *s = bs->opaque; |
| |
| if (can_write_resize(perm)) { |
| qemu_bh_cancel(s->drop_resize_bh); |
| if (s->data_end < 0) { |
| s->data_end = s->file_end = s->zero_start = |
| bs->file->bs->total_sectors * BDRV_SECTOR_SIZE; |
| } |
| } else { |
| qemu_bh_schedule(s->drop_resize_bh); |
| } |
| } |
| |
| static void preallocate_child_perm(BlockDriverState *bs, BdrvChild *c, |
| BdrvChildRole role, BlockReopenQueue *reopen_queue, |
| uint64_t perm, uint64_t shared, uint64_t *nperm, uint64_t *nshared) |
| { |
| BDRVPreallocateState *s = bs->opaque; |
| |
| bdrv_default_perms(bs, c, role, reopen_queue, perm, shared, nperm, nshared); |
| |
| /* |
| * We need exclusive write and resize permissions on the child not only when |
| * the parent can write to it, but also after the parent gave up write |
| * permissions until preallocate_drop_resize() has completed. |
| */ |
| if (can_write_resize(perm) || s->data_end != -EINVAL) { |
| *nperm |= BLK_PERM_WRITE | BLK_PERM_RESIZE; |
| |
| /* |
| * Don't share, to keep our states s->file_end, s->data_end and |
| * s->zero_start valid. |
| */ |
| *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE); |
| } |
| } |
| |
| static BlockDriver bdrv_preallocate_filter = { |
| .format_name = "preallocate", |
| .instance_size = sizeof(BDRVPreallocateState), |
| |
| .bdrv_co_getlength = preallocate_co_getlength, |
| .bdrv_open = preallocate_open, |
| .bdrv_close = preallocate_close, |
| |
| .bdrv_reopen_prepare = preallocate_reopen_prepare, |
| .bdrv_reopen_commit = preallocate_reopen_commit, |
| .bdrv_reopen_abort = preallocate_reopen_abort, |
| |
| .bdrv_co_preadv_part = preallocate_co_preadv_part, |
| .bdrv_co_pwritev_part = preallocate_co_pwritev_part, |
| .bdrv_co_pwrite_zeroes = preallocate_co_pwrite_zeroes, |
| .bdrv_co_pdiscard = preallocate_co_pdiscard, |
| .bdrv_co_flush = preallocate_co_flush, |
| .bdrv_co_truncate = preallocate_co_truncate, |
| |
| .bdrv_set_perm = preallocate_set_perm, |
| .bdrv_child_perm = preallocate_child_perm, |
| |
| .is_filter = true, |
| }; |
| |
| static void bdrv_preallocate_init(void) |
| { |
| bdrv_register(&bdrv_preallocate_filter); |
| } |
| |
| block_init(bdrv_preallocate_init); |