block: Implement bdrv_append() without bdrv_swap()

Remember all parent nodes and just change the pointers there instead of
swapping the contents of the BlockDriverState.

Handling of snapshot=on must be moved further down in bdrv_open()
because *pbs (which is the bs pointer in the BlockBackend) must already
be set before bdrv_append() is called. Otherwise bdrv_append() changes
the BB's pointer to the temporary snapshot, but bdrv_open() overwrites
it with the read-only original image.

We also need to be careful to update callers as the interface changes
(becomes less insane): Previously, the meaning of the two parameters was
inverted when bdrv_append() returned. Now any BDS pointers keep pointing
to the same node after the call.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
diff --git a/block.c b/block.c
index 980437f..b4d2313 100644
--- a/block.c
+++ b/block.c
@@ -1516,15 +1516,6 @@
 
     bdrv_refresh_filename(bs);
 
-    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
-     * temporary snapshot afterwards. */
-    if (snapshot_flags) {
-        ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
-        if (local_err) {
-            goto close_and_fail;
-        }
-    }
-
     /* Check if any unknown options were used */
     if (options && (qdict_size(options) != 0)) {
         const QDictEntry *entry = qdict_first(options);
@@ -1556,6 +1547,16 @@
 
     QDECREF(options);
     *pbs = bs;
+
+    /* For snapshot=on, create a temporary qcow2 overlay. The BlockBackend's
+     * pointer (*pbs) refers to the temporary snapshot afterwards. */
+    if (snapshot_flags) {
+        ret = bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err);
+        if (local_err) {
+            goto close_and_fail;
+        }
+    }
+
     return 0;
 
 fail:
@@ -2000,6 +2001,24 @@
 
     bs_dest->enable_write_cache = bs_src->enable_write_cache;
 
+    /* r/w error */
+    bs_dest->on_read_error      = bs_src->on_read_error;
+    bs_dest->on_write_error     = bs_src->on_write_error;
+
+    /* i/o status */
+    bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
+    bs_dest->iostatus           = bs_src->iostatus;
+
+    /* dirty bitmap */
+    bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;
+}
+
+/* Fields that only need to be swapped if the contents of BDSes is swapped
+ * rather than pointers being changed in the parents, and throttling fields
+ * because only bdrv_swap() messes with internals of throttling. */
+static void bdrv_move_reference_fields(BlockDriverState *bs_dest,
+                                       BlockDriverState *bs_src)
+{
     /* i/o throttled req */
     bs_dest->throttle_state     = bs_src->throttle_state,
     bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;
@@ -2014,23 +2033,6 @@
            &bs_src->throttle_timers,
            sizeof(ThrottleTimers));
 
-    /* r/w error */
-    bs_dest->on_read_error      = bs_src->on_read_error;
-    bs_dest->on_write_error     = bs_src->on_write_error;
-
-    /* i/o status */
-    bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
-    bs_dest->iostatus           = bs_src->iostatus;
-
-    /* dirty bitmap */
-    bs_dest->dirty_bitmaps      = bs_src->dirty_bitmaps;
-}
-
-/* Fields that only need to be swapped if the contents of BDSes is swapped
- * rather than pointers being changed in the parents. */
-static void bdrv_move_reference_fields(BlockDriverState *bs_dest,
-                                       BlockDriverState *bs_src)
-{
     /* reference count */
     bs_dest->refcnt             = bs_src->refcnt;
 
@@ -2156,6 +2158,45 @@
     bdrv_rebind(bs_old);
 }
 
+static void change_parent_backing_link(BlockDriverState *from,
+                                       BlockDriverState *to)
+{
+    BdrvChild *c, *next;
+
+    QLIST_FOREACH_SAFE(c, &from->parents, next_parent, next) {
+        assert(c->role != &child_backing);
+        c->bs = to;
+        QLIST_REMOVE(c, next_parent);
+        QLIST_INSERT_HEAD(&to->parents, c, next_parent);
+        bdrv_ref(to);
+        bdrv_unref(from);
+    }
+    if (from->blk) {
+        blk_set_bs(from->blk, to);
+        if (!to->device_list.tqe_prev) {
+            QTAILQ_INSERT_BEFORE(from, to, device_list);
+        }
+        QTAILQ_REMOVE(&bdrv_states, from, device_list);
+    }
+}
+
+static void swap_feature_fields(BlockDriverState *bs_top,
+                                BlockDriverState *bs_new)
+{
+    BlockDriverState tmp;
+
+    bdrv_move_feature_fields(&tmp, bs_top);
+    bdrv_move_feature_fields(bs_top, bs_new);
+    bdrv_move_feature_fields(bs_new, &tmp);
+
+    assert(!bs_new->throttle_state);
+    if (bs_top->throttle_state) {
+        assert(bs_top->io_limits_enabled);
+        bdrv_io_limits_enable(bs_new, throttle_group_get_name(bs_top));
+        bdrv_io_limits_disable(bs_top);
+    }
+}
+
 /*
  * Add new bs contents at the top of an image chain while the chain is
  * live, while keeping required fields on the top layer.
@@ -2166,14 +2207,29 @@
  * bs_new must not be attached to a BlockBackend.
  *
  * This function does not create any image files.
+ *
+ * bdrv_append() takes ownership of a bs_new reference and unrefs it because
+ * that's what the callers commonly need. bs_new will be referenced by the old
+ * parents of bs_top after bdrv_append() returns. If the caller needs to keep a
+ * reference of its own, it must call bdrv_ref().
  */
 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
 {
-    bdrv_swap(bs_new, bs_top);
+    assert(!bdrv_requests_pending(bs_top));
+    assert(!bdrv_requests_pending(bs_new));
 
-    /* The contents of 'tmp' will become bs_top, as we are
-     * swapping bs_new and bs_top contents. */
-    bdrv_set_backing_hd(bs_top, bs_new);
+    bdrv_ref(bs_top);
+    change_parent_backing_link(bs_top, bs_new);
+
+    /* Some fields always stay on top of the backing file chain */
+    swap_feature_fields(bs_top, bs_new);
+
+    bdrv_set_backing_hd(bs_new, bs_top);
+    bdrv_unref(bs_top);
+
+    /* bs_new is now referenced by its new parents, we don't need the
+     * additional reference any more. */
+    bdrv_unref(bs_new);
 }
 
 static void bdrv_delete(BlockDriverState *bs)
diff --git a/blockdev.c b/blockdev.c
index b633212..6c8cce4 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -1546,7 +1546,7 @@
     /* We don't need (or want) to use the transactional
      * bdrv_reopen_multiple() across all the entries at once, because we
      * don't want to abort all of them if one of them fails the reopen */
-    bdrv_reopen(state->new_bs, state->new_bs->open_flags & ~BDRV_O_RDWR,
+    bdrv_reopen(state->old_bs, state->old_bs->open_flags & ~BDRV_O_RDWR,
                 NULL);
 
     aio_context_release(state->aio_context);