Fix a race condition and non-leaf imagesgrowing in VMDK chains, by Igor
Lvovsky.


git-svn-id: svn://svn.savannah.nongnu.org/qemu/trunk@2987 c046a42c-6fe2-441c-8c8c-71466251a162
diff --git a/block-vmdk.c b/block-vmdk.c
index 28df8ae..e595eab 100644
--- a/block-vmdk.c
+++ b/block-vmdk.c
@@ -75,8 +75,25 @@
 
     unsigned int cluster_sectors;
     uint32_t parent_cid;
+    int is_parent;
 } BDRVVmdkState;
 
+typedef struct VmdkMetaData {
+    uint32_t offset;
+    unsigned int l1_index;
+    unsigned int l2_index;
+    unsigned int l2_offset;
+    int valid;
+} VmdkMetaData;
+
+typedef struct ActiveBDRVState{
+    BlockDriverState *hd;            // active image handler
+    uint64_t cluster_offset;         // current write offset
+}ActiveBDRVState;
+
+static ActiveBDRVState activeBDRV;
+
+
 static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename)
 {
     uint32_t magic;
@@ -305,7 +322,7 @@
         bdrv_close(bs->backing_hd);
 }
 
-
+int parent_open = 0;
 static int vmdk_parent_open(BlockDriverState *bs, const char * filename)
 {
     BDRVVmdkState *s = bs->opaque;
@@ -339,8 +356,10 @@
             bdrv_close(s->hd);
             return -1;
         }
-        if (bdrv_open(s->hd->backing_hd, parent_img_name, 0) < 0)
+        parent_open = 1;
+        if (bdrv_open(s->hd->backing_hd, parent_img_name, BDRV_O_RDONLY) < 0)
             goto failure;
+        parent_open = 0;
     }
 
     return 0;
@@ -352,6 +371,11 @@
     uint32_t magic;
     int l1_size, i, ret;
 
+    if (parent_open)
+        // Parent must be opened as RO.
+        flags = BDRV_O_RDONLY;
+    fprintf(stderr, "(VMDK) image open: flags=0x%x filename=%s\n", flags, bs->filename);
+
     ret = bdrv_file_open(&s->hd, filename, flags);
     if (ret < 0)
         return ret;
@@ -387,6 +411,11 @@
         s->l1_table_offset = le64_to_cpu(header.rgd_offset) << 9;
         s->l1_backup_table_offset = le64_to_cpu(header.gd_offset) << 9;
 
+        if (parent_open)
+            s->is_parent = 1;
+        else
+            s->is_parent = 0;
+
         // try to open parent images, if exist
         if (vmdk_parent_open(bs, filename) != 0)
             goto fail;
@@ -430,7 +459,8 @@
     return -1;
 }
 
-static uint64_t get_cluster_offset(BlockDriverState *bs, uint64_t offset, int allocate);
+static uint64_t get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data,
+                                   uint64_t offset, int allocate);
 
 static int get_whole_cluster(BlockDriverState *bs, uint64_t cluster_offset,
                              uint64_t offset, int allocate)
@@ -446,27 +476,55 @@
 
         if (!vmdk_is_cid_valid(bs))
             return -1;
-        parent_cluster_offset = get_cluster_offset(s->hd->backing_hd, offset, allocate);
-        if (bdrv_pread(ps->hd, parent_cluster_offset, whole_grain, ps->cluster_sectors*512) != 
-                                                                            ps->cluster_sectors*512)
-            return -1;
 
-        if (bdrv_pwrite(s->hd, cluster_offset << 9, whole_grain, sizeof(whole_grain)) != 
-                                                                            sizeof(whole_grain))
-            return -1;
+        parent_cluster_offset = get_cluster_offset(s->hd->backing_hd, NULL, offset, allocate);
+
+        if (parent_cluster_offset) {
+            BDRVVmdkState *act_s = activeBDRV.hd->opaque;
+
+            if (bdrv_pread(ps->hd, parent_cluster_offset, whole_grain, ps->cluster_sectors*512) != ps->cluster_sectors*512)
+                return -1;
+
+            //Write grain only into the active image
+            if (bdrv_pwrite(act_s->hd, activeBDRV.cluster_offset << 9, whole_grain, sizeof(whole_grain)) != sizeof(whole_grain))
+                return -1;
+        }
     }
     return 0;
 }
 
-static uint64_t get_cluster_offset(BlockDriverState *bs,
+static int vmdk_L2update(BlockDriverState *bs, VmdkMetaData *m_data)
+{
+    BDRVVmdkState *s = bs->opaque;
+
+    /* update L2 table */
+    if (bdrv_pwrite(s->hd, ((int64_t)m_data->l2_offset * 512) + (m_data->l2_index * sizeof(m_data->offset)),
+                    &(m_data->offset), sizeof(m_data->offset)) != sizeof(m_data->offset))
+        return -1;
+    /* update backup L2 table */
+    if (s->l1_backup_table_offset != 0) {
+        m_data->l2_offset = s->l1_backup_table[m_data->l1_index];
+        if (bdrv_pwrite(s->hd, ((int64_t)m_data->l2_offset * 512) + (m_data->l2_index * sizeof(m_data->offset)),
+                        &(m_data->offset), sizeof(m_data->offset)) != sizeof(m_data->offset))
+            return -1;
+    }
+
+    return 0;
+}
+
+static uint64_t get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data,
                                    uint64_t offset, int allocate)
 {
     BDRVVmdkState *s = bs->opaque;
     unsigned int l1_index, l2_offset, l2_index;
     int min_index, i, j;
-    uint32_t min_count, *l2_table, tmp;
+    uint32_t min_count, *l2_table, tmp = 0;
     uint64_t cluster_offset;
-    
+    int status;
+
+    if (m_data)
+        m_data->valid = 0;
+
     l1_index = (offset >> 9) / s->l1_entry_sectors;
     if (l1_index >= s->l1_size)
         return 0;
@@ -504,32 +562,45 @@
  found:
     l2_index = ((offset >> 9) / s->cluster_sectors) % s->l2_size;
     cluster_offset = le32_to_cpu(l2_table[l2_index]);
+
     if (!cluster_offset) {
         struct stat file_buf;
 
         if (!allocate)
             return 0;
-        stat(s->hd->filename, &file_buf);
-        cluster_offset = file_buf.st_size;
-        bdrv_truncate(s->hd, cluster_offset + (s->cluster_sectors << 9));
-
-        cluster_offset >>= 9;
-        /* update L2 table */
-        tmp = cpu_to_le32(cluster_offset);
-        l2_table[l2_index] = tmp;
-        if (bdrv_pwrite(s->hd, ((int64_t)l2_offset * 512) + (l2_index * sizeof(tmp)), 
-                        &tmp, sizeof(tmp)) != sizeof(tmp))
-            return 0;
-        /* update backup L2 table */
-        if (s->l1_backup_table_offset != 0) {
-            l2_offset = s->l1_backup_table[l1_index];
-            if (bdrv_pwrite(s->hd, ((int64_t)l2_offset * 512) + (l2_index * sizeof(tmp)), 
-                            &tmp, sizeof(tmp)) != sizeof(tmp))
+        // Avoid the L2 tables update for the images that have snapshots.
+        if (!s->is_parent) {
+            status = stat(s->hd->filename, &file_buf);
+            if (status == -1) {
+                fprintf(stderr, "(VMDK) Fail file stat: filename =%s size=0x%lx errno=%s\n",
+                                s->hd->filename, (uint64_t)file_buf.st_size, strerror(errno));
                 return 0;
-        }
+            }
+            cluster_offset = file_buf.st_size;
+            bdrv_truncate(s->hd, cluster_offset + (s->cluster_sectors << 9));
 
+            cluster_offset >>= 9;
+            tmp = cpu_to_le32(cluster_offset);
+            l2_table[l2_index] = tmp;
+            // Save the active image state
+            activeBDRV.cluster_offset = cluster_offset;
+            activeBDRV.hd = bs;
+        }
+        /* First of all we write grain itself, to avoid race condition
+         * that may to corrupt the image.
+         * This problem may occur because of insufficient space on host disk
+         * or inappropriate VM shutdown.
+         */
         if (get_whole_cluster(bs, cluster_offset, offset, allocate) == -1)
             return 0;
+
+        if (m_data) {
+            m_data->offset = tmp;
+            m_data->l1_index = l1_index;
+            m_data->l2_index = l2_index;
+            m_data->l2_offset = l2_offset;
+            m_data->valid = 1;
+        }
     }
     cluster_offset <<= 9;
     return cluster_offset;
@@ -542,7 +613,7 @@
     int index_in_cluster, n;
     uint64_t cluster_offset;
 
-    cluster_offset = get_cluster_offset(bs, sector_num << 9, 0);
+    cluster_offset = get_cluster_offset(bs, NULL, sector_num << 9, 0);
     index_in_cluster = sector_num % s->cluster_sectors;
     n = s->cluster_sectors - index_in_cluster;
     if (n > nb_sectors)
@@ -559,7 +630,7 @@
     uint64_t cluster_offset;
 
     while (nb_sectors > 0) {
-        cluster_offset = get_cluster_offset(bs, sector_num << 9, 0);
+        cluster_offset = get_cluster_offset(bs, NULL, sector_num << 9, 0);
         index_in_cluster = sector_num % s->cluster_sectors;
         n = s->cluster_sectors - index_in_cluster;
         if (n > nb_sectors)
@@ -590,20 +661,34 @@
                      const uint8_t *buf, int nb_sectors)
 {
     BDRVVmdkState *s = bs->opaque;
+    VmdkMetaData m_data;
     int index_in_cluster, n;
     uint64_t cluster_offset;
     static int cid_update = 0;
 
+    if (sector_num > bs->total_sectors) {
+        fprintf(stderr,
+                "(VMDK) Wrong offset: sector_num=0x%lx total_sectors=0x%lx\n",
+                sector_num, bs->total_sectors);
+        return -1;
+    }
+
     while (nb_sectors > 0) {
         index_in_cluster = sector_num & (s->cluster_sectors - 1);
         n = s->cluster_sectors - index_in_cluster;
         if (n > nb_sectors)
             n = nb_sectors;
-        cluster_offset = get_cluster_offset(bs, sector_num << 9, 1);
+        cluster_offset = get_cluster_offset(bs, &m_data, sector_num << 9, 1);
         if (!cluster_offset)
             return -1;
+
         if (bdrv_pwrite(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512) != n * 512)
             return -1;
+        if (m_data.valid) {
+            /* update L2 tables */
+            if (vmdk_L2update(bs, &m_data) == -1)
+                return -1;
+        }
         nb_sectors -= n;
         sector_num += n;
         buf += n * 512;