From 0d60175bd684cf6a14f750579d82dbd1ba97fcbc Mon Sep 17 00:00:00 2001
From: Anand Avati
Date: Wed, 6 Mar 2013 01:11:59 -0800
Subject: contrib/qemu: Import qemu block source code

This qemu block format source code and its minimal dependency files
will be used in the next patch to implement a qemu-block format
translator.

Change-Id: Ic87638972f7ea9b3df84d7a0539512a250c11c1c
BUG: 986775
Signed-off-by: Anand Avati
Reviewed-on: http://review.gluster.org/5366
Tested-by: Gluster Build System
Reviewed-by: Brian Foster
---
 contrib/qemu/block/qcow.c           |  914 ++++++++++++++++++
 contrib/qemu/block/qcow2-cache.c    |  323 +++++++
 contrib/qemu/block/qcow2-cluster.c  | 1478 ++++++++++++++++++++++++++++
 contrib/qemu/block/qcow2-refcount.c | 1374 ++++++++++++++++++++++++++
 contrib/qemu/block/qcow2-snapshot.c |  660 +++++++++++++
 contrib/qemu/block/qcow2.c          | 1825 +++++++++++++++++++++++++++++++++++
 contrib/qemu/block/qcow2.h          |  437 +++++++++
 contrib/qemu/block/qed-check.c      |  248 +++++
 contrib/qemu/block/qed-cluster.c    |  165 ++++
 contrib/qemu/block/qed-gencb.c      |   32 +
 contrib/qemu/block/qed-l2-cache.c   |  187 ++++
 contrib/qemu/block/qed-table.c      |  296 ++++++
 contrib/qemu/block/qed.c            | 1596 ++++++++++++++++++++++++++++++
 contrib/qemu/block/qed.h            |  344 +++++++
 contrib/qemu/block/snapshot.c       |  157 +++
 15 files changed, 10036 insertions(+)
 create mode 100644 contrib/qemu/block/qcow.c
 create mode 100644 contrib/qemu/block/qcow2-cache.c
 create mode 100644 contrib/qemu/block/qcow2-cluster.c
 create mode 100644 contrib/qemu/block/qcow2-refcount.c
 create mode 100644 contrib/qemu/block/qcow2-snapshot.c
 create mode 100644 contrib/qemu/block/qcow2.c
 create mode 100644 contrib/qemu/block/qcow2.h
 create mode 100644 contrib/qemu/block/qed-check.c
 create mode 100644 contrib/qemu/block/qed-cluster.c
 create mode 100644 contrib/qemu/block/qed-gencb.c
 create mode 100644 contrib/qemu/block/qed-l2-cache.c
 create mode 100644 contrib/qemu/block/qed-table.c
 create mode 100644 contrib/qemu/block/qed.c
 create mode 100644 contrib/qemu/block/qed.h
 create mode 100644 contrib/qemu/block/snapshot.c

diff --git a/contrib/qemu/block/qcow.c b/contrib/qemu/block/qcow.c
new file mode 100644
index 00000000000..5239bd68f1c
--- /dev/null
+++ b/contrib/qemu/block/qcow.c
@@ -0,0 +1,914 @@
+/*
+ * Block driver for the QCOW format
+ *
+ * Copyright (c) 2004-2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */ +#include "qemu-common.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include +#include "qemu/aes.h" +#include "migration/migration.h" + +/**************************************************************/ +/* QEMU COW block driver with compression and encryption support */ + +#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb) +#define QCOW_VERSION 1 + +#define QCOW_CRYPT_NONE 0 +#define QCOW_CRYPT_AES 1 + +#define QCOW_OFLAG_COMPRESSED (1LL << 63) + +typedef struct QCowHeader { + uint32_t magic; + uint32_t version; + uint64_t backing_file_offset; + uint32_t backing_file_size; + uint32_t mtime; + uint64_t size; /* in bytes */ + uint8_t cluster_bits; + uint8_t l2_bits; + uint32_t crypt_method; + uint64_t l1_table_offset; +} QCowHeader; + +#define L2_CACHE_SIZE 16 + +typedef struct BDRVQcowState { + int cluster_bits; + int cluster_size; + int cluster_sectors; + int l2_bits; + int l2_size; + int l1_size; + uint64_t cluster_offset_mask; + uint64_t l1_table_offset; + uint64_t *l1_table; + uint64_t *l2_cache; + uint64_t l2_cache_offsets[L2_CACHE_SIZE]; + uint32_t l2_cache_counts[L2_CACHE_SIZE]; + uint8_t *cluster_cache; + uint8_t *cluster_data; + uint64_t cluster_cache_offset; + uint32_t crypt_method; /* current crypt method, 0 if no key yet */ + uint32_t crypt_method_header; + AES_KEY aes_encrypt_key; + AES_KEY aes_decrypt_key; + CoMutex lock; + Error *migration_blocker; +} BDRVQcowState; + +static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset); + +static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename) +{ + const QCowHeader *cow_header = (const void *)buf; + + if (buf_size >= sizeof(QCowHeader) && + be32_to_cpu(cow_header->magic) == QCOW_MAGIC && + be32_to_cpu(cow_header->version) == QCOW_VERSION) + return 100; + else + return 0; +} + +static int qcow_open(BlockDriverState *bs, QDict *options, int flags) +{ + BDRVQcowState *s = bs->opaque; + int len, i, shift, ret; + QCowHeader header; + + ret = bdrv_pread(bs->file, 0, &header, sizeof(header)); + if (ret < 0) { + goto fail; + } + be32_to_cpus(&header.magic); + be32_to_cpus(&header.version); + be64_to_cpus(&header.backing_file_offset); + be32_to_cpus(&header.backing_file_size); + be32_to_cpus(&header.mtime); + be64_to_cpus(&header.size); + be32_to_cpus(&header.crypt_method); + be64_to_cpus(&header.l1_table_offset); + + if (header.magic != QCOW_MAGIC) { + ret = -EMEDIUMTYPE; + goto fail; + } + if (header.version != QCOW_VERSION) { + char version[64]; + snprintf(version, sizeof(version), "QCOW version %d", header.version); + qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, + bs->device_name, "qcow", version); + ret = -ENOTSUP; + goto fail; + } + + if (header.size <= 1 || header.cluster_bits < 9) { + ret = -EINVAL; + goto fail; + } + if (header.crypt_method > QCOW_CRYPT_AES) { + ret = -EINVAL; + goto fail; + } + s->crypt_method_header = header.crypt_method; + if (s->crypt_method_header) { + bs->encrypted = 1; + } + s->cluster_bits = header.cluster_bits; + s->cluster_size = 1 << s->cluster_bits; + s->cluster_sectors = 1 << (s->cluster_bits - 9); + s->l2_bits = header.l2_bits; + s->l2_size = 1 << s->l2_bits; + bs->total_sectors = header.size / 512; + s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1; + + /* read the level 1 table */ + shift = s->cluster_bits + s->l2_bits; + s->l1_size = (header.size + (1LL << shift) - 1) >> shift; + + s->l1_table_offset = header.l1_table_offset; + s->l1_table = g_malloc(s->l1_size * sizeof(uint64_t)); + + ret = 
bdrv_pread(bs->file, s->l1_table_offset, s->l1_table, + s->l1_size * sizeof(uint64_t)); + if (ret < 0) { + goto fail; + } + + for(i = 0;i < s->l1_size; i++) { + be64_to_cpus(&s->l1_table[i]); + } + /* alloc L2 cache */ + s->l2_cache = g_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t)); + s->cluster_cache = g_malloc(s->cluster_size); + s->cluster_data = g_malloc(s->cluster_size); + s->cluster_cache_offset = -1; + + /* read the backing file name */ + if (header.backing_file_offset != 0) { + len = header.backing_file_size; + if (len > 1023) { + len = 1023; + } + ret = bdrv_pread(bs->file, header.backing_file_offset, + bs->backing_file, len); + if (ret < 0) { + goto fail; + } + bs->backing_file[len] = '\0'; + } + + /* Disable migration when qcow images are used */ + error_set(&s->migration_blocker, + QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, + "qcow", bs->device_name, "live migration"); + migrate_add_blocker(s->migration_blocker); + + qemu_co_mutex_init(&s->lock); + return 0; + + fail: + g_free(s->l1_table); + g_free(s->l2_cache); + g_free(s->cluster_cache); + g_free(s->cluster_data); + return ret; +} + + +/* We have nothing to do for QCOW reopen, stubs just return + * success */ +static int qcow_reopen_prepare(BDRVReopenState *state, + BlockReopenQueue *queue, Error **errp) +{ + return 0; +} + +static int qcow_set_key(BlockDriverState *bs, const char *key) +{ + BDRVQcowState *s = bs->opaque; + uint8_t keybuf[16]; + int len, i; + + memset(keybuf, 0, 16); + len = strlen(key); + if (len > 16) + len = 16; + /* XXX: we could compress the chars to 7 bits to increase + entropy */ + for(i = 0;i < len;i++) { + keybuf[i] = key[i]; + } + s->crypt_method = s->crypt_method_header; + + if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0) + return -1; + if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0) + return -1; + return 0; +} + +/* The crypt function is compatible with the linux cryptoloop + algorithm for < 4 GB images. NOTE: out_buf == in_buf is + supported */ +static void encrypt_sectors(BDRVQcowState *s, int64_t sector_num, + uint8_t *out_buf, const uint8_t *in_buf, + int nb_sectors, int enc, + const AES_KEY *key) +{ + union { + uint64_t ll[2]; + uint8_t b[16]; + } ivec; + int i; + + for(i = 0; i < nb_sectors; i++) { + ivec.ll[0] = cpu_to_le64(sector_num); + ivec.ll[1] = 0; + AES_cbc_encrypt(in_buf, out_buf, 512, key, + ivec.b, enc); + sector_num++; + in_buf += 512; + out_buf += 512; + } +} + +/* 'allocate' is: + * + * 0 to not allocate. + * + * 1 to allocate a normal cluster (for sector indexes 'n_start' to + * 'n_end') + * + * 2 to allocate a compressed cluster of size + * 'compressed_size'. 'compressed_size' must be > 0 and < + * cluster_size + * + * return 0 if not allocated. 
+ */ +static uint64_t get_cluster_offset(BlockDriverState *bs, + uint64_t offset, int allocate, + int compressed_size, + int n_start, int n_end) +{ + BDRVQcowState *s = bs->opaque; + int min_index, i, j, l1_index, l2_index; + uint64_t l2_offset, *l2_table, cluster_offset, tmp; + uint32_t min_count; + int new_l2_table; + + l1_index = offset >> (s->l2_bits + s->cluster_bits); + l2_offset = s->l1_table[l1_index]; + new_l2_table = 0; + if (!l2_offset) { + if (!allocate) + return 0; + /* allocate a new l2 entry */ + l2_offset = bdrv_getlength(bs->file); + /* round to cluster size */ + l2_offset = (l2_offset + s->cluster_size - 1) & ~(s->cluster_size - 1); + /* update the L1 entry */ + s->l1_table[l1_index] = l2_offset; + tmp = cpu_to_be64(l2_offset); + if (bdrv_pwrite_sync(bs->file, + s->l1_table_offset + l1_index * sizeof(tmp), + &tmp, sizeof(tmp)) < 0) + return 0; + new_l2_table = 1; + } + for(i = 0; i < L2_CACHE_SIZE; i++) { + if (l2_offset == s->l2_cache_offsets[i]) { + /* increment the hit count */ + if (++s->l2_cache_counts[i] == 0xffffffff) { + for(j = 0; j < L2_CACHE_SIZE; j++) { + s->l2_cache_counts[j] >>= 1; + } + } + l2_table = s->l2_cache + (i << s->l2_bits); + goto found; + } + } + /* not found: load a new entry in the least used one */ + min_index = 0; + min_count = 0xffffffff; + for(i = 0; i < L2_CACHE_SIZE; i++) { + if (s->l2_cache_counts[i] < min_count) { + min_count = s->l2_cache_counts[i]; + min_index = i; + } + } + l2_table = s->l2_cache + (min_index << s->l2_bits); + if (new_l2_table) { + memset(l2_table, 0, s->l2_size * sizeof(uint64_t)); + if (bdrv_pwrite_sync(bs->file, l2_offset, l2_table, + s->l2_size * sizeof(uint64_t)) < 0) + return 0; + } else { + if (bdrv_pread(bs->file, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) != + s->l2_size * sizeof(uint64_t)) + return 0; + } + s->l2_cache_offsets[min_index] = l2_offset; + s->l2_cache_counts[min_index] = 1; + found: + l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); + cluster_offset = be64_to_cpu(l2_table[l2_index]); + if (!cluster_offset || + ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1)) { + if (!allocate) + return 0; + /* allocate a new cluster */ + if ((cluster_offset & QCOW_OFLAG_COMPRESSED) && + (n_end - n_start) < s->cluster_sectors) { + /* if the cluster is already compressed, we must + decompress it in the case it is not completely + overwritten */ + if (decompress_cluster(bs, cluster_offset) < 0) + return 0; + cluster_offset = bdrv_getlength(bs->file); + cluster_offset = (cluster_offset + s->cluster_size - 1) & + ~(s->cluster_size - 1); + /* write the cluster content */ + if (bdrv_pwrite(bs->file, cluster_offset, s->cluster_cache, s->cluster_size) != + s->cluster_size) + return -1; + } else { + cluster_offset = bdrv_getlength(bs->file); + if (allocate == 1) { + /* round to cluster size */ + cluster_offset = (cluster_offset + s->cluster_size - 1) & + ~(s->cluster_size - 1); + bdrv_truncate(bs->file, cluster_offset + s->cluster_size); + /* if encrypted, we must initialize the cluster + content which won't be written */ + if (s->crypt_method && + (n_end - n_start) < s->cluster_sectors) { + uint64_t start_sect; + start_sect = (offset & ~(s->cluster_size - 1)) >> 9; + memset(s->cluster_data + 512, 0x00, 512); + for(i = 0; i < s->cluster_sectors; i++) { + if (i < n_start || i >= n_end) { + encrypt_sectors(s, start_sect + i, + s->cluster_data, + s->cluster_data + 512, 1, 1, + &s->aes_encrypt_key); + if (bdrv_pwrite(bs->file, cluster_offset + i * 512, + s->cluster_data, 512) != 512) + return 
-1; + } + } + } + } else if (allocate == 2) { + cluster_offset |= QCOW_OFLAG_COMPRESSED | + (uint64_t)compressed_size << (63 - s->cluster_bits); + } + } + /* update L2 table */ + tmp = cpu_to_be64(cluster_offset); + l2_table[l2_index] = tmp; + if (bdrv_pwrite_sync(bs->file, l2_offset + l2_index * sizeof(tmp), + &tmp, sizeof(tmp)) < 0) + return 0; + } + return cluster_offset; +} + +static int coroutine_fn qcow_co_is_allocated(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, int *pnum) +{ + BDRVQcowState *s = bs->opaque; + int index_in_cluster, n; + uint64_t cluster_offset; + + qemu_co_mutex_lock(&s->lock); + cluster_offset = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0); + qemu_co_mutex_unlock(&s->lock); + index_in_cluster = sector_num & (s->cluster_sectors - 1); + n = s->cluster_sectors - index_in_cluster; + if (n > nb_sectors) + n = nb_sectors; + *pnum = n; + return (cluster_offset != 0); +} + +static int decompress_buffer(uint8_t *out_buf, int out_buf_size, + const uint8_t *buf, int buf_size) +{ + z_stream strm1, *strm = &strm1; + int ret, out_len; + + memset(strm, 0, sizeof(*strm)); + + strm->next_in = (uint8_t *)buf; + strm->avail_in = buf_size; + strm->next_out = out_buf; + strm->avail_out = out_buf_size; + + ret = inflateInit2(strm, -12); + if (ret != Z_OK) + return -1; + ret = inflate(strm, Z_FINISH); + out_len = strm->next_out - out_buf; + if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) || + out_len != out_buf_size) { + inflateEnd(strm); + return -1; + } + inflateEnd(strm); + return 0; +} + +static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) +{ + BDRVQcowState *s = bs->opaque; + int ret, csize; + uint64_t coffset; + + coffset = cluster_offset & s->cluster_offset_mask; + if (s->cluster_cache_offset != coffset) { + csize = cluster_offset >> (63 - s->cluster_bits); + csize &= (s->cluster_size - 1); + ret = bdrv_pread(bs->file, coffset, s->cluster_data, csize); + if (ret != csize) + return -1; + if (decompress_buffer(s->cluster_cache, s->cluster_size, + s->cluster_data, csize) < 0) { + return -1; + } + s->cluster_cache_offset = coffset; + } + return 0; +} + +static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + BDRVQcowState *s = bs->opaque; + int index_in_cluster; + int ret = 0, n; + uint64_t cluster_offset; + struct iovec hd_iov; + QEMUIOVector hd_qiov; + uint8_t *buf; + void *orig_buf; + + if (qiov->niov > 1) { + buf = orig_buf = qemu_blockalign(bs, qiov->size); + } else { + orig_buf = NULL; + buf = (uint8_t *)qiov->iov->iov_base; + } + + qemu_co_mutex_lock(&s->lock); + + while (nb_sectors != 0) { + /* prepare next request */ + cluster_offset = get_cluster_offset(bs, sector_num << 9, + 0, 0, 0, 0); + index_in_cluster = sector_num & (s->cluster_sectors - 1); + n = s->cluster_sectors - index_in_cluster; + if (n > nb_sectors) { + n = nb_sectors; + } + + if (!cluster_offset) { + if (bs->backing_hd) { + /* read from the base image */ + hd_iov.iov_base = (void *)buf; + hd_iov.iov_len = n * 512; + qemu_iovec_init_external(&hd_qiov, &hd_iov, 1); + qemu_co_mutex_unlock(&s->lock); + ret = bdrv_co_readv(bs->backing_hd, sector_num, + n, &hd_qiov); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + goto fail; + } + } else { + /* Note: in this case, no need to wait */ + memset(buf, 0, 512 * n); + } + } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) { + /* add AIO support for compressed blocks ? 
*/ + if (decompress_cluster(bs, cluster_offset) < 0) { + goto fail; + } + memcpy(buf, + s->cluster_cache + index_in_cluster * 512, 512 * n); + } else { + if ((cluster_offset & 511) != 0) { + goto fail; + } + hd_iov.iov_base = (void *)buf; + hd_iov.iov_len = n * 512; + qemu_iovec_init_external(&hd_qiov, &hd_iov, 1); + qemu_co_mutex_unlock(&s->lock); + ret = bdrv_co_readv(bs->file, + (cluster_offset >> 9) + index_in_cluster, + n, &hd_qiov); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + break; + } + if (s->crypt_method) { + encrypt_sectors(s, sector_num, buf, buf, + n, 0, + &s->aes_decrypt_key); + } + } + ret = 0; + + nb_sectors -= n; + sector_num += n; + buf += n * 512; + } + +done: + qemu_co_mutex_unlock(&s->lock); + + if (qiov->niov > 1) { + qemu_iovec_from_buf(qiov, 0, orig_buf, qiov->size); + qemu_vfree(orig_buf); + } + + return ret; + +fail: + ret = -EIO; + goto done; +} + +static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + BDRVQcowState *s = bs->opaque; + int index_in_cluster; + uint64_t cluster_offset; + const uint8_t *src_buf; + int ret = 0, n; + uint8_t *cluster_data = NULL; + struct iovec hd_iov; + QEMUIOVector hd_qiov; + uint8_t *buf; + void *orig_buf; + + s->cluster_cache_offset = -1; /* disable compressed cache */ + + if (qiov->niov > 1) { + buf = orig_buf = qemu_blockalign(bs, qiov->size); + qemu_iovec_to_buf(qiov, 0, buf, qiov->size); + } else { + orig_buf = NULL; + buf = (uint8_t *)qiov->iov->iov_base; + } + + qemu_co_mutex_lock(&s->lock); + + while (nb_sectors != 0) { + + index_in_cluster = sector_num & (s->cluster_sectors - 1); + n = s->cluster_sectors - index_in_cluster; + if (n > nb_sectors) { + n = nb_sectors; + } + cluster_offset = get_cluster_offset(bs, sector_num << 9, 1, 0, + index_in_cluster, + index_in_cluster + n); + if (!cluster_offset || (cluster_offset & 511) != 0) { + ret = -EIO; + break; + } + if (s->crypt_method) { + if (!cluster_data) { + cluster_data = g_malloc0(s->cluster_size); + } + encrypt_sectors(s, sector_num, cluster_data, buf, + n, 1, &s->aes_encrypt_key); + src_buf = cluster_data; + } else { + src_buf = buf; + } + + hd_iov.iov_base = (void *)src_buf; + hd_iov.iov_len = n * 512; + qemu_iovec_init_external(&hd_qiov, &hd_iov, 1); + qemu_co_mutex_unlock(&s->lock); + ret = bdrv_co_writev(bs->file, + (cluster_offset >> 9) + index_in_cluster, + n, &hd_qiov); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + break; + } + ret = 0; + + nb_sectors -= n; + sector_num += n; + buf += n * 512; + } + qemu_co_mutex_unlock(&s->lock); + + if (qiov->niov > 1) { + qemu_vfree(orig_buf); + } + g_free(cluster_data); + + return ret; +} + +static void qcow_close(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + + g_free(s->l1_table); + g_free(s->l2_cache); + g_free(s->cluster_cache); + g_free(s->cluster_data); + + migrate_del_blocker(s->migration_blocker); + error_free(s->migration_blocker); +} + +static int qcow_create(const char *filename, QEMUOptionParameter *options) +{ + int header_size, backing_filename_len, l1_size, shift, i; + QCowHeader header; + uint8_t *tmp; + int64_t total_size = 0; + const char *backing_file = NULL; + int flags = 0; + int ret; + BlockDriverState *qcow_bs; + + /* Read out options */ + while (options && options->name) { + if (!strcmp(options->name, BLOCK_OPT_SIZE)) { + total_size = options->value.n / 512; + } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) { + backing_file = options->value.s; + } else if (!strcmp(options->name, BLOCK_OPT_ENCRYPT)) { 
+ flags |= options->value.n ? BLOCK_FLAG_ENCRYPT : 0; + } + options++; + } + + ret = bdrv_create_file(filename, options); + if (ret < 0) { + return ret; + } + + ret = bdrv_file_open(&qcow_bs, filename, NULL, BDRV_O_RDWR); + if (ret < 0) { + return ret; + } + + ret = bdrv_truncate(qcow_bs, 0); + if (ret < 0) { + goto exit; + } + + memset(&header, 0, sizeof(header)); + header.magic = cpu_to_be32(QCOW_MAGIC); + header.version = cpu_to_be32(QCOW_VERSION); + header.size = cpu_to_be64(total_size * 512); + header_size = sizeof(header); + backing_filename_len = 0; + if (backing_file) { + if (strcmp(backing_file, "fat:")) { + header.backing_file_offset = cpu_to_be64(header_size); + backing_filename_len = strlen(backing_file); + header.backing_file_size = cpu_to_be32(backing_filename_len); + header_size += backing_filename_len; + } else { + /* special backing file for vvfat */ + backing_file = NULL; + } + header.cluster_bits = 9; /* 512 byte cluster to avoid copying + unmodifyed sectors */ + header.l2_bits = 12; /* 32 KB L2 tables */ + } else { + header.cluster_bits = 12; /* 4 KB clusters */ + header.l2_bits = 9; /* 4 KB L2 tables */ + } + header_size = (header_size + 7) & ~7; + shift = header.cluster_bits + header.l2_bits; + l1_size = ((total_size * 512) + (1LL << shift) - 1) >> shift; + + header.l1_table_offset = cpu_to_be64(header_size); + if (flags & BLOCK_FLAG_ENCRYPT) { + header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES); + } else { + header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE); + } + + /* write all the data */ + ret = bdrv_pwrite(qcow_bs, 0, &header, sizeof(header)); + if (ret != sizeof(header)) { + goto exit; + } + + if (backing_file) { + ret = bdrv_pwrite(qcow_bs, sizeof(header), + backing_file, backing_filename_len); + if (ret != backing_filename_len) { + goto exit; + } + } + + tmp = g_malloc0(BDRV_SECTOR_SIZE); + for (i = 0; i < ((sizeof(uint64_t)*l1_size + BDRV_SECTOR_SIZE - 1)/ + BDRV_SECTOR_SIZE); i++) { + ret = bdrv_pwrite(qcow_bs, header_size + + BDRV_SECTOR_SIZE*i, tmp, BDRV_SECTOR_SIZE); + if (ret != BDRV_SECTOR_SIZE) { + g_free(tmp); + goto exit; + } + } + + g_free(tmp); + ret = 0; +exit: + bdrv_delete(qcow_bs); + return ret; +} + +static int qcow_make_empty(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + uint32_t l1_length = s->l1_size * sizeof(uint64_t); + int ret; + + memset(s->l1_table, 0, l1_length); + if (bdrv_pwrite_sync(bs->file, s->l1_table_offset, s->l1_table, + l1_length) < 0) + return -1; + ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length); + if (ret < 0) + return ret; + + memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t)); + memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t)); + memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t)); + + return 0; +} + +/* XXX: put compressed sectors first, then all the cluster aligned + tables to avoid losing bytes in alignment */ +static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + BDRVQcowState *s = bs->opaque; + z_stream strm; + int ret, out_len; + uint8_t *out_buf; + uint64_t cluster_offset; + + if (nb_sectors != s->cluster_sectors) { + ret = -EINVAL; + + /* Zero-pad last write if image size is not cluster aligned */ + if (sector_num + nb_sectors == bs->total_sectors && + nb_sectors < s->cluster_sectors) { + uint8_t *pad_buf = qemu_blockalign(bs, s->cluster_size); + memset(pad_buf, 0, s->cluster_size); + memcpy(pad_buf, buf, nb_sectors * BDRV_SECTOR_SIZE); + ret = qcow_write_compressed(bs, 
sector_num, + pad_buf, s->cluster_sectors); + qemu_vfree(pad_buf); + } + return ret; + } + + out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128); + + /* best compression, small window, no zlib header */ + memset(&strm, 0, sizeof(strm)); + ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION, + Z_DEFLATED, -12, + 9, Z_DEFAULT_STRATEGY); + if (ret != 0) { + ret = -EINVAL; + goto fail; + } + + strm.avail_in = s->cluster_size; + strm.next_in = (uint8_t *)buf; + strm.avail_out = s->cluster_size; + strm.next_out = out_buf; + + ret = deflate(&strm, Z_FINISH); + if (ret != Z_STREAM_END && ret != Z_OK) { + deflateEnd(&strm); + ret = -EINVAL; + goto fail; + } + out_len = strm.next_out - out_buf; + + deflateEnd(&strm); + + if (ret != Z_STREAM_END || out_len >= s->cluster_size) { + /* could not compress: write normal cluster */ + ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors); + if (ret < 0) { + goto fail; + } + } else { + cluster_offset = get_cluster_offset(bs, sector_num << 9, 2, + out_len, 0, 0); + if (cluster_offset == 0) { + ret = -EIO; + goto fail; + } + + cluster_offset &= s->cluster_offset_mask; + ret = bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len); + if (ret < 0) { + goto fail; + } + } + + ret = 0; +fail: + g_free(out_buf); + return ret; +} + +static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + BDRVQcowState *s = bs->opaque; + bdi->cluster_size = s->cluster_size; + return 0; +} + + +static QEMUOptionParameter qcow_create_options[] = { + { + .name = BLOCK_OPT_SIZE, + .type = OPT_SIZE, + .help = "Virtual disk size" + }, + { + .name = BLOCK_OPT_BACKING_FILE, + .type = OPT_STRING, + .help = "File name of a base image" + }, + { + .name = BLOCK_OPT_ENCRYPT, + .type = OPT_FLAG, + .help = "Encrypt the image" + }, + { NULL } +}; + +static BlockDriver bdrv_qcow = { + .format_name = "qcow", + .instance_size = sizeof(BDRVQcowState), + .bdrv_probe = qcow_probe, + .bdrv_open = qcow_open, + .bdrv_close = qcow_close, + .bdrv_reopen_prepare = qcow_reopen_prepare, + .bdrv_create = qcow_create, + .bdrv_has_zero_init = bdrv_has_zero_init_1, + + .bdrv_co_readv = qcow_co_readv, + .bdrv_co_writev = qcow_co_writev, + .bdrv_co_is_allocated = qcow_co_is_allocated, + + .bdrv_set_key = qcow_set_key, + .bdrv_make_empty = qcow_make_empty, + .bdrv_write_compressed = qcow_write_compressed, + .bdrv_get_info = qcow_get_info, + + .create_options = qcow_create_options, +}; + +static void bdrv_qcow_init(void) +{ + bdrv_register(&bdrv_qcow); +} + +block_init(bdrv_qcow_init); diff --git a/contrib/qemu/block/qcow2-cache.c b/contrib/qemu/block/qcow2-cache.c new file mode 100644 index 00000000000..2f3114ecc24 --- /dev/null +++ b/contrib/qemu/block/qcow2-cache.c @@ -0,0 +1,323 @@ +/* + * L2/refcount table cache for the QCOW2 format + * + * Copyright (c) 2010 Kevin Wolf + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "block/block_int.h" +#include "qemu-common.h" +#include "qcow2.h" +#include "trace.h" + +typedef struct Qcow2CachedTable { + void* table; + int64_t offset; + bool dirty; + int cache_hits; + int ref; +} Qcow2CachedTable; + +struct Qcow2Cache { + Qcow2CachedTable* entries; + struct Qcow2Cache* depends; + int size; + bool depends_on_flush; +}; + +Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables) +{ + BDRVQcowState *s = bs->opaque; + Qcow2Cache *c; + int i; + + c = g_malloc0(sizeof(*c)); + c->size = num_tables; + c->entries = g_malloc0(sizeof(*c->entries) * num_tables); + + for (i = 0; i < c->size; i++) { + c->entries[i].table = qemu_blockalign(bs, s->cluster_size); + } + + return c; +} + +int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c) +{ + int i; + + for (i = 0; i < c->size; i++) { + assert(c->entries[i].ref == 0); + qemu_vfree(c->entries[i].table); + } + + g_free(c->entries); + g_free(c); + + return 0; +} + +static int qcow2_cache_flush_dependency(BlockDriverState *bs, Qcow2Cache *c) +{ + int ret; + + ret = qcow2_cache_flush(bs, c->depends); + if (ret < 0) { + return ret; + } + + c->depends = NULL; + c->depends_on_flush = false; + + return 0; +} + +static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i) +{ + BDRVQcowState *s = bs->opaque; + int ret = 0; + + if (!c->entries[i].dirty || !c->entries[i].offset) { + return 0; + } + + trace_qcow2_cache_entry_flush(qemu_coroutine_self(), + c == s->l2_table_cache, i); + + if (c->depends) { + ret = qcow2_cache_flush_dependency(bs, c); + } else if (c->depends_on_flush) { + ret = bdrv_flush(bs->file); + if (ret >= 0) { + c->depends_on_flush = false; + } + } + + if (ret < 0) { + return ret; + } + + if (c == s->refcount_block_cache) { + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_UPDATE_PART); + } else if (c == s->l2_table_cache) { + BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE); + } + + ret = bdrv_pwrite(bs->file, c->entries[i].offset, c->entries[i].table, + s->cluster_size); + if (ret < 0) { + return ret; + } + + c->entries[i].dirty = false; + + return 0; +} + +int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c) +{ + BDRVQcowState *s = bs->opaque; + int result = 0; + int ret; + int i; + + trace_qcow2_cache_flush(qemu_coroutine_self(), c == s->l2_table_cache); + + for (i = 0; i < c->size; i++) { + ret = qcow2_cache_entry_flush(bs, c, i); + if (ret < 0 && result != -ENOSPC) { + result = ret; + } + } + + if (result == 0) { + ret = bdrv_flush(bs->file); + if (ret < 0) { + result = ret; + } + } + + return result; +} + +int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c, + Qcow2Cache *dependency) +{ + int ret; + + if (dependency->depends) { + ret = qcow2_cache_flush_dependency(bs, dependency); + if (ret < 0) { + return ret; + } + } + + if (c->depends && (c->depends != dependency)) { + ret = qcow2_cache_flush_dependency(bs, c); + if (ret < 0) { + return ret; + } + } + + c->depends = dependency; + return 0; +} + +void qcow2_cache_depends_on_flush(Qcow2Cache *c) +{ + c->depends_on_flush = 
true; +} + +static int qcow2_cache_find_entry_to_replace(Qcow2Cache *c) +{ + int i; + int min_count = INT_MAX; + int min_index = -1; + + + for (i = 0; i < c->size; i++) { + if (c->entries[i].ref) { + continue; + } + + if (c->entries[i].cache_hits < min_count) { + min_index = i; + min_count = c->entries[i].cache_hits; + } + + /* Give newer hits priority */ + /* TODO Check how to optimize the replacement strategy */ + c->entries[i].cache_hits /= 2; + } + + if (min_index == -1) { + /* This can't happen in current synchronous code, but leave the check + * here as a reminder for whoever starts using AIO with the cache */ + abort(); + } + return min_index; +} + +static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c, + uint64_t offset, void **table, bool read_from_disk) +{ + BDRVQcowState *s = bs->opaque; + int i; + int ret; + + trace_qcow2_cache_get(qemu_coroutine_self(), c == s->l2_table_cache, + offset, read_from_disk); + + /* Check if the table is already cached */ + for (i = 0; i < c->size; i++) { + if (c->entries[i].offset == offset) { + goto found; + } + } + + /* If not, write a table back and replace it */ + i = qcow2_cache_find_entry_to_replace(c); + trace_qcow2_cache_get_replace_entry(qemu_coroutine_self(), + c == s->l2_table_cache, i); + if (i < 0) { + return i; + } + + ret = qcow2_cache_entry_flush(bs, c, i); + if (ret < 0) { + return ret; + } + + trace_qcow2_cache_get_read(qemu_coroutine_self(), + c == s->l2_table_cache, i); + c->entries[i].offset = 0; + if (read_from_disk) { + if (c == s->l2_table_cache) { + BLKDBG_EVENT(bs->file, BLKDBG_L2_LOAD); + } + + ret = bdrv_pread(bs->file, offset, c->entries[i].table, s->cluster_size); + if (ret < 0) { + return ret; + } + } + + /* Give the table some hits for the start so that it won't be replaced + * immediately. The number 32 is completely arbitrary. 
*/ + c->entries[i].cache_hits = 32; + c->entries[i].offset = offset; + + /* And return the right table */ +found: + c->entries[i].cache_hits++; + c->entries[i].ref++; + *table = c->entries[i].table; + + trace_qcow2_cache_get_done(qemu_coroutine_self(), + c == s->l2_table_cache, i); + + return 0; +} + +int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, + void **table) +{ + return qcow2_cache_do_get(bs, c, offset, table, true); +} + +int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, + void **table) +{ + return qcow2_cache_do_get(bs, c, offset, table, false); +} + +int qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table) +{ + int i; + + for (i = 0; i < c->size; i++) { + if (c->entries[i].table == *table) { + goto found; + } + } + return -ENOENT; + +found: + c->entries[i].ref--; + *table = NULL; + + assert(c->entries[i].ref >= 0); + return 0; +} + +void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table) +{ + int i; + + for (i = 0; i < c->size; i++) { + if (c->entries[i].table == table) { + goto found; + } + } + abort(); + +found: + c->entries[i].dirty = true; +} diff --git a/contrib/qemu/block/qcow2-cluster.c b/contrib/qemu/block/qcow2-cluster.c new file mode 100644 index 00000000000..cca76d4fcdd --- /dev/null +++ b/contrib/qemu/block/qcow2-cluster.c @@ -0,0 +1,1478 @@ +/* + * Block driver for the QCOW version 2 format + * + * Copyright (c) 2004-2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include + +#include "qemu-common.h" +#include "block/block_int.h" +#include "block/qcow2.h" +#include "trace.h" + +int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, + bool exact_size) +{ + BDRVQcowState *s = bs->opaque; + int new_l1_size2, ret, i; + uint64_t *new_l1_table; + int64_t new_l1_table_offset, new_l1_size; + uint8_t data[12]; + + if (min_size <= s->l1_size) + return 0; + + if (exact_size) { + new_l1_size = min_size; + } else { + /* Bump size up to reduce the number of times we have to grow */ + new_l1_size = s->l1_size; + if (new_l1_size == 0) { + new_l1_size = 1; + } + while (min_size > new_l1_size) { + new_l1_size = (new_l1_size * 3 + 1) / 2; + } + } + + if (new_l1_size > INT_MAX) { + return -EFBIG; + } + +#ifdef DEBUG_ALLOC2 + fprintf(stderr, "grow l1_table from %d to %" PRId64 "\n", + s->l1_size, new_l1_size); +#endif + + new_l1_size2 = sizeof(uint64_t) * new_l1_size; + new_l1_table = g_malloc0(align_offset(new_l1_size2, 512)); + memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t)); + + /* write new table (align to cluster) */ + BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ALLOC_TABLE); + new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2); + if (new_l1_table_offset < 0) { + g_free(new_l1_table); + return new_l1_table_offset; + } + + ret = qcow2_cache_flush(bs, s->refcount_block_cache); + if (ret < 0) { + goto fail; + } + + BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE); + for(i = 0; i < s->l1_size; i++) + new_l1_table[i] = cpu_to_be64(new_l1_table[i]); + ret = bdrv_pwrite_sync(bs->file, new_l1_table_offset, new_l1_table, new_l1_size2); + if (ret < 0) + goto fail; + for(i = 0; i < s->l1_size; i++) + new_l1_table[i] = be64_to_cpu(new_l1_table[i]); + + /* set new table */ + BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE); + cpu_to_be32w((uint32_t*)data, new_l1_size); + cpu_to_be64wu((uint64_t*)(data + 4), new_l1_table_offset); + ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_size), data,sizeof(data)); + if (ret < 0) { + goto fail; + } + g_free(s->l1_table); + qcow2_free_clusters(bs, s->l1_table_offset, s->l1_size * sizeof(uint64_t), + QCOW2_DISCARD_OTHER); + s->l1_table_offset = new_l1_table_offset; + s->l1_table = new_l1_table; + s->l1_size = new_l1_size; + return 0; + fail: + g_free(new_l1_table); + qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2, + QCOW2_DISCARD_OTHER); + return ret; +} + +/* + * l2_load + * + * Loads a L2 table into memory. If the table is in the cache, the cache + * is used; otherwise the L2 table is loaded from the image file. + * + * Returns a pointer to the L2 table on success, or NULL if the read from + * the image file failed. 
+ */ + +static int l2_load(BlockDriverState *bs, uint64_t l2_offset, + uint64_t **l2_table) +{ + BDRVQcowState *s = bs->opaque; + int ret; + + ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, (void**) l2_table); + + return ret; +} + +/* + * Writes one sector of the L1 table to the disk (can't update single entries + * and we really don't want bdrv_pread to perform a read-modify-write) + */ +#define L1_ENTRIES_PER_SECTOR (512 / 8) +static int write_l1_entry(BlockDriverState *bs, int l1_index) +{ + BDRVQcowState *s = bs->opaque; + uint64_t buf[L1_ENTRIES_PER_SECTOR]; + int l1_start_index; + int i, ret; + + l1_start_index = l1_index & ~(L1_ENTRIES_PER_SECTOR - 1); + for (i = 0; i < L1_ENTRIES_PER_SECTOR; i++) { + buf[i] = cpu_to_be64(s->l1_table[l1_start_index + i]); + } + + BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE); + ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset + 8 * l1_start_index, + buf, sizeof(buf)); + if (ret < 0) { + return ret; + } + + return 0; +} + +/* + * l2_allocate + * + * Allocate a new l2 entry in the file. If l1_index points to an already + * used entry in the L2 table (i.e. we are doing a copy on write for the L2 + * table) copy the contents of the old L2 table into the newly allocated one. + * Otherwise the new table is initialized with zeros. + * + */ + +static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table) +{ + BDRVQcowState *s = bs->opaque; + uint64_t old_l2_offset; + uint64_t *l2_table; + int64_t l2_offset; + int ret; + + old_l2_offset = s->l1_table[l1_index]; + + trace_qcow2_l2_allocate(bs, l1_index); + + /* allocate a new l2 entry */ + + l2_offset = qcow2_alloc_clusters(bs, s->l2_size * sizeof(uint64_t)); + if (l2_offset < 0) { + return l2_offset; + } + + ret = qcow2_cache_flush(bs, s->refcount_block_cache); + if (ret < 0) { + goto fail; + } + + /* allocate a new entry in the l2 cache */ + + trace_qcow2_l2_allocate_get_empty(bs, l1_index); + ret = qcow2_cache_get_empty(bs, s->l2_table_cache, l2_offset, (void**) table); + if (ret < 0) { + return ret; + } + + l2_table = *table; + + if ((old_l2_offset & L1E_OFFSET_MASK) == 0) { + /* if there was no old l2 table, clear the new table */ + memset(l2_table, 0, s->l2_size * sizeof(uint64_t)); + } else { + uint64_t* old_table; + + /* if there was an old l2 table, read it from the disk */ + BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ); + ret = qcow2_cache_get(bs, s->l2_table_cache, + old_l2_offset & L1E_OFFSET_MASK, + (void**) &old_table); + if (ret < 0) { + goto fail; + } + + memcpy(l2_table, old_table, s->cluster_size); + + ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &old_table); + if (ret < 0) { + goto fail; + } + } + + /* write the l2 table to the file */ + BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE); + + trace_qcow2_l2_allocate_write_l2(bs, l1_index); + qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + ret = qcow2_cache_flush(bs, s->l2_table_cache); + if (ret < 0) { + goto fail; + } + + /* update the L1 entry */ + trace_qcow2_l2_allocate_write_l1(bs, l1_index); + s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED; + ret = write_l1_entry(bs, l1_index); + if (ret < 0) { + goto fail; + } + + *table = l2_table; + trace_qcow2_l2_allocate_done(bs, l1_index, 0); + return 0; + +fail: + trace_qcow2_l2_allocate_done(bs, l1_index, ret); + qcow2_cache_put(bs, s->l2_table_cache, (void**) table); + s->l1_table[l1_index] = old_l2_offset; + return ret; +} + +/* + * Checks how many clusters in a given L2 table are contiguous in the image + * file. 
As soon as one of the flags in the bitmask stop_flags changes compared + * to the first cluster, the search is stopped and the cluster is not counted + * as contiguous. (This allows it, for example, to stop at the first compressed + * cluster which may require a different handling) + */ +static int count_contiguous_clusters(uint64_t nb_clusters, int cluster_size, + uint64_t *l2_table, uint64_t start, uint64_t stop_flags) +{ + int i; + uint64_t mask = stop_flags | L2E_OFFSET_MASK; + uint64_t offset = be64_to_cpu(l2_table[0]) & mask; + + if (!offset) + return 0; + + for (i = start; i < start + nb_clusters; i++) { + uint64_t l2_entry = be64_to_cpu(l2_table[i]) & mask; + if (offset + (uint64_t) i * cluster_size != l2_entry) { + break; + } + } + + return (i - start); +} + +static int count_contiguous_free_clusters(uint64_t nb_clusters, uint64_t *l2_table) +{ + int i; + + for (i = 0; i < nb_clusters; i++) { + int type = qcow2_get_cluster_type(be64_to_cpu(l2_table[i])); + + if (type != QCOW2_CLUSTER_UNALLOCATED) { + break; + } + } + + return i; +} + +/* The crypt function is compatible with the linux cryptoloop + algorithm for < 4 GB images. NOTE: out_buf == in_buf is + supported */ +void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num, + uint8_t *out_buf, const uint8_t *in_buf, + int nb_sectors, int enc, + const AES_KEY *key) +{ + union { + uint64_t ll[2]; + uint8_t b[16]; + } ivec; + int i; + + for(i = 0; i < nb_sectors; i++) { + ivec.ll[0] = cpu_to_le64(sector_num); + ivec.ll[1] = 0; + AES_cbc_encrypt(in_buf, out_buf, 512, key, + ivec.b, enc); + sector_num++; + in_buf += 512; + out_buf += 512; + } +} + +static int coroutine_fn copy_sectors(BlockDriverState *bs, + uint64_t start_sect, + uint64_t cluster_offset, + int n_start, int n_end) +{ + BDRVQcowState *s = bs->opaque; + QEMUIOVector qiov; + struct iovec iov; + int n, ret; + + /* + * If this is the last cluster and it is only partially used, we must only + * copy until the end of the image, or bdrv_check_request will fail for the + * bdrv_read/write calls below. + */ + if (start_sect + n_end > bs->total_sectors) { + n_end = bs->total_sectors - start_sect; + } + + n = n_end - n_start; + if (n <= 0) { + return 0; + } + + iov.iov_len = n * BDRV_SECTOR_SIZE; + iov.iov_base = qemu_blockalign(bs, iov.iov_len); + + qemu_iovec_init_external(&qiov, &iov, 1); + + BLKDBG_EVENT(bs->file, BLKDBG_COW_READ); + + /* Call .bdrv_co_readv() directly instead of using the public block-layer + * interface. This avoids double I/O throttling and request tracking, + * which can lead to deadlock when block layer copy-on-read is enabled. + */ + ret = bs->drv->bdrv_co_readv(bs, start_sect + n_start, n, &qiov); + if (ret < 0) { + goto out; + } + + if (s->crypt_method) { + qcow2_encrypt_sectors(s, start_sect + n_start, + iov.iov_base, iov.iov_base, n, 1, + &s->aes_encrypt_key); + } + + BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE); + ret = bdrv_co_writev(bs->file, (cluster_offset >> 9) + n_start, n, &qiov); + if (ret < 0) { + goto out; + } + + ret = 0; +out: + qemu_vfree(iov.iov_base); + return ret; +} + + +/* + * get_cluster_offset + * + * For a given offset of the disk image, find the cluster offset in + * qcow2 file. The offset is stored in *cluster_offset. + * + * on entry, *num is the number of contiguous sectors we'd like to + * access following offset. + * + * on exit, *num is the number of contiguous sectors we can read. + * + * Returns the cluster type (QCOW2_CLUSTER_*) on success, -errno in error + * cases. 
+ */ +int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, + int *num, uint64_t *cluster_offset) +{ + BDRVQcowState *s = bs->opaque; + unsigned int l2_index; + uint64_t l1_index, l2_offset, *l2_table; + int l1_bits, c; + unsigned int index_in_cluster, nb_clusters; + uint64_t nb_available, nb_needed; + int ret; + + index_in_cluster = (offset >> 9) & (s->cluster_sectors - 1); + nb_needed = *num + index_in_cluster; + + l1_bits = s->l2_bits + s->cluster_bits; + + /* compute how many bytes there are between the offset and + * the end of the l1 entry + */ + + nb_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1)); + + /* compute the number of available sectors */ + + nb_available = (nb_available >> 9) + index_in_cluster; + + if (nb_needed > nb_available) { + nb_needed = nb_available; + } + + *cluster_offset = 0; + + /* seek the the l2 offset in the l1 table */ + + l1_index = offset >> l1_bits; + if (l1_index >= s->l1_size) { + ret = QCOW2_CLUSTER_UNALLOCATED; + goto out; + } + + l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK; + if (!l2_offset) { + ret = QCOW2_CLUSTER_UNALLOCATED; + goto out; + } + + /* load the l2 table in memory */ + + ret = l2_load(bs, l2_offset, &l2_table); + if (ret < 0) { + return ret; + } + + /* find the cluster offset for the given disk offset */ + + l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); + *cluster_offset = be64_to_cpu(l2_table[l2_index]); + nb_clusters = size_to_clusters(s, nb_needed << 9); + + ret = qcow2_get_cluster_type(*cluster_offset); + switch (ret) { + case QCOW2_CLUSTER_COMPRESSED: + /* Compressed clusters can only be processed one by one */ + c = 1; + *cluster_offset &= L2E_COMPRESSED_OFFSET_SIZE_MASK; + break; + case QCOW2_CLUSTER_ZERO: + if (s->qcow_version < 3) { + return -EIO; + } + c = count_contiguous_clusters(nb_clusters, s->cluster_size, + &l2_table[l2_index], 0, + QCOW_OFLAG_COMPRESSED | QCOW_OFLAG_ZERO); + *cluster_offset = 0; + break; + case QCOW2_CLUSTER_UNALLOCATED: + /* how many empty clusters ? */ + c = count_contiguous_free_clusters(nb_clusters, &l2_table[l2_index]); + *cluster_offset = 0; + break; + case QCOW2_CLUSTER_NORMAL: + /* how many allocated clusters ? */ + c = count_contiguous_clusters(nb_clusters, s->cluster_size, + &l2_table[l2_index], 0, + QCOW_OFLAG_COMPRESSED | QCOW_OFLAG_ZERO); + *cluster_offset &= L2E_OFFSET_MASK; + break; + default: + abort(); + } + + qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); + + nb_available = (c * s->cluster_sectors); + +out: + if (nb_available > nb_needed) + nb_available = nb_needed; + + *num = nb_available - index_in_cluster; + + return ret; +} + +/* + * get_cluster_table + * + * for a given disk offset, load (and allocate if needed) + * the l2 table. + * + * the l2 table offset in the qcow2 file and the cluster index + * in the l2 table are given to the caller. 
+ * + * Returns 0 on success, -errno in failure case + */ +static int get_cluster_table(BlockDriverState *bs, uint64_t offset, + uint64_t **new_l2_table, + int *new_l2_index) +{ + BDRVQcowState *s = bs->opaque; + unsigned int l2_index; + uint64_t l1_index, l2_offset; + uint64_t *l2_table = NULL; + int ret; + + /* seek the the l2 offset in the l1 table */ + + l1_index = offset >> (s->l2_bits + s->cluster_bits); + if (l1_index >= s->l1_size) { + ret = qcow2_grow_l1_table(bs, l1_index + 1, false); + if (ret < 0) { + return ret; + } + } + + assert(l1_index < s->l1_size); + l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK; + + /* seek the l2 table of the given l2 offset */ + + if (s->l1_table[l1_index] & QCOW_OFLAG_COPIED) { + /* load the l2 table in memory */ + ret = l2_load(bs, l2_offset, &l2_table); + if (ret < 0) { + return ret; + } + } else { + /* First allocate a new L2 table (and do COW if needed) */ + ret = l2_allocate(bs, l1_index, &l2_table); + if (ret < 0) { + return ret; + } + + /* Then decrease the refcount of the old table */ + if (l2_offset) { + qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t), + QCOW2_DISCARD_OTHER); + } + } + + /* find the cluster offset for the given disk offset */ + + l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); + + *new_l2_table = l2_table; + *new_l2_index = l2_index; + + return 0; +} + +/* + * alloc_compressed_cluster_offset + * + * For a given offset of the disk image, return cluster offset in + * qcow2 file. + * + * If the offset is not found, allocate a new compressed cluster. + * + * Return the cluster offset if successful, + * Return 0, otherwise. + * + */ + +uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, + uint64_t offset, + int compressed_size) +{ + BDRVQcowState *s = bs->opaque; + int l2_index, ret; + uint64_t *l2_table; + int64_t cluster_offset; + int nb_csectors; + + ret = get_cluster_table(bs, offset, &l2_table, &l2_index); + if (ret < 0) { + return 0; + } + + /* Compression can't overwrite anything. Fail if the cluster was already + * allocated. 
*/ + cluster_offset = be64_to_cpu(l2_table[l2_index]); + if (cluster_offset & L2E_OFFSET_MASK) { + qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); + return 0; + } + + cluster_offset = qcow2_alloc_bytes(bs, compressed_size); + if (cluster_offset < 0) { + qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); + return 0; + } + + nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) - + (cluster_offset >> 9); + + cluster_offset |= QCOW_OFLAG_COMPRESSED | + ((uint64_t)nb_csectors << s->csize_shift); + + /* update L2 table */ + + /* compressed clusters never have the copied flag */ + + BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED); + qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + l2_table[l2_index] = cpu_to_be64(cluster_offset); + ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); + if (ret < 0) { + return 0; + } + + return cluster_offset; +} + +static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, Qcow2COWRegion *r) +{ + BDRVQcowState *s = bs->opaque; + int ret; + + if (r->nb_sectors == 0) { + return 0; + } + + qemu_co_mutex_unlock(&s->lock); + ret = copy_sectors(bs, m->offset / BDRV_SECTOR_SIZE, m->alloc_offset, + r->offset / BDRV_SECTOR_SIZE, + r->offset / BDRV_SECTOR_SIZE + r->nb_sectors); + qemu_co_mutex_lock(&s->lock); + + if (ret < 0) { + return ret; + } + + /* + * Before we update the L2 table to actually point to the new cluster, we + * need to be sure that the refcounts have been increased and COW was + * handled. + */ + qcow2_cache_depends_on_flush(s->l2_table_cache); + + return 0; +} + +int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m) +{ + BDRVQcowState *s = bs->opaque; + int i, j = 0, l2_index, ret; + uint64_t *old_cluster, *l2_table; + uint64_t cluster_offset = m->alloc_offset; + + trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters); + assert(m->nb_clusters > 0); + + old_cluster = g_malloc(m->nb_clusters * sizeof(uint64_t)); + + /* copy content of unmodified sectors */ + ret = perform_cow(bs, m, &m->cow_start); + if (ret < 0) { + goto err; + } + + ret = perform_cow(bs, m, &m->cow_end); + if (ret < 0) { + goto err; + } + + /* Update L2 table. */ + if (s->use_lazy_refcounts) { + qcow2_mark_dirty(bs); + } + if (qcow2_need_accurate_refcounts(s)) { + qcow2_cache_set_dependency(bs, s->l2_table_cache, + s->refcount_block_cache); + } + + ret = get_cluster_table(bs, m->offset, &l2_table, &l2_index); + if (ret < 0) { + goto err; + } + qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + + for (i = 0; i < m->nb_clusters; i++) { + /* if two concurrent writes happen to the same unallocated cluster + * each write allocates separate cluster and writes data concurrently. + * The first one to complete updates l2 table with pointer to its + * cluster the second one has to do RMW (which is done above by + * copy_sectors()), update l2 table with its cluster pointer and free + * old cluster. This is what this loop does */ + if(l2_table[l2_index + i] != 0) + old_cluster[j++] = l2_table[l2_index + i]; + + l2_table[l2_index + i] = cpu_to_be64((cluster_offset + + (i << s->cluster_bits)) | QCOW_OFLAG_COPIED); + } + + + ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); + if (ret < 0) { + goto err; + } + + /* + * If this was a COW, we need to decrease the refcount of the old cluster. + * Also flush bs->file to get the right order for L2 and refcount update. + * + * Don't discard clusters that reach a refcount of 0 (e.g. compressed + * clusters), the next write will reuse them anyway. 
+ */ + if (j != 0) { + for (i = 0; i < j; i++) { + qcow2_free_any_clusters(bs, be64_to_cpu(old_cluster[i]), 1, + QCOW2_DISCARD_NEVER); + } + } + + ret = 0; +err: + g_free(old_cluster); + return ret; + } + +/* + * Returns the number of contiguous clusters that can be used for an allocating + * write, but require COW to be performed (this includes yet unallocated space, + * which must copy from the backing file) + */ +static int count_cow_clusters(BDRVQcowState *s, int nb_clusters, + uint64_t *l2_table, int l2_index) +{ + int i; + + for (i = 0; i < nb_clusters; i++) { + uint64_t l2_entry = be64_to_cpu(l2_table[l2_index + i]); + int cluster_type = qcow2_get_cluster_type(l2_entry); + + switch(cluster_type) { + case QCOW2_CLUSTER_NORMAL: + if (l2_entry & QCOW_OFLAG_COPIED) { + goto out; + } + break; + case QCOW2_CLUSTER_UNALLOCATED: + case QCOW2_CLUSTER_COMPRESSED: + case QCOW2_CLUSTER_ZERO: + break; + default: + abort(); + } + } + +out: + assert(i <= nb_clusters); + return i; +} + +/* + * Check if there already is an AIO write request in flight which allocates + * the same cluster. In this case we need to wait until the previous + * request has completed and updated the L2 table accordingly. + * + * Returns: + * 0 if there was no dependency. *cur_bytes indicates the number of + * bytes from guest_offset that can be read before the next + * dependency must be processed (or the request is complete) + * + * -EAGAIN if we had to wait for another request, previously gathered + * information on cluster allocation may be invalid now. The caller + * must start over anyway, so consider *cur_bytes undefined. + */ +static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset, + uint64_t *cur_bytes, QCowL2Meta **m) +{ + BDRVQcowState *s = bs->opaque; + QCowL2Meta *old_alloc; + uint64_t bytes = *cur_bytes; + + QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) { + + uint64_t start = guest_offset; + uint64_t end = start + bytes; + uint64_t old_start = l2meta_cow_start(old_alloc); + uint64_t old_end = l2meta_cow_end(old_alloc); + + if (end <= old_start || start >= old_end) { + /* No intersection */ + } else { + if (start < old_start) { + /* Stop at the start of a running allocation */ + bytes = old_start - start; + } else { + bytes = 0; + } + + /* Stop if already an l2meta exists. After yielding, it wouldn't + * be valid any more, so we'd have to clean up the old L2Metas + * and deal with requests depending on them before starting to + * gather new ones. Not worth the trouble. */ + if (bytes == 0 && *m) { + *cur_bytes = 0; + return 0; + } + + if (bytes == 0) { + /* Wait for the dependency to complete. We need to recheck + * the free/allocated clusters when we continue. */ + qemu_co_mutex_unlock(&s->lock); + qemu_co_queue_wait(&old_alloc->dependent_requests); + qemu_co_mutex_lock(&s->lock); + return -EAGAIN; + } + } + } + + /* Make sure that existing clusters and new allocations are only used up to + * the next dependency if we shortened the request above */ + *cur_bytes = bytes; + + return 0; +} + +/* + * Checks how many already allocated clusters that don't require a copy on + * write there are at the given guest_offset (up to *bytes). If + * *host_offset is not zero, only physically contiguous clusters beginning at + * this host offset are counted. + * + * Note that guest_offset may not be cluster aligned. In this case, the + * returned *host_offset points to exact byte referenced by guest_offset and + * therefore isn't cluster aligned as well. 
+ * + * Returns: + * 0: if no allocated clusters are available at the given offset. + * *bytes is normally unchanged. It is set to 0 if the cluster + * is allocated and doesn't need COW, but doesn't have the right + * physical offset. + * + * 1: if allocated clusters that don't require a COW are available at + * the requested offset. *bytes may have decreased and describes + * the length of the area that can be written to. + * + * -errno: in error cases + */ +static int handle_copied(BlockDriverState *bs, uint64_t guest_offset, + uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m) +{ + BDRVQcowState *s = bs->opaque; + int l2_index; + uint64_t cluster_offset; + uint64_t *l2_table; + unsigned int nb_clusters; + unsigned int keep_clusters; + int ret, pret; + + trace_qcow2_handle_copied(qemu_coroutine_self(), guest_offset, *host_offset, + *bytes); + + assert(*host_offset == 0 || offset_into_cluster(s, guest_offset) + == offset_into_cluster(s, *host_offset)); + + /* + * Calculate the number of clusters to look for. We stop at L2 table + * boundaries to keep things simple. + */ + nb_clusters = + size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes); + + l2_index = offset_to_l2_index(s, guest_offset); + nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); + + /* Find L2 entry for the first involved cluster */ + ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index); + if (ret < 0) { + return ret; + } + + cluster_offset = be64_to_cpu(l2_table[l2_index]); + + /* Check how many clusters are already allocated and don't need COW */ + if (qcow2_get_cluster_type(cluster_offset) == QCOW2_CLUSTER_NORMAL + && (cluster_offset & QCOW_OFLAG_COPIED)) + { + /* If a specific host_offset is required, check it */ + bool offset_matches = + (cluster_offset & L2E_OFFSET_MASK) == *host_offset; + + if (*host_offset != 0 && !offset_matches) { + *bytes = 0; + ret = 0; + goto out; + } + + /* We keep all QCOW_OFLAG_COPIED clusters */ + keep_clusters = + count_contiguous_clusters(nb_clusters, s->cluster_size, + &l2_table[l2_index], 0, + QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO); + assert(keep_clusters <= nb_clusters); + + *bytes = MIN(*bytes, + keep_clusters * s->cluster_size + - offset_into_cluster(s, guest_offset)); + + ret = 1; + } else { + ret = 0; + } + + /* Cleanup */ +out: + pret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); + if (pret < 0) { + return pret; + } + + /* Only return a host offset if we actually made progress. Otherwise we + * would make requirements for handle_alloc() that it can't fulfill */ + if (ret) { + *host_offset = (cluster_offset & L2E_OFFSET_MASK) + + offset_into_cluster(s, guest_offset); + } + + return ret; +} + +/* + * Allocates new clusters for the given guest_offset. + * + * At most *nb_clusters are allocated, and on return *nb_clusters is updated to + * contain the number of clusters that have been allocated and are contiguous + * in the image file. + * + * If *host_offset is non-zero, it specifies the offset in the image file at + * which the new clusters must start. *nb_clusters can be 0 on return in this + * case if the cluster at host_offset is already in use. If *host_offset is + * zero, the clusters can be allocated anywhere in the image file. + * + * *host_offset is updated to contain the offset into the image file at which + * the first allocated cluster starts. + * + * Return 0 on success and -errno in error cases. 
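handle_copied() above leans on small helpers such as offset_into_cluster(), start_of_cluster() and size_to_clusters(). The originals live in qcow2.h and take the BDRVQcowState; the self-contained versions below, parameterised by cluster_bits, are assumed equivalents for illustration.

#include <stdint.h>

static inline uint64_t start_of_cluster(int cluster_bits, uint64_t offset)
{
    return offset & ~(((uint64_t)1 << cluster_bits) - 1);   /* round down */
}

static inline uint64_t offset_into_cluster(int cluster_bits, uint64_t offset)
{
    return offset & (((uint64_t)1 << cluster_bits) - 1);    /* intra-cluster part */
}

static inline uint64_t size_to_clusters(int cluster_bits, uint64_t size)
{
    /* round up to whole clusters */
    return (size + ((uint64_t)1 << cluster_bits) - 1) >> cluster_bits;
}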
-EAGAIN means that the + * function has been waiting for another request and the allocation must be + * restarted, but the whole request should not be failed. + */ +static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset, + uint64_t *host_offset, unsigned int *nb_clusters) +{ + BDRVQcowState *s = bs->opaque; + + trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset, + *host_offset, *nb_clusters); + + /* Allocate new clusters */ + trace_qcow2_cluster_alloc_phys(qemu_coroutine_self()); + if (*host_offset == 0) { + int64_t cluster_offset = + qcow2_alloc_clusters(bs, *nb_clusters * s->cluster_size); + if (cluster_offset < 0) { + return cluster_offset; + } + *host_offset = cluster_offset; + return 0; + } else { + int ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters); + if (ret < 0) { + return ret; + } + *nb_clusters = ret; + return 0; + } +} + +/* + * Allocates new clusters for an area that either is yet unallocated or needs a + * copy on write. If *host_offset is non-zero, clusters are only allocated if + * the new allocation can match the specified host offset. + * + * Note that guest_offset may not be cluster aligned. In this case, the + * returned *host_offset points to exact byte referenced by guest_offset and + * therefore isn't cluster aligned as well. + * + * Returns: + * 0: if no clusters could be allocated. *bytes is set to 0, + * *host_offset is left unchanged. + * + * 1: if new clusters were allocated. *bytes may be decreased if the + * new allocation doesn't cover all of the requested area. + * *host_offset is updated to contain the host offset of the first + * newly allocated cluster. + * + * -errno: in error cases + */ +static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset, + uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m) +{ + BDRVQcowState *s = bs->opaque; + int l2_index; + uint64_t *l2_table; + uint64_t entry; + unsigned int nb_clusters; + int ret; + + uint64_t alloc_cluster_offset; + + trace_qcow2_handle_alloc(qemu_coroutine_self(), guest_offset, *host_offset, + *bytes); + assert(*bytes > 0); + + /* + * Calculate the number of clusters to look for. We stop at L2 table + * boundaries to keep things simple. + */ + nb_clusters = + size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes); + + l2_index = offset_to_l2_index(s, guest_offset); + nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); + + /* Find L2 entry for the first involved cluster */ + ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index); + if (ret < 0) { + return ret; + } + + entry = be64_to_cpu(l2_table[l2_index]); + + /* For the moment, overwrite compressed clusters one by one */ + if (entry & QCOW_OFLAG_COMPRESSED) { + nb_clusters = 1; + } else { + nb_clusters = count_cow_clusters(s, nb_clusters, l2_table, l2_index); + } + + /* This function is only called when there were no non-COW clusters, so if + * we can't find any unallocated or COW clusters either, something is + * wrong with our code. 
*/ + assert(nb_clusters > 0); + + ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); + if (ret < 0) { + return ret; + } + + /* Allocate, if necessary at a given offset in the image file */ + alloc_cluster_offset = start_of_cluster(s, *host_offset); + ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset, + &nb_clusters); + if (ret < 0) { + goto fail; + } + + /* Can't extend contiguous allocation */ + if (nb_clusters == 0) { + *bytes = 0; + return 0; + } + + /* + * Save info needed for meta data update. + * + * requested_sectors: Number of sectors from the start of the first + * newly allocated cluster to the end of the (possibly shortened + * before) write request. + * + * avail_sectors: Number of sectors from the start of the first + * newly allocated to the end of the last newly allocated cluster. + * + * nb_sectors: The number of sectors from the start of the first + * newly allocated cluster to the end of the area that the write + * request actually writes to (excluding COW at the end) + */ + int requested_sectors = + (*bytes + offset_into_cluster(s, guest_offset)) + >> BDRV_SECTOR_BITS; + int avail_sectors = nb_clusters + << (s->cluster_bits - BDRV_SECTOR_BITS); + int alloc_n_start = offset_into_cluster(s, guest_offset) + >> BDRV_SECTOR_BITS; + int nb_sectors = MIN(requested_sectors, avail_sectors); + QCowL2Meta *old_m = *m; + + *m = g_malloc0(sizeof(**m)); + + **m = (QCowL2Meta) { + .next = old_m, + + .alloc_offset = alloc_cluster_offset, + .offset = start_of_cluster(s, guest_offset), + .nb_clusters = nb_clusters, + .nb_available = nb_sectors, + + .cow_start = { + .offset = 0, + .nb_sectors = alloc_n_start, + }, + .cow_end = { + .offset = nb_sectors * BDRV_SECTOR_SIZE, + .nb_sectors = avail_sectors - nb_sectors, + }, + }; + qemu_co_queue_init(&(*m)->dependent_requests); + QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight); + + *host_offset = alloc_cluster_offset + offset_into_cluster(s, guest_offset); + *bytes = MIN(*bytes, (nb_sectors * BDRV_SECTOR_SIZE) + - offset_into_cluster(s, guest_offset)); + assert(*bytes != 0); + + return 1; + +fail: + if (*m && (*m)->nb_clusters > 0) { + QLIST_REMOVE(*m, next_in_flight); + } + return ret; +} + +/* + * alloc_cluster_offset + * + * For a given offset on the virtual disk, find the cluster offset in qcow2 + * file. If the offset is not found, allocate a new cluster. + * + * If the cluster was already allocated, m->nb_clusters is set to 0 and + * other fields in m are meaningless. + * + * If the cluster is newly allocated, m->nb_clusters is set to the number of + * contiguous clusters that have been allocated. In this case, the other + * fields of m are valid and contain information about the first allocated + * cluster. + * + * If the request conflicts with another write request in flight, the coroutine + * is queued and will be reentered when the dependency has completed. 
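The requested_sectors / avail_sectors / nb_sectors bookkeeping in handle_alloc() above is what decides which parts of the newly allocated clusters must be filled by copy-on-write. A reduced sketch with a 512-byte sector size, printing the head and tail COW regions for one example write, could look like this (all values are illustrative):

#include <stdint.h>
#include <stdio.h>

#define SECTOR_BITS 9

/* nb_clusters newly allocated; the write starts alloc_n_start sectors into the
 * first cluster and, measured from the start of that cluster, ends after
 * requested_sectors.  Everything outside that window must be copied. */
static void cow_regions(int cluster_bits, int nb_clusters,
                        int alloc_n_start, int requested_sectors)
{
    int avail_sectors = nb_clusters << (cluster_bits - SECTOR_BITS);
    int nb_sectors = requested_sectors < avail_sectors ? requested_sectors
                                                       : avail_sectors;

    printf("cow head: sectors [0, %d)\n", alloc_n_start);
    printf("cow tail: sectors [%d, %d)\n", nb_sectors, avail_sectors);
}

int main(void)
{
    /* 64 KiB clusters, one cluster allocated, write starts 8 sectors in and
     * covers 100 sectors of the 128-sector cluster. */
    cow_regions(16, 1, 8, 108);
    return 0;
}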
+ * + * Return 0 on success and -errno in error cases + */ +int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset, + int n_start, int n_end, int *num, uint64_t *host_offset, QCowL2Meta **m) +{ + BDRVQcowState *s = bs->opaque; + uint64_t start, remaining; + uint64_t cluster_offset; + uint64_t cur_bytes; + int ret; + + trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, + n_start, n_end); + + assert(n_start * BDRV_SECTOR_SIZE == offset_into_cluster(s, offset)); + offset = start_of_cluster(s, offset); + +again: + start = offset + (n_start << BDRV_SECTOR_BITS); + remaining = (n_end - n_start) << BDRV_SECTOR_BITS; + cluster_offset = 0; + *host_offset = 0; + cur_bytes = 0; + *m = NULL; + + while (true) { + + if (!*host_offset) { + *host_offset = start_of_cluster(s, cluster_offset); + } + + assert(remaining >= cur_bytes); + + start += cur_bytes; + remaining -= cur_bytes; + cluster_offset += cur_bytes; + + if (remaining == 0) { + break; + } + + cur_bytes = remaining; + + /* + * Now start gathering as many contiguous clusters as possible: + * + * 1. Check for overlaps with in-flight allocations + * + * a) Overlap not in the first cluster -> shorten this request and + * let the caller handle the rest in its next loop iteration. + * + * b) Real overlaps of two requests. Yield and restart the search + * for contiguous clusters (the situation could have changed + * while we were sleeping) + * + * c) TODO: Request starts in the same cluster as the in-flight + * allocation ends. Shorten the COW of the in-fight allocation, + * set cluster_offset to write to the same cluster and set up + * the right synchronisation between the in-flight request and + * the new one. + */ + ret = handle_dependencies(bs, start, &cur_bytes, m); + if (ret == -EAGAIN) { + /* Currently handle_dependencies() doesn't yield if we already had + * an allocation. If it did, we would have to clean up the L2Meta + * structs before starting over. */ + assert(*m == NULL); + goto again; + } else if (ret < 0) { + return ret; + } else if (cur_bytes == 0) { + break; + } else { + /* handle_dependencies() may have decreased cur_bytes (shortened + * the allocations below) so that the next dependency is processed + * correctly during the next loop iteration. */ + } + + /* + * 2. Count contiguous COPIED clusters. + */ + ret = handle_copied(bs, start, &cluster_offset, &cur_bytes, m); + if (ret < 0) { + return ret; + } else if (ret) { + continue; + } else if (cur_bytes == 0) { + break; + } + + /* + * 3. If the request still hasn't completed, allocate new clusters, + * considering any cluster_offset of steps 1c or 2. 
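The loop shown above gathers a request piece by piece: check dependencies, reuse already-COPIED clusters, then allocate the rest. The toy program below only mirrors that control flow; the two step functions are hypothetical stand-ins that cap the request at arbitrary boundaries and have nothing to do with the real handlers.

#include <stdint.h>
#include <stdio.h>

static uint64_t step_copied(uint64_t start, uint64_t bytes)
{
    /* pretend the first 4096 bytes are already allocated and COPIED */
    return start < 4096 ? (bytes < 4096 - start ? bytes : 4096 - start) : 0;
}

static uint64_t step_alloc(uint64_t start, uint64_t bytes)
{
    (void)start;
    return bytes;            /* pretend we can always allocate the rest */
}

int main(void)
{
    uint64_t start = 0, remaining = 10000;

    while (remaining > 0) {
        uint64_t cur = step_copied(start, remaining);   /* step 2: reuse COPIED */
        if (cur == 0) {
            cur = step_alloc(start, remaining);         /* step 3: allocate new */
        }
        printf("covered [%llu, %llu)\n",
               (unsigned long long)start, (unsigned long long)(start + cur));
        start += cur;
        remaining -= cur;
    }
    return 0;
}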
+ */ + ret = handle_alloc(bs, start, &cluster_offset, &cur_bytes, m); + if (ret < 0) { + return ret; + } else if (ret) { + continue; + } else { + assert(cur_bytes == 0); + break; + } + } + + *num = (n_end - n_start) - (remaining >> BDRV_SECTOR_BITS); + assert(*num > 0); + assert(*host_offset != 0); + + return 0; +} + +static int decompress_buffer(uint8_t *out_buf, int out_buf_size, + const uint8_t *buf, int buf_size) +{ + z_stream strm1, *strm = &strm1; + int ret, out_len; + + memset(strm, 0, sizeof(*strm)); + + strm->next_in = (uint8_t *)buf; + strm->avail_in = buf_size; + strm->next_out = out_buf; + strm->avail_out = out_buf_size; + + ret = inflateInit2(strm, -12); + if (ret != Z_OK) + return -1; + ret = inflate(strm, Z_FINISH); + out_len = strm->next_out - out_buf; + if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) || + out_len != out_buf_size) { + inflateEnd(strm); + return -1; + } + inflateEnd(strm); + return 0; +} + +int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) +{ + BDRVQcowState *s = bs->opaque; + int ret, csize, nb_csectors, sector_offset; + uint64_t coffset; + + coffset = cluster_offset & s->cluster_offset_mask; + if (s->cluster_cache_offset != coffset) { + nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1; + sector_offset = coffset & 511; + csize = nb_csectors * 512 - sector_offset; + BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED); + ret = bdrv_read(bs->file, coffset >> 9, s->cluster_data, nb_csectors); + if (ret < 0) { + return ret; + } + if (decompress_buffer(s->cluster_cache, s->cluster_size, + s->cluster_data + sector_offset, csize) < 0) { + return -EIO; + } + s->cluster_cache_offset = coffset; + } + return 0; +} + +/* + * This discards as many clusters of nb_clusters as possible at once (i.e. + * all clusters in the same L2 table) and returns the number of discarded + * clusters. 
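decompress_buffer() above drives zlib in raw-deflate mode: the negative windowBits (-12) means no zlib header and a 4 KiB window. A minimal standalone round-trip in the same mode is sketched below; it only demonstrates the zlib calls, not the qcow2 on-disk layout.

#include <string.h>
#include <stdio.h>
#include <zlib.h>

/* Build with: cc demo.c -lz */
int main(void)
{
    const char msg[] = "qcow2 compressed cluster demo qcow2 compressed cluster demo";
    unsigned char packed[256], unpacked[256];
    z_stream c = {0}, d = {0};

    /* raw deflate, 4 KiB window, matching the -12 used above */
    deflateInit2(&c, Z_DEFAULT_COMPRESSION, Z_DEFLATED, -12, 9, Z_DEFAULT_STRATEGY);
    c.next_in = (unsigned char *)msg;
    c.avail_in = sizeof(msg);
    c.next_out = packed;
    c.avail_out = sizeof(packed);
    deflate(&c, Z_FINISH);
    deflateEnd(&c);

    inflateInit2(&d, -12);
    d.next_in = packed;
    d.avail_in = sizeof(packed) - c.avail_out;   /* bytes actually produced */
    d.next_out = unpacked;
    d.avail_out = sizeof(unpacked);
    inflate(&d, Z_FINISH);
    inflateEnd(&d);

    printf("%s\n", (char *)unpacked);
    return 0;
}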
+ */ +static int discard_single_l2(BlockDriverState *bs, uint64_t offset, + unsigned int nb_clusters) +{ + BDRVQcowState *s = bs->opaque; + uint64_t *l2_table; + int l2_index; + int ret; + int i; + + ret = get_cluster_table(bs, offset, &l2_table, &l2_index); + if (ret < 0) { + return ret; + } + + /* Limit nb_clusters to one L2 table */ + nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); + + for (i = 0; i < nb_clusters; i++) { + uint64_t old_offset; + + old_offset = be64_to_cpu(l2_table[l2_index + i]); + if ((old_offset & L2E_OFFSET_MASK) == 0) { + continue; + } + + /* First remove L2 entries */ + qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + l2_table[l2_index + i] = cpu_to_be64(0); + + /* Then decrease the refcount */ + qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST); + } + + ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); + if (ret < 0) { + return ret; + } + + return nb_clusters; +} + +int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset, + int nb_sectors) +{ + BDRVQcowState *s = bs->opaque; + uint64_t end_offset; + unsigned int nb_clusters; + int ret; + + end_offset = offset + (nb_sectors << BDRV_SECTOR_BITS); + + /* Round start up and end down */ + offset = align_offset(offset, s->cluster_size); + end_offset &= ~(s->cluster_size - 1); + + if (offset > end_offset) { + return 0; + } + + nb_clusters = size_to_clusters(s, end_offset - offset); + + s->cache_discards = true; + + /* Each L2 table is handled by its own loop iteration */ + while (nb_clusters > 0) { + ret = discard_single_l2(bs, offset, nb_clusters); + if (ret < 0) { + goto fail; + } + + nb_clusters -= ret; + offset += (ret * s->cluster_size); + } + + ret = 0; +fail: + s->cache_discards = false; + qcow2_process_discards(bs, ret); + + return ret; +} + +/* + * This zeroes as many clusters of nb_clusters as possible at once (i.e. + * all clusters in the same L2 table) and returns the number of zeroed + * clusters. 
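qcow2_discard_clusters() above only ever drops whole clusters: the start offset is rounded up and the end offset rounded down to cluster boundaries, so a request smaller than one cluster does nothing. A standalone sketch of that rounding, with align_offset() assumed to round up to a power-of-two boundary, is:

#include <stdint.h>
#include <stdio.h>

static uint64_t align_offset(uint64_t offset, uint64_t n)
{
    return (offset + n - 1) & ~(n - 1);          /* round up, n a power of two */
}

int main(void)
{
    uint64_t cluster_size = 65536;               /* 64 KiB clusters */
    uint64_t offset = 100000, end = 250000;      /* byte range asked to discard */

    uint64_t start_rounded = align_offset(offset, cluster_size);   /* round up */
    uint64_t end_rounded = end & ~(cluster_size - 1);              /* round down */

    if (start_rounded >= end_rounded) {
        printf("nothing to discard\n");          /* request smaller than a cluster */
    } else {
        printf("discarding clusters in [%llu, %llu)\n",
               (unsigned long long)start_rounded,
               (unsigned long long)end_rounded);
    }
    return 0;
}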
+ */ +static int zero_single_l2(BlockDriverState *bs, uint64_t offset, + unsigned int nb_clusters) +{ + BDRVQcowState *s = bs->opaque; + uint64_t *l2_table; + int l2_index; + int ret; + int i; + + ret = get_cluster_table(bs, offset, &l2_table, &l2_index); + if (ret < 0) { + return ret; + } + + /* Limit nb_clusters to one L2 table */ + nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); + + for (i = 0; i < nb_clusters; i++) { + uint64_t old_offset; + + old_offset = be64_to_cpu(l2_table[l2_index + i]); + + /* Update L2 entries */ + qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + if (old_offset & QCOW_OFLAG_COMPRESSED) { + l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO); + qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST); + } else { + l2_table[l2_index + i] |= cpu_to_be64(QCOW_OFLAG_ZERO); + } + } + + ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); + if (ret < 0) { + return ret; + } + + return nb_clusters; +} + +int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors) +{ + BDRVQcowState *s = bs->opaque; + unsigned int nb_clusters; + int ret; + + /* The zero flag is only supported by version 3 and newer */ + if (s->qcow_version < 3) { + return -ENOTSUP; + } + + /* Each L2 table is handled by its own loop iteration */ + nb_clusters = size_to_clusters(s, nb_sectors << BDRV_SECTOR_BITS); + + s->cache_discards = true; + + while (nb_clusters > 0) { + ret = zero_single_l2(bs, offset, nb_clusters); + if (ret < 0) { + goto fail; + } + + nb_clusters -= ret; + offset += (ret * s->cluster_size); + } + + ret = 0; +fail: + s->cache_discards = false; + qcow2_process_discards(bs, ret); + + return ret; +} diff --git a/contrib/qemu/block/qcow2-refcount.c b/contrib/qemu/block/qcow2-refcount.c new file mode 100644 index 00000000000..1244693f39e --- /dev/null +++ b/contrib/qemu/block/qcow2-refcount.c @@ -0,0 +1,1374 @@ +/* + * Block driver for the QCOW version 2 format + * + * Copyright (c) 2004-2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "qemu-common.h" +#include "block/block_int.h" +#include "block/qcow2.h" + +static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size); +static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, + int64_t offset, int64_t length, + int addend, enum qcow2_discard_type type); + + +/*********************************************************/ +/* refcount handling */ + +int qcow2_refcount_init(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + int ret, refcount_table_size2, i; + + refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t); + s->refcount_table = g_malloc(refcount_table_size2); + if (s->refcount_table_size > 0) { + BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD); + ret = bdrv_pread(bs->file, s->refcount_table_offset, + s->refcount_table, refcount_table_size2); + if (ret != refcount_table_size2) + goto fail; + for(i = 0; i < s->refcount_table_size; i++) + be64_to_cpus(&s->refcount_table[i]); + } + return 0; + fail: + return -ENOMEM; +} + +void qcow2_refcount_close(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + g_free(s->refcount_table); +} + + +static int load_refcount_block(BlockDriverState *bs, + int64_t refcount_block_offset, + void **refcount_block) +{ + BDRVQcowState *s = bs->opaque; + int ret; + + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD); + ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset, + refcount_block); + + return ret; +} + +/* + * Returns the refcount of the cluster given by its index. Any non-negative + * return value is the refcount of the cluster, negative values are -errno + * and indicate an error. + */ +static int get_refcount(BlockDriverState *bs, int64_t cluster_index) +{ + BDRVQcowState *s = bs->opaque; + int refcount_table_index, block_index; + int64_t refcount_block_offset; + int ret; + uint16_t *refcount_block; + uint16_t refcount; + + refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT); + if (refcount_table_index >= s->refcount_table_size) + return 0; + refcount_block_offset = s->refcount_table[refcount_table_index]; + if (!refcount_block_offset) + return 0; + + ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset, + (void**) &refcount_block); + if (ret < 0) { + return ret; + } + + block_index = cluster_index & + ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1); + refcount = be16_to_cpu(refcount_block[block_index]); + + ret = qcow2_cache_put(bs, s->refcount_block_cache, + (void**) &refcount_block); + if (ret < 0) { + return ret; + } + + return refcount; +} + +/* + * Rounds the refcount table size up to avoid growing the table for each single + * refcount block that is allocated. + */ +static unsigned int next_refcount_table_size(BDRVQcowState *s, + unsigned int min_size) +{ + unsigned int min_clusters = (min_size >> (s->cluster_bits - 3)) + 1; + unsigned int refcount_table_clusters = + MAX(1, s->refcount_table_size >> (s->cluster_bits - 3)); + + while (min_clusters > refcount_table_clusters) { + refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2; + } + + return refcount_table_clusters << (s->cluster_bits - 3); +} + + +/* Checks if two offsets are described by the same refcount block */ +static int in_same_refcount_block(BDRVQcowState *s, uint64_t offset_a, + uint64_t offset_b) +{ + uint64_t block_a = offset_a >> (2 * s->cluster_bits - REFCOUNT_SHIFT); + uint64_t block_b = offset_b >> (2 * s->cluster_bits - REFCOUNT_SHIFT); + + return (block_a == block_b); +} + +/* + * Loads a refcount block. 
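get_refcount() above splits a cluster index into a refcount-table index and a slot inside one refcount block. With 16-bit refcount entries (REFCOUNT_SHIFT of 1, assumed from qcow2.h) each block holds cluster_size / 2 entries, so the split is a simple shift and mask:

#include <stdint.h>
#include <stdio.h>

#define REFCOUNT_SHIFT 1   /* 2 bytes per refcount entry (assumed, see qcow2.h) */

int main(void)
{
    int cluster_bits = 16;                       /* 64 KiB clusters */
    uint64_t cluster_index = 1000000;

    int entries_bits = cluster_bits - REFCOUNT_SHIFT;   /* log2(entries per block) */

    uint64_t table_index = cluster_index >> entries_bits;
    uint64_t block_index = cluster_index & ((1ULL << entries_bits) - 1);

    printf("refcount table entry %llu, slot %llu inside that block\n",
           (unsigned long long)table_index, (unsigned long long)block_index);
    return 0;
}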
If it doesn't exist yet, it is allocated first + * (including growing the refcount table if needed). + * + * Returns 0 on success or -errno in error case + */ +static int alloc_refcount_block(BlockDriverState *bs, + int64_t cluster_index, uint16_t **refcount_block) +{ + BDRVQcowState *s = bs->opaque; + unsigned int refcount_table_index; + int ret; + + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC); + + /* Find the refcount block for the given cluster */ + refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT); + + if (refcount_table_index < s->refcount_table_size) { + + uint64_t refcount_block_offset = + s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK; + + /* If it's already there, we're done */ + if (refcount_block_offset) { + return load_refcount_block(bs, refcount_block_offset, + (void**) refcount_block); + } + } + + /* + * If we came here, we need to allocate something. Something is at least + * a cluster for the new refcount block. It may also include a new refcount + * table if the old refcount table is too small. + * + * Note that allocating clusters here needs some special care: + * + * - We can't use the normal qcow2_alloc_clusters(), it would try to + * increase the refcount and very likely we would end up with an endless + * recursion. Instead we must place the refcount blocks in a way that + * they can describe them themselves. + * + * - We need to consider that at this point we are inside update_refcounts + * and doing the initial refcount increase. This means that some clusters + * have already been allocated by the caller, but their refcount isn't + * accurate yet. free_cluster_index tells us where this allocation ends + * as long as we don't overwrite it by freeing clusters. + * + * - alloc_clusters_noref and qcow2_free_clusters may load a different + * refcount block into the cache + */ + + *refcount_block = NULL; + + /* We write to the refcount table, so we might depend on L2 tables */ + ret = qcow2_cache_flush(bs, s->l2_table_cache); + if (ret < 0) { + return ret; + } + + /* Allocate the refcount block itself and mark it as used */ + int64_t new_block = alloc_clusters_noref(bs, s->cluster_size); + if (new_block < 0) { + return new_block; + } + +#ifdef DEBUG_ALLOC2 + fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64 + " at %" PRIx64 "\n", + refcount_table_index, cluster_index << s->cluster_bits, new_block); +#endif + + if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) { + /* Zero the new refcount block before updating it */ + ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block, + (void**) refcount_block); + if (ret < 0) { + goto fail_block; + } + + memset(*refcount_block, 0, s->cluster_size); + + /* The block describes itself, need to update the cache */ + int block_index = (new_block >> s->cluster_bits) & + ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1); + (*refcount_block)[block_index] = cpu_to_be16(1); + } else { + /* Described somewhere else. This can recurse at most twice before we + * arrive at a block that describes itself. 
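Whether the freshly allocated refcount block can describe itself depends on whether its own host offset falls into the range of clusters it covers; with 16-bit entries one block covers 2^(2 * cluster_bits - 1) bytes of image. A sketch of the comparison made by in_same_refcount_block() above, under that assumption:

#include <stdint.h>
#include <stdbool.h>

#define REFCOUNT_SHIFT 1   /* 16-bit refcount entries assumed */

/* Two host offsets are covered by the same refcount block iff they agree in
 * everything above the (2 * cluster_bits - REFCOUNT_SHIFT) low bits. */
static bool same_refcount_block(int cluster_bits, uint64_t a, uint64_t b)
{
    int shift = 2 * cluster_bits - REFCOUNT_SHIFT;
    return (a >> shift) == (b >> shift);
}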
*/ + ret = update_refcount(bs, new_block, s->cluster_size, 1, + QCOW2_DISCARD_NEVER); + if (ret < 0) { + goto fail_block; + } + + ret = qcow2_cache_flush(bs, s->refcount_block_cache); + if (ret < 0) { + goto fail_block; + } + + /* Initialize the new refcount block only after updating its refcount, + * update_refcount uses the refcount cache itself */ + ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block, + (void**) refcount_block); + if (ret < 0) { + goto fail_block; + } + + memset(*refcount_block, 0, s->cluster_size); + } + + /* Now the new refcount block needs to be written to disk */ + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE); + qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block); + ret = qcow2_cache_flush(bs, s->refcount_block_cache); + if (ret < 0) { + goto fail_block; + } + + /* If the refcount table is big enough, just hook the block up there */ + if (refcount_table_index < s->refcount_table_size) { + uint64_t data64 = cpu_to_be64(new_block); + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP); + ret = bdrv_pwrite_sync(bs->file, + s->refcount_table_offset + refcount_table_index * sizeof(uint64_t), + &data64, sizeof(data64)); + if (ret < 0) { + goto fail_block; + } + + s->refcount_table[refcount_table_index] = new_block; + return 0; + } + + ret = qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block); + if (ret < 0) { + goto fail_block; + } + + /* + * If we come here, we need to grow the refcount table. Again, a new + * refcount table needs some space and we can't simply allocate to avoid + * endless recursion. + * + * Therefore let's grab new refcount blocks at the end of the image, which + * will describe themselves and the new refcount table. This way we can + * reference them only in the new table and do the switch to the new + * refcount table at once without producing an inconsistent state in + * between. 
+ */ + BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW); + + /* Calculate the number of refcount blocks needed so far */ + uint64_t refcount_block_clusters = 1 << (s->cluster_bits - REFCOUNT_SHIFT); + uint64_t blocks_used = (s->free_cluster_index + + refcount_block_clusters - 1) / refcount_block_clusters; + + /* And now we need at least one block more for the new metadata */ + uint64_t table_size = next_refcount_table_size(s, blocks_used + 1); + uint64_t last_table_size; + uint64_t blocks_clusters; + do { + uint64_t table_clusters = + size_to_clusters(s, table_size * sizeof(uint64_t)); + blocks_clusters = 1 + + ((table_clusters + refcount_block_clusters - 1) + / refcount_block_clusters); + uint64_t meta_clusters = table_clusters + blocks_clusters; + + last_table_size = table_size; + table_size = next_refcount_table_size(s, blocks_used + + ((meta_clusters + refcount_block_clusters - 1) + / refcount_block_clusters)); + + } while (last_table_size != table_size); + +#ifdef DEBUG_ALLOC2 + fprintf(stderr, "qcow2: Grow refcount table %" PRId32 " => %" PRId64 "\n", + s->refcount_table_size, table_size); +#endif + + /* Create the new refcount table and blocks */ + uint64_t meta_offset = (blocks_used * refcount_block_clusters) * + s->cluster_size; + uint64_t table_offset = meta_offset + blocks_clusters * s->cluster_size; + uint16_t *new_blocks = g_malloc0(blocks_clusters * s->cluster_size); + uint64_t *new_table = g_malloc0(table_size * sizeof(uint64_t)); + + assert(meta_offset >= (s->free_cluster_index * s->cluster_size)); + + /* Fill the new refcount table */ + memcpy(new_table, s->refcount_table, + s->refcount_table_size * sizeof(uint64_t)); + new_table[refcount_table_index] = new_block; + + int i; + for (i = 0; i < blocks_clusters; i++) { + new_table[blocks_used + i] = meta_offset + (i * s->cluster_size); + } + + /* Fill the refcount blocks */ + uint64_t table_clusters = size_to_clusters(s, table_size * sizeof(uint64_t)); + int block = 0; + for (i = 0; i < table_clusters + blocks_clusters; i++) { + new_blocks[block++] = cpu_to_be16(1); + } + + /* Write refcount blocks to disk */ + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS); + ret = bdrv_pwrite_sync(bs->file, meta_offset, new_blocks, + blocks_clusters * s->cluster_size); + g_free(new_blocks); + if (ret < 0) { + goto fail_table; + } + + /* Write refcount table to disk */ + for(i = 0; i < table_size; i++) { + cpu_to_be64s(&new_table[i]); + } + + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE); + ret = bdrv_pwrite_sync(bs->file, table_offset, new_table, + table_size * sizeof(uint64_t)); + if (ret < 0) { + goto fail_table; + } + + for(i = 0; i < table_size; i++) { + be64_to_cpus(&new_table[i]); + } + + /* Hook up the new refcount table in the qcow2 header */ + uint8_t data[12]; + cpu_to_be64w((uint64_t*)data, table_offset); + cpu_to_be32w((uint32_t*)(data + 8), table_clusters); + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE); + ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, refcount_table_offset), + data, sizeof(data)); + if (ret < 0) { + goto fail_table; + } + + /* And switch it in memory */ + uint64_t old_table_offset = s->refcount_table_offset; + uint64_t old_table_size = s->refcount_table_size; + + g_free(s->refcount_table); + s->refcount_table = new_table; + s->refcount_table_size = table_size; + s->refcount_table_offset = table_offset; + + /* Free old table. 
Remember, we must not change free_cluster_index */ + uint64_t old_free_cluster_index = s->free_cluster_index; + qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t), + QCOW2_DISCARD_OTHER); + s->free_cluster_index = old_free_cluster_index; + + ret = load_refcount_block(bs, new_block, (void**) refcount_block); + if (ret < 0) { + return ret; + } + + return 0; + +fail_table: + g_free(new_table); +fail_block: + if (*refcount_block != NULL) { + qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block); + } + return ret; +} + +void qcow2_process_discards(BlockDriverState *bs, int ret) +{ + BDRVQcowState *s = bs->opaque; + Qcow2DiscardRegion *d, *next; + + QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) { + QTAILQ_REMOVE(&s->discards, d, next); + + /* Discard is optional, ignore the return value */ + if (ret >= 0) { + bdrv_discard(bs->file, + d->offset >> BDRV_SECTOR_BITS, + d->bytes >> BDRV_SECTOR_BITS); + } + + g_free(d); + } +} + +static void update_refcount_discard(BlockDriverState *bs, + uint64_t offset, uint64_t length) +{ + BDRVQcowState *s = bs->opaque; + Qcow2DiscardRegion *d, *p, *next; + + QTAILQ_FOREACH(d, &s->discards, next) { + uint64_t new_start = MIN(offset, d->offset); + uint64_t new_end = MAX(offset + length, d->offset + d->bytes); + + if (new_end - new_start <= length + d->bytes) { + /* There can't be any overlap, areas ending up here have no + * references any more and therefore shouldn't get freed another + * time. */ + assert(d->bytes + length == new_end - new_start); + d->offset = new_start; + d->bytes = new_end - new_start; + goto found; + } + } + + d = g_malloc(sizeof(*d)); + *d = (Qcow2DiscardRegion) { + .bs = bs, + .offset = offset, + .bytes = length, + }; + QTAILQ_INSERT_TAIL(&s->discards, d, next); + +found: + /* Merge discard requests if they are adjacent now */ + QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) { + if (p == d + || p->offset > d->offset + d->bytes + || d->offset > p->offset + p->bytes) + { + continue; + } + + /* Still no overlap possible */ + assert(p->offset == d->offset + d->bytes + || d->offset == p->offset + p->bytes); + + QTAILQ_REMOVE(&s->discards, p, next); + d->offset = MIN(d->offset, p->offset); + d->bytes += p->bytes; + } +} + +/* XXX: cache several refcount block clusters ? 
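update_refcount_discard() above first tries to fold the newly freed range into an existing Qcow2DiscardRegion and then merges neighbours that became adjacent. Reduced to plain byte ranges, the extension test looks like the sketch below; overlap is impossible because a cluster is never freed twice, so "mergeable" simply means the combined extent has no gap.

#include <stdint.h>
#include <stdbool.h>

/* Can [offset, offset+length) be folded into [d_off, d_off+d_len)?  If so,
 * return the merged extent through new_off/new_len. */
static bool can_merge(uint64_t offset, uint64_t length,
                      uint64_t d_off, uint64_t d_len,
                      uint64_t *new_off, uint64_t *new_len)
{
    uint64_t start = offset < d_off ? offset : d_off;
    uint64_t end = (offset + length > d_off + d_len) ? offset + length
                                                     : d_off + d_len;

    if (end - start <= length + d_len) {   /* adjacent or touching, no gap */
        *new_off = start;
        *new_len = end - start;
        return true;
    }
    return false;
}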
*/ +static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, + int64_t offset, int64_t length, int addend, enum qcow2_discard_type type) +{ + BDRVQcowState *s = bs->opaque; + int64_t start, last, cluster_offset; + uint16_t *refcount_block = NULL; + int64_t old_table_index = -1; + int ret; + +#ifdef DEBUG_ALLOC2 + fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64 " addend=%d\n", + offset, length, addend); +#endif + if (length < 0) { + return -EINVAL; + } else if (length == 0) { + return 0; + } + + if (addend < 0) { + qcow2_cache_set_dependency(bs, s->refcount_block_cache, + s->l2_table_cache); + } + + start = offset & ~(s->cluster_size - 1); + last = (offset + length - 1) & ~(s->cluster_size - 1); + for(cluster_offset = start; cluster_offset <= last; + cluster_offset += s->cluster_size) + { + int block_index, refcount; + int64_t cluster_index = cluster_offset >> s->cluster_bits; + int64_t table_index = + cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT); + + /* Load the refcount block and allocate it if needed */ + if (table_index != old_table_index) { + if (refcount_block) { + ret = qcow2_cache_put(bs, s->refcount_block_cache, + (void**) &refcount_block); + if (ret < 0) { + goto fail; + } + } + + ret = alloc_refcount_block(bs, cluster_index, &refcount_block); + if (ret < 0) { + goto fail; + } + } + old_table_index = table_index; + + qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block); + + /* we can update the count and save it */ + block_index = cluster_index & + ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1); + + refcount = be16_to_cpu(refcount_block[block_index]); + refcount += addend; + if (refcount < 0 || refcount > 0xffff) { + ret = -EINVAL; + goto fail; + } + if (refcount == 0 && cluster_index < s->free_cluster_index) { + s->free_cluster_index = cluster_index; + } + refcount_block[block_index] = cpu_to_be16(refcount); + + if (refcount == 0 && s->discard_passthrough[type]) { + update_refcount_discard(bs, cluster_offset, s->cluster_size); + } + } + + ret = 0; +fail: + if (!s->cache_discards) { + qcow2_process_discards(bs, ret); + } + + /* Write last changed block to disk */ + if (refcount_block) { + int wret; + wret = qcow2_cache_put(bs, s->refcount_block_cache, + (void**) &refcount_block); + if (wret < 0) { + return ret < 0 ? ret : wret; + } + } + + /* + * Try do undo any updates if an error is returned (This may succeed in + * some cases like ENOSPC for allocating a new refcount block) + */ + if (ret < 0) { + int dummy; + dummy = update_refcount(bs, offset, cluster_offset - offset, -addend, + QCOW2_DISCARD_NEVER); + (void)dummy; + } + + return ret; +} + +/* + * Increases or decreases the refcount of a given cluster by one. + * addend must be 1 or -1. + * + * If the return value is non-negative, it is the new refcount of the cluster. + * If it is negative, it is -errno and indicates an error. 
+ */ +static int update_cluster_refcount(BlockDriverState *bs, + int64_t cluster_index, + int addend, + enum qcow2_discard_type type) +{ + BDRVQcowState *s = bs->opaque; + int ret; + + ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend, + type); + if (ret < 0) { + return ret; + } + + return get_refcount(bs, cluster_index); +} + + + +/*********************************************************/ +/* cluster allocation functions */ + + + +/* return < 0 if error */ +static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size) +{ + BDRVQcowState *s = bs->opaque; + int i, nb_clusters, refcount; + + nb_clusters = size_to_clusters(s, size); +retry: + for(i = 0; i < nb_clusters; i++) { + int64_t next_cluster_index = s->free_cluster_index++; + refcount = get_refcount(bs, next_cluster_index); + + if (refcount < 0) { + return refcount; + } else if (refcount != 0) { + goto retry; + } + } +#ifdef DEBUG_ALLOC2 + fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n", + size, + (s->free_cluster_index - nb_clusters) << s->cluster_bits); +#endif + return (s->free_cluster_index - nb_clusters) << s->cluster_bits; +} + +int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size) +{ + int64_t offset; + int ret; + + BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC); + offset = alloc_clusters_noref(bs, size); + if (offset < 0) { + return offset; + } + + ret = update_refcount(bs, offset, size, 1, QCOW2_DISCARD_NEVER); + if (ret < 0) { + return ret; + } + + return offset; +} + +int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, + int nb_clusters) +{ + BDRVQcowState *s = bs->opaque; + uint64_t cluster_index; + uint64_t old_free_cluster_index; + int i, refcount, ret; + + /* Check how many clusters there are free */ + cluster_index = offset >> s->cluster_bits; + for(i = 0; i < nb_clusters; i++) { + refcount = get_refcount(bs, cluster_index++); + + if (refcount < 0) { + return refcount; + } else if (refcount != 0) { + break; + } + } + + /* And then allocate them */ + old_free_cluster_index = s->free_cluster_index; + s->free_cluster_index = cluster_index + i; + + ret = update_refcount(bs, offset, i << s->cluster_bits, 1, + QCOW2_DISCARD_NEVER); + if (ret < 0) { + return ret; + } + + s->free_cluster_index = old_free_cluster_index; + + return i; +} + +/* only used to allocate compressed sectors. We try to allocate + contiguous sectors. 
size must be <= cluster_size */ +int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size) +{ + BDRVQcowState *s = bs->opaque; + int64_t offset, cluster_offset; + int free_in_cluster; + + BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES); + assert(size > 0 && size <= s->cluster_size); + if (s->free_byte_offset == 0) { + offset = qcow2_alloc_clusters(bs, s->cluster_size); + if (offset < 0) { + return offset; + } + s->free_byte_offset = offset; + } + redo: + free_in_cluster = s->cluster_size - + (s->free_byte_offset & (s->cluster_size - 1)); + if (size <= free_in_cluster) { + /* enough space in current cluster */ + offset = s->free_byte_offset; + s->free_byte_offset += size; + free_in_cluster -= size; + if (free_in_cluster == 0) + s->free_byte_offset = 0; + if ((offset & (s->cluster_size - 1)) != 0) + update_cluster_refcount(bs, offset >> s->cluster_bits, 1, + QCOW2_DISCARD_NEVER); + } else { + offset = qcow2_alloc_clusters(bs, s->cluster_size); + if (offset < 0) { + return offset; + } + cluster_offset = s->free_byte_offset & ~(s->cluster_size - 1); + if ((cluster_offset + s->cluster_size) == offset) { + /* we are lucky: contiguous data */ + offset = s->free_byte_offset; + update_cluster_refcount(bs, offset >> s->cluster_bits, 1, + QCOW2_DISCARD_NEVER); + s->free_byte_offset += size; + } else { + s->free_byte_offset = offset; + goto redo; + } + } + + /* The cluster refcount was incremented, either by qcow2_alloc_clusters() + * or explicitly by update_cluster_refcount(). Refcount blocks must be + * flushed before the caller's L2 table updates. + */ + qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache); + return offset; +} + +void qcow2_free_clusters(BlockDriverState *bs, + int64_t offset, int64_t size, + enum qcow2_discard_type type) +{ + int ret; + + BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE); + ret = update_refcount(bs, offset, size, -1, type); + if (ret < 0) { + fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret)); + /* TODO Remember the clusters to free them later and avoid leaking */ + } +} + +/* + * Free a cluster using its L2 entry (handles clusters of all types, e.g. + * normal cluster, compressed cluster, etc.) 
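qcow2_alloc_bytes() above hands out sub-cluster space for compressed data starting at free_byte_offset; the room left before the next cluster boundary is just the complement of the intra-cluster offset. A one-line helper capturing that, assuming cluster_size is a power of two:

#include <stdint.h>

/* Bytes remaining in the cluster that free_byte_offset points into. */
static uint64_t free_in_cluster(uint64_t free_byte_offset, uint64_t cluster_size)
{
    return cluster_size - (free_byte_offset & (cluster_size - 1));
}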
+ */ +void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry, + int nb_clusters, enum qcow2_discard_type type) +{ + BDRVQcowState *s = bs->opaque; + + switch (qcow2_get_cluster_type(l2_entry)) { + case QCOW2_CLUSTER_COMPRESSED: + { + int nb_csectors; + nb_csectors = ((l2_entry >> s->csize_shift) & + s->csize_mask) + 1; + qcow2_free_clusters(bs, + (l2_entry & s->cluster_offset_mask) & ~511, + nb_csectors * 512, type); + } + break; + case QCOW2_CLUSTER_NORMAL: + qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK, + nb_clusters << s->cluster_bits, type); + break; + case QCOW2_CLUSTER_UNALLOCATED: + case QCOW2_CLUSTER_ZERO: + break; + default: + abort(); + } +} + + + +/*********************************************************/ +/* snapshots and image creation */ + + + +/* update the refcounts of snapshots and the copied flag */ +int qcow2_update_snapshot_refcount(BlockDriverState *bs, + int64_t l1_table_offset, int l1_size, int addend) +{ + BDRVQcowState *s = bs->opaque; + uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, l1_allocated; + int64_t old_offset, old_l2_offset; + int i, j, l1_modified = 0, nb_csectors, refcount; + int ret; + + l2_table = NULL; + l1_table = NULL; + l1_size2 = l1_size * sizeof(uint64_t); + + s->cache_discards = true; + + /* WARNING: qcow2_snapshot_goto relies on this function not using the + * l1_table_offset when it is the current s->l1_table_offset! Be careful + * when changing this! */ + if (l1_table_offset != s->l1_table_offset) { + l1_table = g_malloc0(align_offset(l1_size2, 512)); + l1_allocated = 1; + + ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2); + if (ret < 0) { + goto fail; + } + + for(i = 0;i < l1_size; i++) + be64_to_cpus(&l1_table[i]); + } else { + assert(l1_size == s->l1_size); + l1_table = s->l1_table; + l1_allocated = 0; + } + + for(i = 0; i < l1_size; i++) { + l2_offset = l1_table[i]; + if (l2_offset) { + old_l2_offset = l2_offset; + l2_offset &= L1E_OFFSET_MASK; + + ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, + (void**) &l2_table); + if (ret < 0) { + goto fail; + } + + for(j = 0; j < s->l2_size; j++) { + offset = be64_to_cpu(l2_table[j]); + if (offset != 0) { + old_offset = offset; + offset &= ~QCOW_OFLAG_COPIED; + if (offset & QCOW_OFLAG_COMPRESSED) { + nb_csectors = ((offset >> s->csize_shift) & + s->csize_mask) + 1; + if (addend != 0) { + int ret; + ret = update_refcount(bs, + (offset & s->cluster_offset_mask) & ~511, + nb_csectors * 512, addend, + QCOW2_DISCARD_SNAPSHOT); + if (ret < 0) { + goto fail; + } + } + /* compressed clusters are never modified */ + refcount = 2; + } else { + uint64_t cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits; + if (addend != 0) { + refcount = update_cluster_refcount(bs, cluster_index, addend, + QCOW2_DISCARD_SNAPSHOT); + } else { + refcount = get_refcount(bs, cluster_index); + } + + if (refcount < 0) { + ret = refcount; + goto fail; + } + } + + if (refcount == 1) { + offset |= QCOW_OFLAG_COPIED; + } + if (offset != old_offset) { + if (addend > 0) { + qcow2_cache_set_dependency(bs, s->l2_table_cache, + s->refcount_block_cache); + } + l2_table[j] = cpu_to_be64(offset); + qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + } + } + } + + ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); + if (ret < 0) { + goto fail; + } + + + if (addend != 0) { + refcount = update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend, + QCOW2_DISCARD_SNAPSHOT); + } else { + refcount = get_refcount(bs, l2_offset >> s->cluster_bits); + } + 
if (refcount < 0) { + ret = refcount; + goto fail; + } else if (refcount == 1) { + l2_offset |= QCOW_OFLAG_COPIED; + } + if (l2_offset != old_l2_offset) { + l1_table[i] = l2_offset; + l1_modified = 1; + } + } + } + + ret = bdrv_flush(bs); +fail: + if (l2_table) { + qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); + } + + s->cache_discards = false; + qcow2_process_discards(bs, ret); + + /* Update L1 only if it isn't deleted anyway (addend = -1) */ + if (ret == 0 && addend >= 0 && l1_modified) { + for (i = 0; i < l1_size; i++) { + cpu_to_be64s(&l1_table[i]); + } + + ret = bdrv_pwrite_sync(bs->file, l1_table_offset, l1_table, l1_size2); + + for (i = 0; i < l1_size; i++) { + be64_to_cpus(&l1_table[i]); + } + } + if (l1_allocated) + g_free(l1_table); + return ret; +} + + + + +/*********************************************************/ +/* refcount checking functions */ + + + +/* + * Increases the refcount for a range of clusters in a given refcount table. + * This is used to construct a temporary refcount table out of L1 and L2 tables + * which can be compared the the refcount table saved in the image. + * + * Modifies the number of errors in res. + */ +static void inc_refcounts(BlockDriverState *bs, + BdrvCheckResult *res, + uint16_t *refcount_table, + int refcount_table_size, + int64_t offset, int64_t size) +{ + BDRVQcowState *s = bs->opaque; + int64_t start, last, cluster_offset; + int k; + + if (size <= 0) + return; + + start = offset & ~(s->cluster_size - 1); + last = (offset + size - 1) & ~(s->cluster_size - 1); + for(cluster_offset = start; cluster_offset <= last; + cluster_offset += s->cluster_size) { + k = cluster_offset >> s->cluster_bits; + if (k < 0) { + fprintf(stderr, "ERROR: invalid cluster offset=0x%" PRIx64 "\n", + cluster_offset); + res->corruptions++; + } else if (k >= refcount_table_size) { + fprintf(stderr, "Warning: cluster offset=0x%" PRIx64 " is after " + "the end of the image file, can't properly check refcounts.\n", + cluster_offset); + res->check_errors++; + } else { + if (++refcount_table[k] == 0) { + fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64 + "\n", cluster_offset); + res->corruptions++; + } + } + } +} + +/* Flags for check_refcounts_l1() and check_refcounts_l2() */ +enum { + CHECK_OFLAG_COPIED = 0x1, /* check QCOW_OFLAG_COPIED matches refcount */ + CHECK_FRAG_INFO = 0x2, /* update BlockFragInfo counters */ +}; + +/* + * Increases the refcount in the given refcount table for the all clusters + * referenced in the L2 table. While doing so, performs some checks on L2 + * entries. + * + * Returns the number of errors found by the checks or -errno if an internal + * error occurred. 
+ */ +static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, + uint16_t *refcount_table, int refcount_table_size, int64_t l2_offset, + int flags) +{ + BDRVQcowState *s = bs->opaque; + uint64_t *l2_table, l2_entry; + uint64_t next_contiguous_offset = 0; + int i, l2_size, nb_csectors, refcount; + + /* Read L2 table from disk */ + l2_size = s->l2_size * sizeof(uint64_t); + l2_table = g_malloc(l2_size); + + if (bdrv_pread(bs->file, l2_offset, l2_table, l2_size) != l2_size) + goto fail; + + /* Do the actual checks */ + for(i = 0; i < s->l2_size; i++) { + l2_entry = be64_to_cpu(l2_table[i]); + + switch (qcow2_get_cluster_type(l2_entry)) { + case QCOW2_CLUSTER_COMPRESSED: + /* Compressed clusters don't have QCOW_OFLAG_COPIED */ + if (l2_entry & QCOW_OFLAG_COPIED) { + fprintf(stderr, "ERROR: cluster %" PRId64 ": " + "copied flag must never be set for compressed " + "clusters\n", l2_entry >> s->cluster_bits); + l2_entry &= ~QCOW_OFLAG_COPIED; + res->corruptions++; + } + + /* Mark cluster as used */ + nb_csectors = ((l2_entry >> s->csize_shift) & + s->csize_mask) + 1; + l2_entry &= s->cluster_offset_mask; + inc_refcounts(bs, res, refcount_table, refcount_table_size, + l2_entry & ~511, nb_csectors * 512); + + if (flags & CHECK_FRAG_INFO) { + res->bfi.allocated_clusters++; + res->bfi.compressed_clusters++; + + /* Compressed clusters are fragmented by nature. Since they + * take up sub-sector space but we only have sector granularity + * I/O we need to re-read the same sectors even for adjacent + * compressed clusters. + */ + res->bfi.fragmented_clusters++; + } + break; + + case QCOW2_CLUSTER_ZERO: + if ((l2_entry & L2E_OFFSET_MASK) == 0) { + break; + } + /* fall through */ + + case QCOW2_CLUSTER_NORMAL: + { + /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */ + uint64_t offset = l2_entry & L2E_OFFSET_MASK; + + if (flags & CHECK_OFLAG_COPIED) { + refcount = get_refcount(bs, offset >> s->cluster_bits); + if (refcount < 0) { + fprintf(stderr, "Can't get refcount for offset %" + PRIx64 ": %s\n", l2_entry, strerror(-refcount)); + goto fail; + } + if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) { + fprintf(stderr, "ERROR OFLAG_COPIED: offset=%" + PRIx64 " refcount=%d\n", l2_entry, refcount); + res->corruptions++; + } + } + + if (flags & CHECK_FRAG_INFO) { + res->bfi.allocated_clusters++; + if (next_contiguous_offset && + offset != next_contiguous_offset) { + res->bfi.fragmented_clusters++; + } + next_contiguous_offset = offset + s->cluster_size; + } + + /* Mark cluster as used */ + inc_refcounts(bs, res, refcount_table,refcount_table_size, + offset, s->cluster_size); + + /* Correct offsets are cluster aligned */ + if (offset & (s->cluster_size - 1)) { + fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not " + "properly aligned; L2 entry corrupted.\n", offset); + res->corruptions++; + } + break; + } + + case QCOW2_CLUSTER_UNALLOCATED: + break; + + default: + abort(); + } + } + + g_free(l2_table); + return 0; + +fail: + fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n"); + g_free(l2_table); + return -EIO; +} + +/* + * Increases the refcount for the L1 table, its L2 tables and all referenced + * clusters in the given refcount table. While doing so, performs some checks + * on L1 and L2 entries. + * + * Returns the number of errors found by the checks or -errno if an internal + * error occurred. 
+ */ +static int check_refcounts_l1(BlockDriverState *bs, + BdrvCheckResult *res, + uint16_t *refcount_table, + int refcount_table_size, + int64_t l1_table_offset, int l1_size, + int flags) +{ + BDRVQcowState *s = bs->opaque; + uint64_t *l1_table, l2_offset, l1_size2; + int i, refcount, ret; + + l1_size2 = l1_size * sizeof(uint64_t); + + /* Mark L1 table as used */ + inc_refcounts(bs, res, refcount_table, refcount_table_size, + l1_table_offset, l1_size2); + + /* Read L1 table entries from disk */ + if (l1_size2 == 0) { + l1_table = NULL; + } else { + l1_table = g_malloc(l1_size2); + if (bdrv_pread(bs->file, l1_table_offset, + l1_table, l1_size2) != l1_size2) + goto fail; + for(i = 0;i < l1_size; i++) + be64_to_cpus(&l1_table[i]); + } + + /* Do the actual checks */ + for(i = 0; i < l1_size; i++) { + l2_offset = l1_table[i]; + if (l2_offset) { + /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */ + if (flags & CHECK_OFLAG_COPIED) { + refcount = get_refcount(bs, (l2_offset & ~QCOW_OFLAG_COPIED) + >> s->cluster_bits); + if (refcount < 0) { + fprintf(stderr, "Can't get refcount for l2_offset %" + PRIx64 ": %s\n", l2_offset, strerror(-refcount)); + goto fail; + } + if ((refcount == 1) != ((l2_offset & QCOW_OFLAG_COPIED) != 0)) { + fprintf(stderr, "ERROR OFLAG_COPIED: l2_offset=%" PRIx64 + " refcount=%d\n", l2_offset, refcount); + res->corruptions++; + } + } + + /* Mark L2 table as used */ + l2_offset &= L1E_OFFSET_MASK; + inc_refcounts(bs, res, refcount_table, refcount_table_size, + l2_offset, s->cluster_size); + + /* L2 tables are cluster aligned */ + if (l2_offset & (s->cluster_size - 1)) { + fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not " + "cluster aligned; L1 entry corrupted\n", l2_offset); + res->corruptions++; + } + + /* Process and check L2 entries */ + ret = check_refcounts_l2(bs, res, refcount_table, + refcount_table_size, l2_offset, flags); + if (ret < 0) { + goto fail; + } + } + } + g_free(l1_table); + return 0; + +fail: + fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n"); + res->check_errors++; + g_free(l1_table); + return -EIO; +} + +/* + * Checks an image for refcount consistency. + * + * Returns 0 if no errors are found, the number of errors in case the image is + * detected as corrupted, and -errno when an internal error occurred. 
+ */ +int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, + BdrvCheckMode fix) +{ + BDRVQcowState *s = bs->opaque; + int64_t size, i, highest_cluster; + int nb_clusters, refcount1, refcount2; + QCowSnapshot *sn; + uint16_t *refcount_table; + int ret; + + size = bdrv_getlength(bs->file); + nb_clusters = size_to_clusters(s, size); + refcount_table = g_malloc0(nb_clusters * sizeof(uint16_t)); + + res->bfi.total_clusters = + size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE); + + /* header */ + inc_refcounts(bs, res, refcount_table, nb_clusters, + 0, s->cluster_size); + + /* current L1 table */ + ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters, + s->l1_table_offset, s->l1_size, + CHECK_OFLAG_COPIED | CHECK_FRAG_INFO); + if (ret < 0) { + goto fail; + } + + /* snapshots */ + for(i = 0; i < s->nb_snapshots; i++) { + sn = s->snapshots + i; + ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters, + sn->l1_table_offset, sn->l1_size, 0); + if (ret < 0) { + goto fail; + } + } + inc_refcounts(bs, res, refcount_table, nb_clusters, + s->snapshots_offset, s->snapshots_size); + + /* refcount data */ + inc_refcounts(bs, res, refcount_table, nb_clusters, + s->refcount_table_offset, + s->refcount_table_size * sizeof(uint64_t)); + + for(i = 0; i < s->refcount_table_size; i++) { + uint64_t offset, cluster; + offset = s->refcount_table[i]; + cluster = offset >> s->cluster_bits; + + /* Refcount blocks are cluster aligned */ + if (offset & (s->cluster_size - 1)) { + fprintf(stderr, "ERROR refcount block %" PRId64 " is not " + "cluster aligned; refcount table entry corrupted\n", i); + res->corruptions++; + continue; + } + + if (cluster >= nb_clusters) { + fprintf(stderr, "ERROR refcount block %" PRId64 + " is outside image\n", i); + res->corruptions++; + continue; + } + + if (offset != 0) { + inc_refcounts(bs, res, refcount_table, nb_clusters, + offset, s->cluster_size); + if (refcount_table[cluster] != 1) { + fprintf(stderr, "ERROR refcount block %" PRId64 + " refcount=%d\n", + i, refcount_table[cluster]); + res->corruptions++; + } + } + } + + /* compare ref counts */ + for (i = 0, highest_cluster = 0; i < nb_clusters; i++) { + refcount1 = get_refcount(bs, i); + if (refcount1 < 0) { + fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n", + i, strerror(-refcount1)); + res->check_errors++; + continue; + } + + refcount2 = refcount_table[i]; + + if (refcount1 > 0 || refcount2 > 0) { + highest_cluster = i; + } + + if (refcount1 != refcount2) { + + /* Check if we're allowed to fix the mismatch */ + int *num_fixed = NULL; + if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) { + num_fixed = &res->leaks_fixed; + } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) { + num_fixed = &res->corruptions_fixed; + } + + fprintf(stderr, "%s cluster %" PRId64 " refcount=%d reference=%d\n", + num_fixed != NULL ? "Repairing" : + refcount1 < refcount2 ? 
"ERROR" : + "Leaked", + i, refcount1, refcount2); + + if (num_fixed) { + ret = update_refcount(bs, i << s->cluster_bits, 1, + refcount2 - refcount1, + QCOW2_DISCARD_ALWAYS); + if (ret >= 0) { + (*num_fixed)++; + continue; + } + } + + /* And if we couldn't, print an error */ + if (refcount1 < refcount2) { + res->corruptions++; + } else { + res->leaks++; + } + } + } + + res->image_end_offset = (highest_cluster + 1) * s->cluster_size; + ret = 0; + +fail: + g_free(refcount_table); + + return ret; +} + diff --git a/contrib/qemu/block/qcow2-snapshot.c b/contrib/qemu/block/qcow2-snapshot.c new file mode 100644 index 00000000000..0caac9055f8 --- /dev/null +++ b/contrib/qemu/block/qcow2-snapshot.c @@ -0,0 +1,660 @@ +/* + * Block driver for the QCOW version 2 format + * + * Copyright (c) 2004-2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "qemu-common.h" +#include "block/block_int.h" +#include "block/qcow2.h" + +typedef struct QEMU_PACKED QCowSnapshotHeader { + /* header is 8 byte aligned */ + uint64_t l1_table_offset; + + uint32_t l1_size; + uint16_t id_str_size; + uint16_t name_size; + + uint32_t date_sec; + uint32_t date_nsec; + + uint64_t vm_clock_nsec; + + uint32_t vm_state_size; + uint32_t extra_data_size; /* for extension */ + /* extra data follows */ + /* id_str follows */ + /* name follows */ +} QCowSnapshotHeader; + +typedef struct QEMU_PACKED QCowSnapshotExtraData { + uint64_t vm_state_size_large; + uint64_t disk_size; +} QCowSnapshotExtraData; + +void qcow2_free_snapshots(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + int i; + + for(i = 0; i < s->nb_snapshots; i++) { + g_free(s->snapshots[i].name); + g_free(s->snapshots[i].id_str); + } + g_free(s->snapshots); + s->snapshots = NULL; + s->nb_snapshots = 0; +} + +int qcow2_read_snapshots(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + QCowSnapshotHeader h; + QCowSnapshotExtraData extra; + QCowSnapshot *sn; + int i, id_str_size, name_size; + int64_t offset; + uint32_t extra_data_size; + int ret; + + if (!s->nb_snapshots) { + s->snapshots = NULL; + s->snapshots_size = 0; + return 0; + } + + offset = s->snapshots_offset; + s->snapshots = g_malloc0(s->nb_snapshots * sizeof(QCowSnapshot)); + + for(i = 0; i < s->nb_snapshots; i++) { + /* Read statically sized part of the snapshot header */ + offset = align_offset(offset, 8); + ret = bdrv_pread(bs->file, offset, &h, sizeof(h)); + if (ret < 0) { + goto fail; + } + + offset += sizeof(h); + sn = s->snapshots + i; + sn->l1_table_offset = be64_to_cpu(h.l1_table_offset); + sn->l1_size = be32_to_cpu(h.l1_size); + sn->vm_state_size = be32_to_cpu(h.vm_state_size); + sn->date_sec = be32_to_cpu(h.date_sec); + sn->date_nsec = be32_to_cpu(h.date_nsec); + sn->vm_clock_nsec = be64_to_cpu(h.vm_clock_nsec); + extra_data_size = be32_to_cpu(h.extra_data_size); + + id_str_size = be16_to_cpu(h.id_str_size); + name_size = be16_to_cpu(h.name_size); + + /* Read extra data */ + ret = bdrv_pread(bs->file, offset, &extra, + MIN(sizeof(extra), extra_data_size)); + if (ret < 0) { + goto fail; + } + offset += extra_data_size; + + if (extra_data_size >= 8) { + sn->vm_state_size = be64_to_cpu(extra.vm_state_size_large); + } + + if (extra_data_size >= 16) { + sn->disk_size = be64_to_cpu(extra.disk_size); + } else { + sn->disk_size = bs->total_sectors * BDRV_SECTOR_SIZE; + } + + /* Read snapshot ID */ + sn->id_str = g_malloc(id_str_size + 1); + ret = bdrv_pread(bs->file, offset, sn->id_str, id_str_size); + if (ret < 0) { + goto fail; + } + offset += id_str_size; + sn->id_str[id_str_size] = '\0'; + + /* Read snapshot name */ + sn->name = g_malloc(name_size + 1); + ret = bdrv_pread(bs->file, offset, sn->name, name_size); + if (ret < 0) { + goto fail; + } + offset += name_size; + sn->name[name_size] = '\0'; + } + + s->snapshots_size = offset - s->snapshots_offset; + return 0; + +fail: + qcow2_free_snapshots(bs); + return ret; +} + +/* add at the end of the file a new list of snapshots */ +static int qcow2_write_snapshots(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + QCowSnapshot *sn; + QCowSnapshotHeader h; + QCowSnapshotExtraData extra; + int i, name_size, id_str_size, snapshots_size; + struct { + uint32_t nb_snapshots; + uint64_t snapshots_offset; + } QEMU_PACKED header_data; + int64_t offset, snapshots_offset; + int ret; + + /* compute the size of the snapshots */ + offset = 0; + for(i = 0; i < 
s->nb_snapshots; i++) { + sn = s->snapshots + i; + offset = align_offset(offset, 8); + offset += sizeof(h); + offset += sizeof(extra); + offset += strlen(sn->id_str); + offset += strlen(sn->name); + } + snapshots_size = offset; + + /* Allocate space for the new snapshot list */ + snapshots_offset = qcow2_alloc_clusters(bs, snapshots_size); + offset = snapshots_offset; + if (offset < 0) { + return offset; + } + ret = bdrv_flush(bs); + if (ret < 0) { + return ret; + } + + /* Write all snapshots to the new list */ + for(i = 0; i < s->nb_snapshots; i++) { + sn = s->snapshots + i; + memset(&h, 0, sizeof(h)); + h.l1_table_offset = cpu_to_be64(sn->l1_table_offset); + h.l1_size = cpu_to_be32(sn->l1_size); + /* If it doesn't fit in 32 bit, older implementations should treat it + * as a disk-only snapshot rather than truncate the VM state */ + if (sn->vm_state_size <= 0xffffffff) { + h.vm_state_size = cpu_to_be32(sn->vm_state_size); + } + h.date_sec = cpu_to_be32(sn->date_sec); + h.date_nsec = cpu_to_be32(sn->date_nsec); + h.vm_clock_nsec = cpu_to_be64(sn->vm_clock_nsec); + h.extra_data_size = cpu_to_be32(sizeof(extra)); + + memset(&extra, 0, sizeof(extra)); + extra.vm_state_size_large = cpu_to_be64(sn->vm_state_size); + extra.disk_size = cpu_to_be64(sn->disk_size); + + id_str_size = strlen(sn->id_str); + name_size = strlen(sn->name); + h.id_str_size = cpu_to_be16(id_str_size); + h.name_size = cpu_to_be16(name_size); + offset = align_offset(offset, 8); + + ret = bdrv_pwrite(bs->file, offset, &h, sizeof(h)); + if (ret < 0) { + goto fail; + } + offset += sizeof(h); + + ret = bdrv_pwrite(bs->file, offset, &extra, sizeof(extra)); + if (ret < 0) { + goto fail; + } + offset += sizeof(extra); + + ret = bdrv_pwrite(bs->file, offset, sn->id_str, id_str_size); + if (ret < 0) { + goto fail; + } + offset += id_str_size; + + ret = bdrv_pwrite(bs->file, offset, sn->name, name_size); + if (ret < 0) { + goto fail; + } + offset += name_size; + } + + /* + * Update the header to point to the new snapshot table. This requires the + * new table and its refcounts to be stable on disk. 
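+ * Flushing first guarantees that if we crash between this flush and the
+ * header update below, the header still points at the old, fully intact
+ * snapshot table instead of at half-written clusters.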
+ */ + ret = bdrv_flush(bs); + if (ret < 0) { + goto fail; + } + + QEMU_BUILD_BUG_ON(offsetof(QCowHeader, snapshots_offset) != + offsetof(QCowHeader, nb_snapshots) + sizeof(header_data.nb_snapshots)); + + header_data.nb_snapshots = cpu_to_be32(s->nb_snapshots); + header_data.snapshots_offset = cpu_to_be64(snapshots_offset); + + ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, nb_snapshots), + &header_data, sizeof(header_data)); + if (ret < 0) { + goto fail; + } + + /* free the old snapshot table */ + qcow2_free_clusters(bs, s->snapshots_offset, s->snapshots_size, + QCOW2_DISCARD_SNAPSHOT); + s->snapshots_offset = snapshots_offset; + s->snapshots_size = snapshots_size; + return 0; + +fail: + return ret; +} + +static void find_new_snapshot_id(BlockDriverState *bs, + char *id_str, int id_str_size) +{ + BDRVQcowState *s = bs->opaque; + QCowSnapshot *sn; + int i, id, id_max = 0; + + for(i = 0; i < s->nb_snapshots; i++) { + sn = s->snapshots + i; + id = strtoul(sn->id_str, NULL, 10); + if (id > id_max) + id_max = id; + } + snprintf(id_str, id_str_size, "%d", id_max + 1); +} + +static int find_snapshot_by_id(BlockDriverState *bs, const char *id_str) +{ + BDRVQcowState *s = bs->opaque; + int i; + + for(i = 0; i < s->nb_snapshots; i++) { + if (!strcmp(s->snapshots[i].id_str, id_str)) + return i; + } + return -1; +} + +static int find_snapshot_by_id_or_name(BlockDriverState *bs, const char *name) +{ + BDRVQcowState *s = bs->opaque; + int i, ret; + + ret = find_snapshot_by_id(bs, name); + if (ret >= 0) + return ret; + for(i = 0; i < s->nb_snapshots; i++) { + if (!strcmp(s->snapshots[i].name, name)) + return i; + } + return -1; +} + +/* if no id is provided, a new one is constructed */ +int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) +{ + BDRVQcowState *s = bs->opaque; + QCowSnapshot *new_snapshot_list = NULL; + QCowSnapshot *old_snapshot_list = NULL; + QCowSnapshot sn1, *sn = &sn1; + int i, ret; + uint64_t *l1_table = NULL; + int64_t l1_table_offset; + + memset(sn, 0, sizeof(*sn)); + + /* Generate an ID if it wasn't passed */ + if (sn_info->id_str[0] == '\0') { + find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str)); + } + + /* Check that the ID is unique */ + if (find_snapshot_by_id(bs, sn_info->id_str) >= 0) { + return -EEXIST; + } + + /* Populate sn with passed data */ + sn->id_str = g_strdup(sn_info->id_str); + sn->name = g_strdup(sn_info->name); + + sn->disk_size = bs->total_sectors * BDRV_SECTOR_SIZE; + sn->vm_state_size = sn_info->vm_state_size; + sn->date_sec = sn_info->date_sec; + sn->date_nsec = sn_info->date_nsec; + sn->vm_clock_nsec = sn_info->vm_clock_nsec; + + /* Allocate the L1 table of the snapshot and copy the current one there. */ + l1_table_offset = qcow2_alloc_clusters(bs, s->l1_size * sizeof(uint64_t)); + if (l1_table_offset < 0) { + ret = l1_table_offset; + goto fail; + } + + sn->l1_table_offset = l1_table_offset; + sn->l1_size = s->l1_size; + + l1_table = g_malloc(s->l1_size * sizeof(uint64_t)); + for(i = 0; i < s->l1_size; i++) { + l1_table[i] = cpu_to_be64(s->l1_table[i]); + } + + ret = bdrv_pwrite(bs->file, sn->l1_table_offset, l1_table, + s->l1_size * sizeof(uint64_t)); + if (ret < 0) { + goto fail; + } + + g_free(l1_table); + l1_table = NULL; + + /* + * Increase the refcounts of all clusters and make sure everything is + * stable on disk before updating the snapshot table to contain a pointer + * to the new L1 table. 
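+ * If the snapshot table were written first, a crash could leave it
+ * pointing at clusters whose refcounts were never raised, and subsequent
+ * writes to the active image would then corrupt the snapshot.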
+ */ + ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1); + if (ret < 0) { + goto fail; + } + + /* Append the new snapshot to the snapshot list */ + new_snapshot_list = g_malloc((s->nb_snapshots + 1) * sizeof(QCowSnapshot)); + if (s->snapshots) { + memcpy(new_snapshot_list, s->snapshots, + s->nb_snapshots * sizeof(QCowSnapshot)); + old_snapshot_list = s->snapshots; + } + s->snapshots = new_snapshot_list; + s->snapshots[s->nb_snapshots++] = *sn; + + ret = qcow2_write_snapshots(bs); + if (ret < 0) { + g_free(s->snapshots); + s->snapshots = old_snapshot_list; + goto fail; + } + + g_free(old_snapshot_list); + +#ifdef DEBUG_ALLOC + { + BdrvCheckResult result = {0}; + qcow2_check_refcounts(bs, &result, 0); + } +#endif + return 0; + +fail: + g_free(sn->id_str); + g_free(sn->name); + g_free(l1_table); + + return ret; +} + +/* copy the snapshot 'snapshot_name' into the current disk image */ +int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) +{ + BDRVQcowState *s = bs->opaque; + QCowSnapshot *sn; + int i, snapshot_index; + int cur_l1_bytes, sn_l1_bytes; + int ret; + uint64_t *sn_l1_table = NULL; + + /* Search the snapshot */ + snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id); + if (snapshot_index < 0) { + return -ENOENT; + } + sn = &s->snapshots[snapshot_index]; + + if (sn->disk_size != bs->total_sectors * BDRV_SECTOR_SIZE) { + error_report("qcow2: Loading snapshots with different disk " + "size is not implemented"); + ret = -ENOTSUP; + goto fail; + } + + /* + * Make sure that the current L1 table is big enough to contain the whole + * L1 table of the snapshot. If the snapshot L1 table is smaller, the + * current one must be padded with zeros. + */ + ret = qcow2_grow_l1_table(bs, sn->l1_size, true); + if (ret < 0) { + goto fail; + } + + cur_l1_bytes = s->l1_size * sizeof(uint64_t); + sn_l1_bytes = sn->l1_size * sizeof(uint64_t); + + /* + * Copy the snapshot L1 table to the current L1 table. + * + * Before overwriting the old current L1 table on disk, make sure to + * increase all refcounts for the clusters referenced by the new one. + * Decrease the refcount referenced by the old one only when the L1 + * table is overwritten. + */ + sn_l1_table = g_malloc0(cur_l1_bytes); + + ret = bdrv_pread(bs->file, sn->l1_table_offset, sn_l1_table, sn_l1_bytes); + if (ret < 0) { + goto fail; + } + + ret = qcow2_update_snapshot_refcount(bs, sn->l1_table_offset, + sn->l1_size, 1); + if (ret < 0) { + goto fail; + } + + ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset, sn_l1_table, + cur_l1_bytes); + if (ret < 0) { + goto fail; + } + + /* + * Decrease refcount of clusters of current L1 table. + * + * At this point, the in-memory s->l1_table points to the old L1 table, + * whereas on disk we already have the new one. + * + * qcow2_update_snapshot_refcount special cases the current L1 table to use + * the in-memory data instead of really using the offset to load a new one, + * which is why this works. + */ + ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, + s->l1_size, -1); + + /* + * Now update the in-memory L1 table to be in sync with the on-disk one. We + * need to do this even if updating refcounts failed. + */ + for(i = 0;i < s->l1_size; i++) { + s->l1_table[i] = be64_to_cpu(sn_l1_table[i]); + } + + if (ret < 0) { + goto fail; + } + + g_free(sn_l1_table); + sn_l1_table = NULL; + + /* + * Update QCOW_OFLAG_COPIED in the active L1 table (it may have changed + * when we decreased the refcount of the old snapshot. 
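+ * Clusters whose refcount dropped back to 1 are no longer shared, so the
+ * flag must be recomputed; the trailing 0 leaves the refcounts themselves
+ * untouched and only rewrites the flags.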
+ */ + ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0); + if (ret < 0) { + goto fail; + } + +#ifdef DEBUG_ALLOC + { + BdrvCheckResult result = {0}; + qcow2_check_refcounts(bs, &result, 0); + } +#endif + return 0; + +fail: + g_free(sn_l1_table); + return ret; +} + +int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id) +{ + BDRVQcowState *s = bs->opaque; + QCowSnapshot sn; + int snapshot_index, ret; + + /* Search the snapshot */ + snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id); + if (snapshot_index < 0) { + return -ENOENT; + } + sn = s->snapshots[snapshot_index]; + + /* Remove it from the snapshot list */ + memmove(s->snapshots + snapshot_index, + s->snapshots + snapshot_index + 1, + (s->nb_snapshots - snapshot_index - 1) * sizeof(sn)); + s->nb_snapshots--; + ret = qcow2_write_snapshots(bs); + if (ret < 0) { + return ret; + } + + /* + * The snapshot is now unused, clean up. If we fail after this point, we + * won't recover but just leak clusters. + */ + g_free(sn.id_str); + g_free(sn.name); + + /* + * Now decrease the refcounts of clusters referenced by the snapshot and + * free the L1 table. + */ + ret = qcow2_update_snapshot_refcount(bs, sn.l1_table_offset, + sn.l1_size, -1); + if (ret < 0) { + return ret; + } + qcow2_free_clusters(bs, sn.l1_table_offset, sn.l1_size * sizeof(uint64_t), + QCOW2_DISCARD_SNAPSHOT); + + /* must update the copied flag on the current cluster offsets */ + ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0); + if (ret < 0) { + return ret; + } + +#ifdef DEBUG_ALLOC + { + BdrvCheckResult result = {0}; + qcow2_check_refcounts(bs, &result, 0); + } +#endif + return 0; +} + +int qcow2_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab) +{ + BDRVQcowState *s = bs->opaque; + QEMUSnapshotInfo *sn_tab, *sn_info; + QCowSnapshot *sn; + int i; + + if (!s->nb_snapshots) { + *psn_tab = NULL; + return s->nb_snapshots; + } + + sn_tab = g_malloc0(s->nb_snapshots * sizeof(QEMUSnapshotInfo)); + for(i = 0; i < s->nb_snapshots; i++) { + sn_info = sn_tab + i; + sn = s->snapshots + i; + pstrcpy(sn_info->id_str, sizeof(sn_info->id_str), + sn->id_str); + pstrcpy(sn_info->name, sizeof(sn_info->name), + sn->name); + sn_info->vm_state_size = sn->vm_state_size; + sn_info->date_sec = sn->date_sec; + sn_info->date_nsec = sn->date_nsec; + sn_info->vm_clock_nsec = sn->vm_clock_nsec; + } + *psn_tab = sn_tab; + return s->nb_snapshots; +} + +int qcow2_snapshot_load_tmp(BlockDriverState *bs, const char *snapshot_name) +{ + int i, snapshot_index; + BDRVQcowState *s = bs->opaque; + QCowSnapshot *sn; + uint64_t *new_l1_table; + int new_l1_bytes; + int ret; + + assert(bs->read_only); + + /* Search the snapshot */ + snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_name); + if (snapshot_index < 0) { + return -ENOENT; + } + sn = &s->snapshots[snapshot_index]; + + /* Allocate and read in the snapshot's L1 table */ + new_l1_bytes = s->l1_size * sizeof(uint64_t); + new_l1_table = g_malloc0(align_offset(new_l1_bytes, 512)); + + ret = bdrv_pread(bs->file, sn->l1_table_offset, new_l1_table, new_l1_bytes); + if (ret < 0) { + g_free(new_l1_table); + return ret; + } + + /* Switch the L1 table */ + g_free(s->l1_table); + + s->l1_size = sn->l1_size; + s->l1_table_offset = sn->l1_table_offset; + s->l1_table = new_l1_table; + + for(i = 0;i < s->l1_size; i++) { + be64_to_cpus(&s->l1_table[i]); + } + + return 0; +} diff --git a/contrib/qemu/block/qcow2.c b/contrib/qemu/block/qcow2.c new file mode 100644 index 
00000000000..0eceefe2cd9 --- /dev/null +++ b/contrib/qemu/block/qcow2.c @@ -0,0 +1,1825 @@ +/* + * Block driver for the QCOW version 2 format + * + * Copyright (c) 2004-2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include +#include "qemu/aes.h" +#include "block/qcow2.h" +#include "qemu/error-report.h" +#include "qapi/qmp/qerror.h" +#include "qapi/qmp/qbool.h" +#include "trace.h" + +/* + Differences with QCOW: + + - Support for multiple incremental snapshots. + - Memory management by reference counts. + - Clusters which have a reference count of one have the bit + QCOW_OFLAG_COPIED to optimize write performance. + - Size of compressed clusters is stored in sectors to reduce bit usage + in the cluster offsets. + - Support for storing additional data (such as the VM state) in the + snapshots. + - If a backing store is used, the cluster size is not constrained + (could be backported to QCOW). + - L2 tables have always a size of one cluster. 
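+ - A list of header extensions (e.g. backing file format, feature name
+ table) may follow the fixed header.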
+*/ + + +typedef struct { + uint32_t magic; + uint32_t len; +} QCowExtension; + +#define QCOW2_EXT_MAGIC_END 0 +#define QCOW2_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA +#define QCOW2_EXT_MAGIC_FEATURE_TABLE 0x6803f857 + +static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename) +{ + const QCowHeader *cow_header = (const void *)buf; + + if (buf_size >= sizeof(QCowHeader) && + be32_to_cpu(cow_header->magic) == QCOW_MAGIC && + be32_to_cpu(cow_header->version) >= 2) + return 100; + else + return 0; +} + + +/* + * read qcow2 extension and fill bs + * start reading from start_offset + * finish reading upon magic of value 0 or when end_offset reached + * unknown magic is skipped (future extension this version knows nothing about) + * return 0 upon success, non-0 otherwise + */ +static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, + uint64_t end_offset, void **p_feature_table) +{ + BDRVQcowState *s = bs->opaque; + QCowExtension ext; + uint64_t offset; + int ret; + +#ifdef DEBUG_EXT + printf("qcow2_read_extensions: start=%ld end=%ld\n", start_offset, end_offset); +#endif + offset = start_offset; + while (offset < end_offset) { + +#ifdef DEBUG_EXT + /* Sanity check */ + if (offset > s->cluster_size) + printf("qcow2_read_extension: suspicious offset %lu\n", offset); + + printf("attempting to read extended header in offset %lu\n", offset); +#endif + + if (bdrv_pread(bs->file, offset, &ext, sizeof(ext)) != sizeof(ext)) { + fprintf(stderr, "qcow2_read_extension: ERROR: " + "pread fail from offset %" PRIu64 "\n", + offset); + return 1; + } + be32_to_cpus(&ext.magic); + be32_to_cpus(&ext.len); + offset += sizeof(ext); +#ifdef DEBUG_EXT + printf("ext.magic = 0x%x\n", ext.magic); +#endif + if (ext.len > end_offset - offset) { + error_report("Header extension too large"); + return -EINVAL; + } + + switch (ext.magic) { + case QCOW2_EXT_MAGIC_END: + return 0; + + case QCOW2_EXT_MAGIC_BACKING_FORMAT: + if (ext.len >= sizeof(bs->backing_format)) { + fprintf(stderr, "ERROR: ext_backing_format: len=%u too large" + " (>=%zu)\n", + ext.len, sizeof(bs->backing_format)); + return 2; + } + if (bdrv_pread(bs->file, offset , bs->backing_format, + ext.len) != ext.len) + return 3; + bs->backing_format[ext.len] = '\0'; +#ifdef DEBUG_EXT + printf("Qcow2: Got format extension %s\n", bs->backing_format); +#endif + break; + + case QCOW2_EXT_MAGIC_FEATURE_TABLE: + if (p_feature_table != NULL) { + void* feature_table = g_malloc0(ext.len + 2 * sizeof(Qcow2Feature)); + ret = bdrv_pread(bs->file, offset , feature_table, ext.len); + if (ret < 0) { + return ret; + } + + *p_feature_table = feature_table; + } + break; + + default: + /* unknown magic - save it in case we need to rewrite the header */ + { + Qcow2UnknownHeaderExtension *uext; + + uext = g_malloc0(sizeof(*uext) + ext.len); + uext->magic = ext.magic; + uext->len = ext.len; + QLIST_INSERT_HEAD(&s->unknown_header_ext, uext, next); + + ret = bdrv_pread(bs->file, offset , uext->data, uext->len); + if (ret < 0) { + return ret; + } + } + break; + } + + offset += ((ext.len + 7) & ~7); + } + + return 0; +} + +static void cleanup_unknown_header_ext(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + Qcow2UnknownHeaderExtension *uext, *next; + + QLIST_FOREACH_SAFE(uext, &s->unknown_header_ext, next, next) { + QLIST_REMOVE(uext, next); + g_free(uext); + } +} + +static void GCC_FMT_ATTR(2, 3) report_unsupported(BlockDriverState *bs, + const char *fmt, ...) 
+{ + char msg[64]; + va_list ap; + + va_start(ap, fmt); + vsnprintf(msg, sizeof(msg), fmt, ap); + va_end(ap); + + qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, + bs->device_name, "qcow2", msg); +} + +static void report_unsupported_feature(BlockDriverState *bs, + Qcow2Feature *table, uint64_t mask) +{ + while (table && table->name[0] != '\0') { + if (table->type == QCOW2_FEAT_TYPE_INCOMPATIBLE) { + if (mask & (1 << table->bit)) { + report_unsupported(bs, "%.46s",table->name); + mask &= ~(1 << table->bit); + } + } + table++; + } + + if (mask) { + report_unsupported(bs, "Unknown incompatible feature: %" PRIx64, mask); + } +} + +/* + * Sets the dirty bit and flushes afterwards if necessary. + * + * The incompatible_features bit is only set if the image file header was + * updated successfully. Therefore it is not required to check the return + * value of this function. + */ +int qcow2_mark_dirty(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + uint64_t val; + int ret; + + assert(s->qcow_version >= 3); + + if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) { + return 0; /* already dirty */ + } + + val = cpu_to_be64(s->incompatible_features | QCOW2_INCOMPAT_DIRTY); + ret = bdrv_pwrite(bs->file, offsetof(QCowHeader, incompatible_features), + &val, sizeof(val)); + if (ret < 0) { + return ret; + } + ret = bdrv_flush(bs->file); + if (ret < 0) { + return ret; + } + + /* Only treat image as dirty if the header was updated successfully */ + s->incompatible_features |= QCOW2_INCOMPAT_DIRTY; + return 0; +} + +/* + * Clears the dirty bit and flushes before if necessary. Only call this + * function when there are no pending requests, it does not guard against + * concurrent requests dirtying the image. + */ +static int qcow2_mark_clean(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + + if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) { + int ret = bdrv_flush(bs); + if (ret < 0) { + return ret; + } + + s->incompatible_features &= ~QCOW2_INCOMPAT_DIRTY; + return qcow2_update_header(bs); + } + return 0; +} + +static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result, + BdrvCheckMode fix) +{ + int ret = qcow2_check_refcounts(bs, result, fix); + if (ret < 0) { + return ret; + } + + if (fix && result->check_errors == 0 && result->corruptions == 0) { + return qcow2_mark_clean(bs); + } + return ret; +} + +static QemuOptsList qcow2_runtime_opts = { + .name = "qcow2", + .head = QTAILQ_HEAD_INITIALIZER(qcow2_runtime_opts.head), + .desc = { + { + .name = "lazy_refcounts", + .type = QEMU_OPT_BOOL, + .help = "Postpone refcount updates", + }, + { + .name = QCOW2_OPT_DISCARD_REQUEST, + .type = QEMU_OPT_BOOL, + .help = "Pass guest discard requests to the layer below", + }, + { + .name = QCOW2_OPT_DISCARD_SNAPSHOT, + .type = QEMU_OPT_BOOL, + .help = "Generate discard requests when snapshot related space " + "is freed", + }, + { + .name = QCOW2_OPT_DISCARD_OTHER, + .type = QEMU_OPT_BOOL, + .help = "Generate discard requests when other clusters are freed", + }, + { /* end of list */ } + }, +}; + +static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) +{ + BDRVQcowState *s = bs->opaque; + int len, i, ret = 0; + QCowHeader header; + QemuOpts *opts; + Error *local_err = NULL; + uint64_t ext_end; + uint64_t l1_vm_state_index; + + ret = bdrv_pread(bs->file, 0, &header, sizeof(header)); + if (ret < 0) { + goto fail; + } + be32_to_cpus(&header.magic); + be32_to_cpus(&header.version); + be64_to_cpus(&header.backing_file_offset); + be32_to_cpus(&header.backing_file_size); + 
be64_to_cpus(&header.size); + be32_to_cpus(&header.cluster_bits); + be32_to_cpus(&header.crypt_method); + be64_to_cpus(&header.l1_table_offset); + be32_to_cpus(&header.l1_size); + be64_to_cpus(&header.refcount_table_offset); + be32_to_cpus(&header.refcount_table_clusters); + be64_to_cpus(&header.snapshots_offset); + be32_to_cpus(&header.nb_snapshots); + + if (header.magic != QCOW_MAGIC) { + ret = -EMEDIUMTYPE; + goto fail; + } + if (header.version < 2 || header.version > 3) { + report_unsupported(bs, "QCOW version %d", header.version); + ret = -ENOTSUP; + goto fail; + } + + s->qcow_version = header.version; + + /* Initialise version 3 header fields */ + if (header.version == 2) { + header.incompatible_features = 0; + header.compatible_features = 0; + header.autoclear_features = 0; + header.refcount_order = 4; + header.header_length = 72; + } else { + be64_to_cpus(&header.incompatible_features); + be64_to_cpus(&header.compatible_features); + be64_to_cpus(&header.autoclear_features); + be32_to_cpus(&header.refcount_order); + be32_to_cpus(&header.header_length); + } + + if (header.header_length > sizeof(header)) { + s->unknown_header_fields_size = header.header_length - sizeof(header); + s->unknown_header_fields = g_malloc(s->unknown_header_fields_size); + ret = bdrv_pread(bs->file, sizeof(header), s->unknown_header_fields, + s->unknown_header_fields_size); + if (ret < 0) { + goto fail; + } + } + + if (header.backing_file_offset) { + ext_end = header.backing_file_offset; + } else { + ext_end = 1 << header.cluster_bits; + } + + /* Handle feature bits */ + s->incompatible_features = header.incompatible_features; + s->compatible_features = header.compatible_features; + s->autoclear_features = header.autoclear_features; + + if (s->incompatible_features & ~QCOW2_INCOMPAT_MASK) { + void *feature_table = NULL; + qcow2_read_extensions(bs, header.header_length, ext_end, + &feature_table); + report_unsupported_feature(bs, feature_table, + s->incompatible_features & + ~QCOW2_INCOMPAT_MASK); + ret = -ENOTSUP; + goto fail; + } + + /* Check support for various header values */ + if (header.refcount_order != 4) { + report_unsupported(bs, "%d bit reference counts", + 1 << header.refcount_order); + ret = -ENOTSUP; + goto fail; + } + + if (header.cluster_bits < MIN_CLUSTER_BITS || + header.cluster_bits > MAX_CLUSTER_BITS) { + ret = -EINVAL; + goto fail; + } + if (header.crypt_method > QCOW_CRYPT_AES) { + ret = -EINVAL; + goto fail; + } + s->crypt_method_header = header.crypt_method; + if (s->crypt_method_header) { + bs->encrypted = 1; + } + s->cluster_bits = header.cluster_bits; + s->cluster_size = 1 << s->cluster_bits; + s->cluster_sectors = 1 << (s->cluster_bits - 9); + s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */ + s->l2_size = 1 << s->l2_bits; + bs->total_sectors = header.size / 512; + s->csize_shift = (62 - (s->cluster_bits - 8)); + s->csize_mask = (1 << (s->cluster_bits - 8)) - 1; + s->cluster_offset_mask = (1LL << s->csize_shift) - 1; + s->refcount_table_offset = header.refcount_table_offset; + s->refcount_table_size = + header.refcount_table_clusters << (s->cluster_bits - 3); + + s->snapshots_offset = header.snapshots_offset; + s->nb_snapshots = header.nb_snapshots; + + /* read the level 1 table */ + s->l1_size = header.l1_size; + + l1_vm_state_index = size_to_l1(s, header.size); + if (l1_vm_state_index > INT_MAX) { + ret = -EFBIG; + goto fail; + } + s->l1_vm_state_index = l1_vm_state_index; + + /* the L1 table must contain at least enough entries to put + header.size bytes */ + if 
(s->l1_size < s->l1_vm_state_index) { + ret = -EINVAL; + goto fail; + } + s->l1_table_offset = header.l1_table_offset; + if (s->l1_size > 0) { + s->l1_table = g_malloc0( + align_offset(s->l1_size * sizeof(uint64_t), 512)); + ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table, + s->l1_size * sizeof(uint64_t)); + if (ret < 0) { + goto fail; + } + for(i = 0;i < s->l1_size; i++) { + be64_to_cpus(&s->l1_table[i]); + } + } + + /* alloc L2 table/refcount block cache */ + s->l2_table_cache = qcow2_cache_create(bs, L2_CACHE_SIZE); + s->refcount_block_cache = qcow2_cache_create(bs, REFCOUNT_CACHE_SIZE); + + s->cluster_cache = g_malloc(s->cluster_size); + /* one more sector for decompressed data alignment */ + s->cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size + + 512); + s->cluster_cache_offset = -1; + s->flags = flags; + + ret = qcow2_refcount_init(bs); + if (ret != 0) { + goto fail; + } + + QLIST_INIT(&s->cluster_allocs); + QTAILQ_INIT(&s->discards); + + /* read qcow2 extensions */ + if (qcow2_read_extensions(bs, header.header_length, ext_end, NULL)) { + ret = -EINVAL; + goto fail; + } + + /* read the backing file name */ + if (header.backing_file_offset != 0) { + len = header.backing_file_size; + if (len > 1023) { + len = 1023; + } + ret = bdrv_pread(bs->file, header.backing_file_offset, + bs->backing_file, len); + if (ret < 0) { + goto fail; + } + bs->backing_file[len] = '\0'; + } + + ret = qcow2_read_snapshots(bs); + if (ret < 0) { + goto fail; + } + + /* Clear unknown autoclear feature bits */ + if (!bs->read_only && s->autoclear_features != 0) { + s->autoclear_features = 0; + ret = qcow2_update_header(bs); + if (ret < 0) { + goto fail; + } + } + + /* Initialise locks */ + qemu_co_mutex_init(&s->lock); + + /* Repair image if dirty */ + if (!(flags & BDRV_O_CHECK) && !bs->read_only && + (s->incompatible_features & QCOW2_INCOMPAT_DIRTY)) { + BdrvCheckResult result = {0}; + + ret = qcow2_check(bs, &result, BDRV_FIX_ERRORS); + if (ret < 0) { + goto fail; + } + } + + /* Enable lazy_refcounts according to image and command line options */ + opts = qemu_opts_create_nofail(&qcow2_runtime_opts); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (error_is_set(&local_err)) { + qerror_report_err(local_err); + error_free(local_err); + ret = -EINVAL; + goto fail; + } + + s->use_lazy_refcounts = qemu_opt_get_bool(opts, QCOW2_OPT_LAZY_REFCOUNTS, + (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS)); + + s->discard_passthrough[QCOW2_DISCARD_NEVER] = false; + s->discard_passthrough[QCOW2_DISCARD_ALWAYS] = true; + s->discard_passthrough[QCOW2_DISCARD_REQUEST] = + qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_REQUEST, + flags & BDRV_O_UNMAP); + s->discard_passthrough[QCOW2_DISCARD_SNAPSHOT] = + qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_SNAPSHOT, true); + s->discard_passthrough[QCOW2_DISCARD_OTHER] = + qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_OTHER, false); + + qemu_opts_del(opts); + + if (s->use_lazy_refcounts && s->qcow_version < 3) { + qerror_report(ERROR_CLASS_GENERIC_ERROR, "Lazy refcounts require " + "a qcow2 image with at least qemu 1.1 compatibility level"); + ret = -EINVAL; + goto fail; + } + +#ifdef DEBUG_ALLOC + { + BdrvCheckResult result = {0}; + qcow2_check_refcounts(bs, &result, 0); + } +#endif + return ret; + + fail: + g_free(s->unknown_header_fields); + cleanup_unknown_header_ext(bs); + qcow2_free_snapshots(bs); + qcow2_refcount_close(bs); + g_free(s->l1_table); + if (s->l2_table_cache) { + qcow2_cache_destroy(bs, s->l2_table_cache); + } + 
g_free(s->cluster_cache); + qemu_vfree(s->cluster_data); + return ret; +} + +static int qcow2_set_key(BlockDriverState *bs, const char *key) +{ + BDRVQcowState *s = bs->opaque; + uint8_t keybuf[16]; + int len, i; + + memset(keybuf, 0, 16); + len = strlen(key); + if (len > 16) + len = 16; + /* XXX: we could compress the chars to 7 bits to increase + entropy */ + for(i = 0;i < len;i++) { + keybuf[i] = key[i]; + } + s->crypt_method = s->crypt_method_header; + + if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0) + return -1; + if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0) + return -1; +#if 0 + /* test */ + { + uint8_t in[16]; + uint8_t out[16]; + uint8_t tmp[16]; + for(i=0;i<16;i++) + in[i] = i; + AES_encrypt(in, tmp, &s->aes_encrypt_key); + AES_decrypt(tmp, out, &s->aes_decrypt_key); + for(i = 0; i < 16; i++) + printf(" %02x", tmp[i]); + printf("\n"); + for(i = 0; i < 16; i++) + printf(" %02x", out[i]); + printf("\n"); + } +#endif + return 0; +} + +/* We have nothing to do for QCOW2 reopen, stubs just return + * success */ +static int qcow2_reopen_prepare(BDRVReopenState *state, + BlockReopenQueue *queue, Error **errp) +{ + return 0; +} + +static int coroutine_fn qcow2_co_is_allocated(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, int *pnum) +{ + BDRVQcowState *s = bs->opaque; + uint64_t cluster_offset; + int ret; + + *pnum = nb_sectors; + /* FIXME We can get errors here, but the bdrv_co_is_allocated interface + * can't pass them on today */ + qemu_co_mutex_lock(&s->lock); + ret = qcow2_get_cluster_offset(bs, sector_num << 9, pnum, &cluster_offset); + qemu_co_mutex_unlock(&s->lock); + if (ret < 0) { + *pnum = 0; + } + + return (cluster_offset != 0) || (ret == QCOW2_CLUSTER_ZERO); +} + +/* handle reading after the end of the backing file */ +int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov, + int64_t sector_num, int nb_sectors) +{ + int n1; + if ((sector_num + nb_sectors) <= bs->total_sectors) + return nb_sectors; + if (sector_num >= bs->total_sectors) + n1 = 0; + else + n1 = bs->total_sectors - sector_num; + + qemu_iovec_memset(qiov, 512 * n1, 0, 512 * (nb_sectors - n1)); + + return n1; +} + +static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, + int remaining_sectors, QEMUIOVector *qiov) +{ + BDRVQcowState *s = bs->opaque; + int index_in_cluster, n1; + int ret; + int cur_nr_sectors; /* number of sectors in current iteration */ + uint64_t cluster_offset = 0; + uint64_t bytes_done = 0; + QEMUIOVector hd_qiov; + uint8_t *cluster_data = NULL; + + qemu_iovec_init(&hd_qiov, qiov->niov); + + qemu_co_mutex_lock(&s->lock); + + while (remaining_sectors != 0) { + + /* prepare next request */ + cur_nr_sectors = remaining_sectors; + if (s->crypt_method) { + cur_nr_sectors = MIN(cur_nr_sectors, + QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors); + } + + ret = qcow2_get_cluster_offset(bs, sector_num << 9, + &cur_nr_sectors, &cluster_offset); + if (ret < 0) { + goto fail; + } + + index_in_cluster = sector_num & (s->cluster_sectors - 1); + + qemu_iovec_reset(&hd_qiov); + qemu_iovec_concat(&hd_qiov, qiov, bytes_done, + cur_nr_sectors * 512); + + switch (ret) { + case QCOW2_CLUSTER_UNALLOCATED: + + if (bs->backing_hd) { + /* read from the base image */ + n1 = qcow2_backing_read1(bs->backing_hd, &hd_qiov, + sector_num, cur_nr_sectors); + if (n1 > 0) { + BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO); + qemu_co_mutex_unlock(&s->lock); + ret = bdrv_co_readv(bs->backing_hd, sector_num, + n1, &hd_qiov); + 
qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + goto fail; + } + } + } else { + /* Note: in this case, no need to wait */ + qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors); + } + break; + + case QCOW2_CLUSTER_ZERO: + qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors); + break; + + case QCOW2_CLUSTER_COMPRESSED: + /* add AIO support for compressed blocks ? */ + ret = qcow2_decompress_cluster(bs, cluster_offset); + if (ret < 0) { + goto fail; + } + + qemu_iovec_from_buf(&hd_qiov, 0, + s->cluster_cache + index_in_cluster * 512, + 512 * cur_nr_sectors); + break; + + case QCOW2_CLUSTER_NORMAL: + if ((cluster_offset & 511) != 0) { + ret = -EIO; + goto fail; + } + + if (s->crypt_method) { + /* + * For encrypted images, read everything into a temporary + * contiguous buffer on which the AES functions can work. + */ + if (!cluster_data) { + cluster_data = + qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size); + } + + assert(cur_nr_sectors <= + QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors); + qemu_iovec_reset(&hd_qiov); + qemu_iovec_add(&hd_qiov, cluster_data, + 512 * cur_nr_sectors); + } + + BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); + qemu_co_mutex_unlock(&s->lock); + ret = bdrv_co_readv(bs->file, + (cluster_offset >> 9) + index_in_cluster, + cur_nr_sectors, &hd_qiov); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + goto fail; + } + if (s->crypt_method) { + qcow2_encrypt_sectors(s, sector_num, cluster_data, + cluster_data, cur_nr_sectors, 0, &s->aes_decrypt_key); + qemu_iovec_from_buf(qiov, bytes_done, + cluster_data, 512 * cur_nr_sectors); + } + break; + + default: + g_assert_not_reached(); + ret = -EIO; + goto fail; + } + + remaining_sectors -= cur_nr_sectors; + sector_num += cur_nr_sectors; + bytes_done += cur_nr_sectors * 512; + } + ret = 0; + +fail: + qemu_co_mutex_unlock(&s->lock); + + qemu_iovec_destroy(&hd_qiov); + qemu_vfree(cluster_data); + + return ret; +} + +static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, + int64_t sector_num, + int remaining_sectors, + QEMUIOVector *qiov) +{ + BDRVQcowState *s = bs->opaque; + int index_in_cluster; + int n_end; + int ret; + int cur_nr_sectors; /* number of sectors in current iteration */ + uint64_t cluster_offset; + QEMUIOVector hd_qiov; + uint64_t bytes_done = 0; + uint8_t *cluster_data = NULL; + QCowL2Meta *l2meta = NULL; + + trace_qcow2_writev_start_req(qemu_coroutine_self(), sector_num, + remaining_sectors); + + qemu_iovec_init(&hd_qiov, qiov->niov); + + s->cluster_cache_offset = -1; /* disable compressed cache */ + + qemu_co_mutex_lock(&s->lock); + + while (remaining_sectors != 0) { + + l2meta = NULL; + + trace_qcow2_writev_start_part(qemu_coroutine_self()); + index_in_cluster = sector_num & (s->cluster_sectors - 1); + n_end = index_in_cluster + remaining_sectors; + if (s->crypt_method && + n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors) { + n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors; + } + + ret = qcow2_alloc_cluster_offset(bs, sector_num << 9, + index_in_cluster, n_end, &cur_nr_sectors, &cluster_offset, &l2meta); + if (ret < 0) { + goto fail; + } + + assert((cluster_offset & 511) == 0); + + qemu_iovec_reset(&hd_qiov); + qemu_iovec_concat(&hd_qiov, qiov, bytes_done, + cur_nr_sectors * 512); + + if (s->crypt_method) { + if (!cluster_data) { + cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * + s->cluster_size); + } + + assert(hd_qiov.size <= + QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size); + qemu_iovec_to_buf(&hd_qiov, 0, cluster_data, hd_qiov.size); + + qcow2_encrypt_sectors(s, 
sector_num, cluster_data, + cluster_data, cur_nr_sectors, 1, &s->aes_encrypt_key); + + qemu_iovec_reset(&hd_qiov); + qemu_iovec_add(&hd_qiov, cluster_data, + cur_nr_sectors * 512); + } + + qemu_co_mutex_unlock(&s->lock); + BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); + trace_qcow2_writev_data(qemu_coroutine_self(), + (cluster_offset >> 9) + index_in_cluster); + ret = bdrv_co_writev(bs->file, + (cluster_offset >> 9) + index_in_cluster, + cur_nr_sectors, &hd_qiov); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + goto fail; + } + + while (l2meta != NULL) { + QCowL2Meta *next; + + ret = qcow2_alloc_cluster_link_l2(bs, l2meta); + if (ret < 0) { + goto fail; + } + + /* Take the request off the list of running requests */ + if (l2meta->nb_clusters != 0) { + QLIST_REMOVE(l2meta, next_in_flight); + } + + qemu_co_queue_restart_all(&l2meta->dependent_requests); + + next = l2meta->next; + g_free(l2meta); + l2meta = next; + } + + remaining_sectors -= cur_nr_sectors; + sector_num += cur_nr_sectors; + bytes_done += cur_nr_sectors * 512; + trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_nr_sectors); + } + ret = 0; + +fail: + qemu_co_mutex_unlock(&s->lock); + + while (l2meta != NULL) { + QCowL2Meta *next; + + if (l2meta->nb_clusters != 0) { + QLIST_REMOVE(l2meta, next_in_flight); + } + qemu_co_queue_restart_all(&l2meta->dependent_requests); + + next = l2meta->next; + g_free(l2meta); + l2meta = next; + } + + qemu_iovec_destroy(&hd_qiov); + qemu_vfree(cluster_data); + trace_qcow2_writev_done_req(qemu_coroutine_self(), ret); + + return ret; +} + +static void qcow2_close(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + g_free(s->l1_table); + + qcow2_cache_flush(bs, s->l2_table_cache); + qcow2_cache_flush(bs, s->refcount_block_cache); + + qcow2_mark_clean(bs); + + qcow2_cache_destroy(bs, s->l2_table_cache); + qcow2_cache_destroy(bs, s->refcount_block_cache); + + g_free(s->unknown_header_fields); + cleanup_unknown_header_ext(bs); + + g_free(s->cluster_cache); + qemu_vfree(s->cluster_data); + qcow2_refcount_close(bs); + qcow2_free_snapshots(bs); +} + +static void qcow2_invalidate_cache(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + int flags = s->flags; + AES_KEY aes_encrypt_key; + AES_KEY aes_decrypt_key; + uint32_t crypt_method = 0; + QDict *options; + + /* + * Backing files are read-only which makes all of their metadata immutable, + * that means we don't have to worry about reopening them here. 
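+ * Only this image's own in-memory state is thrown away below and then
+ * rebuilt by running qcow2_open() again on the same file.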
+ */ + + if (s->crypt_method) { + crypt_method = s->crypt_method; + memcpy(&aes_encrypt_key, &s->aes_encrypt_key, sizeof(aes_encrypt_key)); + memcpy(&aes_decrypt_key, &s->aes_decrypt_key, sizeof(aes_decrypt_key)); + } + + qcow2_close(bs); + + options = qdict_new(); + qdict_put(options, QCOW2_OPT_LAZY_REFCOUNTS, + qbool_from_int(s->use_lazy_refcounts)); + + memset(s, 0, sizeof(BDRVQcowState)); + qcow2_open(bs, options, flags); + + QDECREF(options); + + if (crypt_method) { + s->crypt_method = crypt_method; + memcpy(&s->aes_encrypt_key, &aes_encrypt_key, sizeof(aes_encrypt_key)); + memcpy(&s->aes_decrypt_key, &aes_decrypt_key, sizeof(aes_decrypt_key)); + } +} + +static size_t header_ext_add(char *buf, uint32_t magic, const void *s, + size_t len, size_t buflen) +{ + QCowExtension *ext_backing_fmt = (QCowExtension*) buf; + size_t ext_len = sizeof(QCowExtension) + ((len + 7) & ~7); + + if (buflen < ext_len) { + return -ENOSPC; + } + + *ext_backing_fmt = (QCowExtension) { + .magic = cpu_to_be32(magic), + .len = cpu_to_be32(len), + }; + memcpy(buf + sizeof(QCowExtension), s, len); + + return ext_len; +} + +/* + * Updates the qcow2 header, including the variable length parts of it, i.e. + * the backing file name and all extensions. qcow2 was not designed to allow + * such changes, so if we run out of space (we can only use the first cluster) + * this function may fail. + * + * Returns 0 on success, -errno in error cases. + */ +int qcow2_update_header(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + QCowHeader *header; + char *buf; + size_t buflen = s->cluster_size; + int ret; + uint64_t total_size; + uint32_t refcount_table_clusters; + size_t header_length; + Qcow2UnknownHeaderExtension *uext; + + buf = qemu_blockalign(bs, buflen); + + /* Header structure */ + header = (QCowHeader*) buf; + + if (buflen < sizeof(*header)) { + ret = -ENOSPC; + goto fail; + } + + header_length = sizeof(*header) + s->unknown_header_fields_size; + total_size = bs->total_sectors * BDRV_SECTOR_SIZE; + refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3); + + *header = (QCowHeader) { + /* Version 2 fields */ + .magic = cpu_to_be32(QCOW_MAGIC), + .version = cpu_to_be32(s->qcow_version), + .backing_file_offset = 0, + .backing_file_size = 0, + .cluster_bits = cpu_to_be32(s->cluster_bits), + .size = cpu_to_be64(total_size), + .crypt_method = cpu_to_be32(s->crypt_method_header), + .l1_size = cpu_to_be32(s->l1_size), + .l1_table_offset = cpu_to_be64(s->l1_table_offset), + .refcount_table_offset = cpu_to_be64(s->refcount_table_offset), + .refcount_table_clusters = cpu_to_be32(refcount_table_clusters), + .nb_snapshots = cpu_to_be32(s->nb_snapshots), + .snapshots_offset = cpu_to_be64(s->snapshots_offset), + + /* Version 3 fields */ + .incompatible_features = cpu_to_be64(s->incompatible_features), + .compatible_features = cpu_to_be64(s->compatible_features), + .autoclear_features = cpu_to_be64(s->autoclear_features), + .refcount_order = cpu_to_be32(3 + REFCOUNT_SHIFT), + .header_length = cpu_to_be32(header_length), + }; + + /* For older versions, write a shorter header */ + switch (s->qcow_version) { + case 2: + ret = offsetof(QCowHeader, incompatible_features); + break; + case 3: + ret = sizeof(*header); + break; + default: + ret = -EINVAL; + goto fail; + } + + buf += ret; + buflen -= ret; + memset(buf, 0, buflen); + + /* Preserve any unknown field in the header */ + if (s->unknown_header_fields_size) { + if (buflen < s->unknown_header_fields_size) { + ret = -ENOSPC; + goto fail; + } + + memcpy(buf, 
s->unknown_header_fields, s->unknown_header_fields_size); + buf += s->unknown_header_fields_size; + buflen -= s->unknown_header_fields_size; + } + + /* Backing file format header extension */ + if (*bs->backing_format) { + ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BACKING_FORMAT, + bs->backing_format, strlen(bs->backing_format), + buflen); + if (ret < 0) { + goto fail; + } + + buf += ret; + buflen -= ret; + } + + /* Feature table */ + Qcow2Feature features[] = { + { + .type = QCOW2_FEAT_TYPE_INCOMPATIBLE, + .bit = QCOW2_INCOMPAT_DIRTY_BITNR, + .name = "dirty bit", + }, + { + .type = QCOW2_FEAT_TYPE_COMPATIBLE, + .bit = QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR, + .name = "lazy refcounts", + }, + }; + + ret = header_ext_add(buf, QCOW2_EXT_MAGIC_FEATURE_TABLE, + features, sizeof(features), buflen); + if (ret < 0) { + goto fail; + } + buf += ret; + buflen -= ret; + + /* Keep unknown header extensions */ + QLIST_FOREACH(uext, &s->unknown_header_ext, next) { + ret = header_ext_add(buf, uext->magic, uext->data, uext->len, buflen); + if (ret < 0) { + goto fail; + } + + buf += ret; + buflen -= ret; + } + + /* End of header extensions */ + ret = header_ext_add(buf, QCOW2_EXT_MAGIC_END, NULL, 0, buflen); + if (ret < 0) { + goto fail; + } + + buf += ret; + buflen -= ret; + + /* Backing file name */ + if (*bs->backing_file) { + size_t backing_file_len = strlen(bs->backing_file); + + if (buflen < backing_file_len) { + ret = -ENOSPC; + goto fail; + } + + /* Using strncpy is ok here, since buf is not NUL-terminated. */ + strncpy(buf, bs->backing_file, buflen); + + header->backing_file_offset = cpu_to_be64(buf - ((char*) header)); + header->backing_file_size = cpu_to_be32(backing_file_len); + } + + /* Write the new header */ + ret = bdrv_pwrite(bs->file, 0, header, s->cluster_size); + if (ret < 0) { + goto fail; + } + + ret = 0; +fail: + qemu_vfree(header); + return ret; +} + +static int qcow2_change_backing_file(BlockDriverState *bs, + const char *backing_file, const char *backing_fmt) +{ + pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: ""); + pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: ""); + + return qcow2_update_header(bs); +} + +static int preallocate(BlockDriverState *bs) +{ + uint64_t nb_sectors; + uint64_t offset; + uint64_t host_offset = 0; + int num; + int ret; + QCowL2Meta *meta; + + nb_sectors = bdrv_getlength(bs) >> 9; + offset = 0; + + while (nb_sectors) { + num = MIN(nb_sectors, INT_MAX >> 9); + ret = qcow2_alloc_cluster_offset(bs, offset, 0, num, &num, + &host_offset, &meta); + if (ret < 0) { + return ret; + } + + ret = qcow2_alloc_cluster_link_l2(bs, meta); + if (ret < 0) { + qcow2_free_any_clusters(bs, meta->alloc_offset, meta->nb_clusters, + QCOW2_DISCARD_NEVER); + return ret; + } + + /* There are no dependent requests, but we need to remove our request + * from the list of in-flight requests */ + if (meta != NULL) { + QLIST_REMOVE(meta, next_in_flight); + } + + /* TODO Preallocate data if requested */ + + nb_sectors -= num; + offset += num << 9; + } + + /* + * It is expected that the image file is large enough to actually contain + * all of the allocated clusters (otherwise we get failing reads after + * EOF). Extend the image to the last allocated sector. 
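+ * Writing one zeroed sector at the last allocated position below is
+ * enough to make the underlying file grow to cover every preallocated
+ * cluster.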
+ */ + if (host_offset != 0) { + uint8_t buf[512]; + memset(buf, 0, 512); + ret = bdrv_write(bs->file, (host_offset >> 9) + num - 1, buf, 1); + if (ret < 0) { + return ret; + } + } + + return 0; +} + +static int qcow2_create2(const char *filename, int64_t total_size, + const char *backing_file, const char *backing_format, + int flags, size_t cluster_size, int prealloc, + QEMUOptionParameter *options, int version) +{ + /* Calculate cluster_bits */ + int cluster_bits; + cluster_bits = ffs(cluster_size) - 1; + if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS || + (1 << cluster_bits) != cluster_size) + { + error_report( + "Cluster size must be a power of two between %d and %dk", + 1 << MIN_CLUSTER_BITS, 1 << (MAX_CLUSTER_BITS - 10)); + return -EINVAL; + } + + /* + * Open the image file and write a minimal qcow2 header. + * + * We keep things simple and start with a zero-sized image. We also + * do without refcount blocks or a L1 table for now. We'll fix the + * inconsistency later. + * + * We do need a refcount table because growing the refcount table means + * allocating two new refcount blocks - the seconds of which would be at + * 2 GB for 64k clusters, and we don't want to have a 2 GB initial file + * size for any qcow2 image. + */ + BlockDriverState* bs; + QCowHeader header; + uint8_t* refcount_table; + int ret; + + ret = bdrv_create_file(filename, options); + if (ret < 0) { + return ret; + } + + ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR); + if (ret < 0) { + return ret; + } + + /* Write the header */ + memset(&header, 0, sizeof(header)); + header.magic = cpu_to_be32(QCOW_MAGIC); + header.version = cpu_to_be32(version); + header.cluster_bits = cpu_to_be32(cluster_bits); + header.size = cpu_to_be64(0); + header.l1_table_offset = cpu_to_be64(0); + header.l1_size = cpu_to_be32(0); + header.refcount_table_offset = cpu_to_be64(cluster_size); + header.refcount_table_clusters = cpu_to_be32(1); + header.refcount_order = cpu_to_be32(3 + REFCOUNT_SHIFT); + header.header_length = cpu_to_be32(sizeof(header)); + + if (flags & BLOCK_FLAG_ENCRYPT) { + header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES); + } else { + header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE); + } + + if (flags & BLOCK_FLAG_LAZY_REFCOUNTS) { + header.compatible_features |= + cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS); + } + + ret = bdrv_pwrite(bs, 0, &header, sizeof(header)); + if (ret < 0) { + goto out; + } + + /* Write an empty refcount table */ + refcount_table = g_malloc0(cluster_size); + ret = bdrv_pwrite(bs, cluster_size, refcount_table, cluster_size); + g_free(refcount_table); + + if (ret < 0) { + goto out; + } + + bdrv_close(bs); + + /* + * And now open the image and make it consistent first (i.e. increase the + * refcount of the cluster that is occupied by the header and the refcount + * table) + */ + BlockDriver* drv = bdrv_find_format("qcow2"); + assert(drv != NULL); + ret = bdrv_open(bs, filename, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, drv); + if (ret < 0) { + goto out; + } + + ret = qcow2_alloc_clusters(bs, 2 * cluster_size); + if (ret < 0) { + goto out; + + } else if (ret != 0) { + error_report("Huh, first cluster in empty image is already in use?"); + abort(); + } + + /* Okay, now that we have a valid image, let's give it the right size */ + ret = bdrv_truncate(bs, total_size * BDRV_SECTOR_SIZE); + if (ret < 0) { + goto out; + } + + /* Want a backing file? 
There you go.*/ + if (backing_file) { + ret = bdrv_change_backing_file(bs, backing_file, backing_format); + if (ret < 0) { + goto out; + } + } + + /* And if we're supposed to preallocate metadata, do that now */ + if (prealloc) { + BDRVQcowState *s = bs->opaque; + qemu_co_mutex_lock(&s->lock); + ret = preallocate(bs); + qemu_co_mutex_unlock(&s->lock); + if (ret < 0) { + goto out; + } + } + + ret = 0; +out: + bdrv_delete(bs); + return ret; +} + +static int qcow2_create(const char *filename, QEMUOptionParameter *options) +{ + const char *backing_file = NULL; + const char *backing_fmt = NULL; + uint64_t sectors = 0; + int flags = 0; + size_t cluster_size = DEFAULT_CLUSTER_SIZE; + int prealloc = 0; + int version = 2; + + /* Read out options */ + while (options && options->name) { + if (!strcmp(options->name, BLOCK_OPT_SIZE)) { + sectors = options->value.n / 512; + } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) { + backing_file = options->value.s; + } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FMT)) { + backing_fmt = options->value.s; + } else if (!strcmp(options->name, BLOCK_OPT_ENCRYPT)) { + flags |= options->value.n ? BLOCK_FLAG_ENCRYPT : 0; + } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) { + if (options->value.n) { + cluster_size = options->value.n; + } + } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) { + if (!options->value.s || !strcmp(options->value.s, "off")) { + prealloc = 0; + } else if (!strcmp(options->value.s, "metadata")) { + prealloc = 1; + } else { + fprintf(stderr, "Invalid preallocation mode: '%s'\n", + options->value.s); + return -EINVAL; + } + } else if (!strcmp(options->name, BLOCK_OPT_COMPAT_LEVEL)) { + if (!options->value.s || !strcmp(options->value.s, "0.10")) { + version = 2; + } else if (!strcmp(options->value.s, "1.1")) { + version = 3; + } else { + fprintf(stderr, "Invalid compatibility level: '%s'\n", + options->value.s); + return -EINVAL; + } + } else if (!strcmp(options->name, BLOCK_OPT_LAZY_REFCOUNTS)) { + flags |= options->value.n ? 
BLOCK_FLAG_LAZY_REFCOUNTS : 0; + } + options++; + } + + if (backing_file && prealloc) { + fprintf(stderr, "Backing file and preallocation cannot be used at " + "the same time\n"); + return -EINVAL; + } + + if (version < 3 && (flags & BLOCK_FLAG_LAZY_REFCOUNTS)) { + fprintf(stderr, "Lazy refcounts only supported with compatibility " + "level 1.1 and above (use compat=1.1 or greater)\n"); + return -EINVAL; + } + + return qcow2_create2(filename, sectors, backing_file, backing_fmt, flags, + cluster_size, prealloc, options, version); +} + +static int qcow2_make_empty(BlockDriverState *bs) +{ +#if 0 + /* XXX: not correct */ + BDRVQcowState *s = bs->opaque; + uint32_t l1_length = s->l1_size * sizeof(uint64_t); + int ret; + + memset(s->l1_table, 0, l1_length); + if (bdrv_pwrite(bs->file, s->l1_table_offset, s->l1_table, l1_length) < 0) + return -1; + ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length); + if (ret < 0) + return ret; + + l2_cache_reset(bs); +#endif + return 0; +} + +static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors) +{ + int ret; + BDRVQcowState *s = bs->opaque; + + /* Emulate misaligned zero writes */ + if (sector_num % s->cluster_sectors || nb_sectors % s->cluster_sectors) { + return -ENOTSUP; + } + + /* Whatever is left can use real zero clusters */ + qemu_co_mutex_lock(&s->lock); + ret = qcow2_zero_clusters(bs, sector_num << BDRV_SECTOR_BITS, + nb_sectors); + qemu_co_mutex_unlock(&s->lock); + + return ret; +} + +static coroutine_fn int qcow2_co_discard(BlockDriverState *bs, + int64_t sector_num, int nb_sectors) +{ + int ret; + BDRVQcowState *s = bs->opaque; + + qemu_co_mutex_lock(&s->lock); + ret = qcow2_discard_clusters(bs, sector_num << BDRV_SECTOR_BITS, + nb_sectors); + qemu_co_mutex_unlock(&s->lock); + return ret; +} + +static int qcow2_truncate(BlockDriverState *bs, int64_t offset) +{ + BDRVQcowState *s = bs->opaque; + int64_t new_l1_size; + int ret; + + if (offset & 511) { + error_report("The new size must be a multiple of 512"); + return -EINVAL; + } + + /* cannot proceed if image has snapshots */ + if (s->nb_snapshots) { + error_report("Can't resize an image which has snapshots"); + return -ENOTSUP; + } + + /* shrinking is currently not supported */ + if (offset < bs->total_sectors * 512) { + error_report("qcow2 doesn't support shrinking images yet"); + return -ENOTSUP; + } + + new_l1_size = size_to_l1(s, offset); + ret = qcow2_grow_l1_table(bs, new_l1_size, true); + if (ret < 0) { + return ret; + } + + /* write updated header.size */ + offset = cpu_to_be64(offset); + ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, size), + &offset, sizeof(uint64_t)); + if (ret < 0) { + return ret; + } + + s->l1_vm_state_index = new_l1_size; + return 0; +} + +/* XXX: put compressed sectors first, then all the cluster aligned + tables to avoid losing bytes in alignment */ +static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + BDRVQcowState *s = bs->opaque; + z_stream strm; + int ret, out_len; + uint8_t *out_buf; + uint64_t cluster_offset; + + if (nb_sectors == 0) { + /* align end of file to a sector boundary to ease reading with + sector based I/Os */ + cluster_offset = bdrv_getlength(bs->file); + cluster_offset = (cluster_offset + 511) & ~511; + bdrv_truncate(bs->file, cluster_offset); + return 0; + } + + if (nb_sectors != s->cluster_sectors) { + ret = -EINVAL; + + /* Zero-pad last write if image size is not cluster aligned */ + if (sector_num + 
nb_sectors == bs->total_sectors && + nb_sectors < s->cluster_sectors) { + uint8_t *pad_buf = qemu_blockalign(bs, s->cluster_size); + memset(pad_buf, 0, s->cluster_size); + memcpy(pad_buf, buf, nb_sectors * BDRV_SECTOR_SIZE); + ret = qcow2_write_compressed(bs, sector_num, + pad_buf, s->cluster_sectors); + qemu_vfree(pad_buf); + } + return ret; + } + + out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128); + + /* best compression, small window, no zlib header */ + memset(&strm, 0, sizeof(strm)); + ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION, + Z_DEFLATED, -12, + 9, Z_DEFAULT_STRATEGY); + if (ret != 0) { + ret = -EINVAL; + goto fail; + } + + strm.avail_in = s->cluster_size; + strm.next_in = (uint8_t *)buf; + strm.avail_out = s->cluster_size; + strm.next_out = out_buf; + + ret = deflate(&strm, Z_FINISH); + if (ret != Z_STREAM_END && ret != Z_OK) { + deflateEnd(&strm); + ret = -EINVAL; + goto fail; + } + out_len = strm.next_out - out_buf; + + deflateEnd(&strm); + + if (ret != Z_STREAM_END || out_len >= s->cluster_size) { + /* could not compress: write normal cluster */ + ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors); + if (ret < 0) { + goto fail; + } + } else { + cluster_offset = qcow2_alloc_compressed_cluster_offset(bs, + sector_num << 9, out_len); + if (!cluster_offset) { + ret = -EIO; + goto fail; + } + cluster_offset &= s->cluster_offset_mask; + BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED); + ret = bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len); + if (ret < 0) { + goto fail; + } + } + + ret = 0; +fail: + g_free(out_buf); + return ret; +} + +static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + int ret; + + qemu_co_mutex_lock(&s->lock); + ret = qcow2_cache_flush(bs, s->l2_table_cache); + if (ret < 0) { + qemu_co_mutex_unlock(&s->lock); + return ret; + } + + if (qcow2_need_accurate_refcounts(s)) { + ret = qcow2_cache_flush(bs, s->refcount_block_cache); + if (ret < 0) { + qemu_co_mutex_unlock(&s->lock); + return ret; + } + } + qemu_co_mutex_unlock(&s->lock); + + return 0; +} + +static int64_t qcow2_vm_state_offset(BDRVQcowState *s) +{ + return (int64_t)s->l1_vm_state_index << (s->cluster_bits + s->l2_bits); +} + +static int qcow2_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + BDRVQcowState *s = bs->opaque; + bdi->cluster_size = s->cluster_size; + bdi->vm_state_offset = qcow2_vm_state_offset(s); + return 0; +} + +#if 0 +static void dump_refcounts(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + int64_t nb_clusters, k, k1, size; + int refcount; + + size = bdrv_getlength(bs->file); + nb_clusters = size_to_clusters(s, size); + for(k = 0; k < nb_clusters;) { + k1 = k; + refcount = get_refcount(bs, k); + k++; + while (k < nb_clusters && get_refcount(bs, k) == refcount) + k++; + printf("%" PRId64 ": refcount=%d nb=%" PRId64 "\n", k, refcount, + k - k1); + } +} +#endif + +static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, + int64_t pos) +{ + BDRVQcowState *s = bs->opaque; + int growable = bs->growable; + int ret; + + BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE); + bs->growable = 1; + ret = bdrv_pwritev(bs, qcow2_vm_state_offset(s) + pos, qiov); + bs->growable = growable; + + return ret; +} + +static int qcow2_load_vmstate(BlockDriverState *bs, uint8_t *buf, + int64_t pos, int size) +{ + BDRVQcowState *s = bs->opaque; + int growable = bs->growable; + int ret; + + BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD); + bs->growable = 1; + ret = bdrv_pread(bs, 
qcow2_vm_state_offset(s) + pos, buf, size); + bs->growable = growable; + + return ret; +} + +static QEMUOptionParameter qcow2_create_options[] = { + { + .name = BLOCK_OPT_SIZE, + .type = OPT_SIZE, + .help = "Virtual disk size" + }, + { + .name = BLOCK_OPT_COMPAT_LEVEL, + .type = OPT_STRING, + .help = "Compatibility level (0.10 or 1.1)" + }, + { + .name = BLOCK_OPT_BACKING_FILE, + .type = OPT_STRING, + .help = "File name of a base image" + }, + { + .name = BLOCK_OPT_BACKING_FMT, + .type = OPT_STRING, + .help = "Image format of the base image" + }, + { + .name = BLOCK_OPT_ENCRYPT, + .type = OPT_FLAG, + .help = "Encrypt the image" + }, + { + .name = BLOCK_OPT_CLUSTER_SIZE, + .type = OPT_SIZE, + .help = "qcow2 cluster size", + .value = { .n = DEFAULT_CLUSTER_SIZE }, + }, + { + .name = BLOCK_OPT_PREALLOC, + .type = OPT_STRING, + .help = "Preallocation mode (allowed values: off, metadata)" + }, + { + .name = BLOCK_OPT_LAZY_REFCOUNTS, + .type = OPT_FLAG, + .help = "Postpone refcount updates", + }, + { NULL } +}; + +static BlockDriver bdrv_qcow2 = { + .format_name = "qcow2", + .instance_size = sizeof(BDRVQcowState), + .bdrv_probe = qcow2_probe, + .bdrv_open = qcow2_open, + .bdrv_close = qcow2_close, + .bdrv_reopen_prepare = qcow2_reopen_prepare, + .bdrv_create = qcow2_create, + .bdrv_has_zero_init = bdrv_has_zero_init_1, + .bdrv_co_is_allocated = qcow2_co_is_allocated, + .bdrv_set_key = qcow2_set_key, + .bdrv_make_empty = qcow2_make_empty, + + .bdrv_co_readv = qcow2_co_readv, + .bdrv_co_writev = qcow2_co_writev, + .bdrv_co_flush_to_os = qcow2_co_flush_to_os, + + .bdrv_co_write_zeroes = qcow2_co_write_zeroes, + .bdrv_co_discard = qcow2_co_discard, + .bdrv_truncate = qcow2_truncate, + .bdrv_write_compressed = qcow2_write_compressed, + + .bdrv_snapshot_create = qcow2_snapshot_create, + .bdrv_snapshot_goto = qcow2_snapshot_goto, + .bdrv_snapshot_delete = qcow2_snapshot_delete, + .bdrv_snapshot_list = qcow2_snapshot_list, + .bdrv_snapshot_load_tmp = qcow2_snapshot_load_tmp, + .bdrv_get_info = qcow2_get_info, + + .bdrv_save_vmstate = qcow2_save_vmstate, + .bdrv_load_vmstate = qcow2_load_vmstate, + + .bdrv_change_backing_file = qcow2_change_backing_file, + + .bdrv_invalidate_cache = qcow2_invalidate_cache, + + .create_options = qcow2_create_options, + .bdrv_check = qcow2_check, +}; + +static void bdrv_qcow2_init(void) +{ + bdrv_register(&bdrv_qcow2); +} + +block_init(bdrv_qcow2_init); diff --git a/contrib/qemu/block/qcow2.h b/contrib/qemu/block/qcow2.h new file mode 100644 index 00000000000..3b2d5cda71f --- /dev/null +++ b/contrib/qemu/block/qcow2.h @@ -0,0 +1,437 @@ +/* + * Block driver for the QCOW version 2 format + * + * Copyright (c) 2004-2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef BLOCK_QCOW2_H +#define BLOCK_QCOW2_H + +#include "qemu/aes.h" +#include "block/coroutine.h" + +//#define DEBUG_ALLOC +//#define DEBUG_ALLOC2 +//#define DEBUG_EXT + +#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb) + +#define QCOW_CRYPT_NONE 0 +#define QCOW_CRYPT_AES 1 + +#define QCOW_MAX_CRYPT_CLUSTERS 32 + +/* indicate that the refcount of the referenced cluster is exactly one. */ +#define QCOW_OFLAG_COPIED (1LL << 63) +/* indicate that the cluster is compressed (they never have the copied flag) */ +#define QCOW_OFLAG_COMPRESSED (1LL << 62) +/* The cluster reads as all zeros */ +#define QCOW_OFLAG_ZERO (1LL << 0) + +#define REFCOUNT_SHIFT 1 /* refcount size is 2 bytes */ + +#define MIN_CLUSTER_BITS 9 +#define MAX_CLUSTER_BITS 21 + +#define L2_CACHE_SIZE 16 + +/* Must be at least 4 to cover all cases of refcount table growth */ +#define REFCOUNT_CACHE_SIZE 4 + +#define DEFAULT_CLUSTER_SIZE 65536 + + +#define QCOW2_OPT_LAZY_REFCOUNTS "lazy_refcounts" +#define QCOW2_OPT_DISCARD_REQUEST "pass_discard_request" +#define QCOW2_OPT_DISCARD_SNAPSHOT "pass_discard_snapshot" +#define QCOW2_OPT_DISCARD_OTHER "pass_discard_other" + +typedef struct QCowHeader { + uint32_t magic; + uint32_t version; + uint64_t backing_file_offset; + uint32_t backing_file_size; + uint32_t cluster_bits; + uint64_t size; /* in bytes */ + uint32_t crypt_method; + uint32_t l1_size; /* XXX: save number of clusters instead ? */ + uint64_t l1_table_offset; + uint64_t refcount_table_offset; + uint32_t refcount_table_clusters; + uint32_t nb_snapshots; + uint64_t snapshots_offset; + + /* The following fields are only valid for version >= 3 */ + uint64_t incompatible_features; + uint64_t compatible_features; + uint64_t autoclear_features; + + uint32_t refcount_order; + uint32_t header_length; +} QCowHeader; + +typedef struct QCowSnapshot { + uint64_t l1_table_offset; + uint32_t l1_size; + char *id_str; + char *name; + uint64_t disk_size; + uint64_t vm_state_size; + uint32_t date_sec; + uint32_t date_nsec; + uint64_t vm_clock_nsec; +} QCowSnapshot; + +struct Qcow2Cache; +typedef struct Qcow2Cache Qcow2Cache; + +typedef struct Qcow2UnknownHeaderExtension { + uint32_t magic; + uint32_t len; + QLIST_ENTRY(Qcow2UnknownHeaderExtension) next; + uint8_t data[]; +} Qcow2UnknownHeaderExtension; + +enum { + QCOW2_FEAT_TYPE_INCOMPATIBLE = 0, + QCOW2_FEAT_TYPE_COMPATIBLE = 1, + QCOW2_FEAT_TYPE_AUTOCLEAR = 2, +}; + +/* Incompatible feature bits */ +enum { + QCOW2_INCOMPAT_DIRTY_BITNR = 0, + QCOW2_INCOMPAT_DIRTY = 1 << QCOW2_INCOMPAT_DIRTY_BITNR, + + QCOW2_INCOMPAT_MASK = QCOW2_INCOMPAT_DIRTY, +}; + +/* Compatible feature bits */ +enum { + QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR = 0, + QCOW2_COMPAT_LAZY_REFCOUNTS = 1 << QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR, + + QCOW2_COMPAT_FEAT_MASK = QCOW2_COMPAT_LAZY_REFCOUNTS, +}; + +enum qcow2_discard_type { + QCOW2_DISCARD_NEVER = 0, + QCOW2_DISCARD_ALWAYS, + QCOW2_DISCARD_REQUEST, + QCOW2_DISCARD_SNAPSHOT, + QCOW2_DISCARD_OTHER, + QCOW2_DISCARD_MAX +}; + +typedef struct Qcow2Feature { + uint8_t type; + uint8_t bit; + char name[46]; +} QEMU_PACKED Qcow2Feature; + +typedef struct Qcow2DiscardRegion { + BlockDriverState *bs; + uint64_t offset; + uint64_t bytes; + QTAILQ_ENTRY(Qcow2DiscardRegion) next; +} 
Qcow2DiscardRegion; + +typedef struct BDRVQcowState { + int cluster_bits; + int cluster_size; + int cluster_sectors; + int l2_bits; + int l2_size; + int l1_size; + int l1_vm_state_index; + int csize_shift; + int csize_mask; + uint64_t cluster_offset_mask; + uint64_t l1_table_offset; + uint64_t *l1_table; + + Qcow2Cache* l2_table_cache; + Qcow2Cache* refcount_block_cache; + + uint8_t *cluster_cache; + uint8_t *cluster_data; + uint64_t cluster_cache_offset; + QLIST_HEAD(QCowClusterAlloc, QCowL2Meta) cluster_allocs; + + uint64_t *refcount_table; + uint64_t refcount_table_offset; + uint32_t refcount_table_size; + int64_t free_cluster_index; + int64_t free_byte_offset; + + CoMutex lock; + + uint32_t crypt_method; /* current crypt method, 0 if no key yet */ + uint32_t crypt_method_header; + AES_KEY aes_encrypt_key; + AES_KEY aes_decrypt_key; + uint64_t snapshots_offset; + int snapshots_size; + int nb_snapshots; + QCowSnapshot *snapshots; + + int flags; + int qcow_version; + bool use_lazy_refcounts; + + bool discard_passthrough[QCOW2_DISCARD_MAX]; + + uint64_t incompatible_features; + uint64_t compatible_features; + uint64_t autoclear_features; + + size_t unknown_header_fields_size; + void* unknown_header_fields; + QLIST_HEAD(, Qcow2UnknownHeaderExtension) unknown_header_ext; + QTAILQ_HEAD (, Qcow2DiscardRegion) discards; + bool cache_discards; +} BDRVQcowState; + +/* XXX: use std qcow open function ? */ +typedef struct QCowCreateState { + int cluster_size; + int cluster_bits; + uint16_t *refcount_block; + uint64_t *refcount_table; + int64_t l1_table_offset; + int64_t refcount_table_offset; + int64_t refcount_block_offset; +} QCowCreateState; + +struct QCowAIOCB; + +typedef struct Qcow2COWRegion { + /** + * Offset of the COW region in bytes from the start of the first cluster + * touched by the request. + */ + uint64_t offset; + + /** Number of sectors to copy */ + int nb_sectors; +} Qcow2COWRegion; + +/** + * Describes an in-flight (part of a) write request that writes to clusters + * that are not referenced in their L2 table yet. + */ +typedef struct QCowL2Meta +{ + /** Guest offset of the first newly allocated cluster */ + uint64_t offset; + + /** Host offset of the first newly allocated cluster */ + uint64_t alloc_offset; + + /** + * Number of sectors from the start of the first allocated cluster to + * the end of the (possibly shortened) request + */ + int nb_available; + + /** Number of newly allocated clusters */ + int nb_clusters; + + /** + * Requests that overlap with this allocation and wait to be restarted + * when the allocating request has completed. + */ + CoQueue dependent_requests; + + /** + * The COW Region between the start of the first allocated cluster and the + * area the guest actually writes to. + */ + Qcow2COWRegion cow_start; + + /** + * The COW Region between the area the guest actually writes to and the + * end of the last allocated cluster. 
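+ *
+ * As an illustration, for a write that starts and ends mid-cluster the
+ * newly allocated clusters are laid out roughly as
+ *
+ *   |<- cow_start ->|<---- guest write ---->|<- cow_end ->|
+ *   first cluster                             end of last cluster
+ *
+ * Both COW regions are filled from the previously visible data (the old
+ * clusters or the backing file) before the new clusters are linked into
+ * the L2 table, so a partial-cluster write does not expose stale bytes.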
+ */ + Qcow2COWRegion cow_end; + + /** Pointer to next L2Meta of the same write request */ + struct QCowL2Meta *next; + + QLIST_ENTRY(QCowL2Meta) next_in_flight; +} QCowL2Meta; + +enum { + QCOW2_CLUSTER_UNALLOCATED, + QCOW2_CLUSTER_NORMAL, + QCOW2_CLUSTER_COMPRESSED, + QCOW2_CLUSTER_ZERO +}; + +#define L1E_OFFSET_MASK 0x00ffffffffffff00ULL +#define L2E_OFFSET_MASK 0x00ffffffffffff00ULL +#define L2E_COMPRESSED_OFFSET_SIZE_MASK 0x3fffffffffffffffULL + +#define REFT_OFFSET_MASK 0xffffffffffffff00ULL + +static inline int64_t start_of_cluster(BDRVQcowState *s, int64_t offset) +{ + return offset & ~(s->cluster_size - 1); +} + +static inline int64_t offset_into_cluster(BDRVQcowState *s, int64_t offset) +{ + return offset & (s->cluster_size - 1); +} + +static inline int size_to_clusters(BDRVQcowState *s, int64_t size) +{ + return (size + (s->cluster_size - 1)) >> s->cluster_bits; +} + +static inline int64_t size_to_l1(BDRVQcowState *s, int64_t size) +{ + int shift = s->cluster_bits + s->l2_bits; + return (size + (1ULL << shift) - 1) >> shift; +} + +static inline int offset_to_l2_index(BDRVQcowState *s, int64_t offset) +{ + return (offset >> s->cluster_bits) & (s->l2_size - 1); +} + +static inline int64_t align_offset(int64_t offset, int n) +{ + offset = (offset + n - 1) & ~(n - 1); + return offset; +} + +static inline int qcow2_get_cluster_type(uint64_t l2_entry) +{ + if (l2_entry & QCOW_OFLAG_COMPRESSED) { + return QCOW2_CLUSTER_COMPRESSED; + } else if (l2_entry & QCOW_OFLAG_ZERO) { + return QCOW2_CLUSTER_ZERO; + } else if (!(l2_entry & L2E_OFFSET_MASK)) { + return QCOW2_CLUSTER_UNALLOCATED; + } else { + return QCOW2_CLUSTER_NORMAL; + } +} + +/* Check whether refcounts are eager or lazy */ +static inline bool qcow2_need_accurate_refcounts(BDRVQcowState *s) +{ + return !(s->incompatible_features & QCOW2_INCOMPAT_DIRTY); +} + +static inline uint64_t l2meta_cow_start(QCowL2Meta *m) +{ + return m->offset + m->cow_start.offset; +} + +static inline uint64_t l2meta_cow_end(QCowL2Meta *m) +{ + return m->offset + m->cow_end.offset + + (m->cow_end.nb_sectors << BDRV_SECTOR_BITS); +} + +// FIXME Need qcow2_ prefix to global functions + +/* qcow2.c functions */ +int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov, + int64_t sector_num, int nb_sectors); + +int qcow2_mark_dirty(BlockDriverState *bs); +int qcow2_update_header(BlockDriverState *bs); + +/* qcow2-refcount.c functions */ +int qcow2_refcount_init(BlockDriverState *bs); +void qcow2_refcount_close(BlockDriverState *bs); + +int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size); +int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, + int nb_clusters); +int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size); +void qcow2_free_clusters(BlockDriverState *bs, + int64_t offset, int64_t size, + enum qcow2_discard_type type); +void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry, + int nb_clusters, enum qcow2_discard_type type); + +int qcow2_update_snapshot_refcount(BlockDriverState *bs, + int64_t l1_table_offset, int l1_size, int addend); + +int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, + BdrvCheckMode fix); + +void qcow2_process_discards(BlockDriverState *bs, int ret); + +/* qcow2-cluster.c functions */ +int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, + bool exact_size); +void qcow2_l2_cache_reset(BlockDriverState *bs); +int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset); +void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num, + 
uint8_t *out_buf, const uint8_t *in_buf, + int nb_sectors, int enc, + const AES_KEY *key); + +int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, + int *num, uint64_t *cluster_offset); +int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset, + int n_start, int n_end, int *num, uint64_t *host_offset, QCowL2Meta **m); +uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, + uint64_t offset, + int compressed_size); + +int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m); +int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset, + int nb_sectors); +int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors); + +/* qcow2-snapshot.c functions */ +int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info); +int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id); +int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id); +int qcow2_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab); +int qcow2_snapshot_load_tmp(BlockDriverState *bs, const char *snapshot_name); + +void qcow2_free_snapshots(BlockDriverState *bs); +int qcow2_read_snapshots(BlockDriverState *bs); + +/* qcow2-cache.c functions */ +Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables); +int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c); + +void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table); +int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c); +int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c, + Qcow2Cache *dependency); +void qcow2_cache_depends_on_flush(Qcow2Cache *c); + +int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, + void **table); +int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, + void **table); +int qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table); + +#endif diff --git a/contrib/qemu/block/qed-check.c b/contrib/qemu/block/qed-check.c new file mode 100644 index 00000000000..b473dcd61f6 --- /dev/null +++ b/contrib/qemu/block/qed-check.c @@ -0,0 +1,248 @@ +/* + * QEMU Enhanced Disk Format Consistency Check + * + * Copyright IBM, Corp. 2010 + * + * Authors: + * Stefan Hajnoczi + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. 
+ * + */ + +#include "qed.h" + +typedef struct { + BDRVQEDState *s; + BdrvCheckResult *result; + bool fix; /* whether to fix invalid offsets */ + + uint64_t nclusters; + uint32_t *used_clusters; /* referenced cluster bitmap */ + + QEDRequest request; +} QEDCheck; + +static bool qed_test_bit(uint32_t *bitmap, uint64_t n) { + return !!(bitmap[n / 32] & (1 << (n % 32))); +} + +static void qed_set_bit(uint32_t *bitmap, uint64_t n) { + bitmap[n / 32] |= 1 << (n % 32); +} + +/** + * Set bitmap bits for clusters + * + * @check: Check structure + * @offset: Starting offset in bytes + * @n: Number of clusters + */ +static bool qed_set_used_clusters(QEDCheck *check, uint64_t offset, + unsigned int n) +{ + uint64_t cluster = qed_bytes_to_clusters(check->s, offset); + unsigned int corruptions = 0; + + while (n-- != 0) { + /* Clusters should only be referenced once */ + if (qed_test_bit(check->used_clusters, cluster)) { + corruptions++; + } + + qed_set_bit(check->used_clusters, cluster); + cluster++; + } + + check->result->corruptions += corruptions; + return corruptions == 0; +} + +/** + * Check an L2 table + * + * @ret: Number of invalid cluster offsets + */ +static unsigned int qed_check_l2_table(QEDCheck *check, QEDTable *table) +{ + BDRVQEDState *s = check->s; + unsigned int i, num_invalid = 0; + uint64_t last_offset = 0; + + for (i = 0; i < s->table_nelems; i++) { + uint64_t offset = table->offsets[i]; + + if (qed_offset_is_unalloc_cluster(offset) || + qed_offset_is_zero_cluster(offset)) { + continue; + } + check->result->bfi.allocated_clusters++; + if (last_offset && (last_offset + s->header.cluster_size != offset)) { + check->result->bfi.fragmented_clusters++; + } + last_offset = offset; + + /* Detect invalid cluster offset */ + if (!qed_check_cluster_offset(s, offset)) { + if (check->fix) { + table->offsets[i] = 0; + check->result->corruptions_fixed++; + } else { + check->result->corruptions++; + } + + num_invalid++; + continue; + } + + qed_set_used_clusters(check, offset, 1); + } + + return num_invalid; +} + +/** + * Descend tables and check each cluster is referenced once only + */ +static int qed_check_l1_table(QEDCheck *check, QEDTable *table) +{ + BDRVQEDState *s = check->s; + unsigned int i, num_invalid_l1 = 0; + int ret, last_error = 0; + + /* Mark L1 table clusters used */ + qed_set_used_clusters(check, s->header.l1_table_offset, + s->header.table_size); + + for (i = 0; i < s->table_nelems; i++) { + unsigned int num_invalid_l2; + uint64_t offset = table->offsets[i]; + + if (qed_offset_is_unalloc_cluster(offset)) { + continue; + } + + /* Detect invalid L2 offset */ + if (!qed_check_table_offset(s, offset)) { + /* Clear invalid offset */ + if (check->fix) { + table->offsets[i] = 0; + check->result->corruptions_fixed++; + } else { + check->result->corruptions++; + } + + num_invalid_l1++; + continue; + } + + if (!qed_set_used_clusters(check, offset, s->header.table_size)) { + continue; /* skip an invalid table */ + } + + ret = qed_read_l2_table_sync(s, &check->request, offset); + if (ret) { + check->result->check_errors++; + last_error = ret; + continue; + } + + num_invalid_l2 = qed_check_l2_table(check, + check->request.l2_table->table); + + /* Write out fixed L2 table */ + if (num_invalid_l2 > 0 && check->fix) { + ret = qed_write_l2_table_sync(s, &check->request, 0, + s->table_nelems, false); + if (ret) { + check->result->check_errors++; + last_error = ret; + continue; + } + } + } + + /* Drop reference to final table */ + qed_unref_l2_cache_entry(check->request.l2_table); + 
check->request.l2_table = NULL; + + /* Write out fixed L1 table */ + if (num_invalid_l1 > 0 && check->fix) { + ret = qed_write_l1_table_sync(s, 0, s->table_nelems); + if (ret) { + check->result->check_errors++; + last_error = ret; + } + } + + return last_error; +} + +/** + * Check for unreferenced (leaked) clusters + */ +static void qed_check_for_leaks(QEDCheck *check) +{ + BDRVQEDState *s = check->s; + uint64_t i; + + for (i = s->header.header_size; i < check->nclusters; i++) { + if (!qed_test_bit(check->used_clusters, i)) { + check->result->leaks++; + } + } +} + +/** + * Mark an image clean once it passes check or has been repaired + */ +static void qed_check_mark_clean(BDRVQEDState *s, BdrvCheckResult *result) +{ + /* Skip if there were unfixable corruptions or I/O errors */ + if (result->corruptions > 0 || result->check_errors > 0) { + return; + } + + /* Skip if image is already marked clean */ + if (!(s->header.features & QED_F_NEED_CHECK)) { + return; + } + + /* Ensure fixes reach storage before clearing check bit */ + bdrv_flush(s->bs); + + s->header.features &= ~QED_F_NEED_CHECK; + qed_write_header_sync(s); +} + +int qed_check(BDRVQEDState *s, BdrvCheckResult *result, bool fix) +{ + QEDCheck check = { + .s = s, + .result = result, + .nclusters = qed_bytes_to_clusters(s, s->file_size), + .request = { .l2_table = NULL }, + .fix = fix, + }; + int ret; + + check.used_clusters = g_malloc0(((check.nclusters + 31) / 32) * + sizeof(check.used_clusters[0])); + + check.result->bfi.total_clusters = + (s->header.image_size + s->header.cluster_size - 1) / + s->header.cluster_size; + ret = qed_check_l1_table(&check, s->l1_table); + if (ret == 0) { + /* Only check for leaks if entire image was scanned successfully */ + qed_check_for_leaks(&check); + + if (fix) { + qed_check_mark_clean(s, result); + } + } + + g_free(check.used_clusters); + return ret; +} diff --git a/contrib/qemu/block/qed-cluster.c b/contrib/qemu/block/qed-cluster.c new file mode 100644 index 00000000000..f64b2af8f7e --- /dev/null +++ b/contrib/qemu/block/qed-cluster.c @@ -0,0 +1,165 @@ +/* + * QEMU Enhanced Disk Format Cluster functions + * + * Copyright IBM, Corp. 2010 + * + * Authors: + * Stefan Hajnoczi + * Anthony Liguori + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "qed.h" + +/** + * Count the number of contiguous data clusters + * + * @s: QED state + * @table: L2 table + * @index: First cluster index + * @n: Maximum number of clusters + * @offset: Set to first cluster offset + * + * This function scans tables for contiguous clusters. A contiguous run of + * clusters may be allocated, unallocated, or zero. 
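+ *
+ * For example, a run of allocated clusters is contiguous only while each
+ * offset is exactly cluster_size bytes past the previous one; unallocated
+ * and zero clusters are counted as runs of their own kind, so the caller
+ * always gets back a range it can handle uniformly.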
+ */ +static unsigned int qed_count_contiguous_clusters(BDRVQEDState *s, + QEDTable *table, + unsigned int index, + unsigned int n, + uint64_t *offset) +{ + unsigned int end = MIN(index + n, s->table_nelems); + uint64_t last = table->offsets[index]; + unsigned int i; + + *offset = last; + + for (i = index + 1; i < end; i++) { + if (qed_offset_is_unalloc_cluster(last)) { + /* Counting unallocated clusters */ + if (!qed_offset_is_unalloc_cluster(table->offsets[i])) { + break; + } + } else if (qed_offset_is_zero_cluster(last)) { + /* Counting zero clusters */ + if (!qed_offset_is_zero_cluster(table->offsets[i])) { + break; + } + } else { + /* Counting allocated clusters */ + if (table->offsets[i] != last + s->header.cluster_size) { + break; + } + last = table->offsets[i]; + } + } + return i - index; +} + +typedef struct { + BDRVQEDState *s; + uint64_t pos; + size_t len; + + QEDRequest *request; + + /* User callback */ + QEDFindClusterFunc *cb; + void *opaque; +} QEDFindClusterCB; + +static void qed_find_cluster_cb(void *opaque, int ret) +{ + QEDFindClusterCB *find_cluster_cb = opaque; + BDRVQEDState *s = find_cluster_cb->s; + QEDRequest *request = find_cluster_cb->request; + uint64_t offset = 0; + size_t len = 0; + unsigned int index; + unsigned int n; + + if (ret) { + goto out; + } + + index = qed_l2_index(s, find_cluster_cb->pos); + n = qed_bytes_to_clusters(s, + qed_offset_into_cluster(s, find_cluster_cb->pos) + + find_cluster_cb->len); + n = qed_count_contiguous_clusters(s, request->l2_table->table, + index, n, &offset); + + if (qed_offset_is_unalloc_cluster(offset)) { + ret = QED_CLUSTER_L2; + } else if (qed_offset_is_zero_cluster(offset)) { + ret = QED_CLUSTER_ZERO; + } else if (qed_check_cluster_offset(s, offset)) { + ret = QED_CLUSTER_FOUND; + } else { + ret = -EINVAL; + } + + len = MIN(find_cluster_cb->len, n * s->header.cluster_size - + qed_offset_into_cluster(s, find_cluster_cb->pos)); + +out: + find_cluster_cb->cb(find_cluster_cb->opaque, ret, offset, len); + g_free(find_cluster_cb); +} + +/** + * Find the offset of a data cluster + * + * @s: QED state + * @request: L2 cache entry + * @pos: Byte position in device + * @len: Number of bytes + * @cb: Completion function + * @opaque: User data for completion function + * + * This function translates a position in the block device to an offset in the + * image file. It invokes the cb completion callback to report back the + * translated offset or unallocated range in the image file. + * + * If the L2 table exists, request->l2_table points to the L2 table cache entry + * and the caller must free the reference when they are finished. The cache + * entry is exposed in this way to avoid callers having to read the L2 table + * again later during request processing. If request->l2_table is non-NULL it + * will be unreferenced before taking on the new cache entry. + */ +void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos, + size_t len, QEDFindClusterFunc *cb, void *opaque) +{ + QEDFindClusterCB *find_cluster_cb; + uint64_t l2_offset; + + /* Limit length to L2 boundary. Requests are broken up at the L2 boundary + * so that a request acts on one L2 table at a time. 
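+ *
+ * Each L1 entry covers (1 << s->l1_shift) bytes, so the MIN() below clamps
+ * len to the bytes remaining between pos and the end of the L2 table that
+ * contains pos.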
+ */ + len = MIN(len, (((pos >> s->l1_shift) + 1) << s->l1_shift) - pos); + + l2_offset = s->l1_table->offsets[qed_l1_index(s, pos)]; + if (qed_offset_is_unalloc_cluster(l2_offset)) { + cb(opaque, QED_CLUSTER_L1, 0, len); + return; + } + if (!qed_check_table_offset(s, l2_offset)) { + cb(opaque, -EINVAL, 0, 0); + return; + } + + find_cluster_cb = g_malloc(sizeof(*find_cluster_cb)); + find_cluster_cb->s = s; + find_cluster_cb->pos = pos; + find_cluster_cb->len = len; + find_cluster_cb->cb = cb; + find_cluster_cb->opaque = opaque; + find_cluster_cb->request = request; + + qed_read_l2_table(s, request, l2_offset, + qed_find_cluster_cb, find_cluster_cb); +} diff --git a/contrib/qemu/block/qed-gencb.c b/contrib/qemu/block/qed-gencb.c new file mode 100644 index 00000000000..7d7ac1ffc8e --- /dev/null +++ b/contrib/qemu/block/qed-gencb.c @@ -0,0 +1,32 @@ +/* + * QEMU Enhanced Disk Format + * + * Copyright IBM, Corp. 2010 + * + * Authors: + * Stefan Hajnoczi + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "qed.h" + +void *gencb_alloc(size_t len, BlockDriverCompletionFunc *cb, void *opaque) +{ + GenericCB *gencb = g_malloc(len); + gencb->cb = cb; + gencb->opaque = opaque; + return gencb; +} + +void gencb_complete(void *opaque, int ret) +{ + GenericCB *gencb = opaque; + BlockDriverCompletionFunc *cb = gencb->cb; + void *user_opaque = gencb->opaque; + + g_free(gencb); + cb(user_opaque, ret); +} diff --git a/contrib/qemu/block/qed-l2-cache.c b/contrib/qemu/block/qed-l2-cache.c new file mode 100644 index 00000000000..e9b2aae44d9 --- /dev/null +++ b/contrib/qemu/block/qed-l2-cache.c @@ -0,0 +1,187 @@ +/* + * QEMU Enhanced Disk Format L2 Cache + * + * Copyright IBM, Corp. 2010 + * + * Authors: + * Anthony Liguori + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +/* + * L2 table cache usage is as follows: + * + * An open image has one L2 table cache that is used to avoid accessing the + * image file for recently referenced L2 tables. + * + * Cluster offset lookup translates the logical offset within the block device + * to a cluster offset within the image file. This is done by indexing into + * the L1 and L2 tables which store cluster offsets. It is here where the L2 + * table cache serves up recently referenced L2 tables. + * + * If there is a cache miss, that L2 table is read from the image file and + * committed to the cache. Subsequent accesses to that L2 table will be served + * from the cache until the table is evicted from the cache. + * + * L2 tables are also committed to the cache when new L2 tables are allocated + * in the image file. Since the L2 table cache is write-through, the new L2 + * table is first written out to the image file and then committed to the + * cache. + * + * Multiple I/O requests may be using an L2 table cache entry at any given + * time. That means an entry may be in use across several requests and + * reference counting is needed to free the entry at the correct time. In + * particular, an entry evicted from the cache will only be freed once all + * references are dropped. + * + * An in-flight I/O request will hold a reference to a L2 table cache entry for + * the period during which it needs to access the L2 table. This includes + * cluster offset lookup, L2 table allocation, and L2 table update when a new + * data cluster has been allocated. 
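+ *
+ * A typical lookup therefore pairs qed_find_l2_cache_entry(), which takes
+ * a reference on a cache hit, with qed_unref_l2_cache_entry() once the
+ * request is done with the table; on a miss the request allocates an entry
+ * with qed_alloc_l2_cache_entry(), reads the table from the image file and
+ * publishes it with qed_commit_l2_cache_entry().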
+ * + * An interesting case occurs when two requests need to access an L2 table that + * is not in the cache. Since the operation to read the table from the image + * file takes some time to complete, both requests may see a cache miss and + * start reading the L2 table from the image file. The first to finish will + * commit its L2 table into the cache. When the second tries to commit its + * table will be deleted in favor of the existing cache entry. + */ + +#include "trace.h" +#include "qed.h" + +/* Each L2 holds 2GB so this let's us fully cache a 100GB disk */ +#define MAX_L2_CACHE_SIZE 50 + +/** + * Initialize the L2 cache + */ +void qed_init_l2_cache(L2TableCache *l2_cache) +{ + QTAILQ_INIT(&l2_cache->entries); + l2_cache->n_entries = 0; +} + +/** + * Free the L2 cache + */ +void qed_free_l2_cache(L2TableCache *l2_cache) +{ + CachedL2Table *entry, *next_entry; + + QTAILQ_FOREACH_SAFE(entry, &l2_cache->entries, node, next_entry) { + qemu_vfree(entry->table); + g_free(entry); + } +} + +/** + * Allocate an uninitialized entry from the cache + * + * The returned entry has a reference count of 1 and is owned by the caller. + * The caller must allocate the actual table field for this entry and it must + * be freeable using qemu_vfree(). + */ +CachedL2Table *qed_alloc_l2_cache_entry(L2TableCache *l2_cache) +{ + CachedL2Table *entry; + + entry = g_malloc0(sizeof(*entry)); + entry->ref++; + + trace_qed_alloc_l2_cache_entry(l2_cache, entry); + + return entry; +} + +/** + * Decrease an entry's reference count and free if necessary when the reference + * count drops to zero. + */ +void qed_unref_l2_cache_entry(CachedL2Table *entry) +{ + if (!entry) { + return; + } + + entry->ref--; + trace_qed_unref_l2_cache_entry(entry, entry->ref); + if (entry->ref == 0) { + qemu_vfree(entry->table); + g_free(entry); + } +} + +/** + * Find an entry in the L2 cache. This may return NULL and it's up to the + * caller to satisfy the cache miss. + * + * For a cached entry, this function increases the reference count and returns + * the entry. + */ +CachedL2Table *qed_find_l2_cache_entry(L2TableCache *l2_cache, uint64_t offset) +{ + CachedL2Table *entry; + + QTAILQ_FOREACH(entry, &l2_cache->entries, node) { + if (entry->offset == offset) { + trace_qed_find_l2_cache_entry(l2_cache, entry, offset, entry->ref); + entry->ref++; + return entry; + } + } + return NULL; +} + +/** + * Commit an L2 cache entry into the cache. This is meant to be used as part of + * the process to satisfy a cache miss. A caller would allocate an entry which + * is not actually in the L2 cache and then once the entry was valid and + * present on disk, the entry can be committed into the cache. + * + * Since the cache is write-through, it's important that this function is not + * called until the entry is present on disk and the L1 has been updated to + * point to the entry. + * + * N.B. This function steals a reference to the l2_table from the caller so the + * caller must obtain a new reference by issuing a call to + * qed_find_l2_cache_entry(). + */ +void qed_commit_l2_cache_entry(L2TableCache *l2_cache, CachedL2Table *l2_table) +{ + CachedL2Table *entry; + + entry = qed_find_l2_cache_entry(l2_cache, l2_table->offset); + if (entry) { + qed_unref_l2_cache_entry(entry); + qed_unref_l2_cache_entry(l2_table); + return; + } + + /* Evict an unused cache entry so we have space. If all entries are in use + * we can grow the cache temporarily and we try to shrink back down later. 
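+ *
+ * Only entries whose reference count has dropped back to 1 (held solely by
+ * the cache list itself) are eviction candidates; entries still referenced
+ * by in-flight requests are skipped.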
+ */ + if (l2_cache->n_entries >= MAX_L2_CACHE_SIZE) { + CachedL2Table *next; + QTAILQ_FOREACH_SAFE(entry, &l2_cache->entries, node, next) { + if (entry->ref > 1) { + continue; + } + + QTAILQ_REMOVE(&l2_cache->entries, entry, node); + l2_cache->n_entries--; + qed_unref_l2_cache_entry(entry); + + /* Stop evicting when we've shrunk back to max size */ + if (l2_cache->n_entries < MAX_L2_CACHE_SIZE) { + break; + } + } + } + + l2_cache->n_entries++; + QTAILQ_INSERT_TAIL(&l2_cache->entries, l2_table, node); +} diff --git a/contrib/qemu/block/qed-table.c b/contrib/qemu/block/qed-table.c new file mode 100644 index 00000000000..76d2dcccf81 --- /dev/null +++ b/contrib/qemu/block/qed-table.c @@ -0,0 +1,296 @@ +/* + * QEMU Enhanced Disk Format Table I/O + * + * Copyright IBM, Corp. 2010 + * + * Authors: + * Stefan Hajnoczi + * Anthony Liguori + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "trace.h" +#include "qemu/sockets.h" /* for EINPROGRESS on Windows */ +#include "qed.h" + +typedef struct { + GenericCB gencb; + BDRVQEDState *s; + QEDTable *table; + + struct iovec iov; + QEMUIOVector qiov; +} QEDReadTableCB; + +static void qed_read_table_cb(void *opaque, int ret) +{ + QEDReadTableCB *read_table_cb = opaque; + QEDTable *table = read_table_cb->table; + int noffsets = read_table_cb->qiov.size / sizeof(uint64_t); + int i; + + /* Handle I/O error */ + if (ret) { + goto out; + } + + /* Byteswap offsets */ + for (i = 0; i < noffsets; i++) { + table->offsets[i] = le64_to_cpu(table->offsets[i]); + } + +out: + /* Completion */ + trace_qed_read_table_cb(read_table_cb->s, read_table_cb->table, ret); + gencb_complete(&read_table_cb->gencb, ret); +} + +static void qed_read_table(BDRVQEDState *s, uint64_t offset, QEDTable *table, + BlockDriverCompletionFunc *cb, void *opaque) +{ + QEDReadTableCB *read_table_cb = gencb_alloc(sizeof(*read_table_cb), + cb, opaque); + QEMUIOVector *qiov = &read_table_cb->qiov; + + trace_qed_read_table(s, offset, table); + + read_table_cb->s = s; + read_table_cb->table = table; + read_table_cb->iov.iov_base = table->offsets, + read_table_cb->iov.iov_len = s->header.cluster_size * s->header.table_size, + + qemu_iovec_init_external(qiov, &read_table_cb->iov, 1); + bdrv_aio_readv(s->bs->file, offset / BDRV_SECTOR_SIZE, qiov, + qiov->size / BDRV_SECTOR_SIZE, + qed_read_table_cb, read_table_cb); +} + +typedef struct { + GenericCB gencb; + BDRVQEDState *s; + QEDTable *orig_table; + QEDTable *table; + bool flush; /* flush after write? 
*/ + + struct iovec iov; + QEMUIOVector qiov; +} QEDWriteTableCB; + +static void qed_write_table_cb(void *opaque, int ret) +{ + QEDWriteTableCB *write_table_cb = opaque; + + trace_qed_write_table_cb(write_table_cb->s, + write_table_cb->orig_table, + write_table_cb->flush, + ret); + + if (ret) { + goto out; + } + + if (write_table_cb->flush) { + /* We still need to flush first */ + write_table_cb->flush = false; + bdrv_aio_flush(write_table_cb->s->bs, qed_write_table_cb, + write_table_cb); + return; + } + +out: + qemu_vfree(write_table_cb->table); + gencb_complete(&write_table_cb->gencb, ret); +} + +/** + * Write out an updated part or all of a table + * + * @s: QED state + * @offset: Offset of table in image file, in bytes + * @table: Table + * @index: Index of first element + * @n: Number of elements + * @flush: Whether or not to sync to disk + * @cb: Completion function + * @opaque: Argument for completion function + */ +static void qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table, + unsigned int index, unsigned int n, bool flush, + BlockDriverCompletionFunc *cb, void *opaque) +{ + QEDWriteTableCB *write_table_cb; + unsigned int sector_mask = BDRV_SECTOR_SIZE / sizeof(uint64_t) - 1; + unsigned int start, end, i; + size_t len_bytes; + + trace_qed_write_table(s, offset, table, index, n); + + /* Calculate indices of the first and one after last elements */ + start = index & ~sector_mask; + end = (index + n + sector_mask) & ~sector_mask; + + len_bytes = (end - start) * sizeof(uint64_t); + + write_table_cb = gencb_alloc(sizeof(*write_table_cb), cb, opaque); + write_table_cb->s = s; + write_table_cb->orig_table = table; + write_table_cb->flush = flush; + write_table_cb->table = qemu_blockalign(s->bs, len_bytes); + write_table_cb->iov.iov_base = write_table_cb->table->offsets; + write_table_cb->iov.iov_len = len_bytes; + qemu_iovec_init_external(&write_table_cb->qiov, &write_table_cb->iov, 1); + + /* Byteswap table */ + for (i = start; i < end; i++) { + uint64_t le_offset = cpu_to_le64(table->offsets[i]); + write_table_cb->table->offsets[i - start] = le_offset; + } + + /* Adjust for offset into table */ + offset += start * sizeof(uint64_t); + + bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE, + &write_table_cb->qiov, + write_table_cb->qiov.size / BDRV_SECTOR_SIZE, + qed_write_table_cb, write_table_cb); +} + +/** + * Propagate return value from async callback + */ +static void qed_sync_cb(void *opaque, int ret) +{ + *(int *)opaque = ret; +} + +int qed_read_l1_table_sync(BDRVQEDState *s) +{ + int ret = -EINPROGRESS; + + qed_read_table(s, s->header.l1_table_offset, + s->l1_table, qed_sync_cb, &ret); + while (ret == -EINPROGRESS) { + qemu_aio_wait(); + } + + return ret; +} + +void qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n, + BlockDriverCompletionFunc *cb, void *opaque) +{ + BLKDBG_EVENT(s->bs->file, BLKDBG_L1_UPDATE); + qed_write_table(s, s->header.l1_table_offset, + s->l1_table, index, n, false, cb, opaque); +} + +int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index, + unsigned int n) +{ + int ret = -EINPROGRESS; + + qed_write_l1_table(s, index, n, qed_sync_cb, &ret); + while (ret == -EINPROGRESS) { + qemu_aio_wait(); + } + + return ret; +} + +typedef struct { + GenericCB gencb; + BDRVQEDState *s; + uint64_t l2_offset; + QEDRequest *request; +} QEDReadL2TableCB; + +static void qed_read_l2_table_cb(void *opaque, int ret) +{ + QEDReadL2TableCB *read_l2_table_cb = opaque; + QEDRequest *request = read_l2_table_cb->request; + BDRVQEDState 
*s = read_l2_table_cb->s; + CachedL2Table *l2_table = request->l2_table; + uint64_t l2_offset = read_l2_table_cb->l2_offset; + + if (ret) { + /* can't trust loaded L2 table anymore */ + qed_unref_l2_cache_entry(l2_table); + request->l2_table = NULL; + } else { + l2_table->offset = l2_offset; + + qed_commit_l2_cache_entry(&s->l2_cache, l2_table); + + /* This is guaranteed to succeed because we just committed the entry + * to the cache. + */ + request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset); + assert(request->l2_table != NULL); + } + + gencb_complete(&read_l2_table_cb->gencb, ret); +} + +void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset, + BlockDriverCompletionFunc *cb, void *opaque) +{ + QEDReadL2TableCB *read_l2_table_cb; + + qed_unref_l2_cache_entry(request->l2_table); + + /* Check for cached L2 entry */ + request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, offset); + if (request->l2_table) { + cb(opaque, 0); + return; + } + + request->l2_table = qed_alloc_l2_cache_entry(&s->l2_cache); + request->l2_table->table = qed_alloc_table(s); + + read_l2_table_cb = gencb_alloc(sizeof(*read_l2_table_cb), cb, opaque); + read_l2_table_cb->s = s; + read_l2_table_cb->l2_offset = offset; + read_l2_table_cb->request = request; + + BLKDBG_EVENT(s->bs->file, BLKDBG_L2_LOAD); + qed_read_table(s, offset, request->l2_table->table, + qed_read_l2_table_cb, read_l2_table_cb); +} + +int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, uint64_t offset) +{ + int ret = -EINPROGRESS; + + qed_read_l2_table(s, request, offset, qed_sync_cb, &ret); + while (ret == -EINPROGRESS) { + qemu_aio_wait(); + } + + return ret; +} + +void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request, + unsigned int index, unsigned int n, bool flush, + BlockDriverCompletionFunc *cb, void *opaque) +{ + BLKDBG_EVENT(s->bs->file, BLKDBG_L2_UPDATE); + qed_write_table(s, request->l2_table->offset, + request->l2_table->table, index, n, flush, cb, opaque); +} + +int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request, + unsigned int index, unsigned int n, bool flush) +{ + int ret = -EINPROGRESS; + + qed_write_l2_table(s, request, index, n, flush, qed_sync_cb, &ret); + while (ret == -EINPROGRESS) { + qemu_aio_wait(); + } + + return ret; +} diff --git a/contrib/qemu/block/qed.c b/contrib/qemu/block/qed.c new file mode 100644 index 00000000000..f767b0528ce --- /dev/null +++ b/contrib/qemu/block/qed.c @@ -0,0 +1,1596 @@ +/* + * QEMU Enhanced Disk Format + * + * Copyright IBM, Corp. 2010 + * + * Authors: + * Stefan Hajnoczi + * Anthony Liguori + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. 
+ * + */ + +#include "qemu/timer.h" +#include "trace.h" +#include "qed.h" +#include "qapi/qmp/qerror.h" +#include "migration/migration.h" + +static void qed_aio_cancel(BlockDriverAIOCB *blockacb) +{ + QEDAIOCB *acb = (QEDAIOCB *)blockacb; + bool finished = false; + + /* Wait for the request to finish */ + acb->finished = &finished; + while (!finished) { + qemu_aio_wait(); + } +} + +static const AIOCBInfo qed_aiocb_info = { + .aiocb_size = sizeof(QEDAIOCB), + .cancel = qed_aio_cancel, +}; + +static int bdrv_qed_probe(const uint8_t *buf, int buf_size, + const char *filename) +{ + const QEDHeader *header = (const QEDHeader *)buf; + + if (buf_size < sizeof(*header)) { + return 0; + } + if (le32_to_cpu(header->magic) != QED_MAGIC) { + return 0; + } + return 100; +} + +/** + * Check whether an image format is raw + * + * @fmt: Backing file format, may be NULL + */ +static bool qed_fmt_is_raw(const char *fmt) +{ + return fmt && strcmp(fmt, "raw") == 0; +} + +static void qed_header_le_to_cpu(const QEDHeader *le, QEDHeader *cpu) +{ + cpu->magic = le32_to_cpu(le->magic); + cpu->cluster_size = le32_to_cpu(le->cluster_size); + cpu->table_size = le32_to_cpu(le->table_size); + cpu->header_size = le32_to_cpu(le->header_size); + cpu->features = le64_to_cpu(le->features); + cpu->compat_features = le64_to_cpu(le->compat_features); + cpu->autoclear_features = le64_to_cpu(le->autoclear_features); + cpu->l1_table_offset = le64_to_cpu(le->l1_table_offset); + cpu->image_size = le64_to_cpu(le->image_size); + cpu->backing_filename_offset = le32_to_cpu(le->backing_filename_offset); + cpu->backing_filename_size = le32_to_cpu(le->backing_filename_size); +} + +static void qed_header_cpu_to_le(const QEDHeader *cpu, QEDHeader *le) +{ + le->magic = cpu_to_le32(cpu->magic); + le->cluster_size = cpu_to_le32(cpu->cluster_size); + le->table_size = cpu_to_le32(cpu->table_size); + le->header_size = cpu_to_le32(cpu->header_size); + le->features = cpu_to_le64(cpu->features); + le->compat_features = cpu_to_le64(cpu->compat_features); + le->autoclear_features = cpu_to_le64(cpu->autoclear_features); + le->l1_table_offset = cpu_to_le64(cpu->l1_table_offset); + le->image_size = cpu_to_le64(cpu->image_size); + le->backing_filename_offset = cpu_to_le32(cpu->backing_filename_offset); + le->backing_filename_size = cpu_to_le32(cpu->backing_filename_size); +} + +int qed_write_header_sync(BDRVQEDState *s) +{ + QEDHeader le; + int ret; + + qed_header_cpu_to_le(&s->header, &le); + ret = bdrv_pwrite(s->bs->file, 0, &le, sizeof(le)); + if (ret != sizeof(le)) { + return ret; + } + return 0; +} + +typedef struct { + GenericCB gencb; + BDRVQEDState *s; + struct iovec iov; + QEMUIOVector qiov; + int nsectors; + uint8_t *buf; +} QEDWriteHeaderCB; + +static void qed_write_header_cb(void *opaque, int ret) +{ + QEDWriteHeaderCB *write_header_cb = opaque; + + qemu_vfree(write_header_cb->buf); + gencb_complete(write_header_cb, ret); +} + +static void qed_write_header_read_cb(void *opaque, int ret) +{ + QEDWriteHeaderCB *write_header_cb = opaque; + BDRVQEDState *s = write_header_cb->s; + + if (ret) { + qed_write_header_cb(write_header_cb, ret); + return; + } + + /* Update header */ + qed_header_cpu_to_le(&s->header, (QEDHeader *)write_header_cb->buf); + + bdrv_aio_writev(s->bs->file, 0, &write_header_cb->qiov, + write_header_cb->nsectors, qed_write_header_cb, + write_header_cb); +} + +/** + * Update header in-place (does not rewrite backing filename or other strings) + * + * This function only updates known header fields in-place and does not affect + * 
extra data after the QED header. + */ +static void qed_write_header(BDRVQEDState *s, BlockDriverCompletionFunc cb, + void *opaque) +{ + /* We must write full sectors for O_DIRECT but cannot necessarily generate + * the data following the header if an unrecognized compat feature is + * active. Therefore, first read the sectors containing the header, update + * them, and write back. + */ + + int nsectors = (sizeof(QEDHeader) + BDRV_SECTOR_SIZE - 1) / + BDRV_SECTOR_SIZE; + size_t len = nsectors * BDRV_SECTOR_SIZE; + QEDWriteHeaderCB *write_header_cb = gencb_alloc(sizeof(*write_header_cb), + cb, opaque); + + write_header_cb->s = s; + write_header_cb->nsectors = nsectors; + write_header_cb->buf = qemu_blockalign(s->bs, len); + write_header_cb->iov.iov_base = write_header_cb->buf; + write_header_cb->iov.iov_len = len; + qemu_iovec_init_external(&write_header_cb->qiov, &write_header_cb->iov, 1); + + bdrv_aio_readv(s->bs->file, 0, &write_header_cb->qiov, nsectors, + qed_write_header_read_cb, write_header_cb); +} + +static uint64_t qed_max_image_size(uint32_t cluster_size, uint32_t table_size) +{ + uint64_t table_entries; + uint64_t l2_size; + + table_entries = (table_size * cluster_size) / sizeof(uint64_t); + l2_size = table_entries * cluster_size; + + return l2_size * table_entries; +} + +static bool qed_is_cluster_size_valid(uint32_t cluster_size) +{ + if (cluster_size < QED_MIN_CLUSTER_SIZE || + cluster_size > QED_MAX_CLUSTER_SIZE) { + return false; + } + if (cluster_size & (cluster_size - 1)) { + return false; /* not power of 2 */ + } + return true; +} + +static bool qed_is_table_size_valid(uint32_t table_size) +{ + if (table_size < QED_MIN_TABLE_SIZE || + table_size > QED_MAX_TABLE_SIZE) { + return false; + } + if (table_size & (table_size - 1)) { + return false; /* not power of 2 */ + } + return true; +} + +static bool qed_is_image_size_valid(uint64_t image_size, uint32_t cluster_size, + uint32_t table_size) +{ + if (image_size % BDRV_SECTOR_SIZE != 0) { + return false; /* not multiple of sector size */ + } + if (image_size > qed_max_image_size(cluster_size, table_size)) { + return false; /* image is too large */ + } + return true; +} + +/** + * Read a string of known length from the image file + * + * @file: Image file + * @offset: File offset to start of string, in bytes + * @n: String length in bytes + * @buf: Destination buffer + * @buflen: Destination buffer length in bytes + * @ret: 0 on success, -errno on failure + * + * The string is NUL-terminated. + */ +static int qed_read_string(BlockDriverState *file, uint64_t offset, size_t n, + char *buf, size_t buflen) +{ + int ret; + if (n >= buflen) { + return -EINVAL; + } + ret = bdrv_pread(file, offset, buf, n); + if (ret < 0) { + return ret; + } + buf[n] = '\0'; + return 0; +} + +/** + * Allocate new clusters + * + * @s: QED state + * @n: Number of contiguous clusters to allocate + * @ret: Offset of first allocated cluster + * + * This function only produces the offset where the new clusters should be + * written. It updates BDRVQEDState but does not make any changes to the image + * file. 
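+ *
+ * Allocation is a simple bump of the in-memory size: the new clusters start
+ * at the current end of file and s->file_size advances by n * cluster_size.
+ * Nothing reaches the file until the caller populates the clusters and
+ * links them into the tables.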
+ */ +static uint64_t qed_alloc_clusters(BDRVQEDState *s, unsigned int n) +{ + uint64_t offset = s->file_size; + s->file_size += n * s->header.cluster_size; + return offset; +} + +QEDTable *qed_alloc_table(BDRVQEDState *s) +{ + /* Honor O_DIRECT memory alignment requirements */ + return qemu_blockalign(s->bs, + s->header.cluster_size * s->header.table_size); +} + +/** + * Allocate a new zeroed L2 table + */ +static CachedL2Table *qed_new_l2_table(BDRVQEDState *s) +{ + CachedL2Table *l2_table = qed_alloc_l2_cache_entry(&s->l2_cache); + + l2_table->table = qed_alloc_table(s); + l2_table->offset = qed_alloc_clusters(s, s->header.table_size); + + memset(l2_table->table->offsets, 0, + s->header.cluster_size * s->header.table_size); + return l2_table; +} + +static void qed_aio_next_io(void *opaque, int ret); + +static void qed_plug_allocating_write_reqs(BDRVQEDState *s) +{ + assert(!s->allocating_write_reqs_plugged); + + s->allocating_write_reqs_plugged = true; +} + +static void qed_unplug_allocating_write_reqs(BDRVQEDState *s) +{ + QEDAIOCB *acb; + + assert(s->allocating_write_reqs_plugged); + + s->allocating_write_reqs_plugged = false; + + acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs); + if (acb) { + qed_aio_next_io(acb, 0); + } +} + +static void qed_finish_clear_need_check(void *opaque, int ret) +{ + /* Do nothing */ +} + +static void qed_flush_after_clear_need_check(void *opaque, int ret) +{ + BDRVQEDState *s = opaque; + + bdrv_aio_flush(s->bs, qed_finish_clear_need_check, s); + + /* No need to wait until flush completes */ + qed_unplug_allocating_write_reqs(s); +} + +static void qed_clear_need_check(void *opaque, int ret) +{ + BDRVQEDState *s = opaque; + + if (ret) { + qed_unplug_allocating_write_reqs(s); + return; + } + + s->header.features &= ~QED_F_NEED_CHECK; + qed_write_header(s, qed_flush_after_clear_need_check, s); +} + +static void qed_need_check_timer_cb(void *opaque) +{ + BDRVQEDState *s = opaque; + + /* The timer should only fire when allocating writes have drained */ + assert(!QSIMPLEQ_FIRST(&s->allocating_write_reqs)); + + trace_qed_need_check_timer_cb(s); + + qed_plug_allocating_write_reqs(s); + + /* Ensure writes are on disk before clearing flag */ + bdrv_aio_flush(s->bs, qed_clear_need_check, s); +} + +static void qed_start_need_check_timer(BDRVQEDState *s) +{ + trace_qed_start_need_check_timer(s); + + /* Use vm_clock so we don't alter the image file while suspended for + * migration. 
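+ *
+ * The timer is (re)armed to fire QED_NEED_CHECK_TIMEOUT seconds from now;
+ * when it fires, allocating writes are expected to have drained (see the
+ * assertion in qed_need_check_timer_cb) and the header's need-check flag is
+ * cleared after flushing outstanding writes to disk.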
+ */ + qemu_mod_timer(s->need_check_timer, qemu_get_clock_ns(vm_clock) + + get_ticks_per_sec() * QED_NEED_CHECK_TIMEOUT); +} + +/* It's okay to call this multiple times or when no timer is started */ +static void qed_cancel_need_check_timer(BDRVQEDState *s) +{ + trace_qed_cancel_need_check_timer(s); + qemu_del_timer(s->need_check_timer); +} + +static void bdrv_qed_rebind(BlockDriverState *bs) +{ + BDRVQEDState *s = bs->opaque; + s->bs = bs; +} + +static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags) +{ + BDRVQEDState *s = bs->opaque; + QEDHeader le_header; + int64_t file_size; + int ret; + + s->bs = bs; + QSIMPLEQ_INIT(&s->allocating_write_reqs); + + ret = bdrv_pread(bs->file, 0, &le_header, sizeof(le_header)); + if (ret < 0) { + return ret; + } + qed_header_le_to_cpu(&le_header, &s->header); + + if (s->header.magic != QED_MAGIC) { + return -EMEDIUMTYPE; + } + if (s->header.features & ~QED_FEATURE_MASK) { + /* image uses unsupported feature bits */ + char buf[64]; + snprintf(buf, sizeof(buf), "%" PRIx64, + s->header.features & ~QED_FEATURE_MASK); + qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, + bs->device_name, "QED", buf); + return -ENOTSUP; + } + if (!qed_is_cluster_size_valid(s->header.cluster_size)) { + return -EINVAL; + } + + /* Round down file size to the last cluster */ + file_size = bdrv_getlength(bs->file); + if (file_size < 0) { + return file_size; + } + s->file_size = qed_start_of_cluster(s, file_size); + + if (!qed_is_table_size_valid(s->header.table_size)) { + return -EINVAL; + } + if (!qed_is_image_size_valid(s->header.image_size, + s->header.cluster_size, + s->header.table_size)) { + return -EINVAL; + } + if (!qed_check_table_offset(s, s->header.l1_table_offset)) { + return -EINVAL; + } + + s->table_nelems = (s->header.cluster_size * s->header.table_size) / + sizeof(uint64_t); + s->l2_shift = ffs(s->header.cluster_size) - 1; + s->l2_mask = s->table_nelems - 1; + s->l1_shift = s->l2_shift + ffs(s->table_nelems) - 1; + + if ((s->header.features & QED_F_BACKING_FILE)) { + if ((uint64_t)s->header.backing_filename_offset + + s->header.backing_filename_size > + s->header.cluster_size * s->header.header_size) { + return -EINVAL; + } + + ret = qed_read_string(bs->file, s->header.backing_filename_offset, + s->header.backing_filename_size, bs->backing_file, + sizeof(bs->backing_file)); + if (ret < 0) { + return ret; + } + + if (s->header.features & QED_F_BACKING_FORMAT_NO_PROBE) { + pstrcpy(bs->backing_format, sizeof(bs->backing_format), "raw"); + } + } + + /* Reset unknown autoclear feature bits. This is a backwards + * compatibility mechanism that allows images to be opened by older + * programs, which "knock out" unknown feature bits. When an image is + * opened by a newer program again it can detect that the autoclear + * feature is no longer valid. + */ + if ((s->header.autoclear_features & ~QED_AUTOCLEAR_FEATURE_MASK) != 0 && + !bdrv_is_read_only(bs->file) && !(flags & BDRV_O_INCOMING)) { + s->header.autoclear_features &= QED_AUTOCLEAR_FEATURE_MASK; + + ret = qed_write_header_sync(s); + if (ret) { + return ret; + } + + /* From here on only known autoclear feature bits are valid */ + bdrv_flush(bs->file); + } + + s->l1_table = qed_alloc_table(s); + qed_init_l2_cache(&s->l2_cache); + + ret = qed_read_l1_table_sync(s); + if (ret) { + goto out; + } + + /* If image was not closed cleanly, check consistency */ + if (!(flags & BDRV_O_CHECK) && (s->header.features & QED_F_NEED_CHECK)) { + /* Read-only images cannot be fixed. 
There is no risk of corruption + * since write operations are not possible. Therefore, allow + * potentially inconsistent images to be opened read-only. This can + * aid data recovery from an otherwise inconsistent image. + */ + if (!bdrv_is_read_only(bs->file) && + !(flags & BDRV_O_INCOMING)) { + BdrvCheckResult result = {0}; + + ret = qed_check(s, &result, true); + if (ret) { + goto out; + } + } + } + + s->need_check_timer = qemu_new_timer_ns(vm_clock, + qed_need_check_timer_cb, s); + +out: + if (ret) { + qed_free_l2_cache(&s->l2_cache); + qemu_vfree(s->l1_table); + } + return ret; +} + +/* We have nothing to do for QED reopen, stubs just return + * success */ +static int bdrv_qed_reopen_prepare(BDRVReopenState *state, + BlockReopenQueue *queue, Error **errp) +{ + return 0; +} + +static void bdrv_qed_close(BlockDriverState *bs) +{ + BDRVQEDState *s = bs->opaque; + + qed_cancel_need_check_timer(s); + qemu_free_timer(s->need_check_timer); + + /* Ensure writes reach stable storage */ + bdrv_flush(bs->file); + + /* Clean shutdown, no check required on next open */ + if (s->header.features & QED_F_NEED_CHECK) { + s->header.features &= ~QED_F_NEED_CHECK; + qed_write_header_sync(s); + } + + qed_free_l2_cache(&s->l2_cache); + qemu_vfree(s->l1_table); +} + +static int qed_create(const char *filename, uint32_t cluster_size, + uint64_t image_size, uint32_t table_size, + const char *backing_file, const char *backing_fmt) +{ + QEDHeader header = { + .magic = QED_MAGIC, + .cluster_size = cluster_size, + .table_size = table_size, + .header_size = 1, + .features = 0, + .compat_features = 0, + .l1_table_offset = cluster_size, + .image_size = image_size, + }; + QEDHeader le_header; + uint8_t *l1_table = NULL; + size_t l1_size = header.cluster_size * header.table_size; + int ret = 0; + BlockDriverState *bs = NULL; + + ret = bdrv_create_file(filename, NULL); + if (ret < 0) { + return ret; + } + + ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR | BDRV_O_CACHE_WB); + if (ret < 0) { + return ret; + } + + /* File must start empty and grow, check truncate is supported */ + ret = bdrv_truncate(bs, 0); + if (ret < 0) { + goto out; + } + + if (backing_file) { + header.features |= QED_F_BACKING_FILE; + header.backing_filename_offset = sizeof(le_header); + header.backing_filename_size = strlen(backing_file); + + if (qed_fmt_is_raw(backing_fmt)) { + header.features |= QED_F_BACKING_FORMAT_NO_PROBE; + } + } + + qed_header_cpu_to_le(&header, &le_header); + ret = bdrv_pwrite(bs, 0, &le_header, sizeof(le_header)); + if (ret < 0) { + goto out; + } + ret = bdrv_pwrite(bs, sizeof(le_header), backing_file, + header.backing_filename_size); + if (ret < 0) { + goto out; + } + + l1_table = g_malloc0(l1_size); + ret = bdrv_pwrite(bs, header.l1_table_offset, l1_table, l1_size); + if (ret < 0) { + goto out; + } + + ret = 0; /* success */ +out: + g_free(l1_table); + bdrv_delete(bs); + return ret; +} + +static int bdrv_qed_create(const char *filename, QEMUOptionParameter *options) +{ + uint64_t image_size = 0; + uint32_t cluster_size = QED_DEFAULT_CLUSTER_SIZE; + uint32_t table_size = QED_DEFAULT_TABLE_SIZE; + const char *backing_file = NULL; + const char *backing_fmt = NULL; + + while (options && options->name) { + if (!strcmp(options->name, BLOCK_OPT_SIZE)) { + image_size = options->value.n; + } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) { + backing_file = options->value.s; + } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FMT)) { + backing_fmt = options->value.s; + } else if (!strcmp(options->name, 
BLOCK_OPT_CLUSTER_SIZE)) { + if (options->value.n) { + cluster_size = options->value.n; + } + } else if (!strcmp(options->name, BLOCK_OPT_TABLE_SIZE)) { + if (options->value.n) { + table_size = options->value.n; + } + } + options++; + } + + if (!qed_is_cluster_size_valid(cluster_size)) { + fprintf(stderr, "QED cluster size must be within range [%u, %u] and power of 2\n", + QED_MIN_CLUSTER_SIZE, QED_MAX_CLUSTER_SIZE); + return -EINVAL; + } + if (!qed_is_table_size_valid(table_size)) { + fprintf(stderr, "QED table size must be within range [%u, %u] and power of 2\n", + QED_MIN_TABLE_SIZE, QED_MAX_TABLE_SIZE); + return -EINVAL; + } + if (!qed_is_image_size_valid(image_size, cluster_size, table_size)) { + fprintf(stderr, "QED image size must be a non-zero multiple of " + "cluster size and less than %" PRIu64 " bytes\n", + qed_max_image_size(cluster_size, table_size)); + return -EINVAL; + } + + return qed_create(filename, cluster_size, image_size, table_size, + backing_file, backing_fmt); +} + +typedef struct { + Coroutine *co; + int is_allocated; + int *pnum; +} QEDIsAllocatedCB; + +static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t len) +{ + QEDIsAllocatedCB *cb = opaque; + *cb->pnum = len / BDRV_SECTOR_SIZE; + cb->is_allocated = (ret == QED_CLUSTER_FOUND || ret == QED_CLUSTER_ZERO); + if (cb->co) { + qemu_coroutine_enter(cb->co, NULL); + } +} + +static int coroutine_fn bdrv_qed_co_is_allocated(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, int *pnum) +{ + BDRVQEDState *s = bs->opaque; + uint64_t pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE; + size_t len = (size_t)nb_sectors * BDRV_SECTOR_SIZE; + QEDIsAllocatedCB cb = { + .is_allocated = -1, + .pnum = pnum, + }; + QEDRequest request = { .l2_table = NULL }; + + qed_find_cluster(s, &request, pos, len, qed_is_allocated_cb, &cb); + + /* Now sleep if the callback wasn't invoked immediately */ + while (cb.is_allocated == -1) { + cb.co = qemu_coroutine_self(); + qemu_coroutine_yield(); + } + + qed_unref_l2_cache_entry(request.l2_table); + + return cb.is_allocated; +} + +static int bdrv_qed_make_empty(BlockDriverState *bs) +{ + return -ENOTSUP; +} + +static BDRVQEDState *acb_to_s(QEDAIOCB *acb) +{ + return acb->common.bs->opaque; +} + +/** + * Read from the backing file or zero-fill if no backing file + * + * @s: QED state + * @pos: Byte position in device + * @qiov: Destination I/O vector + * @cb: Completion function + * @opaque: User data for completion function + * + * This function reads qiov->size bytes starting at pos from the backing file. + * If there is no backing file then zeroes are read. + */ +static void qed_read_backing_file(BDRVQEDState *s, uint64_t pos, + QEMUIOVector *qiov, + BlockDriverCompletionFunc *cb, void *opaque) +{ + uint64_t backing_length = 0; + size_t size; + + /* If there is a backing file, get its length. Treat the absence of a + * backing file like a zero length backing file. 
+ */ +if (s->bs->backing_hd) { +int64_t l = bdrv_getlength(s->bs->backing_hd); +if (l < 0) { +cb(opaque, l); +return; +} +backing_length = l; +} + +/* Zero all sectors if reading beyond the end of the backing file */ +if (pos >= backing_length || +pos + qiov->size > backing_length) { +qemu_iovec_memset(qiov, 0, 0, qiov->size); +} + +/* Complete now if there are no backing file sectors to read */ +if (pos >= backing_length) { +cb(opaque, 0); +return; +} + +/* If the read straddles the end of the backing file, shorten it */ +size = MIN((uint64_t)backing_length - pos, qiov->size); + +BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO); +bdrv_aio_readv(s->bs->backing_hd, pos / BDRV_SECTOR_SIZE, +qiov, size / BDRV_SECTOR_SIZE, cb, opaque); +} + +typedef struct { +GenericCB gencb; +BDRVQEDState *s; +QEMUIOVector qiov; +struct iovec iov; +uint64_t offset; +} CopyFromBackingFileCB; + +static void qed_copy_from_backing_file_cb(void *opaque, int ret) +{ +CopyFromBackingFileCB *copy_cb = opaque; +qemu_vfree(copy_cb->iov.iov_base); +gencb_complete(&copy_cb->gencb, ret); +} + +static void qed_copy_from_backing_file_write(void *opaque, int ret) +{ +CopyFromBackingFileCB *copy_cb = opaque; +BDRVQEDState *s = copy_cb->s; + +if (ret) { +qed_copy_from_backing_file_cb(copy_cb, ret); +return; +} + +BLKDBG_EVENT(s->bs->file, BLKDBG_COW_WRITE); +bdrv_aio_writev(s->bs->file, copy_cb->offset / BDRV_SECTOR_SIZE, +&copy_cb->qiov, copy_cb->qiov.size / BDRV_SECTOR_SIZE, +qed_copy_from_backing_file_cb, copy_cb); +} + +/** + * Copy data from backing file into the image + * + * @s: QED state + * @pos: Byte position in device + * @len: Number of bytes + * @offset: Byte offset in image file + * @cb: Completion function + * @opaque: User data for completion function + */ +static void qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos, +uint64_t len, uint64_t offset, +BlockDriverCompletionFunc *cb, +void *opaque) +{ +CopyFromBackingFileCB *copy_cb; + +/* Skip copy entirely if there is no work to do */ +if (len == 0) { +cb(opaque, 0); +return; +} + +copy_cb = gencb_alloc(sizeof(*copy_cb), cb, opaque); +copy_cb->s = s; +copy_cb->offset = offset; +copy_cb->iov.iov_base = qemu_blockalign(s->bs, len); +copy_cb->iov.iov_len = len; +qemu_iovec_init_external(&copy_cb->qiov, &copy_cb->iov, 1); + +qed_read_backing_file(s, pos, &copy_cb->qiov, +qed_copy_from_backing_file_write, copy_cb); +} + +/** + * Link one or more contiguous clusters into a table + * + * @s: QED state + * @table: L2 table + * @index: First cluster index + * @n: Number of contiguous clusters + * @cluster: First cluster offset + * + * The cluster offset may be an allocated byte offset in the image file, the + * zero cluster marker, or the unallocated cluster marker. 
+ */ +static void qed_update_l2_table(BDRVQEDState *s, QEDTable *table, int index, + unsigned int n, uint64_t cluster) +{ + int i; + for (i = index; i < index + n; i++) { + table->offsets[i] = cluster; + if (!qed_offset_is_unalloc_cluster(cluster) && + !qed_offset_is_zero_cluster(cluster)) { + cluster += s->header.cluster_size; + } + } +} + +static void qed_aio_complete_bh(void *opaque) +{ + QEDAIOCB *acb = opaque; + BlockDriverCompletionFunc *cb = acb->common.cb; + void *user_opaque = acb->common.opaque; + int ret = acb->bh_ret; + bool *finished = acb->finished; + + qemu_bh_delete(acb->bh); + qemu_aio_release(acb); + + /* Invoke callback */ + cb(user_opaque, ret); + + /* Signal cancel completion */ + if (finished) { + *finished = true; + } +} + +static void qed_aio_complete(QEDAIOCB *acb, int ret) +{ + BDRVQEDState *s = acb_to_s(acb); + + trace_qed_aio_complete(s, acb, ret); + + /* Free resources */ + qemu_iovec_destroy(&acb->cur_qiov); + qed_unref_l2_cache_entry(acb->request.l2_table); + + /* Free the buffer we may have allocated for zero writes */ + if (acb->flags & QED_AIOCB_ZERO) { + qemu_vfree(acb->qiov->iov[0].iov_base); + acb->qiov->iov[0].iov_base = NULL; + } + + /* Arrange for a bh to invoke the completion function */ + acb->bh_ret = ret; + acb->bh = qemu_bh_new(qed_aio_complete_bh, acb); + qemu_bh_schedule(acb->bh); + + /* Start next allocating write request waiting behind this one. Note that + * requests enqueue themselves when they first hit an unallocated cluster + * but they wait until the entire request is finished before waking up the + * next request in the queue. This ensures that we don't cycle through + * requests multiple times but rather finish one at a time completely. + */ + if (acb == QSIMPLEQ_FIRST(&s->allocating_write_reqs)) { + QSIMPLEQ_REMOVE_HEAD(&s->allocating_write_reqs, next); + acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs); + if (acb) { + qed_aio_next_io(acb, 0); + } else if (s->header.features & QED_F_NEED_CHECK) { + qed_start_need_check_timer(s); + } + } +} + +/** + * Commit the current L2 table to the cache + */ +static void qed_commit_l2_update(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + CachedL2Table *l2_table = acb->request.l2_table; + uint64_t l2_offset = l2_table->offset; + + qed_commit_l2_cache_entry(&s->l2_cache, l2_table); + + /* This is guaranteed to succeed because we just committed the entry to the + * cache. 
+ */ + acb->request.l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset); + assert(acb->request.l2_table != NULL); + + qed_aio_next_io(opaque, ret); +} + +/** + * Update L1 table with new L2 table offset and write it out + */ +static void qed_aio_write_l1_update(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + int index; + + if (ret) { + qed_aio_complete(acb, ret); + return; + } + + index = qed_l1_index(s, acb->cur_pos); + s->l1_table->offsets[index] = acb->request.l2_table->offset; + + qed_write_l1_table(s, index, 1, qed_commit_l2_update, acb); +} + +/** + * Update L2 table with new cluster offsets and write them out + */ +static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset) +{ + BDRVQEDState *s = acb_to_s(acb); + bool need_alloc = acb->find_cluster_ret == QED_CLUSTER_L1; + int index; + + if (ret) { + goto err; + } + + if (need_alloc) { + qed_unref_l2_cache_entry(acb->request.l2_table); + acb->request.l2_table = qed_new_l2_table(s); + } + + index = qed_l2_index(s, acb->cur_pos); + qed_update_l2_table(s, acb->request.l2_table->table, index, acb->cur_nclusters, + offset); + + if (need_alloc) { + /* Write out the whole new L2 table */ + qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true, + qed_aio_write_l1_update, acb); + } else { + /* Write out only the updated part of the L2 table */ + qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters, false, + qed_aio_next_io, acb); + } + return; + +err: + qed_aio_complete(acb, ret); +} + +static void qed_aio_write_l2_update_cb(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + qed_aio_write_l2_update(acb, ret, acb->cur_cluster); +} + +/** + * Flush new data clusters before updating the L2 table + * + * This flush is necessary when a backing file is in use. A crash during an + * allocating write could result in empty clusters in the image. If the write + * only touched a subregion of the cluster, then backing image sectors have + * been lost in the untouched region. The solution is to flush after writing a + * new data cluster and before updating the L2 table. 
+ */ +static void qed_aio_write_flush_before_l2_update(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + + if (!bdrv_aio_flush(s->bs->file, qed_aio_write_l2_update_cb, opaque)) { + qed_aio_complete(acb, -EIO); + } +} + +/** + * Write data to the image file + */ +static void qed_aio_write_main(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + uint64_t offset = acb->cur_cluster + + qed_offset_into_cluster(s, acb->cur_pos); + BlockDriverCompletionFunc *next_fn; + + trace_qed_aio_write_main(s, acb, ret, offset, acb->cur_qiov.size); + + if (ret) { + qed_aio_complete(acb, ret); + return; + } + + if (acb->find_cluster_ret == QED_CLUSTER_FOUND) { + next_fn = qed_aio_next_io; + } else { + if (s->bs->backing_hd) { + next_fn = qed_aio_write_flush_before_l2_update; + } else { + next_fn = qed_aio_write_l2_update_cb; + } + } + + BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO); + bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE, + &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE, + next_fn, acb); +} + +/** + * Populate back untouched region of new data cluster + */ +static void qed_aio_write_postfill(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + uint64_t start = acb->cur_pos + acb->cur_qiov.size; + uint64_t len = + qed_start_of_cluster(s, start + s->header.cluster_size - 1) - start; + uint64_t offset = acb->cur_cluster + + qed_offset_into_cluster(s, acb->cur_pos) + + acb->cur_qiov.size; + + if (ret) { + qed_aio_complete(acb, ret); + return; + } + + trace_qed_aio_write_postfill(s, acb, start, len, offset); + qed_copy_from_backing_file(s, start, len, offset, + qed_aio_write_main, acb); +} + +/** + * Populate front untouched region of new data cluster + */ +static void qed_aio_write_prefill(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + uint64_t start = qed_start_of_cluster(s, acb->cur_pos); + uint64_t len = qed_offset_into_cluster(s, acb->cur_pos); + + trace_qed_aio_write_prefill(s, acb, start, len, acb->cur_cluster); + qed_copy_from_backing_file(s, start, len, acb->cur_cluster, + qed_aio_write_postfill, acb); +} + +/** + * Check if the QED_F_NEED_CHECK bit should be set during allocating write + */ +static bool qed_should_set_need_check(BDRVQEDState *s) +{ + /* The flush before L2 update path ensures consistency */ + if (s->bs->backing_hd) { + return false; + } + + return !(s->header.features & QED_F_NEED_CHECK); +} + +static void qed_aio_write_zero_cluster(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + + if (ret) { + qed_aio_complete(acb, ret); + return; + } + + qed_aio_write_l2_update(acb, 0, 1); +} + +/** + * Write new data cluster + * + * @acb: Write request + * @len: Length in bytes + * + * This path is taken when writing to previously unallocated clusters. 
+ */ +static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len) +{ + BDRVQEDState *s = acb_to_s(acb); + BlockDriverCompletionFunc *cb; + + /* Cancel timer when the first allocating request comes in */ + if (QSIMPLEQ_EMPTY(&s->allocating_write_reqs)) { + qed_cancel_need_check_timer(s); + } + + /* Freeze this request if another allocating write is in progress */ + if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs)) { + QSIMPLEQ_INSERT_TAIL(&s->allocating_write_reqs, acb, next); + } + if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs) || + s->allocating_write_reqs_plugged) { + return; /* wait for existing request to finish */ + } + + acb->cur_nclusters = qed_bytes_to_clusters(s, + qed_offset_into_cluster(s, acb->cur_pos) + len); + qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len); + + if (acb->flags & QED_AIOCB_ZERO) { + /* Skip ahead if the clusters are already zero */ + if (acb->find_cluster_ret == QED_CLUSTER_ZERO) { + qed_aio_next_io(acb, 0); + return; + } + + cb = qed_aio_write_zero_cluster; + } else { + cb = qed_aio_write_prefill; + acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters); + } + + if (qed_should_set_need_check(s)) { + s->header.features |= QED_F_NEED_CHECK; + qed_write_header(s, cb, acb); + } else { + cb(acb, 0); + } +} + +/** + * Write data cluster in place + * + * @acb: Write request + * @offset: Cluster offset in bytes + * @len: Length in bytes + * + * This path is taken when writing to already allocated clusters. + */ +static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len) +{ + /* Allocate buffer for zero writes */ + if (acb->flags & QED_AIOCB_ZERO) { + struct iovec *iov = acb->qiov->iov; + + if (!iov->iov_base) { + iov->iov_base = qemu_blockalign(acb->common.bs, iov->iov_len); + memset(iov->iov_base, 0, iov->iov_len); + } + } + + /* Calculate the I/O vector */ + acb->cur_cluster = offset; + qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len); + + /* Do the actual write */ + qed_aio_write_main(acb, 0); +} + +/** + * Write data cluster + * + * @opaque: Write request + * @ret: QED_CLUSTER_FOUND, QED_CLUSTER_L2, QED_CLUSTER_L1, + * or -errno + * @offset: Cluster offset in bytes + * @len: Length in bytes + * + * Callback from qed_find_cluster(). + */ +static void qed_aio_write_data(void *opaque, int ret, + uint64_t offset, size_t len) +{ + QEDAIOCB *acb = opaque; + + trace_qed_aio_write_data(acb_to_s(acb), acb, ret, offset, len); + + acb->find_cluster_ret = ret; + + switch (ret) { + case QED_CLUSTER_FOUND: + qed_aio_write_inplace(acb, offset, len); + break; + + case QED_CLUSTER_L2: + case QED_CLUSTER_L1: + case QED_CLUSTER_ZERO: + qed_aio_write_alloc(acb, len); + break; + + default: + qed_aio_complete(acb, ret); + break; + } +} + +/** + * Read data cluster + * + * @opaque: Read request + * @ret: QED_CLUSTER_FOUND, QED_CLUSTER_L2, QED_CLUSTER_L1, + * or -errno + * @offset: Cluster offset in bytes + * @len: Length in bytes + * + * Callback from qed_find_cluster(). 
+ */ +static void qed_aio_read_data(void *opaque, int ret, + uint64_t offset, size_t len) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + BlockDriverState *bs = acb->common.bs; + + /* Adjust offset into cluster */ + offset += qed_offset_into_cluster(s, acb->cur_pos); + + trace_qed_aio_read_data(s, acb, ret, offset, len); + + if (ret < 0) { + goto err; + } + + qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len); + + /* Handle zero cluster and backing file reads */ + if (ret == QED_CLUSTER_ZERO) { + qemu_iovec_memset(&acb->cur_qiov, 0, 0, acb->cur_qiov.size); + qed_aio_next_io(acb, 0); + return; + } else if (ret != QED_CLUSTER_FOUND) { + qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov, + qed_aio_next_io, acb); + return; + } + + BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); + bdrv_aio_readv(bs->file, offset / BDRV_SECTOR_SIZE, + &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE, + qed_aio_next_io, acb); + return; + +err: + qed_aio_complete(acb, ret); +} + +/** + * Begin next I/O or complete the request + */ +static void qed_aio_next_io(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + QEDFindClusterFunc *io_fn = (acb->flags & QED_AIOCB_WRITE) ? + qed_aio_write_data : qed_aio_read_data; + + trace_qed_aio_next_io(s, acb, ret, acb->cur_pos + acb->cur_qiov.size); + + /* Handle I/O error */ + if (ret) { + qed_aio_complete(acb, ret); + return; + } + + acb->qiov_offset += acb->cur_qiov.size; + acb->cur_pos += acb->cur_qiov.size; + qemu_iovec_reset(&acb->cur_qiov); + + /* Complete request */ + if (acb->cur_pos >= acb->end_pos) { + qed_aio_complete(acb, 0); + return; + } + + /* Find next cluster and start I/O */ + qed_find_cluster(s, &acb->request, + acb->cur_pos, acb->end_pos - acb->cur_pos, + io_fn, acb); +} + +static BlockDriverAIOCB *qed_aio_setup(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque, int flags) +{ + QEDAIOCB *acb = qemu_aio_get(&qed_aiocb_info, bs, cb, opaque); + + trace_qed_aio_setup(bs->opaque, acb, sector_num, nb_sectors, + opaque, flags); + + acb->flags = flags; + acb->finished = NULL; + acb->qiov = qiov; + acb->qiov_offset = 0; + acb->cur_pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE; + acb->end_pos = acb->cur_pos + nb_sectors * BDRV_SECTOR_SIZE; + acb->request.l2_table = NULL; + qemu_iovec_init(&acb->cur_qiov, qiov->niov); + + /* Start request */ + qed_aio_next_io(acb, 0); + return &acb->common; +} + +static BlockDriverAIOCB *bdrv_qed_aio_readv(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque) +{ + return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); +} + +static BlockDriverAIOCB *bdrv_qed_aio_writev(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque) +{ + return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, + opaque, QED_AIOCB_WRITE); +} + +typedef struct { + Coroutine *co; + int ret; + bool done; +} QEDWriteZeroesCB; + +static void coroutine_fn qed_co_write_zeroes_cb(void *opaque, int ret) +{ + QEDWriteZeroesCB *cb = opaque; + + cb->done = true; + cb->ret = ret; + if (cb->co) { + qemu_coroutine_enter(cb->co, NULL); + } +} + +static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors) +{ + BlockDriverAIOCB *blockacb; + BDRVQEDState *s = bs->opaque; + QEDWriteZeroesCB cb = { .done = false }; 
+ QEMUIOVector qiov; + struct iovec iov; + + /* Refuse if there are untouched backing file sectors */ + if (bs->backing_hd) { + if (qed_offset_into_cluster(s, sector_num * BDRV_SECTOR_SIZE) != 0) { + return -ENOTSUP; + } + if (qed_offset_into_cluster(s, nb_sectors * BDRV_SECTOR_SIZE) != 0) { + return -ENOTSUP; + } + } + + /* Zero writes start without an I/O buffer. If a buffer becomes necessary + * then it will be allocated during request processing. + */ + iov.iov_base = NULL, + iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE, + + qemu_iovec_init_external(&qiov, &iov, 1); + blockacb = qed_aio_setup(bs, sector_num, &qiov, nb_sectors, + qed_co_write_zeroes_cb, &cb, + QED_AIOCB_WRITE | QED_AIOCB_ZERO); + if (!blockacb) { + return -EIO; + } + if (!cb.done) { + cb.co = qemu_coroutine_self(); + qemu_coroutine_yield(); + } + assert(cb.done); + return cb.ret; +} + +static int bdrv_qed_truncate(BlockDriverState *bs, int64_t offset) +{ + BDRVQEDState *s = bs->opaque; + uint64_t old_image_size; + int ret; + + if (!qed_is_image_size_valid(offset, s->header.cluster_size, + s->header.table_size)) { + return -EINVAL; + } + + /* Shrinking is currently not supported */ + if ((uint64_t)offset < s->header.image_size) { + return -ENOTSUP; + } + + old_image_size = s->header.image_size; + s->header.image_size = offset; + ret = qed_write_header_sync(s); + if (ret < 0) { + s->header.image_size = old_image_size; + } + return ret; +} + +static int64_t bdrv_qed_getlength(BlockDriverState *bs) +{ + BDRVQEDState *s = bs->opaque; + return s->header.image_size; +} + +static int bdrv_qed_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + BDRVQEDState *s = bs->opaque; + + memset(bdi, 0, sizeof(*bdi)); + bdi->cluster_size = s->header.cluster_size; + bdi->is_dirty = s->header.features & QED_F_NEED_CHECK; + return 0; +} + +static int bdrv_qed_change_backing_file(BlockDriverState *bs, + const char *backing_file, + const char *backing_fmt) +{ + BDRVQEDState *s = bs->opaque; + QEDHeader new_header, le_header; + void *buffer; + size_t buffer_len, backing_file_len; + int ret; + + /* Refuse to set backing filename if unknown compat feature bits are + * active. If the image uses an unknown compat feature then we may not + * know the layout of data following the header structure and cannot safely + * add a new string. 
+ */ + if (backing_file && (s->header.compat_features & + ~QED_COMPAT_FEATURE_MASK)) { + return -ENOTSUP; + } + + memcpy(&new_header, &s->header, sizeof(new_header)); + + new_header.features &= ~(QED_F_BACKING_FILE | + QED_F_BACKING_FORMAT_NO_PROBE); + + /* Adjust feature flags */ + if (backing_file) { + new_header.features |= QED_F_BACKING_FILE; + + if (qed_fmt_is_raw(backing_fmt)) { + new_header.features |= QED_F_BACKING_FORMAT_NO_PROBE; + } + } + + /* Calculate new header size */ + backing_file_len = 0; + + if (backing_file) { + backing_file_len = strlen(backing_file); + } + + buffer_len = sizeof(new_header); + new_header.backing_filename_offset = buffer_len; + new_header.backing_filename_size = backing_file_len; + buffer_len += backing_file_len; + + /* Make sure we can rewrite header without failing */ + if (buffer_len > new_header.header_size * new_header.cluster_size) { + return -ENOSPC; + } + + /* Prepare new header */ + buffer = g_malloc(buffer_len); + + qed_header_cpu_to_le(&new_header, &le_header); + memcpy(buffer, &le_header, sizeof(le_header)); + buffer_len = sizeof(le_header); + + if (backing_file) { + memcpy(buffer + buffer_len, backing_file, backing_file_len); + buffer_len += backing_file_len; + } + + /* Write new header */ + ret = bdrv_pwrite_sync(bs->file, 0, buffer, buffer_len); + g_free(buffer); + if (ret == 0) { + memcpy(&s->header, &new_header, sizeof(new_header)); + } + return ret; +} + +static void bdrv_qed_invalidate_cache(BlockDriverState *bs) +{ + BDRVQEDState *s = bs->opaque; + + bdrv_qed_close(bs); + memset(s, 0, sizeof(BDRVQEDState)); + bdrv_qed_open(bs, NULL, bs->open_flags); +} + +static int bdrv_qed_check(BlockDriverState *bs, BdrvCheckResult *result, + BdrvCheckMode fix) +{ + BDRVQEDState *s = bs->opaque; + + return qed_check(s, result, !!fix); +} + +static QEMUOptionParameter qed_create_options[] = { + { + .name = BLOCK_OPT_SIZE, + .type = OPT_SIZE, + .help = "Virtual disk size (in bytes)" + }, { + .name = BLOCK_OPT_BACKING_FILE, + .type = OPT_STRING, + .help = "File name of a base image" + }, { + .name = BLOCK_OPT_BACKING_FMT, + .type = OPT_STRING, + .help = "Image format of the base image" + }, { + .name = BLOCK_OPT_CLUSTER_SIZE, + .type = OPT_SIZE, + .help = "Cluster size (in bytes)", + .value = { .n = QED_DEFAULT_CLUSTER_SIZE }, + }, { + .name = BLOCK_OPT_TABLE_SIZE, + .type = OPT_SIZE, + .help = "L1/L2 table size (in clusters)" + }, + { /* end of list */ } +}; + +static BlockDriver bdrv_qed = { + .format_name = "qed", + .instance_size = sizeof(BDRVQEDState), + .create_options = qed_create_options, + + .bdrv_probe = bdrv_qed_probe, + .bdrv_rebind = bdrv_qed_rebind, + .bdrv_open = bdrv_qed_open, + .bdrv_close = bdrv_qed_close, + .bdrv_reopen_prepare = bdrv_qed_reopen_prepare, + .bdrv_create = bdrv_qed_create, + .bdrv_has_zero_init = bdrv_has_zero_init_1, + .bdrv_co_is_allocated = bdrv_qed_co_is_allocated, + .bdrv_make_empty = bdrv_qed_make_empty, + .bdrv_aio_readv = bdrv_qed_aio_readv, + .bdrv_aio_writev = bdrv_qed_aio_writev, + .bdrv_co_write_zeroes = bdrv_qed_co_write_zeroes, + .bdrv_truncate = bdrv_qed_truncate, + .bdrv_getlength = bdrv_qed_getlength, + .bdrv_get_info = bdrv_qed_get_info, + .bdrv_change_backing_file = bdrv_qed_change_backing_file, + .bdrv_invalidate_cache = bdrv_qed_invalidate_cache, + .bdrv_check = bdrv_qed_check, +}; + +static void bdrv_qed_init(void) +{ + bdrv_register(&bdrv_qed); +} + +block_init(bdrv_qed_init); diff --git a/contrib/qemu/block/qed.h b/contrib/qemu/block/qed.h new file mode 100644 index 00000000000..2b4ddedf313 
--- /dev/null +++ b/contrib/qemu/block/qed.h @@ -0,0 +1,344 @@ +/* + * QEMU Enhanced Disk Format + * + * Copyright IBM, Corp. 2010 + * + * Authors: + * Stefan Hajnoczi + * Anthony Liguori + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#ifndef BLOCK_QED_H +#define BLOCK_QED_H + +#include "block/block_int.h" + +/* The layout of a QED file is as follows: + * + * +--------+----------+----------+----------+-----+ + * | header | L1 table | cluster0 | cluster1 | ... | + * +--------+----------+----------+----------+-----+ + * + * There is a 2-level pagetable for cluster allocation: + * + * +----------+ + * | L1 table | + * +----------+ + * ,------' | '------. + * +----------+ | +----------+ + * | L2 table | ... | L2 table | + * +----------+ +----------+ + * ,------' | '------. + * +----------+ | +----------+ + * | Data | ... | Data | + * +----------+ +----------+ + * + * The L1 table is fixed size and always present. L2 tables are allocated on + * demand. The L1 table size determines the maximum possible image size; it + * can be influenced using the cluster_size and table_size values. + * + * All fields are little-endian on disk. + */ + +enum { + QED_MAGIC = 'Q' | 'E' << 8 | 'D' << 16 | '\0' << 24, + + /* The image supports a backing file */ + QED_F_BACKING_FILE = 0x01, + + /* The image needs a consistency check before use */ + QED_F_NEED_CHECK = 0x02, + + /* The backing file format must not be probed, treat as raw image */ + QED_F_BACKING_FORMAT_NO_PROBE = 0x04, + + /* Feature bits must be used when the on-disk format changes */ + QED_FEATURE_MASK = QED_F_BACKING_FILE | /* supported feature bits */ + QED_F_NEED_CHECK | + QED_F_BACKING_FORMAT_NO_PROBE, + QED_COMPAT_FEATURE_MASK = 0, /* supported compat feature bits */ + QED_AUTOCLEAR_FEATURE_MASK = 0, /* supported autoclear feature bits */ + + /* Data is stored in groups of sectors called clusters. Cluster size must + * be large to avoid keeping too much metadata. I/O requests that have + * sub-cluster size will require read-modify-write. + */ + QED_MIN_CLUSTER_SIZE = 4 * 1024, /* in bytes */ + QED_MAX_CLUSTER_SIZE = 64 * 1024 * 1024, + QED_DEFAULT_CLUSTER_SIZE = 64 * 1024, + + /* Allocated clusters are tracked using a 2-level pagetable. Table size is + * a multiple of clusters so large maximum image sizes can be supported + * without jacking up the cluster size too much. 
+ */ + QED_MIN_TABLE_SIZE = 1, /* in clusters */ + QED_MAX_TABLE_SIZE = 16, + QED_DEFAULT_TABLE_SIZE = 4, + + /* Delay to flush and clean image after last allocating write completes */ + QED_NEED_CHECK_TIMEOUT = 5, /* in seconds */ +}; + +typedef struct { + uint32_t magic; /* QED\0 */ + + uint32_t cluster_size; /* in bytes */ + uint32_t table_size; /* for L1 and L2 tables, in clusters */ + uint32_t header_size; /* in clusters */ + + uint64_t features; /* format feature bits */ + uint64_t compat_features; /* compatible feature bits */ + uint64_t autoclear_features; /* self-resetting feature bits */ + + uint64_t l1_table_offset; /* in bytes */ + uint64_t image_size; /* total logical image size, in bytes */ + + /* if (features & QED_F_BACKING_FILE) */ + uint32_t backing_filename_offset; /* in bytes from start of header */ + uint32_t backing_filename_size; /* in bytes */ +} QEDHeader; + +typedef struct { + uint64_t offsets[0]; /* in bytes */ +} QEDTable; + +/* The L2 cache is a simple write-through cache for L2 structures */ +typedef struct CachedL2Table { + QEDTable *table; + uint64_t offset; /* offset=0 indicates an invalidate entry */ + QTAILQ_ENTRY(CachedL2Table) node; + int ref; +} CachedL2Table; + +typedef struct { + QTAILQ_HEAD(, CachedL2Table) entries; + unsigned int n_entries; +} L2TableCache; + +typedef struct QEDRequest { + CachedL2Table *l2_table; +} QEDRequest; + +enum { + QED_AIOCB_WRITE = 0x0001, /* read or write? */ + QED_AIOCB_ZERO = 0x0002, /* zero write, used with QED_AIOCB_WRITE */ +}; + +typedef struct QEDAIOCB { + BlockDriverAIOCB common; + QEMUBH *bh; + int bh_ret; /* final return status for completion bh */ + QSIMPLEQ_ENTRY(QEDAIOCB) next; /* next request */ + int flags; /* QED_AIOCB_* bits ORed together */ + bool *finished; /* signal for cancel completion */ + uint64_t end_pos; /* request end on block device, in bytes */ + + /* User scatter-gather list */ + QEMUIOVector *qiov; + size_t qiov_offset; /* byte count already processed */ + + /* Current cluster scatter-gather list */ + QEMUIOVector cur_qiov; + uint64_t cur_pos; /* position on block device, in bytes */ + uint64_t cur_cluster; /* cluster offset in image file */ + unsigned int cur_nclusters; /* number of clusters being accessed */ + int find_cluster_ret; /* used for L1/L2 update */ + + QEDRequest request; +} QEDAIOCB; + +typedef struct { + BlockDriverState *bs; /* device */ + uint64_t file_size; /* length of image file, in bytes */ + + QEDHeader header; /* always cpu-endian */ + QEDTable *l1_table; + L2TableCache l2_cache; /* l2 table cache */ + uint32_t table_nelems; + uint32_t l1_shift; + uint32_t l2_shift; + uint32_t l2_mask; + + /* Allocating write request queue */ + QSIMPLEQ_HEAD(, QEDAIOCB) allocating_write_reqs; + bool allocating_write_reqs_plugged; + + /* Periodic flush and clear need check flag */ + QEMUTimer *need_check_timer; +} BDRVQEDState; + +enum { + QED_CLUSTER_FOUND, /* cluster found */ + QED_CLUSTER_ZERO, /* zero cluster found */ + QED_CLUSTER_L2, /* cluster missing in L2 */ + QED_CLUSTER_L1, /* cluster missing in L1 */ +}; + +/** + * qed_find_cluster() completion callback + * + * @opaque: User data for completion callback + * @ret: QED_CLUSTER_FOUND Success + * QED_CLUSTER_L2 Data cluster unallocated in L2 + * QED_CLUSTER_L1 L2 unallocated in L1 + * -errno POSIX error occurred + * @offset: Data cluster offset + * @len: Contiguous bytes starting from cluster offset + * + * This function is invoked when qed_find_cluster() completes. 
+ * + * On success ret is QED_CLUSTER_FOUND and offset/len are a contiguous range + * in the image file. + * + * On failure ret is QED_CLUSTER_L2 or QED_CLUSTER_L1 for missing L2 or L1 + * table offset, respectively. len is number of contiguous unallocated bytes. + */ +typedef void QEDFindClusterFunc(void *opaque, int ret, uint64_t offset, size_t len); + +/** + * Generic callback for chaining async callbacks + */ +typedef struct { + BlockDriverCompletionFunc *cb; + void *opaque; +} GenericCB; + +void *gencb_alloc(size_t len, BlockDriverCompletionFunc *cb, void *opaque); +void gencb_complete(void *opaque, int ret); + +/** + * Header functions + */ +int qed_write_header_sync(BDRVQEDState *s); + +/** + * L2 cache functions + */ +void qed_init_l2_cache(L2TableCache *l2_cache); +void qed_free_l2_cache(L2TableCache *l2_cache); +CachedL2Table *qed_alloc_l2_cache_entry(L2TableCache *l2_cache); +void qed_unref_l2_cache_entry(CachedL2Table *entry); +CachedL2Table *qed_find_l2_cache_entry(L2TableCache *l2_cache, uint64_t offset); +void qed_commit_l2_cache_entry(L2TableCache *l2_cache, CachedL2Table *l2_table); + +/** + * Table I/O functions + */ +int qed_read_l1_table_sync(BDRVQEDState *s); +void qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n, + BlockDriverCompletionFunc *cb, void *opaque); +int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index, + unsigned int n); +int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, + uint64_t offset); +void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset, + BlockDriverCompletionFunc *cb, void *opaque); +void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request, + unsigned int index, unsigned int n, bool flush, + BlockDriverCompletionFunc *cb, void *opaque); +int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request, + unsigned int index, unsigned int n, bool flush); + +/** + * Cluster functions + */ +void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos, + size_t len, QEDFindClusterFunc *cb, void *opaque); + +/** + * Consistency check + */ +int qed_check(BDRVQEDState *s, BdrvCheckResult *result, bool fix); + +QEDTable *qed_alloc_table(BDRVQEDState *s); + +/** + * Round down to the start of a cluster + */ +static inline uint64_t qed_start_of_cluster(BDRVQEDState *s, uint64_t offset) +{ + return offset & ~(uint64_t)(s->header.cluster_size - 1); +} + +static inline uint64_t qed_offset_into_cluster(BDRVQEDState *s, uint64_t offset) +{ + return offset & (s->header.cluster_size - 1); +} + +static inline uint64_t qed_bytes_to_clusters(BDRVQEDState *s, uint64_t bytes) +{ + return qed_start_of_cluster(s, bytes + (s->header.cluster_size - 1)) / + (s->header.cluster_size - 1); +} + +static inline unsigned int qed_l1_index(BDRVQEDState *s, uint64_t pos) +{ + return pos >> s->l1_shift; +} + +static inline unsigned int qed_l2_index(BDRVQEDState *s, uint64_t pos) +{ + return (pos >> s->l2_shift) & s->l2_mask; +} + +/** + * Test if a cluster offset is valid + */ +static inline bool qed_check_cluster_offset(BDRVQEDState *s, uint64_t offset) +{ + uint64_t header_size = (uint64_t)s->header.header_size * + s->header.cluster_size; + + if (offset & (s->header.cluster_size - 1)) { + return false; + } + return offset >= header_size && offset < s->file_size; +} + +/** + * Test if a table offset is valid + */ +static inline bool qed_check_table_offset(BDRVQEDState *s, uint64_t offset) +{ + uint64_t end_offset = offset + (s->header.table_size - 1) * + s->header.cluster_size; + + /* Overflow 
check */ + if (end_offset <= offset) { + return false; + } + + return qed_check_cluster_offset(s, offset) && + qed_check_cluster_offset(s, end_offset); +} + +static inline bool qed_offset_is_cluster_aligned(BDRVQEDState *s, + uint64_t offset) +{ + if (qed_offset_into_cluster(s, offset)) { + return false; + } + return true; +} + +static inline bool qed_offset_is_unalloc_cluster(uint64_t offset) +{ + if (offset == 0) { + return true; + } + return false; +} + +static inline bool qed_offset_is_zero_cluster(uint64_t offset) +{ + if (offset == 1) { + return true; + } + return false; +} + +#endif /* BLOCK_QED_H */ diff --git a/contrib/qemu/block/snapshot.c b/contrib/qemu/block/snapshot.c new file mode 100644 index 00000000000..6c6d9deea1f --- /dev/null +++ b/contrib/qemu/block/snapshot.c @@ -0,0 +1,157 @@ +/* + * Block layer snapshot related functions + * + * Copyright (c) 2003-2008 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "block/snapshot.h" +#include "block/block_int.h" + +int bdrv_snapshot_find(BlockDriverState *bs, QEMUSnapshotInfo *sn_info, + const char *name) +{ + QEMUSnapshotInfo *sn_tab, *sn; + int nb_sns, i, ret; + + ret = -ENOENT; + nb_sns = bdrv_snapshot_list(bs, &sn_tab); + if (nb_sns < 0) { + return ret; + } + for (i = 0; i < nb_sns; i++) { + sn = &sn_tab[i]; + if (!strcmp(sn->id_str, name) || !strcmp(sn->name, name)) { + *sn_info = *sn; + ret = 0; + break; + } + } + g_free(sn_tab); + return ret; +} + +int bdrv_can_snapshot(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) { + return 0; + } + + if (!drv->bdrv_snapshot_create) { + if (bs->file != NULL) { + return bdrv_can_snapshot(bs->file); + } + return 0; + } + + return 1; +} + +int bdrv_snapshot_create(BlockDriverState *bs, + QEMUSnapshotInfo *sn_info) +{ + BlockDriver *drv = bs->drv; + if (!drv) { + return -ENOMEDIUM; + } + if (drv->bdrv_snapshot_create) { + return drv->bdrv_snapshot_create(bs, sn_info); + } + if (bs->file) { + return bdrv_snapshot_create(bs->file, sn_info); + } + return -ENOTSUP; +} + +int bdrv_snapshot_goto(BlockDriverState *bs, + const char *snapshot_id) +{ + BlockDriver *drv = bs->drv; + int ret, open_ret; + + if (!drv) { + return -ENOMEDIUM; + } + if (drv->bdrv_snapshot_goto) { + return drv->bdrv_snapshot_goto(bs, snapshot_id); + } + + if (bs->file) { + drv->bdrv_close(bs); + ret = bdrv_snapshot_goto(bs->file, snapshot_id); + open_ret = drv->bdrv_open(bs, NULL, bs->open_flags); + if (open_ret < 0) { + bdrv_delete(bs->file); + bs->drv = NULL; + return open_ret; + } + return ret; + } + + return -ENOTSUP; +} + +int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id) +{ + BlockDriver *drv = bs->drv; + if (!drv) { + return -ENOMEDIUM; + } + if (drv->bdrv_snapshot_delete) { + return drv->bdrv_snapshot_delete(bs, snapshot_id); + } + if (bs->file) { + return bdrv_snapshot_delete(bs->file, snapshot_id); + } + return -ENOTSUP; +} + +int bdrv_snapshot_list(BlockDriverState *bs, + QEMUSnapshotInfo **psn_info) +{ + BlockDriver *drv = bs->drv; + if (!drv) { + return -ENOMEDIUM; + } + if (drv->bdrv_snapshot_list) { + return drv->bdrv_snapshot_list(bs, psn_info); + } + if (bs->file) { + return bdrv_snapshot_list(bs->file, psn_info); + } + return -ENOTSUP; +} + +int bdrv_snapshot_load_tmp(BlockDriverState *bs, + const char *snapshot_name) +{ + BlockDriver *drv = bs->drv; + if (!drv) { + return -ENOMEDIUM; + } + if (!bs->read_only) { + return -EINVAL; + } + if (drv->bdrv_snapshot_load_tmp) { + return drv->bdrv_snapshot_load_tmp(bs, snapshot_name); + } + return -ENOTSUP; +} -- cgit
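As a quick orientation for the imported QED driver, the arithmetic behind the layout comment in qed.h and the shift/mask setup in bdrv_qed_open() can be worked through with the default geometry. The sketch below is an editor's illustration only, not part of the imported sources; it assumes a GCC/Clang toolchain for __builtin_ctz(), which for powers of two equals the ffs(x) - 1 computation used by the driver.

    /* Editor's sketch (not part of the imported QEMU sources): reproduces the
     * table geometry math from bdrv_qed_open() and qed.h using the defaults,
     * QED_DEFAULT_CLUSTER_SIZE = 64 KiB and QED_DEFAULT_TABLE_SIZE = 4 clusters.
     */
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t cluster_size = 64 * 1024;   /* bytes per cluster */
        uint32_t table_size   = 4;           /* clusters per L1/L2 table */

        /* Each table stores (cluster_size * table_size) / sizeof(uint64_t) offsets */
        uint32_t table_nelems = (cluster_size * table_size) / sizeof(uint64_t);

        /* For powers of two, __builtin_ctz(x) == ffs(x) - 1 as used in the driver */
        uint32_t l2_shift = __builtin_ctz(cluster_size);            /* byte -> cluster index */
        uint32_t l2_mask  = table_nelems - 1;
        uint32_t l1_shift = l2_shift + __builtin_ctz(table_nelems); /* byte -> L1 index */

        /* One L2 table maps table_nelems clusters and the L1 table points at up
         * to table_nelems L2 tables, so the maximum image size is: */
        uint64_t max_image = (uint64_t)table_nelems * table_nelems * cluster_size;

        printf("table_nelems=%u l2_shift=%u l2_mask=%#x l1_shift=%u\n",
               table_nelems, l2_shift, l2_mask, l1_shift);
        printf("max image size = %llu bytes (64 TiB with the defaults)\n",
               (unsigned long long)max_image);
        return 0;
    }

With the default values this prints table_nelems=32768, l2_shift=16, l1_shift=31, matching the fields initialized in bdrv_qed_open() above.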
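Similarly, the special L2 entry values referenced by qed_update_l2_table() and the qed_offset_is_unalloc_cluster()/qed_offset_is_zero_cluster() helpers (0 for an unallocated cluster, 1 for an all-zero cluster, any other cluster-aligned value for an allocated data cluster) can be summarized in a small table-driven sketch. Again this is an editor's illustration under the same assumptions, with a hypothetical describe_l2_entry() helper, not imported code.

    /* Editor's sketch (not part of the imported QEMU sources): illustrates the
     * L2 entry convention used by qed_update_l2_table() and the
     * qed_offset_is_unalloc_cluster()/qed_offset_is_zero_cluster() helpers.
     */
    #include <stdint.h>
    #include <stdio.h>

    static const char *describe_l2_entry(uint64_t entry, uint32_t cluster_size)
    {
        if (entry == 0) {
            return "unallocated (read from backing file or as zeroes)";
        }
        if (entry == 1) {
            return "zero cluster (reads as zeroes, no data cluster on disk)";
        }
        if (entry & (cluster_size - 1)) {
            return "invalid (data cluster offsets must be cluster aligned)";
        }
        return "allocated data cluster at this byte offset in the image file";
    }

    int main(void)
    {
        uint32_t cluster_size = 64 * 1024;
        uint64_t entries[] = { 0, 1, 0x100000, 0x100200 };

        for (unsigned i = 0; i < sizeof(entries) / sizeof(entries[0]); i++) {
            printf("L2 entry %#llx: %s\n",
                   (unsigned long long)entries[i],
                   describe_l2_entry(entries[i], cluster_size));
        }
        return 0;
    }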