Diffstat (limited to 'contrib/qemu/block')
| -rw-r--r-- | contrib/qemu/block/qcow.c            |  914 |
| -rw-r--r-- | contrib/qemu/block/qcow2-cache.c     |  323 |
| -rw-r--r-- | contrib/qemu/block/qcow2-cluster.c   | 1478 |
| -rw-r--r-- | contrib/qemu/block/qcow2-refcount.c  | 1374 |
| -rw-r--r-- | contrib/qemu/block/qcow2-snapshot.c  |  660 |
| -rw-r--r-- | contrib/qemu/block/qcow2.c           | 1825 |
| -rw-r--r-- | contrib/qemu/block/qcow2.h           |  437 |
| -rw-r--r-- | contrib/qemu/block/qed-check.c       |  248 |
| -rw-r--r-- | contrib/qemu/block/qed-cluster.c     |  165 |
| -rw-r--r-- | contrib/qemu/block/qed-gencb.c       |   32 |
| -rw-r--r-- | contrib/qemu/block/qed-l2-cache.c    |  187 |
| -rw-r--r-- | contrib/qemu/block/qed-table.c       |  296 |
| -rw-r--r-- | contrib/qemu/block/qed.c             | 1596 |
| -rw-r--r-- | contrib/qemu/block/qed.h             |  344 |
| -rw-r--r-- | contrib/qemu/block/snapshot.c        |  157 |
15 files changed, 10036 insertions(+), 0 deletions(-)
diff --git a/contrib/qemu/block/qcow.c b/contrib/qemu/block/qcow.c new file mode 100644 index 000000000..5239bd68f --- /dev/null +++ b/contrib/qemu/block/qcow.c @@ -0,0 +1,914 @@ +/* + * Block driver for the QCOW format + * + * Copyright (c) 2004-2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include <zlib.h> +#include "qemu/aes.h" +#include "migration/migration.h" + +/**************************************************************/ +/* QEMU COW block driver with compression and encryption support */ + +#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb) +#define QCOW_VERSION 1 + +#define QCOW_CRYPT_NONE 0 +#define QCOW_CRYPT_AES  1 + +#define QCOW_OFLAG_COMPRESSED (1LL << 63) + +typedef struct QCowHeader { +    uint32_t magic; +    uint32_t version; +    uint64_t backing_file_offset; +    uint32_t backing_file_size; +    uint32_t mtime; +    uint64_t size; /* in bytes */ +    uint8_t cluster_bits; +    uint8_t l2_bits; +    uint32_t crypt_method; +    uint64_t l1_table_offset; +} QCowHeader; + +#define L2_CACHE_SIZE 16 + +typedef struct BDRVQcowState { +    int cluster_bits; +    int cluster_size; +    int cluster_sectors; +    int l2_bits; +    int l2_size; +    int l1_size; +    uint64_t cluster_offset_mask; +    uint64_t l1_table_offset; +    uint64_t *l1_table; +    uint64_t *l2_cache; +    uint64_t l2_cache_offsets[L2_CACHE_SIZE]; +    uint32_t l2_cache_counts[L2_CACHE_SIZE]; +    uint8_t *cluster_cache; +    uint8_t *cluster_data; +    uint64_t cluster_cache_offset; +    uint32_t crypt_method; /* current crypt method, 0 if no key yet */ +    uint32_t crypt_method_header; +    AES_KEY aes_encrypt_key; +    AES_KEY aes_decrypt_key; +    CoMutex lock; +    Error *migration_blocker; +} BDRVQcowState; + +static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset); + +static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename) +{ +    const QCowHeader *cow_header = (const void *)buf; + +    if (buf_size >= sizeof(QCowHeader) && +        be32_to_cpu(cow_header->magic) == QCOW_MAGIC && +        be32_to_cpu(cow_header->version) == QCOW_VERSION) +        return 100; +    else +        return 0; +} + +static int qcow_open(BlockDriverState *bs, QDict *options, int flags) +{ +    BDRVQcowState *s = bs->opaque; +    int len, i, shift, ret; +    QCowHeader header; + +    ret = bdrv_pread(bs->file, 0, &header, 
sizeof(header)); +    if (ret < 0) { +        goto fail; +    } +    be32_to_cpus(&header.magic); +    be32_to_cpus(&header.version); +    be64_to_cpus(&header.backing_file_offset); +    be32_to_cpus(&header.backing_file_size); +    be32_to_cpus(&header.mtime); +    be64_to_cpus(&header.size); +    be32_to_cpus(&header.crypt_method); +    be64_to_cpus(&header.l1_table_offset); + +    if (header.magic != QCOW_MAGIC) { +        ret = -EMEDIUMTYPE; +        goto fail; +    } +    if (header.version != QCOW_VERSION) { +        char version[64]; +        snprintf(version, sizeof(version), "QCOW version %d", header.version); +        qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, +            bs->device_name, "qcow", version); +        ret = -ENOTSUP; +        goto fail; +    } + +    if (header.size <= 1 || header.cluster_bits < 9) { +        ret = -EINVAL; +        goto fail; +    } +    if (header.crypt_method > QCOW_CRYPT_AES) { +        ret = -EINVAL; +        goto fail; +    } +    s->crypt_method_header = header.crypt_method; +    if (s->crypt_method_header) { +        bs->encrypted = 1; +    } +    s->cluster_bits = header.cluster_bits; +    s->cluster_size = 1 << s->cluster_bits; +    s->cluster_sectors = 1 << (s->cluster_bits - 9); +    s->l2_bits = header.l2_bits; +    s->l2_size = 1 << s->l2_bits; +    bs->total_sectors = header.size / 512; +    s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1; + +    /* read the level 1 table */ +    shift = s->cluster_bits + s->l2_bits; +    s->l1_size = (header.size + (1LL << shift) - 1) >> shift; + +    s->l1_table_offset = header.l1_table_offset; +    s->l1_table = g_malloc(s->l1_size * sizeof(uint64_t)); + +    ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table, +               s->l1_size * sizeof(uint64_t)); +    if (ret < 0) { +        goto fail; +    } + +    for(i = 0;i < s->l1_size; i++) { +        be64_to_cpus(&s->l1_table[i]); +    } +    /* alloc L2 cache */ +    s->l2_cache = g_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t)); +    s->cluster_cache = g_malloc(s->cluster_size); +    s->cluster_data = g_malloc(s->cluster_size); +    s->cluster_cache_offset = -1; + +    /* read the backing file name */ +    if (header.backing_file_offset != 0) { +        len = header.backing_file_size; +        if (len > 1023) { +            len = 1023; +        } +        ret = bdrv_pread(bs->file, header.backing_file_offset, +                   bs->backing_file, len); +        if (ret < 0) { +            goto fail; +        } +        bs->backing_file[len] = '\0'; +    } + +    /* Disable migration when qcow images are used */ +    error_set(&s->migration_blocker, +              QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, +              "qcow", bs->device_name, "live migration"); +    migrate_add_blocker(s->migration_blocker); + +    qemu_co_mutex_init(&s->lock); +    return 0; + + fail: +    g_free(s->l1_table); +    g_free(s->l2_cache); +    g_free(s->cluster_cache); +    g_free(s->cluster_data); +    return ret; +} + + +/* We have nothing to do for QCOW reopen, stubs just return + * success */ +static int qcow_reopen_prepare(BDRVReopenState *state, +                               BlockReopenQueue *queue, Error **errp) +{ +    return 0; +} + +static int qcow_set_key(BlockDriverState *bs, const char *key) +{ +    BDRVQcowState *s = bs->opaque; +    uint8_t keybuf[16]; +    int len, i; + +    memset(keybuf, 0, 16); +    len = strlen(key); +    if (len > 16) +        len = 16; +    /* XXX: we could compress the chars to 7 bits to 
increase +       entropy */ +    for(i = 0;i < len;i++) { +        keybuf[i] = key[i]; +    } +    s->crypt_method = s->crypt_method_header; + +    if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0) +        return -1; +    if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0) +        return -1; +    return 0; +} + +/* The crypt function is compatible with the linux cryptoloop +   algorithm for < 4 GB images. NOTE: out_buf == in_buf is +   supported */ +static void encrypt_sectors(BDRVQcowState *s, int64_t sector_num, +                            uint8_t *out_buf, const uint8_t *in_buf, +                            int nb_sectors, int enc, +                            const AES_KEY *key) +{ +    union { +        uint64_t ll[2]; +        uint8_t b[16]; +    } ivec; +    int i; + +    for(i = 0; i < nb_sectors; i++) { +        ivec.ll[0] = cpu_to_le64(sector_num); +        ivec.ll[1] = 0; +        AES_cbc_encrypt(in_buf, out_buf, 512, key, +                        ivec.b, enc); +        sector_num++; +        in_buf += 512; +        out_buf += 512; +    } +} + +/* 'allocate' is: + * + * 0 to not allocate. + * + * 1 to allocate a normal cluster (for sector indexes 'n_start' to + * 'n_end') + * + * 2 to allocate a compressed cluster of size + * 'compressed_size'. 'compressed_size' must be > 0 and < + * cluster_size + * + * return 0 if not allocated. + */ +static uint64_t get_cluster_offset(BlockDriverState *bs, +                                   uint64_t offset, int allocate, +                                   int compressed_size, +                                   int n_start, int n_end) +{ +    BDRVQcowState *s = bs->opaque; +    int min_index, i, j, l1_index, l2_index; +    uint64_t l2_offset, *l2_table, cluster_offset, tmp; +    uint32_t min_count; +    int new_l2_table; + +    l1_index = offset >> (s->l2_bits + s->cluster_bits); +    l2_offset = s->l1_table[l1_index]; +    new_l2_table = 0; +    if (!l2_offset) { +        if (!allocate) +            return 0; +        /* allocate a new l2 entry */ +        l2_offset = bdrv_getlength(bs->file); +        /* round to cluster size */ +        l2_offset = (l2_offset + s->cluster_size - 1) & ~(s->cluster_size - 1); +        /* update the L1 entry */ +        s->l1_table[l1_index] = l2_offset; +        tmp = cpu_to_be64(l2_offset); +        if (bdrv_pwrite_sync(bs->file, +                s->l1_table_offset + l1_index * sizeof(tmp), +                &tmp, sizeof(tmp)) < 0) +            return 0; +        new_l2_table = 1; +    } +    for(i = 0; i < L2_CACHE_SIZE; i++) { +        if (l2_offset == s->l2_cache_offsets[i]) { +            /* increment the hit count */ +            if (++s->l2_cache_counts[i] == 0xffffffff) { +                for(j = 0; j < L2_CACHE_SIZE; j++) { +                    s->l2_cache_counts[j] >>= 1; +                } +            } +            l2_table = s->l2_cache + (i << s->l2_bits); +            goto found; +        } +    } +    /* not found: load a new entry in the least used one */ +    min_index = 0; +    min_count = 0xffffffff; +    for(i = 0; i < L2_CACHE_SIZE; i++) { +        if (s->l2_cache_counts[i] < min_count) { +            min_count = s->l2_cache_counts[i]; +            min_index = i; +        } +    } +    l2_table = s->l2_cache + (min_index << s->l2_bits); +    if (new_l2_table) { +        memset(l2_table, 0, s->l2_size * sizeof(uint64_t)); +        if (bdrv_pwrite_sync(bs->file, l2_offset, l2_table, +                s->l2_size * sizeof(uint64_t)) < 0) +            return 0; + 
   } else { +        if (bdrv_pread(bs->file, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) != +            s->l2_size * sizeof(uint64_t)) +            return 0; +    } +    s->l2_cache_offsets[min_index] = l2_offset; +    s->l2_cache_counts[min_index] = 1; + found: +    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); +    cluster_offset = be64_to_cpu(l2_table[l2_index]); +    if (!cluster_offset || +        ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1)) { +        if (!allocate) +            return 0; +        /* allocate a new cluster */ +        if ((cluster_offset & QCOW_OFLAG_COMPRESSED) && +            (n_end - n_start) < s->cluster_sectors) { +            /* if the cluster is already compressed, we must +               decompress it in the case it is not completely +               overwritten */ +            if (decompress_cluster(bs, cluster_offset) < 0) +                return 0; +            cluster_offset = bdrv_getlength(bs->file); +            cluster_offset = (cluster_offset + s->cluster_size - 1) & +                ~(s->cluster_size - 1); +            /* write the cluster content */ +            if (bdrv_pwrite(bs->file, cluster_offset, s->cluster_cache, s->cluster_size) != +                s->cluster_size) +                return -1; +        } else { +            cluster_offset = bdrv_getlength(bs->file); +            if (allocate == 1) { +                /* round to cluster size */ +                cluster_offset = (cluster_offset + s->cluster_size - 1) & +                    ~(s->cluster_size - 1); +                bdrv_truncate(bs->file, cluster_offset + s->cluster_size); +                /* if encrypted, we must initialize the cluster +                   content which won't be written */ +                if (s->crypt_method && +                    (n_end - n_start) < s->cluster_sectors) { +                    uint64_t start_sect; +                    start_sect = (offset & ~(s->cluster_size - 1)) >> 9; +                    memset(s->cluster_data + 512, 0x00, 512); +                    for(i = 0; i < s->cluster_sectors; i++) { +                        if (i < n_start || i >= n_end) { +                            encrypt_sectors(s, start_sect + i, +                                            s->cluster_data, +                                            s->cluster_data + 512, 1, 1, +                                            &s->aes_encrypt_key); +                            if (bdrv_pwrite(bs->file, cluster_offset + i * 512, +                                            s->cluster_data, 512) != 512) +                                return -1; +                        } +                    } +                } +            } else if (allocate == 2) { +                cluster_offset |= QCOW_OFLAG_COMPRESSED | +                    (uint64_t)compressed_size << (63 - s->cluster_bits); +            } +        } +        /* update L2 table */ +        tmp = cpu_to_be64(cluster_offset); +        l2_table[l2_index] = tmp; +        if (bdrv_pwrite_sync(bs->file, l2_offset + l2_index * sizeof(tmp), +                &tmp, sizeof(tmp)) < 0) +            return 0; +    } +    return cluster_offset; +} + +static int coroutine_fn qcow_co_is_allocated(BlockDriverState *bs, +        int64_t sector_num, int nb_sectors, int *pnum) +{ +    BDRVQcowState *s = bs->opaque; +    int index_in_cluster, n; +    uint64_t cluster_offset; + +    qemu_co_mutex_lock(&s->lock); +    cluster_offset = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0); +    
qemu_co_mutex_unlock(&s->lock); +    index_in_cluster = sector_num & (s->cluster_sectors - 1); +    n = s->cluster_sectors - index_in_cluster; +    if (n > nb_sectors) +        n = nb_sectors; +    *pnum = n; +    return (cluster_offset != 0); +} + +static int decompress_buffer(uint8_t *out_buf, int out_buf_size, +                             const uint8_t *buf, int buf_size) +{ +    z_stream strm1, *strm = &strm1; +    int ret, out_len; + +    memset(strm, 0, sizeof(*strm)); + +    strm->next_in = (uint8_t *)buf; +    strm->avail_in = buf_size; +    strm->next_out = out_buf; +    strm->avail_out = out_buf_size; + +    ret = inflateInit2(strm, -12); +    if (ret != Z_OK) +        return -1; +    ret = inflate(strm, Z_FINISH); +    out_len = strm->next_out - out_buf; +    if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) || +        out_len != out_buf_size) { +        inflateEnd(strm); +        return -1; +    } +    inflateEnd(strm); +    return 0; +} + +static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) +{ +    BDRVQcowState *s = bs->opaque; +    int ret, csize; +    uint64_t coffset; + +    coffset = cluster_offset & s->cluster_offset_mask; +    if (s->cluster_cache_offset != coffset) { +        csize = cluster_offset >> (63 - s->cluster_bits); +        csize &= (s->cluster_size - 1); +        ret = bdrv_pread(bs->file, coffset, s->cluster_data, csize); +        if (ret != csize) +            return -1; +        if (decompress_buffer(s->cluster_cache, s->cluster_size, +                              s->cluster_data, csize) < 0) { +            return -1; +        } +        s->cluster_cache_offset = coffset; +    } +    return 0; +} + +static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num, +                         int nb_sectors, QEMUIOVector *qiov) +{ +    BDRVQcowState *s = bs->opaque; +    int index_in_cluster; +    int ret = 0, n; +    uint64_t cluster_offset; +    struct iovec hd_iov; +    QEMUIOVector hd_qiov; +    uint8_t *buf; +    void *orig_buf; + +    if (qiov->niov > 1) { +        buf = orig_buf = qemu_blockalign(bs, qiov->size); +    } else { +        orig_buf = NULL; +        buf = (uint8_t *)qiov->iov->iov_base; +    } + +    qemu_co_mutex_lock(&s->lock); + +    while (nb_sectors != 0) { +        /* prepare next request */ +        cluster_offset = get_cluster_offset(bs, sector_num << 9, +                                                 0, 0, 0, 0); +        index_in_cluster = sector_num & (s->cluster_sectors - 1); +        n = s->cluster_sectors - index_in_cluster; +        if (n > nb_sectors) { +            n = nb_sectors; +        } + +        if (!cluster_offset) { +            if (bs->backing_hd) { +                /* read from the base image */ +                hd_iov.iov_base = (void *)buf; +                hd_iov.iov_len = n * 512; +                qemu_iovec_init_external(&hd_qiov, &hd_iov, 1); +                qemu_co_mutex_unlock(&s->lock); +                ret = bdrv_co_readv(bs->backing_hd, sector_num, +                                    n, &hd_qiov); +                qemu_co_mutex_lock(&s->lock); +                if (ret < 0) { +                    goto fail; +                } +            } else { +                /* Note: in this case, no need to wait */ +                memset(buf, 0, 512 * n); +            } +        } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) { +            /* add AIO support for compressed blocks ? 
*/ +            if (decompress_cluster(bs, cluster_offset) < 0) { +                goto fail; +            } +            memcpy(buf, +                   s->cluster_cache + index_in_cluster * 512, 512 * n); +        } else { +            if ((cluster_offset & 511) != 0) { +                goto fail; +            } +            hd_iov.iov_base = (void *)buf; +            hd_iov.iov_len = n * 512; +            qemu_iovec_init_external(&hd_qiov, &hd_iov, 1); +            qemu_co_mutex_unlock(&s->lock); +            ret = bdrv_co_readv(bs->file, +                                (cluster_offset >> 9) + index_in_cluster, +                                n, &hd_qiov); +            qemu_co_mutex_lock(&s->lock); +            if (ret < 0) { +                break; +            } +            if (s->crypt_method) { +                encrypt_sectors(s, sector_num, buf, buf, +                                n, 0, +                                &s->aes_decrypt_key); +            } +        } +        ret = 0; + +        nb_sectors -= n; +        sector_num += n; +        buf += n * 512; +    } + +done: +    qemu_co_mutex_unlock(&s->lock); + +    if (qiov->niov > 1) { +        qemu_iovec_from_buf(qiov, 0, orig_buf, qiov->size); +        qemu_vfree(orig_buf); +    } + +    return ret; + +fail: +    ret = -EIO; +    goto done; +} + +static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num, +                          int nb_sectors, QEMUIOVector *qiov) +{ +    BDRVQcowState *s = bs->opaque; +    int index_in_cluster; +    uint64_t cluster_offset; +    const uint8_t *src_buf; +    int ret = 0, n; +    uint8_t *cluster_data = NULL; +    struct iovec hd_iov; +    QEMUIOVector hd_qiov; +    uint8_t *buf; +    void *orig_buf; + +    s->cluster_cache_offset = -1; /* disable compressed cache */ + +    if (qiov->niov > 1) { +        buf = orig_buf = qemu_blockalign(bs, qiov->size); +        qemu_iovec_to_buf(qiov, 0, buf, qiov->size); +    } else { +        orig_buf = NULL; +        buf = (uint8_t *)qiov->iov->iov_base; +    } + +    qemu_co_mutex_lock(&s->lock); + +    while (nb_sectors != 0) { + +        index_in_cluster = sector_num & (s->cluster_sectors - 1); +        n = s->cluster_sectors - index_in_cluster; +        if (n > nb_sectors) { +            n = nb_sectors; +        } +        cluster_offset = get_cluster_offset(bs, sector_num << 9, 1, 0, +                                            index_in_cluster, +                                            index_in_cluster + n); +        if (!cluster_offset || (cluster_offset & 511) != 0) { +            ret = -EIO; +            break; +        } +        if (s->crypt_method) { +            if (!cluster_data) { +                cluster_data = g_malloc0(s->cluster_size); +            } +            encrypt_sectors(s, sector_num, cluster_data, buf, +                            n, 1, &s->aes_encrypt_key); +            src_buf = cluster_data; +        } else { +            src_buf = buf; +        } + +        hd_iov.iov_base = (void *)src_buf; +        hd_iov.iov_len = n * 512; +        qemu_iovec_init_external(&hd_qiov, &hd_iov, 1); +        qemu_co_mutex_unlock(&s->lock); +        ret = bdrv_co_writev(bs->file, +                             (cluster_offset >> 9) + index_in_cluster, +                             n, &hd_qiov); +        qemu_co_mutex_lock(&s->lock); +        if (ret < 0) { +            break; +        } +        ret = 0; + +        nb_sectors -= n; +        sector_num += n; +        buf += n * 512; +    } +    
qemu_co_mutex_unlock(&s->lock); + +    if (qiov->niov > 1) { +        qemu_vfree(orig_buf); +    } +    g_free(cluster_data); + +    return ret; +} + +static void qcow_close(BlockDriverState *bs) +{ +    BDRVQcowState *s = bs->opaque; + +    g_free(s->l1_table); +    g_free(s->l2_cache); +    g_free(s->cluster_cache); +    g_free(s->cluster_data); + +    migrate_del_blocker(s->migration_blocker); +    error_free(s->migration_blocker); +} + +static int qcow_create(const char *filename, QEMUOptionParameter *options) +{ +    int header_size, backing_filename_len, l1_size, shift, i; +    QCowHeader header; +    uint8_t *tmp; +    int64_t total_size = 0; +    const char *backing_file = NULL; +    int flags = 0; +    int ret; +    BlockDriverState *qcow_bs; + +    /* Read out options */ +    while (options && options->name) { +        if (!strcmp(options->name, BLOCK_OPT_SIZE)) { +            total_size = options->value.n / 512; +        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) { +            backing_file = options->value.s; +        } else if (!strcmp(options->name, BLOCK_OPT_ENCRYPT)) { +            flags |= options->value.n ? BLOCK_FLAG_ENCRYPT : 0; +        } +        options++; +    } + +    ret = bdrv_create_file(filename, options); +    if (ret < 0) { +        return ret; +    } + +    ret = bdrv_file_open(&qcow_bs, filename, NULL, BDRV_O_RDWR); +    if (ret < 0) { +        return ret; +    } + +    ret = bdrv_truncate(qcow_bs, 0); +    if (ret < 0) { +        goto exit; +    } + +    memset(&header, 0, sizeof(header)); +    header.magic = cpu_to_be32(QCOW_MAGIC); +    header.version = cpu_to_be32(QCOW_VERSION); +    header.size = cpu_to_be64(total_size * 512); +    header_size = sizeof(header); +    backing_filename_len = 0; +    if (backing_file) { +        if (strcmp(backing_file, "fat:")) { +            header.backing_file_offset = cpu_to_be64(header_size); +            backing_filename_len = strlen(backing_file); +            header.backing_file_size = cpu_to_be32(backing_filename_len); +            header_size += backing_filename_len; +        } else { +            /* special backing file for vvfat */ +            backing_file = NULL; +        } +        header.cluster_bits = 9; /* 512 byte cluster to avoid copying +                                    unmodifyed sectors */ +        header.l2_bits = 12; /* 32 KB L2 tables */ +    } else { +        header.cluster_bits = 12; /* 4 KB clusters */ +        header.l2_bits = 9; /* 4 KB L2 tables */ +    } +    header_size = (header_size + 7) & ~7; +    shift = header.cluster_bits + header.l2_bits; +    l1_size = ((total_size * 512) + (1LL << shift) - 1) >> shift; + +    header.l1_table_offset = cpu_to_be64(header_size); +    if (flags & BLOCK_FLAG_ENCRYPT) { +        header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES); +    } else { +        header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE); +    } + +    /* write all the data */ +    ret = bdrv_pwrite(qcow_bs, 0, &header, sizeof(header)); +    if (ret != sizeof(header)) { +        goto exit; +    } + +    if (backing_file) { +        ret = bdrv_pwrite(qcow_bs, sizeof(header), +            backing_file, backing_filename_len); +        if (ret != backing_filename_len) { +            goto exit; +        } +    } + +    tmp = g_malloc0(BDRV_SECTOR_SIZE); +    for (i = 0; i < ((sizeof(uint64_t)*l1_size + BDRV_SECTOR_SIZE - 1)/ +        BDRV_SECTOR_SIZE); i++) { +        ret = bdrv_pwrite(qcow_bs, header_size + +            BDRV_SECTOR_SIZE*i, tmp, BDRV_SECTOR_SIZE); +        if 
(ret != BDRV_SECTOR_SIZE) { +            g_free(tmp); +            goto exit; +        } +    } + +    g_free(tmp); +    ret = 0; +exit: +    bdrv_delete(qcow_bs); +    return ret; +} + +static int qcow_make_empty(BlockDriverState *bs) +{ +    BDRVQcowState *s = bs->opaque; +    uint32_t l1_length = s->l1_size * sizeof(uint64_t); +    int ret; + +    memset(s->l1_table, 0, l1_length); +    if (bdrv_pwrite_sync(bs->file, s->l1_table_offset, s->l1_table, +            l1_length) < 0) +        return -1; +    ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length); +    if (ret < 0) +        return ret; + +    memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t)); +    memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t)); +    memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t)); + +    return 0; +} + +/* XXX: put compressed sectors first, then all the cluster aligned +   tables to avoid losing bytes in alignment */ +static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num, +                                 const uint8_t *buf, int nb_sectors) +{ +    BDRVQcowState *s = bs->opaque; +    z_stream strm; +    int ret, out_len; +    uint8_t *out_buf; +    uint64_t cluster_offset; + +    if (nb_sectors != s->cluster_sectors) { +        ret = -EINVAL; + +        /* Zero-pad last write if image size is not cluster aligned */ +        if (sector_num + nb_sectors == bs->total_sectors && +            nb_sectors < s->cluster_sectors) { +            uint8_t *pad_buf = qemu_blockalign(bs, s->cluster_size); +            memset(pad_buf, 0, s->cluster_size); +            memcpy(pad_buf, buf, nb_sectors * BDRV_SECTOR_SIZE); +            ret = qcow_write_compressed(bs, sector_num, +                                        pad_buf, s->cluster_sectors); +            qemu_vfree(pad_buf); +        } +        return ret; +    } + +    out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128); + +    /* best compression, small window, no zlib header */ +    memset(&strm, 0, sizeof(strm)); +    ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION, +                       Z_DEFLATED, -12, +                       9, Z_DEFAULT_STRATEGY); +    if (ret != 0) { +        ret = -EINVAL; +        goto fail; +    } + +    strm.avail_in = s->cluster_size; +    strm.next_in = (uint8_t *)buf; +    strm.avail_out = s->cluster_size; +    strm.next_out = out_buf; + +    ret = deflate(&strm, Z_FINISH); +    if (ret != Z_STREAM_END && ret != Z_OK) { +        deflateEnd(&strm); +        ret = -EINVAL; +        goto fail; +    } +    out_len = strm.next_out - out_buf; + +    deflateEnd(&strm); + +    if (ret != Z_STREAM_END || out_len >= s->cluster_size) { +        /* could not compress: write normal cluster */ +        ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors); +        if (ret < 0) { +            goto fail; +        } +    } else { +        cluster_offset = get_cluster_offset(bs, sector_num << 9, 2, +                                            out_len, 0, 0); +        if (cluster_offset == 0) { +            ret = -EIO; +            goto fail; +        } + +        cluster_offset &= s->cluster_offset_mask; +        ret = bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len); +        if (ret < 0) { +            goto fail; +        } +    } + +    ret = 0; +fail: +    g_free(out_buf); +    return ret; +} + +static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ +    BDRVQcowState *s = bs->opaque; +    bdi->cluster_size = s->cluster_size; 
+    return 0; +} + + +static QEMUOptionParameter qcow_create_options[] = { +    { +        .name = BLOCK_OPT_SIZE, +        .type = OPT_SIZE, +        .help = "Virtual disk size" +    }, +    { +        .name = BLOCK_OPT_BACKING_FILE, +        .type = OPT_STRING, +        .help = "File name of a base image" +    }, +    { +        .name = BLOCK_OPT_ENCRYPT, +        .type = OPT_FLAG, +        .help = "Encrypt the image" +    }, +    { NULL } +}; + +static BlockDriver bdrv_qcow = { +    .format_name	= "qcow", +    .instance_size	= sizeof(BDRVQcowState), +    .bdrv_probe		= qcow_probe, +    .bdrv_open		= qcow_open, +    .bdrv_close		= qcow_close, +    .bdrv_reopen_prepare = qcow_reopen_prepare, +    .bdrv_create	= qcow_create, +    .bdrv_has_zero_init     = bdrv_has_zero_init_1, + +    .bdrv_co_readv          = qcow_co_readv, +    .bdrv_co_writev         = qcow_co_writev, +    .bdrv_co_is_allocated   = qcow_co_is_allocated, + +    .bdrv_set_key           = qcow_set_key, +    .bdrv_make_empty        = qcow_make_empty, +    .bdrv_write_compressed  = qcow_write_compressed, +    .bdrv_get_info          = qcow_get_info, + +    .create_options = qcow_create_options, +}; + +static void bdrv_qcow_init(void) +{ +    bdrv_register(&bdrv_qcow); +} + +block_init(bdrv_qcow_init); diff --git a/contrib/qemu/block/qcow2-cache.c b/contrib/qemu/block/qcow2-cache.c new file mode 100644 index 000000000..2f3114ecc --- /dev/null +++ b/contrib/qemu/block/qcow2-cache.c @@ -0,0 +1,323 @@ +/* + * L2/refcount table cache for the QCOW2 format + * + * Copyright (c) 2010 Kevin Wolf <kwolf@redhat.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "block/block_int.h" +#include "qemu-common.h" +#include "qcow2.h" +#include "trace.h" + +typedef struct Qcow2CachedTable { +    void*   table; +    int64_t offset; +    bool    dirty; +    int     cache_hits; +    int     ref; +} Qcow2CachedTable; + +struct Qcow2Cache { +    Qcow2CachedTable*       entries; +    struct Qcow2Cache*      depends; +    int                     size; +    bool                    depends_on_flush; +}; + +Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables) +{ +    BDRVQcowState *s = bs->opaque; +    Qcow2Cache *c; +    int i; + +    c = g_malloc0(sizeof(*c)); +    c->size = num_tables; +    c->entries = g_malloc0(sizeof(*c->entries) * num_tables); + +    for (i = 0; i < c->size; i++) { +        c->entries[i].table = qemu_blockalign(bs, s->cluster_size); +    } + +    return c; +} + +int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c) +{ +    int i; + +    for (i = 0; i < c->size; i++) { +        assert(c->entries[i].ref == 0); +        qemu_vfree(c->entries[i].table); +    } + +    g_free(c->entries); +    g_free(c); + +    return 0; +} + +static int qcow2_cache_flush_dependency(BlockDriverState *bs, Qcow2Cache *c) +{ +    int ret; + +    ret = qcow2_cache_flush(bs, c->depends); +    if (ret < 0) { +        return ret; +    } + +    c->depends = NULL; +    c->depends_on_flush = false; + +    return 0; +} + +static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i) +{ +    BDRVQcowState *s = bs->opaque; +    int ret = 0; + +    if (!c->entries[i].dirty || !c->entries[i].offset) { +        return 0; +    } + +    trace_qcow2_cache_entry_flush(qemu_coroutine_self(), +                                  c == s->l2_table_cache, i); + +    if (c->depends) { +        ret = qcow2_cache_flush_dependency(bs, c); +    } else if (c->depends_on_flush) { +        ret = bdrv_flush(bs->file); +        if (ret >= 0) { +            c->depends_on_flush = false; +        } +    } + +    if (ret < 0) { +        return ret; +    } + +    if (c == s->refcount_block_cache) { +        BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_UPDATE_PART); +    } else if (c == s->l2_table_cache) { +        BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE); +    } + +    ret = bdrv_pwrite(bs->file, c->entries[i].offset, c->entries[i].table, +        s->cluster_size); +    if (ret < 0) { +        return ret; +    } + +    c->entries[i].dirty = false; + +    return 0; +} + +int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c) +{ +    BDRVQcowState *s = bs->opaque; +    int result = 0; +    int ret; +    int i; + +    trace_qcow2_cache_flush(qemu_coroutine_self(), c == s->l2_table_cache); + +    for (i = 0; i < c->size; i++) { +        ret = qcow2_cache_entry_flush(bs, c, i); +        if (ret < 0 && result != -ENOSPC) { +            result = ret; +        } +    } + +    if (result == 0) { +        ret = bdrv_flush(bs->file); +        if (ret < 0) { +            result = ret; +        } +    } + +    return result; +} + +int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c, +    Qcow2Cache *dependency) +{ +    int ret; + +    if (dependency->depends) { +        ret = qcow2_cache_flush_dependency(bs, dependency); +        if (ret < 0) { +            return ret; +        } +    } + +    if (c->depends && (c->depends != dependency)) { +        ret = qcow2_cache_flush_dependency(bs, c); +        if (ret < 0) { +            return ret; +        } +    } + +    c->depends = dependency; +    return 0; +} + +void 
qcow2_cache_depends_on_flush(Qcow2Cache *c) +{ +    c->depends_on_flush = true; +} + +static int qcow2_cache_find_entry_to_replace(Qcow2Cache *c) +{ +    int i; +    int min_count = INT_MAX; +    int min_index = -1; + + +    for (i = 0; i < c->size; i++) { +        if (c->entries[i].ref) { +            continue; +        } + +        if (c->entries[i].cache_hits < min_count) { +            min_index = i; +            min_count = c->entries[i].cache_hits; +        } + +        /* Give newer hits priority */ +        /* TODO Check how to optimize the replacement strategy */ +        c->entries[i].cache_hits /= 2; +    } + +    if (min_index == -1) { +        /* This can't happen in current synchronous code, but leave the check +         * here as a reminder for whoever starts using AIO with the cache */ +        abort(); +    } +    return min_index; +} + +static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c, +    uint64_t offset, void **table, bool read_from_disk) +{ +    BDRVQcowState *s = bs->opaque; +    int i; +    int ret; + +    trace_qcow2_cache_get(qemu_coroutine_self(), c == s->l2_table_cache, +                          offset, read_from_disk); + +    /* Check if the table is already cached */ +    for (i = 0; i < c->size; i++) { +        if (c->entries[i].offset == offset) { +            goto found; +        } +    } + +    /* If not, write a table back and replace it */ +    i = qcow2_cache_find_entry_to_replace(c); +    trace_qcow2_cache_get_replace_entry(qemu_coroutine_self(), +                                        c == s->l2_table_cache, i); +    if (i < 0) { +        return i; +    } + +    ret = qcow2_cache_entry_flush(bs, c, i); +    if (ret < 0) { +        return ret; +    } + +    trace_qcow2_cache_get_read(qemu_coroutine_self(), +                               c == s->l2_table_cache, i); +    c->entries[i].offset = 0; +    if (read_from_disk) { +        if (c == s->l2_table_cache) { +            BLKDBG_EVENT(bs->file, BLKDBG_L2_LOAD); +        } + +        ret = bdrv_pread(bs->file, offset, c->entries[i].table, s->cluster_size); +        if (ret < 0) { +            return ret; +        } +    } + +    /* Give the table some hits for the start so that it won't be replaced +     * immediately. The number 32 is completely arbitrary. 
*/ +    c->entries[i].cache_hits = 32; +    c->entries[i].offset = offset; + +    /* And return the right table */ +found: +    c->entries[i].cache_hits++; +    c->entries[i].ref++; +    *table = c->entries[i].table; + +    trace_qcow2_cache_get_done(qemu_coroutine_self(), +                               c == s->l2_table_cache, i); + +    return 0; +} + +int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, +    void **table) +{ +    return qcow2_cache_do_get(bs, c, offset, table, true); +} + +int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, +    void **table) +{ +    return qcow2_cache_do_get(bs, c, offset, table, false); +} + +int qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table) +{ +    int i; + +    for (i = 0; i < c->size; i++) { +        if (c->entries[i].table == *table) { +            goto found; +        } +    } +    return -ENOENT; + +found: +    c->entries[i].ref--; +    *table = NULL; + +    assert(c->entries[i].ref >= 0); +    return 0; +} + +void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table) +{ +    int i; + +    for (i = 0; i < c->size; i++) { +        if (c->entries[i].table == table) { +            goto found; +        } +    } +    abort(); + +found: +    c->entries[i].dirty = true; +} diff --git a/contrib/qemu/block/qcow2-cluster.c b/contrib/qemu/block/qcow2-cluster.c new file mode 100644 index 000000000..cca76d4fc --- /dev/null +++ b/contrib/qemu/block/qcow2-cluster.c @@ -0,0 +1,1478 @@ +/* + * Block driver for the QCOW version 2 format + * + * Copyright (c) 2004-2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include <zlib.h> + +#include "qemu-common.h" +#include "block/block_int.h" +#include "block/qcow2.h" +#include "trace.h" + +int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, +                        bool exact_size) +{ +    BDRVQcowState *s = bs->opaque; +    int new_l1_size2, ret, i; +    uint64_t *new_l1_table; +    int64_t new_l1_table_offset, new_l1_size; +    uint8_t data[12]; + +    if (min_size <= s->l1_size) +        return 0; + +    if (exact_size) { +        new_l1_size = min_size; +    } else { +        /* Bump size up to reduce the number of times we have to grow */ +        new_l1_size = s->l1_size; +        if (new_l1_size == 0) { +            new_l1_size = 1; +        } +        while (min_size > new_l1_size) { +            new_l1_size = (new_l1_size * 3 + 1) / 2; +        } +    } + +    if (new_l1_size > INT_MAX) { +        return -EFBIG; +    } + +#ifdef DEBUG_ALLOC2 +    fprintf(stderr, "grow l1_table from %d to %" PRId64 "\n", +            s->l1_size, new_l1_size); +#endif + +    new_l1_size2 = sizeof(uint64_t) * new_l1_size; +    new_l1_table = g_malloc0(align_offset(new_l1_size2, 512)); +    memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t)); + +    /* write new table (align to cluster) */ +    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ALLOC_TABLE); +    new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2); +    if (new_l1_table_offset < 0) { +        g_free(new_l1_table); +        return new_l1_table_offset; +    } + +    ret = qcow2_cache_flush(bs, s->refcount_block_cache); +    if (ret < 0) { +        goto fail; +    } + +    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE); +    for(i = 0; i < s->l1_size; i++) +        new_l1_table[i] = cpu_to_be64(new_l1_table[i]); +    ret = bdrv_pwrite_sync(bs->file, new_l1_table_offset, new_l1_table, new_l1_size2); +    if (ret < 0) +        goto fail; +    for(i = 0; i < s->l1_size; i++) +        new_l1_table[i] = be64_to_cpu(new_l1_table[i]); + +    /* set new table */ +    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE); +    cpu_to_be32w((uint32_t*)data, new_l1_size); +    cpu_to_be64wu((uint64_t*)(data + 4), new_l1_table_offset); +    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_size), data,sizeof(data)); +    if (ret < 0) { +        goto fail; +    } +    g_free(s->l1_table); +    qcow2_free_clusters(bs, s->l1_table_offset, s->l1_size * sizeof(uint64_t), +                        QCOW2_DISCARD_OTHER); +    s->l1_table_offset = new_l1_table_offset; +    s->l1_table = new_l1_table; +    s->l1_size = new_l1_size; +    return 0; + fail: +    g_free(new_l1_table); +    qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2, +                        QCOW2_DISCARD_OTHER); +    return ret; +} + +/* + * l2_load + * + * Loads a L2 table into memory. If the table is in the cache, the cache + * is used; otherwise the L2 table is loaded from the image file. + * + * Returns a pointer to the L2 table on success, or NULL if the read from + * the image file failed. 
+ */ + +static int l2_load(BlockDriverState *bs, uint64_t l2_offset, +    uint64_t **l2_table) +{ +    BDRVQcowState *s = bs->opaque; +    int ret; + +    ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, (void**) l2_table); + +    return ret; +} + +/* + * Writes one sector of the L1 table to the disk (can't update single entries + * and we really don't want bdrv_pread to perform a read-modify-write) + */ +#define L1_ENTRIES_PER_SECTOR (512 / 8) +static int write_l1_entry(BlockDriverState *bs, int l1_index) +{ +    BDRVQcowState *s = bs->opaque; +    uint64_t buf[L1_ENTRIES_PER_SECTOR]; +    int l1_start_index; +    int i, ret; + +    l1_start_index = l1_index & ~(L1_ENTRIES_PER_SECTOR - 1); +    for (i = 0; i < L1_ENTRIES_PER_SECTOR; i++) { +        buf[i] = cpu_to_be64(s->l1_table[l1_start_index + i]); +    } + +    BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE); +    ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset + 8 * l1_start_index, +        buf, sizeof(buf)); +    if (ret < 0) { +        return ret; +    } + +    return 0; +} + +/* + * l2_allocate + * + * Allocate a new l2 entry in the file. If l1_index points to an already + * used entry in the L2 table (i.e. we are doing a copy on write for the L2 + * table) copy the contents of the old L2 table into the newly allocated one. + * Otherwise the new table is initialized with zeros. + * + */ + +static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table) +{ +    BDRVQcowState *s = bs->opaque; +    uint64_t old_l2_offset; +    uint64_t *l2_table; +    int64_t l2_offset; +    int ret; + +    old_l2_offset = s->l1_table[l1_index]; + +    trace_qcow2_l2_allocate(bs, l1_index); + +    /* allocate a new l2 entry */ + +    l2_offset = qcow2_alloc_clusters(bs, s->l2_size * sizeof(uint64_t)); +    if (l2_offset < 0) { +        return l2_offset; +    } + +    ret = qcow2_cache_flush(bs, s->refcount_block_cache); +    if (ret < 0) { +        goto fail; +    } + +    /* allocate a new entry in the l2 cache */ + +    trace_qcow2_l2_allocate_get_empty(bs, l1_index); +    ret = qcow2_cache_get_empty(bs, s->l2_table_cache, l2_offset, (void**) table); +    if (ret < 0) { +        return ret; +    } + +    l2_table = *table; + +    if ((old_l2_offset & L1E_OFFSET_MASK) == 0) { +        /* if there was no old l2 table, clear the new table */ +        memset(l2_table, 0, s->l2_size * sizeof(uint64_t)); +    } else { +        uint64_t* old_table; + +        /* if there was an old l2 table, read it from the disk */ +        BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ); +        ret = qcow2_cache_get(bs, s->l2_table_cache, +            old_l2_offset & L1E_OFFSET_MASK, +            (void**) &old_table); +        if (ret < 0) { +            goto fail; +        } + +        memcpy(l2_table, old_table, s->cluster_size); + +        ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &old_table); +        if (ret < 0) { +            goto fail; +        } +    } + +    /* write the l2 table to the file */ +    BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE); + +    trace_qcow2_l2_allocate_write_l2(bs, l1_index); +    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); +    ret = qcow2_cache_flush(bs, s->l2_table_cache); +    if (ret < 0) { +        goto fail; +    } + +    /* update the L1 entry */ +    trace_qcow2_l2_allocate_write_l1(bs, l1_index); +    s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED; +    ret = write_l1_entry(bs, l1_index); +    if (ret < 0) { +        goto fail; +    } + +    *table = l2_table; +    
trace_qcow2_l2_allocate_done(bs, l1_index, 0); +    return 0; + +fail: +    trace_qcow2_l2_allocate_done(bs, l1_index, ret); +    qcow2_cache_put(bs, s->l2_table_cache, (void**) table); +    s->l1_table[l1_index] = old_l2_offset; +    return ret; +} + +/* + * Checks how many clusters in a given L2 table are contiguous in the image + * file. As soon as one of the flags in the bitmask stop_flags changes compared + * to the first cluster, the search is stopped and the cluster is not counted + * as contiguous. (This allows it, for example, to stop at the first compressed + * cluster which may require a different handling) + */ +static int count_contiguous_clusters(uint64_t nb_clusters, int cluster_size, +        uint64_t *l2_table, uint64_t start, uint64_t stop_flags) +{ +    int i; +    uint64_t mask = stop_flags | L2E_OFFSET_MASK; +    uint64_t offset = be64_to_cpu(l2_table[0]) & mask; + +    if (!offset) +        return 0; + +    for (i = start; i < start + nb_clusters; i++) { +        uint64_t l2_entry = be64_to_cpu(l2_table[i]) & mask; +        if (offset + (uint64_t) i * cluster_size != l2_entry) { +            break; +        } +    } + +	return (i - start); +} + +static int count_contiguous_free_clusters(uint64_t nb_clusters, uint64_t *l2_table) +{ +    int i; + +    for (i = 0; i < nb_clusters; i++) { +        int type = qcow2_get_cluster_type(be64_to_cpu(l2_table[i])); + +        if (type != QCOW2_CLUSTER_UNALLOCATED) { +            break; +        } +    } + +    return i; +} + +/* The crypt function is compatible with the linux cryptoloop +   algorithm for < 4 GB images. NOTE: out_buf == in_buf is +   supported */ +void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num, +                           uint8_t *out_buf, const uint8_t *in_buf, +                           int nb_sectors, int enc, +                           const AES_KEY *key) +{ +    union { +        uint64_t ll[2]; +        uint8_t b[16]; +    } ivec; +    int i; + +    for(i = 0; i < nb_sectors; i++) { +        ivec.ll[0] = cpu_to_le64(sector_num); +        ivec.ll[1] = 0; +        AES_cbc_encrypt(in_buf, out_buf, 512, key, +                        ivec.b, enc); +        sector_num++; +        in_buf += 512; +        out_buf += 512; +    } +} + +static int coroutine_fn copy_sectors(BlockDriverState *bs, +                                     uint64_t start_sect, +                                     uint64_t cluster_offset, +                                     int n_start, int n_end) +{ +    BDRVQcowState *s = bs->opaque; +    QEMUIOVector qiov; +    struct iovec iov; +    int n, ret; + +    /* +     * If this is the last cluster and it is only partially used, we must only +     * copy until the end of the image, or bdrv_check_request will fail for the +     * bdrv_read/write calls below. +     */ +    if (start_sect + n_end > bs->total_sectors) { +        n_end = bs->total_sectors - start_sect; +    } + +    n = n_end - n_start; +    if (n <= 0) { +        return 0; +    } + +    iov.iov_len = n * BDRV_SECTOR_SIZE; +    iov.iov_base = qemu_blockalign(bs, iov.iov_len); + +    qemu_iovec_init_external(&qiov, &iov, 1); + +    BLKDBG_EVENT(bs->file, BLKDBG_COW_READ); + +    /* Call .bdrv_co_readv() directly instead of using the public block-layer +     * interface.  This avoids double I/O throttling and request tracking, +     * which can lead to deadlock when block layer copy-on-read is enabled. 
+     */ +    ret = bs->drv->bdrv_co_readv(bs, start_sect + n_start, n, &qiov); +    if (ret < 0) { +        goto out; +    } + +    if (s->crypt_method) { +        qcow2_encrypt_sectors(s, start_sect + n_start, +                        iov.iov_base, iov.iov_base, n, 1, +                        &s->aes_encrypt_key); +    } + +    BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE); +    ret = bdrv_co_writev(bs->file, (cluster_offset >> 9) + n_start, n, &qiov); +    if (ret < 0) { +        goto out; +    } + +    ret = 0; +out: +    qemu_vfree(iov.iov_base); +    return ret; +} + + +/* + * get_cluster_offset + * + * For a given offset of the disk image, find the cluster offset in + * qcow2 file. The offset is stored in *cluster_offset. + * + * on entry, *num is the number of contiguous sectors we'd like to + * access following offset. + * + * on exit, *num is the number of contiguous sectors we can read. + * + * Returns the cluster type (QCOW2_CLUSTER_*) on success, -errno in error + * cases. + */ +int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, +    int *num, uint64_t *cluster_offset) +{ +    BDRVQcowState *s = bs->opaque; +    unsigned int l2_index; +    uint64_t l1_index, l2_offset, *l2_table; +    int l1_bits, c; +    unsigned int index_in_cluster, nb_clusters; +    uint64_t nb_available, nb_needed; +    int ret; + +    index_in_cluster = (offset >> 9) & (s->cluster_sectors - 1); +    nb_needed = *num + index_in_cluster; + +    l1_bits = s->l2_bits + s->cluster_bits; + +    /* compute how many bytes there are between the offset and +     * the end of the l1 entry +     */ + +    nb_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1)); + +    /* compute the number of available sectors */ + +    nb_available = (nb_available >> 9) + index_in_cluster; + +    if (nb_needed > nb_available) { +        nb_needed = nb_available; +    } + +    *cluster_offset = 0; + +    /* seek the the l2 offset in the l1 table */ + +    l1_index = offset >> l1_bits; +    if (l1_index >= s->l1_size) { +        ret = QCOW2_CLUSTER_UNALLOCATED; +        goto out; +    } + +    l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK; +    if (!l2_offset) { +        ret = QCOW2_CLUSTER_UNALLOCATED; +        goto out; +    } + +    /* load the l2 table in memory */ + +    ret = l2_load(bs, l2_offset, &l2_table); +    if (ret < 0) { +        return ret; +    } + +    /* find the cluster offset for the given disk offset */ + +    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); +    *cluster_offset = be64_to_cpu(l2_table[l2_index]); +    nb_clusters = size_to_clusters(s, nb_needed << 9); + +    ret = qcow2_get_cluster_type(*cluster_offset); +    switch (ret) { +    case QCOW2_CLUSTER_COMPRESSED: +        /* Compressed clusters can only be processed one by one */ +        c = 1; +        *cluster_offset &= L2E_COMPRESSED_OFFSET_SIZE_MASK; +        break; +    case QCOW2_CLUSTER_ZERO: +        if (s->qcow_version < 3) { +            return -EIO; +        } +        c = count_contiguous_clusters(nb_clusters, s->cluster_size, +                &l2_table[l2_index], 0, +                QCOW_OFLAG_COMPRESSED | QCOW_OFLAG_ZERO); +        *cluster_offset = 0; +        break; +    case QCOW2_CLUSTER_UNALLOCATED: +        /* how many empty clusters ? */ +        c = count_contiguous_free_clusters(nb_clusters, &l2_table[l2_index]); +        *cluster_offset = 0; +        break; +    case QCOW2_CLUSTER_NORMAL: +        /* how many allocated clusters ? 
*/ +        c = count_contiguous_clusters(nb_clusters, s->cluster_size, +                &l2_table[l2_index], 0, +                QCOW_OFLAG_COMPRESSED | QCOW_OFLAG_ZERO); +        *cluster_offset &= L2E_OFFSET_MASK; +        break; +    default: +        abort(); +    } + +    qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); + +    nb_available = (c * s->cluster_sectors); + +out: +    if (nb_available > nb_needed) +        nb_available = nb_needed; + +    *num = nb_available - index_in_cluster; + +    return ret; +} + +/* + * get_cluster_table + * + * for a given disk offset, load (and allocate if needed) + * the l2 table. + * + * the l2 table offset in the qcow2 file and the cluster index + * in the l2 table are given to the caller. + * + * Returns 0 on success, -errno in failure case + */ +static int get_cluster_table(BlockDriverState *bs, uint64_t offset, +                             uint64_t **new_l2_table, +                             int *new_l2_index) +{ +    BDRVQcowState *s = bs->opaque; +    unsigned int l2_index; +    uint64_t l1_index, l2_offset; +    uint64_t *l2_table = NULL; +    int ret; + +    /* seek the the l2 offset in the l1 table */ + +    l1_index = offset >> (s->l2_bits + s->cluster_bits); +    if (l1_index >= s->l1_size) { +        ret = qcow2_grow_l1_table(bs, l1_index + 1, false); +        if (ret < 0) { +            return ret; +        } +    } + +    assert(l1_index < s->l1_size); +    l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK; + +    /* seek the l2 table of the given l2 offset */ + +    if (s->l1_table[l1_index] & QCOW_OFLAG_COPIED) { +        /* load the l2 table in memory */ +        ret = l2_load(bs, l2_offset, &l2_table); +        if (ret < 0) { +            return ret; +        } +    } else { +        /* First allocate a new L2 table (and do COW if needed) */ +        ret = l2_allocate(bs, l1_index, &l2_table); +        if (ret < 0) { +            return ret; +        } + +        /* Then decrease the refcount of the old table */ +        if (l2_offset) { +            qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t), +                                QCOW2_DISCARD_OTHER); +        } +    } + +    /* find the cluster offset for the given disk offset */ + +    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); + +    *new_l2_table = l2_table; +    *new_l2_index = l2_index; + +    return 0; +} + +/* + * alloc_compressed_cluster_offset + * + * For a given offset of the disk image, return cluster offset in + * qcow2 file. + * + * If the offset is not found, allocate a new compressed cluster. + * + * Return the cluster offset if successful, + * Return 0, otherwise. + * + */ + +uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, +                                               uint64_t offset, +                                               int compressed_size) +{ +    BDRVQcowState *s = bs->opaque; +    int l2_index, ret; +    uint64_t *l2_table; +    int64_t cluster_offset; +    int nb_csectors; + +    ret = get_cluster_table(bs, offset, &l2_table, &l2_index); +    if (ret < 0) { +        return 0; +    } + +    /* Compression can't overwrite anything. Fail if the cluster was already +     * allocated. 
*/ +    cluster_offset = be64_to_cpu(l2_table[l2_index]); +    if (cluster_offset & L2E_OFFSET_MASK) { +        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); +        return 0; +    } + +    cluster_offset = qcow2_alloc_bytes(bs, compressed_size); +    if (cluster_offset < 0) { +        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); +        return 0; +    } + +    nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) - +                  (cluster_offset >> 9); + +    cluster_offset |= QCOW_OFLAG_COMPRESSED | +                      ((uint64_t)nb_csectors << s->csize_shift); + +    /* update L2 table */ + +    /* compressed clusters never have the copied flag */ + +    BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED); +    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); +    l2_table[l2_index] = cpu_to_be64(cluster_offset); +    ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); +    if (ret < 0) { +        return 0; +    } + +    return cluster_offset; +} + +static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, Qcow2COWRegion *r) +{ +    BDRVQcowState *s = bs->opaque; +    int ret; + +    if (r->nb_sectors == 0) { +        return 0; +    } + +    qemu_co_mutex_unlock(&s->lock); +    ret = copy_sectors(bs, m->offset / BDRV_SECTOR_SIZE, m->alloc_offset, +                       r->offset / BDRV_SECTOR_SIZE, +                       r->offset / BDRV_SECTOR_SIZE + r->nb_sectors); +    qemu_co_mutex_lock(&s->lock); + +    if (ret < 0) { +        return ret; +    } + +    /* +     * Before we update the L2 table to actually point to the new cluster, we +     * need to be sure that the refcounts have been increased and COW was +     * handled. +     */ +    qcow2_cache_depends_on_flush(s->l2_table_cache); + +    return 0; +} + +int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m) +{ +    BDRVQcowState *s = bs->opaque; +    int i, j = 0, l2_index, ret; +    uint64_t *old_cluster, *l2_table; +    uint64_t cluster_offset = m->alloc_offset; + +    trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters); +    assert(m->nb_clusters > 0); + +    old_cluster = g_malloc(m->nb_clusters * sizeof(uint64_t)); + +    /* copy content of unmodified sectors */ +    ret = perform_cow(bs, m, &m->cow_start); +    if (ret < 0) { +        goto err; +    } + +    ret = perform_cow(bs, m, &m->cow_end); +    if (ret < 0) { +        goto err; +    } + +    /* Update L2 table. */ +    if (s->use_lazy_refcounts) { +        qcow2_mark_dirty(bs); +    } +    if (qcow2_need_accurate_refcounts(s)) { +        qcow2_cache_set_dependency(bs, s->l2_table_cache, +                                   s->refcount_block_cache); +    } + +    ret = get_cluster_table(bs, m->offset, &l2_table, &l2_index); +    if (ret < 0) { +        goto err; +    } +    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + +    for (i = 0; i < m->nb_clusters; i++) { +        /* if two concurrent writes happen to the same unallocated cluster +	 * each write allocates separate cluster and writes data concurrently. +	 * The first one to complete updates l2 table with pointer to its +	 * cluster the second one has to do RMW (which is done above by +	 * copy_sectors()), update l2 table with its cluster pointer and free +	 * old cluster. 
This is what this loop does */ +        if(l2_table[l2_index + i] != 0) +            old_cluster[j++] = l2_table[l2_index + i]; + +        l2_table[l2_index + i] = cpu_to_be64((cluster_offset + +                    (i << s->cluster_bits)) | QCOW_OFLAG_COPIED); +     } + + +    ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); +    if (ret < 0) { +        goto err; +    } + +    /* +     * If this was a COW, we need to decrease the refcount of the old cluster. +     * Also flush bs->file to get the right order for L2 and refcount update. +     * +     * Don't discard clusters that reach a refcount of 0 (e.g. compressed +     * clusters), the next write will reuse them anyway. +     */ +    if (j != 0) { +        for (i = 0; i < j; i++) { +            qcow2_free_any_clusters(bs, be64_to_cpu(old_cluster[i]), 1, +                                    QCOW2_DISCARD_NEVER); +        } +    } + +    ret = 0; +err: +    g_free(old_cluster); +    return ret; + } + +/* + * Returns the number of contiguous clusters that can be used for an allocating + * write, but require COW to be performed (this includes yet unallocated space, + * which must copy from the backing file) + */ +static int count_cow_clusters(BDRVQcowState *s, int nb_clusters, +    uint64_t *l2_table, int l2_index) +{ +    int i; + +    for (i = 0; i < nb_clusters; i++) { +        uint64_t l2_entry = be64_to_cpu(l2_table[l2_index + i]); +        int cluster_type = qcow2_get_cluster_type(l2_entry); + +        switch(cluster_type) { +        case QCOW2_CLUSTER_NORMAL: +            if (l2_entry & QCOW_OFLAG_COPIED) { +                goto out; +            } +            break; +        case QCOW2_CLUSTER_UNALLOCATED: +        case QCOW2_CLUSTER_COMPRESSED: +        case QCOW2_CLUSTER_ZERO: +            break; +        default: +            abort(); +        } +    } + +out: +    assert(i <= nb_clusters); +    return i; +} + +/* + * Check if there already is an AIO write request in flight which allocates + * the same cluster. In this case we need to wait until the previous + * request has completed and updated the L2 table accordingly. + * + * Returns: + *   0       if there was no dependency. *cur_bytes indicates the number of + *           bytes from guest_offset that can be read before the next + *           dependency must be processed (or the request is complete) + * + *   -EAGAIN if we had to wait for another request, previously gathered + *           information on cluster allocation may be invalid now. The caller + *           must start over anyway, so consider *cur_bytes undefined. + */ +static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset, +    uint64_t *cur_bytes, QCowL2Meta **m) +{ +    BDRVQcowState *s = bs->opaque; +    QCowL2Meta *old_alloc; +    uint64_t bytes = *cur_bytes; + +    QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) { + +        uint64_t start = guest_offset; +        uint64_t end = start + bytes; +        uint64_t old_start = l2meta_cow_start(old_alloc); +        uint64_t old_end = l2meta_cow_end(old_alloc); + +        if (end <= old_start || start >= old_end) { +            /* No intersection */ +        } else { +            if (start < old_start) { +                /* Stop at the start of a running allocation */ +                bytes = old_start - start; +            } else { +                bytes = 0; +            } + +            /* Stop if already an l2meta exists. 
After yielding, it wouldn't +             * be valid any more, so we'd have to clean up the old L2Metas +             * and deal with requests depending on them before starting to +             * gather new ones. Not worth the trouble. */ +            if (bytes == 0 && *m) { +                *cur_bytes = 0; +                return 0; +            } + +            if (bytes == 0) { +                /* Wait for the dependency to complete. We need to recheck +                 * the free/allocated clusters when we continue. */ +                qemu_co_mutex_unlock(&s->lock); +                qemu_co_queue_wait(&old_alloc->dependent_requests); +                qemu_co_mutex_lock(&s->lock); +                return -EAGAIN; +            } +        } +    } + +    /* Make sure that existing clusters and new allocations are only used up to +     * the next dependency if we shortened the request above */ +    *cur_bytes = bytes; + +    return 0; +} + +/* + * Checks how many already allocated clusters that don't require a copy on + * write there are at the given guest_offset (up to *bytes). If + * *host_offset is not zero, only physically contiguous clusters beginning at + * this host offset are counted. + * + * Note that guest_offset may not be cluster aligned. In this case, the + * returned *host_offset points to exact byte referenced by guest_offset and + * therefore isn't cluster aligned as well. + * + * Returns: + *   0:     if no allocated clusters are available at the given offset. + *          *bytes is normally unchanged. It is set to 0 if the cluster + *          is allocated and doesn't need COW, but doesn't have the right + *          physical offset. + * + *   1:     if allocated clusters that don't require a COW are available at + *          the requested offset. *bytes may have decreased and describes + *          the length of the area that can be written to. + * + *  -errno: in error cases + */ +static int handle_copied(BlockDriverState *bs, uint64_t guest_offset, +    uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m) +{ +    BDRVQcowState *s = bs->opaque; +    int l2_index; +    uint64_t cluster_offset; +    uint64_t *l2_table; +    unsigned int nb_clusters; +    unsigned int keep_clusters; +    int ret, pret; + +    trace_qcow2_handle_copied(qemu_coroutine_self(), guest_offset, *host_offset, +                              *bytes); + +    assert(*host_offset == 0 ||    offset_into_cluster(s, guest_offset) +                                == offset_into_cluster(s, *host_offset)); + +    /* +     * Calculate the number of clusters to look for. We stop at L2 table +     * boundaries to keep things simple. 
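A minimal standalone sketch of this calculation, assuming an example geometry of 64 KiB clusters and 8192-entry L2 tables (real values come from the image header):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Example geometry only: cluster_bits = 16, l2_bits = 13. */
    #define EX_CLUSTER_BITS 16
    #define EX_CLUSTER_SIZE (1ULL << EX_CLUSTER_BITS)
    #define EX_L2_SIZE      (1u << 13)

    static uint64_t ex_offset_into_cluster(uint64_t offset)
    {
        return offset & (EX_CLUSTER_SIZE - 1);
    }

    static uint64_t ex_size_to_clusters(uint64_t size)
    {
        /* round up to whole clusters */
        return (size + EX_CLUSTER_SIZE - 1) >> EX_CLUSTER_BITS;
    }

    int main(void)
    {
        uint64_t guest_offset = 0x123456;   /* deliberately not cluster aligned */
        uint64_t bytes = 200 * 1024;        /* length of the request */

        uint64_t nb_clusters =
            ex_size_to_clusters(ex_offset_into_cluster(guest_offset) + bytes);
        unsigned l2_index = (guest_offset >> EX_CLUSTER_BITS) & (EX_L2_SIZE - 1);

        /* stop at the end of the current L2 table, as the driver does */
        if (nb_clusters > EX_L2_SIZE - l2_index) {
            nb_clusters = EX_L2_SIZE - l2_index;
        }

        printf("clusters to inspect: %" PRIu64 ", first L2 index: %u\n",
               nb_clusters, l2_index);
        return 0;
    }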
+     */ +    nb_clusters = +        size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes); + +    l2_index = offset_to_l2_index(s, guest_offset); +    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); + +    /* Find L2 entry for the first involved cluster */ +    ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index); +    if (ret < 0) { +        return ret; +    } + +    cluster_offset = be64_to_cpu(l2_table[l2_index]); + +    /* Check how many clusters are already allocated and don't need COW */ +    if (qcow2_get_cluster_type(cluster_offset) == QCOW2_CLUSTER_NORMAL +        && (cluster_offset & QCOW_OFLAG_COPIED)) +    { +        /* If a specific host_offset is required, check it */ +        bool offset_matches = +            (cluster_offset & L2E_OFFSET_MASK) == *host_offset; + +        if (*host_offset != 0 && !offset_matches) { +            *bytes = 0; +            ret = 0; +            goto out; +        } + +        /* We keep all QCOW_OFLAG_COPIED clusters */ +        keep_clusters = +            count_contiguous_clusters(nb_clusters, s->cluster_size, +                                      &l2_table[l2_index], 0, +                                      QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO); +        assert(keep_clusters <= nb_clusters); + +        *bytes = MIN(*bytes, +                 keep_clusters * s->cluster_size +                 - offset_into_cluster(s, guest_offset)); + +        ret = 1; +    } else { +        ret = 0; +    } + +    /* Cleanup */ +out: +    pret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); +    if (pret < 0) { +        return pret; +    } + +    /* Only return a host offset if we actually made progress. Otherwise we +     * would make requirements for handle_alloc() that it can't fulfill */ +    if (ret) { +        *host_offset = (cluster_offset & L2E_OFFSET_MASK) +                     + offset_into_cluster(s, guest_offset); +    } + +    return ret; +} + +/* + * Allocates new clusters for the given guest_offset. + * + * At most *nb_clusters are allocated, and on return *nb_clusters is updated to + * contain the number of clusters that have been allocated and are contiguous + * in the image file. + * + * If *host_offset is non-zero, it specifies the offset in the image file at + * which the new clusters must start. *nb_clusters can be 0 on return in this + * case if the cluster at host_offset is already in use. If *host_offset is + * zero, the clusters can be allocated anywhere in the image file. + * + * *host_offset is updated to contain the offset into the image file at which + * the first allocated cluster starts. + * + * Return 0 on success and -errno in error cases. -EAGAIN means that the + * function has been waiting for another request and the allocation must be + * restarted, but the whole request should not be failed. 
+ */ +static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset, +    uint64_t *host_offset, unsigned int *nb_clusters) +{ +    BDRVQcowState *s = bs->opaque; + +    trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset, +                                         *host_offset, *nb_clusters); + +    /* Allocate new clusters */ +    trace_qcow2_cluster_alloc_phys(qemu_coroutine_self()); +    if (*host_offset == 0) { +        int64_t cluster_offset = +            qcow2_alloc_clusters(bs, *nb_clusters * s->cluster_size); +        if (cluster_offset < 0) { +            return cluster_offset; +        } +        *host_offset = cluster_offset; +        return 0; +    } else { +        int ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters); +        if (ret < 0) { +            return ret; +        } +        *nb_clusters = ret; +        return 0; +    } +} + +/* + * Allocates new clusters for an area that either is yet unallocated or needs a + * copy on write. If *host_offset is non-zero, clusters are only allocated if + * the new allocation can match the specified host offset. + * + * Note that guest_offset may not be cluster aligned. In this case, the + * returned *host_offset points to exact byte referenced by guest_offset and + * therefore isn't cluster aligned as well. + * + * Returns: + *   0:     if no clusters could be allocated. *bytes is set to 0, + *          *host_offset is left unchanged. + * + *   1:     if new clusters were allocated. *bytes may be decreased if the + *          new allocation doesn't cover all of the requested area. + *          *host_offset is updated to contain the host offset of the first + *          newly allocated cluster. + * + *  -errno: in error cases + */ +static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset, +    uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m) +{ +    BDRVQcowState *s = bs->opaque; +    int l2_index; +    uint64_t *l2_table; +    uint64_t entry; +    unsigned int nb_clusters; +    int ret; + +    uint64_t alloc_cluster_offset; + +    trace_qcow2_handle_alloc(qemu_coroutine_self(), guest_offset, *host_offset, +                             *bytes); +    assert(*bytes > 0); + +    /* +     * Calculate the number of clusters to look for. We stop at L2 table +     * boundaries to keep things simple. +     */ +    nb_clusters = +        size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes); + +    l2_index = offset_to_l2_index(s, guest_offset); +    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); + +    /* Find L2 entry for the first involved cluster */ +    ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index); +    if (ret < 0) { +        return ret; +    } + +    entry = be64_to_cpu(l2_table[l2_index]); + +    /* For the moment, overwrite compressed clusters one by one */ +    if (entry & QCOW_OFLAG_COMPRESSED) { +        nb_clusters = 1; +    } else { +        nb_clusters = count_cow_clusters(s, nb_clusters, l2_table, l2_index); +    } + +    /* This function is only called when there were no non-COW clusters, so if +     * we can't find any unallocated or COW clusters either, something is +     * wrong with our code. 
*/ +    assert(nb_clusters > 0); + +    ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); +    if (ret < 0) { +        return ret; +    } + +    /* Allocate, if necessary at a given offset in the image file */ +    alloc_cluster_offset = start_of_cluster(s, *host_offset); +    ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset, +                                  &nb_clusters); +    if (ret < 0) { +        goto fail; +    } + +    /* Can't extend contiguous allocation */ +    if (nb_clusters == 0) { +        *bytes = 0; +        return 0; +    } + +    /* +     * Save info needed for meta data update. +     * +     * requested_sectors: Number of sectors from the start of the first +     * newly allocated cluster to the end of the (possibly shortened +     * before) write request. +     * +     * avail_sectors: Number of sectors from the start of the first +     * newly allocated to the end of the last newly allocated cluster. +     * +     * nb_sectors: The number of sectors from the start of the first +     * newly allocated cluster to the end of the area that the write +     * request actually writes to (excluding COW at the end) +     */ +    int requested_sectors = +        (*bytes + offset_into_cluster(s, guest_offset)) +        >> BDRV_SECTOR_BITS; +    int avail_sectors = nb_clusters +                        << (s->cluster_bits - BDRV_SECTOR_BITS); +    int alloc_n_start = offset_into_cluster(s, guest_offset) +                        >> BDRV_SECTOR_BITS; +    int nb_sectors = MIN(requested_sectors, avail_sectors); +    QCowL2Meta *old_m = *m; + +    *m = g_malloc0(sizeof(**m)); + +    **m = (QCowL2Meta) { +        .next           = old_m, + +        .alloc_offset   = alloc_cluster_offset, +        .offset         = start_of_cluster(s, guest_offset), +        .nb_clusters    = nb_clusters, +        .nb_available   = nb_sectors, + +        .cow_start = { +            .offset     = 0, +            .nb_sectors = alloc_n_start, +        }, +        .cow_end = { +            .offset     = nb_sectors * BDRV_SECTOR_SIZE, +            .nb_sectors = avail_sectors - nb_sectors, +        }, +    }; +    qemu_co_queue_init(&(*m)->dependent_requests); +    QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight); + +    *host_offset = alloc_cluster_offset + offset_into_cluster(s, guest_offset); +    *bytes = MIN(*bytes, (nb_sectors * BDRV_SECTOR_SIZE) +                         - offset_into_cluster(s, guest_offset)); +    assert(*bytes != 0); + +    return 1; + +fail: +    if (*m && (*m)->nb_clusters > 0) { +        QLIST_REMOVE(*m, next_in_flight); +    } +    return ret; +} + +/* + * alloc_cluster_offset + * + * For a given offset on the virtual disk, find the cluster offset in qcow2 + * file. If the offset is not found, allocate a new cluster. + * + * If the cluster was already allocated, m->nb_clusters is set to 0 and + * other fields in m are meaningless. + * + * If the cluster is newly allocated, m->nb_clusters is set to the number of + * contiguous clusters that have been allocated. In this case, the other + * fields of m are valid and contain information about the first allocated + * cluster. + * + * If the request conflicts with another write request in flight, the coroutine + * is queued and will be reentered when the dependency has completed. 
+ * + * Return 0 on success and -errno in error cases + */ +int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset, +    int n_start, int n_end, int *num, uint64_t *host_offset, QCowL2Meta **m) +{ +    BDRVQcowState *s = bs->opaque; +    uint64_t start, remaining; +    uint64_t cluster_offset; +    uint64_t cur_bytes; +    int ret; + +    trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, +                                      n_start, n_end); + +    assert(n_start * BDRV_SECTOR_SIZE == offset_into_cluster(s, offset)); +    offset = start_of_cluster(s, offset); + +again: +    start = offset + (n_start << BDRV_SECTOR_BITS); +    remaining = (n_end - n_start) << BDRV_SECTOR_BITS; +    cluster_offset = 0; +    *host_offset = 0; +    cur_bytes = 0; +    *m = NULL; + +    while (true) { + +        if (!*host_offset) { +            *host_offset = start_of_cluster(s, cluster_offset); +        } + +        assert(remaining >= cur_bytes); + +        start           += cur_bytes; +        remaining       -= cur_bytes; +        cluster_offset  += cur_bytes; + +        if (remaining == 0) { +            break; +        } + +        cur_bytes = remaining; + +        /* +         * Now start gathering as many contiguous clusters as possible: +         * +         * 1. Check for overlaps with in-flight allocations +         * +         *      a) Overlap not in the first cluster -> shorten this request and +         *         let the caller handle the rest in its next loop iteration. +         * +         *      b) Real overlaps of two requests. Yield and restart the search +         *         for contiguous clusters (the situation could have changed +         *         while we were sleeping) +         * +         *      c) TODO: Request starts in the same cluster as the in-flight +         *         allocation ends. Shorten the COW of the in-fight allocation, +         *         set cluster_offset to write to the same cluster and set up +         *         the right synchronisation between the in-flight request and +         *         the new one. +         */ +        ret = handle_dependencies(bs, start, &cur_bytes, m); +        if (ret == -EAGAIN) { +            /* Currently handle_dependencies() doesn't yield if we already had +             * an allocation. If it did, we would have to clean up the L2Meta +             * structs before starting over. */ +            assert(*m == NULL); +            goto again; +        } else if (ret < 0) { +            return ret; +        } else if (cur_bytes == 0) { +            break; +        } else { +            /* handle_dependencies() may have decreased cur_bytes (shortened +             * the allocations below) so that the next dependency is processed +             * correctly during the next loop iteration. */ +        } + +        /* +         * 2. Count contiguous COPIED clusters. +         */ +        ret = handle_copied(bs, start, &cluster_offset, &cur_bytes, m); +        if (ret < 0) { +            return ret; +        } else if (ret) { +            continue; +        } else if (cur_bytes == 0) { +            break; +        } + +        /* +         * 3. If the request still hasn't completed, allocate new clusters, +         *    considering any cluster_offset of steps 1c or 2. 
+         */ +        ret = handle_alloc(bs, start, &cluster_offset, &cur_bytes, m); +        if (ret < 0) { +            return ret; +        } else if (ret) { +            continue; +        } else { +            assert(cur_bytes == 0); +            break; +        } +    } + +    *num = (n_end - n_start) - (remaining >> BDRV_SECTOR_BITS); +    assert(*num > 0); +    assert(*host_offset != 0); + +    return 0; +} + +static int decompress_buffer(uint8_t *out_buf, int out_buf_size, +                             const uint8_t *buf, int buf_size) +{ +    z_stream strm1, *strm = &strm1; +    int ret, out_len; + +    memset(strm, 0, sizeof(*strm)); + +    strm->next_in = (uint8_t *)buf; +    strm->avail_in = buf_size; +    strm->next_out = out_buf; +    strm->avail_out = out_buf_size; + +    ret = inflateInit2(strm, -12); +    if (ret != Z_OK) +        return -1; +    ret = inflate(strm, Z_FINISH); +    out_len = strm->next_out - out_buf; +    if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) || +        out_len != out_buf_size) { +        inflateEnd(strm); +        return -1; +    } +    inflateEnd(strm); +    return 0; +} + +int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) +{ +    BDRVQcowState *s = bs->opaque; +    int ret, csize, nb_csectors, sector_offset; +    uint64_t coffset; + +    coffset = cluster_offset & s->cluster_offset_mask; +    if (s->cluster_cache_offset != coffset) { +        nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1; +        sector_offset = coffset & 511; +        csize = nb_csectors * 512 - sector_offset; +        BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED); +        ret = bdrv_read(bs->file, coffset >> 9, s->cluster_data, nb_csectors); +        if (ret < 0) { +            return ret; +        } +        if (decompress_buffer(s->cluster_cache, s->cluster_size, +                              s->cluster_data + sector_offset, csize) < 0) { +            return -EIO; +        } +        s->cluster_cache_offset = coffset; +    } +    return 0; +} + +/* + * This discards as many clusters of nb_clusters as possible at once (i.e. + * all clusters in the same L2 table) and returns the number of discarded + * clusters. 
+ */ +static int discard_single_l2(BlockDriverState *bs, uint64_t offset, +    unsigned int nb_clusters) +{ +    BDRVQcowState *s = bs->opaque; +    uint64_t *l2_table; +    int l2_index; +    int ret; +    int i; + +    ret = get_cluster_table(bs, offset, &l2_table, &l2_index); +    if (ret < 0) { +        return ret; +    } + +    /* Limit nb_clusters to one L2 table */ +    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); + +    for (i = 0; i < nb_clusters; i++) { +        uint64_t old_offset; + +        old_offset = be64_to_cpu(l2_table[l2_index + i]); +        if ((old_offset & L2E_OFFSET_MASK) == 0) { +            continue; +        } + +        /* First remove L2 entries */ +        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); +        l2_table[l2_index + i] = cpu_to_be64(0); + +        /* Then decrease the refcount */ +        qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST); +    } + +    ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); +    if (ret < 0) { +        return ret; +    } + +    return nb_clusters; +} + +int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset, +    int nb_sectors) +{ +    BDRVQcowState *s = bs->opaque; +    uint64_t end_offset; +    unsigned int nb_clusters; +    int ret; + +    end_offset = offset + (nb_sectors << BDRV_SECTOR_BITS); + +    /* Round start up and end down */ +    offset = align_offset(offset, s->cluster_size); +    end_offset &= ~(s->cluster_size - 1); + +    if (offset > end_offset) { +        return 0; +    } + +    nb_clusters = size_to_clusters(s, end_offset - offset); + +    s->cache_discards = true; + +    /* Each L2 table is handled by its own loop iteration */ +    while (nb_clusters > 0) { +        ret = discard_single_l2(bs, offset, nb_clusters); +        if (ret < 0) { +            goto fail; +        } + +        nb_clusters -= ret; +        offset += (ret * s->cluster_size); +    } + +    ret = 0; +fail: +    s->cache_discards = false; +    qcow2_process_discards(bs, ret); + +    return ret; +} + +/* + * This zeroes as many clusters of nb_clusters as possible at once (i.e. + * all clusters in the same L2 table) and returns the number of zeroed + * clusters. 
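A hedged sketch of the per-entry decision that zero_single_l2() below applies, with the flag bit positions written out as assumptions matching qcow2.h: compressed clusters get their entry replaced and their data freed, while normal clusters keep their mapping and only gain the zero flag.

    #include <stdbool.h>
    #include <stdint.h>

    /* Assumed L2 entry flag bits (see qcow2.h) */
    #define EX_OFLAG_COMPRESSED (1ULL << 62)
    #define EX_OFLAG_ZERO       (1ULL << 0)

    /* Returns the new (host byte order) L2 entry for a cluster being zeroed
     * and reports whether the previously referenced data must be freed. */
    static uint64_t ex_zero_l2_entry(uint64_t old_entry, bool *free_old)
    {
        if (old_entry & EX_OFLAG_COMPRESSED) {
            *free_old = true;
            return EX_OFLAG_ZERO;
        }
        *free_old = false;
        return old_entry | EX_OFLAG_ZERO;
    }

    int main(void)
    {
        bool free_old;
        uint64_t e = ex_zero_l2_entry(EX_OFLAG_COMPRESSED | 0x1234, &free_old);
        return (e == EX_OFLAG_ZERO && free_old) ? 0 : 1;
    }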
+ */ +static int zero_single_l2(BlockDriverState *bs, uint64_t offset, +    unsigned int nb_clusters) +{ +    BDRVQcowState *s = bs->opaque; +    uint64_t *l2_table; +    int l2_index; +    int ret; +    int i; + +    ret = get_cluster_table(bs, offset, &l2_table, &l2_index); +    if (ret < 0) { +        return ret; +    } + +    /* Limit nb_clusters to one L2 table */ +    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); + +    for (i = 0; i < nb_clusters; i++) { +        uint64_t old_offset; + +        old_offset = be64_to_cpu(l2_table[l2_index + i]); + +        /* Update L2 entries */ +        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); +        if (old_offset & QCOW_OFLAG_COMPRESSED) { +            l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO); +            qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST); +        } else { +            l2_table[l2_index + i] |= cpu_to_be64(QCOW_OFLAG_ZERO); +        } +    } + +    ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); +    if (ret < 0) { +        return ret; +    } + +    return nb_clusters; +} + +int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors) +{ +    BDRVQcowState *s = bs->opaque; +    unsigned int nb_clusters; +    int ret; + +    /* The zero flag is only supported by version 3 and newer */ +    if (s->qcow_version < 3) { +        return -ENOTSUP; +    } + +    /* Each L2 table is handled by its own loop iteration */ +    nb_clusters = size_to_clusters(s, nb_sectors << BDRV_SECTOR_BITS); + +    s->cache_discards = true; + +    while (nb_clusters > 0) { +        ret = zero_single_l2(bs, offset, nb_clusters); +        if (ret < 0) { +            goto fail; +        } + +        nb_clusters -= ret; +        offset += (ret * s->cluster_size); +    } + +    ret = 0; +fail: +    s->cache_discards = false; +    qcow2_process_discards(bs, ret); + +    return ret; +} diff --git a/contrib/qemu/block/qcow2-refcount.c b/contrib/qemu/block/qcow2-refcount.c new file mode 100644 index 000000000..1244693f3 --- /dev/null +++ b/contrib/qemu/block/qcow2-refcount.c @@ -0,0 +1,1374 @@ +/* + * Block driver for the QCOW version 2 format + * + * Copyright (c) 2004-2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "qemu-common.h" +#include "block/block_int.h" +#include "block/qcow2.h" + +static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size); +static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, +                            int64_t offset, int64_t length, +                            int addend, enum qcow2_discard_type type); + + +/*********************************************************/ +/* refcount handling */ + +int qcow2_refcount_init(BlockDriverState *bs) +{ +    BDRVQcowState *s = bs->opaque; +    int ret, refcount_table_size2, i; + +    refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t); +    s->refcount_table = g_malloc(refcount_table_size2); +    if (s->refcount_table_size > 0) { +        BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD); +        ret = bdrv_pread(bs->file, s->refcount_table_offset, +                         s->refcount_table, refcount_table_size2); +        if (ret != refcount_table_size2) +            goto fail; +        for(i = 0; i < s->refcount_table_size; i++) +            be64_to_cpus(&s->refcount_table[i]); +    } +    return 0; + fail: +    return -ENOMEM; +} + +void qcow2_refcount_close(BlockDriverState *bs) +{ +    BDRVQcowState *s = bs->opaque; +    g_free(s->refcount_table); +} + + +static int load_refcount_block(BlockDriverState *bs, +                               int64_t refcount_block_offset, +                               void **refcount_block) +{ +    BDRVQcowState *s = bs->opaque; +    int ret; + +    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD); +    ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset, +        refcount_block); + +    return ret; +} + +/* + * Returns the refcount of the cluster given by its index. Any non-negative + * return value is the refcount of the cluster, negative values are -errno + * and indicate an error. + */ +static int get_refcount(BlockDriverState *bs, int64_t cluster_index) +{ +    BDRVQcowState *s = bs->opaque; +    int refcount_table_index, block_index; +    int64_t refcount_block_offset; +    int ret; +    uint16_t *refcount_block; +    uint16_t refcount; + +    refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT); +    if (refcount_table_index >= s->refcount_table_size) +        return 0; +    refcount_block_offset = s->refcount_table[refcount_table_index]; +    if (!refcount_block_offset) +        return 0; + +    ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset, +        (void**) &refcount_block); +    if (ret < 0) { +        return ret; +    } + +    block_index = cluster_index & +        ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1); +    refcount = be16_to_cpu(refcount_block[block_index]); + +    ret = qcow2_cache_put(bs, s->refcount_block_cache, +        (void**) &refcount_block); +    if (ret < 0) { +        return ret; +    } + +    return refcount; +} + +/* + * Rounds the refcount table size up to avoid growing the table for each single + * refcount block that is allocated. 
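Before the table-growth helper below, it may help to see how a cluster index maps into this two-level structure. get_refcount() above works with 16-bit refcount entries, so one refcount block (a single cluster full of entries) covers cluster_size / 2 clusters; a sketch with an assumed 64 KiB cluster size:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #define EX_CLUSTER_BITS   16   /* assumed: 64 KiB clusters */
    #define EX_REFCOUNT_SHIFT 1    /* log2(sizeof(uint16_t)) */

    int main(void)
    {
        uint64_t cluster_index = 123456789;

        uint64_t entries_per_block = 1ULL << (EX_CLUSTER_BITS - EX_REFCOUNT_SHIFT);
        uint64_t table_index = cluster_index >> (EX_CLUSTER_BITS - EX_REFCOUNT_SHIFT);
        uint64_t block_index = cluster_index & (entries_per_block - 1);

        /* One refcount block therefore describes entries_per_block clusters,
         * i.e. 32768 * 64 KiB = 2 GiB of image data in this example. */
        printf("refcount table index %" PRIu64 ", index within block %" PRIu64 "\n",
               table_index, block_index);
        return 0;
    }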
+ */ +static unsigned int next_refcount_table_size(BDRVQcowState *s, +    unsigned int min_size) +{ +    unsigned int min_clusters = (min_size >> (s->cluster_bits - 3)) + 1; +    unsigned int refcount_table_clusters = +        MAX(1, s->refcount_table_size >> (s->cluster_bits - 3)); + +    while (min_clusters > refcount_table_clusters) { +        refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2; +    } + +    return refcount_table_clusters << (s->cluster_bits - 3); +} + + +/* Checks if two offsets are described by the same refcount block */ +static int in_same_refcount_block(BDRVQcowState *s, uint64_t offset_a, +    uint64_t offset_b) +{ +    uint64_t block_a = offset_a >> (2 * s->cluster_bits - REFCOUNT_SHIFT); +    uint64_t block_b = offset_b >> (2 * s->cluster_bits - REFCOUNT_SHIFT); + +    return (block_a == block_b); +} + +/* + * Loads a refcount block. If it doesn't exist yet, it is allocated first + * (including growing the refcount table if needed). + * + * Returns 0 on success or -errno in error case + */ +static int alloc_refcount_block(BlockDriverState *bs, +    int64_t cluster_index, uint16_t **refcount_block) +{ +    BDRVQcowState *s = bs->opaque; +    unsigned int refcount_table_index; +    int ret; + +    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC); + +    /* Find the refcount block for the given cluster */ +    refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT); + +    if (refcount_table_index < s->refcount_table_size) { + +        uint64_t refcount_block_offset = +            s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK; + +        /* If it's already there, we're done */ +        if (refcount_block_offset) { +             return load_refcount_block(bs, refcount_block_offset, +                 (void**) refcount_block); +        } +    } + +    /* +     * If we came here, we need to allocate something. Something is at least +     * a cluster for the new refcount block. It may also include a new refcount +     * table if the old refcount table is too small. +     * +     * Note that allocating clusters here needs some special care: +     * +     * - We can't use the normal qcow2_alloc_clusters(), it would try to +     *   increase the refcount and very likely we would end up with an endless +     *   recursion. Instead we must place the refcount blocks in a way that +     *   they can describe them themselves. +     * +     * - We need to consider that at this point we are inside update_refcounts +     *   and doing the initial refcount increase. This means that some clusters +     *   have already been allocated by the caller, but their refcount isn't +     *   accurate yet. free_cluster_index tells us where this allocation ends +     *   as long as we don't overwrite it by freeing clusters. 
+     * +     * - alloc_clusters_noref and qcow2_free_clusters may load a different +     *   refcount block into the cache +     */ + +    *refcount_block = NULL; + +    /* We write to the refcount table, so we might depend on L2 tables */ +    ret = qcow2_cache_flush(bs, s->l2_table_cache); +    if (ret < 0) { +        return ret; +    } + +    /* Allocate the refcount block itself and mark it as used */ +    int64_t new_block = alloc_clusters_noref(bs, s->cluster_size); +    if (new_block < 0) { +        return new_block; +    } + +#ifdef DEBUG_ALLOC2 +    fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64 +        " at %" PRIx64 "\n", +        refcount_table_index, cluster_index << s->cluster_bits, new_block); +#endif + +    if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) { +        /* Zero the new refcount block before updating it */ +        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block, +            (void**) refcount_block); +        if (ret < 0) { +            goto fail_block; +        } + +        memset(*refcount_block, 0, s->cluster_size); + +        /* The block describes itself, need to update the cache */ +        int block_index = (new_block >> s->cluster_bits) & +            ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1); +        (*refcount_block)[block_index] = cpu_to_be16(1); +    } else { +        /* Described somewhere else. This can recurse at most twice before we +         * arrive at a block that describes itself. */ +        ret = update_refcount(bs, new_block, s->cluster_size, 1, +                              QCOW2_DISCARD_NEVER); +        if (ret < 0) { +            goto fail_block; +        } + +        ret = qcow2_cache_flush(bs, s->refcount_block_cache); +        if (ret < 0) { +            goto fail_block; +        } + +        /* Initialize the new refcount block only after updating its refcount, +         * update_refcount uses the refcount cache itself */ +        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block, +            (void**) refcount_block); +        if (ret < 0) { +            goto fail_block; +        } + +        memset(*refcount_block, 0, s->cluster_size); +    } + +    /* Now the new refcount block needs to be written to disk */ +    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE); +    qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block); +    ret = qcow2_cache_flush(bs, s->refcount_block_cache); +    if (ret < 0) { +        goto fail_block; +    } + +    /* If the refcount table is big enough, just hook the block up there */ +    if (refcount_table_index < s->refcount_table_size) { +        uint64_t data64 = cpu_to_be64(new_block); +        BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP); +        ret = bdrv_pwrite_sync(bs->file, +            s->refcount_table_offset + refcount_table_index * sizeof(uint64_t), +            &data64, sizeof(data64)); +        if (ret < 0) { +            goto fail_block; +        } + +        s->refcount_table[refcount_table_index] = new_block; +        return 0; +    } + +    ret = qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block); +    if (ret < 0) { +        goto fail_block; +    } + +    /* +     * If we come here, we need to grow the refcount table. Again, a new +     * refcount table needs some space and we can't simply allocate to avoid +     * endless recursion. 
+     * +     * Therefore let's grab new refcount blocks at the end of the image, which +     * will describe themselves and the new refcount table. This way we can +     * reference them only in the new table and do the switch to the new +     * refcount table at once without producing an inconsistent state in +     * between. +     */ +    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW); + +    /* Calculate the number of refcount blocks needed so far */ +    uint64_t refcount_block_clusters = 1 << (s->cluster_bits - REFCOUNT_SHIFT); +    uint64_t blocks_used = (s->free_cluster_index + +        refcount_block_clusters - 1) / refcount_block_clusters; + +    /* And now we need at least one block more for the new metadata */ +    uint64_t table_size = next_refcount_table_size(s, blocks_used + 1); +    uint64_t last_table_size; +    uint64_t blocks_clusters; +    do { +        uint64_t table_clusters = +            size_to_clusters(s, table_size * sizeof(uint64_t)); +        blocks_clusters = 1 + +            ((table_clusters + refcount_block_clusters - 1) +            / refcount_block_clusters); +        uint64_t meta_clusters = table_clusters + blocks_clusters; + +        last_table_size = table_size; +        table_size = next_refcount_table_size(s, blocks_used + +            ((meta_clusters + refcount_block_clusters - 1) +            / refcount_block_clusters)); + +    } while (last_table_size != table_size); + +#ifdef DEBUG_ALLOC2 +    fprintf(stderr, "qcow2: Grow refcount table %" PRId32 " => %" PRId64 "\n", +        s->refcount_table_size, table_size); +#endif + +    /* Create the new refcount table and blocks */ +    uint64_t meta_offset = (blocks_used * refcount_block_clusters) * +        s->cluster_size; +    uint64_t table_offset = meta_offset + blocks_clusters * s->cluster_size; +    uint16_t *new_blocks = g_malloc0(blocks_clusters * s->cluster_size); +    uint64_t *new_table = g_malloc0(table_size * sizeof(uint64_t)); + +    assert(meta_offset >= (s->free_cluster_index * s->cluster_size)); + +    /* Fill the new refcount table */ +    memcpy(new_table, s->refcount_table, +        s->refcount_table_size * sizeof(uint64_t)); +    new_table[refcount_table_index] = new_block; + +    int i; +    for (i = 0; i < blocks_clusters; i++) { +        new_table[blocks_used + i] = meta_offset + (i * s->cluster_size); +    } + +    /* Fill the refcount blocks */ +    uint64_t table_clusters = size_to_clusters(s, table_size * sizeof(uint64_t)); +    int block = 0; +    for (i = 0; i < table_clusters + blocks_clusters; i++) { +        new_blocks[block++] = cpu_to_be16(1); +    } + +    /* Write refcount blocks to disk */ +    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS); +    ret = bdrv_pwrite_sync(bs->file, meta_offset, new_blocks, +        blocks_clusters * s->cluster_size); +    g_free(new_blocks); +    if (ret < 0) { +        goto fail_table; +    } + +    /* Write refcount table to disk */ +    for(i = 0; i < table_size; i++) { +        cpu_to_be64s(&new_table[i]); +    } + +    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE); +    ret = bdrv_pwrite_sync(bs->file, table_offset, new_table, +        table_size * sizeof(uint64_t)); +    if (ret < 0) { +        goto fail_table; +    } + +    for(i = 0; i < table_size; i++) { +        be64_to_cpus(&new_table[i]); +    } + +    /* Hook up the new refcount table in the qcow2 header */ +    uint8_t data[12]; +    cpu_to_be64w((uint64_t*)data, table_offset); +    cpu_to_be32w((uint32_t*)(data + 8), table_clusters); +    
BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE); +    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, refcount_table_offset), +        data, sizeof(data)); +    if (ret < 0) { +        goto fail_table; +    } + +    /* And switch it in memory */ +    uint64_t old_table_offset = s->refcount_table_offset; +    uint64_t old_table_size = s->refcount_table_size; + +    g_free(s->refcount_table); +    s->refcount_table = new_table; +    s->refcount_table_size = table_size; +    s->refcount_table_offset = table_offset; + +    /* Free old table. Remember, we must not change free_cluster_index */ +    uint64_t old_free_cluster_index = s->free_cluster_index; +    qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t), +                        QCOW2_DISCARD_OTHER); +    s->free_cluster_index = old_free_cluster_index; + +    ret = load_refcount_block(bs, new_block, (void**) refcount_block); +    if (ret < 0) { +        return ret; +    } + +    return 0; + +fail_table: +    g_free(new_table); +fail_block: +    if (*refcount_block != NULL) { +        qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block); +    } +    return ret; +} + +void qcow2_process_discards(BlockDriverState *bs, int ret) +{ +    BDRVQcowState *s = bs->opaque; +    Qcow2DiscardRegion *d, *next; + +    QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) { +        QTAILQ_REMOVE(&s->discards, d, next); + +        /* Discard is optional, ignore the return value */ +        if (ret >= 0) { +            bdrv_discard(bs->file, +                         d->offset >> BDRV_SECTOR_BITS, +                         d->bytes >> BDRV_SECTOR_BITS); +        } + +        g_free(d); +    } +} + +static void update_refcount_discard(BlockDriverState *bs, +                                    uint64_t offset, uint64_t length) +{ +    BDRVQcowState *s = bs->opaque; +    Qcow2DiscardRegion *d, *p, *next; + +    QTAILQ_FOREACH(d, &s->discards, next) { +        uint64_t new_start = MIN(offset, d->offset); +        uint64_t new_end = MAX(offset + length, d->offset + d->bytes); + +        if (new_end - new_start <= length + d->bytes) { +            /* There can't be any overlap, areas ending up here have no +             * references any more and therefore shouldn't get freed another +             * time. */ +            assert(d->bytes + length == new_end - new_start); +            d->offset = new_start; +            d->bytes = new_end - new_start; +            goto found; +        } +    } + +    d = g_malloc(sizeof(*d)); +    *d = (Qcow2DiscardRegion) { +        .bs     = bs, +        .offset = offset, +        .bytes  = length, +    }; +    QTAILQ_INSERT_TAIL(&s->discards, d, next); + +found: +    /* Merge discard requests if they are adjacent now */ +    QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) { +        if (p == d +            || p->offset > d->offset + d->bytes +            || d->offset > p->offset + p->bytes) +        { +            continue; +        } + +        /* Still no overlap possible */ +        assert(p->offset == d->offset + d->bytes +            || d->offset == p->offset + p->bytes); + +        QTAILQ_REMOVE(&s->discards, p, next); +        d->offset = MIN(d->offset, p->offset); +        d->bytes += p->bytes; +    } +} + +/* XXX: cache several refcount block clusters ? 
*/ +static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, +    int64_t offset, int64_t length, int addend, enum qcow2_discard_type type) +{ +    BDRVQcowState *s = bs->opaque; +    int64_t start, last, cluster_offset; +    uint16_t *refcount_block = NULL; +    int64_t old_table_index = -1; +    int ret; + +#ifdef DEBUG_ALLOC2 +    fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64 " addend=%d\n", +           offset, length, addend); +#endif +    if (length < 0) { +        return -EINVAL; +    } else if (length == 0) { +        return 0; +    } + +    if (addend < 0) { +        qcow2_cache_set_dependency(bs, s->refcount_block_cache, +            s->l2_table_cache); +    } + +    start = offset & ~(s->cluster_size - 1); +    last = (offset + length - 1) & ~(s->cluster_size - 1); +    for(cluster_offset = start; cluster_offset <= last; +        cluster_offset += s->cluster_size) +    { +        int block_index, refcount; +        int64_t cluster_index = cluster_offset >> s->cluster_bits; +        int64_t table_index = +            cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT); + +        /* Load the refcount block and allocate it if needed */ +        if (table_index != old_table_index) { +            if (refcount_block) { +                ret = qcow2_cache_put(bs, s->refcount_block_cache, +                    (void**) &refcount_block); +                if (ret < 0) { +                    goto fail; +                } +            } + +            ret = alloc_refcount_block(bs, cluster_index, &refcount_block); +            if (ret < 0) { +                goto fail; +            } +        } +        old_table_index = table_index; + +        qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block); + +        /* we can update the count and save it */ +        block_index = cluster_index & +            ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1); + +        refcount = be16_to_cpu(refcount_block[block_index]); +        refcount += addend; +        if (refcount < 0 || refcount > 0xffff) { +            ret = -EINVAL; +            goto fail; +        } +        if (refcount == 0 && cluster_index < s->free_cluster_index) { +            s->free_cluster_index = cluster_index; +        } +        refcount_block[block_index] = cpu_to_be16(refcount); + +        if (refcount == 0 && s->discard_passthrough[type]) { +            update_refcount_discard(bs, cluster_offset, s->cluster_size); +        } +    } + +    ret = 0; +fail: +    if (!s->cache_discards) { +        qcow2_process_discards(bs, ret); +    } + +    /* Write last changed block to disk */ +    if (refcount_block) { +        int wret; +        wret = qcow2_cache_put(bs, s->refcount_block_cache, +            (void**) &refcount_block); +        if (wret < 0) { +            return ret < 0 ? ret : wret; +        } +    } + +    /* +     * Try do undo any updates if an error is returned (This may succeed in +     * some cases like ENOSPC for allocating a new refcount block) +     */ +    if (ret < 0) { +        int dummy; +        dummy = update_refcount(bs, offset, cluster_offset - offset, -addend, +                                QCOW2_DISCARD_NEVER); +        (void)dummy; +    } + +    return ret; +} + +/* + * Increases or decreases the refcount of a given cluster by one. + * addend must be 1 or -1. + * + * If the return value is non-negative, it is the new refcount of the cluster. + * If it is negative, it is -errno and indicates an error. 
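The helper documented here delegates to update_refcount() above; the cluster-range walk that update_refcount() performs can be illustrated in isolation (cluster size is an assumed example value):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #define EX_CLUSTER_SIZE (1ULL << 16)   /* assumed: 64 KiB clusters */

    int main(void)
    {
        uint64_t offset = 0x12345;                     /* unaligned start */
        uint64_t length = 3 * EX_CLUSTER_SIZE + 100;   /* unaligned length */
        int addend = 1;                                /* +1 alloc, -1 free */

        uint64_t start = offset & ~(EX_CLUSTER_SIZE - 1);
        uint64_t last = (offset + length - 1) & ~(EX_CLUSTER_SIZE - 1);
        uint64_t cluster_offset;

        for (cluster_offset = start; cluster_offset <= last;
             cluster_offset += EX_CLUSTER_SIZE) {
            /* the driver loads the 16-bit refcount entry for this cluster,
             * adds addend and checks the result stays within 0..0xffff */
            printf("adjust refcount of cluster at 0x%" PRIx64 " by %d\n",
                   cluster_offset, addend);
        }
        return 0;
    }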
+ */ +static int update_cluster_refcount(BlockDriverState *bs, +                                   int64_t cluster_index, +                                   int addend, +                                   enum qcow2_discard_type type) +{ +    BDRVQcowState *s = bs->opaque; +    int ret; + +    ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend, +                          type); +    if (ret < 0) { +        return ret; +    } + +    return get_refcount(bs, cluster_index); +} + + + +/*********************************************************/ +/* cluster allocation functions */ + + + +/* return < 0 if error */ +static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size) +{ +    BDRVQcowState *s = bs->opaque; +    int i, nb_clusters, refcount; + +    nb_clusters = size_to_clusters(s, size); +retry: +    for(i = 0; i < nb_clusters; i++) { +        int64_t next_cluster_index = s->free_cluster_index++; +        refcount = get_refcount(bs, next_cluster_index); + +        if (refcount < 0) { +            return refcount; +        } else if (refcount != 0) { +            goto retry; +        } +    } +#ifdef DEBUG_ALLOC2 +    fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n", +            size, +            (s->free_cluster_index - nb_clusters) << s->cluster_bits); +#endif +    return (s->free_cluster_index - nb_clusters) << s->cluster_bits; +} + +int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size) +{ +    int64_t offset; +    int ret; + +    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC); +    offset = alloc_clusters_noref(bs, size); +    if (offset < 0) { +        return offset; +    } + +    ret = update_refcount(bs, offset, size, 1, QCOW2_DISCARD_NEVER); +    if (ret < 0) { +        return ret; +    } + +    return offset; +} + +int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, +    int nb_clusters) +{ +    BDRVQcowState *s = bs->opaque; +    uint64_t cluster_index; +    uint64_t old_free_cluster_index; +    int i, refcount, ret; + +    /* Check how many clusters there are free */ +    cluster_index = offset >> s->cluster_bits; +    for(i = 0; i < nb_clusters; i++) { +        refcount = get_refcount(bs, cluster_index++); + +        if (refcount < 0) { +            return refcount; +        } else if (refcount != 0) { +            break; +        } +    } + +    /* And then allocate them */ +    old_free_cluster_index = s->free_cluster_index; +    s->free_cluster_index = cluster_index + i; + +    ret = update_refcount(bs, offset, i << s->cluster_bits, 1, +                          QCOW2_DISCARD_NEVER); +    if (ret < 0) { +        return ret; +    } + +    s->free_cluster_index = old_free_cluster_index; + +    return i; +} + +/* only used to allocate compressed sectors. We try to allocate +   contiguous sectors. 
size must be <= cluster_size */ +int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size) +{ +    BDRVQcowState *s = bs->opaque; +    int64_t offset, cluster_offset; +    int free_in_cluster; + +    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES); +    assert(size > 0 && size <= s->cluster_size); +    if (s->free_byte_offset == 0) { +        offset = qcow2_alloc_clusters(bs, s->cluster_size); +        if (offset < 0) { +            return offset; +        } +        s->free_byte_offset = offset; +    } + redo: +    free_in_cluster = s->cluster_size - +        (s->free_byte_offset & (s->cluster_size - 1)); +    if (size <= free_in_cluster) { +        /* enough space in current cluster */ +        offset = s->free_byte_offset; +        s->free_byte_offset += size; +        free_in_cluster -= size; +        if (free_in_cluster == 0) +            s->free_byte_offset = 0; +        if ((offset & (s->cluster_size - 1)) != 0) +            update_cluster_refcount(bs, offset >> s->cluster_bits, 1, +                                    QCOW2_DISCARD_NEVER); +    } else { +        offset = qcow2_alloc_clusters(bs, s->cluster_size); +        if (offset < 0) { +            return offset; +        } +        cluster_offset = s->free_byte_offset & ~(s->cluster_size - 1); +        if ((cluster_offset + s->cluster_size) == offset) { +            /* we are lucky: contiguous data */ +            offset = s->free_byte_offset; +            update_cluster_refcount(bs, offset >> s->cluster_bits, 1, +                                    QCOW2_DISCARD_NEVER); +            s->free_byte_offset += size; +        } else { +            s->free_byte_offset = offset; +            goto redo; +        } +    } + +    /* The cluster refcount was incremented, either by qcow2_alloc_clusters() +     * or explicitly by update_cluster_refcount().  Refcount blocks must be +     * flushed before the caller's L2 table updates. +     */ +    qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache); +    return offset; +} + +void qcow2_free_clusters(BlockDriverState *bs, +                          int64_t offset, int64_t size, +                          enum qcow2_discard_type type) +{ +    int ret; + +    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE); +    ret = update_refcount(bs, offset, size, -1, type); +    if (ret < 0) { +        fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret)); +        /* TODO Remember the clusters to free them later and avoid leaking */ +    } +} + +/* + * Free a cluster using its L2 entry (handles clusters of all types, e.g. + * normal cluster, compressed cluster, etc.) 
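For the compressed case handled just below, the byte range occupied by a compressed cluster has to be recovered from the L2 entry itself. A standalone sketch; the shift and mask derivations are assumed to match what the driver computes at open time for a 64 KiB-cluster image:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* Assumed descriptor layout: the compressed-size field sits directly
         * below the flag bits and its width depends on cluster_bits. */
        int cluster_bits = 16;
        int csize_shift = 62 - (cluster_bits - 8);              /* 54 here */
        uint64_t csize_mask = (1ULL << (cluster_bits - 8)) - 1; /* 0xff */
        uint64_t cluster_offset_mask = (1ULL << csize_shift) - 1;

        /* A made-up compressed L2 entry: flag bit 62, a size field of 5
         * (meaning 6 sectors) and a byte offset into the image file. */
        uint64_t l2_entry = (1ULL << 62)
                          | ((uint64_t)5 << csize_shift)
                          | 0x40001234ULL;

        uint64_t nb_csectors = ((l2_entry >> csize_shift) & csize_mask) + 1;
        uint64_t host_offset = (l2_entry & cluster_offset_mask) & ~511ULL;
        uint64_t length = nb_csectors * 512;

        printf("compressed data: %" PRIu64 " bytes at host offset 0x%" PRIx64 "\n",
               length, host_offset);
        return 0;
    }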
+ */ +void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry, +                             int nb_clusters, enum qcow2_discard_type type) +{ +    BDRVQcowState *s = bs->opaque; + +    switch (qcow2_get_cluster_type(l2_entry)) { +    case QCOW2_CLUSTER_COMPRESSED: +        { +            int nb_csectors; +            nb_csectors = ((l2_entry >> s->csize_shift) & +                           s->csize_mask) + 1; +            qcow2_free_clusters(bs, +                (l2_entry & s->cluster_offset_mask) & ~511, +                nb_csectors * 512, type); +        } +        break; +    case QCOW2_CLUSTER_NORMAL: +        qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK, +                            nb_clusters << s->cluster_bits, type); +        break; +    case QCOW2_CLUSTER_UNALLOCATED: +    case QCOW2_CLUSTER_ZERO: +        break; +    default: +        abort(); +    } +} + + + +/*********************************************************/ +/* snapshots and image creation */ + + + +/* update the refcounts of snapshots and the copied flag */ +int qcow2_update_snapshot_refcount(BlockDriverState *bs, +    int64_t l1_table_offset, int l1_size, int addend) +{ +    BDRVQcowState *s = bs->opaque; +    uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, l1_allocated; +    int64_t old_offset, old_l2_offset; +    int i, j, l1_modified = 0, nb_csectors, refcount; +    int ret; + +    l2_table = NULL; +    l1_table = NULL; +    l1_size2 = l1_size * sizeof(uint64_t); + +    s->cache_discards = true; + +    /* WARNING: qcow2_snapshot_goto relies on this function not using the +     * l1_table_offset when it is the current s->l1_table_offset! Be careful +     * when changing this! */ +    if (l1_table_offset != s->l1_table_offset) { +        l1_table = g_malloc0(align_offset(l1_size2, 512)); +        l1_allocated = 1; + +        ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2); +        if (ret < 0) { +            goto fail; +        } + +        for(i = 0;i < l1_size; i++) +            be64_to_cpus(&l1_table[i]); +    } else { +        assert(l1_size == s->l1_size); +        l1_table = s->l1_table; +        l1_allocated = 0; +    } + +    for(i = 0; i < l1_size; i++) { +        l2_offset = l1_table[i]; +        if (l2_offset) { +            old_l2_offset = l2_offset; +            l2_offset &= L1E_OFFSET_MASK; + +            ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, +                (void**) &l2_table); +            if (ret < 0) { +                goto fail; +            } + +            for(j = 0; j < s->l2_size; j++) { +                offset = be64_to_cpu(l2_table[j]); +                if (offset != 0) { +                    old_offset = offset; +                    offset &= ~QCOW_OFLAG_COPIED; +                    if (offset & QCOW_OFLAG_COMPRESSED) { +                        nb_csectors = ((offset >> s->csize_shift) & +                                       s->csize_mask) + 1; +                        if (addend != 0) { +                            int ret; +                            ret = update_refcount(bs, +                                (offset & s->cluster_offset_mask) & ~511, +                                nb_csectors * 512, addend, +                                QCOW2_DISCARD_SNAPSHOT); +                            if (ret < 0) { +                                goto fail; +                            } +                        } +                        /* compressed clusters are never modified */ +                        refcount = 2; + 
                   } else { +                        uint64_t cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits; +                        if (addend != 0) { +                            refcount = update_cluster_refcount(bs, cluster_index, addend, +                                                               QCOW2_DISCARD_SNAPSHOT); +                        } else { +                            refcount = get_refcount(bs, cluster_index); +                        } + +                        if (refcount < 0) { +                            ret = refcount; +                            goto fail; +                        } +                    } + +                    if (refcount == 1) { +                        offset |= QCOW_OFLAG_COPIED; +                    } +                    if (offset != old_offset) { +                        if (addend > 0) { +                            qcow2_cache_set_dependency(bs, s->l2_table_cache, +                                s->refcount_block_cache); +                        } +                        l2_table[j] = cpu_to_be64(offset); +                        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); +                    } +                } +            } + +            ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); +            if (ret < 0) { +                goto fail; +            } + + +            if (addend != 0) { +                refcount = update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend, +                                                   QCOW2_DISCARD_SNAPSHOT); +            } else { +                refcount = get_refcount(bs, l2_offset >> s->cluster_bits); +            } +            if (refcount < 0) { +                ret = refcount; +                goto fail; +            } else if (refcount == 1) { +                l2_offset |= QCOW_OFLAG_COPIED; +            } +            if (l2_offset != old_l2_offset) { +                l1_table[i] = l2_offset; +                l1_modified = 1; +            } +        } +    } + +    ret = bdrv_flush(bs); +fail: +    if (l2_table) { +        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); +    } + +    s->cache_discards = false; +    qcow2_process_discards(bs, ret); + +    /* Update L1 only if it isn't deleted anyway (addend = -1) */ +    if (ret == 0 && addend >= 0 && l1_modified) { +        for (i = 0; i < l1_size; i++) { +            cpu_to_be64s(&l1_table[i]); +        } + +        ret = bdrv_pwrite_sync(bs->file, l1_table_offset, l1_table, l1_size2); + +        for (i = 0; i < l1_size; i++) { +            be64_to_cpus(&l1_table[i]); +        } +    } +    if (l1_allocated) +        g_free(l1_table); +    return ret; +} + + + + +/*********************************************************/ +/* refcount checking functions */ + + + +/* + * Increases the refcount for a range of clusters in a given refcount table. + * This is used to construct a temporary refcount table out of L1 and L2 tables + * which can be compared the the refcount table saved in the image. + * + * Modifies the number of errors in res. 
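+ *
+ * Worked sketch (assuming 64 KiB clusters, i.e. cluster_bits = 16): a range
+ * with offset 0x18000 and size 0x22000 covers the clusters at 0x10000,
+ * 0x20000 and 0x30000, so refcount_table[1], refcount_table[2] and
+ * refcount_table[3] are each incremented by one.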
+ */
+static void inc_refcounts(BlockDriverState *bs,
+                          BdrvCheckResult *res,
+                          uint16_t *refcount_table,
+                          int refcount_table_size,
+                          int64_t offset, int64_t size)
+{
+    BDRVQcowState *s = bs->opaque;
+    int64_t start, last, cluster_offset;
+    int k;
+
+    if (size <= 0)
+        return;
+
+    start = offset & ~(s->cluster_size - 1);
+    last = (offset + size - 1) & ~(s->cluster_size - 1);
+    for(cluster_offset = start; cluster_offset <= last;
+        cluster_offset += s->cluster_size) {
+        k = cluster_offset >> s->cluster_bits;
+        if (k < 0) {
+            fprintf(stderr, "ERROR: invalid cluster offset=0x%" PRIx64 "\n",
+                cluster_offset);
+            res->corruptions++;
+        } else if (k >= refcount_table_size) {
+            fprintf(stderr, "Warning: cluster offset=0x%" PRIx64 " is after "
+                "the end of the image file, can't properly check refcounts.\n",
+                cluster_offset);
+            res->check_errors++;
+        } else {
+            if (++refcount_table[k] == 0) {
+                fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
+                    "\n", cluster_offset);
+                res->corruptions++;
+            }
+        }
+    }
+}
+
+/* Flags for check_refcounts_l1() and check_refcounts_l2() */
+enum {
+    CHECK_OFLAG_COPIED = 0x1,   /* check QCOW_OFLAG_COPIED matches refcount */
+    CHECK_FRAG_INFO = 0x2,      /* update BlockFragInfo counters */
+};
+
+/*
+ * Increases the refcount in the given refcount table for all clusters
+ * referenced in the L2 table. While doing so, performs some checks on L2
+ * entries.
+ *
+ * Returns the number of errors found by the checks or -errno if an internal
+ * error occurred.
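+ *
+ * A note on compressed entries (illustrative, assuming 64 KiB clusters):
+ * csize_shift is 62 - (16 - 8) = 54 and csize_mask is 0xff, so bits 54..61
+ * of the L2 entry hold the size of the compressed data in 512-byte sectors
+ * minus one, while bits 0..53 (cluster_offset_mask) hold its host offset.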
+ */ +static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, +    uint16_t *refcount_table, int refcount_table_size, int64_t l2_offset, +    int flags) +{ +    BDRVQcowState *s = bs->opaque; +    uint64_t *l2_table, l2_entry; +    uint64_t next_contiguous_offset = 0; +    int i, l2_size, nb_csectors, refcount; + +    /* Read L2 table from disk */ +    l2_size = s->l2_size * sizeof(uint64_t); +    l2_table = g_malloc(l2_size); + +    if (bdrv_pread(bs->file, l2_offset, l2_table, l2_size) != l2_size) +        goto fail; + +    /* Do the actual checks */ +    for(i = 0; i < s->l2_size; i++) { +        l2_entry = be64_to_cpu(l2_table[i]); + +        switch (qcow2_get_cluster_type(l2_entry)) { +        case QCOW2_CLUSTER_COMPRESSED: +            /* Compressed clusters don't have QCOW_OFLAG_COPIED */ +            if (l2_entry & QCOW_OFLAG_COPIED) { +                fprintf(stderr, "ERROR: cluster %" PRId64 ": " +                    "copied flag must never be set for compressed " +                    "clusters\n", l2_entry >> s->cluster_bits); +                l2_entry &= ~QCOW_OFLAG_COPIED; +                res->corruptions++; +            } + +            /* Mark cluster as used */ +            nb_csectors = ((l2_entry >> s->csize_shift) & +                           s->csize_mask) + 1; +            l2_entry &= s->cluster_offset_mask; +            inc_refcounts(bs, res, refcount_table, refcount_table_size, +                l2_entry & ~511, nb_csectors * 512); + +            if (flags & CHECK_FRAG_INFO) { +                res->bfi.allocated_clusters++; +                res->bfi.compressed_clusters++; + +                /* Compressed clusters are fragmented by nature.  Since they +                 * take up sub-sector space but we only have sector granularity +                 * I/O we need to re-read the same sectors even for adjacent +                 * compressed clusters. 
+                 */ +                res->bfi.fragmented_clusters++; +            } +            break; + +        case QCOW2_CLUSTER_ZERO: +            if ((l2_entry & L2E_OFFSET_MASK) == 0) { +                break; +            } +            /* fall through */ + +        case QCOW2_CLUSTER_NORMAL: +        { +            /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */ +            uint64_t offset = l2_entry & L2E_OFFSET_MASK; + +            if (flags & CHECK_OFLAG_COPIED) { +                refcount = get_refcount(bs, offset >> s->cluster_bits); +                if (refcount < 0) { +                    fprintf(stderr, "Can't get refcount for offset %" +                        PRIx64 ": %s\n", l2_entry, strerror(-refcount)); +                    goto fail; +                } +                if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) { +                    fprintf(stderr, "ERROR OFLAG_COPIED: offset=%" +                        PRIx64 " refcount=%d\n", l2_entry, refcount); +                    res->corruptions++; +                } +            } + +            if (flags & CHECK_FRAG_INFO) { +                res->bfi.allocated_clusters++; +                if (next_contiguous_offset && +                    offset != next_contiguous_offset) { +                    res->bfi.fragmented_clusters++; +                } +                next_contiguous_offset = offset + s->cluster_size; +            } + +            /* Mark cluster as used */ +            inc_refcounts(bs, res, refcount_table,refcount_table_size, +                offset, s->cluster_size); + +            /* Correct offsets are cluster aligned */ +            if (offset & (s->cluster_size - 1)) { +                fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not " +                    "properly aligned; L2 entry corrupted.\n", offset); +                res->corruptions++; +            } +            break; +        } + +        case QCOW2_CLUSTER_UNALLOCATED: +            break; + +        default: +            abort(); +        } +    } + +    g_free(l2_table); +    return 0; + +fail: +    fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n"); +    g_free(l2_table); +    return -EIO; +} + +/* + * Increases the refcount for the L1 table, its L2 tables and all referenced + * clusters in the given refcount table. While doing so, performs some checks + * on L1 and L2 entries. + * + * Returns the number of errors found by the checks or -errno if an internal + * error occurred. 
+ */ +static int check_refcounts_l1(BlockDriverState *bs, +                              BdrvCheckResult *res, +                              uint16_t *refcount_table, +                              int refcount_table_size, +                              int64_t l1_table_offset, int l1_size, +                              int flags) +{ +    BDRVQcowState *s = bs->opaque; +    uint64_t *l1_table, l2_offset, l1_size2; +    int i, refcount, ret; + +    l1_size2 = l1_size * sizeof(uint64_t); + +    /* Mark L1 table as used */ +    inc_refcounts(bs, res, refcount_table, refcount_table_size, +        l1_table_offset, l1_size2); + +    /* Read L1 table entries from disk */ +    if (l1_size2 == 0) { +        l1_table = NULL; +    } else { +        l1_table = g_malloc(l1_size2); +        if (bdrv_pread(bs->file, l1_table_offset, +                       l1_table, l1_size2) != l1_size2) +            goto fail; +        for(i = 0;i < l1_size; i++) +            be64_to_cpus(&l1_table[i]); +    } + +    /* Do the actual checks */ +    for(i = 0; i < l1_size; i++) { +        l2_offset = l1_table[i]; +        if (l2_offset) { +            /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */ +            if (flags & CHECK_OFLAG_COPIED) { +                refcount = get_refcount(bs, (l2_offset & ~QCOW_OFLAG_COPIED) +                    >> s->cluster_bits); +                if (refcount < 0) { +                    fprintf(stderr, "Can't get refcount for l2_offset %" +                        PRIx64 ": %s\n", l2_offset, strerror(-refcount)); +                    goto fail; +                } +                if ((refcount == 1) != ((l2_offset & QCOW_OFLAG_COPIED) != 0)) { +                    fprintf(stderr, "ERROR OFLAG_COPIED: l2_offset=%" PRIx64 +                        " refcount=%d\n", l2_offset, refcount); +                    res->corruptions++; +                } +            } + +            /* Mark L2 table as used */ +            l2_offset &= L1E_OFFSET_MASK; +            inc_refcounts(bs, res, refcount_table, refcount_table_size, +                l2_offset, s->cluster_size); + +            /* L2 tables are cluster aligned */ +            if (l2_offset & (s->cluster_size - 1)) { +                fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not " +                    "cluster aligned; L1 entry corrupted\n", l2_offset); +                res->corruptions++; +            } + +            /* Process and check L2 entries */ +            ret = check_refcounts_l2(bs, res, refcount_table, +                                     refcount_table_size, l2_offset, flags); +            if (ret < 0) { +                goto fail; +            } +        } +    } +    g_free(l1_table); +    return 0; + +fail: +    fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n"); +    res->check_errors++; +    g_free(l1_table); +    return -EIO; +} + +/* + * Checks an image for refcount consistency. + * + * Returns 0 if no errors are found, the number of errors in case the image is + * detected as corrupted, and -errno when an internal error occurred. 
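+ *
+ * Repair policy sketch (as implemented below): a cluster whose on-disk
+ * refcount is higher than the computed reference count is merely leaked and
+ * may be repaired with BDRV_FIX_LEAKS; one whose refcount is lower is a
+ * corruption and is only repaired with BDRV_FIX_ERRORS.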
+ */ +int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, +                          BdrvCheckMode fix) +{ +    BDRVQcowState *s = bs->opaque; +    int64_t size, i, highest_cluster; +    int nb_clusters, refcount1, refcount2; +    QCowSnapshot *sn; +    uint16_t *refcount_table; +    int ret; + +    size = bdrv_getlength(bs->file); +    nb_clusters = size_to_clusters(s, size); +    refcount_table = g_malloc0(nb_clusters * sizeof(uint16_t)); + +    res->bfi.total_clusters = +        size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE); + +    /* header */ +    inc_refcounts(bs, res, refcount_table, nb_clusters, +        0, s->cluster_size); + +    /* current L1 table */ +    ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters, +                             s->l1_table_offset, s->l1_size, +                             CHECK_OFLAG_COPIED | CHECK_FRAG_INFO); +    if (ret < 0) { +        goto fail; +    } + +    /* snapshots */ +    for(i = 0; i < s->nb_snapshots; i++) { +        sn = s->snapshots + i; +        ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters, +            sn->l1_table_offset, sn->l1_size, 0); +        if (ret < 0) { +            goto fail; +        } +    } +    inc_refcounts(bs, res, refcount_table, nb_clusters, +        s->snapshots_offset, s->snapshots_size); + +    /* refcount data */ +    inc_refcounts(bs, res, refcount_table, nb_clusters, +        s->refcount_table_offset, +        s->refcount_table_size * sizeof(uint64_t)); + +    for(i = 0; i < s->refcount_table_size; i++) { +        uint64_t offset, cluster; +        offset = s->refcount_table[i]; +        cluster = offset >> s->cluster_bits; + +        /* Refcount blocks are cluster aligned */ +        if (offset & (s->cluster_size - 1)) { +            fprintf(stderr, "ERROR refcount block %" PRId64 " is not " +                "cluster aligned; refcount table entry corrupted\n", i); +            res->corruptions++; +            continue; +        } + +        if (cluster >= nb_clusters) { +            fprintf(stderr, "ERROR refcount block %" PRId64 +                    " is outside image\n", i); +            res->corruptions++; +            continue; +        } + +        if (offset != 0) { +            inc_refcounts(bs, res, refcount_table, nb_clusters, +                offset, s->cluster_size); +            if (refcount_table[cluster] != 1) { +                fprintf(stderr, "ERROR refcount block %" PRId64 +                    " refcount=%d\n", +                    i, refcount_table[cluster]); +                res->corruptions++; +            } +        } +    } + +    /* compare ref counts */ +    for (i = 0, highest_cluster = 0; i < nb_clusters; i++) { +        refcount1 = get_refcount(bs, i); +        if (refcount1 < 0) { +            fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n", +                i, strerror(-refcount1)); +            res->check_errors++; +            continue; +        } + +        refcount2 = refcount_table[i]; + +        if (refcount1 > 0 || refcount2 > 0) { +            highest_cluster = i; +        } + +        if (refcount1 != refcount2) { + +            /* Check if we're allowed to fix the mismatch */ +            int *num_fixed = NULL; +            if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) { +                num_fixed = &res->leaks_fixed; +            } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) { +                num_fixed = &res->corruptions_fixed; +            } + +            
fprintf(stderr, "%s cluster %" PRId64 " refcount=%d reference=%d\n", +                   num_fixed != NULL     ? "Repairing" : +                   refcount1 < refcount2 ? "ERROR" : +                                           "Leaked", +                   i, refcount1, refcount2); + +            if (num_fixed) { +                ret = update_refcount(bs, i << s->cluster_bits, 1, +                                      refcount2 - refcount1, +                                      QCOW2_DISCARD_ALWAYS); +                if (ret >= 0) { +                    (*num_fixed)++; +                    continue; +                } +            } + +            /* And if we couldn't, print an error */ +            if (refcount1 < refcount2) { +                res->corruptions++; +            } else { +                res->leaks++; +            } +        } +    } + +    res->image_end_offset = (highest_cluster + 1) * s->cluster_size; +    ret = 0; + +fail: +    g_free(refcount_table); + +    return ret; +} + diff --git a/contrib/qemu/block/qcow2-snapshot.c b/contrib/qemu/block/qcow2-snapshot.c new file mode 100644 index 000000000..0caac9055 --- /dev/null +++ b/contrib/qemu/block/qcow2-snapshot.c @@ -0,0 +1,660 @@ +/* + * Block driver for the QCOW version 2 format + * + * Copyright (c) 2004-2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "qemu-common.h" +#include "block/block_int.h" +#include "block/qcow2.h" + +typedef struct QEMU_PACKED QCowSnapshotHeader { +    /* header is 8 byte aligned */ +    uint64_t l1_table_offset; + +    uint32_t l1_size; +    uint16_t id_str_size; +    uint16_t name_size; + +    uint32_t date_sec; +    uint32_t date_nsec; + +    uint64_t vm_clock_nsec; + +    uint32_t vm_state_size; +    uint32_t extra_data_size; /* for extension */ +    /* extra data follows */ +    /* id_str follows */ +    /* name follows  */ +} QCowSnapshotHeader; + +typedef struct QEMU_PACKED QCowSnapshotExtraData { +    uint64_t vm_state_size_large; +    uint64_t disk_size; +} QCowSnapshotExtraData; + +void qcow2_free_snapshots(BlockDriverState *bs) +{ +    BDRVQcowState *s = bs->opaque; +    int i; + +    for(i = 0; i < s->nb_snapshots; i++) { +        g_free(s->snapshots[i].name); +        g_free(s->snapshots[i].id_str); +    } +    g_free(s->snapshots); +    s->snapshots = NULL; +    s->nb_snapshots = 0; +} + +int qcow2_read_snapshots(BlockDriverState *bs) +{ +    BDRVQcowState *s = bs->opaque; +    QCowSnapshotHeader h; +    QCowSnapshotExtraData extra; +    QCowSnapshot *sn; +    int i, id_str_size, name_size; +    int64_t offset; +    uint32_t extra_data_size; +    int ret; + +    if (!s->nb_snapshots) { +        s->snapshots = NULL; +        s->snapshots_size = 0; +        return 0; +    } + +    offset = s->snapshots_offset; +    s->snapshots = g_malloc0(s->nb_snapshots * sizeof(QCowSnapshot)); + +    for(i = 0; i < s->nb_snapshots; i++) { +        /* Read statically sized part of the snapshot header */ +        offset = align_offset(offset, 8); +        ret = bdrv_pread(bs->file, offset, &h, sizeof(h)); +        if (ret < 0) { +            goto fail; +        } + +        offset += sizeof(h); +        sn = s->snapshots + i; +        sn->l1_table_offset = be64_to_cpu(h.l1_table_offset); +        sn->l1_size = be32_to_cpu(h.l1_size); +        sn->vm_state_size = be32_to_cpu(h.vm_state_size); +        sn->date_sec = be32_to_cpu(h.date_sec); +        sn->date_nsec = be32_to_cpu(h.date_nsec); +        sn->vm_clock_nsec = be64_to_cpu(h.vm_clock_nsec); +        extra_data_size = be32_to_cpu(h.extra_data_size); + +        id_str_size = be16_to_cpu(h.id_str_size); +        name_size = be16_to_cpu(h.name_size); + +        /* Read extra data */ +        ret = bdrv_pread(bs->file, offset, &extra, +                         MIN(sizeof(extra), extra_data_size)); +        if (ret < 0) { +            goto fail; +        } +        offset += extra_data_size; + +        if (extra_data_size >= 8) { +            sn->vm_state_size = be64_to_cpu(extra.vm_state_size_large); +        } + +        if (extra_data_size >= 16) { +            sn->disk_size = be64_to_cpu(extra.disk_size); +        } else { +            sn->disk_size = bs->total_sectors * BDRV_SECTOR_SIZE; +        } + +        /* Read snapshot ID */ +        sn->id_str = g_malloc(id_str_size + 1); +        ret = bdrv_pread(bs->file, offset, sn->id_str, id_str_size); +        if (ret < 0) { +            goto fail; +        } +        offset += id_str_size; +        sn->id_str[id_str_size] = '\0'; + +        /* Read snapshot name */ +        sn->name = g_malloc(name_size + 1); +        ret = bdrv_pread(bs->file, offset, sn->name, name_size); +        if (ret < 0) { +            goto fail; +        } +        offset += name_size; +        sn->name[name_size] = '\0'; +    } + +    s->snapshots_size = offset - s->snapshots_offset; +    return 0; + +fail: +    
qcow2_free_snapshots(bs); +    return ret; +} + +/* add at the end of the file a new list of snapshots */ +static int qcow2_write_snapshots(BlockDriverState *bs) +{ +    BDRVQcowState *s = bs->opaque; +    QCowSnapshot *sn; +    QCowSnapshotHeader h; +    QCowSnapshotExtraData extra; +    int i, name_size, id_str_size, snapshots_size; +    struct { +        uint32_t nb_snapshots; +        uint64_t snapshots_offset; +    } QEMU_PACKED header_data; +    int64_t offset, snapshots_offset; +    int ret; + +    /* compute the size of the snapshots */ +    offset = 0; +    for(i = 0; i < s->nb_snapshots; i++) { +        sn = s->snapshots + i; +        offset = align_offset(offset, 8); +        offset += sizeof(h); +        offset += sizeof(extra); +        offset += strlen(sn->id_str); +        offset += strlen(sn->name); +    } +    snapshots_size = offset; + +    /* Allocate space for the new snapshot list */ +    snapshots_offset = qcow2_alloc_clusters(bs, snapshots_size); +    offset = snapshots_offset; +    if (offset < 0) { +        return offset; +    } +    ret = bdrv_flush(bs); +    if (ret < 0) { +        return ret; +    } + +    /* Write all snapshots to the new list */ +    for(i = 0; i < s->nb_snapshots; i++) { +        sn = s->snapshots + i; +        memset(&h, 0, sizeof(h)); +        h.l1_table_offset = cpu_to_be64(sn->l1_table_offset); +        h.l1_size = cpu_to_be32(sn->l1_size); +        /* If it doesn't fit in 32 bit, older implementations should treat it +         * as a disk-only snapshot rather than truncate the VM state */ +        if (sn->vm_state_size <= 0xffffffff) { +            h.vm_state_size = cpu_to_be32(sn->vm_state_size); +        } +        h.date_sec = cpu_to_be32(sn->date_sec); +        h.date_nsec = cpu_to_be32(sn->date_nsec); +        h.vm_clock_nsec = cpu_to_be64(sn->vm_clock_nsec); +        h.extra_data_size = cpu_to_be32(sizeof(extra)); + +        memset(&extra, 0, sizeof(extra)); +        extra.vm_state_size_large = cpu_to_be64(sn->vm_state_size); +        extra.disk_size = cpu_to_be64(sn->disk_size); + +        id_str_size = strlen(sn->id_str); +        name_size = strlen(sn->name); +        h.id_str_size = cpu_to_be16(id_str_size); +        h.name_size = cpu_to_be16(name_size); +        offset = align_offset(offset, 8); + +        ret = bdrv_pwrite(bs->file, offset, &h, sizeof(h)); +        if (ret < 0) { +            goto fail; +        } +        offset += sizeof(h); + +        ret = bdrv_pwrite(bs->file, offset, &extra, sizeof(extra)); +        if (ret < 0) { +            goto fail; +        } +        offset += sizeof(extra); + +        ret = bdrv_pwrite(bs->file, offset, sn->id_str, id_str_size); +        if (ret < 0) { +            goto fail; +        } +        offset += id_str_size; + +        ret = bdrv_pwrite(bs->file, offset, sn->name, name_size); +        if (ret < 0) { +            goto fail; +        } +        offset += name_size; +    } + +    /* +     * Update the header to point to the new snapshot table. This requires the +     * new table and its refcounts to be stable on disk. 
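+     *
+     * nb_snapshots and snapshots_offset are updated with a single write,
+     * which relies on snapshots_offset immediately following nb_snapshots
+     * in QCowHeader; the QEMU_BUILD_BUG_ON below guards that layout
+     * assumption.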
+     */ +    ret = bdrv_flush(bs); +    if (ret < 0) { +        goto fail; +    } + +    QEMU_BUILD_BUG_ON(offsetof(QCowHeader, snapshots_offset) != +        offsetof(QCowHeader, nb_snapshots) + sizeof(header_data.nb_snapshots)); + +    header_data.nb_snapshots        = cpu_to_be32(s->nb_snapshots); +    header_data.snapshots_offset    = cpu_to_be64(snapshots_offset); + +    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, nb_snapshots), +                           &header_data, sizeof(header_data)); +    if (ret < 0) { +        goto fail; +    } + +    /* free the old snapshot table */ +    qcow2_free_clusters(bs, s->snapshots_offset, s->snapshots_size, +                        QCOW2_DISCARD_SNAPSHOT); +    s->snapshots_offset = snapshots_offset; +    s->snapshots_size = snapshots_size; +    return 0; + +fail: +    return ret; +} + +static void find_new_snapshot_id(BlockDriverState *bs, +                                 char *id_str, int id_str_size) +{ +    BDRVQcowState *s = bs->opaque; +    QCowSnapshot *sn; +    int i, id, id_max = 0; + +    for(i = 0; i < s->nb_snapshots; i++) { +        sn = s->snapshots + i; +        id = strtoul(sn->id_str, NULL, 10); +        if (id > id_max) +            id_max = id; +    } +    snprintf(id_str, id_str_size, "%d", id_max + 1); +} + +static int find_snapshot_by_id(BlockDriverState *bs, const char *id_str) +{ +    BDRVQcowState *s = bs->opaque; +    int i; + +    for(i = 0; i < s->nb_snapshots; i++) { +        if (!strcmp(s->snapshots[i].id_str, id_str)) +            return i; +    } +    return -1; +} + +static int find_snapshot_by_id_or_name(BlockDriverState *bs, const char *name) +{ +    BDRVQcowState *s = bs->opaque; +    int i, ret; + +    ret = find_snapshot_by_id(bs, name); +    if (ret >= 0) +        return ret; +    for(i = 0; i < s->nb_snapshots; i++) { +        if (!strcmp(s->snapshots[i].name, name)) +            return i; +    } +    return -1; +} + +/* if no id is provided, a new one is constructed */ +int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) +{ +    BDRVQcowState *s = bs->opaque; +    QCowSnapshot *new_snapshot_list = NULL; +    QCowSnapshot *old_snapshot_list = NULL; +    QCowSnapshot sn1, *sn = &sn1; +    int i, ret; +    uint64_t *l1_table = NULL; +    int64_t l1_table_offset; + +    memset(sn, 0, sizeof(*sn)); + +    /* Generate an ID if it wasn't passed */ +    if (sn_info->id_str[0] == '\0') { +        find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str)); +    } + +    /* Check that the ID is unique */ +    if (find_snapshot_by_id(bs, sn_info->id_str) >= 0) { +        return -EEXIST; +    } + +    /* Populate sn with passed data */ +    sn->id_str = g_strdup(sn_info->id_str); +    sn->name = g_strdup(sn_info->name); + +    sn->disk_size = bs->total_sectors * BDRV_SECTOR_SIZE; +    sn->vm_state_size = sn_info->vm_state_size; +    sn->date_sec = sn_info->date_sec; +    sn->date_nsec = sn_info->date_nsec; +    sn->vm_clock_nsec = sn_info->vm_clock_nsec; + +    /* Allocate the L1 table of the snapshot and copy the current one there. 
*/ +    l1_table_offset = qcow2_alloc_clusters(bs, s->l1_size * sizeof(uint64_t)); +    if (l1_table_offset < 0) { +        ret = l1_table_offset; +        goto fail; +    } + +    sn->l1_table_offset = l1_table_offset; +    sn->l1_size = s->l1_size; + +    l1_table = g_malloc(s->l1_size * sizeof(uint64_t)); +    for(i = 0; i < s->l1_size; i++) { +        l1_table[i] = cpu_to_be64(s->l1_table[i]); +    } + +    ret = bdrv_pwrite(bs->file, sn->l1_table_offset, l1_table, +                      s->l1_size * sizeof(uint64_t)); +    if (ret < 0) { +        goto fail; +    } + +    g_free(l1_table); +    l1_table = NULL; + +    /* +     * Increase the refcounts of all clusters and make sure everything is +     * stable on disk before updating the snapshot table to contain a pointer +     * to the new L1 table. +     */ +    ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1); +    if (ret < 0) { +        goto fail; +    } + +    /* Append the new snapshot to the snapshot list */ +    new_snapshot_list = g_malloc((s->nb_snapshots + 1) * sizeof(QCowSnapshot)); +    if (s->snapshots) { +        memcpy(new_snapshot_list, s->snapshots, +               s->nb_snapshots * sizeof(QCowSnapshot)); +        old_snapshot_list = s->snapshots; +    } +    s->snapshots = new_snapshot_list; +    s->snapshots[s->nb_snapshots++] = *sn; + +    ret = qcow2_write_snapshots(bs); +    if (ret < 0) { +        g_free(s->snapshots); +        s->snapshots = old_snapshot_list; +        goto fail; +    } + +    g_free(old_snapshot_list); + +#ifdef DEBUG_ALLOC +    { +      BdrvCheckResult result = {0}; +      qcow2_check_refcounts(bs, &result, 0); +    } +#endif +    return 0; + +fail: +    g_free(sn->id_str); +    g_free(sn->name); +    g_free(l1_table); + +    return ret; +} + +/* copy the snapshot 'snapshot_name' into the current disk image */ +int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) +{ +    BDRVQcowState *s = bs->opaque; +    QCowSnapshot *sn; +    int i, snapshot_index; +    int cur_l1_bytes, sn_l1_bytes; +    int ret; +    uint64_t *sn_l1_table = NULL; + +    /* Search the snapshot */ +    snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id); +    if (snapshot_index < 0) { +        return -ENOENT; +    } +    sn = &s->snapshots[snapshot_index]; + +    if (sn->disk_size != bs->total_sectors * BDRV_SECTOR_SIZE) { +        error_report("qcow2: Loading snapshots with different disk " +            "size is not implemented"); +        ret = -ENOTSUP; +        goto fail; +    } + +    /* +     * Make sure that the current L1 table is big enough to contain the whole +     * L1 table of the snapshot. If the snapshot L1 table is smaller, the +     * current one must be padded with zeros. +     */ +    ret = qcow2_grow_l1_table(bs, sn->l1_size, true); +    if (ret < 0) { +        goto fail; +    } + +    cur_l1_bytes = s->l1_size * sizeof(uint64_t); +    sn_l1_bytes = sn->l1_size * sizeof(uint64_t); + +    /* +     * Copy the snapshot L1 table to the current L1 table. +     * +     * Before overwriting the old current L1 table on disk, make sure to +     * increase all refcounts for the clusters referenced by the new one. +     * Decrease the refcount referenced by the old one only when the L1 +     * table is overwritten. 
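+     *
+     * In other words the order is: (1) bump refcounts through the
+     * snapshot's L1 table, (2) overwrite the active L1 table on disk,
+     * (3) drop refcounts through the old in-memory L1 table. An
+     * interruption at any point can at worst leak clusters, but never
+     * frees data that is still referenced.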
+     */ +    sn_l1_table = g_malloc0(cur_l1_bytes); + +    ret = bdrv_pread(bs->file, sn->l1_table_offset, sn_l1_table, sn_l1_bytes); +    if (ret < 0) { +        goto fail; +    } + +    ret = qcow2_update_snapshot_refcount(bs, sn->l1_table_offset, +                                         sn->l1_size, 1); +    if (ret < 0) { +        goto fail; +    } + +    ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset, sn_l1_table, +                           cur_l1_bytes); +    if (ret < 0) { +        goto fail; +    } + +    /* +     * Decrease refcount of clusters of current L1 table. +     * +     * At this point, the in-memory s->l1_table points to the old L1 table, +     * whereas on disk we already have the new one. +     * +     * qcow2_update_snapshot_refcount special cases the current L1 table to use +     * the in-memory data instead of really using the offset to load a new one, +     * which is why this works. +     */ +    ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, +                                         s->l1_size, -1); + +    /* +     * Now update the in-memory L1 table to be in sync with the on-disk one. We +     * need to do this even if updating refcounts failed. +     */ +    for(i = 0;i < s->l1_size; i++) { +        s->l1_table[i] = be64_to_cpu(sn_l1_table[i]); +    } + +    if (ret < 0) { +        goto fail; +    } + +    g_free(sn_l1_table); +    sn_l1_table = NULL; + +    /* +     * Update QCOW_OFLAG_COPIED in the active L1 table (it may have changed +     * when we decreased the refcount of the old snapshot. +     */ +    ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0); +    if (ret < 0) { +        goto fail; +    } + +#ifdef DEBUG_ALLOC +    { +        BdrvCheckResult result = {0}; +        qcow2_check_refcounts(bs, &result, 0); +    } +#endif +    return 0; + +fail: +    g_free(sn_l1_table); +    return ret; +} + +int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id) +{ +    BDRVQcowState *s = bs->opaque; +    QCowSnapshot sn; +    int snapshot_index, ret; + +    /* Search the snapshot */ +    snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id); +    if (snapshot_index < 0) { +        return -ENOENT; +    } +    sn = s->snapshots[snapshot_index]; + +    /* Remove it from the snapshot list */ +    memmove(s->snapshots + snapshot_index, +            s->snapshots + snapshot_index + 1, +            (s->nb_snapshots - snapshot_index - 1) * sizeof(sn)); +    s->nb_snapshots--; +    ret = qcow2_write_snapshots(bs); +    if (ret < 0) { +        return ret; +    } + +    /* +     * The snapshot is now unused, clean up. If we fail after this point, we +     * won't recover but just leak clusters. +     */ +    g_free(sn.id_str); +    g_free(sn.name); + +    /* +     * Now decrease the refcounts of clusters referenced by the snapshot and +     * free the L1 table. 
+     */ +    ret = qcow2_update_snapshot_refcount(bs, sn.l1_table_offset, +                                         sn.l1_size, -1); +    if (ret < 0) { +        return ret; +    } +    qcow2_free_clusters(bs, sn.l1_table_offset, sn.l1_size * sizeof(uint64_t), +                        QCOW2_DISCARD_SNAPSHOT); + +    /* must update the copied flag on the current cluster offsets */ +    ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0); +    if (ret < 0) { +        return ret; +    } + +#ifdef DEBUG_ALLOC +    { +        BdrvCheckResult result = {0}; +        qcow2_check_refcounts(bs, &result, 0); +    } +#endif +    return 0; +} + +int qcow2_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab) +{ +    BDRVQcowState *s = bs->opaque; +    QEMUSnapshotInfo *sn_tab, *sn_info; +    QCowSnapshot *sn; +    int i; + +    if (!s->nb_snapshots) { +        *psn_tab = NULL; +        return s->nb_snapshots; +    } + +    sn_tab = g_malloc0(s->nb_snapshots * sizeof(QEMUSnapshotInfo)); +    for(i = 0; i < s->nb_snapshots; i++) { +        sn_info = sn_tab + i; +        sn = s->snapshots + i; +        pstrcpy(sn_info->id_str, sizeof(sn_info->id_str), +                sn->id_str); +        pstrcpy(sn_info->name, sizeof(sn_info->name), +                sn->name); +        sn_info->vm_state_size = sn->vm_state_size; +        sn_info->date_sec = sn->date_sec; +        sn_info->date_nsec = sn->date_nsec; +        sn_info->vm_clock_nsec = sn->vm_clock_nsec; +    } +    *psn_tab = sn_tab; +    return s->nb_snapshots; +} + +int qcow2_snapshot_load_tmp(BlockDriverState *bs, const char *snapshot_name) +{ +    int i, snapshot_index; +    BDRVQcowState *s = bs->opaque; +    QCowSnapshot *sn; +    uint64_t *new_l1_table; +    int new_l1_bytes; +    int ret; + +    assert(bs->read_only); + +    /* Search the snapshot */ +    snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_name); +    if (snapshot_index < 0) { +        return -ENOENT; +    } +    sn = &s->snapshots[snapshot_index]; + +    /* Allocate and read in the snapshot's L1 table */ +    new_l1_bytes = s->l1_size * sizeof(uint64_t); +    new_l1_table = g_malloc0(align_offset(new_l1_bytes, 512)); + +    ret = bdrv_pread(bs->file, sn->l1_table_offset, new_l1_table, new_l1_bytes); +    if (ret < 0) { +        g_free(new_l1_table); +        return ret; +    } + +    /* Switch the L1 table */ +    g_free(s->l1_table); + +    s->l1_size = sn->l1_size; +    s->l1_table_offset = sn->l1_table_offset; +    s->l1_table = new_l1_table; + +    for(i = 0;i < s->l1_size; i++) { +        be64_to_cpus(&s->l1_table[i]); +    } + +    return 0; +} diff --git a/contrib/qemu/block/qcow2.c b/contrib/qemu/block/qcow2.c new file mode 100644 index 000000000..0eceefe2c --- /dev/null +++ b/contrib/qemu/block/qcow2.c @@ -0,0 +1,1825 @@ +/* + * Block driver for the QCOW version 2 format + * + * Copyright (c) 2004-2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include <zlib.h> +#include "qemu/aes.h" +#include "block/qcow2.h" +#include "qemu/error-report.h" +#include "qapi/qmp/qerror.h" +#include "qapi/qmp/qbool.h" +#include "trace.h" + +/* +  Differences with QCOW: + +  - Support for multiple incremental snapshots. +  - Memory management by reference counts. +  - Clusters which have a reference count of one have the bit +    QCOW_OFLAG_COPIED to optimize write performance. +  - Size of compressed clusters is stored in sectors to reduce bit usage +    in the cluster offsets. +  - Support for storing additional data (such as the VM state) in the +    snapshots. +  - If a backing store is used, the cluster size is not constrained +    (could be backported to QCOW). +  - L2 tables have always a size of one cluster. +*/ + + +typedef struct { +    uint32_t magic; +    uint32_t len; +} QCowExtension; + +#define  QCOW2_EXT_MAGIC_END 0 +#define  QCOW2_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA +#define  QCOW2_EXT_MAGIC_FEATURE_TABLE 0x6803f857 + +static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename) +{ +    const QCowHeader *cow_header = (const void *)buf; + +    if (buf_size >= sizeof(QCowHeader) && +        be32_to_cpu(cow_header->magic) == QCOW_MAGIC && +        be32_to_cpu(cow_header->version) >= 2) +        return 100; +    else +        return 0; +} + + +/*  + * read qcow2 extension and fill bs + * start reading from start_offset + * finish reading upon magic of value 0 or when end_offset reached + * unknown magic is skipped (future extension this version knows nothing about) + * return 0 upon success, non-0 otherwise + */ +static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, +                                 uint64_t end_offset, void **p_feature_table) +{ +    BDRVQcowState *s = bs->opaque; +    QCowExtension ext; +    uint64_t offset; +    int ret; + +#ifdef DEBUG_EXT +    printf("qcow2_read_extensions: start=%ld end=%ld\n", start_offset, end_offset); +#endif +    offset = start_offset; +    while (offset < end_offset) { + +#ifdef DEBUG_EXT +        /* Sanity check */ +        if (offset > s->cluster_size) +            printf("qcow2_read_extension: suspicious offset %lu\n", offset); + +        printf("attempting to read extended header in offset %lu\n", offset); +#endif + +        if (bdrv_pread(bs->file, offset, &ext, sizeof(ext)) != sizeof(ext)) { +            fprintf(stderr, "qcow2_read_extension: ERROR: " +                    "pread fail from offset %" PRIu64 "\n", +                    offset); +            return 1; +        } +        be32_to_cpus(&ext.magic); +        be32_to_cpus(&ext.len); +        offset += sizeof(ext); +#ifdef DEBUG_EXT +        printf("ext.magic = 0x%x\n", ext.magic); +#endif +        if (ext.len > end_offset - offset) { +            error_report("Header extension too large"); +            return -EINVAL; +        } + +        switch (ext.magic) { +        case QCOW2_EXT_MAGIC_END: +            return 0; 
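+
+        /* Layout sketch (illustrative): every extension is a big-endian
+         * { magic, len } pair followed by len bytes of data, padded up to
+         * the next 8-byte boundary. For example, a backing format string
+         * "raw" (len = 3) is followed by 5 padding bytes, which is what the
+         * offset += ((ext.len + 7) & ~7) at the end of this loop accounts
+         * for. */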
+ +        case QCOW2_EXT_MAGIC_BACKING_FORMAT: +            if (ext.len >= sizeof(bs->backing_format)) { +                fprintf(stderr, "ERROR: ext_backing_format: len=%u too large" +                        " (>=%zu)\n", +                        ext.len, sizeof(bs->backing_format)); +                return 2; +            } +            if (bdrv_pread(bs->file, offset , bs->backing_format, +                           ext.len) != ext.len) +                return 3; +            bs->backing_format[ext.len] = '\0'; +#ifdef DEBUG_EXT +            printf("Qcow2: Got format extension %s\n", bs->backing_format); +#endif +            break; + +        case QCOW2_EXT_MAGIC_FEATURE_TABLE: +            if (p_feature_table != NULL) { +                void* feature_table = g_malloc0(ext.len + 2 * sizeof(Qcow2Feature)); +                ret = bdrv_pread(bs->file, offset , feature_table, ext.len); +                if (ret < 0) { +                    return ret; +                } + +                *p_feature_table = feature_table; +            } +            break; + +        default: +            /* unknown magic - save it in case we need to rewrite the header */ +            { +                Qcow2UnknownHeaderExtension *uext; + +                uext = g_malloc0(sizeof(*uext)  + ext.len); +                uext->magic = ext.magic; +                uext->len = ext.len; +                QLIST_INSERT_HEAD(&s->unknown_header_ext, uext, next); + +                ret = bdrv_pread(bs->file, offset , uext->data, uext->len); +                if (ret < 0) { +                    return ret; +                } +            } +            break; +        } + +        offset += ((ext.len + 7) & ~7); +    } + +    return 0; +} + +static void cleanup_unknown_header_ext(BlockDriverState *bs) +{ +    BDRVQcowState *s = bs->opaque; +    Qcow2UnknownHeaderExtension *uext, *next; + +    QLIST_FOREACH_SAFE(uext, &s->unknown_header_ext, next, next) { +        QLIST_REMOVE(uext, next); +        g_free(uext); +    } +} + +static void GCC_FMT_ATTR(2, 3) report_unsupported(BlockDriverState *bs, +    const char *fmt, ...) +{ +    char msg[64]; +    va_list ap; + +    va_start(ap, fmt); +    vsnprintf(msg, sizeof(msg), fmt, ap); +    va_end(ap); + +    qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, +        bs->device_name, "qcow2", msg); +} + +static void report_unsupported_feature(BlockDriverState *bs, +    Qcow2Feature *table, uint64_t mask) +{ +    while (table && table->name[0] != '\0') { +        if (table->type == QCOW2_FEAT_TYPE_INCOMPATIBLE) { +            if (mask & (1 << table->bit)) { +                report_unsupported(bs, "%.46s",table->name); +                mask &= ~(1 << table->bit); +            } +        } +        table++; +    } + +    if (mask) { +        report_unsupported(bs, "Unknown incompatible feature: %" PRIx64, mask); +    } +} + +/* + * Sets the dirty bit and flushes afterwards if necessary. + * + * The incompatible_features bit is only set if the image file header was + * updated successfully.  Therefore it is not required to check the return + * value of this function. 
+ */ +int qcow2_mark_dirty(BlockDriverState *bs) +{ +    BDRVQcowState *s = bs->opaque; +    uint64_t val; +    int ret; + +    assert(s->qcow_version >= 3); + +    if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) { +        return 0; /* already dirty */ +    } + +    val = cpu_to_be64(s->incompatible_features | QCOW2_INCOMPAT_DIRTY); +    ret = bdrv_pwrite(bs->file, offsetof(QCowHeader, incompatible_features), +                      &val, sizeof(val)); +    if (ret < 0) { +        return ret; +    } +    ret = bdrv_flush(bs->file); +    if (ret < 0) { +        return ret; +    } + +    /* Only treat image as dirty if the header was updated successfully */ +    s->incompatible_features |= QCOW2_INCOMPAT_DIRTY; +    return 0; +} + +/* + * Clears the dirty bit and flushes before if necessary.  Only call this + * function when there are no pending requests, it does not guard against + * concurrent requests dirtying the image. + */ +static int qcow2_mark_clean(BlockDriverState *bs) +{ +    BDRVQcowState *s = bs->opaque; + +    if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) { +        int ret = bdrv_flush(bs); +        if (ret < 0) { +            return ret; +        } + +        s->incompatible_features &= ~QCOW2_INCOMPAT_DIRTY; +        return qcow2_update_header(bs); +    } +    return 0; +} + +static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result, +                       BdrvCheckMode fix) +{ +    int ret = qcow2_check_refcounts(bs, result, fix); +    if (ret < 0) { +        return ret; +    } + +    if (fix && result->check_errors == 0 && result->corruptions == 0) { +        return qcow2_mark_clean(bs); +    } +    return ret; +} + +static QemuOptsList qcow2_runtime_opts = { +    .name = "qcow2", +    .head = QTAILQ_HEAD_INITIALIZER(qcow2_runtime_opts.head), +    .desc = { +        { +            .name = "lazy_refcounts", +            .type = QEMU_OPT_BOOL, +            .help = "Postpone refcount updates", +        }, +        { +            .name = QCOW2_OPT_DISCARD_REQUEST, +            .type = QEMU_OPT_BOOL, +            .help = "Pass guest discard requests to the layer below", +        }, +        { +            .name = QCOW2_OPT_DISCARD_SNAPSHOT, +            .type = QEMU_OPT_BOOL, +            .help = "Generate discard requests when snapshot related space " +                    "is freed", +        }, +        { +            .name = QCOW2_OPT_DISCARD_OTHER, +            .type = QEMU_OPT_BOOL, +            .help = "Generate discard requests when other clusters are freed", +        }, +        { /* end of list */ } +    }, +}; + +static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) +{ +    BDRVQcowState *s = bs->opaque; +    int len, i, ret = 0; +    QCowHeader header; +    QemuOpts *opts; +    Error *local_err = NULL; +    uint64_t ext_end; +    uint64_t l1_vm_state_index; + +    ret = bdrv_pread(bs->file, 0, &header, sizeof(header)); +    if (ret < 0) { +        goto fail; +    } +    be32_to_cpus(&header.magic); +    be32_to_cpus(&header.version); +    be64_to_cpus(&header.backing_file_offset); +    be32_to_cpus(&header.backing_file_size); +    be64_to_cpus(&header.size); +    be32_to_cpus(&header.cluster_bits); +    be32_to_cpus(&header.crypt_method); +    be64_to_cpus(&header.l1_table_offset); +    be32_to_cpus(&header.l1_size); +    be64_to_cpus(&header.refcount_table_offset); +    be32_to_cpus(&header.refcount_table_clusters); +    be64_to_cpus(&header.snapshots_offset); +    be32_to_cpus(&header.nb_snapshots); + +    if (header.magic 
!= QCOW_MAGIC) { +        ret = -EMEDIUMTYPE; +        goto fail; +    } +    if (header.version < 2 || header.version > 3) { +        report_unsupported(bs, "QCOW version %d", header.version); +        ret = -ENOTSUP; +        goto fail; +    } + +    s->qcow_version = header.version; + +    /* Initialise version 3 header fields */ +    if (header.version == 2) { +        header.incompatible_features    = 0; +        header.compatible_features      = 0; +        header.autoclear_features       = 0; +        header.refcount_order           = 4; +        header.header_length            = 72; +    } else { +        be64_to_cpus(&header.incompatible_features); +        be64_to_cpus(&header.compatible_features); +        be64_to_cpus(&header.autoclear_features); +        be32_to_cpus(&header.refcount_order); +        be32_to_cpus(&header.header_length); +    } + +    if (header.header_length > sizeof(header)) { +        s->unknown_header_fields_size = header.header_length - sizeof(header); +        s->unknown_header_fields = g_malloc(s->unknown_header_fields_size); +        ret = bdrv_pread(bs->file, sizeof(header), s->unknown_header_fields, +                         s->unknown_header_fields_size); +        if (ret < 0) { +            goto fail; +        } +    } + +    if (header.backing_file_offset) { +        ext_end = header.backing_file_offset; +    } else { +        ext_end = 1 << header.cluster_bits; +    } + +    /* Handle feature bits */ +    s->incompatible_features    = header.incompatible_features; +    s->compatible_features      = header.compatible_features; +    s->autoclear_features       = header.autoclear_features; + +    if (s->incompatible_features & ~QCOW2_INCOMPAT_MASK) { +        void *feature_table = NULL; +        qcow2_read_extensions(bs, header.header_length, ext_end, +                              &feature_table); +        report_unsupported_feature(bs, feature_table, +                                   s->incompatible_features & +                                   ~QCOW2_INCOMPAT_MASK); +        ret = -ENOTSUP; +        goto fail; +    } + +    /* Check support for various header values */ +    if (header.refcount_order != 4) { +        report_unsupported(bs, "%d bit reference counts", +                           1 << header.refcount_order); +        ret = -ENOTSUP; +        goto fail; +    } + +    if (header.cluster_bits < MIN_CLUSTER_BITS || +        header.cluster_bits > MAX_CLUSTER_BITS) { +        ret = -EINVAL; +        goto fail; +    } +    if (header.crypt_method > QCOW_CRYPT_AES) { +        ret = -EINVAL; +        goto fail; +    } +    s->crypt_method_header = header.crypt_method; +    if (s->crypt_method_header) { +        bs->encrypted = 1; +    } +    s->cluster_bits = header.cluster_bits; +    s->cluster_size = 1 << s->cluster_bits; +    s->cluster_sectors = 1 << (s->cluster_bits - 9); +    s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */ +    s->l2_size = 1 << s->l2_bits; +    bs->total_sectors = header.size / 512; +    s->csize_shift = (62 - (s->cluster_bits - 8)); +    s->csize_mask = (1 << (s->cluster_bits - 8)) - 1; +    s->cluster_offset_mask = (1LL << s->csize_shift) - 1; +    s->refcount_table_offset = header.refcount_table_offset; +    s->refcount_table_size = +        header.refcount_table_clusters << (s->cluster_bits - 3); + +    s->snapshots_offset = header.snapshots_offset; +    s->nb_snapshots = header.nb_snapshots; + +    /* read the level 1 table */ +    s->l1_size = header.l1_size; + +    l1_vm_state_index = 
size_to_l1(s, header.size); +    if (l1_vm_state_index > INT_MAX) { +        ret = -EFBIG; +        goto fail; +    } +    s->l1_vm_state_index = l1_vm_state_index; + +    /* the L1 table must contain at least enough entries to put +       header.size bytes */ +    if (s->l1_size < s->l1_vm_state_index) { +        ret = -EINVAL; +        goto fail; +    } +    s->l1_table_offset = header.l1_table_offset; +    if (s->l1_size > 0) { +        s->l1_table = g_malloc0( +            align_offset(s->l1_size * sizeof(uint64_t), 512)); +        ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table, +                         s->l1_size * sizeof(uint64_t)); +        if (ret < 0) { +            goto fail; +        } +        for(i = 0;i < s->l1_size; i++) { +            be64_to_cpus(&s->l1_table[i]); +        } +    } + +    /* alloc L2 table/refcount block cache */ +    s->l2_table_cache = qcow2_cache_create(bs, L2_CACHE_SIZE); +    s->refcount_block_cache = qcow2_cache_create(bs, REFCOUNT_CACHE_SIZE); + +    s->cluster_cache = g_malloc(s->cluster_size); +    /* one more sector for decompressed data alignment */ +    s->cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size +                                  + 512); +    s->cluster_cache_offset = -1; +    s->flags = flags; + +    ret = qcow2_refcount_init(bs); +    if (ret != 0) { +        goto fail; +    } + +    QLIST_INIT(&s->cluster_allocs); +    QTAILQ_INIT(&s->discards); + +    /* read qcow2 extensions */ +    if (qcow2_read_extensions(bs, header.header_length, ext_end, NULL)) { +        ret = -EINVAL; +        goto fail; +    } + +    /* read the backing file name */ +    if (header.backing_file_offset != 0) { +        len = header.backing_file_size; +        if (len > 1023) { +            len = 1023; +        } +        ret = bdrv_pread(bs->file, header.backing_file_offset, +                         bs->backing_file, len); +        if (ret < 0) { +            goto fail; +        } +        bs->backing_file[len] = '\0'; +    } + +    ret = qcow2_read_snapshots(bs); +    if (ret < 0) { +        goto fail; +    } + +    /* Clear unknown autoclear feature bits */ +    if (!bs->read_only && s->autoclear_features != 0) { +        s->autoclear_features = 0; +        ret = qcow2_update_header(bs); +        if (ret < 0) { +            goto fail; +        } +    } + +    /* Initialise locks */ +    qemu_co_mutex_init(&s->lock); + +    /* Repair image if dirty */ +    if (!(flags & BDRV_O_CHECK) && !bs->read_only && +        (s->incompatible_features & QCOW2_INCOMPAT_DIRTY)) { +        BdrvCheckResult result = {0}; + +        ret = qcow2_check(bs, &result, BDRV_FIX_ERRORS); +        if (ret < 0) { +            goto fail; +        } +    } + +    /* Enable lazy_refcounts according to image and command line options */ +    opts = qemu_opts_create_nofail(&qcow2_runtime_opts); +    qemu_opts_absorb_qdict(opts, options, &local_err); +    if (error_is_set(&local_err)) { +        qerror_report_err(local_err); +        error_free(local_err); +        ret = -EINVAL; +        goto fail; +    } + +    s->use_lazy_refcounts = qemu_opt_get_bool(opts, QCOW2_OPT_LAZY_REFCOUNTS, +        (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS)); + +    s->discard_passthrough[QCOW2_DISCARD_NEVER] = false; +    s->discard_passthrough[QCOW2_DISCARD_ALWAYS] = true; +    s->discard_passthrough[QCOW2_DISCARD_REQUEST] = +        qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_REQUEST, +                          flags & BDRV_O_UNMAP); +    
s->discard_passthrough[QCOW2_DISCARD_SNAPSHOT] = +        qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_SNAPSHOT, true); +    s->discard_passthrough[QCOW2_DISCARD_OTHER] = +        qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_OTHER, false); + +    qemu_opts_del(opts); + +    if (s->use_lazy_refcounts && s->qcow_version < 3) { +        qerror_report(ERROR_CLASS_GENERIC_ERROR, "Lazy refcounts require " +            "a qcow2 image with at least qemu 1.1 compatibility level"); +        ret = -EINVAL; +        goto fail; +    } + +#ifdef DEBUG_ALLOC +    { +        BdrvCheckResult result = {0}; +        qcow2_check_refcounts(bs, &result, 0); +    } +#endif +    return ret; + + fail: +    g_free(s->unknown_header_fields); +    cleanup_unknown_header_ext(bs); +    qcow2_free_snapshots(bs); +    qcow2_refcount_close(bs); +    g_free(s->l1_table); +    if (s->l2_table_cache) { +        qcow2_cache_destroy(bs, s->l2_table_cache); +    } +    g_free(s->cluster_cache); +    qemu_vfree(s->cluster_data); +    return ret; +} + +static int qcow2_set_key(BlockDriverState *bs, const char *key) +{ +    BDRVQcowState *s = bs->opaque; +    uint8_t keybuf[16]; +    int len, i; + +    memset(keybuf, 0, 16); +    len = strlen(key); +    if (len > 16) +        len = 16; +    /* XXX: we could compress the chars to 7 bits to increase +       entropy */ +    for(i = 0;i < len;i++) { +        keybuf[i] = key[i]; +    } +    s->crypt_method = s->crypt_method_header; + +    if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0) +        return -1; +    if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0) +        return -1; +#if 0 +    /* test */ +    { +        uint8_t in[16]; +        uint8_t out[16]; +        uint8_t tmp[16]; +        for(i=0;i<16;i++) +            in[i] = i; +        AES_encrypt(in, tmp, &s->aes_encrypt_key); +        AES_decrypt(tmp, out, &s->aes_decrypt_key); +        for(i = 0; i < 16; i++) +            printf(" %02x", tmp[i]); +        printf("\n"); +        for(i = 0; i < 16; i++) +            printf(" %02x", out[i]); +        printf("\n"); +    } +#endif +    return 0; +} + +/* We have nothing to do for QCOW2 reopen, stubs just return + * success */ +static int qcow2_reopen_prepare(BDRVReopenState *state, +                                BlockReopenQueue *queue, Error **errp) +{ +    return 0; +} + +static int coroutine_fn qcow2_co_is_allocated(BlockDriverState *bs, +        int64_t sector_num, int nb_sectors, int *pnum) +{ +    BDRVQcowState *s = bs->opaque; +    uint64_t cluster_offset; +    int ret; + +    *pnum = nb_sectors; +    /* FIXME We can get errors here, but the bdrv_co_is_allocated interface +     * can't pass them on today */ +    qemu_co_mutex_lock(&s->lock); +    ret = qcow2_get_cluster_offset(bs, sector_num << 9, pnum, &cluster_offset); +    qemu_co_mutex_unlock(&s->lock); +    if (ret < 0) { +        *pnum = 0; +    } + +    return (cluster_offset != 0) || (ret == QCOW2_CLUSTER_ZERO); +} + +/* handle reading after the end of the backing file */ +int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov, +                  int64_t sector_num, int nb_sectors) +{ +    int n1; +    if ((sector_num + nb_sectors) <= bs->total_sectors) +        return nb_sectors; +    if (sector_num >= bs->total_sectors) +        n1 = 0; +    else +        n1 = bs->total_sectors - sector_num; + +    qemu_iovec_memset(qiov, 512 * n1, 0, 512 * (nb_sectors - n1)); + +    return n1; +} + +static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, +               
           int remaining_sectors, QEMUIOVector *qiov) +{ +    BDRVQcowState *s = bs->opaque; +    int index_in_cluster, n1; +    int ret; +    int cur_nr_sectors; /* number of sectors in current iteration */ +    uint64_t cluster_offset = 0; +    uint64_t bytes_done = 0; +    QEMUIOVector hd_qiov; +    uint8_t *cluster_data = NULL; + +    qemu_iovec_init(&hd_qiov, qiov->niov); + +    qemu_co_mutex_lock(&s->lock); + +    while (remaining_sectors != 0) { + +        /* prepare next request */ +        cur_nr_sectors = remaining_sectors; +        if (s->crypt_method) { +            cur_nr_sectors = MIN(cur_nr_sectors, +                QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors); +        } + +        ret = qcow2_get_cluster_offset(bs, sector_num << 9, +            &cur_nr_sectors, &cluster_offset); +        if (ret < 0) { +            goto fail; +        } + +        index_in_cluster = sector_num & (s->cluster_sectors - 1); + +        qemu_iovec_reset(&hd_qiov); +        qemu_iovec_concat(&hd_qiov, qiov, bytes_done, +            cur_nr_sectors * 512); + +        switch (ret) { +        case QCOW2_CLUSTER_UNALLOCATED: + +            if (bs->backing_hd) { +                /* read from the base image */ +                n1 = qcow2_backing_read1(bs->backing_hd, &hd_qiov, +                    sector_num, cur_nr_sectors); +                if (n1 > 0) { +                    BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO); +                    qemu_co_mutex_unlock(&s->lock); +                    ret = bdrv_co_readv(bs->backing_hd, sector_num, +                                        n1, &hd_qiov); +                    qemu_co_mutex_lock(&s->lock); +                    if (ret < 0) { +                        goto fail; +                    } +                } +            } else { +                /* Note: in this case, no need to wait */ +                qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors); +            } +            break; + +        case QCOW2_CLUSTER_ZERO: +            qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors); +            break; + +        case QCOW2_CLUSTER_COMPRESSED: +            /* add AIO support for compressed blocks ? */ +            ret = qcow2_decompress_cluster(bs, cluster_offset); +            if (ret < 0) { +                goto fail; +            } + +            qemu_iovec_from_buf(&hd_qiov, 0, +                s->cluster_cache + index_in_cluster * 512, +                512 * cur_nr_sectors); +            break; + +        case QCOW2_CLUSTER_NORMAL: +            if ((cluster_offset & 511) != 0) { +                ret = -EIO; +                goto fail; +            } + +            if (s->crypt_method) { +                /* +                 * For encrypted images, read everything into a temporary +                 * contiguous buffer on which the AES functions can work. 
+                 */ +                if (!cluster_data) { +                    cluster_data = +                        qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size); +                } + +                assert(cur_nr_sectors <= +                    QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors); +                qemu_iovec_reset(&hd_qiov); +                qemu_iovec_add(&hd_qiov, cluster_data, +                    512 * cur_nr_sectors); +            } + +            BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); +            qemu_co_mutex_unlock(&s->lock); +            ret = bdrv_co_readv(bs->file, +                                (cluster_offset >> 9) + index_in_cluster, +                                cur_nr_sectors, &hd_qiov); +            qemu_co_mutex_lock(&s->lock); +            if (ret < 0) { +                goto fail; +            } +            if (s->crypt_method) { +                qcow2_encrypt_sectors(s, sector_num,  cluster_data, +                    cluster_data, cur_nr_sectors, 0, &s->aes_decrypt_key); +                qemu_iovec_from_buf(qiov, bytes_done, +                    cluster_data, 512 * cur_nr_sectors); +            } +            break; + +        default: +            g_assert_not_reached(); +            ret = -EIO; +            goto fail; +        } + +        remaining_sectors -= cur_nr_sectors; +        sector_num += cur_nr_sectors; +        bytes_done += cur_nr_sectors * 512; +    } +    ret = 0; + +fail: +    qemu_co_mutex_unlock(&s->lock); + +    qemu_iovec_destroy(&hd_qiov); +    qemu_vfree(cluster_data); + +    return ret; +} + +static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, +                           int64_t sector_num, +                           int remaining_sectors, +                           QEMUIOVector *qiov) +{ +    BDRVQcowState *s = bs->opaque; +    int index_in_cluster; +    int n_end; +    int ret; +    int cur_nr_sectors; /* number of sectors in current iteration */ +    uint64_t cluster_offset; +    QEMUIOVector hd_qiov; +    uint64_t bytes_done = 0; +    uint8_t *cluster_data = NULL; +    QCowL2Meta *l2meta = NULL; + +    trace_qcow2_writev_start_req(qemu_coroutine_self(), sector_num, +                                 remaining_sectors); + +    qemu_iovec_init(&hd_qiov, qiov->niov); + +    s->cluster_cache_offset = -1; /* disable compressed cache */ + +    qemu_co_mutex_lock(&s->lock); + +    while (remaining_sectors != 0) { + +        l2meta = NULL; + +        trace_qcow2_writev_start_part(qemu_coroutine_self()); +        index_in_cluster = sector_num & (s->cluster_sectors - 1); +        n_end = index_in_cluster + remaining_sectors; +        if (s->crypt_method && +            n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors) { +            n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors; +        } + +        ret = qcow2_alloc_cluster_offset(bs, sector_num << 9, +            index_in_cluster, n_end, &cur_nr_sectors, &cluster_offset, &l2meta); +        if (ret < 0) { +            goto fail; +        } + +        assert((cluster_offset & 511) == 0); + +        qemu_iovec_reset(&hd_qiov); +        qemu_iovec_concat(&hd_qiov, qiov, bytes_done, +            cur_nr_sectors * 512); + +        if (s->crypt_method) { +            if (!cluster_data) { +                cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * +                                                 s->cluster_size); +            } + +            assert(hd_qiov.size <= +                   QCOW_MAX_CRYPT_CLUSTERS * 
s->cluster_size); +            qemu_iovec_to_buf(&hd_qiov, 0, cluster_data, hd_qiov.size); + +            qcow2_encrypt_sectors(s, sector_num, cluster_data, +                cluster_data, cur_nr_sectors, 1, &s->aes_encrypt_key); + +            qemu_iovec_reset(&hd_qiov); +            qemu_iovec_add(&hd_qiov, cluster_data, +                cur_nr_sectors * 512); +        } + +        qemu_co_mutex_unlock(&s->lock); +        BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); +        trace_qcow2_writev_data(qemu_coroutine_self(), +                                (cluster_offset >> 9) + index_in_cluster); +        ret = bdrv_co_writev(bs->file, +                             (cluster_offset >> 9) + index_in_cluster, +                             cur_nr_sectors, &hd_qiov); +        qemu_co_mutex_lock(&s->lock); +        if (ret < 0) { +            goto fail; +        } + +        while (l2meta != NULL) { +            QCowL2Meta *next; + +            ret = qcow2_alloc_cluster_link_l2(bs, l2meta); +            if (ret < 0) { +                goto fail; +            } + +            /* Take the request off the list of running requests */ +            if (l2meta->nb_clusters != 0) { +                QLIST_REMOVE(l2meta, next_in_flight); +            } + +            qemu_co_queue_restart_all(&l2meta->dependent_requests); + +            next = l2meta->next; +            g_free(l2meta); +            l2meta = next; +        } + +        remaining_sectors -= cur_nr_sectors; +        sector_num += cur_nr_sectors; +        bytes_done += cur_nr_sectors * 512; +        trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_nr_sectors); +    } +    ret = 0; + +fail: +    qemu_co_mutex_unlock(&s->lock); + +    while (l2meta != NULL) { +        QCowL2Meta *next; + +        if (l2meta->nb_clusters != 0) { +            QLIST_REMOVE(l2meta, next_in_flight); +        } +        qemu_co_queue_restart_all(&l2meta->dependent_requests); + +        next = l2meta->next; +        g_free(l2meta); +        l2meta = next; +    } + +    qemu_iovec_destroy(&hd_qiov); +    qemu_vfree(cluster_data); +    trace_qcow2_writev_done_req(qemu_coroutine_self(), ret); + +    return ret; +} + +static void qcow2_close(BlockDriverState *bs) +{ +    BDRVQcowState *s = bs->opaque; +    g_free(s->l1_table); + +    qcow2_cache_flush(bs, s->l2_table_cache); +    qcow2_cache_flush(bs, s->refcount_block_cache); + +    qcow2_mark_clean(bs); + +    qcow2_cache_destroy(bs, s->l2_table_cache); +    qcow2_cache_destroy(bs, s->refcount_block_cache); + +    g_free(s->unknown_header_fields); +    cleanup_unknown_header_ext(bs); + +    g_free(s->cluster_cache); +    qemu_vfree(s->cluster_data); +    qcow2_refcount_close(bs); +    qcow2_free_snapshots(bs); +} + +static void qcow2_invalidate_cache(BlockDriverState *bs) +{ +    BDRVQcowState *s = bs->opaque; +    int flags = s->flags; +    AES_KEY aes_encrypt_key; +    AES_KEY aes_decrypt_key; +    uint32_t crypt_method = 0; +    QDict *options; + +    /* +     * Backing files are read-only which makes all of their metadata immutable, +     * that means we don't have to worry about reopening them here. 
+     */ + +    if (s->crypt_method) { +        crypt_method = s->crypt_method; +        memcpy(&aes_encrypt_key, &s->aes_encrypt_key, sizeof(aes_encrypt_key)); +        memcpy(&aes_decrypt_key, &s->aes_decrypt_key, sizeof(aes_decrypt_key)); +    } + +    qcow2_close(bs); + +    options = qdict_new(); +    qdict_put(options, QCOW2_OPT_LAZY_REFCOUNTS, +              qbool_from_int(s->use_lazy_refcounts)); + +    memset(s, 0, sizeof(BDRVQcowState)); +    qcow2_open(bs, options, flags); + +    QDECREF(options); + +    if (crypt_method) { +        s->crypt_method = crypt_method; +        memcpy(&s->aes_encrypt_key, &aes_encrypt_key, sizeof(aes_encrypt_key)); +        memcpy(&s->aes_decrypt_key, &aes_decrypt_key, sizeof(aes_decrypt_key)); +    } +} + +static size_t header_ext_add(char *buf, uint32_t magic, const void *s, +    size_t len, size_t buflen) +{ +    QCowExtension *ext_backing_fmt = (QCowExtension*) buf; +    size_t ext_len = sizeof(QCowExtension) + ((len + 7) & ~7); + +    if (buflen < ext_len) { +        return -ENOSPC; +    } + +    *ext_backing_fmt = (QCowExtension) { +        .magic  = cpu_to_be32(magic), +        .len    = cpu_to_be32(len), +    }; +    memcpy(buf + sizeof(QCowExtension), s, len); + +    return ext_len; +} + +/* + * Updates the qcow2 header, including the variable length parts of it, i.e. + * the backing file name and all extensions. qcow2 was not designed to allow + * such changes, so if we run out of space (we can only use the first cluster) + * this function may fail. + * + * Returns 0 on success, -errno in error cases. + */ +int qcow2_update_header(BlockDriverState *bs) +{ +    BDRVQcowState *s = bs->opaque; +    QCowHeader *header; +    char *buf; +    size_t buflen = s->cluster_size; +    int ret; +    uint64_t total_size; +    uint32_t refcount_table_clusters; +    size_t header_length; +    Qcow2UnknownHeaderExtension *uext; + +    buf = qemu_blockalign(bs, buflen); + +    /* Header structure */ +    header = (QCowHeader*) buf; + +    if (buflen < sizeof(*header)) { +        ret = -ENOSPC; +        goto fail; +    } + +    header_length = sizeof(*header) + s->unknown_header_fields_size; +    total_size = bs->total_sectors * BDRV_SECTOR_SIZE; +    refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3); + +    *header = (QCowHeader) { +        /* Version 2 fields */ +        .magic                  = cpu_to_be32(QCOW_MAGIC), +        .version                = cpu_to_be32(s->qcow_version), +        .backing_file_offset    = 0, +        .backing_file_size      = 0, +        .cluster_bits           = cpu_to_be32(s->cluster_bits), +        .size                   = cpu_to_be64(total_size), +        .crypt_method           = cpu_to_be32(s->crypt_method_header), +        .l1_size                = cpu_to_be32(s->l1_size), +        .l1_table_offset        = cpu_to_be64(s->l1_table_offset), +        .refcount_table_offset  = cpu_to_be64(s->refcount_table_offset), +        .refcount_table_clusters = cpu_to_be32(refcount_table_clusters), +        .nb_snapshots           = cpu_to_be32(s->nb_snapshots), +        .snapshots_offset       = cpu_to_be64(s->snapshots_offset), + +        /* Version 3 fields */ +        .incompatible_features  = cpu_to_be64(s->incompatible_features), +        .compatible_features    = cpu_to_be64(s->compatible_features), +        .autoclear_features     = cpu_to_be64(s->autoclear_features), +        .refcount_order         = cpu_to_be32(3 + REFCOUNT_SHIFT), +        .header_length          = cpu_to_be32(header_length), +    
}; + +    /* For older versions, write a shorter header */ +    switch (s->qcow_version) { +    case 2: +        ret = offsetof(QCowHeader, incompatible_features); +        break; +    case 3: +        ret = sizeof(*header); +        break; +    default: +        ret = -EINVAL; +        goto fail; +    } + +    buf += ret; +    buflen -= ret; +    memset(buf, 0, buflen); + +    /* Preserve any unknown field in the header */ +    if (s->unknown_header_fields_size) { +        if (buflen < s->unknown_header_fields_size) { +            ret = -ENOSPC; +            goto fail; +        } + +        memcpy(buf, s->unknown_header_fields, s->unknown_header_fields_size); +        buf += s->unknown_header_fields_size; +        buflen -= s->unknown_header_fields_size; +    } + +    /* Backing file format header extension */ +    if (*bs->backing_format) { +        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BACKING_FORMAT, +                             bs->backing_format, strlen(bs->backing_format), +                             buflen); +        if (ret < 0) { +            goto fail; +        } + +        buf += ret; +        buflen -= ret; +    } + +    /* Feature table */ +    Qcow2Feature features[] = { +        { +            .type = QCOW2_FEAT_TYPE_INCOMPATIBLE, +            .bit  = QCOW2_INCOMPAT_DIRTY_BITNR, +            .name = "dirty bit", +        }, +        { +            .type = QCOW2_FEAT_TYPE_COMPATIBLE, +            .bit  = QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR, +            .name = "lazy refcounts", +        }, +    }; + +    ret = header_ext_add(buf, QCOW2_EXT_MAGIC_FEATURE_TABLE, +                         features, sizeof(features), buflen); +    if (ret < 0) { +        goto fail; +    } +    buf += ret; +    buflen -= ret; + +    /* Keep unknown header extensions */ +    QLIST_FOREACH(uext, &s->unknown_header_ext, next) { +        ret = header_ext_add(buf, uext->magic, uext->data, uext->len, buflen); +        if (ret < 0) { +            goto fail; +        } + +        buf += ret; +        buflen -= ret; +    } + +    /* End of header extensions */ +    ret = header_ext_add(buf, QCOW2_EXT_MAGIC_END, NULL, 0, buflen); +    if (ret < 0) { +        goto fail; +    } + +    buf += ret; +    buflen -= ret; + +    /* Backing file name */ +    if (*bs->backing_file) { +        size_t backing_file_len = strlen(bs->backing_file); + +        if (buflen < backing_file_len) { +            ret = -ENOSPC; +            goto fail; +        } + +        /* Using strncpy is ok here, since buf is not NUL-terminated. 
*/ +        strncpy(buf, bs->backing_file, buflen); + +        header->backing_file_offset = cpu_to_be64(buf - ((char*) header)); +        header->backing_file_size   = cpu_to_be32(backing_file_len); +    } + +    /* Write the new header */ +    ret = bdrv_pwrite(bs->file, 0, header, s->cluster_size); +    if (ret < 0) { +        goto fail; +    } + +    ret = 0; +fail: +    qemu_vfree(header); +    return ret; +} + +static int qcow2_change_backing_file(BlockDriverState *bs, +    const char *backing_file, const char *backing_fmt) +{ +    pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: ""); +    pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: ""); + +    return qcow2_update_header(bs); +} + +static int preallocate(BlockDriverState *bs) +{ +    uint64_t nb_sectors; +    uint64_t offset; +    uint64_t host_offset = 0; +    int num; +    int ret; +    QCowL2Meta *meta; + +    nb_sectors = bdrv_getlength(bs) >> 9; +    offset = 0; + +    while (nb_sectors) { +        num = MIN(nb_sectors, INT_MAX >> 9); +        ret = qcow2_alloc_cluster_offset(bs, offset, 0, num, &num, +                                         &host_offset, &meta); +        if (ret < 0) { +            return ret; +        } + +        ret = qcow2_alloc_cluster_link_l2(bs, meta); +        if (ret < 0) { +            qcow2_free_any_clusters(bs, meta->alloc_offset, meta->nb_clusters, +                                    QCOW2_DISCARD_NEVER); +            return ret; +        } + +        /* There are no dependent requests, but we need to remove our request +         * from the list of in-flight requests */ +        if (meta != NULL) { +            QLIST_REMOVE(meta, next_in_flight); +        } + +        /* TODO Preallocate data if requested */ + +        nb_sectors -= num; +        offset += num << 9; +    } + +    /* +     * It is expected that the image file is large enough to actually contain +     * all of the allocated clusters (otherwise we get failing reads after +     * EOF). Extend the image to the last allocated sector. +     */ +    if (host_offset != 0) { +        uint8_t buf[512]; +        memset(buf, 0, 512); +        ret = bdrv_write(bs->file, (host_offset >> 9) + num - 1, buf, 1); +        if (ret < 0) { +            return ret; +        } +    } + +    return 0; +} + +static int qcow2_create2(const char *filename, int64_t total_size, +                         const char *backing_file, const char *backing_format, +                         int flags, size_t cluster_size, int prealloc, +                         QEMUOptionParameter *options, int version) +{ +    /* Calculate cluster_bits */ +    int cluster_bits; +    cluster_bits = ffs(cluster_size) - 1; +    if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS || +        (1 << cluster_bits) != cluster_size) +    { +        error_report( +            "Cluster size must be a power of two between %d and %dk", +            1 << MIN_CLUSTER_BITS, 1 << (MAX_CLUSTER_BITS - 10)); +        return -EINVAL; +    } + +    /* +     * Open the image file and write a minimal qcow2 header. +     * +     * We keep things simple and start with a zero-sized image. We also +     * do without refcount blocks or a L1 table for now. We'll fix the +     * inconsistency later. 
+     * +     * We do need a refcount table because growing the refcount table means +     * allocating two new refcount blocks - the seconds of which would be at +     * 2 GB for 64k clusters, and we don't want to have a 2 GB initial file +     * size for any qcow2 image. +     */ +    BlockDriverState* bs; +    QCowHeader header; +    uint8_t* refcount_table; +    int ret; + +    ret = bdrv_create_file(filename, options); +    if (ret < 0) { +        return ret; +    } + +    ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR); +    if (ret < 0) { +        return ret; +    } + +    /* Write the header */ +    memset(&header, 0, sizeof(header)); +    header.magic = cpu_to_be32(QCOW_MAGIC); +    header.version = cpu_to_be32(version); +    header.cluster_bits = cpu_to_be32(cluster_bits); +    header.size = cpu_to_be64(0); +    header.l1_table_offset = cpu_to_be64(0); +    header.l1_size = cpu_to_be32(0); +    header.refcount_table_offset = cpu_to_be64(cluster_size); +    header.refcount_table_clusters = cpu_to_be32(1); +    header.refcount_order = cpu_to_be32(3 + REFCOUNT_SHIFT); +    header.header_length = cpu_to_be32(sizeof(header)); + +    if (flags & BLOCK_FLAG_ENCRYPT) { +        header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES); +    } else { +        header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE); +    } + +    if (flags & BLOCK_FLAG_LAZY_REFCOUNTS) { +        header.compatible_features |= +            cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS); +    } + +    ret = bdrv_pwrite(bs, 0, &header, sizeof(header)); +    if (ret < 0) { +        goto out; +    } + +    /* Write an empty refcount table */ +    refcount_table = g_malloc0(cluster_size); +    ret = bdrv_pwrite(bs, cluster_size, refcount_table, cluster_size); +    g_free(refcount_table); + +    if (ret < 0) { +        goto out; +    } + +    bdrv_close(bs); + +    /* +     * And now open the image and make it consistent first (i.e. increase the +     * refcount of the cluster that is occupied by the header and the refcount +     * table) +     */ +    BlockDriver* drv = bdrv_find_format("qcow2"); +    assert(drv != NULL); +    ret = bdrv_open(bs, filename, NULL, +        BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, drv); +    if (ret < 0) { +        goto out; +    } + +    ret = qcow2_alloc_clusters(bs, 2 * cluster_size); +    if (ret < 0) { +        goto out; + +    } else if (ret != 0) { +        error_report("Huh, first cluster in empty image is already in use?"); +        abort(); +    } + +    /* Okay, now that we have a valid image, let's give it the right size */ +    ret = bdrv_truncate(bs, total_size * BDRV_SECTOR_SIZE); +    if (ret < 0) { +        goto out; +    } + +    /* Want a backing file? 
There you go.*/ +    if (backing_file) { +        ret = bdrv_change_backing_file(bs, backing_file, backing_format); +        if (ret < 0) { +            goto out; +        } +    } + +    /* And if we're supposed to preallocate metadata, do that now */ +    if (prealloc) { +        BDRVQcowState *s = bs->opaque; +        qemu_co_mutex_lock(&s->lock); +        ret = preallocate(bs); +        qemu_co_mutex_unlock(&s->lock); +        if (ret < 0) { +            goto out; +        } +    } + +    ret = 0; +out: +    bdrv_delete(bs); +    return ret; +} + +static int qcow2_create(const char *filename, QEMUOptionParameter *options) +{ +    const char *backing_file = NULL; +    const char *backing_fmt = NULL; +    uint64_t sectors = 0; +    int flags = 0; +    size_t cluster_size = DEFAULT_CLUSTER_SIZE; +    int prealloc = 0; +    int version = 2; + +    /* Read out options */ +    while (options && options->name) { +        if (!strcmp(options->name, BLOCK_OPT_SIZE)) { +            sectors = options->value.n / 512; +        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) { +            backing_file = options->value.s; +        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FMT)) { +            backing_fmt = options->value.s; +        } else if (!strcmp(options->name, BLOCK_OPT_ENCRYPT)) { +            flags |= options->value.n ? BLOCK_FLAG_ENCRYPT : 0; +        } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) { +            if (options->value.n) { +                cluster_size = options->value.n; +            } +        } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) { +            if (!options->value.s || !strcmp(options->value.s, "off")) { +                prealloc = 0; +            } else if (!strcmp(options->value.s, "metadata")) { +                prealloc = 1; +            } else { +                fprintf(stderr, "Invalid preallocation mode: '%s'\n", +                    options->value.s); +                return -EINVAL; +            } +        } else if (!strcmp(options->name, BLOCK_OPT_COMPAT_LEVEL)) { +            if (!options->value.s || !strcmp(options->value.s, "0.10")) { +                version = 2; +            } else if (!strcmp(options->value.s, "1.1")) { +                version = 3; +            } else { +                fprintf(stderr, "Invalid compatibility level: '%s'\n", +                    options->value.s); +                return -EINVAL; +            } +        } else if (!strcmp(options->name, BLOCK_OPT_LAZY_REFCOUNTS)) { +            flags |= options->value.n ? 
BLOCK_FLAG_LAZY_REFCOUNTS : 0; +        } +        options++; +    } + +    if (backing_file && prealloc) { +        fprintf(stderr, "Backing file and preallocation cannot be used at " +            "the same time\n"); +        return -EINVAL; +    } + +    if (version < 3 && (flags & BLOCK_FLAG_LAZY_REFCOUNTS)) { +        fprintf(stderr, "Lazy refcounts only supported with compatibility " +                "level 1.1 and above (use compat=1.1 or greater)\n"); +        return -EINVAL; +    } + +    return qcow2_create2(filename, sectors, backing_file, backing_fmt, flags, +                         cluster_size, prealloc, options, version); +} + +static int qcow2_make_empty(BlockDriverState *bs) +{ +#if 0 +    /* XXX: not correct */ +    BDRVQcowState *s = bs->opaque; +    uint32_t l1_length = s->l1_size * sizeof(uint64_t); +    int ret; + +    memset(s->l1_table, 0, l1_length); +    if (bdrv_pwrite(bs->file, s->l1_table_offset, s->l1_table, l1_length) < 0) +        return -1; +    ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length); +    if (ret < 0) +        return ret; + +    l2_cache_reset(bs); +#endif +    return 0; +} + +static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs, +    int64_t sector_num, int nb_sectors) +{ +    int ret; +    BDRVQcowState *s = bs->opaque; + +    /* Emulate misaligned zero writes */ +    if (sector_num % s->cluster_sectors || nb_sectors % s->cluster_sectors) { +        return -ENOTSUP; +    } + +    /* Whatever is left can use real zero clusters */ +    qemu_co_mutex_lock(&s->lock); +    ret = qcow2_zero_clusters(bs, sector_num << BDRV_SECTOR_BITS, +        nb_sectors); +    qemu_co_mutex_unlock(&s->lock); + +    return ret; +} + +static coroutine_fn int qcow2_co_discard(BlockDriverState *bs, +    int64_t sector_num, int nb_sectors) +{ +    int ret; +    BDRVQcowState *s = bs->opaque; + +    qemu_co_mutex_lock(&s->lock); +    ret = qcow2_discard_clusters(bs, sector_num << BDRV_SECTOR_BITS, +        nb_sectors); +    qemu_co_mutex_unlock(&s->lock); +    return ret; +} + +static int qcow2_truncate(BlockDriverState *bs, int64_t offset) +{ +    BDRVQcowState *s = bs->opaque; +    int64_t new_l1_size; +    int ret; + +    if (offset & 511) { +        error_report("The new size must be a multiple of 512"); +        return -EINVAL; +    } + +    /* cannot proceed if image has snapshots */ +    if (s->nb_snapshots) { +        error_report("Can't resize an image which has snapshots"); +        return -ENOTSUP; +    } + +    /* shrinking is currently not supported */ +    if (offset < bs->total_sectors * 512) { +        error_report("qcow2 doesn't support shrinking images yet"); +        return -ENOTSUP; +    } + +    new_l1_size = size_to_l1(s, offset); +    ret = qcow2_grow_l1_table(bs, new_l1_size, true); +    if (ret < 0) { +        return ret; +    } + +    /* write updated header.size */ +    offset = cpu_to_be64(offset); +    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, size), +                           &offset, sizeof(uint64_t)); +    if (ret < 0) { +        return ret; +    } + +    s->l1_vm_state_index = new_l1_size; +    return 0; +} + +/* XXX: put compressed sectors first, then all the cluster aligned +   tables to avoid losing bytes in alignment */ +static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num, +                                  const uint8_t *buf, int nb_sectors) +{ +    BDRVQcowState *s = bs->opaque; +    z_stream strm; +    int ret, out_len; +    uint8_t *out_buf; +    uint64_t 
cluster_offset; + +    if (nb_sectors == 0) { +        /* align end of file to a sector boundary to ease reading with +           sector based I/Os */ +        cluster_offset = bdrv_getlength(bs->file); +        cluster_offset = (cluster_offset + 511) & ~511; +        bdrv_truncate(bs->file, cluster_offset); +        return 0; +    } + +    if (nb_sectors != s->cluster_sectors) { +        ret = -EINVAL; + +        /* Zero-pad last write if image size is not cluster aligned */ +        if (sector_num + nb_sectors == bs->total_sectors && +            nb_sectors < s->cluster_sectors) { +            uint8_t *pad_buf = qemu_blockalign(bs, s->cluster_size); +            memset(pad_buf, 0, s->cluster_size); +            memcpy(pad_buf, buf, nb_sectors * BDRV_SECTOR_SIZE); +            ret = qcow2_write_compressed(bs, sector_num, +                                         pad_buf, s->cluster_sectors); +            qemu_vfree(pad_buf); +        } +        return ret; +    } + +    out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128); + +    /* best compression, small window, no zlib header */ +    memset(&strm, 0, sizeof(strm)); +    ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION, +                       Z_DEFLATED, -12, +                       9, Z_DEFAULT_STRATEGY); +    if (ret != 0) { +        ret = -EINVAL; +        goto fail; +    } + +    strm.avail_in = s->cluster_size; +    strm.next_in = (uint8_t *)buf; +    strm.avail_out = s->cluster_size; +    strm.next_out = out_buf; + +    ret = deflate(&strm, Z_FINISH); +    if (ret != Z_STREAM_END && ret != Z_OK) { +        deflateEnd(&strm); +        ret = -EINVAL; +        goto fail; +    } +    out_len = strm.next_out - out_buf; + +    deflateEnd(&strm); + +    if (ret != Z_STREAM_END || out_len >= s->cluster_size) { +        /* could not compress: write normal cluster */ +        ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors); +        if (ret < 0) { +            goto fail; +        } +    } else { +        cluster_offset = qcow2_alloc_compressed_cluster_offset(bs, +            sector_num << 9, out_len); +        if (!cluster_offset) { +            ret = -EIO; +            goto fail; +        } +        cluster_offset &= s->cluster_offset_mask; +        BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED); +        ret = bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len); +        if (ret < 0) { +            goto fail; +        } +    } + +    ret = 0; +fail: +    g_free(out_buf); +    return ret; +} + +static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs) +{ +    BDRVQcowState *s = bs->opaque; +    int ret; + +    qemu_co_mutex_lock(&s->lock); +    ret = qcow2_cache_flush(bs, s->l2_table_cache); +    if (ret < 0) { +        qemu_co_mutex_unlock(&s->lock); +        return ret; +    } + +    if (qcow2_need_accurate_refcounts(s)) { +        ret = qcow2_cache_flush(bs, s->refcount_block_cache); +        if (ret < 0) { +            qemu_co_mutex_unlock(&s->lock); +            return ret; +        } +    } +    qemu_co_mutex_unlock(&s->lock); + +    return 0; +} + +static int64_t qcow2_vm_state_offset(BDRVQcowState *s) +{ +	return (int64_t)s->l1_vm_state_index << (s->cluster_bits + s->l2_bits); +} + +static int qcow2_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ +    BDRVQcowState *s = bs->opaque; +    bdi->cluster_size = s->cluster_size; +    bdi->vm_state_offset = qcow2_vm_state_offset(s); +    return 0; +} + +#if 0 +static void dump_refcounts(BlockDriverState *bs) +{ +    BDRVQcowState *s = 
bs->opaque; +    int64_t nb_clusters, k, k1, size; +    int refcount; + +    size = bdrv_getlength(bs->file); +    nb_clusters = size_to_clusters(s, size); +    for(k = 0; k < nb_clusters;) { +        k1 = k; +        refcount = get_refcount(bs, k); +        k++; +        while (k < nb_clusters && get_refcount(bs, k) == refcount) +            k++; +        printf("%" PRId64 ": refcount=%d nb=%" PRId64 "\n", k, refcount, +               k - k1); +    } +} +#endif + +static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, +                              int64_t pos) +{ +    BDRVQcowState *s = bs->opaque; +    int growable = bs->growable; +    int ret; + +    BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE); +    bs->growable = 1; +    ret = bdrv_pwritev(bs, qcow2_vm_state_offset(s) + pos, qiov); +    bs->growable = growable; + +    return ret; +} + +static int qcow2_load_vmstate(BlockDriverState *bs, uint8_t *buf, +                              int64_t pos, int size) +{ +    BDRVQcowState *s = bs->opaque; +    int growable = bs->growable; +    int ret; + +    BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD); +    bs->growable = 1; +    ret = bdrv_pread(bs, qcow2_vm_state_offset(s) + pos, buf, size); +    bs->growable = growable; + +    return ret; +} + +static QEMUOptionParameter qcow2_create_options[] = { +    { +        .name = BLOCK_OPT_SIZE, +        .type = OPT_SIZE, +        .help = "Virtual disk size" +    }, +    { +        .name = BLOCK_OPT_COMPAT_LEVEL, +        .type = OPT_STRING, +        .help = "Compatibility level (0.10 or 1.1)" +    }, +    { +        .name = BLOCK_OPT_BACKING_FILE, +        .type = OPT_STRING, +        .help = "File name of a base image" +    }, +    { +        .name = BLOCK_OPT_BACKING_FMT, +        .type = OPT_STRING, +        .help = "Image format of the base image" +    }, +    { +        .name = BLOCK_OPT_ENCRYPT, +        .type = OPT_FLAG, +        .help = "Encrypt the image" +    }, +    { +        .name = BLOCK_OPT_CLUSTER_SIZE, +        .type = OPT_SIZE, +        .help = "qcow2 cluster size", +        .value = { .n = DEFAULT_CLUSTER_SIZE }, +    }, +    { +        .name = BLOCK_OPT_PREALLOC, +        .type = OPT_STRING, +        .help = "Preallocation mode (allowed values: off, metadata)" +    }, +    { +        .name = BLOCK_OPT_LAZY_REFCOUNTS, +        .type = OPT_FLAG, +        .help = "Postpone refcount updates", +    }, +    { NULL } +}; + +static BlockDriver bdrv_qcow2 = { +    .format_name        = "qcow2", +    .instance_size      = sizeof(BDRVQcowState), +    .bdrv_probe         = qcow2_probe, +    .bdrv_open          = qcow2_open, +    .bdrv_close         = qcow2_close, +    .bdrv_reopen_prepare  = qcow2_reopen_prepare, +    .bdrv_create        = qcow2_create, +    .bdrv_has_zero_init = bdrv_has_zero_init_1, +    .bdrv_co_is_allocated = qcow2_co_is_allocated, +    .bdrv_set_key       = qcow2_set_key, +    .bdrv_make_empty    = qcow2_make_empty, + +    .bdrv_co_readv          = qcow2_co_readv, +    .bdrv_co_writev         = qcow2_co_writev, +    .bdrv_co_flush_to_os    = qcow2_co_flush_to_os, + +    .bdrv_co_write_zeroes   = qcow2_co_write_zeroes, +    .bdrv_co_discard        = qcow2_co_discard, +    .bdrv_truncate          = qcow2_truncate, +    .bdrv_write_compressed  = qcow2_write_compressed, + +    .bdrv_snapshot_create   = qcow2_snapshot_create, +    .bdrv_snapshot_goto     = qcow2_snapshot_goto, +    .bdrv_snapshot_delete   = qcow2_snapshot_delete, +    .bdrv_snapshot_list     = qcow2_snapshot_list, +    .bdrv_snapshot_load_tmp   
  = qcow2_snapshot_load_tmp, +    .bdrv_get_info      = qcow2_get_info, + +    .bdrv_save_vmstate    = qcow2_save_vmstate, +    .bdrv_load_vmstate    = qcow2_load_vmstate, + +    .bdrv_change_backing_file   = qcow2_change_backing_file, + +    .bdrv_invalidate_cache      = qcow2_invalidate_cache, + +    .create_options = qcow2_create_options, +    .bdrv_check = qcow2_check, +}; + +static void bdrv_qcow2_init(void) +{ +    bdrv_register(&bdrv_qcow2); +} + +block_init(bdrv_qcow2_init); diff --git a/contrib/qemu/block/qcow2.h b/contrib/qemu/block/qcow2.h new file mode 100644 index 000000000..3b2d5cda7 --- /dev/null +++ b/contrib/qemu/block/qcow2.h @@ -0,0 +1,437 @@ +/* + * Block driver for the QCOW version 2 format + * + * Copyright (c) 2004-2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef BLOCK_QCOW2_H +#define BLOCK_QCOW2_H + +#include "qemu/aes.h" +#include "block/coroutine.h" + +//#define DEBUG_ALLOC +//#define DEBUG_ALLOC2 +//#define DEBUG_EXT + +#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb) + +#define QCOW_CRYPT_NONE 0 +#define QCOW_CRYPT_AES  1 + +#define QCOW_MAX_CRYPT_CLUSTERS 32 + +/* indicate that the refcount of the referenced cluster is exactly one. */ +#define QCOW_OFLAG_COPIED     (1LL << 63) +/* indicate that the cluster is compressed (they never have the copied flag) */ +#define QCOW_OFLAG_COMPRESSED (1LL << 62) +/* The cluster reads as all zeros */ +#define QCOW_OFLAG_ZERO (1LL << 0) + +#define REFCOUNT_SHIFT 1 /* refcount size is 2 bytes */ + +#define MIN_CLUSTER_BITS 9 +#define MAX_CLUSTER_BITS 21 + +#define L2_CACHE_SIZE 16 + +/* Must be at least 4 to cover all cases of refcount table growth */ +#define REFCOUNT_CACHE_SIZE 4 + +#define DEFAULT_CLUSTER_SIZE 65536 + + +#define QCOW2_OPT_LAZY_REFCOUNTS "lazy_refcounts" +#define QCOW2_OPT_DISCARD_REQUEST "pass_discard_request" +#define QCOW2_OPT_DISCARD_SNAPSHOT "pass_discard_snapshot" +#define QCOW2_OPT_DISCARD_OTHER "pass_discard_other" + +typedef struct QCowHeader { +    uint32_t magic; +    uint32_t version; +    uint64_t backing_file_offset; +    uint32_t backing_file_size; +    uint32_t cluster_bits; +    uint64_t size; /* in bytes */ +    uint32_t crypt_method; +    uint32_t l1_size; /* XXX: save number of clusters instead ? 
*/ +    uint64_t l1_table_offset; +    uint64_t refcount_table_offset; +    uint32_t refcount_table_clusters; +    uint32_t nb_snapshots; +    uint64_t snapshots_offset; + +    /* The following fields are only valid for version >= 3 */ +    uint64_t incompatible_features; +    uint64_t compatible_features; +    uint64_t autoclear_features; + +    uint32_t refcount_order; +    uint32_t header_length; +} QCowHeader; + +typedef struct QCowSnapshot { +    uint64_t l1_table_offset; +    uint32_t l1_size; +    char *id_str; +    char *name; +    uint64_t disk_size; +    uint64_t vm_state_size; +    uint32_t date_sec; +    uint32_t date_nsec; +    uint64_t vm_clock_nsec; +} QCowSnapshot; + +struct Qcow2Cache; +typedef struct Qcow2Cache Qcow2Cache; + +typedef struct Qcow2UnknownHeaderExtension { +    uint32_t magic; +    uint32_t len; +    QLIST_ENTRY(Qcow2UnknownHeaderExtension) next; +    uint8_t data[]; +} Qcow2UnknownHeaderExtension; + +enum { +    QCOW2_FEAT_TYPE_INCOMPATIBLE    = 0, +    QCOW2_FEAT_TYPE_COMPATIBLE      = 1, +    QCOW2_FEAT_TYPE_AUTOCLEAR       = 2, +}; + +/* Incompatible feature bits */ +enum { +    QCOW2_INCOMPAT_DIRTY_BITNR   = 0, +    QCOW2_INCOMPAT_DIRTY         = 1 << QCOW2_INCOMPAT_DIRTY_BITNR, + +    QCOW2_INCOMPAT_MASK          = QCOW2_INCOMPAT_DIRTY, +}; + +/* Compatible feature bits */ +enum { +    QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR = 0, +    QCOW2_COMPAT_LAZY_REFCOUNTS       = 1 << QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR, + +    QCOW2_COMPAT_FEAT_MASK            = QCOW2_COMPAT_LAZY_REFCOUNTS, +}; + +enum qcow2_discard_type { +    QCOW2_DISCARD_NEVER = 0, +    QCOW2_DISCARD_ALWAYS, +    QCOW2_DISCARD_REQUEST, +    QCOW2_DISCARD_SNAPSHOT, +    QCOW2_DISCARD_OTHER, +    QCOW2_DISCARD_MAX +}; + +typedef struct Qcow2Feature { +    uint8_t type; +    uint8_t bit; +    char    name[46]; +} QEMU_PACKED Qcow2Feature; + +typedef struct Qcow2DiscardRegion { +    BlockDriverState *bs; +    uint64_t offset; +    uint64_t bytes; +    QTAILQ_ENTRY(Qcow2DiscardRegion) next; +} Qcow2DiscardRegion; + +typedef struct BDRVQcowState { +    int cluster_bits; +    int cluster_size; +    int cluster_sectors; +    int l2_bits; +    int l2_size; +    int l1_size; +    int l1_vm_state_index; +    int csize_shift; +    int csize_mask; +    uint64_t cluster_offset_mask; +    uint64_t l1_table_offset; +    uint64_t *l1_table; + +    Qcow2Cache* l2_table_cache; +    Qcow2Cache* refcount_block_cache; + +    uint8_t *cluster_cache; +    uint8_t *cluster_data; +    uint64_t cluster_cache_offset; +    QLIST_HEAD(QCowClusterAlloc, QCowL2Meta) cluster_allocs; + +    uint64_t *refcount_table; +    uint64_t refcount_table_offset; +    uint32_t refcount_table_size; +    int64_t free_cluster_index; +    int64_t free_byte_offset; + +    CoMutex lock; + +    uint32_t crypt_method; /* current crypt method, 0 if no key yet */ +    uint32_t crypt_method_header; +    AES_KEY aes_encrypt_key; +    AES_KEY aes_decrypt_key; +    uint64_t snapshots_offset; +    int snapshots_size; +    int nb_snapshots; +    QCowSnapshot *snapshots; + +    int flags; +    int qcow_version; +    bool use_lazy_refcounts; + +    bool discard_passthrough[QCOW2_DISCARD_MAX]; + +    uint64_t incompatible_features; +    uint64_t compatible_features; +    uint64_t autoclear_features; + +    size_t unknown_header_fields_size; +    void* unknown_header_fields; +    QLIST_HEAD(, Qcow2UnknownHeaderExtension) unknown_header_ext; +    QTAILQ_HEAD (, Qcow2DiscardRegion) discards; +    bool cache_discards; +} BDRVQcowState; + +/* XXX: use std qcow open 
function ? */ +typedef struct QCowCreateState { +    int cluster_size; +    int cluster_bits; +    uint16_t *refcount_block; +    uint64_t *refcount_table; +    int64_t l1_table_offset; +    int64_t refcount_table_offset; +    int64_t refcount_block_offset; +} QCowCreateState; + +struct QCowAIOCB; + +typedef struct Qcow2COWRegion { +    /** +     * Offset of the COW region in bytes from the start of the first cluster +     * touched by the request. +     */ +    uint64_t    offset; + +    /** Number of sectors to copy */ +    int         nb_sectors; +} Qcow2COWRegion; + +/** + * Describes an in-flight (part of a) write request that writes to clusters + * that are not referenced in their L2 table yet. + */ +typedef struct QCowL2Meta +{ +    /** Guest offset of the first newly allocated cluster */ +    uint64_t offset; + +    /** Host offset of the first newly allocated cluster */ +    uint64_t alloc_offset; + +    /** +     * Number of sectors from the start of the first allocated cluster to +     * the end of the (possibly shortened) request +     */ +    int nb_available; + +    /** Number of newly allocated clusters */ +    int nb_clusters; + +    /** +     * Requests that overlap with this allocation and wait to be restarted +     * when the allocating request has completed. +     */ +    CoQueue dependent_requests; + +    /** +     * The COW Region between the start of the first allocated cluster and the +     * area the guest actually writes to. +     */ +    Qcow2COWRegion cow_start; + +    /** +     * The COW Region between the area the guest actually writes to and the +     * end of the last allocated cluster. +     */ +    Qcow2COWRegion cow_end; + +    /** Pointer to next L2Meta of the same write request */ +    struct QCowL2Meta *next; + +    QLIST_ENTRY(QCowL2Meta) next_in_flight; +} QCowL2Meta; + +enum { +    QCOW2_CLUSTER_UNALLOCATED, +    QCOW2_CLUSTER_NORMAL, +    QCOW2_CLUSTER_COMPRESSED, +    QCOW2_CLUSTER_ZERO +}; + +#define L1E_OFFSET_MASK 0x00ffffffffffff00ULL +#define L2E_OFFSET_MASK 0x00ffffffffffff00ULL +#define L2E_COMPRESSED_OFFSET_SIZE_MASK 0x3fffffffffffffffULL + +#define REFT_OFFSET_MASK 0xffffffffffffff00ULL + +static inline int64_t start_of_cluster(BDRVQcowState *s, int64_t offset) +{ +    return offset & ~(s->cluster_size - 1); +} + +static inline int64_t offset_into_cluster(BDRVQcowState *s, int64_t offset) +{ +    return offset & (s->cluster_size - 1); +} + +static inline int size_to_clusters(BDRVQcowState *s, int64_t size) +{ +    return (size + (s->cluster_size - 1)) >> s->cluster_bits; +} + +static inline int64_t size_to_l1(BDRVQcowState *s, int64_t size) +{ +    int shift = s->cluster_bits + s->l2_bits; +    return (size + (1ULL << shift) - 1) >> shift; +} + +static inline int offset_to_l2_index(BDRVQcowState *s, int64_t offset) +{ +    return (offset >> s->cluster_bits) & (s->l2_size - 1); +} + +static inline int64_t align_offset(int64_t offset, int n) +{ +    offset = (offset + n - 1) & ~(n - 1); +    return offset; +} + +static inline int qcow2_get_cluster_type(uint64_t l2_entry) +{ +    if (l2_entry & QCOW_OFLAG_COMPRESSED) { +        return QCOW2_CLUSTER_COMPRESSED; +    } else if (l2_entry & QCOW_OFLAG_ZERO) { +        return QCOW2_CLUSTER_ZERO; +    } else if (!(l2_entry & L2E_OFFSET_MASK)) { +        return QCOW2_CLUSTER_UNALLOCATED; +    } else { +        return QCOW2_CLUSTER_NORMAL; +    } +} + +/* Check whether refcounts are eager or lazy */ +static inline bool qcow2_need_accurate_refcounts(BDRVQcowState *s) +{ +    return 
!(s->incompatible_features & QCOW2_INCOMPAT_DIRTY); +} + +static inline uint64_t l2meta_cow_start(QCowL2Meta *m) +{ +    return m->offset + m->cow_start.offset; +} + +static inline uint64_t l2meta_cow_end(QCowL2Meta *m) +{ +    return m->offset + m->cow_end.offset +        + (m->cow_end.nb_sectors << BDRV_SECTOR_BITS); +} + +// FIXME Need qcow2_ prefix to global functions + +/* qcow2.c functions */ +int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov, +                  int64_t sector_num, int nb_sectors); + +int qcow2_mark_dirty(BlockDriverState *bs); +int qcow2_update_header(BlockDriverState *bs); + +/* qcow2-refcount.c functions */ +int qcow2_refcount_init(BlockDriverState *bs); +void qcow2_refcount_close(BlockDriverState *bs); + +int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size); +int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, +    int nb_clusters); +int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size); +void qcow2_free_clusters(BlockDriverState *bs, +                          int64_t offset, int64_t size, +                          enum qcow2_discard_type type); +void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry, +                             int nb_clusters, enum qcow2_discard_type type); + +int qcow2_update_snapshot_refcount(BlockDriverState *bs, +    int64_t l1_table_offset, int l1_size, int addend); + +int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, +                          BdrvCheckMode fix); + +void qcow2_process_discards(BlockDriverState *bs, int ret); + +/* qcow2-cluster.c functions */ +int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, +                        bool exact_size); +void qcow2_l2_cache_reset(BlockDriverState *bs); +int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset); +void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num, +                     uint8_t *out_buf, const uint8_t *in_buf, +                     int nb_sectors, int enc, +                     const AES_KEY *key); + +int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, +    int *num, uint64_t *cluster_offset); +int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset, +    int n_start, int n_end, int *num, uint64_t *host_offset, QCowL2Meta **m); +uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, +                                         uint64_t offset, +                                         int compressed_size); + +int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m); +int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset, +    int nb_sectors); +int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors); + +/* qcow2-snapshot.c functions */ +int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info); +int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id); +int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id); +int qcow2_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab); +int qcow2_snapshot_load_tmp(BlockDriverState *bs, const char *snapshot_name); + +void qcow2_free_snapshots(BlockDriverState *bs); +int qcow2_read_snapshots(BlockDriverState *bs); + +/* qcow2-cache.c functions */ +Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables); +int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c); + +void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table); +int 
qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c); +int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c, +    Qcow2Cache *dependency); +void qcow2_cache_depends_on_flush(Qcow2Cache *c); + +int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, +    void **table); +int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, +    void **table); +int qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table); + +#endif diff --git a/contrib/qemu/block/qed-check.c b/contrib/qemu/block/qed-check.c new file mode 100644 index 000000000..b473dcd61 --- /dev/null +++ b/contrib/qemu/block/qed-check.c @@ -0,0 +1,248 @@ +/* + * QEMU Enhanced Disk Format Consistency Check + * + * Copyright IBM, Corp. 2010 + * + * Authors: + *  Stefan Hajnoczi   <stefanha@linux.vnet.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "qed.h" + +typedef struct { +    BDRVQEDState *s; +    BdrvCheckResult *result; +    bool fix;                           /* whether to fix invalid offsets */ + +    uint64_t nclusters; +    uint32_t *used_clusters;            /* referenced cluster bitmap */ + +    QEDRequest request; +} QEDCheck; + +static bool qed_test_bit(uint32_t *bitmap, uint64_t n) { +    return !!(bitmap[n / 32] & (1 << (n % 32))); +} + +static void qed_set_bit(uint32_t *bitmap, uint64_t n) { +    bitmap[n / 32] |= 1 << (n % 32); +} + +/** + * Set bitmap bits for clusters + * + * @check:          Check structure + * @offset:         Starting offset in bytes + * @n:              Number of clusters + */ +static bool qed_set_used_clusters(QEDCheck *check, uint64_t offset, +                                  unsigned int n) +{ +    uint64_t cluster = qed_bytes_to_clusters(check->s, offset); +    unsigned int corruptions = 0; + +    while (n-- != 0) { +        /* Clusters should only be referenced once */ +        if (qed_test_bit(check->used_clusters, cluster)) { +            corruptions++; +        } + +        qed_set_bit(check->used_clusters, cluster); +        cluster++; +    } + +    check->result->corruptions += corruptions; +    return corruptions == 0; +} + +/** + * Check an L2 table + * + * @ret:            Number of invalid cluster offsets + */ +static unsigned int qed_check_l2_table(QEDCheck *check, QEDTable *table) +{ +    BDRVQEDState *s = check->s; +    unsigned int i, num_invalid = 0; +    uint64_t last_offset = 0; + +    for (i = 0; i < s->table_nelems; i++) { +        uint64_t offset = table->offsets[i]; + +        if (qed_offset_is_unalloc_cluster(offset) || +            qed_offset_is_zero_cluster(offset)) { +            continue; +        } +        check->result->bfi.allocated_clusters++; +        if (last_offset && (last_offset + s->header.cluster_size != offset)) { +            check->result->bfi.fragmented_clusters++; +        } +        last_offset = offset; + +        /* Detect invalid cluster offset */ +        if (!qed_check_cluster_offset(s, offset)) { +            if (check->fix) { +                table->offsets[i] = 0; +                check->result->corruptions_fixed++; +            } else { +                check->result->corruptions++; +            } + +            num_invalid++; +            continue; +        } + +        qed_set_used_clusters(check, offset, 1); +    } + +    return num_invalid; +} + +/** + * Descend tables and check each cluster is referenced once only + */ +static int 
qed_check_l1_table(QEDCheck *check, QEDTable *table) +{ +    BDRVQEDState *s = check->s; +    unsigned int i, num_invalid_l1 = 0; +    int ret, last_error = 0; + +    /* Mark L1 table clusters used */ +    qed_set_used_clusters(check, s->header.l1_table_offset, +                          s->header.table_size); + +    for (i = 0; i < s->table_nelems; i++) { +        unsigned int num_invalid_l2; +        uint64_t offset = table->offsets[i]; + +        if (qed_offset_is_unalloc_cluster(offset)) { +            continue; +        } + +        /* Detect invalid L2 offset */ +        if (!qed_check_table_offset(s, offset)) { +            /* Clear invalid offset */ +            if (check->fix) { +                table->offsets[i] = 0; +                check->result->corruptions_fixed++; +            } else { +                check->result->corruptions++; +            } + +            num_invalid_l1++; +            continue; +        } + +        if (!qed_set_used_clusters(check, offset, s->header.table_size)) { +            continue; /* skip an invalid table */ +        } + +        ret = qed_read_l2_table_sync(s, &check->request, offset); +        if (ret) { +            check->result->check_errors++; +            last_error = ret; +            continue; +        } + +        num_invalid_l2 = qed_check_l2_table(check, +                                            check->request.l2_table->table); + +        /* Write out fixed L2 table */ +        if (num_invalid_l2 > 0 && check->fix) { +            ret = qed_write_l2_table_sync(s, &check->request, 0, +                                          s->table_nelems, false); +            if (ret) { +                check->result->check_errors++; +                last_error = ret; +                continue; +            } +        } +    } + +    /* Drop reference to final table */ +    qed_unref_l2_cache_entry(check->request.l2_table); +    check->request.l2_table = NULL; + +    /* Write out fixed L1 table */ +    if (num_invalid_l1 > 0 && check->fix) { +        ret = qed_write_l1_table_sync(s, 0, s->table_nelems); +        if (ret) { +            check->result->check_errors++; +            last_error = ret; +        } +    } + +    return last_error; +} + +/** + * Check for unreferenced (leaked) clusters + */ +static void qed_check_for_leaks(QEDCheck *check) +{ +    BDRVQEDState *s = check->s; +    uint64_t i; + +    for (i = s->header.header_size; i < check->nclusters; i++) { +        if (!qed_test_bit(check->used_clusters, i)) { +            check->result->leaks++; +        } +    } +} + +/** + * Mark an image clean once it passes check or has been repaired + */ +static void qed_check_mark_clean(BDRVQEDState *s, BdrvCheckResult *result) +{ +    /* Skip if there were unfixable corruptions or I/O errors */ +    if (result->corruptions > 0 || result->check_errors > 0) { +        return; +    } + +    /* Skip if image is already marked clean */ +    if (!(s->header.features & QED_F_NEED_CHECK)) { +        return; +    } + +    /* Ensure fixes reach storage before clearing check bit */ +    bdrv_flush(s->bs); + +    s->header.features &= ~QED_F_NEED_CHECK; +    qed_write_header_sync(s); +} + +int qed_check(BDRVQEDState *s, BdrvCheckResult *result, bool fix) +{ +    QEDCheck check = { +        .s = s, +        .result = result, +        .nclusters = qed_bytes_to_clusters(s, s->file_size), +        .request = { .l2_table = NULL }, +        .fix = fix, +    }; +    int ret; + +    check.used_clusters = g_malloc0(((check.nclusters + 31) / 32) * +                 
                      sizeof(check.used_clusters[0])); + +    check.result->bfi.total_clusters = +        (s->header.image_size + s->header.cluster_size - 1) / +            s->header.cluster_size; +    ret = qed_check_l1_table(&check, s->l1_table); +    if (ret == 0) { +        /* Only check for leaks if entire image was scanned successfully */ +        qed_check_for_leaks(&check); + +        if (fix) { +            qed_check_mark_clean(s, result); +        } +    } + +    g_free(check.used_clusters); +    return ret; +} diff --git a/contrib/qemu/block/qed-cluster.c b/contrib/qemu/block/qed-cluster.c new file mode 100644 index 000000000..f64b2af8f --- /dev/null +++ b/contrib/qemu/block/qed-cluster.c @@ -0,0 +1,165 @@ +/* + * QEMU Enhanced Disk Format Cluster functions + * + * Copyright IBM, Corp. 2010 + * + * Authors: + *  Stefan Hajnoczi   <stefanha@linux.vnet.ibm.com> + *  Anthony Liguori   <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "qed.h" + +/** + * Count the number of contiguous data clusters + * + * @s:              QED state + * @table:          L2 table + * @index:          First cluster index + * @n:              Maximum number of clusters + * @offset:         Set to first cluster offset + * + * This function scans tables for contiguous clusters.  A contiguous run of + * clusters may be allocated, unallocated, or zero. + */ +static unsigned int qed_count_contiguous_clusters(BDRVQEDState *s, +                                                  QEDTable *table, +                                                  unsigned int index, +                                                  unsigned int n, +                                                  uint64_t *offset) +{ +    unsigned int end = MIN(index + n, s->table_nelems); +    uint64_t last = table->offsets[index]; +    unsigned int i; + +    *offset = last; + +    for (i = index + 1; i < end; i++) { +        if (qed_offset_is_unalloc_cluster(last)) { +            /* Counting unallocated clusters */ +            if (!qed_offset_is_unalloc_cluster(table->offsets[i])) { +                break; +            } +        } else if (qed_offset_is_zero_cluster(last)) { +            /* Counting zero clusters */ +            if (!qed_offset_is_zero_cluster(table->offsets[i])) { +                break; +            } +        } else { +            /* Counting allocated clusters */ +            if (table->offsets[i] != last + s->header.cluster_size) { +                break; +            } +            last = table->offsets[i]; +        } +    } +    return i - index; +} + +typedef struct { +    BDRVQEDState *s; +    uint64_t pos; +    size_t len; + +    QEDRequest *request; + +    /* User callback */ +    QEDFindClusterFunc *cb; +    void *opaque; +} QEDFindClusterCB; + +static void qed_find_cluster_cb(void *opaque, int ret) +{ +    QEDFindClusterCB *find_cluster_cb = opaque; +    BDRVQEDState *s = find_cluster_cb->s; +    QEDRequest *request = find_cluster_cb->request; +    uint64_t offset = 0; +    size_t len = 0; +    unsigned int index; +    unsigned int n; + +    if (ret) { +        goto out; +    } + +    index = qed_l2_index(s, find_cluster_cb->pos); +    n = qed_bytes_to_clusters(s, +                              qed_offset_into_cluster(s, find_cluster_cb->pos) + +                              find_cluster_cb->len); +    n = qed_count_contiguous_clusters(s, request->l2_table->table, +                
                      index, n, &offset); + +    if (qed_offset_is_unalloc_cluster(offset)) { +        ret = QED_CLUSTER_L2; +    } else if (qed_offset_is_zero_cluster(offset)) { +        ret = QED_CLUSTER_ZERO; +    } else if (qed_check_cluster_offset(s, offset)) { +        ret = QED_CLUSTER_FOUND; +    } else { +        ret = -EINVAL; +    } + +    len = MIN(find_cluster_cb->len, n * s->header.cluster_size - +              qed_offset_into_cluster(s, find_cluster_cb->pos)); + +out: +    find_cluster_cb->cb(find_cluster_cb->opaque, ret, offset, len); +    g_free(find_cluster_cb); +} + +/** + * Find the offset of a data cluster + * + * @s:          QED state + * @request:    L2 cache entry + * @pos:        Byte position in device + * @len:        Number of bytes + * @cb:         Completion function + * @opaque:     User data for completion function + * + * This function translates a position in the block device to an offset in the + * image file.  It invokes the cb completion callback to report back the + * translated offset or unallocated range in the image file. + * + * If the L2 table exists, request->l2_table points to the L2 table cache entry + * and the caller must free the reference when they are finished.  The cache + * entry is exposed in this way to avoid callers having to read the L2 table + * again later during request processing.  If request->l2_table is non-NULL it + * will be unreferenced before taking on the new cache entry. + */ +void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos, +                      size_t len, QEDFindClusterFunc *cb, void *opaque) +{ +    QEDFindClusterCB *find_cluster_cb; +    uint64_t l2_offset; + +    /* Limit length to L2 boundary.  Requests are broken up at the L2 boundary +     * so that a request acts on one L2 table at a time. +     */ +    len = MIN(len, (((pos >> s->l1_shift) + 1) << s->l1_shift) - pos); + +    l2_offset = s->l1_table->offsets[qed_l1_index(s, pos)]; +    if (qed_offset_is_unalloc_cluster(l2_offset)) { +        cb(opaque, QED_CLUSTER_L1, 0, len); +        return; +    } +    if (!qed_check_table_offset(s, l2_offset)) { +        cb(opaque, -EINVAL, 0, 0); +        return; +    } + +    find_cluster_cb = g_malloc(sizeof(*find_cluster_cb)); +    find_cluster_cb->s = s; +    find_cluster_cb->pos = pos; +    find_cluster_cb->len = len; +    find_cluster_cb->cb = cb; +    find_cluster_cb->opaque = opaque; +    find_cluster_cb->request = request; + +    qed_read_l2_table(s, request, l2_offset, +                      qed_find_cluster_cb, find_cluster_cb); +} diff --git a/contrib/qemu/block/qed-gencb.c b/contrib/qemu/block/qed-gencb.c new file mode 100644 index 000000000..7d7ac1ffc --- /dev/null +++ b/contrib/qemu/block/qed-gencb.c @@ -0,0 +1,32 @@ +/* + * QEMU Enhanced Disk Format + * + * Copyright IBM, Corp. 2010 + * + * Authors: + *  Stefan Hajnoczi   <stefanha@linux.vnet.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. 
+ * + */ + +#include "qed.h" + +void *gencb_alloc(size_t len, BlockDriverCompletionFunc *cb, void *opaque) +{ +    GenericCB *gencb = g_malloc(len); +    gencb->cb = cb; +    gencb->opaque = opaque; +    return gencb; +} + +void gencb_complete(void *opaque, int ret) +{ +    GenericCB *gencb = opaque; +    BlockDriverCompletionFunc *cb = gencb->cb; +    void *user_opaque = gencb->opaque; + +    g_free(gencb); +    cb(user_opaque, ret); +} diff --git a/contrib/qemu/block/qed-l2-cache.c b/contrib/qemu/block/qed-l2-cache.c new file mode 100644 index 000000000..e9b2aae44 --- /dev/null +++ b/contrib/qemu/block/qed-l2-cache.c @@ -0,0 +1,187 @@ +/* + * QEMU Enhanced Disk Format L2 Cache + * + * Copyright IBM, Corp. 2010 + * + * Authors: + *  Anthony Liguori   <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +/* + * L2 table cache usage is as follows: + * + * An open image has one L2 table cache that is used to avoid accessing the + * image file for recently referenced L2 tables. + * + * Cluster offset lookup translates the logical offset within the block device + * to a cluster offset within the image file.  This is done by indexing into + * the L1 and L2 tables which store cluster offsets.  It is here where the L2 + * table cache serves up recently referenced L2 tables. + * + * If there is a cache miss, that L2 table is read from the image file and + * committed to the cache.  Subsequent accesses to that L2 table will be served + * from the cache until the table is evicted from the cache. + * + * L2 tables are also committed to the cache when new L2 tables are allocated + * in the image file.  Since the L2 table cache is write-through, the new L2 + * table is first written out to the image file and then committed to the + * cache. + * + * Multiple I/O requests may be using an L2 table cache entry at any given + * time.  That means an entry may be in use across several requests and + * reference counting is needed to free the entry at the correct time.  In + * particular, an entry evicted from the cache will only be freed once all + * references are dropped. + * + * An in-flight I/O request will hold a reference to a L2 table cache entry for + * the period during which it needs to access the L2 table.  This includes + * cluster offset lookup, L2 table allocation, and L2 table update when a new + * data cluster has been allocated. + * + * An interesting case occurs when two requests need to access an L2 table that + * is not in the cache.  Since the operation to read the table from the image + * file takes some time to complete, both requests may see a cache miss and + * start reading the L2 table from the image file.  The first to finish will + * commit its L2 table into the cache.  When the second tries to commit its + * table will be deleted in favor of the existing cache entry. 
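+ *
+ * (In other words, when the second request tries to commit its copy,
+ * qed_commit_l2_cache_entry() finds the existing entry, drops the duplicate
+ * table, and the request ends up referencing the entry that is already
+ * cached.)
+ *
+ * A cache-miss lookup therefore follows roughly this pattern (a simplified
+ * sketch of what qed_read_l2_table() and its completion callback do; error
+ * handling omitted):
+ *
+ *   entry = qed_find_l2_cache_entry(&s->l2_cache, offset);
+ *   if (!entry) {
+ *       entry = qed_alloc_l2_cache_entry(&s->l2_cache);
+ *       entry->table = qed_alloc_table(s);
+ *       ... read the table from the image file at offset ...
+ *       entry->offset = offset;
+ *       qed_commit_l2_cache_entry(&s->l2_cache, entry);  <- steals this ref
+ *       entry = qed_find_l2_cache_entry(&s->l2_cache, offset);
+ *   }
+ *   ... use entry->table ...
+ *   qed_unref_l2_cache_entry(entry);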
+ */ + +#include "trace.h" +#include "qed.h" + +/* Each L2 holds 2GB so this let's us fully cache a 100GB disk */ +#define MAX_L2_CACHE_SIZE 50 + +/** + * Initialize the L2 cache + */ +void qed_init_l2_cache(L2TableCache *l2_cache) +{ +    QTAILQ_INIT(&l2_cache->entries); +    l2_cache->n_entries = 0; +} + +/** + * Free the L2 cache + */ +void qed_free_l2_cache(L2TableCache *l2_cache) +{ +    CachedL2Table *entry, *next_entry; + +    QTAILQ_FOREACH_SAFE(entry, &l2_cache->entries, node, next_entry) { +        qemu_vfree(entry->table); +        g_free(entry); +    } +} + +/** + * Allocate an uninitialized entry from the cache + * + * The returned entry has a reference count of 1 and is owned by the caller. + * The caller must allocate the actual table field for this entry and it must + * be freeable using qemu_vfree(). + */ +CachedL2Table *qed_alloc_l2_cache_entry(L2TableCache *l2_cache) +{ +    CachedL2Table *entry; + +    entry = g_malloc0(sizeof(*entry)); +    entry->ref++; + +    trace_qed_alloc_l2_cache_entry(l2_cache, entry); + +    return entry; +} + +/** + * Decrease an entry's reference count and free if necessary when the reference + * count drops to zero. + */ +void qed_unref_l2_cache_entry(CachedL2Table *entry) +{ +    if (!entry) { +        return; +    } + +    entry->ref--; +    trace_qed_unref_l2_cache_entry(entry, entry->ref); +    if (entry->ref == 0) { +        qemu_vfree(entry->table); +        g_free(entry); +    } +} + +/** + * Find an entry in the L2 cache.  This may return NULL and it's up to the + * caller to satisfy the cache miss. + * + * For a cached entry, this function increases the reference count and returns + * the entry. + */ +CachedL2Table *qed_find_l2_cache_entry(L2TableCache *l2_cache, uint64_t offset) +{ +    CachedL2Table *entry; + +    QTAILQ_FOREACH(entry, &l2_cache->entries, node) { +        if (entry->offset == offset) { +            trace_qed_find_l2_cache_entry(l2_cache, entry, offset, entry->ref); +            entry->ref++; +            return entry; +        } +    } +    return NULL; +} + +/** + * Commit an L2 cache entry into the cache.  This is meant to be used as part of + * the process to satisfy a cache miss.  A caller would allocate an entry which + * is not actually in the L2 cache and then once the entry was valid and + * present on disk, the entry can be committed into the cache. + * + * Since the cache is write-through, it's important that this function is not + * called until the entry is present on disk and the L1 has been updated to + * point to the entry. + * + * N.B. This function steals a reference to the l2_table from the caller so the + * caller must obtain a new reference by issuing a call to + * qed_find_l2_cache_entry(). + */ +void qed_commit_l2_cache_entry(L2TableCache *l2_cache, CachedL2Table *l2_table) +{ +    CachedL2Table *entry; + +    entry = qed_find_l2_cache_entry(l2_cache, l2_table->offset); +    if (entry) { +        qed_unref_l2_cache_entry(entry); +        qed_unref_l2_cache_entry(l2_table); +        return; +    } + +    /* Evict an unused cache entry so we have space.  If all entries are in use +     * we can grow the cache temporarily and we try to shrink back down later. 
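+     *
+     * Only entries whose reference count has dropped to the cache's own
+     * reference (ref == 1) are eligible; entries still held by in-flight
+     * requests are skipped.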
+     */ +    if (l2_cache->n_entries >= MAX_L2_CACHE_SIZE) { +        CachedL2Table *next; +        QTAILQ_FOREACH_SAFE(entry, &l2_cache->entries, node, next) { +            if (entry->ref > 1) { +                continue; +            } + +            QTAILQ_REMOVE(&l2_cache->entries, entry, node); +            l2_cache->n_entries--; +            qed_unref_l2_cache_entry(entry); + +            /* Stop evicting when we've shrunk back to max size */ +            if (l2_cache->n_entries < MAX_L2_CACHE_SIZE) { +                break; +            } +        } +    } + +    l2_cache->n_entries++; +    QTAILQ_INSERT_TAIL(&l2_cache->entries, l2_table, node); +} diff --git a/contrib/qemu/block/qed-table.c b/contrib/qemu/block/qed-table.c new file mode 100644 index 000000000..76d2dcccf --- /dev/null +++ b/contrib/qemu/block/qed-table.c @@ -0,0 +1,296 @@ +/* + * QEMU Enhanced Disk Format Table I/O + * + * Copyright IBM, Corp. 2010 + * + * Authors: + *  Stefan Hajnoczi   <stefanha@linux.vnet.ibm.com> + *  Anthony Liguori   <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "trace.h" +#include "qemu/sockets.h" /* for EINPROGRESS on Windows */ +#include "qed.h" + +typedef struct { +    GenericCB gencb; +    BDRVQEDState *s; +    QEDTable *table; + +    struct iovec iov; +    QEMUIOVector qiov; +} QEDReadTableCB; + +static void qed_read_table_cb(void *opaque, int ret) +{ +    QEDReadTableCB *read_table_cb = opaque; +    QEDTable *table = read_table_cb->table; +    int noffsets = read_table_cb->qiov.size / sizeof(uint64_t); +    int i; + +    /* Handle I/O error */ +    if (ret) { +        goto out; +    } + +    /* Byteswap offsets */ +    for (i = 0; i < noffsets; i++) { +        table->offsets[i] = le64_to_cpu(table->offsets[i]); +    } + +out: +    /* Completion */ +    trace_qed_read_table_cb(read_table_cb->s, read_table_cb->table, ret); +    gencb_complete(&read_table_cb->gencb, ret); +} + +static void qed_read_table(BDRVQEDState *s, uint64_t offset, QEDTable *table, +                           BlockDriverCompletionFunc *cb, void *opaque) +{ +    QEDReadTableCB *read_table_cb = gencb_alloc(sizeof(*read_table_cb), +                                                cb, opaque); +    QEMUIOVector *qiov = &read_table_cb->qiov; + +    trace_qed_read_table(s, offset, table); + +    read_table_cb->s = s; +    read_table_cb->table = table; +    read_table_cb->iov.iov_base = table->offsets, +    read_table_cb->iov.iov_len = s->header.cluster_size * s->header.table_size, + +    qemu_iovec_init_external(qiov, &read_table_cb->iov, 1); +    bdrv_aio_readv(s->bs->file, offset / BDRV_SECTOR_SIZE, qiov, +                   qiov->size / BDRV_SECTOR_SIZE, +                   qed_read_table_cb, read_table_cb); +} + +typedef struct { +    GenericCB gencb; +    BDRVQEDState *s; +    QEDTable *orig_table; +    QEDTable *table; +    bool flush;             /* flush after write? 
*/ + +    struct iovec iov; +    QEMUIOVector qiov; +} QEDWriteTableCB; + +static void qed_write_table_cb(void *opaque, int ret) +{ +    QEDWriteTableCB *write_table_cb = opaque; + +    trace_qed_write_table_cb(write_table_cb->s, +                             write_table_cb->orig_table, +                             write_table_cb->flush, +                             ret); + +    if (ret) { +        goto out; +    } + +    if (write_table_cb->flush) { +        /* We still need to flush first */ +        write_table_cb->flush = false; +        bdrv_aio_flush(write_table_cb->s->bs, qed_write_table_cb, +                       write_table_cb); +        return; +    } + +out: +    qemu_vfree(write_table_cb->table); +    gencb_complete(&write_table_cb->gencb, ret); +} + +/** + * Write out an updated part or all of a table + * + * @s:          QED state + * @offset:     Offset of table in image file, in bytes + * @table:      Table + * @index:      Index of first element + * @n:          Number of elements + * @flush:      Whether or not to sync to disk + * @cb:         Completion function + * @opaque:     Argument for completion function + */ +static void qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table, +                            unsigned int index, unsigned int n, bool flush, +                            BlockDriverCompletionFunc *cb, void *opaque) +{ +    QEDWriteTableCB *write_table_cb; +    unsigned int sector_mask = BDRV_SECTOR_SIZE / sizeof(uint64_t) - 1; +    unsigned int start, end, i; +    size_t len_bytes; + +    trace_qed_write_table(s, offset, table, index, n); + +    /* Calculate indices of the first and one after last elements */ +    start = index & ~sector_mask; +    end = (index + n + sector_mask) & ~sector_mask; + +    len_bytes = (end - start) * sizeof(uint64_t); + +    write_table_cb = gencb_alloc(sizeof(*write_table_cb), cb, opaque); +    write_table_cb->s = s; +    write_table_cb->orig_table = table; +    write_table_cb->flush = flush; +    write_table_cb->table = qemu_blockalign(s->bs, len_bytes); +    write_table_cb->iov.iov_base = write_table_cb->table->offsets; +    write_table_cb->iov.iov_len = len_bytes; +    qemu_iovec_init_external(&write_table_cb->qiov, &write_table_cb->iov, 1); + +    /* Byteswap table */ +    for (i = start; i < end; i++) { +        uint64_t le_offset = cpu_to_le64(table->offsets[i]); +        write_table_cb->table->offsets[i - start] = le_offset; +    } + +    /* Adjust for offset into table */ +    offset += start * sizeof(uint64_t); + +    bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE, +                    &write_table_cb->qiov, +                    write_table_cb->qiov.size / BDRV_SECTOR_SIZE, +                    qed_write_table_cb, write_table_cb); +} + +/** + * Propagate return value from async callback + */ +static void qed_sync_cb(void *opaque, int ret) +{ +    *(int *)opaque = ret; +} + +int qed_read_l1_table_sync(BDRVQEDState *s) +{ +    int ret = -EINPROGRESS; + +    qed_read_table(s, s->header.l1_table_offset, +                   s->l1_table, qed_sync_cb, &ret); +    while (ret == -EINPROGRESS) { +        qemu_aio_wait(); +    } + +    return ret; +} + +void qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n, +                        BlockDriverCompletionFunc *cb, void *opaque) +{ +    BLKDBG_EVENT(s->bs->file, BLKDBG_L1_UPDATE); +    qed_write_table(s, s->header.l1_table_offset, +                    s->l1_table, index, n, false, cb, opaque); +} + +int 
qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index, +                            unsigned int n) +{ +    int ret = -EINPROGRESS; + +    qed_write_l1_table(s, index, n, qed_sync_cb, &ret); +    while (ret == -EINPROGRESS) { +        qemu_aio_wait(); +    } + +    return ret; +} + +typedef struct { +    GenericCB gencb; +    BDRVQEDState *s; +    uint64_t l2_offset; +    QEDRequest *request; +} QEDReadL2TableCB; + +static void qed_read_l2_table_cb(void *opaque, int ret) +{ +    QEDReadL2TableCB *read_l2_table_cb = opaque; +    QEDRequest *request = read_l2_table_cb->request; +    BDRVQEDState *s = read_l2_table_cb->s; +    CachedL2Table *l2_table = request->l2_table; +    uint64_t l2_offset = read_l2_table_cb->l2_offset; + +    if (ret) { +        /* can't trust loaded L2 table anymore */ +        qed_unref_l2_cache_entry(l2_table); +        request->l2_table = NULL; +    } else { +        l2_table->offset = l2_offset; + +        qed_commit_l2_cache_entry(&s->l2_cache, l2_table); + +        /* This is guaranteed to succeed because we just committed the entry +         * to the cache. +         */ +        request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset); +        assert(request->l2_table != NULL); +    } + +    gencb_complete(&read_l2_table_cb->gencb, ret); +} + +void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset, +                       BlockDriverCompletionFunc *cb, void *opaque) +{ +    QEDReadL2TableCB *read_l2_table_cb; + +    qed_unref_l2_cache_entry(request->l2_table); + +    /* Check for cached L2 entry */ +    request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, offset); +    if (request->l2_table) { +        cb(opaque, 0); +        return; +    } + +    request->l2_table = qed_alloc_l2_cache_entry(&s->l2_cache); +    request->l2_table->table = qed_alloc_table(s); + +    read_l2_table_cb = gencb_alloc(sizeof(*read_l2_table_cb), cb, opaque); +    read_l2_table_cb->s = s; +    read_l2_table_cb->l2_offset = offset; +    read_l2_table_cb->request = request; + +    BLKDBG_EVENT(s->bs->file, BLKDBG_L2_LOAD); +    qed_read_table(s, offset, request->l2_table->table, +                   qed_read_l2_table_cb, read_l2_table_cb); +} + +int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, uint64_t offset) +{ +    int ret = -EINPROGRESS; + +    qed_read_l2_table(s, request, offset, qed_sync_cb, &ret); +    while (ret == -EINPROGRESS) { +        qemu_aio_wait(); +    } + +    return ret; +} + +void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request, +                        unsigned int index, unsigned int n, bool flush, +                        BlockDriverCompletionFunc *cb, void *opaque) +{ +    BLKDBG_EVENT(s->bs->file, BLKDBG_L2_UPDATE); +    qed_write_table(s, request->l2_table->offset, +                    request->l2_table->table, index, n, flush, cb, opaque); +} + +int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request, +                            unsigned int index, unsigned int n, bool flush) +{ +    int ret = -EINPROGRESS; + +    qed_write_l2_table(s, request, index, n, flush, qed_sync_cb, &ret); +    while (ret == -EINPROGRESS) { +        qemu_aio_wait(); +    } + +    return ret; +} diff --git a/contrib/qemu/block/qed.c b/contrib/qemu/block/qed.c new file mode 100644 index 000000000..f767b0528 --- /dev/null +++ b/contrib/qemu/block/qed.c @@ -0,0 +1,1596 @@ +/* + * QEMU Enhanced Disk Format + * + * Copyright IBM, Corp. 
2010 + * + * Authors: + *  Stefan Hajnoczi   <stefanha@linux.vnet.ibm.com> + *  Anthony Liguori   <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "qemu/timer.h" +#include "trace.h" +#include "qed.h" +#include "qapi/qmp/qerror.h" +#include "migration/migration.h" + +static void qed_aio_cancel(BlockDriverAIOCB *blockacb) +{ +    QEDAIOCB *acb = (QEDAIOCB *)blockacb; +    bool finished = false; + +    /* Wait for the request to finish */ +    acb->finished = &finished; +    while (!finished) { +        qemu_aio_wait(); +    } +} + +static const AIOCBInfo qed_aiocb_info = { +    .aiocb_size         = sizeof(QEDAIOCB), +    .cancel             = qed_aio_cancel, +}; + +static int bdrv_qed_probe(const uint8_t *buf, int buf_size, +                          const char *filename) +{ +    const QEDHeader *header = (const QEDHeader *)buf; + +    if (buf_size < sizeof(*header)) { +        return 0; +    } +    if (le32_to_cpu(header->magic) != QED_MAGIC) { +        return 0; +    } +    return 100; +} + +/** + * Check whether an image format is raw + * + * @fmt:    Backing file format, may be NULL + */ +static bool qed_fmt_is_raw(const char *fmt) +{ +    return fmt && strcmp(fmt, "raw") == 0; +} + +static void qed_header_le_to_cpu(const QEDHeader *le, QEDHeader *cpu) +{ +    cpu->magic = le32_to_cpu(le->magic); +    cpu->cluster_size = le32_to_cpu(le->cluster_size); +    cpu->table_size = le32_to_cpu(le->table_size); +    cpu->header_size = le32_to_cpu(le->header_size); +    cpu->features = le64_to_cpu(le->features); +    cpu->compat_features = le64_to_cpu(le->compat_features); +    cpu->autoclear_features = le64_to_cpu(le->autoclear_features); +    cpu->l1_table_offset = le64_to_cpu(le->l1_table_offset); +    cpu->image_size = le64_to_cpu(le->image_size); +    cpu->backing_filename_offset = le32_to_cpu(le->backing_filename_offset); +    cpu->backing_filename_size = le32_to_cpu(le->backing_filename_size); +} + +static void qed_header_cpu_to_le(const QEDHeader *cpu, QEDHeader *le) +{ +    le->magic = cpu_to_le32(cpu->magic); +    le->cluster_size = cpu_to_le32(cpu->cluster_size); +    le->table_size = cpu_to_le32(cpu->table_size); +    le->header_size = cpu_to_le32(cpu->header_size); +    le->features = cpu_to_le64(cpu->features); +    le->compat_features = cpu_to_le64(cpu->compat_features); +    le->autoclear_features = cpu_to_le64(cpu->autoclear_features); +    le->l1_table_offset = cpu_to_le64(cpu->l1_table_offset); +    le->image_size = cpu_to_le64(cpu->image_size); +    le->backing_filename_offset = cpu_to_le32(cpu->backing_filename_offset); +    le->backing_filename_size = cpu_to_le32(cpu->backing_filename_size); +} + +int qed_write_header_sync(BDRVQEDState *s) +{ +    QEDHeader le; +    int ret; + +    qed_header_cpu_to_le(&s->header, &le); +    ret = bdrv_pwrite(s->bs->file, 0, &le, sizeof(le)); +    if (ret != sizeof(le)) { +        return ret; +    } +    return 0; +} + +typedef struct { +    GenericCB gencb; +    BDRVQEDState *s; +    struct iovec iov; +    QEMUIOVector qiov; +    int nsectors; +    uint8_t *buf; +} QEDWriteHeaderCB; + +static void qed_write_header_cb(void *opaque, int ret) +{ +    QEDWriteHeaderCB *write_header_cb = opaque; + +    qemu_vfree(write_header_cb->buf); +    gencb_complete(write_header_cb, ret); +} + +static void qed_write_header_read_cb(void *opaque, int ret) +{ +    QEDWriteHeaderCB *write_header_cb = opaque; +    BDRVQEDState *s 
= write_header_cb->s; + +    if (ret) { +        qed_write_header_cb(write_header_cb, ret); +        return; +    } + +    /* Update header */ +    qed_header_cpu_to_le(&s->header, (QEDHeader *)write_header_cb->buf); + +    bdrv_aio_writev(s->bs->file, 0, &write_header_cb->qiov, +                    write_header_cb->nsectors, qed_write_header_cb, +                    write_header_cb); +} + +/** + * Update header in-place (does not rewrite backing filename or other strings) + * + * This function only updates known header fields in-place and does not affect + * extra data after the QED header. + */ +static void qed_write_header(BDRVQEDState *s, BlockDriverCompletionFunc cb, +                             void *opaque) +{ +    /* We must write full sectors for O_DIRECT but cannot necessarily generate +     * the data following the header if an unrecognized compat feature is +     * active.  Therefore, first read the sectors containing the header, update +     * them, and write back. +     */ + +    int nsectors = (sizeof(QEDHeader) + BDRV_SECTOR_SIZE - 1) / +                   BDRV_SECTOR_SIZE; +    size_t len = nsectors * BDRV_SECTOR_SIZE; +    QEDWriteHeaderCB *write_header_cb = gencb_alloc(sizeof(*write_header_cb), +                                                    cb, opaque); + +    write_header_cb->s = s; +    write_header_cb->nsectors = nsectors; +    write_header_cb->buf = qemu_blockalign(s->bs, len); +    write_header_cb->iov.iov_base = write_header_cb->buf; +    write_header_cb->iov.iov_len = len; +    qemu_iovec_init_external(&write_header_cb->qiov, &write_header_cb->iov, 1); + +    bdrv_aio_readv(s->bs->file, 0, &write_header_cb->qiov, nsectors, +                   qed_write_header_read_cb, write_header_cb); +} + +static uint64_t qed_max_image_size(uint32_t cluster_size, uint32_t table_size) +{ +    uint64_t table_entries; +    uint64_t l2_size; + +    table_entries = (table_size * cluster_size) / sizeof(uint64_t); +    l2_size = table_entries * cluster_size; + +    return l2_size * table_entries; +} + +static bool qed_is_cluster_size_valid(uint32_t cluster_size) +{ +    if (cluster_size < QED_MIN_CLUSTER_SIZE || +        cluster_size > QED_MAX_CLUSTER_SIZE) { +        return false; +    } +    if (cluster_size & (cluster_size - 1)) { +        return false; /* not power of 2 */ +    } +    return true; +} + +static bool qed_is_table_size_valid(uint32_t table_size) +{ +    if (table_size < QED_MIN_TABLE_SIZE || +        table_size > QED_MAX_TABLE_SIZE) { +        return false; +    } +    if (table_size & (table_size - 1)) { +        return false; /* not power of 2 */ +    } +    return true; +} + +static bool qed_is_image_size_valid(uint64_t image_size, uint32_t cluster_size, +                                    uint32_t table_size) +{ +    if (image_size % BDRV_SECTOR_SIZE != 0) { +        return false; /* not multiple of sector size */ +    } +    if (image_size > qed_max_image_size(cluster_size, table_size)) { +        return false; /* image is too large */ +    } +    return true; +} + +/** + * Read a string of known length from the image file + * + * @file:       Image file + * @offset:     File offset to start of string, in bytes + * @n:          String length in bytes + * @buf:        Destination buffer + * @buflen:     Destination buffer length in bytes + * @ret:        0 on success, -errno on failure + * + * The string is NUL-terminated. 
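+ *
+ * @buflen must be strictly greater than @n so there is room for the NUL
+ * terminator; otherwise -EINVAL is returned and nothing is read.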
+ */ +static int qed_read_string(BlockDriverState *file, uint64_t offset, size_t n, +                           char *buf, size_t buflen) +{ +    int ret; +    if (n >= buflen) { +        return -EINVAL; +    } +    ret = bdrv_pread(file, offset, buf, n); +    if (ret < 0) { +        return ret; +    } +    buf[n] = '\0'; +    return 0; +} + +/** + * Allocate new clusters + * + * @s:          QED state + * @n:          Number of contiguous clusters to allocate + * @ret:        Offset of first allocated cluster + * + * This function only produces the offset where the new clusters should be + * written.  It updates BDRVQEDState but does not make any changes to the image + * file. + */ +static uint64_t qed_alloc_clusters(BDRVQEDState *s, unsigned int n) +{ +    uint64_t offset = s->file_size; +    s->file_size += n * s->header.cluster_size; +    return offset; +} + +QEDTable *qed_alloc_table(BDRVQEDState *s) +{ +    /* Honor O_DIRECT memory alignment requirements */ +    return qemu_blockalign(s->bs, +                           s->header.cluster_size * s->header.table_size); +} + +/** + * Allocate a new zeroed L2 table + */ +static CachedL2Table *qed_new_l2_table(BDRVQEDState *s) +{ +    CachedL2Table *l2_table = qed_alloc_l2_cache_entry(&s->l2_cache); + +    l2_table->table = qed_alloc_table(s); +    l2_table->offset = qed_alloc_clusters(s, s->header.table_size); + +    memset(l2_table->table->offsets, 0, +           s->header.cluster_size * s->header.table_size); +    return l2_table; +} + +static void qed_aio_next_io(void *opaque, int ret); + +static void qed_plug_allocating_write_reqs(BDRVQEDState *s) +{ +    assert(!s->allocating_write_reqs_plugged); + +    s->allocating_write_reqs_plugged = true; +} + +static void qed_unplug_allocating_write_reqs(BDRVQEDState *s) +{ +    QEDAIOCB *acb; + +    assert(s->allocating_write_reqs_plugged); + +    s->allocating_write_reqs_plugged = false; + +    acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs); +    if (acb) { +        qed_aio_next_io(acb, 0); +    } +} + +static void qed_finish_clear_need_check(void *opaque, int ret) +{ +    /* Do nothing */ +} + +static void qed_flush_after_clear_need_check(void *opaque, int ret) +{ +    BDRVQEDState *s = opaque; + +    bdrv_aio_flush(s->bs, qed_finish_clear_need_check, s); + +    /* No need to wait until flush completes */ +    qed_unplug_allocating_write_reqs(s); +} + +static void qed_clear_need_check(void *opaque, int ret) +{ +    BDRVQEDState *s = opaque; + +    if (ret) { +        qed_unplug_allocating_write_reqs(s); +        return; +    } + +    s->header.features &= ~QED_F_NEED_CHECK; +    qed_write_header(s, qed_flush_after_clear_need_check, s); +} + +static void qed_need_check_timer_cb(void *opaque) +{ +    BDRVQEDState *s = opaque; + +    /* The timer should only fire when allocating writes have drained */ +    assert(!QSIMPLEQ_FIRST(&s->allocating_write_reqs)); + +    trace_qed_need_check_timer_cb(s); + +    qed_plug_allocating_write_reqs(s); + +    /* Ensure writes are on disk before clearing flag */ +    bdrv_aio_flush(s->bs, qed_clear_need_check, s); +} + +static void qed_start_need_check_timer(BDRVQEDState *s) +{ +    trace_qed_start_need_check_timer(s); + +    /* Use vm_clock so we don't alter the image file while suspended for +     * migration. 
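+     *
+     * The timer is armed QED_NEED_CHECK_TIMEOUT seconds into the future when
+     * the last allocating write completes and is cancelled again as soon as a
+     * new one arrives, so qed_need_check_timer_cb() only runs once allocating
+     * writes have drained.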
+     */ +    qemu_mod_timer(s->need_check_timer, qemu_get_clock_ns(vm_clock) + +                   get_ticks_per_sec() * QED_NEED_CHECK_TIMEOUT); +} + +/* It's okay to call this multiple times or when no timer is started */ +static void qed_cancel_need_check_timer(BDRVQEDState *s) +{ +    trace_qed_cancel_need_check_timer(s); +    qemu_del_timer(s->need_check_timer); +} + +static void bdrv_qed_rebind(BlockDriverState *bs) +{ +    BDRVQEDState *s = bs->opaque; +    s->bs = bs; +} + +static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags) +{ +    BDRVQEDState *s = bs->opaque; +    QEDHeader le_header; +    int64_t file_size; +    int ret; + +    s->bs = bs; +    QSIMPLEQ_INIT(&s->allocating_write_reqs); + +    ret = bdrv_pread(bs->file, 0, &le_header, sizeof(le_header)); +    if (ret < 0) { +        return ret; +    } +    qed_header_le_to_cpu(&le_header, &s->header); + +    if (s->header.magic != QED_MAGIC) { +        return -EMEDIUMTYPE; +    } +    if (s->header.features & ~QED_FEATURE_MASK) { +        /* image uses unsupported feature bits */ +        char buf[64]; +        snprintf(buf, sizeof(buf), "%" PRIx64, +            s->header.features & ~QED_FEATURE_MASK); +        qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, +            bs->device_name, "QED", buf); +        return -ENOTSUP; +    } +    if (!qed_is_cluster_size_valid(s->header.cluster_size)) { +        return -EINVAL; +    } + +    /* Round down file size to the last cluster */ +    file_size = bdrv_getlength(bs->file); +    if (file_size < 0) { +        return file_size; +    } +    s->file_size = qed_start_of_cluster(s, file_size); + +    if (!qed_is_table_size_valid(s->header.table_size)) { +        return -EINVAL; +    } +    if (!qed_is_image_size_valid(s->header.image_size, +                                 s->header.cluster_size, +                                 s->header.table_size)) { +        return -EINVAL; +    } +    if (!qed_check_table_offset(s, s->header.l1_table_offset)) { +        return -EINVAL; +    } + +    s->table_nelems = (s->header.cluster_size * s->header.table_size) / +                      sizeof(uint64_t); +    s->l2_shift = ffs(s->header.cluster_size) - 1; +    s->l2_mask = s->table_nelems - 1; +    s->l1_shift = s->l2_shift + ffs(s->table_nelems) - 1; + +    if ((s->header.features & QED_F_BACKING_FILE)) { +        if ((uint64_t)s->header.backing_filename_offset + +            s->header.backing_filename_size > +            s->header.cluster_size * s->header.header_size) { +            return -EINVAL; +        } + +        ret = qed_read_string(bs->file, s->header.backing_filename_offset, +                              s->header.backing_filename_size, bs->backing_file, +                              sizeof(bs->backing_file)); +        if (ret < 0) { +            return ret; +        } + +        if (s->header.features & QED_F_BACKING_FORMAT_NO_PROBE) { +            pstrcpy(bs->backing_format, sizeof(bs->backing_format), "raw"); +        } +    } + +    /* Reset unknown autoclear feature bits.  This is a backwards +     * compatibility mechanism that allows images to be opened by older +     * programs, which "knock out" unknown feature bits.  When an image is +     * opened by a newer program again it can detect that the autoclear +     * feature is no longer valid. 
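+     *
+     * Here we act as the "older program": any autoclear bits we do not
+     * recognise are cleared and the header is written back before the image
+     * is used.  This is skipped when the file is read-only or during incoming
+     * migration.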
+     */ +    if ((s->header.autoclear_features & ~QED_AUTOCLEAR_FEATURE_MASK) != 0 && +        !bdrv_is_read_only(bs->file) && !(flags & BDRV_O_INCOMING)) { +        s->header.autoclear_features &= QED_AUTOCLEAR_FEATURE_MASK; + +        ret = qed_write_header_sync(s); +        if (ret) { +            return ret; +        } + +        /* From here on only known autoclear feature bits are valid */ +        bdrv_flush(bs->file); +    } + +    s->l1_table = qed_alloc_table(s); +    qed_init_l2_cache(&s->l2_cache); + +    ret = qed_read_l1_table_sync(s); +    if (ret) { +        goto out; +    } + +    /* If image was not closed cleanly, check consistency */ +    if (!(flags & BDRV_O_CHECK) && (s->header.features & QED_F_NEED_CHECK)) { +        /* Read-only images cannot be fixed.  There is no risk of corruption +         * since write operations are not possible.  Therefore, allow +         * potentially inconsistent images to be opened read-only.  This can +         * aid data recovery from an otherwise inconsistent image. +         */ +        if (!bdrv_is_read_only(bs->file) && +            !(flags & BDRV_O_INCOMING)) { +            BdrvCheckResult result = {0}; + +            ret = qed_check(s, &result, true); +            if (ret) { +                goto out; +            } +        } +    } + +    s->need_check_timer = qemu_new_timer_ns(vm_clock, +                                            qed_need_check_timer_cb, s); + +out: +    if (ret) { +        qed_free_l2_cache(&s->l2_cache); +        qemu_vfree(s->l1_table); +    } +    return ret; +} + +/* We have nothing to do for QED reopen, stubs just return + * success */ +static int bdrv_qed_reopen_prepare(BDRVReopenState *state, +                                   BlockReopenQueue *queue, Error **errp) +{ +    return 0; +} + +static void bdrv_qed_close(BlockDriverState *bs) +{ +    BDRVQEDState *s = bs->opaque; + +    qed_cancel_need_check_timer(s); +    qemu_free_timer(s->need_check_timer); + +    /* Ensure writes reach stable storage */ +    bdrv_flush(bs->file); + +    /* Clean shutdown, no check required on next open */ +    if (s->header.features & QED_F_NEED_CHECK) { +        s->header.features &= ~QED_F_NEED_CHECK; +        qed_write_header_sync(s); +    } + +    qed_free_l2_cache(&s->l2_cache); +    qemu_vfree(s->l1_table); +} + +static int qed_create(const char *filename, uint32_t cluster_size, +                      uint64_t image_size, uint32_t table_size, +                      const char *backing_file, const char *backing_fmt) +{ +    QEDHeader header = { +        .magic = QED_MAGIC, +        .cluster_size = cluster_size, +        .table_size = table_size, +        .header_size = 1, +        .features = 0, +        .compat_features = 0, +        .l1_table_offset = cluster_size, +        .image_size = image_size, +    }; +    QEDHeader le_header; +    uint8_t *l1_table = NULL; +    size_t l1_size = header.cluster_size * header.table_size; +    int ret = 0; +    BlockDriverState *bs = NULL; + +    ret = bdrv_create_file(filename, NULL); +    if (ret < 0) { +        return ret; +    } + +    ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR | BDRV_O_CACHE_WB); +    if (ret < 0) { +        return ret; +    } + +    /* File must start empty and grow, check truncate is supported */ +    ret = bdrv_truncate(bs, 0); +    if (ret < 0) { +        goto out; +    } + +    if (backing_file) { +        header.features |= QED_F_BACKING_FILE; +        header.backing_filename_offset = sizeof(le_header); +        
header.backing_filename_size = strlen(backing_file); + +        if (qed_fmt_is_raw(backing_fmt)) { +            header.features |= QED_F_BACKING_FORMAT_NO_PROBE; +        } +    } + +    qed_header_cpu_to_le(&header, &le_header); +    ret = bdrv_pwrite(bs, 0, &le_header, sizeof(le_header)); +    if (ret < 0) { +        goto out; +    } +    ret = bdrv_pwrite(bs, sizeof(le_header), backing_file, +                      header.backing_filename_size); +    if (ret < 0) { +        goto out; +    } + +    l1_table = g_malloc0(l1_size); +    ret = bdrv_pwrite(bs, header.l1_table_offset, l1_table, l1_size); +    if (ret < 0) { +        goto out; +    } + +    ret = 0; /* success */ +out: +    g_free(l1_table); +    bdrv_delete(bs); +    return ret; +} + +static int bdrv_qed_create(const char *filename, QEMUOptionParameter *options) +{ +    uint64_t image_size = 0; +    uint32_t cluster_size = QED_DEFAULT_CLUSTER_SIZE; +    uint32_t table_size = QED_DEFAULT_TABLE_SIZE; +    const char *backing_file = NULL; +    const char *backing_fmt = NULL; + +    while (options && options->name) { +        if (!strcmp(options->name, BLOCK_OPT_SIZE)) { +            image_size = options->value.n; +        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) { +            backing_file = options->value.s; +        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FMT)) { +            backing_fmt = options->value.s; +        } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) { +            if (options->value.n) { +                cluster_size = options->value.n; +            } +        } else if (!strcmp(options->name, BLOCK_OPT_TABLE_SIZE)) { +            if (options->value.n) { +                table_size = options->value.n; +            } +        } +        options++; +    } + +    if (!qed_is_cluster_size_valid(cluster_size)) { +        fprintf(stderr, "QED cluster size must be within range [%u, %u] and power of 2\n", +                QED_MIN_CLUSTER_SIZE, QED_MAX_CLUSTER_SIZE); +        return -EINVAL; +    } +    if (!qed_is_table_size_valid(table_size)) { +        fprintf(stderr, "QED table size must be within range [%u, %u] and power of 2\n", +                QED_MIN_TABLE_SIZE, QED_MAX_TABLE_SIZE); +        return -EINVAL; +    } +    if (!qed_is_image_size_valid(image_size, cluster_size, table_size)) { +        fprintf(stderr, "QED image size must be a non-zero multiple of " +                        "cluster size and less than %" PRIu64 " bytes\n", +                qed_max_image_size(cluster_size, table_size)); +        return -EINVAL; +    } + +    return qed_create(filename, cluster_size, image_size, table_size, +                      backing_file, backing_fmt); +} + +typedef struct { +    Coroutine *co; +    int is_allocated; +    int *pnum; +} QEDIsAllocatedCB; + +static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t len) +{ +    QEDIsAllocatedCB *cb = opaque; +    *cb->pnum = len / BDRV_SECTOR_SIZE; +    cb->is_allocated = (ret == QED_CLUSTER_FOUND || ret == QED_CLUSTER_ZERO); +    if (cb->co) { +        qemu_coroutine_enter(cb->co, NULL); +    } +} + +static int coroutine_fn bdrv_qed_co_is_allocated(BlockDriverState *bs, +                                                 int64_t sector_num, +                                                 int nb_sectors, int *pnum) +{ +    BDRVQEDState *s = bs->opaque; +    uint64_t pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE; +    size_t len = (size_t)nb_sectors * BDRV_SECTOR_SIZE; +    QEDIsAllocatedCB cb = { +        
.is_allocated = -1, +        .pnum = pnum, +    }; +    QEDRequest request = { .l2_table = NULL }; + +    qed_find_cluster(s, &request, pos, len, qed_is_allocated_cb, &cb); + +    /* Now sleep if the callback wasn't invoked immediately */ +    while (cb.is_allocated == -1) { +        cb.co = qemu_coroutine_self(); +        qemu_coroutine_yield(); +    } + +    qed_unref_l2_cache_entry(request.l2_table); + +    return cb.is_allocated; +} + +static int bdrv_qed_make_empty(BlockDriverState *bs) +{ +    return -ENOTSUP; +} + +static BDRVQEDState *acb_to_s(QEDAIOCB *acb) +{ +    return acb->common.bs->opaque; +} + +/** + * Read from the backing file or zero-fill if no backing file + * + * @s:          QED state + * @pos:        Byte position in device + * @qiov:       Destination I/O vector + * @cb:         Completion function + * @opaque:     User data for completion function + * + * This function reads qiov->size bytes starting at pos from the backing file. + * If there is no backing file then zeroes are read. + */ +static void qed_read_backing_file(BDRVQEDState *s, uint64_t pos, +                                  QEMUIOVector *qiov, +                                  BlockDriverCompletionFunc *cb, void *opaque) +{ +    uint64_t backing_length = 0; +    size_t size; + +    /* If there is a backing file, get its length.  Treat the absence of a +     * backing file like a zero length backing file. +     */ +    if (s->bs->backing_hd) { +        int64_t l = bdrv_getlength(s->bs->backing_hd); +        if (l < 0) { +            cb(opaque, l); +            return; +        } +        backing_length = l; +    } + +    /* Zero all sectors if reading beyond the end of the backing file */ +    if (pos >= backing_length || +        pos + qiov->size > backing_length) { +        qemu_iovec_memset(qiov, 0, 0, qiov->size); +    } + +    /* Complete now if there are no backing file sectors to read */ +    if (pos >= backing_length) { +        cb(opaque, 0); +        return; +    } + +    /* If the read straddles the end of the backing file, shorten it */ +    size = MIN((uint64_t)backing_length - pos, qiov->size); + +    BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO); +    bdrv_aio_readv(s->bs->backing_hd, pos / BDRV_SECTOR_SIZE, +                   qiov, size / BDRV_SECTOR_SIZE, cb, opaque); +} + +typedef struct { +    GenericCB gencb; +    BDRVQEDState *s; +    QEMUIOVector qiov; +    struct iovec iov; +    uint64_t offset; +} CopyFromBackingFileCB; + +static void qed_copy_from_backing_file_cb(void *opaque, int ret) +{ +    CopyFromBackingFileCB *copy_cb = opaque; +    qemu_vfree(copy_cb->iov.iov_base); +    gencb_complete(©_cb->gencb, ret); +} + +static void qed_copy_from_backing_file_write(void *opaque, int ret) +{ +    CopyFromBackingFileCB *copy_cb = opaque; +    BDRVQEDState *s = copy_cb->s; + +    if (ret) { +        qed_copy_from_backing_file_cb(copy_cb, ret); +        return; +    } + +    BLKDBG_EVENT(s->bs->file, BLKDBG_COW_WRITE); +    bdrv_aio_writev(s->bs->file, copy_cb->offset / BDRV_SECTOR_SIZE, +                    ©_cb->qiov, copy_cb->qiov.size / BDRV_SECTOR_SIZE, +                    qed_copy_from_backing_file_cb, copy_cb); +} + +/** + * Copy data from backing file into the image + * + * @s:          QED state + * @pos:        Byte position in device + * @len:        Number of bytes + * @offset:     Byte offset in image file + * @cb:         Completion function + * @opaque:     User data for completion function + */ +static void qed_copy_from_backing_file(BDRVQEDState *s, uint64_t 
pos, +                                       uint64_t len, uint64_t offset, +                                       BlockDriverCompletionFunc *cb, +                                       void *opaque) +{ +    CopyFromBackingFileCB *copy_cb; + +    /* Skip copy entirely if there is no work to do */ +    if (len == 0) { +        cb(opaque, 0); +        return; +    } + +    copy_cb = gencb_alloc(sizeof(*copy_cb), cb, opaque); +    copy_cb->s = s; +    copy_cb->offset = offset; +    copy_cb->iov.iov_base = qemu_blockalign(s->bs, len); +    copy_cb->iov.iov_len = len; +    qemu_iovec_init_external(©_cb->qiov, ©_cb->iov, 1); + +    qed_read_backing_file(s, pos, ©_cb->qiov, +                          qed_copy_from_backing_file_write, copy_cb); +} + +/** + * Link one or more contiguous clusters into a table + * + * @s:              QED state + * @table:          L2 table + * @index:          First cluster index + * @n:              Number of contiguous clusters + * @cluster:        First cluster offset + * + * The cluster offset may be an allocated byte offset in the image file, the + * zero cluster marker, or the unallocated cluster marker. + */ +static void qed_update_l2_table(BDRVQEDState *s, QEDTable *table, int index, +                                unsigned int n, uint64_t cluster) +{ +    int i; +    for (i = index; i < index + n; i++) { +        table->offsets[i] = cluster; +        if (!qed_offset_is_unalloc_cluster(cluster) && +            !qed_offset_is_zero_cluster(cluster)) { +            cluster += s->header.cluster_size; +        } +    } +} + +static void qed_aio_complete_bh(void *opaque) +{ +    QEDAIOCB *acb = opaque; +    BlockDriverCompletionFunc *cb = acb->common.cb; +    void *user_opaque = acb->common.opaque; +    int ret = acb->bh_ret; +    bool *finished = acb->finished; + +    qemu_bh_delete(acb->bh); +    qemu_aio_release(acb); + +    /* Invoke callback */ +    cb(user_opaque, ret); + +    /* Signal cancel completion */ +    if (finished) { +        *finished = true; +    } +} + +static void qed_aio_complete(QEDAIOCB *acb, int ret) +{ +    BDRVQEDState *s = acb_to_s(acb); + +    trace_qed_aio_complete(s, acb, ret); + +    /* Free resources */ +    qemu_iovec_destroy(&acb->cur_qiov); +    qed_unref_l2_cache_entry(acb->request.l2_table); + +    /* Free the buffer we may have allocated for zero writes */ +    if (acb->flags & QED_AIOCB_ZERO) { +        qemu_vfree(acb->qiov->iov[0].iov_base); +        acb->qiov->iov[0].iov_base = NULL; +    } + +    /* Arrange for a bh to invoke the completion function */ +    acb->bh_ret = ret; +    acb->bh = qemu_bh_new(qed_aio_complete_bh, acb); +    qemu_bh_schedule(acb->bh); + +    /* Start next allocating write request waiting behind this one.  Note that +     * requests enqueue themselves when they first hit an unallocated cluster +     * but they wait until the entire request is finished before waking up the +     * next request in the queue.  This ensures that we don't cycle through +     * requests multiple times but rather finish one at a time completely. 
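+     *
+     * The head of allocating_write_reqs is always the request currently being
+     * serviced; qed_aio_write_alloc() queues newcomers behind it and they are
+     * woken up here, one per completed request.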
+     */ +    if (acb == QSIMPLEQ_FIRST(&s->allocating_write_reqs)) { +        QSIMPLEQ_REMOVE_HEAD(&s->allocating_write_reqs, next); +        acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs); +        if (acb) { +            qed_aio_next_io(acb, 0); +        } else if (s->header.features & QED_F_NEED_CHECK) { +            qed_start_need_check_timer(s); +        } +    } +} + +/** + * Commit the current L2 table to the cache + */ +static void qed_commit_l2_update(void *opaque, int ret) +{ +    QEDAIOCB *acb = opaque; +    BDRVQEDState *s = acb_to_s(acb); +    CachedL2Table *l2_table = acb->request.l2_table; +    uint64_t l2_offset = l2_table->offset; + +    qed_commit_l2_cache_entry(&s->l2_cache, l2_table); + +    /* This is guaranteed to succeed because we just committed the entry to the +     * cache. +     */ +    acb->request.l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset); +    assert(acb->request.l2_table != NULL); + +    qed_aio_next_io(opaque, ret); +} + +/** + * Update L1 table with new L2 table offset and write it out + */ +static void qed_aio_write_l1_update(void *opaque, int ret) +{ +    QEDAIOCB *acb = opaque; +    BDRVQEDState *s = acb_to_s(acb); +    int index; + +    if (ret) { +        qed_aio_complete(acb, ret); +        return; +    } + +    index = qed_l1_index(s, acb->cur_pos); +    s->l1_table->offsets[index] = acb->request.l2_table->offset; + +    qed_write_l1_table(s, index, 1, qed_commit_l2_update, acb); +} + +/** + * Update L2 table with new cluster offsets and write them out + */ +static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset) +{ +    BDRVQEDState *s = acb_to_s(acb); +    bool need_alloc = acb->find_cluster_ret == QED_CLUSTER_L1; +    int index; + +    if (ret) { +        goto err; +    } + +    if (need_alloc) { +        qed_unref_l2_cache_entry(acb->request.l2_table); +        acb->request.l2_table = qed_new_l2_table(s); +    } + +    index = qed_l2_index(s, acb->cur_pos); +    qed_update_l2_table(s, acb->request.l2_table->table, index, acb->cur_nclusters, +                         offset); + +    if (need_alloc) { +        /* Write out the whole new L2 table */ +        qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true, +                            qed_aio_write_l1_update, acb); +    } else { +        /* Write out only the updated part of the L2 table */ +        qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters, false, +                            qed_aio_next_io, acb); +    } +    return; + +err: +    qed_aio_complete(acb, ret); +} + +static void qed_aio_write_l2_update_cb(void *opaque, int ret) +{ +    QEDAIOCB *acb = opaque; +    qed_aio_write_l2_update(acb, ret, acb->cur_cluster); +} + +/** + * Flush new data clusters before updating the L2 table + * + * This flush is necessary when a backing file is in use.  A crash during an + * allocating write could result in empty clusters in the image.  If the write + * only touched a subregion of the cluster, then backing image sectors have + * been lost in the untouched region.  The solution is to flush after writing a + * new data cluster and before updating the L2 table. 
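+ *
+ * The flush enforces write ordering: the copied-in data must reach the disk
+ * before the L2 table entry that points at it, otherwise a crash could leave
+ * a valid-looking L2 entry pointing at a cluster whose untouched region was
+ * never populated from the backing file.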
+ */ +static void qed_aio_write_flush_before_l2_update(void *opaque, int ret) +{ +    QEDAIOCB *acb = opaque; +    BDRVQEDState *s = acb_to_s(acb); + +    if (!bdrv_aio_flush(s->bs->file, qed_aio_write_l2_update_cb, opaque)) { +        qed_aio_complete(acb, -EIO); +    } +} + +/** + * Write data to the image file + */ +static void qed_aio_write_main(void *opaque, int ret) +{ +    QEDAIOCB *acb = opaque; +    BDRVQEDState *s = acb_to_s(acb); +    uint64_t offset = acb->cur_cluster + +                      qed_offset_into_cluster(s, acb->cur_pos); +    BlockDriverCompletionFunc *next_fn; + +    trace_qed_aio_write_main(s, acb, ret, offset, acb->cur_qiov.size); + +    if (ret) { +        qed_aio_complete(acb, ret); +        return; +    } + +    if (acb->find_cluster_ret == QED_CLUSTER_FOUND) { +        next_fn = qed_aio_next_io; +    } else { +        if (s->bs->backing_hd) { +            next_fn = qed_aio_write_flush_before_l2_update; +        } else { +            next_fn = qed_aio_write_l2_update_cb; +        } +    } + +    BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO); +    bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE, +                    &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE, +                    next_fn, acb); +} + +/** + * Populate back untouched region of new data cluster + */ +static void qed_aio_write_postfill(void *opaque, int ret) +{ +    QEDAIOCB *acb = opaque; +    BDRVQEDState *s = acb_to_s(acb); +    uint64_t start = acb->cur_pos + acb->cur_qiov.size; +    uint64_t len = +        qed_start_of_cluster(s, start + s->header.cluster_size - 1) - start; +    uint64_t offset = acb->cur_cluster + +                      qed_offset_into_cluster(s, acb->cur_pos) + +                      acb->cur_qiov.size; + +    if (ret) { +        qed_aio_complete(acb, ret); +        return; +    } + +    trace_qed_aio_write_postfill(s, acb, start, len, offset); +    qed_copy_from_backing_file(s, start, len, offset, +                                qed_aio_write_main, acb); +} + +/** + * Populate front untouched region of new data cluster + */ +static void qed_aio_write_prefill(void *opaque, int ret) +{ +    QEDAIOCB *acb = opaque; +    BDRVQEDState *s = acb_to_s(acb); +    uint64_t start = qed_start_of_cluster(s, acb->cur_pos); +    uint64_t len = qed_offset_into_cluster(s, acb->cur_pos); + +    trace_qed_aio_write_prefill(s, acb, start, len, acb->cur_cluster); +    qed_copy_from_backing_file(s, start, len, acb->cur_cluster, +                                qed_aio_write_postfill, acb); +} + +/** + * Check if the QED_F_NEED_CHECK bit should be set during allocating write + */ +static bool qed_should_set_need_check(BDRVQEDState *s) +{ +    /* The flush before L2 update path ensures consistency */ +    if (s->bs->backing_hd) { +        return false; +    } + +    return !(s->header.features & QED_F_NEED_CHECK); +} + +static void qed_aio_write_zero_cluster(void *opaque, int ret) +{ +    QEDAIOCB *acb = opaque; + +    if (ret) { +        qed_aio_complete(acb, ret); +        return; +    } + +    qed_aio_write_l2_update(acb, 0, 1); +} + +/** + * Write new data cluster + * + * @acb:        Write request + * @len:        Length in bytes + * + * This path is taken when writing to previously unallocated clusters. 
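+ *
+ * This covers the QED_CLUSTER_L2, QED_CLUSTER_L1 and QED_CLUSTER_ZERO results
+ * from qed_find_cluster().  For a regular allocating write the callback chain
+ * is roughly:
+ *
+ *   qed_aio_write_prefill() -> qed_aio_write_postfill() -> qed_aio_write_main()
+ *     -> [flush, if a backing file is present] -> qed_aio_write_l2_update()
+ *     -> [qed_aio_write_l1_update(), if a new L2 table was allocated]
+ *     -> qed_aio_next_io()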
+ */ +static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len) +{ +    BDRVQEDState *s = acb_to_s(acb); +    BlockDriverCompletionFunc *cb; + +    /* Cancel timer when the first allocating request comes in */ +    if (QSIMPLEQ_EMPTY(&s->allocating_write_reqs)) { +        qed_cancel_need_check_timer(s); +    } + +    /* Freeze this request if another allocating write is in progress */ +    if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs)) { +        QSIMPLEQ_INSERT_TAIL(&s->allocating_write_reqs, acb, next); +    } +    if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs) || +        s->allocating_write_reqs_plugged) { +        return; /* wait for existing request to finish */ +    } + +    acb->cur_nclusters = qed_bytes_to_clusters(s, +            qed_offset_into_cluster(s, acb->cur_pos) + len); +    qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len); + +    if (acb->flags & QED_AIOCB_ZERO) { +        /* Skip ahead if the clusters are already zero */ +        if (acb->find_cluster_ret == QED_CLUSTER_ZERO) { +            qed_aio_next_io(acb, 0); +            return; +        } + +        cb = qed_aio_write_zero_cluster; +    } else { +        cb = qed_aio_write_prefill; +        acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters); +    } + +    if (qed_should_set_need_check(s)) { +        s->header.features |= QED_F_NEED_CHECK; +        qed_write_header(s, cb, acb); +    } else { +        cb(acb, 0); +    } +} + +/** + * Write data cluster in place + * + * @acb:        Write request + * @offset:     Cluster offset in bytes + * @len:        Length in bytes + * + * This path is taken when writing to already allocated clusters. + */ +static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len) +{ +    /* Allocate buffer for zero writes */ +    if (acb->flags & QED_AIOCB_ZERO) { +        struct iovec *iov = acb->qiov->iov; + +        if (!iov->iov_base) { +            iov->iov_base = qemu_blockalign(acb->common.bs, iov->iov_len); +            memset(iov->iov_base, 0, iov->iov_len); +        } +    } + +    /* Calculate the I/O vector */ +    acb->cur_cluster = offset; +    qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len); + +    /* Do the actual write */ +    qed_aio_write_main(acb, 0); +} + +/** + * Write data cluster + * + * @opaque:     Write request + * @ret:        QED_CLUSTER_FOUND, QED_CLUSTER_L2, QED_CLUSTER_L1, + *              or -errno + * @offset:     Cluster offset in bytes + * @len:        Length in bytes + * + * Callback from qed_find_cluster(). + */ +static void qed_aio_write_data(void *opaque, int ret, +                               uint64_t offset, size_t len) +{ +    QEDAIOCB *acb = opaque; + +    trace_qed_aio_write_data(acb_to_s(acb), acb, ret, offset, len); + +    acb->find_cluster_ret = ret; + +    switch (ret) { +    case QED_CLUSTER_FOUND: +        qed_aio_write_inplace(acb, offset, len); +        break; + +    case QED_CLUSTER_L2: +    case QED_CLUSTER_L1: +    case QED_CLUSTER_ZERO: +        qed_aio_write_alloc(acb, len); +        break; + +    default: +        qed_aio_complete(acb, ret); +        break; +    } +} + +/** + * Read data cluster + * + * @opaque:     Read request + * @ret:        QED_CLUSTER_FOUND, QED_CLUSTER_L2, QED_CLUSTER_L1, + *              or -errno + * @offset:     Cluster offset in bytes + * @len:        Length in bytes + * + * Callback from qed_find_cluster(). 
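+ *
+ * QED_CLUSTER_ZERO is satisfied by zero-filling the destination vector,
+ * unallocated clusters (L1/L2 misses) fall through to the backing file (or
+ * zeroes when there is no backing file), and only QED_CLUSTER_FOUND issues a
+ * read from the image file itself.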
+ */ +static void qed_aio_read_data(void *opaque, int ret, +                              uint64_t offset, size_t len) +{ +    QEDAIOCB *acb = opaque; +    BDRVQEDState *s = acb_to_s(acb); +    BlockDriverState *bs = acb->common.bs; + +    /* Adjust offset into cluster */ +    offset += qed_offset_into_cluster(s, acb->cur_pos); + +    trace_qed_aio_read_data(s, acb, ret, offset, len); + +    if (ret < 0) { +        goto err; +    } + +    qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len); + +    /* Handle zero cluster and backing file reads */ +    if (ret == QED_CLUSTER_ZERO) { +        qemu_iovec_memset(&acb->cur_qiov, 0, 0, acb->cur_qiov.size); +        qed_aio_next_io(acb, 0); +        return; +    } else if (ret != QED_CLUSTER_FOUND) { +        qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov, +                              qed_aio_next_io, acb); +        return; +    } + +    BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); +    bdrv_aio_readv(bs->file, offset / BDRV_SECTOR_SIZE, +                   &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE, +                   qed_aio_next_io, acb); +    return; + +err: +    qed_aio_complete(acb, ret); +} + +/** + * Begin next I/O or complete the request + */ +static void qed_aio_next_io(void *opaque, int ret) +{ +    QEDAIOCB *acb = opaque; +    BDRVQEDState *s = acb_to_s(acb); +    QEDFindClusterFunc *io_fn = (acb->flags & QED_AIOCB_WRITE) ? +                                qed_aio_write_data : qed_aio_read_data; + +    trace_qed_aio_next_io(s, acb, ret, acb->cur_pos + acb->cur_qiov.size); + +    /* Handle I/O error */ +    if (ret) { +        qed_aio_complete(acb, ret); +        return; +    } + +    acb->qiov_offset += acb->cur_qiov.size; +    acb->cur_pos += acb->cur_qiov.size; +    qemu_iovec_reset(&acb->cur_qiov); + +    /* Complete request */ +    if (acb->cur_pos >= acb->end_pos) { +        qed_aio_complete(acb, 0); +        return; +    } + +    /* Find next cluster and start I/O */ +    qed_find_cluster(s, &acb->request, +                      acb->cur_pos, acb->end_pos - acb->cur_pos, +                      io_fn, acb); +} + +static BlockDriverAIOCB *qed_aio_setup(BlockDriverState *bs, +                                       int64_t sector_num, +                                       QEMUIOVector *qiov, int nb_sectors, +                                       BlockDriverCompletionFunc *cb, +                                       void *opaque, int flags) +{ +    QEDAIOCB *acb = qemu_aio_get(&qed_aiocb_info, bs, cb, opaque); + +    trace_qed_aio_setup(bs->opaque, acb, sector_num, nb_sectors, +                        opaque, flags); + +    acb->flags = flags; +    acb->finished = NULL; +    acb->qiov = qiov; +    acb->qiov_offset = 0; +    acb->cur_pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE; +    acb->end_pos = acb->cur_pos + nb_sectors * BDRV_SECTOR_SIZE; +    acb->request.l2_table = NULL; +    qemu_iovec_init(&acb->cur_qiov, qiov->niov); + +    /* Start request */ +    qed_aio_next_io(acb, 0); +    return &acb->common; +} + +static BlockDriverAIOCB *bdrv_qed_aio_readv(BlockDriverState *bs, +                                            int64_t sector_num, +                                            QEMUIOVector *qiov, int nb_sectors, +                                            BlockDriverCompletionFunc *cb, +                                            void *opaque) +{ +    return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); +} + +static BlockDriverAIOCB *bdrv_qed_aio_writev(BlockDriverState 
*bs, +                                             int64_t sector_num, +                                             QEMUIOVector *qiov, int nb_sectors, +                                             BlockDriverCompletionFunc *cb, +                                             void *opaque) +{ +    return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, +                         opaque, QED_AIOCB_WRITE); +} + +typedef struct { +    Coroutine *co; +    int ret; +    bool done; +} QEDWriteZeroesCB; + +static void coroutine_fn qed_co_write_zeroes_cb(void *opaque, int ret) +{ +    QEDWriteZeroesCB *cb = opaque; + +    cb->done = true; +    cb->ret = ret; +    if (cb->co) { +        qemu_coroutine_enter(cb->co, NULL); +    } +} + +static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs, +                                                 int64_t sector_num, +                                                 int nb_sectors) +{ +    BlockDriverAIOCB *blockacb; +    BDRVQEDState *s = bs->opaque; +    QEDWriteZeroesCB cb = { .done = false }; +    QEMUIOVector qiov; +    struct iovec iov; + +    /* Refuse if there are untouched backing file sectors */ +    if (bs->backing_hd) { +        if (qed_offset_into_cluster(s, sector_num * BDRV_SECTOR_SIZE) != 0) { +            return -ENOTSUP; +        } +        if (qed_offset_into_cluster(s, nb_sectors * BDRV_SECTOR_SIZE) != 0) { +            return -ENOTSUP; +        } +    } + +    /* Zero writes start without an I/O buffer.  If a buffer becomes necessary +     * then it will be allocated during request processing. +     */ +    iov.iov_base = NULL, +    iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE, + +    qemu_iovec_init_external(&qiov, &iov, 1); +    blockacb = qed_aio_setup(bs, sector_num, &qiov, nb_sectors, +                             qed_co_write_zeroes_cb, &cb, +                             QED_AIOCB_WRITE | QED_AIOCB_ZERO); +    if (!blockacb) { +        return -EIO; +    } +    if (!cb.done) { +        cb.co = qemu_coroutine_self(); +        qemu_coroutine_yield(); +    } +    assert(cb.done); +    return cb.ret; +} + +static int bdrv_qed_truncate(BlockDriverState *bs, int64_t offset) +{ +    BDRVQEDState *s = bs->opaque; +    uint64_t old_image_size; +    int ret; + +    if (!qed_is_image_size_valid(offset, s->header.cluster_size, +                                 s->header.table_size)) { +        return -EINVAL; +    } + +    /* Shrinking is currently not supported */ +    if ((uint64_t)offset < s->header.image_size) { +        return -ENOTSUP; +    } + +    old_image_size = s->header.image_size; +    s->header.image_size = offset; +    ret = qed_write_header_sync(s); +    if (ret < 0) { +        s->header.image_size = old_image_size; +    } +    return ret; +} + +static int64_t bdrv_qed_getlength(BlockDriverState *bs) +{ +    BDRVQEDState *s = bs->opaque; +    return s->header.image_size; +} + +static int bdrv_qed_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ +    BDRVQEDState *s = bs->opaque; + +    memset(bdi, 0, sizeof(*bdi)); +    bdi->cluster_size = s->header.cluster_size; +    bdi->is_dirty = s->header.features & QED_F_NEED_CHECK; +    return 0; +} + +static int bdrv_qed_change_backing_file(BlockDriverState *bs, +                                        const char *backing_file, +                                        const char *backing_fmt) +{ +    BDRVQEDState *s = bs->opaque; +    QEDHeader new_header, le_header; +    void *buffer; +    size_t buffer_len, backing_file_len; +    int ret; + +    /* Refuse to set 
backing filename if unknown compat feature bits are +     * active.  If the image uses an unknown compat feature then we may not +     * know the layout of data following the header structure and cannot safely +     * add a new string. +     */ +    if (backing_file && (s->header.compat_features & +                         ~QED_COMPAT_FEATURE_MASK)) { +        return -ENOTSUP; +    } + +    memcpy(&new_header, &s->header, sizeof(new_header)); + +    new_header.features &= ~(QED_F_BACKING_FILE | +                             QED_F_BACKING_FORMAT_NO_PROBE); + +    /* Adjust feature flags */ +    if (backing_file) { +        new_header.features |= QED_F_BACKING_FILE; + +        if (qed_fmt_is_raw(backing_fmt)) { +            new_header.features |= QED_F_BACKING_FORMAT_NO_PROBE; +        } +    } + +    /* Calculate new header size */ +    backing_file_len = 0; + +    if (backing_file) { +        backing_file_len = strlen(backing_file); +    } + +    buffer_len = sizeof(new_header); +    new_header.backing_filename_offset = buffer_len; +    new_header.backing_filename_size = backing_file_len; +    buffer_len += backing_file_len; + +    /* Make sure we can rewrite header without failing */ +    if (buffer_len > new_header.header_size * new_header.cluster_size) { +        return -ENOSPC; +    } + +    /* Prepare new header */ +    buffer = g_malloc(buffer_len); + +    qed_header_cpu_to_le(&new_header, &le_header); +    memcpy(buffer, &le_header, sizeof(le_header)); +    buffer_len = sizeof(le_header); + +    if (backing_file) { +        memcpy(buffer + buffer_len, backing_file, backing_file_len); +        buffer_len += backing_file_len; +    } + +    /* Write new header */ +    ret = bdrv_pwrite_sync(bs->file, 0, buffer, buffer_len); +    g_free(buffer); +    if (ret == 0) { +        memcpy(&s->header, &new_header, sizeof(new_header)); +    } +    return ret; +} + +static void bdrv_qed_invalidate_cache(BlockDriverState *bs) +{ +    BDRVQEDState *s = bs->opaque; + +    bdrv_qed_close(bs); +    memset(s, 0, sizeof(BDRVQEDState)); +    bdrv_qed_open(bs, NULL, bs->open_flags); +} + +static int bdrv_qed_check(BlockDriverState *bs, BdrvCheckResult *result, +                          BdrvCheckMode fix) +{ +    BDRVQEDState *s = bs->opaque; + +    return qed_check(s, result, !!fix); +} + +static QEMUOptionParameter qed_create_options[] = { +    { +        .name = BLOCK_OPT_SIZE, +        .type = OPT_SIZE, +        .help = "Virtual disk size (in bytes)" +    }, { +        .name = BLOCK_OPT_BACKING_FILE, +        .type = OPT_STRING, +        .help = "File name of a base image" +    }, { +        .name = BLOCK_OPT_BACKING_FMT, +        .type = OPT_STRING, +        .help = "Image format of the base image" +    }, { +        .name = BLOCK_OPT_CLUSTER_SIZE, +        .type = OPT_SIZE, +        .help = "Cluster size (in bytes)", +        .value = { .n = QED_DEFAULT_CLUSTER_SIZE }, +    }, { +        .name = BLOCK_OPT_TABLE_SIZE, +        .type = OPT_SIZE, +        .help = "L1/L2 table size (in clusters)" +    }, +    { /* end of list */ } +}; + +static BlockDriver bdrv_qed = { +    .format_name              = "qed", +    .instance_size            = sizeof(BDRVQEDState), +    .create_options           = qed_create_options, + +    .bdrv_probe               = bdrv_qed_probe, +    .bdrv_rebind              = bdrv_qed_rebind, +    .bdrv_open                = bdrv_qed_open, +    .bdrv_close               = bdrv_qed_close, +    .bdrv_reopen_prepare      = bdrv_qed_reopen_prepare, +    .bdrv_create              = 
bdrv_qed_create, +    .bdrv_has_zero_init       = bdrv_has_zero_init_1, +    .bdrv_co_is_allocated     = bdrv_qed_co_is_allocated, +    .bdrv_make_empty          = bdrv_qed_make_empty, +    .bdrv_aio_readv           = bdrv_qed_aio_readv, +    .bdrv_aio_writev          = bdrv_qed_aio_writev, +    .bdrv_co_write_zeroes     = bdrv_qed_co_write_zeroes, +    .bdrv_truncate            = bdrv_qed_truncate, +    .bdrv_getlength           = bdrv_qed_getlength, +    .bdrv_get_info            = bdrv_qed_get_info, +    .bdrv_change_backing_file = bdrv_qed_change_backing_file, +    .bdrv_invalidate_cache    = bdrv_qed_invalidate_cache, +    .bdrv_check               = bdrv_qed_check, +}; + +static void bdrv_qed_init(void) +{ +    bdrv_register(&bdrv_qed); +} + +block_init(bdrv_qed_init); diff --git a/contrib/qemu/block/qed.h b/contrib/qemu/block/qed.h new file mode 100644 index 000000000..2b4ddedf3 --- /dev/null +++ b/contrib/qemu/block/qed.h @@ -0,0 +1,344 @@ +/* + * QEMU Enhanced Disk Format + * + * Copyright IBM, Corp. 2010 + * + * Authors: + *  Stefan Hajnoczi   <stefanha@linux.vnet.ibm.com> + *  Anthony Liguori   <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#ifndef BLOCK_QED_H +#define BLOCK_QED_H + +#include "block/block_int.h" + +/* The layout of a QED file is as follows: + * + * +--------+----------+----------+----------+-----+ + * | header | L1 table | cluster0 | cluster1 | ... | + * +--------+----------+----------+----------+-----+ + * + * There is a 2-level pagetable for cluster allocation: + * + *                     +----------+ + *                     | L1 table | + *                     +----------+ + *                ,------'  |  '------. + *           +----------+   |    +----------+ + *           | L2 table |  ...   | L2 table | + *           +----------+        +----------+ + *       ,------'  |  '------. + *  +----------+   |    +----------+ + *  |   Data   |  ...   |   Data   | + *  +----------+        +----------+ + * + * The L1 table is fixed size and always present.  L2 tables are allocated on + * demand.  The L1 table size determines the maximum possible image size; it + * can be influenced using the cluster_size and table_size values. + * + * All fields are little-endian on disk. + */ + +enum { +    QED_MAGIC = 'Q' | 'E' << 8 | 'D' << 16 | '\0' << 24, + +    /* The image supports a backing file */ +    QED_F_BACKING_FILE = 0x01, + +    /* The image needs a consistency check before use */ +    QED_F_NEED_CHECK = 0x02, + +    /* The backing file format must not be probed, treat as raw image */ +    QED_F_BACKING_FORMAT_NO_PROBE = 0x04, + +    /* Feature bits must be used when the on-disk format changes */ +    QED_FEATURE_MASK = QED_F_BACKING_FILE | /* supported feature bits */ +                       QED_F_NEED_CHECK | +                       QED_F_BACKING_FORMAT_NO_PROBE, +    QED_COMPAT_FEATURE_MASK = 0,            /* supported compat feature bits */ +    QED_AUTOCLEAR_FEATURE_MASK = 0,         /* supported autoclear feature bits */ + +    /* Data is stored in groups of sectors called clusters.  Cluster size must +     * be large to avoid keeping too much metadata.  I/O requests that have +     * sub-cluster size will require read-modify-write. 
+     */ +    QED_MIN_CLUSTER_SIZE = 4 * 1024, /* in bytes */ +    QED_MAX_CLUSTER_SIZE = 64 * 1024 * 1024, +    QED_DEFAULT_CLUSTER_SIZE = 64 * 1024, + +    /* Allocated clusters are tracked using a 2-level pagetable.  Table size is +     * a multiple of clusters so large maximum image sizes can be supported +     * without jacking up the cluster size too much. +     */ +    QED_MIN_TABLE_SIZE = 1,        /* in clusters */ +    QED_MAX_TABLE_SIZE = 16, +    QED_DEFAULT_TABLE_SIZE = 4, + +    /* Delay to flush and clean image after last allocating write completes */ +    QED_NEED_CHECK_TIMEOUT = 5,    /* in seconds */ +}; + +typedef struct { +    uint32_t magic;                 /* QED\0 */ + +    uint32_t cluster_size;          /* in bytes */ +    uint32_t table_size;            /* for L1 and L2 tables, in clusters */ +    uint32_t header_size;           /* in clusters */ + +    uint64_t features;              /* format feature bits */ +    uint64_t compat_features;       /* compatible feature bits */ +    uint64_t autoclear_features;    /* self-resetting feature bits */ + +    uint64_t l1_table_offset;       /* in bytes */ +    uint64_t image_size;            /* total logical image size, in bytes */ + +    /* if (features & QED_F_BACKING_FILE) */ +    uint32_t backing_filename_offset; /* in bytes from start of header */ +    uint32_t backing_filename_size;   /* in bytes */ +} QEDHeader; + +typedef struct { +    uint64_t offsets[0];            /* in bytes */ +} QEDTable; + +/* The L2 cache is a simple write-through cache for L2 structures */ +typedef struct CachedL2Table { +    QEDTable *table; +    uint64_t offset;    /* offset=0 indicates an invalidate entry */ +    QTAILQ_ENTRY(CachedL2Table) node; +    int ref; +} CachedL2Table; + +typedef struct { +    QTAILQ_HEAD(, CachedL2Table) entries; +    unsigned int n_entries; +} L2TableCache; + +typedef struct QEDRequest { +    CachedL2Table *l2_table; +} QEDRequest; + +enum { +    QED_AIOCB_WRITE = 0x0001,       /* read or write? 
*/ +    QED_AIOCB_ZERO  = 0x0002,       /* zero write, used with QED_AIOCB_WRITE */ +}; + +typedef struct QEDAIOCB { +    BlockDriverAIOCB common; +    QEMUBH *bh; +    int bh_ret;                     /* final return status for completion bh */ +    QSIMPLEQ_ENTRY(QEDAIOCB) next;  /* next request */ +    int flags;                      /* QED_AIOCB_* bits ORed together */ +    bool *finished;                 /* signal for cancel completion */ +    uint64_t end_pos;               /* request end on block device, in bytes */ + +    /* User scatter-gather list */ +    QEMUIOVector *qiov; +    size_t qiov_offset;             /* byte count already processed */ + +    /* Current cluster scatter-gather list */ +    QEMUIOVector cur_qiov; +    uint64_t cur_pos;               /* position on block device, in bytes */ +    uint64_t cur_cluster;           /* cluster offset in image file */ +    unsigned int cur_nclusters;     /* number of clusters being accessed */ +    int find_cluster_ret;           /* used for L1/L2 update */ + +    QEDRequest request; +} QEDAIOCB; + +typedef struct { +    BlockDriverState *bs;           /* device */ +    uint64_t file_size;             /* length of image file, in bytes */ + +    QEDHeader header;               /* always cpu-endian */ +    QEDTable *l1_table; +    L2TableCache l2_cache;          /* l2 table cache */ +    uint32_t table_nelems; +    uint32_t l1_shift; +    uint32_t l2_shift; +    uint32_t l2_mask; + +    /* Allocating write request queue */ +    QSIMPLEQ_HEAD(, QEDAIOCB) allocating_write_reqs; +    bool allocating_write_reqs_plugged; + +    /* Periodic flush and clear need check flag */ +    QEMUTimer *need_check_timer; +} BDRVQEDState; + +enum { +    QED_CLUSTER_FOUND,         /* cluster found */ +    QED_CLUSTER_ZERO,          /* zero cluster found */ +    QED_CLUSTER_L2,            /* cluster missing in L2 */ +    QED_CLUSTER_L1,            /* cluster missing in L1 */ +}; + +/** + * qed_find_cluster() completion callback + * + * @opaque:     User data for completion callback + * @ret:        QED_CLUSTER_FOUND   Success + *              QED_CLUSTER_L2      Data cluster unallocated in L2 + *              QED_CLUSTER_L1      L2 unallocated in L1 + *              -errno              POSIX error occurred + * @offset:     Data cluster offset + * @len:        Contiguous bytes starting from cluster offset + * + * This function is invoked when qed_find_cluster() completes. + * + * On success ret is QED_CLUSTER_FOUND and offset/len are a contiguous range + * in the image file. + * + * On failure ret is QED_CLUSTER_L2 or QED_CLUSTER_L1 for missing L2 or L1 + * table offset, respectively.  len is number of contiguous unallocated bytes. 
+ */ +typedef void QEDFindClusterFunc(void *opaque, int ret, uint64_t offset, size_t len); + +/** + * Generic callback for chaining async callbacks + */ +typedef struct { +    BlockDriverCompletionFunc *cb; +    void *opaque; +} GenericCB; + +void *gencb_alloc(size_t len, BlockDriverCompletionFunc *cb, void *opaque); +void gencb_complete(void *opaque, int ret); + +/** + * Header functions + */ +int qed_write_header_sync(BDRVQEDState *s); + +/** + * L2 cache functions + */ +void qed_init_l2_cache(L2TableCache *l2_cache); +void qed_free_l2_cache(L2TableCache *l2_cache); +CachedL2Table *qed_alloc_l2_cache_entry(L2TableCache *l2_cache); +void qed_unref_l2_cache_entry(CachedL2Table *entry); +CachedL2Table *qed_find_l2_cache_entry(L2TableCache *l2_cache, uint64_t offset); +void qed_commit_l2_cache_entry(L2TableCache *l2_cache, CachedL2Table *l2_table); + +/** + * Table I/O functions + */ +int qed_read_l1_table_sync(BDRVQEDState *s); +void qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n, +                        BlockDriverCompletionFunc *cb, void *opaque); +int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index, +                            unsigned int n); +int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, +                           uint64_t offset); +void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset, +                       BlockDriverCompletionFunc *cb, void *opaque); +void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request, +                        unsigned int index, unsigned int n, bool flush, +                        BlockDriverCompletionFunc *cb, void *opaque); +int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request, +                            unsigned int index, unsigned int n, bool flush); + +/** + * Cluster functions + */ +void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos, +                      size_t len, QEDFindClusterFunc *cb, void *opaque); + +/** + * Consistency check + */ +int qed_check(BDRVQEDState *s, BdrvCheckResult *result, bool fix); + +QEDTable *qed_alloc_table(BDRVQEDState *s); + +/** + * Round down to the start of a cluster + */ +static inline uint64_t qed_start_of_cluster(BDRVQEDState *s, uint64_t offset) +{ +    return offset & ~(uint64_t)(s->header.cluster_size - 1); +} + +static inline uint64_t qed_offset_into_cluster(BDRVQEDState *s, uint64_t offset) +{ +    return offset & (s->header.cluster_size - 1); +} + +static inline uint64_t qed_bytes_to_clusters(BDRVQEDState *s, uint64_t bytes) +{ +    return qed_start_of_cluster(s, bytes + (s->header.cluster_size - 1)) / +           (s->header.cluster_size - 1); +} + +static inline unsigned int qed_l1_index(BDRVQEDState *s, uint64_t pos) +{ +    return pos >> s->l1_shift; +} + +static inline unsigned int qed_l2_index(BDRVQEDState *s, uint64_t pos) +{ +    return (pos >> s->l2_shift) & s->l2_mask; +} + +/** + * Test if a cluster offset is valid + */ +static inline bool qed_check_cluster_offset(BDRVQEDState *s, uint64_t offset) +{ +    uint64_t header_size = (uint64_t)s->header.header_size * +                           s->header.cluster_size; + +    if (offset & (s->header.cluster_size - 1)) { +        return false; +    } +    return offset >= header_size && offset < s->file_size; +} + +/** + * Test if a table offset is valid + */ +static inline bool qed_check_table_offset(BDRVQEDState *s, uint64_t offset) +{ +    uint64_t end_offset = offset + (s->header.table_size - 1) * +                          
s->header.cluster_size; + +    /* Overflow check */ +    if (end_offset <= offset) { +        return false; +    } + +    return qed_check_cluster_offset(s, offset) && +           qed_check_cluster_offset(s, end_offset); +} + +static inline bool qed_offset_is_cluster_aligned(BDRVQEDState *s, +                                                 uint64_t offset) +{ +    if (qed_offset_into_cluster(s, offset)) { +        return false; +    } +    return true; +} + +static inline bool qed_offset_is_unalloc_cluster(uint64_t offset) +{ +    if (offset == 0) { +        return true; +    } +    return false; +} + +static inline bool qed_offset_is_zero_cluster(uint64_t offset) +{ +    if (offset == 1) { +        return true; +    } +    return false; +} + +#endif /* BLOCK_QED_H */ diff --git a/contrib/qemu/block/snapshot.c b/contrib/qemu/block/snapshot.c new file mode 100644 index 000000000..6c6d9deea --- /dev/null +++ b/contrib/qemu/block/snapshot.c @@ -0,0 +1,157 @@ +/* + * Block layer snapshot related functions + * + * Copyright (c) 2003-2008 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "block/snapshot.h" +#include "block/block_int.h" + +int bdrv_snapshot_find(BlockDriverState *bs, QEMUSnapshotInfo *sn_info, +                       const char *name) +{ +    QEMUSnapshotInfo *sn_tab, *sn; +    int nb_sns, i, ret; + +    ret = -ENOENT; +    nb_sns = bdrv_snapshot_list(bs, &sn_tab); +    if (nb_sns < 0) { +        return ret; +    } +    for (i = 0; i < nb_sns; i++) { +        sn = &sn_tab[i]; +        if (!strcmp(sn->id_str, name) || !strcmp(sn->name, name)) { +            *sn_info = *sn; +            ret = 0; +            break; +        } +    } +    g_free(sn_tab); +    return ret; +} + +int bdrv_can_snapshot(BlockDriverState *bs) +{ +    BlockDriver *drv = bs->drv; +    if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) { +        return 0; +    } + +    if (!drv->bdrv_snapshot_create) { +        if (bs->file != NULL) { +            return bdrv_can_snapshot(bs->file); +        } +        return 0; +    } + +    return 1; +} + +int bdrv_snapshot_create(BlockDriverState *bs, +                         QEMUSnapshotInfo *sn_info) +{ +    BlockDriver *drv = bs->drv; +    if (!drv) { +        return -ENOMEDIUM; +    } +    if (drv->bdrv_snapshot_create) { +        return drv->bdrv_snapshot_create(bs, sn_info); +    } +    if (bs->file) { +        return bdrv_snapshot_create(bs->file, sn_info); +    } +    return -ENOTSUP; +} + +int bdrv_snapshot_goto(BlockDriverState *bs, +                       const char *snapshot_id) +{ +    BlockDriver *drv = bs->drv; +    int ret, open_ret; + +    if (!drv) { +        return -ENOMEDIUM; +    } +    if (drv->bdrv_snapshot_goto) { +        return drv->bdrv_snapshot_goto(bs, snapshot_id); +    } + +    if (bs->file) { +        drv->bdrv_close(bs); +        ret = bdrv_snapshot_goto(bs->file, snapshot_id); +        open_ret = drv->bdrv_open(bs, NULL, bs->open_flags); +        if (open_ret < 0) { +            bdrv_delete(bs->file); +            bs->drv = NULL; +            return open_ret; +        } +        return ret; +    } + +    return -ENOTSUP; +} + +int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id) +{ +    BlockDriver *drv = bs->drv; +    if (!drv) { +        return -ENOMEDIUM; +    } +    if (drv->bdrv_snapshot_delete) { +        return drv->bdrv_snapshot_delete(bs, snapshot_id); +    } +    if (bs->file) { +        return bdrv_snapshot_delete(bs->file, snapshot_id); +    } +    return -ENOTSUP; +} + +int bdrv_snapshot_list(BlockDriverState *bs, +                       QEMUSnapshotInfo **psn_info) +{ +    BlockDriver *drv = bs->drv; +    if (!drv) { +        return -ENOMEDIUM; +    } +    if (drv->bdrv_snapshot_list) { +        return drv->bdrv_snapshot_list(bs, psn_info); +    } +    if (bs->file) { +        return bdrv_snapshot_list(bs->file, psn_info); +    } +    return -ENOTSUP; +} + +int bdrv_snapshot_load_tmp(BlockDriverState *bs, +        const char *snapshot_name) +{ +    BlockDriver *drv = bs->drv; +    if (!drv) { +        return -ENOMEDIUM; +    } +    if (!bs->read_only) { +        return -EINVAL; +    } +    if (drv->bdrv_snapshot_load_tmp) { +        return drv->bdrv_snapshot_load_tmp(bs, snapshot_name); +    } +    return -ENOTSUP; +}  | 
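The 2-level table layout described at the top of qed.h above, together with the qed_l1_index()/qed_l2_index()/qed_offset_into_cluster() helpers, fixes the image geometry: each L1/L2 table occupies table_size clusters and therefore holds (cluster_size * table_size) / 8 entries, one L2 table maps table_nelems * cluster_size bytes, and the L1 table caps the image at table_nelems^2 * cluster_size bytes. The standalone sketch below works that arithmetic through for the default cluster and table sizes; the shift/mask derivation is an assumption chosen to be consistent with the index helpers above, not code copied from the driver's open path.

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative only: derive the translation parameters that the qed.h
     * index helpers above rely on, for the default geometry.  The names
     * (table_nelems, l1_shift, ...) mirror BDRVQEDState fields, but nothing
     * here reads a real image. */
    static unsigned int log2u(uint64_t v)
    {
        unsigned int r = 0;
        while (v > 1) {
            v >>= 1;
            r++;
        }
        return r;
    }

    int main(void)
    {
        uint32_t cluster_size = 64 * 1024;   /* QED_DEFAULT_CLUSTER_SIZE */
        uint32_t table_size = 4;             /* QED_DEFAULT_TABLE_SIZE, in clusters */

        /* Entries per table: each entry is a 64-bit offset. */
        uint64_t table_nelems = (uint64_t)cluster_size * table_size / sizeof(uint64_t);

        unsigned int l2_shift = log2u(cluster_size);            /* 16 */
        uint64_t l2_mask = table_nelems - 1;                    /* 0x7fff */
        unsigned int l1_shift = l2_shift + log2u(table_nelems); /* 31 */

        uint64_t pos = 5ULL * 1024 * 1024 * 1024 + 12345;  /* arbitrary guest offset */
        printf("L1 index:            %u\n", (unsigned)(pos >> l1_shift));
        printf("L2 index:            %u\n", (unsigned)((pos >> l2_shift) & l2_mask));
        printf("offset into cluster: %u\n", (unsigned)(pos & (cluster_size - 1)));

        /* The L1 table addresses table_nelems L2 tables, each mapping
         * table_nelems clusters: 32768 * 32768 * 64 KiB = 64 TiB here. */
        uint64_t max_image = table_nelems * table_nelems * cluster_size;
        printf("max image size:      %llu bytes\n", (unsigned long long)max_image);
        return 0;
    }

That ceiling is presumably what qed_is_image_size_valid() enforces when bdrv_qed_truncate() above grows the image: a larger cluster_size or table_size raises it, at the cost of coarser allocation.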
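An L2 entry in qed.h above encodes three cases: 0 is an unallocated cluster (the read path in qed_aio_read_data() falls through to the backing file), 1 is a cluster that reads as zeroes, and any other value is the byte offset of a data cluster, which qed_check_cluster_offset() requires to be cluster-aligned and to lie between the header area and the end of the file. Below is a small self-contained sketch of that classification; classify_l2_entry() is a made-up name for illustration and not part of the driver.

    #include <stdint.h>
    #include <stdio.h>

    enum { ENTRY_DATA, ENTRY_ZERO, ENTRY_UNALLOC, ENTRY_INVALID };

    /* Mirrors the qed_offset_is_unalloc_cluster(), qed_offset_is_zero_cluster()
     * and qed_check_cluster_offset() logic above, folded into one function. */
    static int classify_l2_entry(uint64_t entry, uint32_t cluster_size,
                                 uint64_t header_bytes, uint64_t file_size)
    {
        if (entry == 0) {
            return ENTRY_UNALLOC;   /* serviced from the backing file (or zeroes) */
        }
        if (entry == 1) {
            return ENTRY_ZERO;      /* explicitly zeroed cluster */
        }
        if ((entry & (cluster_size - 1)) != 0 ||
            entry < header_bytes || entry >= file_size) {
            return ENTRY_INVALID;   /* corrupt table entry */
        }
        return ENTRY_DATA;          /* read/write at this image file offset */
    }

    int main(void)
    {
        uint32_t cluster_size = 64 * 1024;
        uint64_t header_bytes = 1 * cluster_size;   /* header_size = 1 cluster, assumed */
        uint64_t file_size = 16 * 1024 * 1024;

        printf("%d %d %d %d\n",
               classify_l2_entry(0, cluster_size, header_bytes, file_size),
               classify_l2_entry(1, cluster_size, header_bytes, file_size),
               classify_l2_entry(3 * cluster_size, cluster_size, header_bytes, file_size),
               classify_l2_entry(3 * cluster_size + 512, cluster_size, header_bytes, file_size));
        return 0;
    }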
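bdrv_qed_change_backing_file() above lays the rewritten header out as the fixed QEDHeader followed immediately by the backing file name, and returns -ENOSPC when that pair exceeds header_size * cluster_size bytes. Summing the field widths declared for QEDHeader in qed.h gives 64 bytes for the fixed part, so with a one-cluster header (an assumption for this sketch) and the default 64 KiB clusters the name can be at most 65472 bytes:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t cluster_size = 64 * 1024;   /* QED_DEFAULT_CLUSTER_SIZE */
        uint32_t header_size = 1;            /* in clusters; assumed, not read from an image */
        size_t fixed_header = 64;            /* sum of the QEDHeader field sizes above */

        size_t budget = (size_t)header_size * cluster_size;
        printf("room for backing filename: %zu bytes\n", budget - fixed_header);
        return 0;
    }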