Commit 585f8587 authored by bellard's avatar bellard
Browse files

new qcow2 disk image format


git-svn-id: svn://svn.savannah.nongnu.org/qemu/trunk@2083 c046a42c-6fe2-441c-8c8c-71466251a162
parent 9e46cfa4
/*
* Block driver for the QCOW version 2 format
*
* Copyright (c) 2004-2006 Fabrice Bellard
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include "vl.h"
#include "block_int.h"
#include <zlib.h>
#include "aes.h"
#include <assert.h>
/*
Differences with QCOW:
- Support for multiple incremental snapshots.
- Memory management by reference counts.
- Clusters which have a reference count of one have the bit
QCOW_OFLAG_COPIED to optimize write performance.
- Size of compressed clusters is stored in sectors to reduce bit usage
in the cluster offsets.
- Support for storing additional data (such as the VM state) in the
snapshots.
- If a backing store is used, the cluster size is not constrained
(could be backported to QCOW).
- L2 tables have always a size of one cluster.
*/
//#define DEBUG_ALLOC
//#define DEBUG_ALLOC2
#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
#define QCOW_VERSION 2
#define QCOW_CRYPT_NONE 0
#define QCOW_CRYPT_AES 1
/* indicate that the refcount of the referenced cluster is exactly one. */
#define QCOW_OFLAG_COPIED (1LL << 63)
/* indicate that the cluster is compressed (they never have the copied flag) */
#define QCOW_OFLAG_COMPRESSED (1LL << 62)
#define REFCOUNT_SHIFT 1 /* refcount size is 2 bytes */
#ifndef offsetof
#define offsetof(type, field) ((size_t) &((type *)0)->field)
#endif
typedef struct QCowHeader {
uint32_t magic;
uint32_t version;
uint64_t backing_file_offset;
uint32_t backing_file_size;
uint32_t cluster_bits;
uint64_t size; /* in bytes */
uint32_t crypt_method;
uint32_t l1_size; /* XXX: save number of clusters instead ? */
uint64_t l1_table_offset;
uint64_t refcount_table_offset;
uint32_t refcount_table_clusters;
uint32_t nb_snapshots;
uint64_t snapshots_offset;
} QCowHeader;
typedef struct __attribute__((packed)) QCowSnapshotHeader {
/* header is 8 byte aligned */
uint64_t l1_table_offset;
uint32_t l1_size;
uint16_t id_str_size;
uint16_t name_size;
uint32_t date_sec;
uint32_t date_nsec;
uint64_t vm_clock_nsec;
uint32_t vm_state_size;
uint32_t extra_data_size; /* for extension */
/* extra data follows */
/* id_str follows */
/* name follows */
} QCowSnapshotHeader;
#define L2_CACHE_SIZE 16
typedef struct QCowSnapshot {
uint64_t l1_table_offset;
uint32_t l1_size;
char *id_str;
char *name;
uint32_t vm_state_size;
uint32_t date_sec;
uint32_t date_nsec;
uint64_t vm_clock_nsec;
} QCowSnapshot;
typedef struct BDRVQcowState {
BlockDriverState *hd;
int cluster_bits;
int cluster_size;
int cluster_sectors;
int l2_bits;
int l2_size;
int l1_size;
int l1_vm_state_index;
int csize_shift;
int csize_mask;
uint64_t cluster_offset_mask;
uint64_t l1_table_offset;
uint64_t *l1_table;
uint64_t *l2_cache;
uint64_t l2_cache_offsets[L2_CACHE_SIZE];
uint32_t l2_cache_counts[L2_CACHE_SIZE];
uint8_t *cluster_cache;
uint8_t *cluster_data;
uint64_t cluster_cache_offset;
uint64_t *refcount_table;
uint64_t refcount_table_offset;
uint32_t refcount_table_size;
uint64_t refcount_block_cache_offset;
uint16_t *refcount_block_cache;
int64_t free_cluster_index;
int64_t free_byte_offset;
uint32_t crypt_method; /* current crypt method, 0 if no key yet */
uint32_t crypt_method_header;
AES_KEY aes_encrypt_key;
AES_KEY aes_decrypt_key;
uint64_t snapshots_offset;
int snapshots_size;
int nb_snapshots;
QCowSnapshot *snapshots;
} BDRVQcowState;
static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset);
static int qcow_read(BlockDriverState *bs, int64_t sector_num,
uint8_t *buf, int nb_sectors);
static int qcow_read_snapshots(BlockDriverState *bs);
static void qcow_free_snapshots(BlockDriverState *bs);
static int refcount_init(BlockDriverState *bs);
static void refcount_close(BlockDriverState *bs);
static int get_refcount(BlockDriverState *bs, int64_t cluster_index);
static int update_cluster_refcount(BlockDriverState *bs,
int64_t cluster_index,
int addend);
static void update_refcount(BlockDriverState *bs,
int64_t offset, int64_t length,
int addend);
static int64_t alloc_clusters(BlockDriverState *bs, int64_t size);
static int64_t alloc_bytes(BlockDriverState *bs, int size);
static void free_clusters(BlockDriverState *bs,
int64_t offset, int64_t size);
#ifdef DEBUG_ALLOC
static void check_refcounts(BlockDriverState *bs);
#endif
static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename)
{
const QCowHeader *cow_header = (const void *)buf;
if (buf_size >= sizeof(QCowHeader) &&
be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
be32_to_cpu(cow_header->version) == QCOW_VERSION)
return 100;
else
return 0;
}
static int qcow_open(BlockDriverState *bs, const char *filename, int flags)
{
BDRVQcowState *s = bs->opaque;
int len, i, shift, ret;
QCowHeader header;
ret = bdrv_file_open(&s->hd, filename, flags);
if (ret < 0)
return ret;
if (bdrv_pread(s->hd, 0, &header, sizeof(header)) != sizeof(header))
goto fail;
be32_to_cpus(&header.magic);
be32_to_cpus(&header.version);
be64_to_cpus(&header.backing_file_offset);
be32_to_cpus(&header.backing_file_size);
be64_to_cpus(&header.size);
be32_to_cpus(&header.cluster_bits);
be32_to_cpus(&header.crypt_method);
be64_to_cpus(&header.l1_table_offset);
be32_to_cpus(&header.l1_size);
be64_to_cpus(&header.refcount_table_offset);
be32_to_cpus(&header.refcount_table_clusters);
be64_to_cpus(&header.snapshots_offset);
be32_to_cpus(&header.nb_snapshots);
if (header.magic != QCOW_MAGIC || header.version != QCOW_VERSION)
goto fail;
if (header.size <= 1 ||
header.cluster_bits < 9 ||
header.cluster_bits > 16)
goto fail;
if (header.crypt_method > QCOW_CRYPT_AES)
goto fail;
s->crypt_method_header = header.crypt_method;
if (s->crypt_method_header)
bs->encrypted = 1;
s->cluster_bits = header.cluster_bits;
s->cluster_size = 1 << s->cluster_bits;
s->cluster_sectors = 1 << (s->cluster_bits - 9);
s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
s->l2_size = 1 << s->l2_bits;
bs->total_sectors = header.size / 512;
s->csize_shift = (62 - (s->cluster_bits - 8));
s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
s->cluster_offset_mask = (1LL << s->csize_shift) - 1;
s->refcount_table_offset = header.refcount_table_offset;
s->refcount_table_size =
header.refcount_table_clusters << (s->cluster_bits - 3);
s->snapshots_offset = header.snapshots_offset;
s->nb_snapshots = header.nb_snapshots;
/* read the level 1 table */
s->l1_size = header.l1_size;
shift = s->cluster_bits + s->l2_bits;
s->l1_vm_state_index = (header.size + (1LL << shift) - 1) >> shift;
/* the L1 table must contain at least enough entries to put
header.size bytes */
if (s->l1_size < s->l1_vm_state_index)
goto fail;
s->l1_table_offset = header.l1_table_offset;
s->l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t));
if (!s->l1_table)
goto fail;
if (bdrv_pread(s->hd, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)) !=
s->l1_size * sizeof(uint64_t))
goto fail;
for(i = 0;i < s->l1_size; i++) {
be64_to_cpus(&s->l1_table[i]);
}
/* alloc L2 cache */
s->l2_cache = qemu_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
if (!s->l2_cache)
goto fail;
s->cluster_cache = qemu_malloc(s->cluster_size);
if (!s->cluster_cache)
goto fail;
/* one more sector for decompressed data alignment */
s->cluster_data = qemu_malloc(s->cluster_size + 512);
if (!s->cluster_data)
goto fail;
s->cluster_cache_offset = -1;
if (refcount_init(bs) < 0)
goto fail;
/* read the backing file name */
if (header.backing_file_offset != 0) {
len = header.backing_file_size;
if (len > 1023)
len = 1023;
if (bdrv_pread(s->hd, header.backing_file_offset, bs->backing_file, len) != len)
goto fail;
bs->backing_file[len] = '\0';
}
if (qcow_read_snapshots(bs) < 0)
goto fail;
#ifdef DEBUG_ALLOC
check_refcounts(bs);
#endif
return 0;
fail:
qcow_free_snapshots(bs);
refcount_close(bs);
qemu_free(s->l1_table);
qemu_free(s->l2_cache);
qemu_free(s->cluster_cache);
qemu_free(s->cluster_data);
bdrv_delete(s->hd);
return -1;
}
static int qcow_set_key(BlockDriverState *bs, const char *key)
{
BDRVQcowState *s = bs->opaque;
uint8_t keybuf[16];
int len, i;
memset(keybuf, 0, 16);
len = strlen(key);
if (len > 16)
len = 16;
/* XXX: we could compress the chars to 7 bits to increase
entropy */
for(i = 0;i < len;i++) {
keybuf[i] = key[i];
}
s->crypt_method = s->crypt_method_header;
if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
return -1;
if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
return -1;
#if 0
/* test */
{
uint8_t in[16];
uint8_t out[16];
uint8_t tmp[16];
for(i=0;i<16;i++)
in[i] = i;
AES_encrypt(in, tmp, &s->aes_encrypt_key);
AES_decrypt(tmp, out, &s->aes_decrypt_key);
for(i = 0; i < 16; i++)
printf(" %02x", tmp[i]);
printf("\n");
for(i = 0; i < 16; i++)
printf(" %02x", out[i]);
printf("\n");
}
#endif
return 0;
}
/* The crypt function is compatible with the linux cryptoloop
algorithm for < 4 GB images. NOTE: out_buf == in_buf is
supported */
static void encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
uint8_t *out_buf, const uint8_t *in_buf,
int nb_sectors, int enc,
const AES_KEY *key)
{
union {
uint64_t ll[2];
uint8_t b[16];
} ivec;
int i;
for(i = 0; i < nb_sectors; i++) {
ivec.ll[0] = cpu_to_le64(sector_num);
ivec.ll[1] = 0;
AES_cbc_encrypt(in_buf, out_buf, 512, key,
ivec.b, enc);
sector_num++;
in_buf += 512;
out_buf += 512;
}
}
static int copy_sectors(BlockDriverState *bs, uint64_t start_sect,
uint64_t cluster_offset, int n_start, int n_end)
{
BDRVQcowState *s = bs->opaque;
int n, ret;
n = n_end - n_start;
if (n <= 0)
return 0;
ret = qcow_read(bs, start_sect + n_start, s->cluster_data, n);
if (ret < 0)
return ret;
if (s->crypt_method) {
encrypt_sectors(s, start_sect + n_start,
s->cluster_data,
s->cluster_data, n, 1,
&s->aes_encrypt_key);
}
ret = bdrv_write(s->hd, (cluster_offset >> 9) + n_start,
s->cluster_data, n);
if (ret < 0)
return ret;
return 0;
}
static void l2_cache_reset(BlockDriverState *bs)
{
BDRVQcowState *s = bs->opaque;
memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));
}
static inline int l2_cache_new_entry(BlockDriverState *bs)
{
BDRVQcowState *s = bs->opaque;
uint32_t min_count;
int min_index, i;
/* find a new entry in the least used one */
min_index = 0;
min_count = 0xffffffff;
for(i = 0; i < L2_CACHE_SIZE; i++) {
if (s->l2_cache_counts[i] < min_count) {
min_count = s->l2_cache_counts[i];
min_index = i;
}
}
return min_index;
}
static int64_t align_offset(int64_t offset, int n)
{
offset = (offset + n - 1) & ~(n - 1);
return offset;
}
static int grow_l1_table(BlockDriverState *bs, int min_size)
{
BDRVQcowState *s = bs->opaque;
int new_l1_size, new_l1_size2, ret, i;
uint64_t *new_l1_table;
uint64_t new_l1_table_offset;
uint64_t data64;
uint32_t data32;
new_l1_size = s->l1_size;
if (min_size <= new_l1_size)
return 0;
while (min_size > new_l1_size) {
new_l1_size = (new_l1_size * 3 + 1) / 2;
}
#ifdef DEBUG_ALLOC2
printf("grow l1_table from %d to %d\n", s->l1_size, new_l1_size);
#endif
new_l1_size2 = sizeof(uint64_t) * new_l1_size;
new_l1_table = qemu_mallocz(new_l1_size2);
if (!new_l1_table)
return -ENOMEM;
memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t));
/* write new table (align to cluster) */
new_l1_table_offset = alloc_clusters(bs, new_l1_size2);
for(i = 0; i < s->l1_size; i++)
new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
ret = bdrv_pwrite(s->hd, new_l1_table_offset, new_l1_table, new_l1_size2);
if (ret != new_l1_size2)
goto fail;
for(i = 0; i < s->l1_size; i++)
new_l1_table[i] = be64_to_cpu(new_l1_table[i]);
/* set new table */
data64 = cpu_to_be64(new_l1_table_offset);
if (bdrv_pwrite(s->hd, offsetof(QCowHeader, l1_table_offset),
&data64, sizeof(data64)) != sizeof(data64))
goto fail;
data32 = cpu_to_be32(new_l1_size);
if (bdrv_pwrite(s->hd, offsetof(QCowHeader, l1_size),
&data32, sizeof(data32)) != sizeof(data32))
goto fail;
qemu_free(s->l1_table);
free_clusters(bs, s->l1_table_offset, s->l1_size * sizeof(uint64_t));
s->l1_table_offset = new_l1_table_offset;
s->l1_table = new_l1_table;
s->l1_size = new_l1_size;
return 0;
fail:
qemu_free(s->l1_table);
return -EIO;
}
/* 'allocate' is:
*
* 0 not to allocate.
*
* 1 to allocate a normal cluster (for sector indexes 'n_start' to
* 'n_end')
*
* 2 to allocate a compressed cluster of size
* 'compressed_size'. 'compressed_size' must be > 0 and <
* cluster_size
*
* return 0 if not allocated.
*/
static uint64_t get_cluster_offset(BlockDriverState *bs,
uint64_t offset, int allocate,
int compressed_size,
int n_start, int n_end)
{
BDRVQcowState *s = bs->opaque;
int min_index, i, j, l1_index, l2_index, ret;
uint64_t l2_offset, *l2_table, cluster_offset, tmp, old_l2_offset;
l1_index = offset >> (s->l2_bits + s->cluster_bits);
if (l1_index >= s->l1_size) {
/* outside l1 table is allowed: we grow the table if needed */
if (!allocate)
return 0;
if (grow_l1_table(bs, l1_index + 1) < 0)
return 0;
}
l2_offset = s->l1_table[l1_index];
if (!l2_offset) {
if (!allocate)
return 0;
l2_allocate:
old_l2_offset = l2_offset;
/* allocate a new l2 entry */
l2_offset = alloc_clusters(bs, s->l2_size * sizeof(uint64_t));
/* update the L1 entry */
s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;
tmp = cpu_to_be64(l2_offset | QCOW_OFLAG_COPIED);
if (bdrv_pwrite(s->hd, s->l1_table_offset + l1_index * sizeof(tmp),
&tmp, sizeof(tmp)) != sizeof(tmp))
return 0;
min_index = l2_cache_new_entry(bs);
l2_table = s->l2_cache + (min_index << s->l2_bits);
if (old_l2_offset == 0) {
memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
} else {
if (bdrv_pread(s->hd, old_l2_offset,
l2_table, s->l2_size * sizeof(uint64_t)) !=
s->l2_size * sizeof(uint64_t))
return 0;
}
if (bdrv_pwrite(s->hd, l2_offset,
l2_table, s->l2_size * sizeof(uint64_t)) !=
s->l2_size * sizeof(uint64_t))
return 0;
} else {
if (!(l2_offset & QCOW_OFLAG_COPIED)) {
if (allocate) {
free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t));
goto l2_allocate;
}
} else {
l2_offset &= ~QCOW_OFLAG_COPIED;
}
for(i = 0; i < L2_CACHE_SIZE; i++) {
if (l2_offset == s->l2_cache_offsets[i]) {
/* increment the hit count */
if (++s->l2_cache_counts[i] == 0xffffffff) {
for(j = 0; j < L2_CACHE_SIZE; j++) {
s->l2_cache_counts[j] >>= 1;
}
}
l2_table = s->l2_cache + (i << s->l2_bits);
goto found;
}
}
/* not found: load a new entry in the least used one */
min_index = l2_cache_new_entry(bs);
l2_table = s->l2_cache + (min_index << s->l2_bits);
if (bdrv_pread(s->hd, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) !=
s->l2_size * sizeof(uint64_t))
return 0;
}
s->l2_cache_offsets[min_index] = l2_offset;
s->l2_cache_counts[min_index] = 1;
found:
l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
cluster_offset = be64_to_cpu(l2_table[l2_index]);
if (!cluster_offset) {
if (!allocate)
return cluster_offset;
} else if (!(cluster_offset & QCOW_OFLAG_COPIED)) {
if (!allocate)
return cluster_offset;
/* free the cluster */
if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
int nb_csectors;
nb_csectors = ((cluster_offset >> s->csize_shift) &
s->csize_mask) + 1;
free_clusters(bs, (cluster_offset & s->cluster_offset_mask) & ~511,
nb_csectors * 512);
} else {
free_clusters(bs, cluster_offset, s->cluster_size);
}
} else {
cluster_offset &= ~QCOW_OFLAG_COPIED;
return cluster_offset;
}
if (allocate == 1) {
/* allocate a new cluster */
cluster_offset = alloc_clusters(bs, s->cluster_size);
/* we must initialize the cluster content which won't be
written */
if ((n_end - n_start) < s->cluster_sectors) {
uint64_t start_sect;
start_sect = (offset & ~(s->cluster_size - 1)) >> 9;
ret = copy_sectors(bs, start_sect,
cluster_offset, 0, n_start);
if (ret < 0)
return 0;
ret = copy_sectors(bs, start_sect,
cluster_offset, n_end, s->cluster_sectors);
if (ret < 0)
return 0;
}
tmp = cpu_to_be64(cluster_offset | QCOW_OFLAG_COPIED);
} else {
int nb_csectors;
cluster_offset = alloc_bytes(bs, compressed_size);
nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) -
(cluster_offset >> 9);
cluster_offset |= QCOW_OFLAG_COMPRESSED |
((uint64_t)nb_csectors << s->csize_shift);
/* compressed clusters never have the copied flag */
tmp = cpu_to_be64(cluster_offset);
}
/* update L2 table */
l2_table[l2_index] = tmp;