/*
 * QEMU live block migration
 *
 * Copyright IBM, Corp. 2009
 *
 * Authors:
 *  Liran Schour   <lirans@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include "qemu-common.h"
#include "block_int.h"
#include "hw/hw.h"
#include "qemu-queue.h"
#include "block-migration.h"
#include <assert.h>

#define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS)

#define BLK_MIG_FLAG_DEVICE_BLOCK       0x01
#define BLK_MIG_FLAG_EOS                0x02

#define MAX_IS_ALLOCATED_SEARCH 65536
#define MAX_BLOCKS_READ 10000
#define BLOCKS_READ_CHANGE 100
#define INITIAL_BLOCKS_READ 100

//#define DEBUG_BLK_MIGRATION

#ifdef DEBUG_BLK_MIGRATION
#define dprintf(fmt, ...) \
    do { printf("blk_migration: " fmt, ## __VA_ARGS__); } while (0)
#else
#define dprintf(fmt, ...) \
    do { } while (0)
#endif

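/* Per-device migration state: tracks the bulk-copy position and the total
   size of one block device taking part in the migration. */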
typedef struct BlkMigDevState {
    BlockDriverState *bs;
    int bulk_completed;
    int shared_base;
    int64_t cur_sector;
    int64_t completed_sectors;
    int64_t total_sectors;
    int64_t dirty;
    QSIMPLEQ_ENTRY(BlkMigDevState) entry;
} BlkMigDevState;

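/* One in-flight chunk: the data buffer and AIO state for a single
   BLOCK_SIZE read from a device. */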
typedef struct BlkMigBlock {
    uint8_t *buf;
    BlkMigDevState *bmds;
    int64_t sector;
    struct iovec iov;
    QEMUIOVector qiov;
    BlockDriverAIOCB *aiocb;
    int ret;
    QSIMPLEQ_ENTRY(BlkMigBlock) entry;
} BlkMigBlock;

typedef struct BlkMigState {
    int blk_enable;
    int shared_base;
    QSIMPLEQ_HEAD(bmds_list, BlkMigDevState) bmds_list;
    QSIMPLEQ_HEAD(blk_list, BlkMigBlock) blk_list;
    int submitted;
    int read_done;
    int transferred;
    int64_t total_sector_sum;
    int64_t print_completion;
} BlkMigState;

static BlkMigState block_mig_state;

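/* Write one block to the migration stream: sector number plus flags,
   the length-prefixed device name, then BLOCK_SIZE bytes of data. */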
static void blk_send(QEMUFile *f, BlkMigBlock * blk)
{
    int len;

    /* sector number and flags */
    qemu_put_be64(f, (blk->sector << BDRV_SECTOR_BITS)
                     | BLK_MIG_FLAG_DEVICE_BLOCK);

    /* device name */
    len = strlen(blk->bmds->bs->device_name);
    qemu_put_byte(f, len);
    qemu_put_buffer(f, (uint8_t *)blk->bmds->bs->device_name, len);

    qemu_put_buffer(f, blk->buf, BLOCK_SIZE);
}

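/* AIO completion callback: queue the finished read on blk_list so that
   flush_blks() can send it, and update the submitted/read_done counters. */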
static void blk_mig_read_cb(void *opaque, int ret)
{
    BlkMigBlock *blk = opaque;

    blk->ret = ret;

    QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);

    block_mig_state.submitted--;
    block_mig_state.read_done++;
    assert(block_mig_state.submitted >= 0);
}

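/* Copy the next chunk of the bulk phase for one device.  With is_async the
   read is submitted through bdrv_aio_readv() and sent later by flush_blks();
   otherwise the chunk is read and sent synchronously.  Returns 1 once the
   device has been fully copied. */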
static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds, int is_async)
{
    int64_t total_sectors = bmds->total_sectors;
    int64_t cur_sector = bmds->cur_sector;
    BlockDriverState *bs = bmds->bs;
    BlkMigBlock *blk;
    int nr_sectors;

    if (bmds->shared_base) {
        while (cur_sector < total_sectors &&
               !bdrv_is_allocated(bs, cur_sector, MAX_IS_ALLOCATED_SEARCH,
                                  &nr_sectors)) {
            cur_sector += nr_sectors;
        }
    }

    if (cur_sector >= total_sectors) {
        bmds->cur_sector = bmds->completed_sectors = total_sectors;
        return 1;
    }

    bmds->completed_sectors = cur_sector;

    cur_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);

    /* we are going to transfer a full block even if it is not allocated */
    nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (total_sectors - cur_sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
        nr_sectors = total_sectors - cur_sector;
    }

    blk = qemu_malloc(sizeof(BlkMigBlock));
    blk->buf = qemu_malloc(BLOCK_SIZE);
    blk->bmds = bmds;
    blk->sector = cur_sector;

    if (is_async) {
        blk->iov.iov_base = blk->buf;
        blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
        qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);

        blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
                                    nr_sectors, blk_mig_read_cb, blk);
        if (!blk->aiocb) {
            goto error;
        }
        block_mig_state.submitted++;
    } else {
        if (bdrv_read(bs, cur_sector, blk->buf, nr_sectors) < 0) {
            goto error;
        }
        blk_send(f, blk);

        qemu_free(blk->buf);
        qemu_free(blk);
    }

    bdrv_reset_dirty(bs, cur_sector, nr_sectors);
    bmds->cur_sector = cur_sector + nr_sectors;

    return (bmds->cur_sector >= total_sectors);

error:
    printf("Error reading sector %" PRId64 "\n", cur_sector);
    qemu_file_set_error(f);
    qemu_free(blk->buf);
    qemu_free(blk);
    return 0;
}

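/* Enable or disable dirty tracking on every device in bmds_list. */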
static void set_dirty_tracking(int enable)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bdrv_set_dirty_tracking(bmds->bs, enable);
    }
}

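/* Reset the global counters and build a BlkMigDevState for every hard disk
   that will take part in the migration. */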
static void init_blk_migration(QEMUFile *f)
{
    BlkMigDevState *bmds;
    BlockDriverState *bs;

    block_mig_state.submitted = 0;
    block_mig_state.read_done = 0;
    block_mig_state.transferred = 0;
    block_mig_state.total_sector_sum = 0;
    block_mig_state.print_completion = 0;

    for (bs = bdrv_first; bs != NULL; bs = bs->next) {
        if (bs->type == BDRV_TYPE_HD) {
            bmds = qemu_mallocz(sizeof(BlkMigDevState));
            bmds->bs = bs;
            bmds->bulk_completed = 0;
            bmds->total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
            bmds->completed_sectors = 0;
            bmds->shared_base = block_mig_state.shared_base;

            block_mig_state.total_sector_sum += bmds->total_sectors;

            if (bmds->shared_base) {
                printf("Start migration for %s with shared base image\n",
                       bs->device_name);
            } else {
                printf("Start full migration for %s\n", bs->device_name);
            }

            QSIMPLEQ_INSERT_TAIL(&block_mig_state.bmds_list, bmds, entry);
        }
    }
}

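/* Advance the bulk phase by one chunk on the first device that still has
   data to copy, and print overall progress.  Returns 0 once every device
   has completed its bulk section. */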
static int blk_mig_save_bulked_block(QEMUFile *f, int is_async)
{
    int64_t completed_sector_sum = 0;
    BlkMigDevState *bmds;
    int ret = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (bmds->bulk_completed == 0) {
            if (mig_save_device_bulk(f, bmds, is_async) == 1) {
                /* completed bulk section for this device */
                bmds->bulk_completed = 1;
            }
            completed_sector_sum += bmds->completed_sectors;
            ret = 1;
            break;
        } else {
            completed_sector_sum += bmds->completed_sectors;
        }
    }

    if (completed_sector_sum >= block_mig_state.print_completion) {
        printf("Completed %" PRId64 " %%\r",
               completed_sector_sum * 100 / block_mig_state.total_sector_sum);
        fflush(stdout);
        block_mig_state.print_completion +=
            (BDRV_SECTORS_PER_DIRTY_CHUNK * 10000);
    }

    return ret;
}

#define MAX_NUM_BLOCKS 4

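/* Final pass: synchronously read and send every chunk that was dirtied
   while the bulk phase was running, clearing the dirty bits as we go. */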
static void blk_mig_save_dirty_blocks(QEMUFile *f)
{
    BlkMigDevState *bmds;
    BlkMigBlock blk;
    int64_t sector;

    blk.buf = qemu_malloc(BLOCK_SIZE);

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        for (sector = 0; sector < bmds->cur_sector;) {
            if (bdrv_get_dirty(bmds->bs, sector)) {
                if (bdrv_read(bmds->bs, sector, blk.buf,
                              BDRV_SECTORS_PER_DIRTY_CHUNK) < 0) {
                    printf("Error reading sector %" PRId64 "\n", sector);
                    qemu_file_set_error(f);
                    qemu_free(blk.buf);
                    return;
                }
                blk.bmds = bmds;
                blk.sector = sector;
                blk_send(f, &blk);

                bdrv_reset_dirty(bmds->bs, sector,
                                 BDRV_SECTORS_PER_DIRTY_CHUNK);
            }
            sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
        }
    }

    qemu_free(blk.buf);
}

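/* Drain completed reads from blk_list into the migration stream, honouring
   the rate limit and propagating any read error to the QEMUFile. */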
static void flush_blks(QEMUFile* f)
{
    BlkMigBlock *blk;

    dprintf("%s Enter submitted %d read_done %d transferred %d\n",
            __FUNCTION__, block_mig_state.submitted, block_mig_state.read_done,
            block_mig_state.transferred);

    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        if (qemu_file_rate_limit(f)) {
            break;
        }
        if (blk->ret < 0) {
            qemu_file_set_error(f);
            break;
        }
        blk_send(f, blk);

        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        qemu_free(blk->buf);
        qemu_free(blk);

        block_mig_state.read_done--;
        block_mig_state.transferred++;
        assert(block_mig_state.read_done >= 0);
    }

    dprintf("%s Exit submitted %d read_done %d transferred %d\n", __FUNCTION__,
            block_mig_state.submitted, block_mig_state.read_done,
            block_mig_state.transferred);
}

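/* Stage 2 is done when no AIO reads are outstanding and every device has
   finished its bulk section. */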
static int is_stage2_completed(void)
{
    BlkMigDevState *bmds;

    if (block_mig_state.submitted > 0) {
        return 0;
    }

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (bmds->bulk_completed == 0) {
            return 0;
        }
    }

    return 1;
}

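/* Free the device and block lists and turn dirty tracking back off. */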
static void blk_mig_cleanup(void)
{
    BlkMigDevState *bmds;
    BlkMigBlock *blk;

    while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry);
        qemu_free(bmds);
    }

    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        qemu_free(blk->buf);
        qemu_free(blk);
    }

    set_dirty_tracking(0);

    printf("\n");
}

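/* Live-migration handler registered via register_savevm_live().  Stage 1
   sets up dirty tracking, stage 2 streams bulk data within the rate limit,
   and stage 3 drains the remaining bulk and dirty blocks before the final
   EOS marker. */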
static int block_save_live(QEMUFile *f, int stage, void *opaque)
{
    dprintf("Enter save live stage %d submitted %d transferred %d\n",
            stage, block_mig_state.submitted, block_mig_state.transferred);

    if (stage < 0) {
        blk_mig_cleanup();
        return 0;
    }

    if (block_mig_state.blk_enable != 1) {
        /* no need to migrate storage */
        qemu_put_be64(f, BLK_MIG_FLAG_EOS);
        return 1;
    }

    if (stage == 1) {
        init_blk_migration(f);

        /* start tracking dirty blocks */
        set_dirty_tracking(1);
    }

    flush_blks(f);

    if (qemu_file_has_error(f)) {
        blk_mig_cleanup();
        return 0;
    }

    /* control the rate of transfer */
    while ((block_mig_state.submitted +
            block_mig_state.read_done) * BLOCK_SIZE <
           qemu_file_get_rate_limit(f)) {
        if (blk_mig_save_bulked_block(f, 1) == 0) {
            /* no more bulk blocks for now */
            break;
        }
    }

    flush_blks(f);

    if (qemu_file_has_error(f)) {
        blk_mig_cleanup();
        return 0;
    }

    if (stage == 3) {
        while (blk_mig_save_bulked_block(f, 0) != 0) {
            /* empty */
        }

        blk_mig_save_dirty_blocks(f);
        blk_mig_cleanup();

        if (qemu_file_has_error(f)) {
            return 0;
        }

        printf("Block migration completed\n");
    }

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    return ((stage == 2) && is_stage2_completed());
}

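/* Incoming side: read (sector, flags, device name, data) records from the
   stream and write each block to the named device until EOS is seen. */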
static int block_load(QEMUFile *f, void *opaque, int version_id)
{
    int len, flags;
    char device_name[256];
    int64_t addr;
    BlockDriverState *bs;
    uint8_t *buf;

    do {
        addr = qemu_get_be64(f);

        flags = addr & ~BDRV_SECTOR_MASK;
        addr >>= BDRV_SECTOR_BITS;

        if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) {
            /* get device name */
            len = qemu_get_byte(f);
            qemu_get_buffer(f, (uint8_t *)device_name, len);
            device_name[len] = '\0';

            bs = bdrv_find(device_name);
            if (!bs) {
                fprintf(stderr, "Error unknown block device %s\n",
                        device_name);
                return -EINVAL;
            }

            buf = qemu_malloc(BLOCK_SIZE);

            qemu_get_buffer(f, buf, BLOCK_SIZE);
            bdrv_write(bs, addr, buf, BDRV_SECTORS_PER_DIRTY_CHUNK);

            qemu_free(buf);
        } else if (!(flags & BLK_MIG_FLAG_EOS)) {
            fprintf(stderr, "Unknown flags\n");
            return -EINVAL;
        }
        if (qemu_file_has_error(f)) {
            return -EIO;
        }
    } while (!(flags & BLK_MIG_FLAG_EOS));

    return 0;
}

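/* Record the block-migration parameters; requesting a shared base image
   implies that block migration itself is enabled. */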
static void block_set_params(int blk_enable, int shared_base, void *opaque)
{
    block_mig_state.blk_enable = blk_enable;
    block_mig_state.shared_base = shared_base;

    /* shared base means that blk_enable = 1 */
    block_mig_state.blk_enable |= shared_base;
}

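/* Initialize the queues and register the live save/load handlers for the
   "block" section of the migration stream. */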
void blk_mig_init(void)
{
    QSIMPLEQ_INIT(&block_mig_state.bmds_list);
    QSIMPLEQ_INIT(&block_mig_state.blk_list);

    register_savevm_live("block", 0, 1, block_set_params, block_save_live,
                         NULL, block_load, &block_mig_state);
}