block-migration.c 12.9 KB
Newer Older
lirans@il.ibm.com's avatar
lirans@il.ibm.com committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
/*
 * QEMU live block migration
 *
 * Copyright IBM, Corp. 2009
 *
 * Authors:
 *  Liran Schour   <lirans@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include "qemu-common.h"
#include "block_int.h"
#include "hw/hw.h"
17
#include "qemu-queue.h"
18
#include "monitor.h"
lirans@il.ibm.com's avatar
lirans@il.ibm.com committed
19
20
21
#include "block-migration.h"
#include <assert.h>

22
#define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS)
lirans@il.ibm.com's avatar
lirans@il.ibm.com committed
23
24
25
26
27
28
29
30
31
32
33
34

/*
 * Flag bits of a migration record. blk_send() ORs them into the 64-bit
 * header word below the sector address shifted by BDRV_SECTOR_BITS, and
 * block_load() extracts them again with ~BDRV_SECTOR_MASK.
 */
/* the record carries a device name followed by one block of data */
#define BLK_MIG_FLAG_DEVICE_BLOCK       0x01
/* end-of-section marker; block_load() stops reading when it sees it */
#define BLK_MIG_FLAG_EOS                0x02

/* sector count handed to bdrv_is_allocated() per probe in the bulk phase */
#define MAX_IS_ALLOCATED_SEARCH 65536
/* NOTE(review): the three tunables below are not referenced in the visible
 * part of this file - confirm whether they are still needed */
#define MAX_BLOCKS_READ 10000
#define BLOCKS_READ_CHANGE 100
#define INITIAL_BLOCKS_READ 100

//#define DEBUG_BLK_MIGRATION

/*
 * Debug tracing helper. Expands to nothing unless DEBUG_BLK_MIGRATION is
 * defined, in which case every message is prefixed with "blk_migration: "
 * and written with printf().
 */
#ifndef DEBUG_BLK_MIGRATION
#define dprintf(fmt, ...) \
    do { } while (0)
#else
#define dprintf(fmt, ...) \
    do { printf("blk_migration: " fmt, ## __VA_ARGS__); } while (0)
#endif

42
43
44
45
46
typedef struct BlkMigDevState {
    BlockDriverState *bs;
    int bulk_completed;
    int shared_base;
    int64_t cur_sector;
47
    int64_t completed_sectors;
48
49
    int64_t total_sectors;
    int64_t dirty;
50
    QSIMPLEQ_ENTRY(BlkMigDevState) entry;
51
52
} BlkMigDevState;

lirans@il.ibm.com's avatar
lirans@il.ibm.com committed
53
54
55
56
57
58
59
60
typedef struct BlkMigBlock {
    uint8_t *buf;
    BlkMigDevState *bmds;
    int64_t sector;
    struct iovec iov;
    QEMUIOVector qiov;
    BlockDriverAIOCB *aiocb;
    int ret;
61
    QSIMPLEQ_ENTRY(BlkMigBlock) entry;
lirans@il.ibm.com's avatar
lirans@il.ibm.com committed
62
63
64
65
66
} BlkMigBlock;

typedef struct BlkMigState {
    int blk_enable;
    int shared_base;
67
68
    QSIMPLEQ_HEAD(bmds_list, BlkMigDevState) bmds_list;
    QSIMPLEQ_HEAD(blk_list, BlkMigBlock) blk_list;
lirans@il.ibm.com's avatar
lirans@il.ibm.com committed
69
70
71
    int submitted;
    int read_done;
    int transferred;
72
    int64_t total_sector_sum;
lirans@il.ibm.com's avatar
lirans@il.ibm.com committed
73
74
75
    int64_t print_completion;
} BlkMigState;

76
static BlkMigState block_mig_state;
lirans@il.ibm.com's avatar
lirans@il.ibm.com committed
77

78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
/*
 * Write one block record to the migration stream.
 *
 * Record layout: a big-endian 64-bit word holding the sector number shifted
 * by BDRV_SECTOR_BITS with BLK_MIG_FLAG_DEVICE_BLOCK OR'ed in, one byte with
 * the device name length, the device name without terminator, and finally
 * BLOCK_SIZE bytes of data taken from blk->buf.
 */
static void blk_send(QEMUFile *f, BlkMigBlock * blk)
{
    char *dev_name = blk->bmds->bs->device_name;
    int name_len = strlen(dev_name);

    /* header word: sector address plus record type flag */
    qemu_put_be64(f, (blk->sector << BDRV_SECTOR_BITS)
                     | BLK_MIG_FLAG_DEVICE_BLOCK);

    /* length-prefixed device name */
    qemu_put_byte(f, name_len);
    qemu_put_buffer(f, (uint8_t *)dev_name, name_len);

    /* payload */
    qemu_put_buffer(f, blk->buf, BLOCK_SIZE);
}

lirans@il.ibm.com's avatar
lirans@il.ibm.com committed
94
95
96
static void blk_mig_read_cb(void *opaque, int ret)
{
    BlkMigBlock *blk = opaque;
97

lirans@il.ibm.com's avatar
lirans@il.ibm.com committed
98
    blk->ret = ret;
99

100
    QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
101

102
103
104
    block_mig_state.submitted--;
    block_mig_state.read_done++;
    assert(block_mig_state.submitted >= 0);
lirans@il.ibm.com's avatar
lirans@il.ibm.com committed
105
106
}

107
108
/*
 * Handle the next chunk of the bulk phase for one device.
 *
 * mon       monitor used for error reporting
 * f         migration stream
 * bmds      device state; cur_sector and completed_sectors are advanced
 * is_async  non-zero: issue an asynchronous read whose completion
 *           (blk_mig_read_cb) queues the block for flush_blks();
 *           zero: read synchronously and send the block right away
 *
 * Returns 1 once the bulk phase of this device is finished and 0 otherwise.
 * A failed read also returns 0, after flagging the error on f; cur_sector
 * is not advanced in that case.
 */
static int mig_save_device_bulk(Monitor *mon, QEMUFile *f,
                                BlkMigDevState *bmds, int is_async)
{
    int64_t total_sectors = bmds->total_sectors;
    int64_t cur_sector = bmds->cur_sector;
    BlockDriverState *bs = bmds->bs;
    BlkMigBlock *blk;
    int nr_sectors;

    if (bmds->shared_base) {
        /* skip ahead over ranges reported as not allocated */
        while (cur_sector < total_sectors &&
               !bdrv_is_allocated(bs, cur_sector, MAX_IS_ALLOCATED_SEARCH,
                                  &nr_sectors)) {
            cur_sector += nr_sectors;
        }
    }

    if (cur_sector >= total_sectors) {
        bmds->cur_sector = bmds->completed_sectors = total_sectors;
        return 1;
    }

    bmds->completed_sectors = cur_sector;

    /* round down to the start of the enclosing dirty chunk */
    cur_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);

    /* we are going to transfer a full block even if it is not allocated */
    nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (total_sectors - cur_sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
        nr_sectors = total_sectors - cur_sector;
    }

    blk = qemu_malloc(sizeof(BlkMigBlock));
    blk->buf = qemu_malloc(BLOCK_SIZE);
    blk->bmds = bmds;
    blk->sector = cur_sector;

    if (nr_sectors < BDRV_SECTORS_PER_DIRTY_CHUNK) {
        /*
         * Short final chunk: the read fills only part of the buffer, yet
         * blk_send() always transmits BLOCK_SIZE bytes. Zero the buffer so
         * that no uninitialised heap contents end up in the stream.
         */
        memset(blk->buf, 0, BLOCK_SIZE);
    }

    if (is_async) {
        blk->iov.iov_base = blk->buf;
        blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
        qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);

        blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
                                    nr_sectors, blk_mig_read_cb, blk);
        if (!blk->aiocb) {
            goto error;
        }
        block_mig_state.submitted++;
    } else {
        if (bdrv_read(bs, cur_sector, blk->buf, nr_sectors) < 0) {
            goto error;
        }
        blk_send(f, blk);

        qemu_free(blk->buf);
        qemu_free(blk);
    }

    /* the chunk is (being) transferred; later writes will mark it dirty */
    bdrv_reset_dirty(bs, cur_sector, nr_sectors);
    bmds->cur_sector = cur_sector + nr_sectors;

    return (bmds->cur_sector >= total_sectors);

error:
    monitor_printf(mon, "Error reading sector %" PRId64 "\n", cur_sector);
    qemu_file_set_error(f);
    qemu_free(blk->buf);
    qemu_free(blk);
    return 0;
}

static void set_dirty_tracking(int enable)
{
    BlkMigDevState *bmds;
182
183

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
184
        bdrv_set_dirty_tracking(bmds->bs, enable);
lirans@il.ibm.com's avatar
lirans@il.ibm.com committed
185
186
187
    }
}

188
/*
 * Reset the global counters and queue a BlkMigDevState for every block
 * device of type BDRV_TYPE_HD, announcing each one on the monitor.
 */
static void init_blk_migration(Monitor *mon, QEMUFile *f)
{
    BlockDriverState *bs;
    BlkMigDevState *dev;

    block_mig_state.submitted = 0;
    block_mig_state.read_done = 0;
    block_mig_state.transferred = 0;
    block_mig_state.total_sector_sum = 0;
    block_mig_state.print_completion = 0;

    for (bs = bdrv_first; bs != NULL; bs = bs->next) {
        if (bs->type != BDRV_TYPE_HD) {
            /* only hard disks take part in block migration */
            continue;
        }

        dev = qemu_mallocz(sizeof(BlkMigDevState));
        dev->bs = bs;
        dev->bulk_completed = 0;
        dev->total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
        dev->completed_sectors = 0;
        dev->shared_base = block_mig_state.shared_base;

        block_mig_state.total_sector_sum += dev->total_sectors;

        if (!dev->shared_base) {
            monitor_printf(mon, "Start full migration for %s\n",
                           bs->device_name);
        } else {
            monitor_printf(mon, "Start migration for %s with shared base "
                                "image\n",
                           bs->device_name);
        }

        QSIMPLEQ_INSERT_TAIL(&block_mig_state.bmds_list, dev, entry);
    }
}

224
/*
 * Transfer one bulk chunk of the first device whose bulk phase is still
 * running and print a progress line from time to time.
 *
 * Returns 1 if a device with an unfinished bulk phase was found (and
 * mig_save_device_bulk() was called for it), 0 if every device has
 * completed its bulk phase.
 */
static int blk_mig_save_bulked_block(Monitor *mon, QEMUFile *f, int is_async)
{
    int64_t completed_sector_sum = 0;
    BlkMigDevState *bmds;
    int ret = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (bmds->bulk_completed == 0) {
            if (mig_save_device_bulk(mon, f, bmds, is_async) == 1) {
                /* completed bulk section for this device */
                bmds->bulk_completed = 1;
            }
            completed_sector_sum += bmds->completed_sectors;
            ret = 1;
            break;
        }
        completed_sector_sum += bmds->completed_sectors;
    }

    /*
     * total_sector_sum is 0 when no device was queued (or all are empty);
     * skip the report then instead of dividing by zero.
     */
    if (block_mig_state.total_sector_sum > 0 &&
        completed_sector_sum >= block_mig_state.print_completion) {
        monitor_printf(mon, "Completed %" PRId64 " %%\r",
                       completed_sector_sum * 100 /
                       block_mig_state.total_sector_sum);
        monitor_flush(mon);

        /* next report after another 10000 chunks worth of sectors */
        block_mig_state.print_completion +=
            (BDRV_SECTORS_PER_DIRTY_CHUNK * 10000);
    }

    return ret;
}

#define MAX_NUM_BLOCKS 4

258
/*
 * Synchronously send every chunk that is marked dirty within the range the
 * bulk phase has already covered (sectors below bmds->cur_sector) on each
 * device, clearing the dirty state of each chunk after it has been sent.
 *
 * On a read error the error is reported on the monitor, flagged on f, and
 * the function returns without processing the remaining chunks.
 */
static void blk_mig_save_dirty_blocks(Monitor *mon, QEMUFile *f)
{
    BlkMigDevState *bmds;
    BlkMigBlock blk;
    int64_t sector;
    int nr_sectors;

    blk.buf = qemu_malloc(BLOCK_SIZE);

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        for (sector = 0; sector < bmds->cur_sector;) {
            if (bdrv_get_dirty(bmds->bs, sector)) {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
                if (bmds->total_sectors - sector <
                    BDRV_SECTORS_PER_DIRTY_CHUNK) {
                    /*
                     * Short final chunk: do not read past the end of the
                     * device, and zero the buffer because blk_send() still
                     * transmits BLOCK_SIZE bytes.
                     */
                    nr_sectors = bmds->total_sectors - sector;
                    memset(blk.buf, 0, BLOCK_SIZE);
                }

                if (bdrv_read(bmds->bs, sector, blk.buf, nr_sectors) < 0) {
                    monitor_printf(mon, "Error reading sector %" PRId64 "\n",
                                   sector);
                    qemu_file_set_error(f);
                    qemu_free(blk.buf);
                    return;
                }
                blk.bmds = bmds;
                blk.sector = sector;
                blk_send(f, &blk);

                bdrv_reset_dirty(bmds->bs, sector, nr_sectors);
            }
            sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
        }
    }

    qemu_free(blk.buf);
}

/*
 * Send the blocks queued on block_mig_state.blk_list, in order, until the
 * list is empty, the stream's rate limit is reached, or a block with a
 * failed read is found. In the last case the error is flagged on f and the
 * failing block stays at the head of the list.
 */
static void flush_blks(QEMUFile* f)
{
    BlkMigBlock *head;

    dprintf("%s Enter submitted %d read_done %d transferred %d\n",
            __FUNCTION__, block_mig_state.submitted, block_mig_state.read_done,
            block_mig_state.transferred);

    for (;;) {
        head = QSIMPLEQ_FIRST(&block_mig_state.blk_list);
        if (head == NULL || qemu_file_rate_limit(f)) {
            break;
        }
        if (head->ret < 0) {
            qemu_file_set_error(f);
            break;
        }

        blk_send(f, head);

        /* the block is on the wire; drop it from the queue and release it */
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        qemu_free(head->buf);
        qemu_free(head);

        block_mig_state.transferred++;
        block_mig_state.read_done--;
        assert(block_mig_state.read_done >= 0);
    }

    dprintf("%s Exit submitted %d read_done %d transferred %d\n", __FUNCTION__,
            block_mig_state.submitted, block_mig_state.read_done,
            block_mig_state.transferred);
}

static int is_stage2_completed(void)
{
    BlkMigDevState *bmds;
326

327
    if (block_mig_state.submitted > 0) {
lirans@il.ibm.com's avatar
lirans@il.ibm.com committed
328
329
        return 0;
    }
330

331
    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
332
        if (bmds->bulk_completed == 0) {
lirans@il.ibm.com's avatar
lirans@il.ibm.com committed
333
334
335
            return 0;
        }
    }
336

lirans@il.ibm.com's avatar
lirans@il.ibm.com committed
337
338
339
    return 1;
}

340
/*
 * Tear down the migration state: disable dirty tracking on all queued
 * devices, free the device list and any blocks still waiting on blk_list,
 * and terminate the progress line on the monitor.
 */
static void blk_mig_cleanup(Monitor *mon)
{
    BlkMigDevState *bmds;
    BlkMigBlock *blk;

    /*
     * set_dirty_tracking() walks bmds_list, so it has to run before the
     * list is emptied below; afterwards it would not reach any device and
     * dirty tracking would stay enabled.
     */
    set_dirty_tracking(0);

    while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry);
        qemu_free(bmds);
    }

    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        qemu_free(blk->buf);
        qemu_free(blk);
    }

    monitor_printf(mon, "\n");
}

361
/*
 * Live-save handler registered through register_savevm_live().
 *
 * stage < 0   cancel: release all migration state
 * stage == 1  set up the device list and start dirty tracking
 * stage == 2  iterate: issue asynchronous bulk reads within the rate limit
 * stage == 3  final: finish the bulk phase synchronously, send the dirty
 *             chunks and clean up
 *
 * Returns 1 when block migration is disabled, or in stage 2 once
 * is_stage2_completed() reports completion; 0 in every other case,
 * including errors (which are flagged on f).
 */
static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
{
    dprintf("Enter save live stage %d submitted %d transferred %d\n",
            stage, block_mig_state.submitted, block_mig_state.transferred);

    if (stage < 0) {
        blk_mig_cleanup(mon);
        return 0;
    }

    if (block_mig_state.blk_enable != 1) {
        /* no need to migrate storage */
        qemu_put_be64(f, BLK_MIG_FLAG_EOS);
        return 1;
    }

    if (stage == 1) {
        init_blk_migration(mon, f);

        /* start track dirty blocks */
        set_dirty_tracking(1);
    }

    flush_blks(f);

    if (qemu_file_has_error(f)) {
        blk_mig_cleanup(mon);
        return 0;
    }

    /* control the rate of transfer */
    while ((block_mig_state.submitted +
            block_mig_state.read_done) * BLOCK_SIZE <
           qemu_file_get_rate_limit(f)) {
        if (blk_mig_save_bulked_block(mon, f, 1) == 0) {
            /* no more bulk blocks for now */
            break;
        }
        if (qemu_file_has_error(f)) {
            /*
             * A failed read does not advance the device position, so
             * continuing would retry the same sector over and over.
             */
            break;
        }
    }

    flush_blks(f);

    if (qemu_file_has_error(f)) {
        blk_mig_cleanup(mon);
        return 0;
    }

    if (stage == 3) {
        /*
         * NOTE(review): reads still counted in block_mig_state.submitted
         * are not waited for before the cleanup below - confirm that none
         * can be in flight at this point.
         */
        while (!qemu_file_has_error(f) &&
               blk_mig_save_bulked_block(mon, f, 0) != 0) {
            /* empty */
        }

        if (!qemu_file_has_error(f)) {
            blk_mig_save_dirty_blocks(mon, f);
        }
        blk_mig_cleanup(mon);

        if (qemu_file_has_error(f)) {
            return 0;
        }

        monitor_printf(mon, "Block migration completed\n");
    }

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    return ((stage == 2) && is_stage2_completed());
}

/*
 * Load handler: read block records from the stream and write them to the
 * named block devices until an end-of-section record is seen.
 *
 * Returns 0 on success, -EINVAL for an unknown device, an unusable device
 * length, a block outside the device or unknown flags, -EIO when the stream
 * reports an error, or the negative result of a failed bdrv_write().
 */
static int block_load(QEMUFile *f, void *opaque, int version_id)
{
    int len, flags;
    char device_name[256];
    int64_t addr;
    int64_t total_sectors;
    int nr_sectors;
    int ret;
    BlockDriverState *bs;
    uint8_t *buf;

    do {
        addr = qemu_get_be64(f);

        /* the flag bits sit below the sector address */
        flags = addr & ~BDRV_SECTOR_MASK;
        addr >>= BDRV_SECTOR_BITS;

        if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) {
            /* get device name; len is a single byte, so it fits the buffer */
            len = qemu_get_byte(f);
            qemu_get_buffer(f, (uint8_t *)device_name, len);
            device_name[len] = '\0';

            bs = bdrv_find(device_name);
            if (!bs) {
                fprintf(stderr, "Error unknown block device %s\n",
                        device_name);
                return -EINVAL;
            }

            total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
            if (total_sectors <= 0) {
                fprintf(stderr, "Error getting length of block device %s\n",
                        device_name);
                return -EINVAL;
            }
            if (addr < 0 || addr >= total_sectors) {
                fprintf(stderr, "Error block outside of block device %s\n",
                        device_name);
                return -EINVAL;
            }

            /* the sender pads the last block; write only what fits */
            nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            if (total_sectors - addr < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - addr;
            }

            buf = qemu_malloc(BLOCK_SIZE);

            qemu_get_buffer(f, buf, BLOCK_SIZE);
            if (qemu_file_has_error(f)) {
                /* do not write data from a broken stream to the device */
                qemu_free(buf);
                return -EIO;
            }
            ret = bdrv_write(bs, addr, buf, nr_sectors);

            qemu_free(buf);
            if (ret < 0) {
                fprintf(stderr, "Error writing block device %s\n",
                        device_name);
                return ret;
            }
        } else if (!(flags & BLK_MIG_FLAG_EOS)) {
            fprintf(stderr, "Unknown flags\n");
            return -EINVAL;
        }
        if (qemu_file_has_error(f)) {
            return -EIO;
        }
    } while (!(flags & BLK_MIG_FLAG_EOS));

    return 0;
}

/*
 * Parameter hook registered through register_savevm_live(): records whether
 * block migration is requested and whether a shared base image is used.
 */
static void block_set_params(int blk_enable, int shared_base, void *opaque)
{
    block_mig_state.shared_base = shared_base;

    /* shared base means that blk_enable = 1 */
    block_mig_state.blk_enable = blk_enable | shared_base;
}

void blk_mig_init(void)
483
{
484
485
486
    QSIMPLEQ_INIT(&block_mig_state.bmds_list);
    QSIMPLEQ_INIT(&block_mig_state.blk_list);

487
    register_savevm_live("block", 0, 1, block_set_params, block_save_live,
488
                         NULL, block_load, &block_mig_state);
lirans@il.ibm.com's avatar
lirans@il.ibm.com committed
489
}