/*
 * QEMU live block migration
 *
 * Copyright IBM, Corp. 2009
 *
 * Authors:
 *  Liran Schour   <lirans@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include "qemu-common.h"
#include "block_int.h"
#include "hw/hw.h"
#include "qemu-queue.h"
#include "qemu-timer.h"
#include "monitor.h"
#include "block-migration.h"
#include "migration.h"
#include <assert.h>

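/* One transfer unit: a whole dirty-bitmap chunk, expressed in bytes. */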
#define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS)

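/* Record types multiplexed into the migration stream; each record starts
   with a be64 whose low bits carry one of these flags (see blk_send() and
   block_load()). */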
#define BLK_MIG_FLAG_DEVICE_BLOCK       0x01
#define BLK_MIG_FLAG_EOS                0x02
#define BLK_MIG_FLAG_PROGRESS           0x04

#define MAX_IS_ALLOCATED_SEARCH 65536

//#define DEBUG_BLK_MIGRATION

#ifdef DEBUG_BLK_MIGRATION
#define DPRINTF(fmt, ...) \
    do { printf("blk_migration: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

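/* Per-device migration state; one entry on bmds_list for every writable
   block device found at setup time. */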
typedef struct BlkMigDevState {
    BlockDriverState *bs;
    int bulk_completed;
    int shared_base;
    int64_t cur_sector;
    int64_t cur_dirty;
    int64_t completed_sectors;
    int64_t total_sectors;
    int64_t dirty;
    QSIMPLEQ_ENTRY(BlkMigDevState) entry;
    unsigned long *aio_bitmap;
} BlkMigDevState;

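/* One chunk in flight: the read buffer plus the bookkeeping needed when
   the asynchronous read completes. */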
typedef struct BlkMigBlock {
    uint8_t *buf;
    BlkMigDevState *bmds;
    int64_t sector;
    int nr_sectors;
    struct iovec iov;
    QEMUIOVector qiov;
    BlockDriverAIOCB *aiocb;
    int ret;
    int64_t time;
    QSIMPLEQ_ENTRY(BlkMigBlock) entry;
} BlkMigBlock;

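/* Global migration state: the device and completed-read queues, plus the
   counters used for rate limiting, progress reporting, and read-bandwidth
   accounting. */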
typedef struct BlkMigState {
    int blk_enable;
    int shared_base;
    QSIMPLEQ_HEAD(bmds_list, BlkMigDevState) bmds_list;
    QSIMPLEQ_HEAD(blk_list, BlkMigBlock) blk_list;
    int submitted;
    int read_done;
    int transferred;
    int64_t total_sector_sum;
    int prev_progress;
    int bulk_completed;
    long double total_time;
    int reads;
} BlkMigState;

static BlkMigState block_mig_state;

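/* Write one chunk to the stream: a be64 carrying the byte offset and the
   DEVICE_BLOCK flag, the length-prefixed device name, then BLOCK_SIZE
   bytes of data. */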
static void blk_send(QEMUFile *f, BlkMigBlock * blk)
{
    int len;

    /* sector number and flags */
    qemu_put_be64(f, (blk->sector << BDRV_SECTOR_BITS)
                     | BLK_MIG_FLAG_DEVICE_BLOCK);

    /* device name */
    len = strlen(blk->bmds->bs->device_name);
    qemu_put_byte(f, len);
    qemu_put_buffer(f, (uint8_t *)blk->bmds->bs->device_name, len);

    qemu_put_buffer(f, blk->buf, BLOCK_SIZE);
}

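/* Progress accounting exported to the migration core, in bytes. */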
int blk_mig_active(void)
{
    return !QSIMPLEQ_EMPTY(&block_mig_state.bmds_list);
}

uint64_t blk_mig_bytes_transferred(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->completed_sectors;
    }
    return sum << BDRV_SECTOR_BITS;
}

uint64_t blk_mig_bytes_remaining(void)
{
    return blk_mig_bytes_total() - blk_mig_bytes_transferred();
}

uint64_t blk_mig_bytes_total(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->total_sectors;
    }
    return sum << BDRV_SECTOR_BITS;
}

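/* Read-bandwidth estimate: total_time accumulates per-read latencies in
   nanoseconds, so the quotient is bytes per nanosecond. */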
static inline void add_avg_read_time(int64_t time)
{
    block_mig_state.reads++;
    block_mig_state.total_time += time;
}

static inline long double compute_read_bwidth(void)
{
    assert(block_mig_state.total_time != 0);
    return (block_mig_state.reads * BLOCK_SIZE) / block_mig_state.total_time;
}

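/* aio_bitmap has one bit per chunk, set while an asynchronous read of
   that chunk is still in flight. */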
static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector)
{
    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;

    if ((sector << BDRV_SECTOR_BITS) < bdrv_getlength(bmds->bs)) {
        return !!(bmds->aio_bitmap[chunk / (sizeof(unsigned long) * 8)] &
            (1UL << (chunk % (sizeof(unsigned long) * 8))));
    } else {
        return 0;
    }
}

static void bmds_set_aio_inflight(BlkMigDevState *bmds, int64_t sector_num,
                             int nb_sectors, int set)
{
    int64_t start, end;
    unsigned long val, idx, bit;

    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;

    for (; start <= end; start++) {
        idx = start / (sizeof(unsigned long) * 8);
        bit = start % (sizeof(unsigned long) * 8);
        val = bmds->aio_bitmap[idx];
        if (set) {
            val |= 1UL << bit;
        } else {
            val &= ~(1UL << bit);
        }
        bmds->aio_bitmap[idx] = val;
    }
}

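/* Allocate the in-flight bitmap: one bit per chunk, rounded up to whole
   bytes. */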
static void alloc_aio_bitmap(BlkMigDevState *bmds)
{
    BlockDriverState *bs = bmds->bs;
    int64_t bitmap_size;

    bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
            BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
    bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;

    bmds->aio_bitmap = qemu_mallocz(bitmap_size);
}

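/* Completion callback for the bulk and dirty-phase reads: record the read
   latency, queue the block for flush_blks(), and clear its in-flight bit. */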
static void blk_mig_read_cb(void *opaque, int ret)
{
    BlkMigBlock *blk = opaque;

    blk->ret = ret;

    blk->time = qemu_get_clock_ns(rt_clock) - blk->time;

    add_avg_read_time(blk->time);

    QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
    bmds_set_aio_inflight(blk->bmds, blk->sector, blk->nr_sectors, 0);

    block_mig_state.submitted--;
    block_mig_state.read_done++;
    assert(block_mig_state.submitted >= 0);
}

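/* Issue one asynchronous bulk read of up to BDRV_SECTORS_PER_DIRTY_CHUNK
   sectors, skipping unallocated extents when a shared base image is in
   use.  Returns 1 once this device's bulk phase is complete. */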
static int mig_save_device_bulk(Monitor *mon, QEMUFile *f,
                                BlkMigDevState *bmds)
{
    int64_t total_sectors = bmds->total_sectors;
    int64_t cur_sector = bmds->cur_sector;
    BlockDriverState *bs = bmds->bs;
    BlkMigBlock *blk;
    int nr_sectors;

    if (bmds->shared_base) {
        while (cur_sector < total_sectors &&
               !bdrv_is_allocated(bs, cur_sector, MAX_IS_ALLOCATED_SEARCH,
                                  &nr_sectors)) {
            cur_sector += nr_sectors;
        }
    }

    if (cur_sector >= total_sectors) {
        bmds->cur_sector = bmds->completed_sectors = total_sectors;
        return 1;
    }

    bmds->completed_sectors = cur_sector;

    cur_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);

    /* we are going to transfer a full block even if it is not allocated */
    nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (total_sectors - cur_sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
        nr_sectors = total_sectors - cur_sector;
    }

    blk = qemu_malloc(sizeof(BlkMigBlock));
    blk->buf = qemu_malloc(BLOCK_SIZE);
    blk->bmds = bmds;
    blk->sector = cur_sector;
    blk->nr_sectors = nr_sectors;

    blk->iov.iov_base = blk->buf;
    blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
    qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);

    blk->time = qemu_get_clock_ns(rt_clock);

    blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
                                nr_sectors, blk_mig_read_cb, blk);
    if (!blk->aiocb) {
        goto error;
    }
    block_mig_state.submitted++;

    bdrv_reset_dirty(bs, cur_sector, nr_sectors);
    bmds->cur_sector = cur_sector + nr_sectors;

    return (bmds->cur_sector >= total_sectors);

error:
    monitor_printf(mon, "Error reading sector %" PRId64 "\n", cur_sector);
    qemu_file_set_error(f);
    qemu_free(blk->buf);
    qemu_free(blk);
    return 0;
}

static void set_dirty_tracking(int enable)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bdrv_set_dirty_tracking(bmds->bs, enable);
    }
}

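/* bdrv_iterate() callback: register every writable block device with a
   positive length for migration. */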
static void init_blk_migration_it(void *opaque, BlockDriverState *bs)
{
    Monitor *mon = opaque;
    BlkMigDevState *bmds;
    int64_t sectors;

    if (!bdrv_is_read_only(bs)) {
        sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
        if (sectors <= 0) {
            return;
        }

        bmds = qemu_mallocz(sizeof(BlkMigDevState));
        bmds->bs = bs;
        bmds->bulk_completed = 0;
        bmds->total_sectors = sectors;
        bmds->completed_sectors = 0;
        bmds->shared_base = block_mig_state.shared_base;
        alloc_aio_bitmap(bmds);

        block_mig_state.total_sector_sum += sectors;

        if (bmds->shared_base) {
            monitor_printf(mon, "Start migration for %s with shared base "
                                "image\n",
                           bs->device_name);
        } else {
            monitor_printf(mon, "Start full migration for %s\n",
                           bs->device_name);
        }

        QSIMPLEQ_INSERT_TAIL(&block_mig_state.bmds_list, bmds, entry);
    }
}

static void init_blk_migration(Monitor *mon, QEMUFile *f)
{
    block_mig_state.submitted = 0;
    block_mig_state.read_done = 0;
    block_mig_state.transferred = 0;
    block_mig_state.total_sector_sum = 0;
    block_mig_state.prev_progress = -1;
    block_mig_state.bulk_completed = 0;
    block_mig_state.total_time = 0;
    block_mig_state.reads = 0;

    bdrv_iterate(init_blk_migration_it, mon);
}

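/* Advance the bulk phase on the first device that still has work, and
   emit a PROGRESS record whenever the overall percentage changes.
   Returns 0 once every device has finished its bulk transfer. */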
static int blk_mig_save_bulked_block(Monitor *mon, QEMUFile *f)
{
    int64_t completed_sector_sum = 0;
    BlkMigDevState *bmds;
    int progress;
    int ret = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (bmds->bulk_completed == 0) {
            if (mig_save_device_bulk(mon, f, bmds) == 1) {
                /* completed bulk section for this device */
                bmds->bulk_completed = 1;
            }
            completed_sector_sum += bmds->completed_sectors;
            ret = 1;
            break;
        } else {
            completed_sector_sum += bmds->completed_sectors;
        }
    }

    if (block_mig_state.total_sector_sum != 0) {
        progress = completed_sector_sum * 100 /
                   block_mig_state.total_sector_sum;
    } else {
        progress = 100;
    }

    if (progress != block_mig_state.prev_progress) {
        block_mig_state.prev_progress = progress;
        qemu_put_be64(f, (progress << BDRV_SECTOR_BITS)
                         | BLK_MIG_FLAG_PROGRESS);
        monitor_printf(mon, "Completed %d %%\r", progress);
        monitor_flush(mon);
    }

    return ret;
}

static void blk_mig_reset_dirty_cursor(void)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bmds->cur_dirty = 0;
    }
}

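/* Send the next dirty chunk of one device: asynchronously during stage 2,
   or with a blocking bdrv_read() during stage 3.  A chunk with a read
   still in flight is synchronized with qemu_aio_flush() first.  Returns 1
   when the dirty cursor has swept past the end of the device. */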
static int mig_save_device_dirty(Monitor *mon, QEMUFile *f,
                                 BlkMigDevState *bmds, int is_async)
{
    BlkMigBlock *blk;
    int64_t total_sectors = bmds->total_sectors;
    int64_t sector;
    int nr_sectors;

    for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) {
        if (bmds_aio_inflight(bmds, sector)) {
            qemu_aio_flush();
        }
        if (bdrv_get_dirty(bmds->bs, sector)) {

            if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - sector;
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }
            blk = qemu_malloc(sizeof(BlkMigBlock));
            blk->buf = qemu_malloc(BLOCK_SIZE);
            blk->bmds = bmds;
            blk->sector = sector;
            blk->nr_sectors = nr_sectors;

            if (is_async) {
                blk->iov.iov_base = blk->buf;
                blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
                qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);

                blk->time = qemu_get_clock_ns(rt_clock);

                blk->aiocb = bdrv_aio_readv(bmds->bs, sector, &blk->qiov,
                                            nr_sectors, blk_mig_read_cb, blk);
                if (!blk->aiocb) {
                    goto error;
                }
                block_mig_state.submitted++;
                bmds_set_aio_inflight(bmds, sector, nr_sectors, 1);
            } else {
                if (bdrv_read(bmds->bs, sector, blk->buf,
                              nr_sectors) < 0) {
                    goto error;
                }
                blk_send(f, blk);

                qemu_free(blk->buf);
                qemu_free(blk);
            }

            bdrv_reset_dirty(bmds->bs, sector, nr_sectors);
            break;
        }
        sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
        bmds->cur_dirty = sector;
    }

    return (bmds->cur_dirty >= bmds->total_sectors);

error:
    monitor_printf(mon, "Error reading sector %" PRId64 "\n", sector);
    qemu_file_set_error(f);
    qemu_free(blk->buf);
    qemu_free(blk);
    return 0;
}

static int blk_mig_save_dirty_block(Monitor *mon, QEMUFile *f, int is_async)
{
    BlkMigDevState *bmds;
    int ret = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (mig_save_device_dirty(mon, f, bmds, is_async) == 0) {
            ret = 1;
            break;
        }
    }

    return ret;
}

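/* Drain completed reads from blk_list into the stream, stopping at the
   rate limit or on the first failed read. */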
static void flush_blks(QEMUFile* f)
{
    BlkMigBlock *blk;

    DPRINTF("%s Enter submitted %d read_done %d transferred %d\n",
            __FUNCTION__, block_mig_state.submitted, block_mig_state.read_done,
            block_mig_state.transferred);

    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        if (qemu_file_rate_limit(f)) {
            break;
        }
        if (blk->ret < 0) {
            qemu_file_set_error(f);
            break;
        }
        blk_send(f, blk);

        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        qemu_free(blk->buf);
        qemu_free(blk);

        block_mig_state.read_done--;
        block_mig_state.transferred++;
        assert(block_mig_state.read_done >= 0);
    }

    DPRINTF("%s Exit submitted %d read_done %d transferred %d\n", __FUNCTION__,
            block_mig_state.submitted, block_mig_state.read_done,
            block_mig_state.transferred);
}

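/* Bytes still marked dirty across all migrated devices. */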
static int64_t get_remaining_dirty(void)
{
    BlkMigDevState *bmds;
    int64_t dirty = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        dirty += bdrv_get_dirty_count(bmds->bs);
    }

    return dirty * BLOCK_SIZE;
}

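/* Stage-2 exit heuristic: once bulk transfer is done, stop iterating as
   soon as the remaining dirty bytes divided by the measured read
   bandwidth fits within migrate_max_downtime() (both sides of the
   comparison are in nanoseconds, since read times come from
   qemu_get_clock_ns()). */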
static int is_stage2_completed(void)
{
    int64_t remaining_dirty;
    long double bwidth;

    if (block_mig_state.bulk_completed == 1) {

        remaining_dirty = get_remaining_dirty();
        if (remaining_dirty == 0) {
            return 1;
        }

        bwidth = compute_read_bwidth();

        if ((remaining_dirty / bwidth) <=
            migrate_max_downtime()) {
            /* finish stage2 because we think that we can finish remaining
               work below max_downtime */

            return 1;
        }
    }

    return 0;
}

static void blk_mig_cleanup(Monitor *mon)
{
    BlkMigDevState *bmds;
    BlkMigBlock *blk;

    set_dirty_tracking(0);

    while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry);
        qemu_free(bmds->aio_bitmap);
        qemu_free(bmds);
    }

    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        qemu_free(blk->buf);
        qemu_free(blk);
    }

    monitor_printf(mon, "\n");
}

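/* Live-savevm handler.  Stage 1 registers the devices and enables dirty
   tracking; stage 2 interleaves bulk and dirty chunks under the bandwidth
   cap; stage 3 runs with the guest stopped and synchronously flushes
   whatever is still dirty; a negative stage aborts and cleans up. */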
static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
{
    DPRINTF("Enter save live stage %d submitted %d transferred %d\n",
            stage, block_mig_state.submitted, block_mig_state.transferred);

    if (stage < 0) {
        blk_mig_cleanup(mon);
        return 0;
    }

    if (block_mig_state.blk_enable != 1) {
        /* no need to migrate storage */
        qemu_put_be64(f, BLK_MIG_FLAG_EOS);
        return 1;
    }

    if (stage == 1) {
        init_blk_migration(mon, f);

        /* start tracking dirty blocks */
        set_dirty_tracking(1);
    }

    flush_blks(f);

    if (qemu_file_has_error(f)) {
        blk_mig_cleanup(mon);
        return 0;
    }

    blk_mig_reset_dirty_cursor();

    if (stage == 2) {
        /* control the rate of transfer */
        while ((block_mig_state.submitted +
                block_mig_state.read_done) * BLOCK_SIZE <
               qemu_file_get_rate_limit(f)) {
            if (block_mig_state.bulk_completed == 0) {
                /* first finish the bulk phase */
                if (blk_mig_save_bulked_block(mon, f) == 0) {
                    /* finished saving bulk on all devices */
                    block_mig_state.bulk_completed = 1;
                }
            } else {
                if (blk_mig_save_dirty_block(mon, f, 1) == 0) {
                    /* no more dirty blocks */
                    break;
                }
            }
        }

        flush_blks(f);

        if (qemu_file_has_error(f)) {
            blk_mig_cleanup(mon);
            return 0;
        }
    }

    if (stage == 3) {
        /* we know for sure that save bulk is completed and
           all async read completed */
        assert(block_mig_state.submitted == 0);

        while (blk_mig_save_dirty_block(mon, f, 0) != 0);
        blk_mig_cleanup(mon);

        /* report completion */
        qemu_put_be64(f, (100 << BDRV_SECTOR_BITS) | BLK_MIG_FLAG_PROGRESS);

        if (qemu_file_has_error(f)) {
            return 0;
        }

        monitor_printf(mon, "Block migration completed\n");
    }

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    return ((stage == 2) && is_stage2_completed());
}

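/* Destination side: consume records until BLK_MIG_FLAG_EOS.  Each
   DEVICE_BLOCK record carries the target sector (packed with the flags
   in one be64), the device name, and one BLOCK_SIZE payload. */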
static int block_load(QEMUFile *f, void *opaque, int version_id)
{
    static int banner_printed;
    int len, flags;
    char device_name[256];
    int64_t addr;
    BlockDriverState *bs, *bs_prev = NULL;
    uint8_t *buf;
    int64_t total_sectors = 0;
    int nr_sectors;

    do {
        addr = qemu_get_be64(f);

        flags = addr & ~BDRV_SECTOR_MASK;
        addr >>= BDRV_SECTOR_BITS;

        if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) {
            int ret;
            /* get device name */
            len = qemu_get_byte(f);
            qemu_get_buffer(f, (uint8_t *)device_name, len);
            device_name[len] = '\0';

            bs = bdrv_find(device_name);
            if (!bs) {
                fprintf(stderr, "Error unknown block device %s\n",
                        device_name);
                return -EINVAL;
            }

            if (bs != bs_prev) {
                bs_prev = bs;
                total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
                if (total_sectors <= 0) {
                    error_report("Error getting length of block device %s\n",
                                 device_name);
                    return -EINVAL;
                }
            }

            if (total_sectors - addr < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - addr;
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }

            buf = qemu_malloc(BLOCK_SIZE);

            qemu_get_buffer(f, buf, BLOCK_SIZE);
            ret = bdrv_write(bs, addr, buf, nr_sectors);

            qemu_free(buf);
            if (ret < 0) {
                return ret;
            }
        } else if (flags & BLK_MIG_FLAG_PROGRESS) {
            if (!banner_printed) {
                printf("Receiving block device images\n");
                banner_printed = 1;
            }
            printf("Completed %d %%%c", (int)addr,
                   (addr == 100) ? '\n' : '\r');
            fflush(stdout);
        } else if (!(flags & BLK_MIG_FLAG_EOS)) {
            fprintf(stderr, "Unknown flags\n");
            return -EINVAL;
        }
        if (qemu_file_has_error(f)) {
            return -EIO;
        }
    } while (!(flags & BLK_MIG_FLAG_EOS));

    return 0;
}

static void block_set_params(int blk_enable, int shared_base, void *opaque)
{
    block_mig_state.blk_enable = blk_enable;
    block_mig_state.shared_base = shared_base;

    /* shared base means that blk_enable = 1 */
    block_mig_state.blk_enable |= shared_base;
}

void blk_mig_init(void)
{
    QSIMPLEQ_INIT(&block_mig_state.bmds_list);
    QSIMPLEQ_INIT(&block_mig_state.blk_list);

    register_savevm_live(NULL, "block", 0, 1, block_set_params,
                         block_save_live, NULL, block_load, &block_mig_state);
}