Commit e4ed1541 authored by Juan Quintela

savevm: New save live migration method: pending

The code currently does (simplified for clarity):

    if (qemu_savevm_state_iterate(s->file) == 1) {
       vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
       qemu_savevm_state_complete(s->file);
    }

The problem here is that qemu_savevm_state_iterate() returns 1 when it
knows that the remaining memory to send takes less than max downtime.

But this means that we could end up spending 2x max_downtime: one
downtime in qemu_savevm_state_iterate(), and the other in
qemu_savevm_state_complete().

Changed code to:

    pending_size = qemu_savevm_state_pending(s->file, max_size);
    DPRINTF("pending size %lu max %lu\n", pending_size, max_size);
    if (pending_size >= max_size) {
        ret = qemu_savevm_state_iterate(s->file);
     } else {
        vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
        qemu_savevm_state_complete(s->file);
     }

So what we do is: at the current network speed, we calculate the maximum
number of bytes we can send within the maximum downtime: max_size.

Then we ask every save_live section how much data it has pending.  If
the total is less than max_size, we move to the complete phase; otherwise
we do another iterate pass.

This makes things much simpler, because now individual sections don't
have to calculate the bandwidth (it was impossible to do right from
there).
Signed-off-by: Juan Quintela <quintela@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
parent f50b4986
......@@ -582,12 +582,9 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
uint64_t bytes_transferred_last;
double bwidth = 0;
int ret;
int i;
uint64_t expected_downtime;
MigrationState *s = migrate_get_current();
int64_t t0;
qemu_mutex_lock_ramlist();
......@@ -595,9 +592,7 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
reset_ram_globals();
}
bytes_transferred_last = bytes_transferred;
bwidth = qemu_get_clock_ns(rt_clock);
t0 = qemu_get_clock_ns(rt_clock);
i = 0;
while ((ret = qemu_file_rate_limit(f)) == 0) {
int bytes_sent;
......@@ -615,7 +610,7 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
iterations
*/
if ((i & 63) == 0) {
uint64_t t1 = (qemu_get_clock_ns(rt_clock) - bwidth) / 1000000;
uint64_t t1 = (qemu_get_clock_ns(rt_clock) - t0) / 1000000;
if (t1 > MAX_WAIT) {
DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n",
t1, i);
......@@ -629,31 +624,10 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
return ret;
}
bwidth = qemu_get_clock_ns(rt_clock) - bwidth;
bwidth = (bytes_transferred - bytes_transferred_last) / bwidth;
/* if we haven't transferred anything this round, force
* expected_downtime to a very high value, but without
* crashing */
if (bwidth == 0) {
bwidth = 0.000001;
}
qemu_mutex_unlock_ramlist();
qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
expected_downtime = ram_save_remaining() * TARGET_PAGE_SIZE / bwidth;
DPRINTF("ram_save_live: expected(%" PRIu64 ") <= max(" PRIu64 ")?\n",
expected_downtime, migrate_max_downtime());
if (expected_downtime <= migrate_max_downtime()) {
migration_bitmap_sync();
expected_downtime = ram_save_remaining() * TARGET_PAGE_SIZE / bwidth;
s->expected_downtime = expected_downtime / 1000000; /* ns -> ms */
return expected_downtime <= migrate_max_downtime();
}
return 0;
return i;
}
static int ram_save_complete(QEMUFile *f, void *opaque)
......@@ -683,6 +657,19 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
return 0;
}
/*
 * Report how much RAM is still waiting to be sent.
 *
 * The amount is ram_save_remaining() scaled by TARGET_PAGE_SIZE.  When
 * that first figure is already below @max_size, migration_bitmap_sync()
 * is called and the figure is computed again, so the value returned is
 * the one obtained after the sync.
 *
 * @f and @opaque are not used here.
 */
static uint64_t ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
{
    uint64_t pending = ram_save_remaining() * TARGET_PAGE_SIZE;

    if (pending >= max_size) {
        /* Still too much left: no resync, report the current figure. */
        return pending;
    }

    /* Below the threshold: resync and re-measure before answering. */
    migration_bitmap_sync();
    pending = ram_save_remaining() * TARGET_PAGE_SIZE;
    return pending;
}
static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
{
int ret, rc = 0;
......@@ -869,6 +856,7 @@ SaveVMHandlers savevm_ram_handlers = {
.save_live_setup = ram_save_setup,
.save_live_iterate = ram_save_iterate,
.save_live_complete = ram_save_complete,
.save_live_pending = ram_save_pending,
.load_state = ram_load,
.cancel = ram_migration_cancel,
};
......
......@@ -77,9 +77,7 @@ typedef struct BlkMigState {
int64_t total_sector_sum;
int prev_progress;
int bulk_completed;
long double total_time;
long double prev_time_offset;
int reads;
} BlkMigState;
static BlkMigState block_mig_state;
......@@ -132,12 +130,6 @@ uint64_t blk_mig_bytes_total(void)
return sum << BDRV_SECTOR_BITS;
}
/*
 * Average read rate: completed reads divided by the accumulated
 * block_mig_state.total_time, scaled by BLOCK_SIZE.
 *
 * total_time must be non-zero (asserted).
 * NOTE(review): the time unit of total_time is whatever blk_mig_read_cb()
 * accumulates -- confirm before comparing the result against other rates.
 */
static inline long double compute_read_bwidth(void)
{
    long double reads_per_time_unit;

    assert(block_mig_state.total_time != 0);
    reads_per_time_unit = block_mig_state.reads / block_mig_state.total_time;
    return reads_per_time_unit * BLOCK_SIZE;
}
static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector)
{
int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
......@@ -191,8 +183,6 @@ static void blk_mig_read_cb(void *opaque, int ret)
blk->ret = ret;
block_mig_state.reads++;
block_mig_state.total_time += (curr_time - block_mig_state.prev_time_offset);
block_mig_state.prev_time_offset = curr_time;
QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
......@@ -310,8 +300,6 @@ static void init_blk_migration(QEMUFile *f)
block_mig_state.total_sector_sum = 0;
block_mig_state.prev_progress = -1;
block_mig_state.bulk_completed = 0;
block_mig_state.total_time = 0;
block_mig_state.reads = 0;
bdrv_iterate(init_blk_migration_it, NULL);
}
......@@ -493,32 +481,6 @@ static int64_t get_remaining_dirty(void)
return dirty * BLOCK_SIZE;
}
/*
 * Decide whether the iterative block-migration stage can stop.
 *
 * Returns 1 when the bulk phase is complete and either nothing is left
 * dirty, or the remaining dirty amount divided by the measured read
 * bandwidth does not exceed migrate_max_downtime().  Returns 0 otherwise.
 */
static int is_stage2_completed(void)
{
    int64_t dirty_left;

    if (block_mig_state.bulk_completed != 1) {
        return 0;
    }

    dirty_left = get_remaining_dirty();
    if (dirty_left == 0) {
        return 1;
    }

    /* finish stage2 if we think the remaining work fits below
       max_downtime */
    return (dirty_left / compute_read_bwidth()) <= migrate_max_downtime();
}
static void blk_mig_cleanup(void)
{
BlkMigDevState *bmds;
......@@ -619,7 +581,7 @@ static int block_save_iterate(QEMUFile *f, void *opaque)
qemu_put_be64(f, BLK_MIG_FLAG_EOS);
return is_stage2_completed();
return 0;
}
static int block_save_complete(QEMUFile *f, void *opaque)
......@@ -659,6 +621,14 @@ static int block_save_complete(QEMUFile *f, void *opaque)
return 0;
}
/*
 * Report how much dirty block data is still pending, as returned by
 * get_remaining_dirty().
 *
 * @f, @opaque and @max_size are not used here.
 */
static uint64_t block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
{
    /* Query once so the logged value and the returned value always agree. */
    int64_t pending = get_remaining_dirty();

    /* get_remaining_dirty() returns int64_t; "%ld" is wrong where long is
     * 32 bits, so use the fixed-width format macro. */
    DPRINTF("Enter save live pending %" PRId64 "\n", pending);
    return pending;
}
static int block_load(QEMUFile *f, void *opaque, int version_id)
{
static int banner_printed;
......@@ -755,6 +725,7 @@ SaveVMHandlers savevm_block_handlers = {
.save_live_setup = block_save_setup,
.save_live_iterate = block_save_iterate,
.save_live_complete = block_save_complete,
.save_live_pending = block_save_pending,
.load_state = block_load,
.cancel = block_migration_cancel,
.is_active = block_is_active,
......
......@@ -181,13 +181,15 @@ static int64_t buffered_get_rate_limit(void *opaque)
return s->xfer_limit;
}
/* 10ms xfer_limit is the limit that we should write each 10ms */
/* xfer_limit is the number of bytes that we should write at most in each
 * BUFFER_DELAY (100 ms) window */
#define BUFFER_DELAY 100
static void *buffered_file_thread(void *opaque)
{
QEMUFileBuffered *s = opaque;
int64_t expire_time = qemu_get_clock_ms(rt_clock) + BUFFER_DELAY;
int64_t initial_time = qemu_get_clock_ms(rt_clock);
int64_t max_size = 0;
bool last_round = false;
while (true) {
int64_t current_time = qemu_get_clock_ms(rt_clock);
......@@ -195,13 +197,22 @@ static void *buffered_file_thread(void *opaque)
if (s->migration_state->complete) {
break;
}
if (current_time >= expire_time) {
if (current_time >= initial_time + BUFFER_DELAY) {
uint64_t transferred_bytes = s->bytes_xfer;
uint64_t time_spent = current_time - initial_time;
double bandwidth = transferred_bytes / time_spent;
max_size = bandwidth * migrate_max_downtime() / 1000000;
DPRINTF("transferred %" PRIu64 " time_spent %" PRIu64
" bandwidth %g max_size %" PRId64 "\n",
transferred_bytes, time_spent, bandwidth, max_size);
s->bytes_xfer = 0;
expire_time = current_time + BUFFER_DELAY;
initial_time = current_time;
}
if (s->bytes_xfer >= s->xfer_limit) {
if (!last_round && (s->bytes_xfer >= s->xfer_limit)) {
/* usleep expects microseconds */
g_usleep((expire_time - current_time)*1000);
g_usleep((initial_time + BUFFER_DELAY - current_time)*1000);
}
if (buffered_flush(s) < 0) {
break;
......@@ -210,7 +221,7 @@ static void *buffered_file_thread(void *opaque)
DPRINTF("file is ready\n");
if (s->bytes_xfer < s->xfer_limit) {
DPRINTF("notifying client\n");
migrate_fd_put_ready(s->migration_state);
last_round = migrate_fd_put_ready(s->migration_state, max_size);
}
}
......
......@@ -81,7 +81,7 @@ void migrate_fd_connect(MigrationState *s);
ssize_t migrate_fd_put_buffer(MigrationState *s, const void *data,
size_t size);
void migrate_fd_put_ready(MigrationState *s);
bool migrate_fd_put_ready(MigrationState *s, uint64_t max_size);
int migrate_fd_close(MigrationState *s);
void add_migration_state_change_notifier(Notifier *notify);
......
......@@ -35,6 +35,7 @@ typedef struct SaveVMHandlers {
int (*save_live_setup)(QEMUFile *f, void *opaque);
int (*save_live_iterate)(QEMUFile *f, void *opaque);
int (*save_live_complete)(QEMUFile *f, void *opaque);
uint64_t (*save_live_pending)(QEMUFile *f, void *opaque, uint64_t max_size);
void (*cancel)(void *opaque);
LoadStateHandler *load_state;
bool (*is_active)(void *opaque);
......
......@@ -78,6 +78,7 @@ int qemu_savevm_state_begin(QEMUFile *f,
int qemu_savevm_state_iterate(QEMUFile *f);
int qemu_savevm_state_complete(QEMUFile *f);
void qemu_savevm_state_cancel(QEMUFile *f);
uint64_t qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size);
int qemu_loadvm_state(QEMUFile *f);
/* SLIRP */
......
......@@ -316,15 +316,17 @@ ssize_t migrate_fd_put_buffer(MigrationState *s, const void *data,
return ret;
}
void migrate_fd_put_ready(MigrationState *s)
bool migrate_fd_put_ready(MigrationState *s, uint64_t max_size)
{
int ret;
uint64_t pending_size;
bool last_round = false;
qemu_mutex_lock_iothread();
if (s->state != MIG_STATE_ACTIVE) {
DPRINTF("put_ready returning because of non-active state\n");
qemu_mutex_unlock_iothread();
return;
return false;
}
if (s->first_time) {
s->first_time = false;
......@@ -334,15 +336,19 @@ void migrate_fd_put_ready(MigrationState *s)
DPRINTF("failed, %d\n", ret);
migrate_fd_error(s);
qemu_mutex_unlock_iothread();
return;
return false;
}
}
DPRINTF("iterate\n");
ret = qemu_savevm_state_iterate(s->file);
if (ret < 0) {
migrate_fd_error(s);
} else if (ret == 1) {
pending_size = qemu_savevm_state_pending(s->file, max_size);
DPRINTF("pending size %lu max %lu\n", pending_size, max_size);
if (pending_size >= max_size) {
ret = qemu_savevm_state_iterate(s->file);
if (ret < 0) {
migrate_fd_error(s);
}
} else {
int old_vm_running = runstate_is_running();
int64_t start_time, end_time;
......@@ -368,9 +374,11 @@ void migrate_fd_put_ready(MigrationState *s)
vm_start();
}
}
last_round = true;
}
qemu_mutex_unlock_iothread();
return last_round;
}
static void migrate_fd_cancel(MigrationState *s)
......
......@@ -1753,6 +1753,25 @@ int qemu_savevm_state_complete(QEMUFile *f)
return qemu_file_get_error(f);
}
/*
 * Sum the pending amounts reported by every registered savevm handler.
 *
 * A handler contributes only if it provides a save_live_pending callback
 * and, when it also provides is_active, that callback returns true.
 * @f and @max_size are passed through unchanged to each callback.
 *
 * Returns the total of all save_live_pending() results.
 */
uint64_t qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size)
{
    SaveStateEntry *se;
    uint64_t total = 0;

    QTAILQ_FOREACH(se, &savevm_handlers, entry) {
        if (se->ops && se->ops->save_live_pending &&
            (!se->ops->is_active || se->ops->is_active(se->opaque))) {
            total += se->ops->save_live_pending(f, se->opaque, max_size);
        }
    }

    return total;
}
void qemu_savevm_state_cancel(QEMUFile *f)
{
SaveStateEntry *se;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment