Commit 3b0a7a0c authored by Leigh Stoller

Better retry handling when a vnode gets stuck in BOOTING.

Might do the same for physical nodes.
parent 9df17ba7
......@@ -1657,6 +1657,9 @@ sub WaitForNodes($$@)
$nodes{$node->node_id()} = $node;
$node->_waitstart(time());
$node->_waitend(undef);
$node->_laststate("");
$node->_laststatestamp(time());
$node->_retried(0);
$node->Refresh();
#
# Count up number of virtnodes on each physnode.
......@@ -1797,6 +1800,40 @@ sub WaitForNodes($$@)
delete($nodes{$node_id});
next;
}
#
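# Watch for a vnode stuck in BOOTING for more than three minutes; this
# happens a lot with XEN VMs, and a reboot typically solves it. Reboot
# it once; if it gets stuck again (or the reboot fails), give up on it.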
#
if ($node->isvirtnode() &&
$state eq TBDB_NODESTATE_BOOTING &&
$state eq $node->_laststate() &&
time() - $node->_laststatestamp() > 180) {
my $giveup = $node->_retried();
if ($giveup) {
print STDERR
"$node_id still stuck in BOOTING, giving up.\n";
}
else {
print STDERR
"$node_id is stuck in BOOTING, restarting it.\n";
$node->_retried(1);
system("$NODEREBOOT $node_id");
$giveup = 1
if ($?);
}
if ($giveup) {
$node->_sliver()->SetStatus("failed")
if (defined($node->_sliver()));
$node->_waitend(time());
delete($nodes{$node_id});
}
next;
}
# Mark when state changes.
if ($node->_laststate() ne $state) {
$node->_laststatestamp(time());
$node->_laststate($state);
}
if (int($waittime / 60) > $minutes) {
# Changing minutes is why we get this print for just
# a single node each time.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment