Commit af9b08f3 authored by Leigh Stoller

Add new state for XEN guest boot; send a VNODEBOOTSTART just before we do
the xl create, so that we can watch for VMs that do not get to TBSETUP in a
reasonable amount of time (which means the VM hung and we need to restart it).
parent b5695436
......@@ -2294,9 +2294,20 @@ sub vnodeBoot($$$$)
captureStart($vnode_id);
}
# notify stated that we are about to boot
# notify stated that we are about to boot. We need this transition for
# stated to do its thing, this state name is treated specially.
libutil::setState("BOOTING");
#
# But, we find ourselves stuck in BOOTING quite often if the VM
# fails to boot far enough to send in a state transition. We want
# to catch this specific hangup, so we will send an intermediate
# state that the server side can notice, and watch for how long it
# stays in the state.
#
sleep(1);
libutil::setState("VNODEBOOTSTART");
#
# We are going to watch for a busted control network interface, which
# happens a lot. There is a problem with the control vif not working,
......
......@@ -130,6 +130,7 @@ use vars qw(@ISA @EXPORT);
TBDB_NODESTATE_MFSSETUP TBDB_NODESTATE_TBFAILED
TBDB_NODESTATE_POWEROFF TBDB_NODESTATE_SECVIOLATION
TBDB_NODESTATE_GPXEBOOTING TBDB_NODESTATE_TPMSIGNOFF
TBDB_NODESTATE_VNODEBOOTSTART
TBDB_NODEOPMODE_NORMAL TBDB_NODEOPMODE_DELAYING
TBDB_NODEOPMODE_UNKNOWNOS TBDB_NODEOPMODE_RELOADING
......@@ -502,6 +503,7 @@ sub TBDB_NODESTATE_GPXEBOOTING(){ "GPXEBOOTING"; }
# Node state name constants; each sub returns the state's name as a string.
sub TBDB_NODESTATE_TPMSIGNOFF() { "TPMSIGNOFF"; }
sub TBDB_NODESTATE_SECVIOLATION(){ "SECVIOLATION"; }
sub TBDB_NODESTATE_MFSBOOTING() { "MFSBOOTING"; }
# Intermediate state the XEN vnode boot code reports right after BOOTING,
# so that a VM which never progresses past it can be noticed as hung.
sub TBDB_NODESTATE_VNODEBOOTSTART() { "VNODEBOOTSTART"; }
# Node operating-mode name constants.
sub TBDB_NODEOPMODE_ANY { "*"; } # A wildcard opmode
sub TBDB_NODEOPMODE_NORMAL { "NORMAL"; }
......
......@@ -1801,11 +1801,11 @@ sub WaitForNodes($$@)
next;
}
#
# Watch for a node stuck in BOOTING; this happens a lot with
# XEN VMs, a reboot typically solves it.
# Watch for a node stuck in VNODEBOOTSTART; this happens a lot with
# XEN VMs, a reboot typically solves it.
#
if ($node->isvirtnode() &&
$state eq TBDB_NODESTATE_BOOTING &&
$state eq TBDB_NODESTATE_VNODEBOOTSTART &&
$state eq $node->_laststate() &&
time() - $node->_laststatestamp() > 180) {
my $giveup = $node->_retried();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment