Commit e897f047 authored by Leigh Stoller

Watch for the RELOADFAILED state and handle it like TBFAILED, to avoid
hanging there for 30 minutes or so.
parent cfe1834d
......@@ -130,7 +130,7 @@ use vars qw(@ISA @EXPORT);
TBDB_NODESTATE_MFSSETUP TBDB_NODESTATE_TBFAILED
TBDB_NODESTATE_POWEROFF TBDB_NODESTATE_SECVIOLATION
TBDB_NODESTATE_GPXEBOOTING TBDB_NODESTATE_TPMSIGNOFF
TBDB_NODESTATE_VNODEBOOTSTART
TBDB_NODESTATE_VNODEBOOTSTART TBDB_NODESTATE_RELOADFAILED
TBDB_NODEOPMODE_NORMAL TBDB_NODEOPMODE_DELAYING
TBDB_NODEOPMODE_UNKNOWNOS TBDB_NODEOPMODE_RELOADING
......@@ -504,6 +504,7 @@ sub TBDB_NODESTATE_TPMSIGNOFF() { "TPMSIGNOFF"; }
# Node event-state names. Each sub takes no arguments (empty prototype) and
# returns the state's name as a plain string.
sub TBDB_NODESTATE_SECVIOLATION(){ "SECVIOLATION"; }
sub TBDB_NODESTATE_MFSBOOTING() { "MFSBOOTING"; }
sub TBDB_NODESTATE_VNODEBOOTSTART() { "VNODEBOOTSTART"; }
# RELOADFAILED is checked alongside TBFAILED by the callers shown elsewhere
# in this change, so a failed reload is reported as a failure.
sub TBDB_NODESTATE_RELOADFAILED() { "RELOADFAILED"; }
# Node operating-mode names. These are declared without a prototype, unlike
# the event-state subs above.
sub TBDB_NODEOPMODE_ANY { "*"; } # A wildcard opmode
sub TBDB_NODEOPMODE_NORMAL { "NORMAL"; }
......
......@@ -1012,7 +1012,7 @@ sub Action($$$)
# but we want to do this after we have forked off from the parent
# and we have returned to the client (rpc).
#
my $output = GeniUtil::ExecQuiet("$IMAGE_SETUP -g $pid,$eid");
my $output = GeniUtil::ExecQuiet("$IMAGE_SETUP -d -g $pid,$eid");
if ($?) {
$msg = "Could not setup images:\n$output";
goto bad;
......@@ -1616,7 +1616,8 @@ sub WaitForNodes($$@)
{
my ($self, $pfailed, @nodes) = @_;
my %nodes = ();
my @waitstates = (TBDB_NODESTATE_TBFAILED, TBDB_NODESTATE_ISUP);
my @waitstates = (TBDB_NODESTATE_TBFAILED, TBDB_NODESTATE_RELOADFAILED,
TBDB_NODESTATE_ISUP);
return 0
if (!@nodes);
......
......@@ -689,7 +689,7 @@ use emutil;
use XML::Simple;
use libdb qw(TBDB_ALLOCSTATE_RES_INIT_DIRTY TBDB_NODESTATE_SHUTDOWN
TBResolveNextOSID TBDB_NODESTATE_ISUP TBDB_NODESTATE_TBFAILED
TBDB_NODESTATE_PXEWAIT);
TBDB_NODESTATE_RELOADFAILED TBDB_NODESTATE_PXEWAIT);
# Error log for local physical node. This overrides the default method above,
# since it is stored in the node.
......@@ -1357,7 +1357,8 @@ sub ComputeStatus($$)
# inner elab is setup and running okay. Inner boss/ops do tell
# us their state, so we use that to determine ready.
#
if ($node->eventstate() eq TBDB_NODESTATE_TBFAILED()) {
if ($node->eventstate() eq TBDB_NODESTATE_TBFAILED() ||
$node->eventstate() eq TBDB_NODESTATE_RELOADFAILED()) {
$status = "failed";
goto done;
}
......@@ -1379,7 +1380,8 @@ sub ComputeStatus($$)
if ($eventstate eq TBDB_NODESTATE_ISUP()) {
$status = "ready";
}
elsif ($eventstate eq TBDB_NODESTATE_TBFAILED()) {
elsif ($eventstate eq TBDB_NODESTATE_TBFAILED() ||
$eventstate eq TBDB_NODESTATE_RELOADFAILED()) {
$status = "failed";
}
elsif ($eventstate eq TBDB_NODESTATE_SHUTDOWN() ||
......
......@@ -1059,6 +1059,7 @@ sub WaitTillReloadDone($$$$$@)
if ($waittime > $maxwait ||
$eventstate eq TBDB_NODESTATE_TBFAILED() ||
$eventstate eq TBDB_NODESTATE_PXEFAILED() ||
$eventstate eq TBDB_NODESTATE_RELOADFAILED() ||
$isstuck) {
#
......@@ -1079,6 +1080,7 @@ sub WaitTillReloadDone($$$$$@)
"it has been $t minutes since it was rebooted.";
if ($eventstate eq TBDB_NODESTATE_TBFAILED() ||
$eventstate eq TBDB_NODESTATE_RELOADFAILED() ||
$eventstate eq TBDB_NODESTATE_PXEFAILED()) {
tbnotice(" $node is stuck in $eventstate.");
}
......@@ -1120,6 +1122,7 @@ sub WaitTillReloadDone($$$$$@)
(60*6),
\$actual_state,
(TBDB_NODESTATE_TBFAILED,
TBDB_NODESTATE_RELOADFAILED,
TBDB_NODESTATE_PXEFAILED,
TBDB_NODESTATE_ISUP))) {
$done{$node} = $waitmode;
......
......@@ -802,7 +802,9 @@ sub WaitForNodes($@)
{
my ($self, @nodelist) = @_;
my %nodes = ();
my @waitstates = (TBDB_NODESTATE_TBFAILED, TBDB_NODESTATE_ISUP);
my @waitstates = (TBDB_NODESTATE_TBFAILED,
TBDB_NODESTATE_RELOADFAILED,
TBDB_NODESTATE_ISUP);
# Maybe all nodes failed to light up?
return 0
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment