Commit 4dc57d48 authored by Mike Hibler

Handle a common failure on the node reload path.

Under load, nodes that have just entered reloading and have just rebooted
might fail to get bootinfo.  The default behavior in this case is for the
node to boot from disk (dubious, but that is the topic for another day).
This causes the node to fall off the RELOAD path, winding up in either
TBFAILED or ISUP.  Worse, if the node makes it to ISUP, its reload state
is cleared and even if the reload_daemon reboots the node, it will still
not go through the reloading process.

The result was a bunch of nodes left stuck in reloading.  Now, if a node makes
an invalid transition to TBFAILED or ISUP while in the RELOAD state machine,
it fires the new REBOOT trigger, which reboots the node.
Note that in the ISUP case, this trigger overrides the default that would
otherwise clear the reload state, so a reboot is sufficient to get the machine
back on the RELOAD track.
parent 7c4493dd
......@@ -819,8 +819,8 @@ sub stateTransition($$) {
objname => $node);
next;
};
(/^$TBPOWERCYCLE$/) && do {
handleCommand($node,$TBPOWERCYCLE);
(/^$TBREBOOT$/ || /^$TBPOWERCYCLE$/) && do {
handleCommand($node,$trig);
next;
};
/^RELOADOLDMFS$/ && do {
......@@ -1084,6 +1084,7 @@ sub handleCommand($$;$$) {
debug("Checking rebooting: $node, $nodes{$node}, ".
"$nodes{$node}{state}, $nodes{$node}{noretry}\n");
if (($nodes{$node}{state} ne TBDB_NODESTATE_ISUP) &&
($nodes{$node}{state} ne TBDB_NODESTATE_TBFAILED) &&
(!$nodes{$node}{noretry}) ) {
# This node shouldn't be rebooted now...
# XXX Send feedback here somehow!
......@@ -1103,13 +1104,9 @@ sub handleCommand($$;$$) {
# Permissions were checked in order to send the message,
# so we don't need to do any fancy stuff here.
my $cmd = "$nodereboot -r $nodelist";
my $redir = " 2>&1 >> $rebootlog &";
debug("$cmd $redir\n");
system("date $redir");
system($cmd.$redir) and
notify("$params/$command: ".
"Command '$cmd' failed, error $?: $!\n");
system("(date; $nodereboot -r $nodelist) >>$rebootlog 2>&1 &") and
notify("$params/$command: ".
"Command '$cmd' failed, error $?: $!\n");
# Set up a timeout, so we retry if we don't get SHUTDOWN in time
foreach $node (@nodes) {
......
......@@ -329,7 +329,7 @@ REPLACE INTO state_timeouts VALUES ('USERSTATUS','ACTIVE',0,'');
REPLACE INTO state_timeouts VALUES ('USERSTATUS','FROZEN',0,'');
REPLACE INTO state_timeouts VALUES ('USERSTATUS','NEWUSER',0,'');
REPLACE INTO state_timeouts VALUES ('USERSTATUS','UNAPPROVED',0,'');
REPLACE INTO state_timeouts VALUES ('TBCOMMAND','REBOOT',15,'CMDRETRY');
REPLACE INTO state_timeouts VALUES ('TBCOMMAND','REBOOT',45,'CMDRETRY');
REPLACE INTO state_timeouts VALUES ('TBCOMMAND','POWEROFF',0,'CMDRETRY');
REPLACE INTO state_timeouts VALUES ('TBCOMMAND','POWERON',0,'CMDRETRY');
REPLACE INTO state_timeouts VALUES ('TBCOMMAND','POWERCYCLE',0,'CMDRETRY');
......@@ -534,6 +534,11 @@ REPLACE INTO state_transitions VALUES ('RELOAD-PCVM','RELOADSETUP','RELOADING','
REPLACE INTO state_transitions VALUES ('RELOAD-PCVM','RELOADING','RELOADDONE','ReloadDone');
REPLACE INTO state_transitions VALUES ('RELOAD-PCVM','RELOADDONE','SHUTDOWN','ReloadDone');
REPLACE INTO state_transitions VALUES ('RELOAD-PCVM','SHUTDOWN','RELOADSETUP','ReloadSetup');
REPLACE INTO state_transitions VALUES ('RELOAD','BOOTING','TBSETUP','FailedBoot');
REPLACE INTO state_transitions VALUES ('RELOAD','TBSETUP','ISUP','FailedBoot');
REPLACE INTO state_transitions VALUES ('RELOAD','TBSETUP','TBFAILED','FailedBoot');
REPLACE INTO state_transitions VALUES ('RELOAD','ISUP','SHUTDOWN','RebootAfterFail');
REPLACE INTO state_transitions VALUES ('RELOAD','TBFAILED','SHUTDOWN','RebootAfterFail');
--
-- Dumping data for table `state_triggers`
......@@ -552,6 +557,8 @@ REPLACE INTO state_triggers VALUES ('*','OPSNODEBSD','ISUP','SCRIPT:opsreboot');
REPLACE INTO state_triggers VALUES ('*','NORMALv2','WEDGED','POWERCYCLE');
REPLACE INTO state_triggers VALUES ('*','RELOAD','RELOADOLDMFS','RELOADOLDMFS');
REPLACE INTO state_triggers VALUES ('*','RELOAD-PCVM','RELOADDONE','RESET, RELOADDONE');
REPLACE INTO state_triggers VALUES ('*','RELOAD','ISUP','REBOOT');
REPLACE INTO state_triggers VALUES ('*','RELOAD','TBFAILED','REBOOT');
--
-- Dumping data for table `table_regex`
......
#
# State transition/triggers changes to detect nodes which incorrectly boot
# from disk in op_mode RELOAD. This happens when nodes cannot get their
# boot info and pxeboot falls back to booting from the default partition.
#
use strict;
use libdb;
#
# DoUpdate - add the RELOAD-mode state transitions and REBOOT triggers used
# to catch nodes that wrongly boot from disk, and lengthen the TBCOMMAND
# REBOOT timeout.
#
# Arguments: ($dbhandle, $dbname, $version). All three are accepted but not
# used by this update.
#
# Each row is inserted only if no row with the same key columns is already
# present, so running the update a second time adds nothing new.
#
# Returns 0.
#
sub DoUpdate($$$)
{
    my ($dbhandle, $dbname, $version) = @_;

    # Rows for state_transitions: [op_mode, state1, state2, label].
    my @new_transitions = (
        ["RELOAD", "ISUP",     "SHUTDOWN", "RebootAfterFail"],
        ["RELOAD", "TBFAILED", "SHUTDOWN", "RebootAfterFail"],
        ["RELOAD", "TBSETUP",  "TBFAILED", "FailedBoot"],
        ["RELOAD", "TBSETUP",  "ISUP",     "FailedBoot"],
        ["RELOAD", "BOOTING",  "TBSETUP",  "FailedBoot"],
    );

    # Rows for state_triggers: [node_id, op_mode, state, trigger].
    my @new_triggers = (
        ["*", "RELOAD", "TBFAILED", "REBOOT"],
        ["*", "RELOAD", "ISUP",     "REBOOT"],
    );

    for my $entry (@new_transitions) {
        my ($op_mode, $from, $to) = @{$entry};
        my $where = "op_mode='$op_mode' AND state1='$from' AND state2='$to'";

        my $existing =
            DBQueryFatal("SELECT op_mode FROM state_transitions WHERE $where");

        # Leave an existing (op_mode, state1, state2) row untouched.
        next if ($existing->numrows != 0);

        # The row holds exactly the four column values, in column order.
        my $values = join("','", @{$entry});
        DBQueryFatal("INSERT INTO state_transitions VALUES ('$values')");
    }

    for my $entry (@new_triggers) {
        my ($node_id, $op_mode, $state) = @{$entry};
        my $where = "node_id='$node_id' AND op_mode='$op_mode' AND state='$state'";

        my $existing =
            DBQueryFatal("SELECT node_id FROM state_triggers WHERE $where");

        # Leave an existing (node_id, op_mode, state) trigger untouched.
        next if ($existing->numrows != 0);

        my $values = join("','", @{$entry});
        DBQueryFatal("INSERT INTO state_triggers VALUES ('$values')");
    }

    #
    # The reboot trigger in stated launches a node_reboot in the background
    # and queues a timeout. When no SHUTDOWN transition arrives from the
    # node before that timeout expires, another node_reboot is launched, so
    # a timeout that is too short can reboot the node twice.
    #
    # The previous timeout was too short. It has to cover node_reboot
    # failing an ssh (~60 seconds) and then sending an ipod.
    #
    my $timeout_update = "UPDATE state_timeouts SET timeout=75 "
                       . "WHERE op_mode='TBCOMMAND' and state='REBOOT'";
    DBQueryFatal($timeout_update);

    return 0;
}
1;
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment