Commit 3b210b7b authored by Mac Newbold

Rollback to prestatewait for now.

parent b71f5f90
This diff is collapsed.
......@@ -116,7 +116,6 @@ my $TBNODESTATE = TBDB_TBEVENT_NODESTATE;
my $TBNODEOPMODE = TBDB_TBEVENT_NODEOPMODE;
my $TBCONTROL = TBDB_TBEVENT_CONTROL;
my $TBCOMMAND = TBDB_TBEVENT_COMMAND;
my $TBFAILED = TBDB_TBEVENT_TBFAILED;
my $TBREBOOT = TBDB_COMMAND_REBOOT;
my $TBPOWEROFF = TBDB_COMMAND_POWEROFF;
my $TBPOWERON = TBDB_COMMAND_POWERON;
......@@ -296,10 +295,10 @@ while (1) {
# Check for nodes that have passed their timeout
if (!qhead($deadline,$node)) {
debug("HEAD: $node in ".($deadline-$now).", queue=".qsize()."\n");
info("HEAD: $node in ".($deadline-$now).", queue=".qsize()."\n");
while ($now >= $deadline && $node ne "") {
qpop($deadline,$node);
debug("POP: $node in ".($deadline-$now).", queue=".qsize()."\n");
info("POP: $node in ".($deadline-$now).", queue=".qsize()."\n");
handleCtrlEvent($node,$TBTIMEOUT);
if (0) { qshow(); }
if (qhead($deadline,$node)) {
......@@ -369,7 +368,6 @@ sub readStates(;@) {
$nodes{$node_id}{notified} = 0;
$nodes{$node_id}{timedout} = 0;
$nodes{$node_id}{noretry} = 0;
$nodes{$node_id}{rebooting} = 0;
# Is there a timeout? If so, set it up!
setTimeout($mode,$state,$node_id,$timestamp);
}
......@@ -624,7 +622,6 @@ sub stateTransition($$) {
# We successfully booted, so clear some flags
$nodes{$node}{noretry} = 0;
$nodes{$node}{timedout} = 0;
$nodes{$node}{rebooting} = 0;
# Check if we really need to do a reset
my $r = DBQueryWarn("select osid,def_boot_osid from nodes ".
"where node_id='$node'");
......@@ -829,33 +826,18 @@ sub handleCtrlEvent($$) {
foreach ($action) {
/^$TBTIMEOUTREBOOT/ && do {
# If the node is in our control (ie node_reboot),
# we want to do something. If it is in the user's
# control (went to shutdown without a reboot event),
# then we don't want to touch it.
if ($nodes{$node}{rebooting}) {
if ($timedout>1) {
# We've tried too many times...
# The node has now officially failed to boot
notify("Node $node timed out $timedout times!\n".
"Giving up until it boots sucessfully.\n");
$nodes{$node}{noretry} = 1;
info("$node: Sending $TBFAILED $TBREBOOT $node\n");
EventSendWarn(host => $BOSSNODE ,
objtype => $TBFAILED ,
eventtype => $TBREBOOT ,
objname => $node);
} else {
# XXX Temporary! For now notify instead of
# really rebooting, until the timeout/retry
# stuff is gone from os_setup and os_load
#notify("Node $node has timed out in state ".
# "$mode/$state - REBOOT requested\n");
handleCommand($node,$TBREBOOT,$timedout,1);
}
if ($timedout>3) {
# We've tried too many times...
notify("Node $node has timed out too many times!\n".
"Giving up until it boots sucessfully.\n");
$nodes{$node}{noretry} = 1;
} else {
info("Node $node timed out in state $mode/$state ".
"under user's control - not rebooting\n");
# XXX Temporary! For now notify instead of
# really rebooting, until the timeout/retry
# stuff is gone from os_setup and os_load
notify("Node $node has timed out in state ".
"$mode/$state - REBOOT requested\n");
#handleCommand($node,$TBREBOOT,$timedout,1);
}
last; };
/^$TBTIMEOUTNOTIFY/ && do {
......@@ -881,7 +863,7 @@ sub handleCommand($$;$$) {
# We may need to do it here (while iterating over the list), or
# make some other fix up in handleEvent.
if ($command eq $TBREBOOT && $retry >1) {
if ($command eq $TBREBOOT && $retry >=4) {
announce("Node $params has tried rebooting $retry times and has \n".
"still not been successful. Please look into it soon.\n".
"" );# "In the meantime, $params will be powered off.\n");
......@@ -903,14 +885,11 @@ sub handleCommand($$;$$) {
$node = $nodes[$n];
debug("Checking rebooting: $node, $nodes{$node}, ".
"$nodes{$node}{state}, $nodes{$node}{noretry}\n");
if (($nodes{$node}{rebooting}) &&
(!$nodes{$node}{noretry}) ) {
if (($nodes{$node}{state} ne TBDB_NODESTATE_ISUP) &&
(!$nodes{$node}{noretry}) ) {
# This node shouldn't be rebooted now...
info("$node: Sending $TBFAILED $TBREBOOT $node\n");
EventSendWarn(host => $BOSSNODE ,
objtype => $TBFAILED ,
eventtype => $TBREBOOT ,
objname => $node);
# XXX Send feedback here somehow!
info("$node: Trying to reboot too soon! Skipping.\n");
# Cut it out of the list
debug("Nodelist before ==> ".join(" ",@nodes)."\n");
......@@ -926,8 +905,6 @@ sub handleCommand($$;$$) {
# Permissions were checked in order to send the message,
# so we don't need to do any fancy stuff here.
$nodes{$node}{rebooting}=1;
my $cmd = "$nodereboot -r $nodelist";
my $redir = " 2>&1 >> /usr/testbed/log/nodereboot.log &";
debug("$cmd $redir\n");
......
......@@ -45,7 +45,6 @@ use lib "@prefix@/lib";
use libdb;
use libtestbed;
use event;
use StateWait;
use POSIX qw(strftime);
my $ssh = "$TB/bin/sshtb -n";
......@@ -121,10 +120,10 @@ if (defined($options{"e"})) {
}
# XXX Temporary, until we make event sending the default
#$realmode=1;
if ($realmode && $UID && !TBAdmin($UID)) {
die("*** You cannot use real mode!\n");
}
$realmode=1;
#if ($realmode && $UID && !TBAdmin($UID)) {
# die("*** You cannot use real mode!\n");
#}
#
# If eidmode, then get the node list out of the DB instead of the command
......@@ -233,51 +232,21 @@ if (! keys(%realnodes) && ! keys(%virtnodes)) {
my @sortednodes = sort(keys(%realnodes));
if (!$realmode) {
$StateWait::debug = $debug;
if (!$nowait) {
my @states = ();
if ($waitmode) {
print "Waiting for nodes to shut down and come up...\n";
@states= ( TBDB_NODESTATE_SHUTDOWN , TBDB_NODESTATE_ISUP );
} else {
print "Waiting for nodes to shut down...\n";
@states= ( TBDB_NODESTATE_SHUTDOWN );
}
initStateWait(\@states, @sortednodes)
}
EventSendFatal(host => $BOSSNODE ,
objtype => TBDB_TBEVENT_COMMAND ,
eventtype => TBDB_COMMAND_REBOOT ,
objname => join(",",@sortednodes) );
my $rv = 0;
if (!$nowait) {
# In here we can do some output to tell the user what's going on.
my $start = time();
my $now = $start;
my $done = 0;
my $total = scalar(@sortednodes);
my @finished = ();
my @failed = ();
while( $done < $total ) {
print "Waiting for ".($total-$done)." nodes...\n";
waitForState(\@finished,\@failed,60);
$now = time();
$done = scalar(@finished) + scalar(@failed);
my $min = int(($now - $start + 30)/60); # round to nearest min.
print "After $min min., $done nodes done...\n";
#print "fin = ".join(",",@finished)." fail = ".join(",",@failed).
# " Time=$now (".($now-$start)." elapsed), done=$done\n";
sleep(1);
}
print "All $total nodes finished.\n";
$bad = scalar(@failed);
if ($bad) {
print "There were $bad failures: ".join(" ",@failed)."\n";
$rv = 1;
if ($waitmode) {
# Wait for [SHUTDOWN,ISUP]
} else {
# Wait for [SHUTDOWN]
}
endStateWait();
}
exit($rv);
exit(0);
}
#
......
......@@ -51,7 +51,6 @@ my $MAXRETRIES = 1;
use lib "@prefix@/lib";
use libdb;
use libtestbed;
use StateWait;
# Be careful not to exit on transient error
$libdb::DBQUERY_MAXTRIES = 30;
......@@ -69,6 +68,7 @@ my $imageid;
my $imagepid = TB_OPSPID;
my %imageid_row;
my @nodes = ();
my %retries = ();
my $mereuser = 0;
my $waitmode = 1;
my $failures = 0;
......@@ -308,36 +308,111 @@ if (! $waitmode) {
exit $failures;
}
print "Issuing reboot for @nodes and then waiting ...\n";
initStateWait([ TBDB_NODESTATE_RELOADDONE ] , @nodes);
system("$nodereboot @nodes");
if ($?) {
print "Reboot failed for (some of) @nodes. Quitting!\n";
exit ($? >> 8);
# The retry vector is initialized to the number of retries we allow per
# node, after which it is a fatal error.
foreach my $node (@nodes) {
$retries{$node} = $MAXRETRIES;
}
my $total = scalar(@nodes);
my @finished = ();
my @failed = ();
waitForState(\@finished, \@failed, 60 * 15);
endStateWait();
my $worked = scalar(@finished);
my $failed = scalar(@failed);
my $remain = $total - $worked - $failed;
if ($worked != $total) {
print "*** os_load: Only $worked nodes of $total succeeded!\n";
if ($failed) { print "\tThere were $failed failures.\n"; }
if ($remain) { print "\tThere were $remain nodes that timed out.\n"; }
my @failed=();
while (@nodes) {
# Reboot them all.
print "Issuing reboot for @nodes and then waiting ...\n";
if ($reboot) {
system("$nodereboot @nodes");
if ($?) {
print "Reboot failed for (some of) @nodes. Quitting!\n";
exit ($? >> 8);
}
}
# Now wait for them.
$startwait = time;
@failed = WaitTillReloadDone(@nodes);
@nodes=();
while (@failed) {
my $node = shift(@failed);
if ($retries{$node}) {
print "*** Trying $node again (resetting/rebooting) ...\n";
push(@nodes, $node);
# Possible race with reboot?
SetupReload($node);
# Retry until count hits zero.
$retries{$node} -= 1;
} else {
print "*** $node failed too many times. Skipping!\n";
$failures++;
}
}
}
$failures += $failed;
print "OS Reload Done! There were $failures failures!\n";
exit($failures);
#
# WaitTillReloadDone(@nodes)
#
# Poll the nodes table every 5 seconds until each of the given nodes has
# either left the reload opmode or has been waiting for more than $maxwait
# seconds, measured from $startwait. Returns the list of nodes that hit the
# time limit (empty if none did). Reads $startwait, $maxwait and $dbg from
# the enclosing file scope.
#
sub WaitTillReloadDone {
    my (@nodes) = @_;
    my $pending = @nodes;                   # nodes not yet settled
    my %settled = map { $_ => 0 } @nodes;   # node => 1 once done or wedged
    my @wedged  = ();

    # Highest whole-minute mark already reported; shared by all nodes.
    my $minutes = 0;

    if ($dbg) {
        print STDERR "Waiting for @nodes to finish reloading\n".`date`;
    }

    while ($pending) {
        # Sleep before polling: gives the reboot time to take effect and
        # avoids one extra wait once every node has settled.
        sleep(5);

        for my $node (@nodes) {
            next if $settled{$node};

            my $sth = DBQueryFatal("SELECT op_mode FROM nodes ".
                                   "where node_id='$node'");
            my ($opmode) = $sth->fetchrow_array();

            # Success is simply the node leaving the reloading opmode.
            if ($opmode ne TBDB_NODEOPMODE_RELOAD) {
                if ($dbg) {
                    print STDERR "$node has left reloading mode\n".`date`;
                }
                $settled{$node} = 1;
                $pending--;
                next;
            }

            # Still reloading. Elapsed time is relative to $startwait, which
            # the caller sets after issuing the reboot. Eventually stated's
            # timeouts are meant to take over rebooting stuck nodes.
            my $elapsed = time - $startwait;

            if ($elapsed > $maxwait) {
                my $t = (int ($elapsed / 60));
                print "*** $node appears wedged; ".
                      "its been $t minutes since it was rebooted.\n";
                $settled{$node} = 1;
                $pending--;
                push(@wedged, $node);
                next;
            }

            # Progress report, at most once per newly elapsed minute.
            if (int($elapsed / 60) > $minutes) {
                $minutes = int($elapsed / 60);
                print "Still waiting for $node to reload - ".
                      "its been $minutes minute(s)\n";
            }
        }
    }

    return @wedged;
}
# Setup a reload. Note that imageid is global.
sub SetupReload($) {
......
This diff is collapsed.
......@@ -45,7 +45,7 @@ my $state;
#
# Untaint the path
#
#
# Untaint the path
$ENV{'PATH'} = "/bin:/usr/bin:/sbin:/usr/sbin:$TB/libexec:$TB/sbin:$TB/bin";
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};
......@@ -75,7 +75,7 @@ TBDebugTimeStamp("tbrestart started");
#
# Must be an active experiment to restart!
#
#
if (! ($state = ExpState($pid, $eid))) {
die("*** $0:\n".
" No such experiment $pid/$eid\n");
......@@ -86,8 +86,8 @@ if ($state ne EXPTSTATE_ACTIVE) {
}
#
# Stop the event system.
#
# Stop the event system.
#
if (!$DISABLE_EVENTS) {
print "Stopping the event system.\n";
TBDebugTimeStamp("eventsys_control started");
......@@ -100,14 +100,14 @@ if (!$DISABLE_EVENTS) {
#
# Clearing the portstat counters seems like a good idea.
#
#
print "Clearing port counters.\n";
TBDebugTimeStamp("portstats started");
if (system("portstats -z -a -q $pid $eid")) {
print STDERR "*** WARNING: Failed to clear port counters.\n";
#
# This is a non-fatal error.
#
#
}
TBDebugTimeStamp("portstats finished");
......@@ -118,21 +118,17 @@ TBDebugTimeStamp("portstats finished");
# this point. This is terribly imperfect of course, since there are no
# guarantees, especially since the events are async (a tbreset and isup
# could be in the event queue for a node). The ready bits present the worst
# problem.
# problem.
#
print "Rebooting all nodes\n";
TBDebugTimeStamp("node reboot started");
# Start the wait before the node reboot, so we don't miss any events...
initStateWait([ TBDB_NODESTATE_ISUP ],@nodes);
foreach my $node ( @nodes ) {
#
# Must duplicate a check that would be done in node_reboot if we
# gave it the entire list. No point in rebooting local jails.
#
my ($jailed, $plab);
if (TBIsNodeVirtual($node, \$jailed, \$plab)) {
if (! $jailed && ! $plab) {
next;
......@@ -141,7 +137,7 @@ foreach my $node ( @nodes ) {
next;
}
}
if (system("$nodereboot $node")) {
die("*** $0:\n".
" Failed to reboot node $node!\n");
......@@ -151,56 +147,16 @@ foreach my $node ( @nodes ) {
}
print STDOUT "Waiting for nodes to come up ...\n";
my %notified = ();
foreach $n (@nodes) { $notified{$n} = 0; }
my $maxwait = 60 * 15; # Don't wait more than 15 min.
my $u = 60; # Update the user every 60 seconds.
my $total = scalar(@nodes);
my $done = 0;
my $start = time();
my $now = $start;
my @finished = ();
my @failed = ();
while ( ($now - $start) < $maxwait && $done < $total ) {
my $wait = min($u,$start + $maxwait - $now);
waitForState(\@finished, \@failed, $wait);
$now = time();
$done = scalar(@finished) + scalar(@failed);
foreach $n (@finished) {
if (!$notified{$n}) {
print STDOUT "$n is alive and well\n";
SetNodeBootStatus($node, NODEBOOTSTATUS_OKAY);
$notified{$n}=1;
}
}
foreach $n (@failed) {
if (!$notified{$n}) {
print STDOUT "$n failed to come up!\n";
SetNodeBootStatus($node, NODEBOOTSTATUS_FAILED);
$failed++;
$notified{$n}=1;
}
}
my $min = int(($now - $start + 30)/60); # round to nearest min.
print "After $min min., $done nodes done...\n";
}
endStateWait();
my $m = $maxwait/60;
if ($done != $total) {
foreach $n (@nodes) {
if (!$notified{$n}) {
print STDOUT "$n failed to come up after $m minutes!\n";
SetNodeBootStatus($node, NODEBOOTSTATUS_FAILED);
$failed++;
$notified{$n}=1;
}
foreach my $node ( sort(@nodes) ) {
if (! TBNodeStateWait($node, TBDB_NODESTATE_ISUP, $waitstart, (60*6))) {
print STDOUT "$node is alive and well\n";
SetNodeBootStatus($node, NODEBOOTSTATUS_OKAY);
next;
}
SetNodeBootStatus($node, NODEBOOTSTATUS_FAILED);
$failed++;
}
TBDebugTimeStamp("node reboot finished");
if ($failed) {
......@@ -209,8 +165,8 @@ if ($failed) {
}
#
# Start the event system.
#
# Start the event system.
#
if (!$DISABLE_EVENTS) {
print "Starting the event system.\n";
TBDebugTimeStamp("eventsys_control started");
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment