Commit 2b2a306d authored by Mac Newbold's avatar Mac Newbold

New StateWait changes - the main point of all this is to move to our new

model of waiting for state changes. Before, we were watching the database
(which meant we could only watch for terminal/stable/long-lived states, and
had to poll the db). Now things that are waiting for states to change
become event listeners, and watch the stream of events flow by, and don't
have to do any polling. They can now watch for any state, and even
sequences of states (i.e., a Shutdown followed by an Isup).

To do this, there is now a cool StateWait.pm library that encapsulates the
functionality needed. To use it, you call initStateWait before you start
the chain of events (i.e., before you call node_reboot). Then do your stuff,
and call waitForState() when you're ready to wait. It can be told to
return periodically with the results so far, and you can cancel waiting
for things. An example program called waitForState is in
testbed/event/stated/ , and can also be used nicely as a command line tool
that wraps up the library functionality.

This also required the introduction of a TBFAILED event that can be sent
when a node isn't going to make it to the state that someone may be
waiting for. I.e., if it gets wedged coming up, and stated retries but
eventually gives up on it, stated sends this event to let listeners know that
the node is hosed and won't ever come up.

Another thing that is part of this is that node_reboot moves (back) to the
fully-event-driven model, where users call node_reboot, and it does some
checks and sends some events. Then stated calls node_reboot in "real mode"
to actually do the work, and handles doing the appropriate retries until
the node either comes up or is deemed "failed" and stated gives up on it.
This means stated is also the gatekeeper of when you can and cannot reboot
a node. (See mail archives for extensive discussions of the details.)

A big part of the motivation for this was to get uninformed timeouts and
retries out of os_load/os_setup and put them in stated where we can make a
wiser choice. So os_load and os_setup now use this new stuff and don't
have to worry about timing out on nodes and rebooting. Stated makes sure
that they either come up, get retried, or fail to boot. tbrestart also
underwent a similar change.
parent ed3fcd05
This diff is collapsed.
......@@ -56,13 +56,13 @@ $port = "";
$URL = "";
$debug = 0;
my $handle;
my $tuple;
$handle=0;
$tuple=0;
my @done = ();
my @failures = ();
my $nodecount = ();
my %remain = ();
my @done;
my @failures;
my $nodecount;
my %remain;
#
# Exported Sub-Routines / Functions
......@@ -71,6 +71,10 @@ my %remain = ();
sub initStateWait( $@ ) {
my $states = shift;
my @nodes = @_;
@done=();
@failures=();
$nodecount=0;
%remain=();
$nodecount = scalar(@nodes);
# states is an arrayref
if ($debug) {
......@@ -84,24 +88,34 @@ sub initStateWait( $@ ) {
# Do the subscription for the right stuff, including all the
# states for all the nodes, and the failure event for all nodes
$handle = event_register($URL,0);
if (!$handle) { die "Unable to register with event system\n"; }
if ($handle==0) {
if ($debug) { print "Getting handle for $URL - "; }
$handle = event_register($URL,0);
if ($debug) { print "returned - "; }
if (!$handle) { die "Unable to register with event system\n"; }
if ($debug) { print "Success: $handle\n"; }
}
if ($debug) { print "Getting tuple - "; }
$tuple = address_tuple_alloc();
if (!$tuple) { die "Could not allocate an address tuple\n"; }
%$tuple = ( objtype => TBDB_TBEVENT_NODESTATE,
eventtype => join(",",@$states),
%$tuple = ( objtype => join(",",TBDB_TBEVENT_NODESTATE,
TBDB_TBEVENT_TBFAILED),
eventtype => join(",",@$states, TBDB_COMMAND_REBOOT ),
objname => join(",",@nodes) );
if ($debug > 1) {
if ($debug) { print "Success: $tuple\n"; }
if ($debug > 0) {
print "tuple = ('".join("', '",keys(%$tuple))."') => ('".
join("', '",values(%$tuple))."')\n";
}
if ($debug) { print "Subscribing - "; }
if (!event_subscribe($handle,\&doEvent,$tuple)) {
die "Could not subscribe to events\n";
}
if ($debug) { print "Success.\n"; }
foreach $n (@nodes) {
my @l = @$states;
......@@ -131,6 +145,12 @@ sub doEvent( $$$ ) {
"$eventtype\n";
}
my $n = $objname;
if (defined($remain{$n}) && $objtype eq TBDB_TBEVENT_TBFAILED) {
# It is a failed boot... add it to the failures list.
if ($debug) { print "Got $eventtype failure for $n... Aborting!\n"; }
push(@failures,$n);
delete($remain{$n});
}
if (defined($remain{$n}) && @{$remain{$n}}[0] eq $eventtype) {
# this is the next state we were waiting for
if ($debug) { print "Got $eventtype for $n\n" };
......@@ -191,9 +211,7 @@ sub cancelWait( $ ) {
sub endStateWait() {
%remain = ();
if (event_unregister($handle) == 0) {
die "Unable to unregister with event system\n";
}
$tuple = address_tuple_free($tuple);
if ($debug) { print "endStateWait\n"; }
return 0;
}
......@@ -201,3 +219,8 @@ sub endStateWait() {
# Always end a package successfully!
1;
# Interpreter-exit cleanup for the event system registration.
# Only attempts to unregister when $handle is true (initStateWait stores the
# result of event_register() in it; it starts out as 0). A return value of 0
# from event_unregister() is treated as failure and reported via die.
END {
if ($handle && event_unregister($handle) == 0) {
die "Unable to unregister with event system\n";
}
}
......@@ -116,6 +116,7 @@ my $TBNODESTATE = TBDB_TBEVENT_NODESTATE;
my $TBNODEOPMODE = TBDB_TBEVENT_NODEOPMODE;
my $TBCONTROL = TBDB_TBEVENT_CONTROL;
my $TBCOMMAND = TBDB_TBEVENT_COMMAND;
my $TBFAILED = TBDB_TBEVENT_TBFAILED;
my $TBREBOOT = TBDB_COMMAND_REBOOT;
my $TBPOWEROFF = TBDB_COMMAND_POWEROFF;
my $TBPOWERON = TBDB_COMMAND_POWERON;
......@@ -295,10 +296,10 @@ while (1) {
# Check for nodes that have passed their timeout
if (!qhead($deadline,$node)) {
info("HEAD: $node in ".($deadline-$now).", queue=".qsize()."\n");
debug("HEAD: $node in ".($deadline-$now).", queue=".qsize()."\n");
while ($now >= $deadline && $node ne "") {
qpop($deadline,$node);
info("POP: $node in ".($deadline-$now).", queue=".qsize()."\n");
debug("POP: $node in ".($deadline-$now).", queue=".qsize()."\n");
handleCtrlEvent($node,$TBTIMEOUT);
if (0) { qshow(); }
if (qhead($deadline,$node)) {
......@@ -368,6 +369,7 @@ sub readStates(;@) {
$nodes{$node_id}{notified} = 0;
$nodes{$node_id}{timedout} = 0;
$nodes{$node_id}{noretry} = 0;
$nodes{$node_id}{rebooting} = 0;
# Is there a timeout? If so, set it up!
setTimeout($mode,$state,$node_id,$timestamp);
}
......@@ -622,6 +624,7 @@ sub stateTransition($$) {
# We successfully booted, so clear some flags
$nodes{$node}{noretry} = 0;
$nodes{$node}{timedout} = 0;
$nodes{$node}{rebooting} = 0;
# Check if we really need to do a reset
my $r = DBQueryWarn("select osid,def_boot_osid from nodes ".
"where node_id='$node'");
......@@ -826,18 +829,33 @@ sub handleCtrlEvent($$) {
foreach ($action) {
/^$TBTIMEOUTREBOOT/ && do {
if ($timedout>3) {
# We've tried too many times...
notify("Node $node has timed out too many times!\n".
"Giving up until it boots sucessfully.\n");
$nodes{$node}{noretry} = 1;
# If the node is in our control (ie node_reboot),
# we want to do something. If it is in the user's
# control (went to shutdown without a reboot event),
# then we don't want to touch it.
if ($nodes{$node}{rebooting}) {
if ($timedout>1) {
# We've tried too many times...
# The node has now officially failed to boot
notify("Node $node timed out $timedout times!\n".
"Giving up until it boots sucessfully.\n");
$nodes{$node}{noretry} = 1;
info("$node: Sending $TBFAILED $TBREBOOT $node\n");
EventSendWarn(host => $BOSSNODE ,
objtype => $TBFAILED ,
eventtype => $TBREBOOT ,
objname => $node);
} else {
# XXX Temporary! For now notify instead of
# really rebooting, until the timeout/retry
# stuff is gone from os_setup and os_load
#notify("Node $node has timed out in state ".
# "$mode/$state - REBOOT requested\n");
handleCommand($node,$TBREBOOT,$timedout,1);
}
} else {
# XXX Temporary! For now notify instead of
# really rebooting, until the timeout/retry
# stuff is gone from os_setup and os_load
notify("Node $node has timed out in state ".
"$mode/$state - REBOOT requested\n");
#handleCommand($node,$TBREBOOT,$timedout,1);
info("Node $node timed out in state $mode/$state ".
"under user's control - not rebooting\n");
}
last; };
/^$TBTIMEOUTNOTIFY/ && do {
......@@ -863,7 +881,7 @@ sub handleCommand($$;$$) {
# We may need to do it here (while iterating over the list), or
# make some other fix up in handleEvent.
if ($command eq $TBREBOOT && $retry >=4) {
if ($command eq $TBREBOOT && $retry >1) {
announce("Node $params has tried rebooting $retry times and has \n".
"still not been successful. Please look into it soon.\n".
"" );# "In the meantime, $params will be powered off.\n");
......@@ -885,11 +903,14 @@ sub handleCommand($$;$$) {
$node = $nodes[$n];
debug("Checking rebooting: $node, $nodes{$node}, ".
"$nodes{$node}{state}, $nodes{$node}{noretry}\n");
if (($nodes{$node}{state} ne TBDB_NODESTATE_ISUP) &&
(!$nodes{$node}{noretry}) ) {
if (($nodes{$node}{rebooting}) &&
(!$nodes{$node}{noretry}) ) {
# This node shouldn't be rebooted now...
# XXX Send feedback here somehow!
info("$node: Sending $TBFAILED $TBREBOOT $node\n");
EventSendWarn(host => $BOSSNODE ,
objtype => $TBFAILED ,
eventtype => $TBREBOOT ,
objname => $node);
info("$node: Trying to reboot too soon! Skipping.\n");
# Cut it out of the list
debug("Nodelist before ==> ".join(" ",@nodes)."\n");
......@@ -905,6 +926,8 @@ sub handleCommand($$;$$) {
# Permissions were checked in order to send the message,
# so we don't need to do any fancy stuff here.
$nodes{$node}{rebooting}=1;
my $cmd = "$nodereboot -r $nodelist";
my $redir = " 2>&1 >> /usr/testbed/log/nodereboot.log &";
debug("$cmd $redir\n");
......
......@@ -18,10 +18,11 @@ my @states=('ISUP');
my $u = 10;
my $t = 300;
my %opt = ();
getopts("hds:u:t:",\%opt);
getopts("hdv:s:u:t:",\%opt);
if ($opt{h}) { usage(); }
if ($opt{d}) { $debug++; }
if ($opt{v}) { $debug+=$opt{v}; }
if ($opt{u}) { $u = $opt{u}; }
if ($opt{t}) { $t = $opt{t}; }
if ($opt{s}) { @states = split(",",$opt{s}); }
......@@ -32,7 +33,7 @@ my @nodes = @ARGV;
sub usage {
print <<EOF;
Usage:
$0 [-h] [-d]
$0 [-h] [-d] [-v <level>]
[-u <update freq.>] [-t <timeout>]
[-s state1[,s2]] <node> [<node> ...]
......@@ -42,7 +43,7 @@ order (non-consecutively). If no state is supplied, ISUP is
used. Update frequency is how many seconds apart status updates should
be (default $u). Timeout is how many seconds we should wait before
giving up and returning (default $t). The -h option shows this
message, and -d enables extra debugging output.
message, and -d and -v enable extra debugging output.
EOF
exit(1);
......
......@@ -45,6 +45,7 @@ use lib "@prefix@/lib";
use libdb;
use libtestbed;
use event;
use StateWait;
use POSIX qw(strftime);
my $ssh = "$TB/bin/sshtb -n";
......@@ -120,10 +121,10 @@ if (defined($options{"e"})) {
}
# XXX Temporary, until we make event sending the default
$realmode=1;
#if ($realmode && $UID && !TBAdmin($UID)) {
# die("*** You cannot use real mode!\n");
#}
#$realmode=1;
if ($realmode && $UID && !TBAdmin($UID)) {
die("*** You cannot use real mode!\n");
}
#
# If eidmode, then get the node list out of the DB instead of the command
......@@ -232,21 +233,51 @@ if (! keys(%realnodes) && ! keys(%virtnodes)) {
my @sortednodes = sort(keys(%realnodes));
if (!$realmode) {
$StateWait::debug = $debug;
if (!$nowait) {
my @states = ();
if ($waitmode) {
print "Waiting for nodes to shut down and come up...\n";
@states= ( TBDB_NODESTATE_SHUTDOWN , TBDB_NODESTATE_ISUP );
} else {
print "Waiting for nodes to shut down...\n";
@states= ( TBDB_NODESTATE_SHUTDOWN );
}
initStateWait(\@states, @sortednodes)
}
EventSendFatal(host => $BOSSNODE ,
objtype => TBDB_TBEVENT_COMMAND ,
eventtype => TBDB_COMMAND_REBOOT ,
objname => join(",",@sortednodes) );
my $rv = 0;
if (!$nowait) {
# In here we can do some output to tell the user what's going on.
if ($waitmode) {
# Wait for [SHUTDOWN,ISUP]
} else {
# Wait for [SHUTDOWN]
my $start = time();
my $now = $start;
my $done = 0;
my $total = scalar(@sortednodes);
my @finished = ();
my @failed = ();
while( $done < $total ) {
print "Waiting for ".($total-$done)." nodes...\n";
waitForState(\@finished,\@failed,60);
$now = time();
$done = scalar(@finished) + scalar(@failed);
my $min = int(($now - $start + 30)/60); # round to nearest min.
print "After $min min., $done nodes done...\n";
#print "fin = ".join(",",@finished)." fail = ".join(",",@failed).
# " Time=$now (".($now-$start)." elapsed), done=$done\n";
sleep(1);
}
print "All $total nodes finished.\n";
$bad = scalar(@failed);
if ($bad) {
print "There were $bad failures: ".join(" ",@failed)."\n";
$rv = 1;
}
endStateWait();
}
exit(0);
exit($rv);
}
#
......
......@@ -51,6 +51,7 @@ my $MAXRETRIES = 1;
use lib "@prefix@/lib";
use libdb;
use libtestbed;
use StateWait;
# Be careful not to exit on transient error
$libdb::DBQUERY_MAXTRIES = 30;
......@@ -68,7 +69,6 @@ my $imageid;
my $imagepid = TB_OPSPID;
my %imageid_row;
my @nodes = ();
my %retries = ();
my $mereuser = 0;
my $waitmode = 1;
my $failures = 0;
......@@ -308,111 +308,36 @@ if (! $waitmode) {
exit $failures;
}
# The retry vector is initialized to the number of retries we allow per
# node, afterwhich its a fatal error.
foreach my $node (@nodes) {
$retries{$node} = $MAXRETRIES;
print "Issuing reboot for @nodes and then waiting ...\n";
initStateWait([ TBDB_NODESTATE_RELOADDONE ] , @nodes);
system("$nodereboot @nodes");
if ($?) {
print "Reboot failed for (some of) @nodes. Quitting!\n";
exit ($? >> 8);
}
my @failed=();
while (@nodes) {
# Reboot them all.
print "Issuing reboot for @nodes and then waiting ...\n";
if ($reboot) {
system("$nodereboot @nodes");
if ($?) {
print "Reboot failed for (some of) @nodes. Quitting!\n";
exit ($? >> 8);
}
}
# Now wait for them.
$startwait = time;
@failed = WaitTillReloadDone(@nodes);
@nodes=();
while (@failed) {
my $node = shift(@failed);
if ($retries{$node}) {
print "*** Trying $node again (resetting/rebooting) ...\n";
push(@nodes, $node);
# Possible race with reboot?
SetupReload($node);
# Retry until count hits zero.
$retries{$node} -= 1;
} else {
print "*** $node failed too many times. Skipping!\n";
$failures++;
}
}
my $total = scalar(@nodes);
my @finished = ();
my @failed = ();
waitForState(\@finished, \@failed, 60 * 15);
endStateWait();
my $worked = scalar(@finished);
my $failed = scalar(@failed);
my $remain = $total - $worked - $failed;
if ($worked != $total) {
print "*** os_load: Only $worked nodes of $total succeeded!\n";
if ($failed) { print "\tThere were $failed failures.\n"; }
if ($remain) { print "\tThere were $remain nodes that timed out.\n"; }
}
$failures += $failed;
print "OS Reload Done! There were $failures failures!\n";
exit($failures);
# Wait for a reload to finish by watching its state
sub WaitTillReloadDone {
my (@nodes) = @_;
my %done = ();
my $count = @nodes;
my @failed = ();
foreach my $node ( @nodes ) { $done{$node} = 0; }
print STDERR "Waiting for @nodes to finish reloading\n".`date` if $dbg;
# Start a counter going, relative to the time we rebooted the first
# node.
my $waittime = 0;
my $minutes = 0;
while ($count) {
# Wait first to make sure reboot is done, and so that we don't
# wait one more time after everyone is up.
sleep(5);
foreach my $node ( @nodes ) {
if (! $done{$node}) {
my ($query_result, @row);
$query_result =
DBQueryFatal("SELECT op_mode FROM nodes ".
"where node_id='$node'");
@row = $query_result->fetchrow_array();
# We simply wait for the node to leave the reloading opmode
if ($row[0] ne TBDB_NODEOPMODE_RELOAD) {
print STDERR "$node has left reloading mode\n".`date` if $dbg;
$count--;
$done{$node} = 1;
next;
}
# Soon we will have stated's timeouts take care of
# rebooting once or twice if we get stuck during
# reloading.
$waittime = time - $startwait;
if ($waittime > $maxwait) {
my $t = (int ($waittime / 60));
print "*** $node appears wedged; ".
"its been $t minutes since it was rebooted.\n";
$count--;
$done{$node} = 1;
push(@failed, $node);
next;
}
if (int($waittime / 60) > $minutes) {
$minutes = int($waittime / 60);
print "Still waiting for $node to reload - ".
"its been $minutes minute(s)\n";
}
}
}
}
return @failed;
}
# Setup a reload. Note that imageid is global.
sub SetupReload($) {
......
This diff is collapsed.
......@@ -45,7 +45,7 @@ my $state;
#
# Untaint the path
#
#
# Untaint the path
$ENV{'PATH'} = "/bin:/usr/bin:/sbin:/usr/sbin:$TB/libexec:$TB/sbin:$TB/bin";
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};
......@@ -75,7 +75,7 @@ TBDebugTimeStamp("tbrestart started");
#
# Must be an active experiment to restart!
#
#
if (! ($state = ExpState($pid, $eid))) {
die("*** $0:\n".
" No such experiment $pid/$eid\n");
......@@ -86,8 +86,8 @@ if ($state ne EXPTSTATE_ACTIVE) {
}
#
# Stop the event system.
#
# Stop the event system.
#
if (!$DISABLE_EVENTS) {
print "Stopping the event system.\n";
TBDebugTimeStamp("eventsys_control started");
......@@ -100,14 +100,14 @@ if (!$DISABLE_EVENTS) {
#
# Clearing the portstat counters seems like a good idea.
#
#
print "Clearing port counters.\n";
TBDebugTimeStamp("portstats started");
if (system("portstats -z -a -q $pid $eid")) {
print STDERR "*** WARNING: Failed to clear port counters.\n";
#
# This is a non-fatal error.
#
#
}
TBDebugTimeStamp("portstats finished");
......@@ -118,17 +118,21 @@ TBDebugTimeStamp("portstats finished");
# this point. This is terribly imperfect of course, since there are no
# guarantees, especially since the events are async (a tbreset and isup
# could be in the event queue for a node). The ready bits present the worst
# problem.
# problem.
#
print "Rebooting all nodes\n";
TBDebugTimeStamp("node reboot started");
# Start the wait before the node reboot, so we don't miss any events...
initStateWait([ TBDB_NODESTATE_ISUP ],@nodes);
foreach my $node ( @nodes ) {
#
# Must duplicate a check that would be done in node_reboot if we
# gave it the entire list. No point in rebooting local jails.
#
my ($jailed, $plab);
if (TBIsNodeVirtual($node, \$jailed, \$plab)) {
if (! $jailed && ! $plab) {
next;
......@@ -137,7 +141,7 @@ foreach my $node ( @nodes ) {
next;
}
}
if (system("$nodereboot $node")) {
die("*** $0:\n".
" Failed to reboot node $node!\n");
......@@ -147,16 +151,56 @@ foreach my $node ( @nodes ) {
}
print STDOUT "Waiting for nodes to come up ...\n";
foreach my $node ( sort(@nodes) ) {
if (! TBNodeStateWait($node, TBDB_NODESTATE_ISUP, $waitstart, (60*6))) {
print STDOUT "$node is alive and well\n";
SetNodeBootStatus($node, NODEBOOTSTATUS_OKAY);
next;
my %notified = ();
foreach $n (@nodes) { $notified{$n} = 0; }
my $maxwait = 60 * 15; # Don't wait more than 15 min.
my $u = 60; # Update the user every 60 seconds.
my $total = scalar(@nodes);
my $done = 0;
my $start = time();
my $now = $start;
my @finished = ();
my @failed = ();
while ( ($now - $start) < $maxwait && $done < $total ) {
my $wait = min($u,$start + $maxwait - $now);
waitForState(\@finished, \@failed, $wait);
$now = time();
$done = scalar(@finished) + scalar(@failed);
# Report, exactly once per node, every node that waitForState() has placed
# in @finished (reached the awaited state) or @failed (got a failure event).
# %notified remembers which nodes have already been reported, because
# @finished and @failed are passed to waitForState() again on every pass of
# the enclosing loop and are re-scanned here each time.
#
# The boot status must be recorded against $n, the node currently being
# reported. The previous code passed $node, which is the loop variable of
# the earlier per-node loops and is not the node being iterated here.
foreach my $n (@finished) {
    if (!$notified{$n}) {
        print STDOUT "$n is alive and well\n";
        SetNodeBootStatus($n, NODEBOOTSTATUS_OKAY);
        $notified{$n}=1;
    }
}
foreach my $n (@failed) {
    if (!$notified{$n}) {
        print STDOUT "$n failed to come up!\n";
        SetNodeBootStatus($n, NODEBOOTSTATUS_FAILED);
        # Count the failure so the script can report it after the wait.
        $failed++;
        $notified{$n}=1;
    }
}
my $min = int(($now - $start + 30)/60); # round to nearest min.
print "After $min min., $done nodes done...\n";
}
endStateWait();
my $m = $maxwait/60;
if ($done != $total) {
# The wait ended with nodes still outstanding: every node that has not been
# reported yet (neither finished nor failed) is treated as a timeout failure.
# $m is the maximum wait expressed in minutes, used only for the message.
#
# The boot status must be recorded against $n, the node currently being
# reported. The previous code passed $node, which is the loop variable of
# the earlier per-node loops and is not the node being iterated here.
foreach my $n (@nodes) {
    if (!$notified{$n}) {
        print STDOUT "$n failed to come up after $m minutes!\n";
        SetNodeBootStatus($n, NODEBOOTSTATUS_FAILED);
        $failed++;
        $notified{$n}=1;
    }
}
SetNodeBootStatus($node, NODEBOOTSTATUS_FAILED);
$failed++;
}
TBDebugTimeStamp("node reboot finished");
if ($failed) {
......@@ -165,8 +209,8 @@ if ($failed) {
}
#
# Start the event system.
#
# Start the event system.
#
if (!$DISABLE_EVENTS) {
print "Starting the event system.\n";
TBDebugTimeStamp("eventsys_control started");
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment