Commit 2b2a306d authored by Mac Newbold's avatar Mac Newbold
Browse files

New StateWait changes - the main point of all this is to move to our new

model of waiting for state changes. Before, we were watching the database
(which meant we could only watch for terminal/stable/long-lived states, and
had to poll the db). Now things that are waiting for states to change
become event listeners, and watch the stream of events flow by, and don't
have to do any polling. They can now watch for any state, and even
sequences of states (e.g., a Shutdown followed by an Isup).

To do this, there is now a cool StateWait.pm library that encapsulates the
functionality needed. To use it, you call initStateWait() before you start
the chain of events (i.e., before you call node_reboot). Then do your stuff,
and call waitForState() when you're ready to wait. It can be told to
return periodically with the results so far, and you can cancel waiting
for things. An example program called waitForState is in
testbed/event/stated/ , and can also be used nicely as a command line tool
that wraps up the library functionality.

This also required the introduction of a TBFAILED event that can be sent
when a node isn't going to make it to the state that someone may be
waiting for. For example, if a node gets wedged coming up, and stated
retries but eventually gives up on it, stated sends this event to let
listeners know that the node is hosed and won't ever come up.

Another thing that is part of this is that node_reboot moves (back) to the
fully-event-driven model, where users call node_reboot, and it does some
checks and sends some events. Then stated calls node_reboot in "real mode"
to actually do the work, and handles doing the appropriate retries until
the node either comes up or is deemed "failed" and stated gives up on it.
This means stated is also the gatekeeper of when you can and cannot reboot
a node. (See mail archives for extensive discussions of the details.)

A big part of the motivation for this was to get uninformed timeouts and
retries out of os_load/os_setup and put them in stated where we can make a
wiser choice. So os_load and os_setup now use this new stuff and don't
have to worry about timing out on nodes and rebooting. Stated makes sure
that they either come up, get retried, or fail to boot. tbrestart also
underwent a similar change.
parent ed3fcd05
This diff is collapsed.
......@@ -56,13 +56,13 @@ $port = "";
$URL = "";
$debug = 0;
my $handle;
my $tuple;
$handle=0;
$tuple=0;
my @done = ();
my @failures = ();
my $nodecount = ();
my %remain = ();
my @done;
my @failures;
my $nodecount;
my %remain;
#
# Exported Sub-Routines / Functions
......@@ -71,6 +71,10 @@ my %remain = ();
sub initStateWait( $@ ) {
my $states = shift;
my @nodes = @_;
@done=();
@failures=();
$nodecount=0;
%remain=();
$nodecount = scalar(@nodes);
# states is an arrayref
if ($debug) {
......@@ -84,24 +88,34 @@ sub initStateWait( $@ ) {
# Do the subscription for the right stuff, including all the
# states for all the nodes, and the failure event for all nodes
$handle = event_register($URL,0);
if (!$handle) { die "Unable to register with event system\n"; }
if ($handle==0) {
if ($debug) { print "Getting handle for $URL - "; }
$handle = event_register($URL,0);
if ($debug) { print "returned - "; }
if (!$handle) { die "Unable to register with event system\n"; }
if ($debug) { print "Success: $handle\n"; }
}
if ($debug) { print "Getting tuple - "; }
$tuple = address_tuple_alloc();
if (!$tuple) { die "Could not allocate an address tuple\n"; }
%$tuple = ( objtype => TBDB_TBEVENT_NODESTATE,
eventtype => join(",",@$states),
%$tuple = ( objtype => join(",",TBDB_TBEVENT_NODESTATE,
TBDB_TBEVENT_TBFAILED),
eventtype => join(",",@$states, TBDB_COMMAND_REBOOT ),
objname => join(",",@nodes) );
if ($debug > 1) {
if ($debug) { print "Success: $tuple\n"; }
if ($debug > 0) {
print "tuple = ('".join("', '",keys(%$tuple))."') => ('".
join("', '",values(%$tuple))."')\n";
}
if ($debug) { print "Subscribing - "; }
if (!event_subscribe($handle,\&doEvent,$tuple)) {
die "Could not subscribe to events\n";
}
if ($debug) { print "Success.\n"; }
foreach $n (@nodes) {
my @l = @$states;
......@@ -131,6 +145,12 @@ sub doEvent( $$$ ) {
"$eventtype\n";
}
my $n = $objname;
if (defined($remain{$n}) && $objtype eq TBDB_TBEVENT_TBFAILED) {
# It is a failed boot... add it to the failures list.
if ($debug) { print "Got $eventtype failure for $n... Aborting!\n"; }
push(@failures,$n);
delete($remain{$n});
}
if (defined($remain{$n}) && @{$remain{$n}}[0] eq $eventtype) {
# this is the next state we were waiting for
if ($debug) { print "Got $eventtype for $n\n" };
......@@ -191,9 +211,7 @@ sub cancelWait( $ ) {
sub endStateWait() {
%remain = ();
if (event_unregister($handle) == 0) {
die "Unable to unregister with event system\n";
}
$tuple = address_tuple_free($tuple);
if ($debug) { print "endStateWait\n"; }
return 0;
}
......@@ -201,3 +219,8 @@ sub endStateWait() {
# Always end a package successfully!
1;
# Cleanup hook run at interpreter exit: if an event-system handle was
# obtained ($handle is true), unregister it, and die if event_unregister()
# returns 0.
END {
if ($handle && event_unregister($handle) == 0) {
die "Unable to unregister with event system\n";
}
}
......@@ -116,6 +116,7 @@ my $TBNODESTATE = TBDB_TBEVENT_NODESTATE;
my $TBNODEOPMODE = TBDB_TBEVENT_NODEOPMODE;
my $TBCONTROL = TBDB_TBEVENT_CONTROL;
my $TBCOMMAND = TBDB_TBEVENT_COMMAND;
my $TBFAILED = TBDB_TBEVENT_TBFAILED;
my $TBREBOOT = TBDB_COMMAND_REBOOT;
my $TBPOWEROFF = TBDB_COMMAND_POWEROFF;
my $TBPOWERON = TBDB_COMMAND_POWERON;
......@@ -295,10 +296,10 @@ while (1) {
# Check for nodes that have passed their timeout
if (!qhead($deadline,$node)) {
info("HEAD: $node in ".($deadline-$now).", queue=".qsize()."\n");
debug("HEAD: $node in ".($deadline-$now).", queue=".qsize()."\n");
while ($now >= $deadline && $node ne "") {
qpop($deadline,$node);
info("POP: $node in ".($deadline-$now).", queue=".qsize()."\n");
debug("POP: $node in ".($deadline-$now).", queue=".qsize()."\n");
handleCtrlEvent($node,$TBTIMEOUT);
if (0) { qshow(); }
if (qhead($deadline,$node)) {
......@@ -368,6 +369,7 @@ sub readStates(;@) {
$nodes{$node_id}{notified} = 0;
$nodes{$node_id}{timedout} = 0;
$nodes{$node_id}{noretry} = 0;
$nodes{$node_id}{rebooting} = 0;
# Is there a timeout? If so, set it up!
setTimeout($mode,$state,$node_id,$timestamp);
}
......@@ -622,6 +624,7 @@ sub stateTransition($$) {
# We successfully booted, so clear some flags
$nodes{$node}{noretry} = 0;
$nodes{$node}{timedout} = 0;
$nodes{$node}{rebooting} = 0;
# Check if we really need to do a reset
my $r = DBQueryWarn("select osid,def_boot_osid from nodes ".
"where node_id='$node'");
......@@ -826,18 +829,33 @@ sub handleCtrlEvent($$) {
foreach ($action) {
/^$TBTIMEOUTREBOOT/ && do {
if ($timedout>3) {
# We've tried too many times...
notify("Node $node has timed out too many times!\n".
"Giving up until it boots sucessfully.\n");
$nodes{$node}{noretry} = 1;
# If the node is in our control (ie node_reboot),
# we want to do something. If it is in the user's
# control (went to shutdown without a reboot event),
# then we don't want to touch it.
if ($nodes{$node}{rebooting}) {
if ($timedout>1) {
# We've tried too many times...
# The node has now officially failed to boot
notify("Node $node timed out $timedout times!\n".
"Giving up until it boots sucessfully.\n");
$nodes{$node}{noretry} = 1;
info("$node: Sending $TBFAILED $TBREBOOT $node\n");
EventSendWarn(host => $BOSSNODE ,
objtype => $TBFAILED ,
eventtype => $TBREBOOT ,
objname => $node);
} else {
# XXX Temporary! For now notify instead of
# really rebooting, until the timeout/retry
# stuff is gone from os_setup and os_load
#notify("Node $node has timed out in state ".
# "$mode/$state - REBOOT requested\n");
handleCommand($node,$TBREBOOT,$timedout,1);
}
} else {
# XXX Temporary! For now notify instead of
# really rebooting, until the timeout/retry
# stuff is gone from os_setup and os_load
notify("Node $node has timed out in state ".
"$mode/$state - REBOOT requested\n");
#handleCommand($node,$TBREBOOT,$timedout,1);
info("Node $node timed out in state $mode/$state ".
"under user's control - not rebooting\n");
}
last; };
/^$TBTIMEOUTNOTIFY/ && do {
......@@ -863,7 +881,7 @@ sub handleCommand($$;$$) {
# We may need to do it here (while iterating over the list), or
# make some other fix up in handleEvent.
if ($command eq $TBREBOOT && $retry >=4) {
if ($command eq $TBREBOOT && $retry >1) {
announce("Node $params has tried rebooting $retry times and has \n".
"still not been successful. Please look into it soon.\n".
"" );# "In the meantime, $params will be powered off.\n");
......@@ -885,11 +903,14 @@ sub handleCommand($$;$$) {
$node = $nodes[$n];
debug("Checking rebooting: $node, $nodes{$node}, ".
"$nodes{$node}{state}, $nodes{$node}{noretry}\n");
if (($nodes{$node}{state} ne TBDB_NODESTATE_ISUP) &&
(!$nodes{$node}{noretry}) ) {
if (($nodes{$node}{rebooting}) &&
(!$nodes{$node}{noretry}) ) {
# This node shouldn't be rebooted now...
# XXX Send feedback here somehow!
info("$node: Sending $TBFAILED $TBREBOOT $node\n");
EventSendWarn(host => $BOSSNODE ,
objtype => $TBFAILED ,
eventtype => $TBREBOOT ,
objname => $node);
info("$node: Trying to reboot too soon! Skipping.\n");
# Cut it out of the list
debug("Nodelist before ==> ".join(" ",@nodes)."\n");
......@@ -905,6 +926,8 @@ sub handleCommand($$;$$) {
# Permissions were checked in order to send the message,
# so we don't need to do any fancy stuff here.
$nodes{$node}{rebooting}=1;
my $cmd = "$nodereboot -r $nodelist";
my $redir = " 2>&1 >> /usr/testbed/log/nodereboot.log &";
debug("$cmd $redir\n");
......
......@@ -18,10 +18,11 @@ my @states=('ISUP');
my $u = 10;
my $t = 300;
my %opt = ();
getopts("hds:u:t:",\%opt);
getopts("hdv:s:u:t:",\%opt);
if ($opt{h}) { usage(); }
if ($opt{d}) { $debug++; }
if ($opt{v}) { $debug+=$opt{v}; }
if ($opt{u}) { $u = $opt{u}; }
if ($opt{t}) { $t = $opt{t}; }
if ($opt{s}) { @states = split(",",$opt{s}); }
......@@ -32,7 +33,7 @@ my @nodes = @ARGV;
sub usage {
print <<EOF;
Usage:
$0 [-h] [-d]
$0 [-h] [-d] [-v <level>]
[-u <update freq.>] [-t <timeout>]
[-s state1[,s2]] <node> [<node> ...]
......@@ -42,7 +43,7 @@ order (non-consecutively). If no state is supplied, ISUP is
used. Update frequency is how many seconds apart status updates should
be (default $u). Timeout is how many seconds we should wait before
giving up and returning (default $t). The -h option shows this
message, and -d enables extra debugging output.
message, and -d and -v enable extra debugging output.
EOF
exit(1);
......
......@@ -45,6 +45,7 @@ use lib "@prefix@/lib";
use libdb;
use libtestbed;
use event;
use StateWait;
use POSIX qw(strftime);
my $ssh = "$TB/bin/sshtb -n";
......@@ -120,10 +121,10 @@ if (defined($options{"e"})) {
}
# XXX Temporary, until we make event sending the default
$realmode=1;
#if ($realmode && $UID && !TBAdmin($UID)) {
# die("*** You cannot use real mode!\n");
#}
#$realmode=1;
if ($realmode && $UID && !TBAdmin($UID)) {
die("*** You cannot use real mode!\n");
}
#
# If eidmode, then get the node list out of the DB instead of the command
......@@ -232,21 +233,51 @@ if (! keys(%realnodes) && ! keys(%virtnodes)) {
my @sortednodes = sort(keys(%realnodes));
if (!$realmode) {
$StateWait::debug = $debug;
if (!$nowait) {
my @states = ();
if ($waitmode) {
print "Waiting for nodes to shut down and come up...\n";
@states= ( TBDB_NODESTATE_SHUTDOWN , TBDB_NODESTATE_ISUP );
} else {
print "Waiting for nodes to shut down...\n";
@states= ( TBDB_NODESTATE_SHUTDOWN );
}
initStateWait(\@states, @sortednodes)
}
EventSendFatal(host => $BOSSNODE ,
objtype => TBDB_TBEVENT_COMMAND ,
eventtype => TBDB_COMMAND_REBOOT ,
objname => join(",",@sortednodes) );
my $rv = 0;
if (!$nowait) {
# In here we can do some output to tell the user what's going on.
if ($waitmode) {
# Wait for [SHUTDOWN,ISUP]
} else {
# Wait for [SHUTDOWN]
my $start = time();
my $now = $start;
my $done = 0;
my $total = scalar(@sortednodes);
my @finished = ();
my @failed = ();
while( $done < $total ) {
print "Waiting for ".($total-$done)." nodes...\n";
waitForState(\@finished,\@failed,60);
$now = time();
$done = scalar(@finished) + scalar(@failed);
my $min = int(($now - $start + 30)/60); # round to nearest min.
print "After $min min., $done nodes done...\n";
#print "fin = ".join(",",@finished)." fail = ".join(",",@failed).
# " Time=$now (".($now-$start)." elapsed), done=$done\n";
sleep(1);
}
print "All $total nodes finished.\n";
$bad = scalar(@failed);
if ($bad) {
print "There were $bad failures: ".join(" ",@failed)."\n";
$rv = 1;
}
endStateWait();
}
exit(0);
exit($rv);
}
#
......
......@@ -51,6 +51,7 @@ my $MAXRETRIES = 1;
use lib "@prefix@/lib";
use libdb;
use libtestbed;
use StateWait;
# Be careful not to exit on transient error
$libdb::DBQUERY_MAXTRIES = 30;
......@@ -68,7 +69,6 @@ my $imageid;
my $imagepid = TB_OPSPID;
my %imageid_row;
my @nodes = ();
my %retries = ();
my $mereuser = 0;
my $waitmode = 1;
my $failures = 0;
......@@ -308,111 +308,36 @@ if (! $waitmode) {
exit $failures;
}
# The retry vector is initialized to the number of retries we allow per
# node, afterwhich its a fatal error.
foreach my $node (@nodes) {
$retries{$node} = $MAXRETRIES;
print "Issuing reboot for @nodes and then waiting ...\n";
initStateWait([ TBDB_NODESTATE_RELOADDONE ] , @nodes);
system("$nodereboot @nodes");
if ($?) {
print "Reboot failed for (some of) @nodes. Quitting!\n";
exit ($? >> 8);
}
my @failed=();
while (@nodes) {
# Reboot them all.
print "Issuing reboot for @nodes and then waiting ...\n";
if ($reboot) {
system("$nodereboot @nodes");
if ($?) {
print "Reboot failed for (some of) @nodes. Quitting!\n";
exit ($? >> 8);
}
}
# Now wait for them.
$startwait = time;
@failed = WaitTillReloadDone(@nodes);
@nodes=();
while (@failed) {
my $node = shift(@failed);
if ($retries{$node}) {
print "*** Trying $node again (resetting/rebooting) ...\n";
push(@nodes, $node);
# Possible race with reboot?
SetupReload($node);
# Retry until count hits zero.
$retries{$node} -= 1;
} else {
print "*** $node failed too many times. Skipping!\n";
$failures++;
}
}
my $total = scalar(@nodes);
my @finished = ();
my @failed = ();
waitForState(\@finished, \@failed, 60 * 15);
endStateWait();
my $worked = scalar(@finished);
my $failed = scalar(@failed);
my $remain = $total - $worked - $failed;
if ($worked != $total) {
print "*** os_load: Only $worked nodes of $total succeeded!\n";
if ($failed) { print "\tThere were $failed failures.\n"; }
if ($remain) { print "\tThere were $remain nodes that timed out.\n"; }
}
$failures += $failed;
print "OS Reload Done! There were $failures failures!\n";
exit($failures);
# Wait for a reload to finish by watching its state
sub WaitTillReloadDone {
my (@nodes) = @_;
my %done = ();
my $count = @nodes;
my @failed = ();
foreach my $node ( @nodes ) { $done{$node} = 0; }
print STDERR "Waiting for @nodes to finish reloading\n".`date` if $dbg;
# Start a counter going, relative to the time we rebooted the first
# node.
my $waittime = 0;
my $minutes = 0;
while ($count) {
# Wait first to make sure reboot is done, and so that we don't
# wait one more time after everyone is up.
sleep(5);
foreach my $node ( @nodes ) {
if (! $done{$node}) {
my ($query_result, @row);
$query_result =
DBQueryFatal("SELECT op_mode FROM nodes ".
"where node_id='$node'");
@row = $query_result->fetchrow_array();
# We simply wait for the node to leave the reloading opmode
if ($row[0] ne TBDB_NODEOPMODE_RELOAD) {
print STDERR "$node has left reloading mode\n".`date` if $dbg;
$count--;
$done{$node} = 1;
next;
}
# Soon we will have stated's timeouts take care of
# rebooting once or twice if we get stuck during
# reloading.
$waittime = time - $startwait;
if ($waittime > $maxwait) {
my $t = (int ($waittime / 60));
print "*** $node appears wedged; ".
"its been $t minutes since it was rebooted.\n";
$count--;
$done{$node} = 1;
push(@failed, $node);
next;
}
if (int($waittime / 60) > $minutes) {
$minutes = int($waittime / 60);
print "Still waiting for $node to reload - ".
"its been $minutes minute(s)\n";
}
}
}
}
return @failed;
}
# Setup a reload. Note that imageid is global.
sub SetupReload($) {
......
......@@ -17,7 +17,7 @@ require 'ctime.pl';
# experiment creation to continue.
#
# TODO: Reload disk images.
#
#
# usage: os_setup <pid> <eid>
#
# errorcode: 0 - all reboots succeeded.
......@@ -58,6 +58,7 @@ my $TFTP = "/tftpboot";
use lib "@prefix@/lib";
use libdb;
use libtestbed;
use StateWait;
my $nodereboot = "$TB/bin/node_reboot";
my $os_load = "$TB/bin/os_load";
......@@ -81,14 +82,14 @@ my @row;
#
# Ah, Frisbee works so lets do auto reloading for nodes that do not have
# the proper OS loaded on it. This will be a hash of lists; for each
# imageid, a list of the nodes to pass to os_load for that imageid.
# imageid, a list of the nodes to pass to os_load for that imageid.
#
my %reloads = ();
my %reboots = ();
my %rebooted = ();
my $doautoload = 1;
my $dolastload = 1;
# un-taint path
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};
......@@ -171,7 +172,7 @@ $db_result =
"left join node_types as nt on nt.type=n.type ".
"where r.pid='$pid' and r.eid='$eid'");
if ($db_result->numrows < 1) {
if ($db_result->numrows < 1) {
print "There are no nodes in experiment '$eid' in project '$pid'.\n";
exit 0;
}
......@@ -217,7 +218,7 @@ while (my %row = $db_result->fetchhash()) {
#
# Make sure the files specified in the paths exist. We mount the
# user tftp directory on boss node, so we can ignore the IP address,
# and just check the path directly.
# and just check the path directly.
#
if (defined($row{'def_boot_path'})) {
my $path = $row{'def_boot_path'};
......@@ -286,13 +287,13 @@ while (my %row = $db_result->fetchhash()) {
die_noretry("*** RPM $rpm for node $node does not exist!");
}
}
#
# XXX - Ditto for tarfiles.
#
foreach my $tarspec (split(":", $row{'tarballs'})) {
my ($dir, $tar) = split(" ", $tarspec);
if (! -f $tar) {
die_noretry("*** Tarfile $tar for node $node does not exist!");
}
......@@ -302,13 +303,13 @@ while (my %row = $db_result->fetchhash()) {
# If there is a path specified, then we don't worry anymore about it.
# The user must know what is going on. The OSID might have a path
# associated with it, which means the same thing; we don't worry about
# it.
# it.
#
if (!$bootpath && !$jailnode && !$plabnode && !$subnode) {
#
# These checks are not necessary if the front end and web page
# are doing the right thing, but lets be careful anyway.
#
#
if (! $osid) {
die_noretry(
"*** $node has no bootpath and no def_boot_osid set!");
......@@ -319,20 +320,20 @@ while (my %row = $db_result->fetchhash()) {
#
my $osid_result =
DBQueryFatal("select * from os_info where osid='$osid'");
if ($osid_result->numrows == 0) {
die_noretry("*** No such OSID $osid is defined!");
}
my %osid_row = $osid_result->fetchhash();
#
# If there is an actual path, its an OSKit kernel not an image.
#
#
if (! defined($osid_row{'path'}) || $osid_row{'path'} eq "") {
#
# Not an OSKit kernel.
# Make sure this OSID is actually loaded on the machine.
# Make sure this OSID is actually loaded on the machine.
#
my $p_result =
DBQueryFatal("select * from partitions ".
......@@ -340,8 +341,8 @@ while (my %row = $db_result->fetchhash()) {
#
# If not loaded, then see if the user was looking for the generic
# name of the OS that is loaded.
#
# name of the OS that is loaded.
#
if ($p_result->numrows == 0) {
#
# Check to see if a non specific version specified.
......@@ -351,7 +352,7 @@ while (my %row = $db_result->fetchhash()) {
#
# A non-specific version. There needs to be a way to
# map it to another osid.
# map it to another osid.
#
if (!defined($osid_row{'nextosid'})) {
die_noretry(
......@@ -359,11 +360,11 @@ while (my %row = $db_result->fetchhash()) {
" No mapping can be made for $osid ($node)!");
}
my $nextosid = $osid_row{'nextosid'};
#
# See if the nextosid is already on the disk. If not,
# it needs to be loaded.
#
#
my $o_result =
DBQueryFatal("select osid from partitions as p ".
"where p.node_id='$node' and ".
......@@ -372,18 +373,18 @@ while (my %row = $db_result->fetchhash()) {
if (! $o_result->numrows) {
#
# User wants a specific version of an OS, but its not
# loaded on the machine.
# loaded on the machine.
#
print "Mapping $osid on $node to $nextosid ".
"and setting up a reload.\n";