Commit f1fa5a51 authored by Kirk Webb

New plab vnode monitor framework, now with proactive node checking action!

The old monitor has been completely replaced.  The new one uses modular pools
to test and track plab nodes.  There are currently two pool modules:
good and bad.  The good pool tests nodes that are not known to have
issues, proactively finding problems and pushing nodes into the "bad" pool
when necessary.  The bad pool acts similarly to the old plabmonitor: it
does an end-to-end test on nodes, and if and when they finally come up,
moves them to the good pool.  Both pools have a testing backoff mechanism
that works as follows:

  * The node is tested right away upon entering either pool
  * Node fails to set up:
    * goodpool: node is sent to bad pool (hwdown)
    * badpool:  node is scheduled to be retested according to
                an additive backoff function, maxing out at 1 hour.
  * Node setup succeeds:
    * goodpool: node is scheduled to be retested according to
                an additive backoff function, maxing out at 1 hour.
    * badpool:  node is moved to good pool.

The backoff scheme may be bogus, we'll see.  It seems like a reasonable thing
to do though - no need to hammer a node with tests if it consistently
succeeds or fails.  Nodes that flop back and forth will get the most
testing punishment.  A future enhancement will be to watch for flopping
and force nodes that exhibit this behavior to pass several consecutive
tests before being eligible to return to the good pool.
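
For concreteness, the additive backoff as implemented in calcnextcheck()
(see plabmon_badpool.pm below) works out like this:

  # From calcnextcheck(): MININTERVAL = 300s, MAXINTERVAL = 3600s.
  my $nextcheck = int($now + $MININTERVAL + 0.5 * $numfails * $MININTERVAL);
  $pnode->{'nextchecktime'} =
      MIN($now + $MAXINTERVAL, $nextcheck) + int(rand(60));
  #
  # So with consecfails = 0, 1, 2, ... the retest delay is roughly
  # 5, 7.5, 10, ... minutes, hitting the one hour cap after 22
  # consecutive failures, plus up to a minute of random jitter.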

The monitor only allows a configurable window's worth of outstanding
tests to run at once.  When tests finish, more node tests are allowed
to start up right away.
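
Roughly, the dispatch loop looks like this (illustrative sketch only -- the
real loop lives in plabmonitord, which is not shown in this diff, and the
variable names here are made up):

  # Illustrative sketch; the actual driver is in plabmonitord.
  while (1) {
      # Start tests until the outstanding-test window is full.
      while ($numpending < $WINDOW && $pool->getnextchecktime() <= time()) {
          $pool->checknextnode();
          $numpending++;
      }
      # Reap a finished test and let its pool process the result.
      my $chpid = waitpid(-1, 0);
      if ($chpid > 0 && exists($chpid2pool{$chpid})) {
          $chpid2pool{$chpid}->processchild($chpid, $?);
          $numpending--;
      }
      # Handle ISUP/TBFAILED transitions and timeouts.
      $pool->checkexpiration();
  }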

Some refactoring needs to be done.  Currently the good and bad pools share
quite a bit of duplicated code.  I don't know if I dare venture into
inheritance with perl, but that would be a good way to approach this.
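
For the record, perl inheritance wouldn't be too bad here; something like
the following (hypothetical plabmon_pool base module) would let the shared
scheduling code live in one place:

  # Hypothetical sketch of a shared base class for the pool modules.
  package plabmon_pool;
  sub new {
      my ($class, $poolname, $poolpid, $pooleid, $chpid2pool) = @_;
      my $self = { 'NAME' => $poolname, 'PID' => $poolpid,
                   'EID' => $pooleid, 'CHPID2POOL' => $chpid2pool,
                   'PNODES' => {}, 'PENDING' => {} };
      return bless($self, $class);
  }
  # Shared methods (getnextchecknode, getnexttimeout, etc.) go here.

  package plabmon_badpool;
  our @ISA = ("plabmon_pool");  # inherit the common pool code
  # Override only the pool-specific methods (checknextnode, etc.).
  1;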

Some other pool module ideas:

* dynamic setup pools

When experiments w/ plab vnodes are swapped in, use the plab monitor to
manage setting up the vnodes by dynamically creating pools on a per-experiment
basis.  This has the advantage that the monitor can keep a global cap on
the number of outstanding setup operations.  These pools might also later
try to bring up vnodes that failed to set up during swapin, along with other
vnode monitoring tasks.

* "all nodes" pools

Similar to the dynamic pools just mentioned, but with the mission of
extending experiments to as many plab nodes as possible (as nodes come
and go).  Useful for services.
parent 70cbdf5e
@@ -2329,8 +2329,11 @@ outfiles="$outfiles Makeconf GNUmakefile \
 	tbsetup/plab/mod_dslice.py tbsetup/plab/mod_PLC.py \
 	tbsetup/plab/mod_PLCNM.py \
 	tbsetup/plab/plabslice tbsetup/plab/plabnode tbsetup/plab/plabrenewd \
+	tbsetup/plab/plabrenewonce \
 	tbsetup/plab/plabmetrics tbsetup/plab/plabstats \
-	tbsetup/plab/plabmonitord tbsetup/plab/plablinkdata \
+	tbsetup/plab/plabmonitord \
+	tbsetup/plab/plabmon_badpool.pm tbsetup/plab/plabmon_goodpool.pm \
+	tbsetup/plab/plablinkdata \
 	tbsetup/plab/libdslice/GNUmakefile tbsetup/plab/etc/GNUmakefile \
 	tbsetup/plab/plabdist tbsetup/plab/plabhttpd \
 	tbsetup/plab/plabdiscover tbsetup/plab/etc/netbed_files/GNUmakefile \
@@ -764,8 +764,11 @@ outfiles="$outfiles Makeconf GNUmakefile \
 	tbsetup/plab/mod_dslice.py tbsetup/plab/mod_PLC.py \
 	tbsetup/plab/mod_PLCNM.py \
 	tbsetup/plab/plabslice tbsetup/plab/plabnode tbsetup/plab/plabrenewd \
+	tbsetup/plab/plabrenewonce \
 	tbsetup/plab/plabmetrics tbsetup/plab/plabstats \
-	tbsetup/plab/plabmonitord tbsetup/plab/plablinkdata \
+	tbsetup/plab/plabmonitord \
+	tbsetup/plab/plabmon_badpool.pm tbsetup/plab/plabmon_goodpool.pm \
+	tbsetup/plab/plablinkdata \
 	tbsetup/plab/libdslice/GNUmakefile tbsetup/plab/etc/GNUmakefile \
 	tbsetup/plab/plabdist tbsetup/plab/plabhttpd \
 	tbsetup/plab/plabdiscover tbsetup/plab/etc/netbed_files/GNUmakefile \
@@ -30,6 +30,7 @@ use vars qw(@ISA @EXPORT);
 	PROJROOT GROUPROOT USERROOT TBOPSPID EXPTLOGNAME
 	PLABMOND_PID PLABMOND_EID PLABHOLDING_PID PLABHOLDING_EID
+	PLABTESTING_PID PLABTESTING_EID PLABDOWN_PID PLABDOWN_EID
 	TBTrustConvert TBMinTrust TBGrpTrust TBProjTrust MapNumericUID
@@ -402,8 +403,12 @@ sub NODEDEAD_PID() { $TBOPSPID; }
 sub NODEDEAD_EID() { "hwdown"; }
 sub PLABMOND_PID() { $TBOPSPID; }
 sub PLABMOND_EID() { "plab-monitor"; }
+sub PLABTESTING_PID() { $TBOPSPID; }
+sub PLABTESTING_EID() { "plab-testing"; }
 sub PLABHOLDING_PID() { $TBOPSPID; }
-sub PLABHOLDING_EID() { "plabnodes"; }
+sub PLABHOLDING_EID() { "plabup"; }
+sub PLABDOWN_PID() { $TBOPSPID; }
+sub PLABDOWN_EID() { "plabdown"; }
 sub OLDRESERVED_PID() { $TBOPSPID; }
 sub OLDRESERVED_EID() { "oldreserved"; }
 sub NFREELOCKED_PID() { $TBOPSPID; }
@@ -18,7 +18,8 @@ SBIN_STUFF = plabslice plabnode plabrenewd plabmetrics plabstats \
 	plabmonitord plablinkdata plabdist plabhttpd plabdiscover \
 	plabrenewonce
-LIB_STUFF = libplab.py mod_dslice.py mod_PLC.py mod_PLCNM.py
+LIB_STUFF = libplab.py mod_dslice.py mod_PLC.py mod_PLCNM.py \
+	plabmon_badpool.pm plabmon_goodpool.pm
 LIBEXEC_STUFF = webplabstats
# -*- perl -*-
#
# EMULAB-LGPL
# Copyright (c) 2000-2005 University of Utah and the Flux Group.
# All rights reserved.
#
#
# plabmonitor node pool module for bad (malfunctional) nodes.
#
package plabmon_badpool;
use strict;
use English;
use POSIX qw(WIFSIGNALED WEXITSTATUS);
$| = 1; # Turn off line buffering on output
use lib "@prefix@/lib";
use libdb;
use libtestbed;
my $BADINST = 99;
my $VNODESETUPTIMEOUT = 600;    # XXX: need to get from sitevar, or
                                #      pass in via new().
my $ISUPWAITTIME = 600;         # XXX: likewise...
my $TEARDOWNTIMEOUT = 120;      # XXX: ...
my $SETUPMODE = "SETUPMODE";
my $TEARDOWNMODE = "TEARDOWNMODE";
my $BIGINT = 9999999999;
my $CLIENT_BIN = "@CLIENT_BINDIR@";
my $SSH = "@prefix@/bin/sshtb -n";
my $PLABNODE = "@prefix@/sbin/plabnode";
# XXX - testing
#my $PLABNODE = "/home/kwebb/bin/randsleep.pl";
my $PLABMOND_PID = PLABMOND_PID();
my $PLABMOND_EID = PLABMOND_EID();
my $PLABHOLDING_PID = PLABHOLDING_PID();
my $PLABHOLDING_EID = PLABHOLDING_EID();
sub MIN($$) {
    my ($a, $b) = @_;
    my $res = $a < $b ? $a : $b;
    return $res;
}

sub TimeStamp()
{
    return POSIX::strftime("%m/%d/%y %H:%M:%S", localtime());
}
sub new($$$$$) {
    # The next two lines are some voodoo taken from perltoot(1)
    my $proto = shift;
    my $class = ref($proto) || $proto;

    my ($poolname, $poolpid, $pooleid, $chpid2pool) = @_;

    #
    # Create the actual object
    #
    my $self = {};
    $self->{'NAME'} = $poolname;
    $self->{'PID'} = $poolpid;
    $self->{'EID'} = $pooleid;
    $self->{'CHPID2POOL'} = $chpid2pool;
    $self->{'PNODES'} = {};
    $self->{'PENDING'} = {};

    bless($self, $class);
    return $self;
}
#
# Things to do to check a node in the bad pool:
# 1) Ping the node (maybe with ssh) (?) - not done right now
# 2) Try to instantiate the sliver via plabnode
# 3) Try to run vnodesetup on the instantiated node.
# 4) Wait for ISUP (or failure/timeout).
#
sub checknextnode($) {
    my $self = shift;
    my $now = time();
    my $pnode = $self->getnextchecknode();

    # Nothing to check!
    if (!$pnode) {
        return;
    }

    # Grab the vnode for this pnode (service sliver vnode)
    my $vnode = $self->getsvcvnode($pnode->{'name'});
    if (!defined($vnode)) {
        print "Could not find vnode associated with $pnode!\n";
        return;
    }

    print "Pool: $self->{'NAME'}: Testing node $pnode->{'name'} at ".
          TimeStamp() . "\n";

    my $chpid = fork();
    if ($chpid) {
        # Update node attributes
        $pnode->{'lastcheckstart'} = $now;
        $pnode->{'vnode'} = $vnode;
        $pnode->{'mode'} = $SETUPMODE;
        $pnode->{'pid'} = $chpid;
        $pnode->{'timeout'} = $now + $VNODESETUPTIMEOUT;
        $self->{'PENDING'}->{$pnode->{'name'}} = $pnode;
        $self->{'CHPID2POOL'}->{$chpid} = $self;
        return;
    }
    # Worker process.
    else {
        $SIG{CHLD} = 'DEFAULT';
        TBdbfork();    # So we get the event system fork too ...

        # Make sure vnode is in the proper state before trying to
        # bring it up.
        TBSetNodeEventState($vnode, TBDB_NODESTATE_SHUTDOWN());

        # XXX: should probably look to see if we have an RCAP for this
        #      vnode and try to clean it up first if so.
        if (TBForkCmd("$PLABNODE -f alloc $PLABMOND_PID ".
                      "$PLABMOND_EID $vnode", 1)) {
            print "*** Vserver instantiation failed: $vnode\n";
            # XXX: Should check DB state instead.
            exit($BADINST);
        }
        exec "$SSH -host $vnode $CLIENT_BIN/vnodesetup -p $vnode" or
            die "Yike! Can't exec command!\n";
    }
    # NOTREACHED
}
sub getnextchecknode($) {
    my $self = shift;
    my $retnode = "";
    my $nextcheck = $BIGINT;

    foreach my $pnode (values %{$self->{'PNODES'}}) {
        my $nchecktime = $pnode->{'nextchecktime'};
        if (!exists($self->{'PENDING'}->{$pnode->{'name'}}) and
            $nchecktime < $nextcheck) {
            $nextcheck = $nchecktime;
            $retnode = $pnode;
        }
    }
    return $retnode;
}
#
# Return the next scheduled check time over all nodes in this pool that
# do not already have a test pending.
#
sub getnextchecktime($) {
    my $self = shift;

    my $nextnode = $self->getnextchecknode();
    if ($nextnode) {
        return $nextnode->{'nextchecktime'};
    }
    return $BIGINT; # XXX
}
sub getnexttimeout($) {
    my $self = shift;
    my $timeout = $BIGINT; # XXX

    foreach my $pnode (values %{$self->{'PNODES'}}) {
        my $ntmo = $pnode->{'timeout'};
        if ($ntmo && exists($self->{'PENDING'}->{$pnode->{'name'}})) {
            $timeout = MIN($ntmo, $timeout);
        }
    }
    return $timeout;
}

sub getnextservicetime($) {
    my $self = shift;
    return MIN($self->getnexttimeout(), $self->getnextchecktime());
}
#
# Handle the exit of a worker process: find the pending node it belongs
# to, interpret the exit status, and either schedule a teardown and a
# later recheck (on failure) or start waiting for ISUP (on success).
#
sub processchild($$$) {
    my $self = shift;
    my $chpid = shift;
    my $exstat = shift;

    my $pnode;
    my $now = time();
    my $bad = 0;

    foreach my $findpnode (values %{$self->{'PENDING'}}) {
        if (defined($findpnode->{'pid'}) and $findpnode->{'pid'} == $chpid) {
            $pnode = $findpnode;
            last;
        }
    }

    if (!defined($pnode)) {
        print "Pool: $self->{'NAME'}: $chpid not found in pending list!\n";
        return 0;
    }

    # Clear pid entry - child has gone away.
    delete $pnode->{'pid'};

    # Check the nodes to find out which are up, and which failed
    # in the vnode_setup we just ran.
    print "Pool: $self->{'NAME'}: Checking status of ".
          "$pnode->{'name'} @ ". `date`;

    # XXX: ignore teardown mode for now (except to clear from pending list).
    if ($pnode->{'mode'} eq $TEARDOWNMODE) {
        delete $self->{'PENDING'}->{$pnode->{'name'}};
        if (!$exstat) {
            print "Teardown of $pnode->{'vnode'} complete\n";
        } else {
            print "Teardown of $pnode->{'vnode'} failed: $exstat\n";
        }
        return 1;
    }

    SWRN1: for ($exstat) {
        WIFSIGNALED($_) && do {
            if ($now > $pnode->{'timeout'}) {
                print "Timeout waiting for $pnode->{'vnode'} to instantiate.\n";
            }
            else {
                print "Setup of $pnode->{'vnode'} killed for unknown reason.\n";
            }
            $bad = 1;
            last SWRN1;
        };

        WEXITSTATUS($_) == $BADINST && do {
            print "Instantiation of $pnode->{'vnode'} failed.\n";
            $bad = 1;
            last SWRN1;
        };

        WEXITSTATUS($_) > 0 && do {
            print "Vnodesetup failed on $pnode->{'vnode'}.\n";
            $bad = 1;
            last SWRN1;
        };

        # default
        print "Node setup succeeded on $pnode->{'vnode'}.\n".
              " Waiting for node to hit ISUP.\n";
        $bad = 0;
    }

    # Log success/failure in any case.
    # XXX: do this!

    if ($bad) {
        # If setup failed, schedule a vnode teardown.
        $self->teardownnode($pnode);
        $self->calcnextcheck($pnode);
        $pnode->{'consecfails'}++;
        return 0;
    }

    # Setup a timeout to wait for ISUP.
    $pnode->{'timeout'} = $now + $ISUPWAITTIME;
    return 1;
}
#
# Scan the pending nodes for ISUP/TBFAILED events and for setup/teardown
# timeouts.  Nodes that have come up are marked complete; nodes that have
# failed or timed out are torn down and rescheduled for a later check.
#
sub checkexpiration($) {
    my $self = shift;
    my $now = time();

    foreach my $pnode (values %{ $self->{'PENDING'} }) {
        # ISUP or TBFAILED?  Check for these before timeout.
        my $state = TBDB_NODESTATE_UNKNOWN();
        if (TBGetNodeEventState($pnode->{'vnode'}, \$state)) {
            if ($state eq TBDB_NODESTATE_ISUP()) {
                # Yes! Node is up.
                $self->nodesetupcomplete($pnode);
                next;
            }
            elsif ($state eq TBDB_NODESTATE_TBFAILED()) {
                $self->teardownnode($pnode);
                $self->calcnextcheck($pnode);
                $pnode->{'consecfails'}++;
                next;
            }
        } else {
            print "Error getting event state for $pnode->{'vnode'}\n";
        }

        # Have we timed out waiting for this node?
        if ($pnode->{'timeout'} <= $now) {
            $pnode->{'timeout'} = 0;
            print "Pool: $self->{'NAME'}: $pnode->{'vnode'} timeout.\n";
            # Node has an associated PID
            if (defined($pnode->{'pid'})) {
                kill("TERM", $pnode->{'pid'});
                # Cleanup/processing handled in processchild()
            }
            # ... else we were waiting for an ISUP
            else {
                $self->teardownnode($pnode);
                $self->calcnextcheck($pnode);
                $pnode->{'consecfails'}++;
            }
        }
    }
    return;
}
sub teardownnode($$;$) {
    my ($self, $pnode, $reason) = @_;
    my $now = time();

    my $chpid = fork();
    if ($chpid) {
        # Update node attributes, return pid of worker proc.
        $pnode->{'pid'} = $chpid;
        $pnode->{'mode'} = $TEARDOWNMODE;
        $pnode->{'timeout'} = $now + $TEARDOWNTIMEOUT;
        $self->{'PENDING'}->{$pnode->{'name'}} = $pnode;
        $self->{'CHPID2POOL'}->{$chpid} = $self;
        return;
    }
    # Worker process.
    else {
        $SIG{CHLD} = 'DEFAULT';
        TBdbfork();    # So we get the event system fork too ...

        my $vnode = $pnode->{'vnode'};

        # Try to ssh in and kill the vserver, if possible.
        TBForkCmd("$SSH -host $vnode $CLIENT_BIN/vnodesetup -p -k $vnode", 1);

        # Make sure vnode is in the proper state (regardless of the
        # success or failure of the previous command).
        TBSetNodeEventState($vnode, TBDB_NODESTATE_SHUTDOWN());

        # Free the plab sliver allocation.
        exec "$PLABNODE -f free $PLABMOND_PID $PLABMOND_EID $vnode" or
            die "Doh! Can't exec command!\n";
    }
    # NOTREACHED
}
# XXX: may be bogus, but it'll do for now.
my $MININTERVAL = 300;
my $MAXINTERVAL = 3600;
sub calcnextcheck($$;$) {
    my ($self, $pnode, $reason) = @_;
    my $now = time();
    my $numfails = $pnode->{'consecfails'};

    # Additive backoff: base interval plus half the base per consecutive
    # failure, capped at MAXINTERVAL, plus up to a minute of jitter.
    my $nextcheck = int($now + $MININTERVAL + 0.5 * $numfails * $MININTERVAL);
    $pnode->{'nextchecktime'} =
        MIN($now + $MAXINTERVAL, $nextcheck) + int(rand(60));
}
#
# Called when a node hits ISUP: clear its failure counters and move the
# pnode out of hwdown and back into the normal holding experiment.
#
# XXX: fix this damn thing.
#
sub nodesetupcomplete($$) {
    my $self = shift;
    my $pnode = shift;

    $pnode->{'timeout'} = 0;
    $pnode->{'consecfails'} = 0;
    $pnode->{'consecsuccess'} = 1;
    delete $self->{'PENDING'}->{$pnode->{'name'}};

    #
    # It came up!  Move the pnode out of hwdown and back into
    # normal holding experiment.
    #
    DBQueryWarn("update reserved set ".
                " pid='$PLABHOLDING_PID',eid='$PLABHOLDING_EID' ".
                " where node_id=\"$pnode->{'name'}\"");

    print "$pnode->{'name'} brought back from the afterworld at ".
          TimeStamp() . "\n";

    TBSetNodeLogEntry($pnode->{'name'}, "root", TB_DEFAULT_NODELOGTYPE(),
                      "'Moved to $PLABHOLDING_EID; ".
                      "plab node $pnode->{'vnode'} setup okay by monitor.'");

    # XXX: move to goodpool.
    return;
}
#
# Get the vnode entry (service slice vserver) corresponding to the
# given pnode.
#
# XXX: move to plabmon library module.
#
sub getsvcvnode($$) {
    my $self = shift;
    my $pnode = shift;

    my $qres =
        DBQueryWarn("select r.node_id from reserved as r ".
                    "left join nodes as n on n.node_id=r.node_id ".
                    "where r.pid='$PLABMOND_PID' and ".
                    "      r.eid='$PLABMOND_EID' and ".
                    "      n.phys_nodeid='$pnode'");

    if (!$qres || !$qres->num_rows()) {
        print "Failed to get vnode from DB in getsvcvnode()!\n";
        return undef;
    }

    my @row = $qres->fetchrow_array();
    return $row[0];
}
# Make perl happy...
1;
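
For context, here is how a monitor might instantiate and drive the pool
module above (hypothetical usage, not part of this file; the pid/eid
arguments in particular are a guess):

  # Hypothetical usage from the monitor side; not part of this module.
  my %chpid2pool;
  my $badpool = plabmon_badpool->new("bad", PLABDOWN_PID(), PLABDOWN_EID(),
                                     \%chpid2pool);
  $badpool->checknextnode();       # fork off a test for the next node
  # ... later, after waitpid() returns $chpid:
  # $chpid2pool{$chpid}->processchild($chpid, $?);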
# -*- perl -*-
#
# EMULAB-LGPL
# Copyright (c) 2000-2005 University of Utah and the Flux Group.
# All rights reserved.
#
#
# plabmonitor node pool module for checking up on "good" plab
# nodes.
#
package plabmon_goodpool;
use strict;
use English;
use POSIX qw(WIFSIGNALED WEXITSTATUS);
$| = 1; # Turn off line buffering on output
use lib "@prefix@/lib";
use libdb;
use libtestbed;
use Node;
my $BADINST = 99;
my $VNODESETUPTIMEOUT = 600;    # XXX: need to get from sitevar, or
                                #      pass in via new().
my $ISUPWAITTIME = 600;         # XXX: likewise...
my $TEARDOWNTIMEOUT = 120;      # XXX: ...
my $SETUPMODE = "SETUPMODE";
my $TEARDOWNGOODMODE = "TEARDOWNGOODMODE";
my $TEARDOWNBADMODE = "TEARDOWNBADMODE";
my $SETUPFAIL = "SETUPFAIL";
my $SETUPSUCCESS = "SETUPSUCCESS";
my $BIGINT = 9999999999;
my $CLIENT_BIN = "@CLIENT_BINDIR@";
my $SSH = "@prefix@/bin/sshtb -n";
my $PLABNODE = "@prefix@/sbin/plabnode";
my $VNODE_SETUP = "@prefix@/sbin/vnode_setup";
my $NFREE = "@prefix@/bin/nfree";
# XXX - testing
#my $PLABNODE = "/home/kwebb/bin/randsleep.pl";
my $PLABDOWN_PID = PLABDOWN_PID();
my $PLABDOWN_EID = PLABDOWN_EID();
my $PLABTESTING_PID = PLABTESTING_PID();
my $PLABTESTING_EID = PLABTESTING_EID();
sub MIN($$) {
    my ($a, $b) = @_;
    my $res = $a < $b ? $a : $b;
    return $res;
}

sub TimeStamp()
{
    return POSIX::strftime("%m/%d/%y %H:%M:%S", localtime());
}
sub new($$$$$) {
    # The next two lines are some voodoo taken from perltoot(1)
    my $proto = shift;
    my $class = ref($proto) || $proto;

    my ($poolname, $poolpid, $pooleid, $chpid2pool) = @_;

    #
    # Create the actual object
    #
    my $self = {};
    $self->{'NAME'} = $poolname;
    $self->{'PID'} = $poolpid;
    $self->{'EID'} = $pooleid;
    $self->{'CHPID2POOL'} = $chpid2pool;
    $self->{'PNODES'} = {};
    $self->{'PENDING'} = {};

    # Clean up anything left behind by a terminated monitor
    # XXX: this is kind of hacky, but it works.
    my @vnodes = ExpNodes($PLABTESTING_PID, $PLABTESTING_EID);
    if (@vnodes) {
        system("$VNODE_SETUP -f -k -n 100 $PLABTESTING_PID $PLABTESTING_EID");
        Node::DeleteVnodes(@vnodes);
    }