Commit 37f4392e authored by Kirk Webb

Updates to the plab monitor.  Fixed a couple of bugs and created a
separate libplabmon library module.
parent bbb67cf0
......@@ -2331,7 +2331,7 @@ outfiles="$outfiles Makeconf GNUmakefile \
tbsetup/plab/plabslice tbsetup/plab/plabnode tbsetup/plab/plabrenewd \
tbsetup/plab/plabrenewonce \
tbsetup/plab/plabmetrics tbsetup/plab/plabstats \
tbsetup/plab/plabmonitord \
tbsetup/plab/plabmonitord tbsetup/plab/libplabmon.pm \
tbsetup/plab/plabmon_badpool.pm tbsetup/plab/plabmon_goodpool.pm \
tbsetup/plab/plablinkdata \
tbsetup/plab/libdslice/GNUmakefile tbsetup/plab/etc/GNUmakefile \
......
......@@ -766,7 +766,7 @@ outfiles="$outfiles Makeconf GNUmakefile \
tbsetup/plab/plabslice tbsetup/plab/plabnode tbsetup/plab/plabrenewd \
tbsetup/plab/plabrenewonce \
tbsetup/plab/plabmetrics tbsetup/plab/plabstats \
tbsetup/plab/plabmonitord \
tbsetup/plab/plabmonitord tbsetup/plab/libplabmon.pm \
tbsetup/plab/plabmon_badpool.pm tbsetup/plab/plabmon_goodpool.pm \
tbsetup/plab/plablinkdata \
tbsetup/plab/libdslice/GNUmakefile tbsetup/plab/etc/GNUmakefile \
......
......@@ -19,7 +19,7 @@ SBIN_STUFF = plabslice plabnode plabrenewd plabmetrics plabstats \
plabrenewonce
LIB_STUFF = libplab.py mod_dslice.py mod_PLC.py mod_PLCNM.py \
plabmon_badpool.pm plabmon_goodpool.pm
plabmon_badpool.pm plabmon_goodpool.pm libplabmon.pm
LIBEXEC_STUFF = webplabstats
......
......@@ -21,6 +21,7 @@ $| = 1; # Turn off line buffering on output
use lib "@prefix@/lib";
use libdb;
use libtestbed;
use libplabmon;
my $BADINST = 99;
my $VNODESETUPTIMEOUT = 600; # XXX: need to get from sitevar, or
......@@ -44,19 +45,6 @@ my $PLABMOND_EID = PLABMOND_EID();
my $PLABHOLDING_PID = PLABHOLDING_PID();
my $PLABHOLDING_EID = PLABHOLDING_EID();
#
# Return the numerically smaller of the two arguments.  When the values
# compare equal, the second argument is the one handed back.
#
sub MIN($$) {
    my ($first, $second) = @_;

    if ($first < $second) {
        return $first;
    }
    return $second;
}
#
# Produce a timestamp string for the current local time, formatted as
# "MM/DD/YY HH:MM:SS".
#
sub TimeStamp()
{
    my @now   = localtime();
    my $stamp = POSIX::strftime("%m/%d/%y %H:%M:%S", @now);
    return $stamp;
}
sub new($$$$$) {
# The next two lines are some voodoo taken from perltoot(1)
......@@ -122,9 +110,6 @@ sub checknextnode($) {
$self->{'PENDING'}->{$pnode->{'name'}} = $pnode;
$self->{'CHPID2POOL'}->{$chpid} = $self;
print "Pool $self->{'NAME'}: fired off worker $chpid for ".
"$pnode->{'name'}\n";
return
}
......@@ -248,6 +233,7 @@ sub processchild($$$) {
} else {
print "Teardown of $pnode->{'vnode'} failed: $exstat\n";
}
return 1;
}
......@@ -289,12 +275,13 @@ sub processchild($$$) {
$self->teardownnode($pnode);
$self->calcnextcheck($pnode);
$pnode->{'consecfails'}++;
return 0;
}
} else {
# Instantiation was successful.
# Setup a timeout to wait for ISUP.
$pnode->{'timeout'} = $now + $ISUPWAITTIME;
}
# Setup a timeout to wait for ISUP.
$pnode->{'timeout'} = $now + $ISUPWAITTIME;
return 1;
return 0;
}
#
......@@ -303,22 +290,28 @@ sub processchild($$$) {
sub checkexpiration($) {
my $self = shift;
my $now = time();
my $numfinished = 0;
foreach my $pnode (values %{ $self->{'PENDING'} }) {
# ISUP or TBFAILED? Check for these before timeout.
my $state = TBDB_NODESTATE_UNKNOWN();
if (TBGetNodeEventState($pnode->{'vnode'}, \$state)) {
if ($state eq TBDB_NODESTATE_ISUP()) {
# Yes! Node is up.
$self->nodesetupcomplete($pnode);
next;
}
elsif ($state eq TBDB_NODESTATE_TBFAILED()) {
$self->teardownnode($pnode);
$self->calcnextcheck($pnode);
$pnode->{'consecfails'}++;
next;
if ($pnode->{'mode'} eq $SETUPMODE) {
if ($state eq TBDB_NODESTATE_ISUP()) {
# Yes! Node is up.
print "Setup of $pnode->{'vnode'} on $pnode->{'name'} ".
" succeeded\n";
$self->nodesetupcomplete($pnode);
$numfinished++;
next;
}
elsif ($state eq TBDB_NODESTATE_TBFAILED()) {
$self->teardownnode($pnode);
$self->calcnextcheck($pnode);
$pnode->{'consecfails'}++;
next;
}
}
} else {
print "Error getting event state for $pnode->{'vnode'}\n";
......@@ -342,7 +335,7 @@ sub checkexpiration($) {
}
}
return;
return $numfinished;
}
......@@ -361,9 +354,6 @@ sub teardownnode($$;$) {
$self->{'PENDING'}->{$pnode->{'name'}} = $pnode;
$self->{'CHPID2POOL'}->{$chpid} = $self;
print "Pool $self->{'NAME'}: fired off teardown worker $chpid for ".
"$pnode->{'name'}\n";
return;
}
......@@ -390,23 +380,22 @@ sub teardownnode($$;$) {
# XXX: may be bogus, but it'll do for now.
my $MININTERVAL = 300;
my $MAXINTERVAL = 3600;
my $MAXINTERVAL = 12 * 3600;
sub calcnextcheck($$;$) {
my ($self, $pnode, $reason) = @_;
my $now = time();
my $numfails = $pnode->{'consecfails'};
my $nextcheck = int($now + $MININTERVAL + 0.5 * $numfails * $MININTERVAL);
$pnode->{'nextchecktime'} = MIN($now+$MAXINTERVAL, $nextcheck) + int(rand(60));
my $numfails = $pnode->{'consecfails'} ? $pnode->{'consecfails'} : 1;
my $nextint = int($numfails * $numfails * $MININTERVAL);
$pnode->{'nextchecktime'} =
$now + MIN($MAXINTERVAL, $nextint) + int(rand(120));
}
#
# Check vnode status, moving nodes back into production if they booted up,
# and leaving them in hwdown if they didn't.
#
# XXX: fix this damn thing.
#
sub nodesetupcomplete($$) {
my $self = shift;
my $pnode = shift;
......@@ -435,11 +424,11 @@ sub nodesetupcomplete($$) {
return;
}
#
# Get vnode entries (and their corresponding pnodes)
# for the service slice vservers.
#
# XXX: move to plabmon library module.
#
sub getsvcvnode($$) {
my $self = shift;
my $pnode = shift;
......@@ -461,5 +450,6 @@ sub getsvcvnode($$) {
return $row[0];
}
# Make perl happy...
1;
......@@ -22,6 +22,7 @@ $| = 1; # Turn off line buffering on output
use lib "@prefix@/lib";
use libdb;
use libtestbed;
use libplabmon;
use Node;
my $BADINST = 99;
......@@ -53,19 +54,6 @@ my $PLABDOWN_EID = PLABDOWN_EID();
my $PLABTESTING_PID = PLABTESTING_PID();
my $PLABTESTING_EID = PLABTESTING_EID();
#
# Return the numerically smaller of the two arguments.  When the values
# compare equal, the second argument is the one handed back.
#
sub MIN($$) {
    my ($first, $second) = @_;

    if ($first < $second) {
        return $first;
    }
    return $second;
}
#
# Produce a timestamp string for the current local time, formatted as
# "MM/DD/YY HH:MM:SS".
#
sub TimeStamp()
{
    my @now   = localtime();
    my $stamp = POSIX::strftime("%m/%d/%y %H:%M:%S", @now);
    return $stamp;
}
sub new($$$$$) {
# The next two lines are some voodoo taken from perltoot(1)
......@@ -111,7 +99,7 @@ sub checknextnode($) {
my $now = time();
my $pnode = $self->getnextchecknode();
# Nothinig to check!
# Nothing to check!
if (!$pnode) {
return;
}
......@@ -150,10 +138,7 @@ sub checknextnode($) {
$self->{'PENDING'}->{$pnode->{'name'}} = $pnode;
$self->{'CHPID2POOL'}->{$chpid} = $self;
print "Pool $self->{'NAME'}: fired off setup worker $chpid for ".
"$pnode->{'name'}\n";
return
return;
}
# Worker process.
......@@ -290,6 +275,7 @@ sub processchild($$$) {
elsif ($pnode->{'mode'} eq $TEARDOWNBADMODE) {
$self->movetodownpool($pnode);
}
return 1;
}
......@@ -342,22 +328,25 @@ sub processchild($$$) {
sub checkexpiration($) {
my $self = shift;
my $now = time();
my $numfinished = 0;
foreach my $pnode (values %{ $self->{'PENDING'} }) {
foreach my $pnode (values %{ $self->{'PENDING'} }) {
# ISUP or TBFAILED? Check for these before timeout.
my $state = TBDB_NODESTATE_UNKNOWN();
if (TBGetNodeEventState($pnode->{'vnode'}, \$state)) {
if ($state eq TBDB_NODESTATE_ISUP()) {
# Yes! Node is up.
print "Setup of $pnode->{'vnode'} on $pnode->{'name'} ".
" succeeded\n";
$self->teardownnode($pnode, $SETUPSUCCESS);
next;
}
elsif ($state eq TBDB_NODESTATE_TBFAILED()) {
$self->teardownnode($pnode, $SETUPFAIL);
next;
if ($pnode->{'mode'} eq $SETUPMODE) {
if ($state eq TBDB_NODESTATE_ISUP()) {
# Yes! Node is up.
print "Setup of $pnode->{'vnode'} on $pnode->{'name'} ".
" succeeded\n";
$self->teardownnode($pnode, $SETUPSUCCESS);
next;
}
elsif ($state eq TBDB_NODESTATE_TBFAILED()) {
$self->teardownnode($pnode, $SETUPFAIL);
next;
}
}
} else {
print "Error getting event state for $pnode->{'vnode'}\n";
......@@ -379,7 +368,7 @@ sub checkexpiration($) {
}
}
return;
return $numfinished;
}
......@@ -404,9 +393,6 @@ sub teardownnode($$;$) {
$pnode->{'mode'} = $TEARDOWNGOODMODE;
}
print "Pool $self->{'NAME'}: fired off teardown worker $chpid for ".
"$pnode->{'name'}\n";
return;
}
......@@ -415,17 +401,16 @@ sub teardownnode($$;$) {
TBdbfork(); # So we get the event system fork too ...
my $vnode = $pnode->{'vnode'};
# Free the vserver, if possible.
# Try to ssh in and kill processes running in the vserver.
TBForkCmd("$SSH -host $vnode $CLIENT_BIN/vnodesetup -p -k $vnode",1);
# Make sure vnode is in the proper state (regardless of the success
# or failure of the previous command).
TBSetNodeEventState($vnode, TBDB_NODESTATE_SHUTDOWN());
# Try to ssh in and kill the vserver.
TBForkCmd("$PLABNODE -f free $PLABTESTING_PID $PLABTESTING_EID $vnode",1);
exit(0);
# Free the vserver, if possible.
exec "$PLABNODE -f free $PLABTESTING_PID $PLABTESTING_EID $vnode" or
die "Doh! Can't exec command!\n";
}
# NOTREACHED
......@@ -440,9 +425,10 @@ sub calcnextcheck($$;$) {
my $now = time();
my $numsuccess = $pnode->{'consecsuccess'};
my $nextcheck = int($now + $MININTERVAL + 0.5 * $numsuccess * $MININTERVAL);
$pnode->{'nextchecktime'} = MIN($now+$MAXINTERVAL, $nextcheck) + int(rand(60));
my $numsuccess = $pnode->{'consecsuccess'} ? $pnode->{'consecsuccess'} : 1;
my $nextint = int(2 * $numsuccess * $MININTERVAL);
$pnode->{'nextchecktime'} =
$now + MIN($MAXINTERVAL, $nextint) + int(rand(120));
}
#
......
......@@ -51,6 +51,7 @@ $| = 1;
use lib "@prefix@/lib";
use libdb;
use libtestbed;
use libplabmon;
# Load pool libraries
use plabmon_badpool;
......@@ -99,7 +100,6 @@ my $CHILLTIME = 5; # How long to wait after processing expirations.
my $NEVER = 0; # "Never" in seconds since the Epoch.
my $MAXLA = 20; # Don't let the system load get out of hand.
my $UPTIME = "/usr/bin/uptime";
#
# daemonize
......@@ -122,6 +122,10 @@ my %chpid2pool = ();
print "Plab Monitor Daemon starting... pid $$, at ".`date`;
# XXX: testing
#OpenDebugLog("chlog", "$TB/log/chlog.dbg") or die "Can't open debug log!";
#DebugLog("chlog", "Plabmonitord start: Child debug log opened");
#
# Create the node pools.
#
......@@ -135,23 +139,6 @@ my $goodpool = plabmon_goodpool->new("good",
\%chpid2pool);
@allpools = ($badpool, $goodpool);
#
# helpers
#
#
# Return the numerically smaller of the two arguments.  When the values
# compare equal, the second argument is the one handed back.
#
sub MIN($$) {
    my ($first, $second) = @_;

    if ($first < $second) {
        return $first;
    }
    return $second;
}
#
# Return the system's 1-minute load average, truncated to an integer.
#
# Runs the command held in $UPTIME and pulls the first number that
# follows "load average:" / "load averages:" out of its output.  Both
# spellings are accepted, and a trailing comma after the number is not
# required, so that differing uptime output formats still parse.
#
# Returns 0 (after emitting a warning) when the output cannot be parsed,
# rather than silently feeding an undefined value to int().
#
sub getLA() {
    my $output = `$UPTIME`;
    $output = "" unless defined $output;

    my ($LAstr) = $output =~ /load averages?:\s+([\d\.]+)/;
    if (!defined($LAstr) || $LAstr eq "") {
        # Callers compare the result against a maximum, so 0 keeps the
        # historical "not loaded" answer, but make the failure visible.
        warn "getLA: could not parse load average from '$UPTIME' output\n";
        return 0;
    }

    return int($LAstr);
}
#
# Handle termination/hangup signals
#
......@@ -225,6 +212,8 @@ while (1) {
sleep($sleeptime);
#$now = time(); # Must reset $now after sleep.
# Handle any children that have exited.
while((my $chpid = waitpid(-1, WNOHANG)) > 0) {
my $chstat = $?;
......@@ -236,13 +225,14 @@ while (1) {
}
}
#$now = time(); # Must reset $now after sleep.
# Look for expired processes. Calling checkexpiration on a pool
# has the side effect of checking for ISUP (or ISUP expiration) for
# any nodes pending thusly in the pool.
# any nodes pending thusly in the pool. The return value is the
# number of nodes that the pool has finished processing (if any).
# Decrement the windowsize appropriately.
foreach my $pool (@allpools) {
$pool->checkexpiration();
my $numfinished = $pool->checkexpiration();
$windowsize -= $numfinished;
}
# We may have just fired off a bunch of kills, so chill for a bit to
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment