Commit 59c5d5bb authored by Leigh Stoller's avatar Leigh Stoller

Commit my daemon to monitor the status of plab physnodes in hwdown,

trying to bring them back from the dead periodically by trying to
instantiate a vserver/vnode on them, and then tearing it down. If we
can do that, then the node is usable, and it gets moved back into the
normal holding experiment so that ptopgen will add it to ptop files.

This daemon is not turned on yet; waiting for other little bits and
pieces to be done.

There is an equiv change in os_setup that moves physnodes into hwdown
when a setup on a vnode fails.

Lbs
parent 89d6ea0f
......@@ -1397,7 +1397,8 @@ outfiles="$outfiles Makeconf GNUmakefile \
tbsetup/newnode_reboot \
tbsetup/plab/GNUmakefile tbsetup/plab/libplab.py \
tbsetup/plab/plabslice tbsetup/plab/plabnode tbsetup/plab/plabdaemon \
tbsetup/plab/plabmetrics \
tbsetup/plab/plabmetrics tbsetup/plab/plabstats \
tbsetup/plab/plabmonitord \
tbsetup/plab/libdslice/GNUmakefile tbsetup/plab/etc/GNUmakefile \
tip/GNUmakefile \
tmcd/GNUmakefile tmcd/freebsd/GNUmakefile tmcd/openbsd/GNUmakefile \
......
......@@ -443,7 +443,8 @@ outfiles="$outfiles Makeconf GNUmakefile \
tbsetup/newnode_reboot \
tbsetup/plab/GNUmakefile tbsetup/plab/libplab.py \
tbsetup/plab/plabslice tbsetup/plab/plabnode tbsetup/plab/plabdaemon \
tbsetup/plab/plabmetrics \
tbsetup/plab/plabmetrics tbsetup/plab/plabstats \
tbsetup/plab/plabmonitord \
tbsetup/plab/libdslice/GNUmakefile tbsetup/plab/etc/GNUmakefile \
tip/GNUmakefile \
tmcd/GNUmakefile tmcd/freebsd/GNUmakefile tmcd/openbsd/GNUmakefile \
......
......@@ -25,6 +25,8 @@ use Exporter;
PROJMEMBERTRUST_ROOT PROJMEMBERTRUST_GROUPROOT
PROJMEMBERTRUST_PROJROOT
PLABMOND_PID PLABMOND_EID PLABHOLDING_PID PLABHOLDING_EID
TBTrustConvert TBMinTrust TBGrpTrust TBProjTrust
TB_NODEACCESS_READINFO TB_NODEACCESS_MODIFYINFO
......@@ -147,7 +149,7 @@ use Exporter;
TBSaveExpLogFiles TBExptWorkDir TBExptUserDir TBExptLogDir
TBExptDestroy TBIPtoNodeID TBNodeBootReset TBNodeStateWait
TBLeaderMailList ExpGroup TBExptSetSwapUID TBExptSetThumbNail
TBNodeAllocCheck TBPlabNodeUsername
TBNodeAllocCheck TBPlabNodeUsername MarkPhysNodeDown
TBExptRemoveVirtualState TBExptBackupVirtualState
TBExptRestoreVirtualState
......@@ -264,11 +266,15 @@ sub TBDB_EXPT_WORKDIR() { "/usr/testbed/expwork"; }
# that look like constants cause you do not need to call a perl subroutine
# with parens. That is, FOO and FOO() are the same thing.
#
sub NODERELOADING_PID() { "emulab-ops"; }
sub NODERELOADING_PID() { $TBOPSPID; }
sub NODERELOADING_EID() { "reloading"; }
sub NODERELOADPENDING_EID() { "reloadpending"; }
sub NODEDEAD_PID() { "emulab-ops"; }
sub NODEDEAD_PID() { $TBOPSPID; }
sub NODEDEAD_EID() { "hwdown"; }
sub PLABMOND_PID() { $TBOPSPID; }
sub PLABMOND_EID() { "plab-monitor"; }
sub PLABHOLDING_PID() { $TBOPSPID; }
sub PLABHOLDING_EID() { "plabnodes"; }
sub NODEBOOTSTATUS_OKAY() { "okay" ; }
sub NODEBOOTSTATUS_FAILED() { "failed"; }
......@@ -2344,6 +2350,29 @@ sub TBPlabNodeUsername($$)
return 0;
}
#
# Mark a Phys node as down. Cannot use next reserve since the pnode is not
# going to go through the free path.
#
# usage: MarkPhysNodeDown(char *nodeid)
#
sub MarkPhysNodeDown($)
{
    # $pnode is the node_id of the physical node to move.
    my ($pnode) = @_;

    # Destination is the "dead node" experiment (NODEDEAD_PID/NODEDEAD_EID).
    my $pid = NODEDEAD_PID();
    my $eid = NODEDEAD_EID();

    # Rewrite the node's existing reservation row in place.
    my $update = "update reserved set " .
                 " pid='$pid',eid='$eid' ".
                 "where node_id='$pnode'";

    # Hold a write lock on the reserved table for the duration of the update.
    DBQueryFatal("lock tables reserved write");
    DBQueryFatal($update);
    DBQueryFatal("unlock tables");
}
#
# Set/Clear the current logfile for an experiment. The idea is to provide
# a way to look at what is going on from the web interface!
......
......@@ -745,6 +745,12 @@ elsif (@vnodelist) {
next;
}
if ($plabvnodes{$node}) {
#
# We move the pnode into hwdown so that it will not be considered
# again, until the plab monitor daemon determines that it is
# really working again.
#
MarkPhysNodeDown($pnode);
$failedplab++;
}
else {
......
......@@ -14,7 +14,8 @@ include $(OBJDIR)/Makeconf
SUBDIRS = libdslice etc
SBIN_STUFF = plabslice plabnode plabdaemon plabmetrics plabstats
SBIN_STUFF = plabslice plabnode plabdaemon plabmetrics plabstats \
plabmonitord
LIB_STUFF = libplab.py
......
#!/usr/bin/perl -wT
#
# EMULAB-COPYRIGHT
# Copyright (c) 2000-2003 University of Utah and the Flux Group.
# All rights reserved.
#
use English;
use Getopt::Std;
use POSIX qw(strftime);
#
# Monitor the condition of plab nodes by continually trying to setup/teardown
# vnodes on pnodes that are in hwdown. The goal is to move the pnodes out
# of hwdown so that the vnodes on that pnode will be considered okay for
# experiments (see ptopgen).
#
#
# Print a usage summary on STDERR and exit with a failure status.
# Never returns.
#
sub usage()
{
    # The script is installed as "plabmonitord"; report that name.
    print STDERR "Usage: plabmonitord [-d]\n";
    exit(-1);
}
# Option string handed to getopts(): a single boolean flag, -d.
my $optlist = "d";
# Set to 1 when -d is given on the command line.
my $debug = 0;
#
# Only real root can call this. $UID is the English.pm name for the
# real user id ($<).
#
if ($UID != 0) {
print STDERR "You must be root to run this script!\n";
exit(-1);
}
#
# Configure variables. The @...@ tokens are placeholders; presumably they
# are substituted when configure generates this script - TODO confirm.
#
my $TB = "@prefix@";
my $TBOPS = "@TBOPSEMAIL@";
# Un-taint the path: the script runs with -T, so PATH is set to a fixed
# list and the shell-influencing environment variables are removed before
# any external command is run.
$ENV{'PATH'} = "/bin:/usr/bin:/usr/local/bin:$TB/sbin:$TB/bin";
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};
# NOTE(review): presumably read by the testbed libraries/tools to grant
# admin privileges - confirm against libdb.
$ENV{'WITH_TB_ADMIN_PRIVS'} = '1';
# Autoflush the currently selected output handle so that log lines appear
# immediately.
$| = 1;
# Load the Testbed support stuff.
use lib "@prefix@/lib";
use libdb;
use libtestbed;
# Be careful not to exit on transient error; raise the libdb retry limit.
$libdb::DBQUERY_MAXTRIES = 30;
# Variables from libdb, cached in scalars so that they can be interpolated
# into the SQL and command strings used by the main loop.
my $PLABMOND_PID = PLABMOND_PID();
my $PLABMOND_EID = PLABMOND_EID();
my $PLABHOLDING_PID = PLABHOLDING_PID();
my $PLABHOLDING_EID = PLABHOLDING_EID();
my $NODEDEAD_PID = NODEDEAD_PID();
my $NODEDEAD_EID = NODEDEAD_EID();
#
# Return the current local time as a string of the form "MM/DD/YY HH:MM:SS".
#
sub TimeStamp()
{
    my @now = localtime();

    return POSIX::strftime("%m/%d/%y %H:%M:%S", @now);
}
#
# Parse command arguments. Once we return from getopts, all that should be
# left are the required arguments.
#
%options = ();

# Reject unknown switches as well as any leftover positional arguments;
# usage() does not return.
usage()
    if (!getopts($optlist, \%options) || @ARGV);

# -d turns on debugging.
$debug = 1
    if (defined($options{"d"}));
#
# We want list of all vnodes in our special experiment, whose pnodes are
# in hwdown. These are the nodes we test, hoping to move them out of
# hwdown.
#
#
# Main loop; never terminates. Each pass picks at most one ("limit 1")
# vnode of the monitor experiment whose physical node is currently held in
# the dead-node experiment, exercises it, and then sleeps for a minute.
#
while (1) {
    my $query_result =
        DBQueryWarn("select r1.node_id,n1.phys_nodeid from reserved as r1 ".
                    "left join nodes as n1 on n1.node_id=r1.node_id ".
                    "left join reserved as r2 on r2.node_id=n1.phys_nodeid ".
                    "where r1.pid='$PLABMOND_PID' and ".
                    " r1.eid='$PLABMOND_EID' and ".
                    " r2.pid='$NODEDEAD_PID' and ".
                    " r2.eid='$NODEDEAD_EID' ".
                    "limit 1");

    if (!$query_result) {
        # Query failed; just try again after the sleep below.
        print "Failed to get node list from DB! Waiting a bit ...\n";
    }
    else {
        while (my ($vnode,$pnode) = $query_result->fetchrow_array()) {
            my $revive = 0;

            sleep(5);
            print "Checking $vnode on $pnode at " . TimeStamp() . "\n";

            #
            # Try to set it up, wait for ISUP, then tear it down.
            #
            system("vnode_setup -f -d $PLABMOND_PID $PLABMOND_EID $vnode");
            if ($?) {
                print "Leaving $pnode in hwdown!\n";
                next;
            }

            # NOTE(review): this treats a false return from TBNodeStateWait
            # as "state reached" - confirm against libdb.
            if (! TBNodeStateWait($vnode, TBDB_NODESTATE_ISUP, time(), 120)) {
                $revive = 1;
            }

            # Always tear the vnode down again; a failed teardown cancels
            # the revival.
            system("vnode_setup -f -k -d $PLABMOND_PID $PLABMOND_EID $vnode");
            if ($?) {
                $revive = 0;
            }

            #
            # That all worked. Move the pnode out of hwdown and back into
            # normal holding experiment.
            #
            if ($revive &&
                DBQueryWarn("update reserved set ".
                            " pid='$PLABHOLDING_PID',eid='$PLABHOLDING_EID' ".
                            "where node_id='$pnode'")) {
                print "$pnode brought back from the afterworld at ".
                    TimeStamp() . "\n";

                # Four separate arguments, matching the SENDMAIL call in
                # fatal(). The message body and the trailing $TBOPS must
                # not be concatenated into a single argument.
                SENDMAIL($TBOPS, "$pnode is alive",
                         "$pnode has been brought back from the afterworld!",
                         $TBOPS);
            }
        }
    }

    # Pause between passes, whether or not the query succeeded.
    sleep(60);
}
exit(0);
#
# Report a fatal error: mail the message to $TBOPS with the subject
# "Plab Monitor Died", then die with the same message. Never returns.
#
# usage: fatal(char *msg)
#
sub fatal($)
{
    # Lexical copy of the message; avoids touching any package-level $msg.
    my ($msg) = @_;

    SENDMAIL($TBOPS, "Plab Monitor Died", $msg, $TBOPS);
    die($msg);
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment