All new accounts created on Gitlab now require administrator approval. If you invite any collaborators, please let Flux staff know so they can approve the accounts.

Commit daf45cef authored by Leigh B Stoller's avatar Leigh B Stoller

Add "poolmonitor" to watch for shared hosts that have died.

parent 246d2022
......@@ -177,7 +177,10 @@ sub Install($$$)
"$LOGDIR/sa_daemon.log 640 9 1000 * Z ".
"/var/run/sa_daemon.pid",
"$LOGDIR/foam.log 640 7 * 168 Z",
"$LOGDIR/flowvisor.log 640 7 1000 * Z");
"$LOGDIR/flowvisor.log 640 7 1000 * Z",
"$LOGDIR/poolmonitor.log 644 7 2000 * Z ".
"/var/run/poolmonitor.pid",
);
};
};
......
#
# Add pool monitor.
#
use strict;
use libinstall;
use installvars;
my $LOGFILE = "$TBROOT/log/poolmonitor.log";
my $TESTBED_STARTUP = "/usr/local/etc/rc.d/3.testbed.sh";
sub InstallUpdate($$)
{
my ($version, $phase) = @_;
#
# If something should run in the pre-install phase.
#
if ($phase eq "pre") {
Phase "poolmonitor.log", "Adding poolmonitor logging", sub {
DoneIfEdited($NEWSYSLOG_CONF);
BackUpFileFatal($NEWSYSLOG_CONF);
AppendToFileFatal($NEWSYSLOG_CONF,
"$LOGFILE 644 7 2000 * Z /var/run/poolmonitor.pid");
};
Phase "startupfile", "Updating testbed startup file", sub {
DoneIfIdentical("$TOP_OBJDIR/rc.d/3.testbed.sh", $TESTBED_STARTUP);
DiffFiles("$TOP_OBJDIR/rc.d/3.testbed.sh", $TESTBED_STARTUP);
ExecQuietFatal("$GMAKE -C $TOP_OBJDIR/rc.d install");
};
}
#
# If something should run in the post-install phase.
#
if ($phase eq "post") {
}
return 0;
}
1;
# Local Variables:
# mode:perl
# End:
......@@ -117,6 +117,11 @@ case "$1" in
@prefix@/sbin/portal_daemon
fi
if [ -x @prefix@/sbin/poolmonitor ]; then
echo -n " poolmonitor"
@prefix@/sbin/poolmonitor
fi
if [ -x @prefix@/sbin/tcppd ]; then
echo -n " tcppd"
@prefix@/sbin/tcppd &
......@@ -195,6 +200,9 @@ case "$1" in
if [ -r /var/run/portal_daemon.pid ]; then
kill `cat /var/run/portal_daemon.pid`
fi
if [ -r /var/run/poolmonitor.pid ]; then
kill `cat /var/run/poolmonitor.pid`
fi
if [ -r /var/run/tcppd.pid ]; then
kill `cat /var/run/tcppd.pid`
fi
......
......@@ -50,7 +50,7 @@ SBIN_SCRIPTS = vlandiff vlansync withadminprivs export_tables cvsupd.pl \
addvpubaddr imageinfo ctrladdr image_import \
prereserve_check tcppd addexternalnetwork \
update_sitevars delete_image sitecheckin sitecheckin_client \
mktestbedtest fixrootcert addservers
mktestbedtest fixrootcert addservers poolmonitor
WEB_SBIN_SCRIPTS= webnewnode webdeletenode webspewconlog webarchive_list \
webwanodecheckin webspewimage webdumpdescriptor \
......
#!/usr/bin/perl -w
#
# Copyright (c) 2008-2013 University of Utah and the Flux Group.
#
# {{{GENIPUBLIC-LICENSE
#
# GENI Public License
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and/or hardware specification (the "Work") to
# deal in the Work without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Work, and to permit persons to whom the Work
# is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Work.
#
# THE WORK IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE WORK OR THE USE OR OTHER DEALINGS
# IN THE WORK.
#
# }}}
#
use strict;
use English;
use Getopt::Std;
use Data::Dumper;
#
# Watch the shared node pool, looking for dead puppies. Does a simple ssh
# with timeout and if it times out or has an error, we send word that
# the node is dead. Report aggregate list once a day.
#
sub usage()
{
print "Usage: poolmonitor [-d] [-s]\n";
print "Options:\n";
print " -d - Run in foreground, do not daemonize.\n";
print " -s - Run once and exit.\n";
exit(1);
}
my $optlist = "ds";
my $debug = 0;
my $oneshot = 0;
#
# Configure variables
#
my $TB = "@prefix@";
my $TBOPS = "@TBOPSEMAIL@";
my $LOGFILE = "$TB/log/poolmonitor.log";
my $SLEEP_INTERVAL= 300;
# Lets not warn more then once a day.
my %warned = ();
my $lastmail = time();
# un-taint path
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin:/usr/site/bin';
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};
# Protos
sub fatal($);
#
# Turn off line buffering on output
#
$| = 1;
if ($UID != 0) {
fatal("Must be root to run this script\n");
}
#
# Check args early so we get the right DB.
#
my %options = ();
if (! getopts($optlist, \%options)) {
usage();
}
if (defined($options{"d"})) {
$debug++;
}
if (defined($options{"s"})) {
$oneshot = 1;
}
# Load the Testbed support stuff.
use lib "@prefix@/lib";
use libtestbed;
use emdb;
use Experiment;
use Node;
use emutil;
use libEmulab;
if (!$oneshot) {
if (CheckDaemonRunning("poolmonitor")) {
fatal("Not starting another poolmonitor daemon!");
}
# Go to ground.
if (! $debug) {
if (TBBackGround($LOGFILE)) {
exit(0);
}
}
if (MarkDaemonRunning("poolmonitor")) {
fatal("Could not mark daemon as running!");
}
}
#
# Setup a signal handler for newsyslog.
#
sub handler()
{
my $SAVEEUID = $EUID;
$EUID = 0;
ReOpenLog($LOGFILE);
$EUID = $SAVEEUID;
}
$SIG{HUP} = \&handler
if (! ($debug || $oneshot));
print "Pool Monitor starting... pid $$, at ".`date`;
while (1) {
if (NoLogins()) {
sleep(5);
next;
}
print "Running at ".
POSIX::strftime("20%y-%m-%d %H:%M:%S", localtime()) . "\n";
my $query_result =
DBQueryWarn("select r.node_id from reserved as r ".
"left join nodes as n on n.node_id=r.node_id ".
"left join node_types as t on t.type=n.type ".
"where sharing_mode is not null and ".
" n.node_id=n.phys_nodeid and t.class='pc'");
goto skip
if (!$query_result || !$query_result->numrows);
while (my ($node_id) = $query_result->fetchrow_array()) {
print "Checking to see if $node_id is reactive ...\n";
my $status = SSHwithTimeout($node_id, "ls / > /dev/null", 15, $debug);
if ($status) {
print "--> $node_id is down for the count!\n";
if (!exists($warned{$node_id})) {
SENDMAIL($TBOPS,
"Shared node $node_id is unresponsive",
"Shared node $node_id is unresponsive",
$TBOPS);
$warned{$node_id} = time();
}
}
else {
print "--> $node_id appears to be alive and kicking!\n";
delete($warned{$node_id})
if (exists($warned{$node_id}));
}
sleep(1);
}
#
# Warn of all nodes down once a day.
#
if (keys(%warned) && (time() - $lastmail) > (24 * 3600)) {
my @nodes = keys(%warned);
print "Nodes still unresponsive after (another) 24 hours: @nodes\n";
SENDMAIL($TBOPS,
"WARNING: unresponsive shared nodes",
"Nodes still unresponsive after (another) 24 hours:\n" .
"@nodes\n",
$TBOPS);
$lastmail = time();
}
if ($oneshot) {
exit(0);
}
skip:
sleep($SLEEP_INTERVAL);
}
sub fatal($)
{
my ($msg) = @_;
#
# Send a message to the testbed list.
#
SENDMAIL($TBOPS,
"Pool monitor daemon died",
$msg,
$TBOPS);
MarkDaemonStopped("poolmonitor");
die("*** $0:\n".
" $msg\n");
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment