Commit ca503691 authored by Leigh Stoller's avatar Leigh Stoller

Add a watchdog to watch pubsubd on ops, which does die/hang

every now and then. Seems to happen a lot on the racks, and
there are lots of them.
parent 86f54631
#
# Copyright (c) 2000-2012 University of Utah and the Flux Group.
# Copyright (c) 2000-2014 University of Utah and the Flux Group.
#
# {{{EMULAB-LICENSE
#
......@@ -49,14 +49,15 @@ SUBDIRS += monitoring
endif
endif
all: etc-subdir all-subdirs
all: etc-subdir all-subdirs event_watchdog
include $(TESTBED_SRCDIR)/GNUmakerules
etc-subdir:
@$(MAKE) -C etc all
install: install-subdirs
install: install-subdirs \
$(INSTALL_SBINDIR)/event_watchdog
client: client-subdirs
client-install: client client-install-subdirs
......
#!/usr/bin/perl -w
#
# Copyright (c) 2004-2014 University of Utah and the Flux Group.
#
# {{{EMULAB-LICENSE
#
# This file is part of the Emulab network testbed software.
#
# This file is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
#
# This file is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this file. If not, see <http://www.gnu.org/licenses/>.
#
# }}}
#
use English;
use Getopt::Std;
use POSIX qw(strftime);
sub usage()
{
print "Usage: $0 [-d]\n";
exit(1);
}
my $optlist = "d";
my $debug = 0;
my $paused = 0;
#
# Configure variables.
#
my $TB = "@prefix@";
my $TBOPS = "@TBOPSEMAIL@";
my $LOGFILE = "$TB/log/event_watchdog.log";
# Turn off line buffering on output
$| = 1;
# Only root.
if ($UID != 0) {
die("*** $0:\n".
" Must be root to run this script!\n");
}
#
# Testbed support libs.
#
use lib "@prefix@/lib";
use libtestbed;
use emutil;
use libEmulab;
use event;
# Protos
sub TryEvents();
sub notify($);
sub fatal($);
my %options = ();
if (! getopts($optlist, \%options)) {
usage();
}
if (defined($options{"d"})) {
$debug = 1;
}
usage()
if (@ARGV);
if (CheckDaemonRunning("event_watchdog")) {
fatal("Not starting another event_watchdog daemon!");
}
# Go to ground.
if (! $debug) {
if (TBBackGround($LOGFILE)) {
exit(0);
}
}
if (MarkDaemonRunning("event_watchdog")) {
fatal("Could not mark event_watchdog as running!");
}
#
# Delay at startup to allow ops time to boot. Hopefully this
# will prevent too many false alarms at system startup time.
#
sleep(60)
if (!$debug);
# Loop forever ...
while (1) {
if (TryEvents() < 0) {
if (!$paused) {
notify("Event server is offline at ".
POSIX::strftime("20%y-%m-%d %H:%M:%S", localtime()) . ".\n");
}
$paused = 1;
}
else {
if ($paused) {
notify("Event server is back online at ".
POSIX::strftime("20%y-%m-%d %H:%M:%S", localtime()) .
". Phew!\n");
}
$paused = 0;
}
sleep(60);
}
#
# Try to connect to event server, protected by a timeout.
#
# Returns: 0 if everything is fine.
# 1 if timed out or otherwise unresponsive.
#
sub TryEvents()
{
my $URL = "elvin://event-server";
my $rval = 0;
print "Trying to connect to the event server at $URL\n"
if ($debug);
my $childpid = fork();
if ($childpid) {
#
# Parent waits for child to complete.
#
local $SIG{ALRM} = sub { kill("USR1", $childpid); };
alarm 30;
waitpid($childpid, 0);
alarm 0;
# An exit value of 30 is the USR1 signal.
if ($?) {
if ($? == 30) {
$rval = -1;
}
elsif (($? >> 8) == 1) {
$rval = -1;
}
# 15 is TERM. Do nothing.
elsif ($? == 15) {
$rval = 0;
}
else {
$rval = -1;
}
}
else {
$rval = 0;
}
print "Attempt is returning $rval\n"
if ($debug);
return $rval;
}
else {
my $handle = event_register($URL,0);
fatal("Unable to register with event system!")
if (!$handle);
exit(0);
}
}
sub notify($)
{
my($mesg) = $_[0];
print $mesg;
#
# Send a message to the testbed list.
#
SENDMAIL($TBOPS,
"Event Watchdog Message",
$mesg,
$TBOPS);
}
sub fatal($)
{
my ($msg) = @_;
#
# Send a message to the testbed list.
#
SENDMAIL($TBOPS,
"Event watchdog daemon died",
$msg,
$TBOPS);
MarkDaemonStopped("event_watchdog");
die("*** $0:\n".
" $msg\n");
}
......@@ -127,6 +127,11 @@ case "$1" in
@prefix@/sbin/tcppd &
fi
if [ -x @prefix@/sbin/event_watchdog ]; then
echo -n " event_watchdog"
@prefix@/sbin/event_watchdog
fi
#
# Could trigger experiment creation, so make sure everything
# else is setup first; i.e., run this last!
......@@ -209,6 +214,9 @@ case "$1" in
if [ -r /var/run/tcppd.pid ]; then
kill `cat /var/run/tcppd.pid`
fi
if [ -r /var/run/event_watchdog.pid ]; then
kill `cat /var/run/event_watchdog.pid`
fi
;;
*)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment