Commit 6b75cf38 authored by Leigh Stoller's avatar Leigh Stoller

Add a 60 second timer to tell Emulab that the node is alive and

well. We use a UDP packet to keep it lightweight. If it does not get
through, that's okay, obviously. The return value is just a yes/no flag
that says an update needs to run. Right now, that's just accounts.
This allows us to churn a little less on accounts.
Other cleanups.
parent 43939c43
......@@ -12,10 +12,10 @@ use POSIX qw(strftime);
#
# Print the command-line synopsis and exit with status 1.
# NOTE(review): this listing is a diff shown without +/- markers, so the
# two print lines below look like the removed and the added version of the
# same statement -- confirm against the committed file.
sub usage()
{
print "Usage: watchdog [-t timeout]\n";
print "Usage: watchdog [-n] [-t timeout]\n";
exit(1);
}
# Option spec handed to getopts(): "t:" is -t with a value, "n" is a bare flag.
# NOTE(review): the first assignment appears to be the pre-change line of the
# diff; only one of the two can exist in the real file -- confirm.
my $optlist = "t:";
my $optlist = "t:n";
#
# Turn off line buffering on output
......@@ -41,6 +41,7 @@ my $timeout = (60 * 30); # In seconds of course.
# Log file name; passed to TBBackGround() further down.
my $logname = "/tmp/emulab-watchdog.debug";
# NOTE(review): not used in the visible part of the file; presumably the
# directory holding per-vnode state -- confirm.
my $vndir = "/var/testbed";
# File our pid is written to so that we can be killed later.
my $pidfile = "/var/run/emulab-watchdog.pid";
# Set to 1 by the -n option; when set, bootvnodes() is not called at startup.
my $noboot = 0;
#
# Parse command arguments. Once we return from getopts, all that should be
......@@ -53,6 +54,9 @@ if (! getopts($optlist, \%options)) {
# -t <seconds> replaces the default timeout.
$timeout = $options{"t"}
    if (defined($options{"t"}));

# -n turns off the vnode boot step.
$noboot = 1
    if (defined($options{"n"}));

# No positional arguments are accepted.
usage()
    if (@ARGV);
......@@ -68,6 +72,21 @@ if (1 && TBBackGround($logname)) {
exit(0);
}
#
# Setup a handler to catch TERM, and kill our process group.
#
# Our own process group id; the handler below signals the whole group.
my $pgrp = getpgrp(0);

#
# Signal handler for TERM and INT: pass TERM on to every process in our
# process group, wait five seconds, then exit with status 0.
#
sub handler () {
    # Ignore both signals from here on; the group-wide kill reaches us too.
    foreach my $signame ('TERM', 'INT') {
        $SIG{$signame} = 'IGNORE';
    }
    kill('TERM', -$pgrp);
    sleep(5);
    exit(0);
}

# Install the handler for both signals.
foreach my $signame ('TERM', 'INT') {
    $SIG{$signame} = \&handler;
}
#
# Write our pid into the pid file so we can be killed later (when the
# experiment is torn down). We must do this first so that we can be
......@@ -78,7 +97,6 @@ open(PFILE, "> $pidfile")
print PFILE "$PID\n";
close(PFILE);
#
#
# Inform TMCD that we have rebooted, and are starting testbed setup.
#
......@@ -89,8 +107,10 @@ system("tmcc state REBOOTED");
# Run the account update ("update -i") once at startup. The exit status of
# system() is not checked.
print "Looking for new Emulab accounts ...\n";
system("update -i");
# NOTE(review): the unconditional call below looks like the pre-change side of
# this diff hunk, superseded by the $noboot-guarded call that follows --
# confirm against the committed file.
# Also setup existing vnodes.
bootvnodes();
# Skip the vnode setup when -n was given ($noboot set).
if (! $noboot) {
# Also setup existing vnodes.
bootvnodes();
}
#
# Inform TMCD that we are up and running.
......@@ -98,6 +118,48 @@ bootvnodes();
# Report state ISUP through the tmcc command. The exit status of system() is
# not checked, so a failed report does not stop the watchdog.
print "Informing Emulab Operations that we're up and running ...\n";
system("tmcc state ISUP");
#
# Fire off a child that does nothing but tell the boss we are alive.
#
#
# Fork the keep-alive child. The child loops forever: once a minute it runs
# "tmcc isalive" and, when the reply asks for it, runs "update -i". The
# parent falls through to the code that follows this block.
#
my $mypid = fork();
if (!defined($mypid)) {
    #
    # fork() returns undef on failure. Without this test "! $mypid" would be
    # true in the parent, which would then drop into the endless keep-alive
    # loop itself and never reach the main loop. The keep-alive is best
    # effort, so log the problem and carry on without it.
    # NOTE(review): $mypid stays undef in this case; confirm that any later
    # use of it tolerates that.
    #
    print "Could not fork keep alive child: $!\n";
}
elsif (! $mypid) {
    print "Keep alive starting up ... \n";

    while (1) {
        #
        # Run tmcc in UDP mode. The command is ignored at the other end.
        # It's just the connection that tells tmcd we are alive.
        # Since it's UDP, we try it a couple of times if it fails.
        #
        my $retries = 3;

        while ($retries) {
            my $options = "";

            # The extra tmcc options are only added when REMOTE() is true.
            if (REMOTE()) {
                $options .= " -u -t 3";
            }
            my $result = `tmcc $options isalive`;

            # $? is the exit status of the backtick command; zero is success.
            if (! $?) {
                chomp $result;

                # The reply is split on "="; the text after the first "="
                # is the yes/no flag saying whether an update needs to run.
                my (undef,$update) = split("=", $result);

                if ($update) {
                    # %Y gives the full four digit year directly.
                    my $date = POSIX::strftime("%Y/%m/%d %H:%M:%S",
                                               localtime());

                    print "Running an update at $date ...\n";
                    system("update -i");
                }
                last;
            }
            $retries--;
        }

        # $retries only reaches zero when every attempt failed; $? still
        # holds the status of the last attempt.
        if (!$retries) {
            print "keep alive returned $?\n";
        }
        sleep(60);
    }
    exit(0);
}
#
# Loop!
#
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment