Commit 2a5cbb2a authored by Leigh Stoller

New version of the portal monitor that is specific to the Mothership.

This version is intended to replace the old autostatus monitor on bas,
except for monitoring the Mothership itself. Like the autostatus
version, we also notify the Slack channel. Driven from the
apt_aggregates table in the DB, we do the following.

1. fping all the boss nodes.

2. fping all the ops nodes and dboxen. As an aside, there are two
   special cases for now that will eventually come from the database:
   1) Powder wireless aggregates do not have a public ops node, and 2)
   the dboxen are hardwired into a table at the top of the file. (A
   rough sketch of the fping helper used for steps 1 and 2 follows this
   list.)

3. Check all the DNS servers. Unlike autostatus (which just checks
   that port 53 is listening), we do an actual lookup at the server,
   using dig against the boss node with recursion turned off (see the
   dig sketch after this list). At the moment this is a serialized test
   of all the DNS servers; that might need to change later. I've
   lowered the timeout, and if things are operational 99% of the time
   (which I expect), this will be okay until we get a couple of dozen
   aggregates to test.

   Note that this test is skipped if the boss is not pingable in the
   first step, so in general this test will not be a bottleneck.

4. Check all the CMs with a GetVersion() call. As with the DNS check,
   we skip this if the boss does not ping. This test *is* done in
   parallel using ParRun(), since it is slower and the most likely to
   time out when the CM is busy (see the ParRun sketch after this
   list). The timeout is 20 seconds, which seems to be the best balance
   between too much email and not hanging too long on any one
   aggregate.

5. Send email and Slack notifications. The current loop runs every 60
   seconds, and each test has to fail twice in a row before we mark it
   as a failure and send a notification (the fail-twice state machine
   and the Slack webhook call are sketched after this list). We also
   send a 24 hour update for anything that is still down.
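
The fping passes in steps 1 and 2 go through a small helper (doPing()
in the script; its body is outside the hunks below). A minimal sketch
of what such a helper could look like, assuming fping's stock
"is alive" output; the -r/-t flags here are illustrative, not
necessarily the ones actually used:

    # Hypothetical sketch: take a { hostname => urn } map and fill in a
    # { hostname => 0|1 } result map.
    sub doPingSketch($$)
    {
        my ($checks, $results) = @_;
        my @hosts = keys(%{$checks});
        return 0
            if (!@hosts);

        # fping prints "<host> is alive" for each reachable host.
        open(FPING, "$FPING -r 2 -t 500 @hosts 2>/dev/null |")
            or return -1;
        while (<FPING>) {
            $results->{$1} = 1
                if (/^(\S+)\s+is alive/);
        }
        close(FPING);

        # Anything not marked alive counts as down.
        foreach my $host (@hosts) {
            $results->{$host} = 0
                if (!exists($results->{$host}));
        }
        return 0;
    }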
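
Step 3's DNS probe is just the dig call from the diff, run against
each boss that answered ping. Condensed, with an illustrative
hostname:

    # Non-recursive lookup of the boss name at the boss itself; a nonzero
    # exit status (or the 3 second timeout) counts as a DNS failure.
    my $boss = "boss.utah.cloudlab.us";    # illustrative hostname
    system("$DIG $boss \@${boss} +norecurse +short +noanswer +time=3");
    if ($?) {
        print STDERR "DNS $boss failed\n";
    }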
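
Step 4 fans the checks out with ParRun(); this is a condensed view of
the GetVersion() helper in the diff, with the 20 second per-call
timeout handed to APT_Geni::PingAggregate():

    # @agglist holds the APT_Aggregate objects whose boss answered ping.
    my @return_codes = ();
    my $coderef = sub {
        my ($aggregate) = @_;
        my $error;
        # GetVersion-based ping with a 20 second timeout.
        return APT_Geni::PingAggregate($aggregate, \$error, undef, 20) ? -1 : 0;
    };
    if (ParRun({"maxwaittime" => 600, "maxchildren" => 10},
               \@return_codes, $coderef, @agglist)) {
        print STDERR "ParRun failed\n";
    }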
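
The fail-twice rule in step 5 amounts to a small per-test state
machine; ProcessResults() below repeats it for boss ping, nodes, DNS,
and CM status. Roughly, with NotifyDown()/NotifyUp() as hypothetical
stand-ins for the email/Slack reporting:

    # $failed is this loop's result; $state/$stamp are the saved status
    # and timestamp kept in %status between loops.
    if ($failed) {
        if ($state eq "down") {
            # Second consecutive failure: report it and remember we did.
            NotifyDown();
            $state = "dead";
        }
        elsif ($state eq "up") {
            # First failure: just note it and wait for the next loop.
            $state = "down";
            $stamp = time();
        }
        elsif ($dailymail && $state eq "dead") {
            # Still down; re-report once every 24 hours.
            NotifyDown();
        }
    }
    elsif ($state ne "up") {
        # Back up; report the recovery if we had reported it down.
        NotifyUp()
            if ($state eq "dead");
        $state = "up";
        $stamp = undef;
    }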
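
NotifySlack() itself is not in these hunks. A plausible minimal
version, assuming it just posts the message body to the incoming
webhook with curl; the {"text": ...} payload is Slack's documented
webhook format, everything else here is an assumption:

    sub NotifySlackSketch($)
    {
        my ($body) = @_;
        return 0
            if ($body eq "");

        # Escape characters that would break the hand-built JSON and
        # the shell quoting.
        $body =~ s/(["\\])/\\$1/g;
        $body =~ s/\n/\\n/g;
        $body =~ s/'//g;

        # Slack incoming webhooks accept a {"text": "..."} JSON POST.
        system("$CURL -s -m 10 -X POST -H 'Content-type: application/json' ".
               "--data '{\"text\": \"$body\"}' '$SLACKURL' >/dev/null");
        return ($? ? -1 : 0);
    }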

At the moment, the full set of tests takes 15 seconds on our seven
aggregates when they are all up. This will need more tuning later as
the number of aggregates grows.
parent 3dcc45bc
@@ -48,6 +48,8 @@ my $optlist = "dns";
my $debug = 0;
my $impotent = 0;
my $oneshot = 0;
my %status = ();
my $lastdaily = 0;
# Debugging
my $usemydevtree = 0;
@@ -70,9 +72,27 @@ my $TBOPS = "@TBOPSEMAIL@";
my $TBLOGS = "@TBLOGSEMAIL@";
my $MAINSITE = @TBMAINSITE@;
my $LOGFILE = "$TB/log/portal_monitor.log";
my $DIG = "/usr/local/bin/dig";
my $PING = "/sbin/ping";
my $FPING = "/usr/local/sbin/fping";
my $CURL = "/usr/local/bin/curl";
my $SLEEP_INTERVAL = 60;
my $DOWN_THRESHOLD = 120;
my $AGGDOWN_THRESHOLD= 180;
my $DAILY_INTERVAL = 24 * 3600;
my $SLACK = "https://hooks.slack.com/services";
my $SLACKURL = "$SLACK/T0D79QFGC/B9V105D99/kTE1wzm0binEIsBsQFuCkqfK";
#
# Mothership extra node definitions, which will go into the database
# at some point.
#
my %mothershipnodes = (
"emulab.net" => ["dbox1", "dbox2"],
"apt.emulab.net" => ["dbox2"],
"utah.cloudlab.us" => ["dbox1"],
"wisc.cloudlab.us" => ["dbox"],
"clemson.cloudlab.us" => ["dbox"],
);
# un-taint path
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin:/usr/site/bin';
@@ -80,6 +100,7 @@ delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};
# Protos
sub fatal($);
sub doPing($$);
#
# Turn off line buffering on output
@@ -89,6 +110,10 @@ $| = 1;
if ($UID != 0) {
fatal("Must be root to run this script\n");
}
# Silently exit if not the Mothership; this is currently specific to Utah.
if (!$MAINSITE) {
exit(0);
}
#
# Check args early so we get the right DB.
@@ -119,6 +144,7 @@ use Genixmlrpc;
use GeniResponse;
use GeniCredential;
use GeniXML;
use GeniHRN;
use POSIX qw(strftime ceil);
if (! ($oneshot || $impotent)) {
@@ -139,10 +165,16 @@ my $context = APT_Geni::GeniContext();
fatal("Could not load our XMLRPC context")
if (!defined($context));
my $credential = APT_Geni::GenAuthCredential($context->certificate());
if (!defined($credential)) {
print STDERR "Could not generate credential!\n";
return -1;
}
#
# We want this to be a quick test, not a long timeout.
#
Genixmlrpc->SetTimeout(10);
Genixmlrpc->SetTimeout(15);
#
# Setup a signal handler for newsyslog.
@@ -163,202 +195,649 @@ $SIG{HUP} = \&handler
#
sub CheckAggregates()
{
my %aggregates = ();
my %pingchecks = ();
my $query_result =
DBQueryWarn("select urn from apt_aggregates ".
"where nomonitor=0");
"where nomonitor=0 and disabled=0");
return 0
if (!$query_result->numrows);
my $credential = APT_Geni::GenAuthCredential($context->certificate());
if (!defined($credential)) {
print STDERR "Could not generate credential!\n";
return -1;
}
while (my ($urn) = $query_result->fetchrow_array()) {
my $aggregate = APT_Aggregate->Lookup($urn);
if (!defined($aggregate)) {
print STDERR "Could not lookup aggregate: $urn\n";
next;
}
my $nickname = $aggregate->nickname();
my $authority = APT_Geni::GetAuthority($urn);
if (!defined($authority)) {
print STDERR "Could not lookup authority: $urn\n";
next;
# Convert URN into boss/ops hostnames for the ping/DNS tests.
#
my $hrn = GeniHRN->new($urn);
my $boss = "boss." . $hrn->domain();
$aggregates{$urn} = {
"aggregate" => $aggregate,
"alive" => 1,
"domain" => $hrn->domain(),
"boss" => "boss." . $hrn->domain(),
"status" => undef,
"dns" => undef,
"nodes" => {},
};
#
# Hack alert: we have a gross test in for powderwireless until the
# DB is updated; those ops nodes do not have public IPs.
#
if ($hrn->domain() !~ /powderwireless\.net$/) {
$aggregates{$urn}->{"nodes"}->{"ops." . $hrn->domain()} = undef;
}
my $cmurl = $authority->url();
$cmurl = devurl($cmurl) if ($usemydevtree);
if ($debug) {
print "$nickname -> $cmurl\n";
#
# Another hacky check for other nodes that need to be checked.
#
if ($MAINSITE) {
if (exists($mothershipnodes{$hrn->domain()})) {
foreach my $hostname (@{$mothershipnodes{$hrn->domain()}}) {
my $host = "${hostname}." . $hrn->domain();
$aggregates{$urn}->{"nodes"}->{$host} = undef;
}
}
}
# Add new history entries since last loop.
if (!exists($status{$urn})) {
$status{$urn} = {
"alive" => "up",
"stamp" => time(),
"status" => {"status" => "up", "stamp" => time()},
"dns" => {"status" => "up", "stamp" => time()},
"nodes" => {},
};
foreach my $node (keys(%{$aggregates{$urn}->{"nodes"}})) {
$status{$urn}->{"nodes"}->{$node} = {
"status" => "up", "stamp" => time()
};
}
}
}
# Cull out status entries that are gone from the DB since last loop.
foreach my $urn (keys(%status)) {
delete $status{$urn} if (!exists($aggregates{$urn}));
}
#
# Generate a list to check with fping. First check all the boss nodes.
# If we cannot ping boss, then we can skip everything else, marking
# the aggregate down.
#
foreach my $urn (keys(%aggregates)) {
my $aggregate = $aggregates{$urn}->{"aggregate"};
my $boss = $aggregates{$urn}->{"boss"};
#
# Mark as trying to contact.
# Do not do this on the local host; there is no point, and it will
# often fail if we are in a VM.
#
$aggregate->last_attempt(time())
if (!$impotent);
next
if ($aggregate->IsLocalCluster());
my $response =
Genixmlrpc::CallMethod($cmurl, $context, "GetVersion");
$pingchecks{$boss} = $urn;
}
my %pingresults = ();
doPing(\%pingchecks, \%pingresults);
if ($response->code() != GENIRESPONSE_SUCCESS) {
my $nickname = $aggregate->nickname();
my $reason = $response->output();
#
# Go through the results, mark any that failed as down.
#
foreach my $boss (keys(%pingchecks)) {
my $urn = $pingchecks{$boss};
my $alive = $pingresults{$boss};
my $aggregate = $aggregates{$urn}->{"aggregate"};
print STDERR "$nickname: $reason\n";
#
# Decide how to mark the aggregate.
#
my $status;
next
if ($alive);
if ($response->code() == GENIRESPONSE_SERVER_UNAVAILABLE()) {
$status = "offline";
}
else {
$status = "down";
}
if ($impotent) {
print STDERR "Would mark $aggregate as $status\n";
}
else {
if ($aggregate->status() ne $status) {
$aggregate->status($status);
$aggregate->StatusEvent($status);
}
$aggregate->last_error($reason);
}
print STDERR "Ping $boss failed\n";
$aggregates{$urn}->{"alive"} = 0;
}
#
# Check the nodes, skipping any aggregates that failed the boss fping;
# with boss unreachable, the status of anything else at the aggregate is
# suspect and likely to generate needless noise.
#
%pingchecks = ();
foreach my $urn (keys(%aggregates)) {
my $aggregate = $aggregates{$urn}->{"aggregate"};
my $nodelist = $aggregates{$urn}->{"nodes"};
foreach my $hostname (keys(%{$nodelist})) {
$pingchecks{$hostname} = $urn;
}
}
%pingresults = ();
doPing(\%pingchecks, \%pingresults);
#
# Mark down nodes, process below.
#
foreach my $hostname (keys(%pingchecks)) {
my $urn = $pingchecks{$hostname};
my $alive = $pingresults{$hostname};
if ($alive) {
$aggregates{$urn}->{"nodes"}->{$hostname} = "up";
next;
}
print STDERR "Ping $hostname failed\n";
$aggregates{$urn}->{"nodes"}->{$hostname} = "down";
}
#
# DNS checks on boss nodes. If just named is dead, the test below will
# still likely work because of secondaries answering, but we want to
# know if named dies and report it separately.
#
foreach my $urn (keys(%aggregates)) {
my $aggregate = $aggregates{$urn}->{"aggregate"};
my $boss = $aggregates{$urn}->{"boss"};
#
# Mark that we could get the status. Also mark it as up.
# Do not do this if boss ping failed.
#
if ($impotent) {
print "Would mark $aggregate as up\n";
next
if (!$aggregates{$urn}->{"alive"});
if ($debug) {
print "Doing DNS test on $boss\n";
}
system("$DIG $boss \@${boss} +norecurse +short +noanswer +time=3");
if ($?) {
print STDERR "DNS $boss failed\n";
$aggregates{$urn}->{"dns"} = "down";
}
else {
$aggregate->last_success($aggregate->last_attempt());
$aggregate->last_error("");
if ($aggregate->status() ne "up") {
$aggregate->status("up");
$aggregate->StatusEvent("up");
}
$aggregates{$urn}->{"dns"} = "up";
}
}
return 0;
# Not yet sure what to do if this fails.
if (GetVersion(\%aggregates)) {
next;
}
# Process the results and send email.
ProcessResults(\%aggregates);
}
#
# Send email about down clusters. We will send this email every time
# the monitor is restarted, but that's okay.
# Process the results, updating ongoing status info for next time,
# and sending email.
#
my %lastmail = ();
my $lastdaily = 0;
sub SendEmail()
sub ProcessResults($)
{
my %downmail = ();
my %upmail = ();
my $dailymail = 0;
my ($aggregates) = @_;
my %downbosses = ();
my %upbosses = ();
my %downaggs = ();
my %upaggs = ();
my %downnodes = ();
my %upnodes = ();
my %downdns = ();
my %updns = ();
my $dailymail = 0;
#
# We send a summary email once every 24 hours. Maybe do this at a
# set time of day?
#
if (time() - $lastdaily >= (24 * 3600)) {
if (time() - $lastdaily >= $DAILY_INTERVAL) {
$dailymail = 1;
$lastdaily = time();
}
my $query_result =
DBQueryWarn("select urn from apt_aggregates ".
"where nomonitor=0");
return 0
if (!$query_result->numrows);
while (my ($urn) = $query_result->fetchrow_array()) {
my $aggregate = APT_Aggregate->Lookup($urn);
if (!defined($aggregate)) {
print STDERR "Could not lookup aggregate: $urn\n";
next;
#
# First check for ping failures to boss nodes; these aggregates
# are clearly down, but wait for a second failed loop before marking
# them and sending email.
#
foreach my $urn (keys(%{$aggregates})) {
my $ref = $aggregates->{$urn};
my $aggregate = $ref->{"aggregate"};
my $nickname = $aggregate->nickname();
if ($aggregate->IsLocalCluster()) {
# We skipped the ping test on the local host.
}
elsif (!$ref->{'alive'}) {
if ($status{$urn}->{'alive'} eq "down") {
# Down last time too, mark as dead, send email.
if ($impotent) {
print STDERR "Would mark $nickname as down\n";
}
else {
if ($aggregate->status() ne "down") {
$aggregate->status("down");
$aggregate->StatusEvent("down");
}
$aggregate->last_error("Ping failed");
}
$downbosses{$urn} = $status{$urn}->{'stamp'};
# Mark that we have sent email
$status{$urn}->{'alive'} = "dead";
}
elsif ($dailymail && $status{$urn}->{'alive'} eq "dead") {
$downbosses{$urn} = $status{$urn}->{'stamp'};
}
elsif ($status{$urn}->{'alive'} eq "up") {
# Remember for next time, we need to send email.
$status{$urn}->{'alive'} = "down";
$status{$urn}->{'stamp'} = time();
}
}
elsif ($status{$urn}->{'alive'} ne "up") {
#
# Dead last time, up this time. Mark as up and send email.
#
if ($status{$urn}->{'alive'} eq "dead") {
if ($impotent) {
print STDERR "Would mark $nickname as up\n";
}
else {
$aggregate->last_error("");
if ($aggregate->status() ne "up") {
$aggregate->status("up");
$aggregate->StatusEvent("up");
}
}
$upbosses{$urn} = $status{$urn}->{'stamp'};
}
$status{$urn}->{'alive'} = "up";
$status{$urn}->{'stamp'} = undef;
}
#
# No point in doing this for the local cluster. The local admin
# probably knows by the time this message turns up.
# Check the nodes associated with this aggregate.
#
next
if ($aggregate->IsLocalCluster() && !$debug);
if ($aggregate->status() ne "up") {
my $last = str2time($aggregate->last_success());
foreach my $host (keys(%{$ref->{'nodes'}})) {
my $thisstatus = $ref->{'nodes'}->{$host};
my $laststatus = $status{$urn}->{'nodes'}->{$host}->{'status'};
if ($thisstatus eq "down") {
if ($laststatus eq "up") {
# Mark as down, record time.
$status{$urn}->{'nodes'}->{$host}->{'status'} = "down";
$status{$urn}->{'nodes'}->{$host}->{'stamp'} = time();
}
elsif ($dailymail && $laststatus eq "dead") {
# Mark for email
$downnodes{$urn}->{$host} =
$status{$urn}->{'nodes'}->{$host}->{'stamp'};
}
elsif ($laststatus ne "dead") {
# Mark for email
$downnodes{$urn}->{$host} =
$status{$urn}->{'nodes'}->{$host}->{'stamp'};
# Mark as dead so we know we sent email.
$status{$urn}->{'nodes'}->{$host}->{'status'} = "dead";
}
}
elsif ($laststatus ne "up") {
# Node is back, mark for email and clear previous status.
if ($laststatus eq "dead") {
$upnodes{$urn}->{$host} =
$status{$urn}->{'nodes'}->{$host}->{'stamp'};
}
$status{$urn}->{'nodes'}->{$host}->{'status'} = "up";
$status{$urn}->{'nodes'}->{$host}->{'stamp'} = undef;
}
}
#
# Check DNS. These rate separate email. But if boss did not
# ping, then we skip since we do not know anything about DNS.
#
if ($ref->{'alive'}) {
if ($ref->{'dns'} eq "down") {
if ($status{$urn}->{'dns'}->{'status'} eq "down") {
# Dead last time too, mark as dead, send email.
$downdns{$urn} = $status{$urn}->{'dns'}->{'stamp'};
# Mark that we have sent email
$status{$urn}->{'dns'}->{'status'} = "dead";
}
elsif ($dailymail &&
$status{$urn}->{'dns'}->{'status'} eq "dead") {
# Mark for email
$downdns{$urn} = $status{$urn}->{'dns'}->{'stamp'};
}
elsif ($status{$urn}->{'dns'}->{'status'} eq "up") {
# Remember for next time, we need to send email.
$status{$urn}->{'dns'}->{'status'} = "down";
$status{$urn}->{'dns'}->{'stamp'} = time();
}
}
elsif ($status{$urn}->{'dns'}->{'status'} ne "up") {
# DNS is back, mark for email and clear previous status.
if ($status{$urn}->{'dns'}->{'status'} eq "dead") {
$updns{$urn} = $status{$urn}->{'dns'}->{'stamp'};
}
$status{$urn}->{'dns'}->{'status'} = "up";
$status{$urn}->{'dns'}->{'stamp'} = undef;
}
}
#
# Check Aggregate status. These also rate separate email. But if
# boss did not ping, then we skip since we do not know anything
# about the aggregate.
#
if ($ref->{'alive'}) {
if ($ref->{'status'} eq "down") {
if ($status{$urn}->{'status'}->{'status'} eq "down") {
# Dead last time too, mark as dead, send email.
$downaggs{$urn} = $status{$urn}->{'status'}->{'stamp'};
# Mark that we have sent email
$status{$urn}->{'status'}->{'status'} = "dead";
}
elsif ($dailymail &&
$status{$urn}->{'status'}->{'status'} eq "dead") {
# Mark for email.
$downaggs{$urn} = $status{$urn}->{'status'}->{'stamp'};
}
elsif ($status{$urn}->{'status'}->{'status'} eq "up") {
# Remember for next time, we need to send email.
$status{$urn}->{'status'}->{'status'} = "down";
$status{$urn}->{'status'}->{'stamp'} = time();
}
}
elsif ($status{$urn}->{'status'}->{'status'} ne "up") {
# Aggregate is back, mark for email and clear previous status.
if ($status{$urn}->{'status'}->{'status'} eq "dead") {
$upaggs{$urn} = $status{$urn}->{'status'}->{'stamp'};
}
$status{$urn}->{'status'}->{'status'} = "up";
$status{$urn}->{'status'}->{'stamp'} = undef;
}
}
}
#
# And send email.
#
if (keys(%downbosses)) {
my $subject = "Portal Boss Nodes are " .
($dailymail ? "still " : "") . "unreachable";
my $body = "";
foreach my $urn (keys(%downbosses)) {
my $when = $downbosses{$urn};
my $boss = $aggregates->{$urn}->{'boss'};
next
if (time() - $last < $DOWN_THRESHOLD);
$body .= "${boss}: is unreachable since ".
TBDateStringLocal($when) . "\n";
}
NotifySlack($body);
SENDMAIL($TBOPS, $subject, $body, $TBOPS);
}
if (keys(%upbosses)) {
my $subject = "Portal Boss Nodes are back online";
my $body = "";
# Only once per day or once per event.
next
if (exists($lastmail{$urn}) && !$dailymail);
foreach my $urn (keys(%upbosses)) {
my $when = $upbosses{$urn};
my $boss = $aggregates->{$urn}->{'boss'};
$downmail{$urn} = $aggregate;
$lastmail{$urn} = time();
$body .= "${boss}: is now online " . "\n";
}
else {
if (exists($lastmail{$urn})) {
$upmail{$urn} = $aggregate;
delete($lastmail{$urn});
}
NotifySlack($body);
SENDMAIL($TBOPS, $subject, $body, $TBOPS);
}
if (keys(%downaggs)) {
my $subject = "Portal Aggregates are " .
($dailymail ? "still " : "") . "offlne";
my $body = "";
foreach my $urn (keys(%downaggs)) {
my $when = $downaggs{$urn};
my $boss = $aggregates->{$urn}->{'boss'};
$body .= "${boss}: CM is offline since ".
TBDateStringLocal($when) . "\n";
}
NotifySlack($body);
SENDMAIL($TBOPS, $subject, $body, $TBOPS);
}
if (keys(%upmail)) {
my $subject = "Portal Aggregates are alive";
my $body = "${subject}:\n\n";
if (keys(%upaggs)) {
my $subject = "Portal Aggregates are back online";
my $body = "";
foreach my $urn (keys(%upaggs)) {
my $when = $upaggs{$urn};
my $boss = $aggregates->{$urn}->{'boss'};
foreach my $aggregate (values(%upmail)) {
$body .= $aggregate->name() . ": is now online " . "\n";
$body .= "${boss}: CM is now online " . "\n";
}
NotifySlack($body);
SENDMAIL($TBOPS, $subject, $body, $TBOPS);
}
if (keys(%downmail)) {
my $subject = "Portal Aggregates are " .
if (keys(%downdns)) {
my $subject = "Portal DNS servers are " .
($dailymail ? "still " : "") . "offline";
my $body = "${subject}:\n\n";
my $body = "";
foreach my $urn (keys(%downdns)) {
my $when = $downdns{$urn};
my $boss = $aggregates->{$urn}->{'boss'};
$body .= "${boss}: DNS is offline since ".
TBDateStringLocal($when) . "\n";
}
NotifySlack($body);
SENDMAIL($TBOPS, $subject, $body, $TBOPS);
}
if (keys(%updns)) {
my $subject = "Portal DNS servers are back online";
my $body = "";
foreach my $aggregate (values(%downmail)) {
my $status = $aggregate->status();
foreach my $urn (keys(%updns)) {
my $when = $updns{$urn};
my $boss = $aggregates->{$urn}->{'boss'};
$body .= "${boss}: DNS is now online " . "\n";
}
NotifySlack($body);
SENDMAIL($TBOPS, $subject, $body, $TBOPS);
}
if (keys(%downnodes)) {
my $subject = "Portal Nodes are " .
($dailymail ? "still " : "") . "unreachable";
my $body = "";
foreach my $urn (keys(%downnodes)) {
foreach my $hostname (keys(%{$downnodes{$urn}})) {
my $when = $downnodes{$urn}->{$hostname};
$body .= $aggregate->name() . ": is $status since ".
TBDateStringLocal($aggregate->last_success()) . "\n";
$body .= "${hostname}: is unreachable since ".
TBDateStringLocal($when) . "\n";
}
$body .= "\n";
}
NotifySlack($body);
SENDMAIL($TBOPS, $subject, $body, $TBOPS);
}
if (keys(%upnodes)) {
my $subject = "Portal Nodes are back online";
my $body = "";
foreach my $urn (keys(%upnodes)) {
foreach my $hostname (keys(%{$upnodes{$urn}})) {
my $when = $upnodes{$urn}->{$hostname};
$body .= "${hostname}: is now online\n";
}
$body .= "\n";
}
NotifySlack($body);
SENDMAIL($TBOPS, $subject, $body, $TBOPS);
}
}
#
# Do a GetVersion on all aggregates in parallel.
#
sub GetVersion($)
{
my ($aggregates) = @_;
my @return_codes = ();
my @agglist = ();
# Only check if we could ping boss.
foreach my $urn (keys(%{$aggregates})) {
push(@agglist, $aggregates->{$urn}->{"aggregate"})
if ($aggregates->{$urn}->{"alive"});
}
if ($debug) {
print "GetVersion: @agglist\n";
}
my $coderef = sub {
my ($aggregate) = @_;
my $error;
if ($debug) {
print "Checking status: $aggregate\n";
}
# Ping test using GetVersion. We want the actual error message
# back, so use this directly instead of $aggregate->CheckStatus().
# Also want to change the default timeout to be more robust on
# very busy aggregates.
my $retval = APT_Geni::PingAggregate($aggregate, \$error, undef, 20);
if ($retval) {
$aggregate->last_error($error);
return -1;
}
return 0;
};
if (ParRun({"maxwaittime" => 600,
"maxchildren" => 10}, \@return_codes, $coderef, @agglist)) {
print STDERR "ParRun failed";
return -1;
}
#
# Process return codes and update status since work done in fork.
#
foreach my $aggregate (@agglist) {