Commit 2a5cbb2a authored by Leigh Stoller

New version of the portal monitor that is specific to the Mothership.

This version is intended to replace the old autostatus monitor on bas,
except for monitoring the Mothership itself. As with the autostatus
version, we also notify the Slack channel. Driven from the apt_aggregates
table in the DB, we do the following.

1. fping all the boss nodes.

2. fping all the ops nodes and dboxen (see the ping sketch after this
   list). Aside: there are two special cases for now that will
   eventually come from the database. 1) the powder wireless aggregates
   do not have a public ops node, and 2) the dboxen are hardwired into a
   table at the top of the file.

3. Check all the DNS servers. Unlike autostatus (which just checks that
   port 53 is listening), we do an actual lookup at the server. This is
   done with dig @ the boss node with recursion turned off (sketched
   below). At the moment this is a serialized test of all the DNS
   servers; we might need to change that later. I've lowered the
   timeout, and if things are operational 99% of the time (which I
   expect), this will be okay until we get a couple of dozen aggregates
   to test.

   Note that this test is skipped if the boss is not pingable in the
   first step, so in general this test will not be a bottleneck.

4. Check all the CMs with a GetVersion() call. As with the DNS check, we
   skip this if the boss does not ping. This test *is* done in parallel
   using ParRun(), since it is slower and the most likely to time out
   when the CM is busy (a rough sketch follows this list). The timeout
   is 20 seconds, which seems to be the best balance between too much
   email and not hanging for too long on any one aggregate.

5. Send email and Slack notifications (see the webhook sketch below).
   The current loop runs every 60 seconds, and each test has to fail
   twice in a row before we mark it as a failure and send notification.
   We also send a 24 hour update for anything that is still down.
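
A minimal sketch of the fping step (items 1 and 2), assuming fping's -a
(print alive hosts) output; the flags and parsing here are illustrative
and not necessarily what the script's doPing() ends up doing:

    sub doPing($$)
    {
        my ($checks, $results) = @_;
        my @hosts = keys(%{$checks});
        return
            if (!@hosts);

        # With -a, fping prints just the names of the hosts that answer.
        open(PINGER, "$FPING -a -r 2 -t 1000 @hosts 2>/dev/null |")
            or return;
        my %alive = map { chomp($_); $_ => 1 } <PINGER>;
        close(PINGER);

        foreach my $host (@hosts) {
            $results->{$host} = (exists($alive{$host}) ? 1 : 0);
        }
    }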
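
The DNS test in item 3 boils down to the dig invocation that appears in
the diff below; a hedged sketch, treating a non-zero dig exit status
(e.g. no reply from the server) as a failure. The CheckDNS() wrapper is
hypothetical:

    sub CheckDNS($)
    {
        my ($boss) = @_;

        # Ask the boss about itself, recursion off, 3 second timeout.
        system("$DIG $boss \@${boss} +norecurse +short +noanswer +time=3 ".
               ">/dev/null 2>&1");
        return ($? == 0 ? "up" : "down");
    }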
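
Item 4 uses the local ParRun() helper; the fork()-based stand-in below
only illustrates the shape of the parallel GetVersion() check. The
"cmurl" slot is hypothetical (the real code derives the CM URL from the
aggregate's authority), and $context is the script's global GeniContext:

    sub CheckCMs($)
    {
        my ($aggregates) = @_;
        my %children = ();

        # Bound how long any one GetVersion() call can hang.
        Genixmlrpc->SetTimeout(20);

        foreach my $urn (keys(%{$aggregates})) {
            my $cmurl = $aggregates->{$urn}->{"cmurl"};  # hypothetical slot
            my $pid   = fork();
            next
                if (!defined($pid));
            if (!$pid) {
                my $response =
                    Genixmlrpc::CallMethod($cmurl, $context, "GetVersion");
                exit(defined($response) &&
                     $response->code() == GENIRESPONSE_SUCCESS() ? 0 : 1);
            }
            $children{$pid} = $urn;
        }
        while (keys(%children)) {
            my $pid = wait();
            last
                if ($pid < 0);
            my $urn = delete($children{$pid});
            $aggregates->{$urn}->{"status"} = ($? == 0 ? "up" : "down");
        }
    }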
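
And a rough sketch of the Slack half of item 5: the notification is just
a JSON payload POSTed to the incoming-webhook URL ($SLACKURL) with curl.
The escaping is minimal and only illustrative; the real NotifySlack() in
the script may differ:

    sub NotifySlack($)
    {
        my ($body) = @_;
        return
            if ($impotent);

        # Very minimal escaping for the JSON string value.
        $body =~ s/(["\\])/\\$1/g;
        $body =~ s/\n/\\n/g;
        my $payload = '{"text": "' . $body . '"}';

        system("$CURL -s -m 10 -X POST -H 'Content-type: application/json' ".
               "--data '$payload' $SLACKURL >/dev/null");
        print STDERR "Slack notification failed\n"
            if ($?);
    }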

At the moment, the full set of tests takes 15 seconds on our seven
aggregates when they are all up. Will need more tuning later, as the
number of aggregates goes up.
parent 3dcc45bc
@@ -48,6 +48,8 @@ my $optlist = "dns";
my $debug = 0;
my $impotent = 0;
my $oneshot = 0;
my %status = ();
my $lastdaily = 0;
# Debugging
my $usemydevtree = 0;
@@ -70,9 +72,27 @@ my $TBOPS = "@TBOPSEMAIL@";
my $TBLOGS = "@TBLOGSEMAIL@";
my $MAINSITE = @TBMAINSITE@;
my $LOGFILE = "$TB/log/portal_monitor.log";
my $DIG = "/usr/local/bin/dig";
my $PING = "/sbin/ping";
my $FPING = "/usr/local/sbin/fping";
my $CURL = "/usr/local/bin/curl";
my $SLEEP_INTERVAL = 60;
my $DOWN_THRESHOLD = 120; my $AGGDOWN_THRESHOLD= 180;
my $DAILY_INTERVAL = 24 * 3600;
my $SLACK = "https://hooks.slack.com/services";
my $SLACKURL = "$SLACK/T0D79QFGC/B9V105D99/kTE1wzm0binEIsBsQFuCkqfK";
#
# Mothership extra node definitions, which will go into the database
# at some point.
#
my %mothershipnodes = (
"emulab.net" => ["dbox1", "dbox2"],
"apt.emulab.net" => ["dbox2"],
"utah.cloudlab.us" => ["dbox1"],
"wisc.cloudlab.us" => ["dbox"],
"clemson.cloudlab.us" => ["dbox"],
);
# un-taint path
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin:/usr/site/bin';
@@ -80,6 +100,7 @@ delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};
# Protos
sub fatal($);
sub doPing($$);
#
# Turn off line buffering on output
@@ -89,6 +110,10 @@ $| = 1;
if ($UID != 0) {
fatal("Must be root to run this script\n");
}
# Silently exit if not the Mothership; this is currently specific to Utah.
if (!$MAINSITE) {
exit(0);
}
#
# Check args early so we get the right DB.
@@ -119,6 +144,7 @@ use Genixmlrpc;
use GeniResponse;
use GeniCredential;
use GeniXML;
use GeniHRN;
use POSIX qw(strftime ceil);
if (! ($oneshot || $impotent)) {
@@ -139,10 +165,16 @@ my $context = APT_Geni::GeniContext();
fatal("Could not load our XMLRPC context")
if (!defined($context));
my $credential = APT_Geni::GenAuthCredential($context->certificate());
if (!defined($credential)) {
print STDERR "Could not generate credential!\n";
return -1;
}
#
# We want this to be a quick test, not a long timeout.
#
Genixmlrpc->SetTimeout(10); Genixmlrpc->SetTimeout(15);
#
# Setup a signal handler for newsyslog.
@@ -163,202 +195,649 @@ $SIG{HUP} = \&handler
#
sub CheckAggregates()
{
my %aggregates = ();
my %pingchecks = ();
my $query_result =
DBQueryWarn("select urn from apt_aggregates ".
"where nomonitor=0"); "where nomonitor=0 and disabled=0");
return 0
if (!$query_result->numrows);
my $credential = APT_Geni::GenAuthCredential($context->certificate());
if (!defined($credential)) {
print STDERR "Could not generate credential!\n";
return -1;
}
while (my ($urn) = $query_result->fetchrow_array()) {
my $aggregate = APT_Aggregate->Lookup($urn);
if (!defined($aggregate)) {
print STDERR "Could not lookup aggregate: $urn\n";
next;
}
my $nickname = $aggregate->nickname(); # Convert URN into a boss/ops hostnames for the ping/DNS tests.
my $authority = APT_Geni::GetAuthority($urn); #
if (!defined($authority)) { my $hrn = GeniHRN->new($urn);
print STDERR "Could not lookup authority: $urn\n"; my $boss = "boss." . $hrn->domain();
next;
$aggregates{$urn} = {
"aggregate" => $aggregate,
"alive" => 1,
"domain" => $hrn->domain(),
"boss" => "boss." . $hrn->domain(),
"status" => undef,
"dns" => undef,
"nodes" => {},
};
#
# Hack alert; we have a gross test in for powderwireless until
# DB is updated; the ops nodes do not have public IPs.
#
if ($hrn->domain() !~ /powderwireless\.net$/) {
$aggregates{$urn}->{"nodes"}->{"ops." . $hrn->domain()} = undef;
} }
my $cmurl = $authority->url();
$cmurl = devurl($cmurl) if ($usemydevtree);
if ($debug) { #
print "$nickname -> $cmurl\n"; # Another hacky check for other nodes that need to be checked.
#
if ($MAINSITE) {
if (exists($mothershipnodes{$hrn->domain()})) {
foreach my $hostname (@{$mothershipnodes{$hrn->domain()}}) {
my $host = "${hostname}." . $hrn->domain();
$aggregates{$urn}->{"nodes"}->{$host} = undef;
}
}
}
# Add new history entries since last loop.
if (!exists($status{$urn})) {
$status{$urn} = {
"alive" => "up",
"stamp" => time(),
"status" => {"status" => "up", "stamp" => time()},
"dns" => {"status" => "up", "stamp" => time()},
"nodes" => {},
};
foreach my $node (keys(%{$aggregates{$urn}->{"nodes"}})) {
$status{$urn}->{"nodes"}->{$node} = {
"status" => "up", "stamp" => time()
};
}
} }
}
# Cull out status entries that are gone from the DB since last loop.
foreach my $urn (keys(%status)) {
delete $status{$urn} if (!exists($aggregates{$urn}));
}
#
# Generate a list to check with fping. First check all the boss nodes.
# If we cannot ping boss, then we can skip everything else, marking
# the aggregate down.
#
foreach my $urn (keys(%aggregates)) {
my $aggregate = $aggregates{$urn}->{"aggregate"};
my $boss = $aggregates{$urn}->{"boss"};
# #
# Mark as trying to contact. # Do not do this on the local host, no point and it will often
# fail if we are in a VM.
# #
$aggregate->last_attempt(time()) next
if (!$impotent); if ($aggregate->IsLocalCluster());
my $response = $pingchecks{$boss} = $urn;
Genixmlrpc::CallMethod($cmurl, $context, "GetVersion"); }
my %pingresults = ();
doPing(\%pingchecks, \%pingresults);
if ($response->code() != GENIRESPONSE_SUCCESS) { #
my $nickname = $aggregate->nickname(); # Go through the results, mark any that failed as down.
my $reason = $response->output(); #
foreach my $boss (keys(%pingchecks)) {
my $urn = $pingchecks{$boss};
my $alive = $pingresults{$boss};
my $aggregate = $aggregates{$urn}->{"aggregate"};
print STDERR "$nickname: $reason\n"; next
if ($alive);
#
# Decide how to mark the aggregate.
#
my $status;
if ($response->code() == GENIRESPONSE_SERVER_UNAVAILABLE()) { print STDERR "Ping $boss failed\n";
$status = "offline"; $aggregates{$urn}->{"alive"} = 0;
} }
else {
$status = "down"; #
} # Check the nodes, skipping any aggregates that failed boss fping, with
if ($impotent) { # boss unreachable the status of anything else at the aggregate is
print STDERR "Would mark $aggregate as $status\n"; # suspect and likely to generate needless noise.
} #
else { %pingchecks = ();
if ($aggregate->status() ne $status) {
$aggregate->status($status); foreach my $urn (keys(%aggregates)) {
$aggregate->StatusEvent($status); my $aggregate = $aggregates{$urn}->{"aggregate"};
} my $nodelist = $aggregates{$urn}->{"nodes"};
$aggregate->last_error($reason); foreach my $hostname (keys(%{$nodelist})) {
} $pingchecks{$hostname} = $urn;
}
}
%pingresults = ();
doPing(\%pingchecks, \%pingresults);
#
# Mark down nodes, process below.
#
foreach my $hostname (keys(%pingchecks)) {
my $urn = $pingchecks{$hostname};
my $alive = $pingresults{$hostname};
if ($alive) {
$aggregates{$urn}->{"nodes"}->{$hostname} = "up";
next; next;
} }
print STDERR "Ping $hostname failed\n";
$aggregates{$urn}->{"nodes"}->{$hostname} = "down";
}
#
# DNS checks on boss nodes. If just named is dead the test below will
# still likely work because of secondaries answering, but we want to know
# if named dies and report it separately.
#
foreach my $urn (keys(%aggregates)) {
my $aggregate = $aggregates{$urn}->{"aggregate"};
my $boss = $aggregates{$urn}->{"boss"};
# #
# Mark that we could get the status. Also mark it as up. # Do not do this if boss ping failed.
# #
if ($impotent) { next
print "Would mark $aggregate as up\n"; if (!$aggregates{$urn}->{"alive"});
if ($debug) {
print "Doing DNS test on $boss\n";
}
system("$DIG $boss \@${boss} +norecurse +short +noanswer +time=3");
if ($?) {
print STDERR "DNS $boss failed\n";
$aggregates{$urn}->{"dns"} = "down";
} }
else { else {
$aggregate->last_success($aggregate->last_attempt()); $aggregates{$urn}->{"dns"} = "up";
$aggregate->last_error("");
if ($aggregate->status() ne "up") {
$aggregate->status("up");
$aggregate->StatusEvent("up");
}
} }
} }
return 0; # Not yet sure what to do if this fails.
if (GetVersion(\%aggregates)) {
next;
}
# Process the results and send email.
ProcessResults(\%aggregates);
} }
# #
# Send email about down clusters. We will send this email every time # Process the results, updating ongoing status info for next time,
# monitor is restarted, but thats okay. # and sending email.
# #
my %lastmail = (); sub ProcessResults($)
my $lastdaily = 0;
sub SendEmail()
{ {
my %downmail = (); my ($aggregates) = @_;
my %upmail = (); my %downbosses = ();
my $dailymail = 0; my %upbosses = ();
my %downaggs = ();
my %upaggs = ();
my %downnodes = ();
my %upnodes = ();
my %downdns = ();
my %updns = ();
my $dailymail = 0;
# #
# We send a summary email once every 24 hours. Maybe do this at a # We send a summary email once every 24 hours. Maybe do this at a
# set time of day? # set time of day?
# #
if (time() - $lastdaily >= (24 * 3600)) { if (time() - $lastdaily >= $DAILY_INTERVAL) {
$dailymail = 1; $dailymail = 1;
$lastdaily = time(); $lastdaily = time();
} }
my $query_result =
DBQueryWarn("select urn from apt_aggregates ".
"where nomonitor=0");
return 0
if (!$query_result->numrows);
while (my ($urn) = $query_result->fetchrow_array()) { #
my $aggregate = APT_Aggregate->Lookup($urn); # First check for ping failures to boss nodes; these aggregates
if (!defined($aggregate)) { # are clearly down, but wait till a second loop to mark them and
print STDERR "Could not lookup aggregate: $urn\n"; # send email.
next; #
foreach my $urn (keys(%{$aggregates})) {
my $ref = $aggregates->{$urn};
my $aggregate = $ref->{"aggregate"};
my $nickname = $aggregate->nickname();
if ($aggregate->IsLocalCluster()) {
# We skipped the ping test on the local host.
}
elsif (!$ref->{'alive'}) {
if ($status{$urn}->{'alive'} eq "down") {
# Down last time too, mark as dead, send email.
if ($impotent) {
print STDERR "Would mark $nickname as down\n";
}
else {
if ($aggregate->status() ne "down") {
$aggregate->status("down");
$aggregate->StatusEvent("down");
}
$aggregate->last_error("Ping failed");
}
$downbosses{$urn} = $status{$urn}->{'stamp'};
# Mark that we have sent email
$status{$urn}->{'alive'} = "dead";
}
elsif ($dailymail && $status{$urn}->{'alive'} eq "dead") {
$downbosses{$urn} = $status{$urn}->{'stamp'};
}
elsif ($status{$urn}->{'alive'} eq "up") {
# Remember for next time, we need to send email.
$status{$urn}->{'alive'} = "down";
$status{$urn}->{'stamp'} = time();
}
}
elsif ($status{$urn}->{'alive'} ne "up") {
#
# Dead last time, up this time. Mark as up and send email.
#
if ($status{$urn}->{'alive'} eq "dead") {
if ($impotent) {
print STDERR "Would mark $nickname as up\n";
}
else {
$aggregate->last_error("");
if ($aggregate->status() ne "up") {
$aggregate->status("up");
$aggregate->StatusEvent("up");
}
}
$upbosses{$urn} = $status{$urn}->{'stamp'};
}
$status{$urn}->{'alive'} = "up";
$status{$urn}->{'stamp'} = undef;
} }
# #
# No point in doing this for the local cluster. The local admin # Check the nodes associated this aggregate.
# probably knows by the time this message turns up.
# #
next foreach my $host (keys(%{$ref->{'nodes'}})) {
if ($aggregate->IsLocalCluster() && !$debug); my $thisstatus = $ref->{'nodes'}->{$host};
my $laststatus = $status{$urn}->{'nodes'}->{$host}->{'status'};
if ($aggregate->status() ne "up") {
my $last = str2time($aggregate->last_success()); if ($thisstatus eq "down") {
if ($laststatus eq "up") {
# Mark as down, record time.
$status{$urn}->{'nodes'}->{$host}->{'status'} = "down";
$status{$urn}->{'nodes'}->{$host}->{'stamp'} = time();
}
elsif ($dailymail && $laststatus eq "dead") {
# Mark for email
$downnodes{$urn}->{$host} =
$status{$urn}->{'nodes'}->{$host}->{'stamp'};
}
elsif ($laststatus ne "dead") {
# Mark for email
$downnodes{$urn}->{$host} =
$status{$urn}->{'nodes'}->{$host}->{'stamp'};
# Mark as dead so we know we sent email.
$status{$urn}->{'nodes'}->{$host}->{'status'} = "dead";
}
}
elsif ($laststatus ne "up") {
# Node is back, mark for email and clear previous status.
if ($laststatus eq "dead") {
$upnodes{$urn}->{$host} =
$status{$urn}->{'nodes'}->{$host}->{'stamp'};
}
$status{$urn}->{'nodes'}->{$host}->{'status'} = "up";
$status{$urn}->{'nodes'}->{$host}->{'stamp'} = undef;
}
}
#
# Check DNS. These rate separate email. But if boss did not
# ping, then we skip since we do not know anything about DNS.
#
if ($ref->{'alive'}) {
if ($ref->{'dns'} eq "down") {
if ($status{$urn}->{'dns'}->{'status'} eq "down") {
# Dead last time too, mark as dead, send email.
$downdns{$urn} = $status{$urn}->{'dns'}->{'stamp'};
# Mark that we have sent email
$status{$urn}->{'dns'}->{'status'} = "dead";
}
elsif ($dailymail &&
$status{$urn}->{'dns'}->{'status'} eq "dead") {
# Mark for email
$downdns{$urn} = $status{$urn}->{'dns'}->{'stamp'};
}
elsif ($status{$urn}->{'dns'}->{'status'} eq "up") {
# Remember for next time, we need to send email.
$status{$urn}->{'dns'}->{'status'} = "down";
$status{$urn}->{'dns'}->{'stamp'} = time();
}
}
elsif ($status{$urn}->{'dns'}->{'status'} ne "up") {
# DNS is back, mark for email and clear previous status.
if ($status{$urn}->{'dns'}->{'status'} eq "dead") {
$updns{$urn} = $status{$urn}->{'dns'}->{'stamp'};
}
$status{$urn}->{'dns'}->{'status'} = "up";
$status{$urn}->{'dns'}->{'stamp'} = undef;
}
}
#
# Check Aggregate status. These also rate separate email. But if
# boss did not ping, then we skip since we do not know anything
# about the aggregate.
#
if ($ref->{'alive'}) {
if ($ref->{'status'} eq "down") {
if ($status{$urn}->{'status'}->{'status'} eq "down") {
# Dead last time too, mark as dead, send email.
$downaggs{$urn} = $status{$urn}->{'status'}->{'stamp'};
# Mark that we have sent email
$status{$urn}->{'status'}->{'status'} = "dead";
}
elsif ($dailymail &&
$status{$urn}->{'status'}->{'status'} eq "dead") {
# Mark for email.
$downaggs{$urn} = $status{$urn}->{'status'}->{'stamp'};
}
elsif ($status{$urn}->{'status'}->{'status'} eq "up") {
# Remember for next time, we need to send email.
$status{$urn}->{'status'}->{'status'} = "down";
$status{$urn}->{'status'}->{'stamp'} = time();
}
}
elsif ($status{$urn}->{'status'}->{'status'} ne "up") {
# Aggregate is back, mark for email and clear previous status.
if ($status{$urn}->{'status'}->{'status'} eq "dead") {
$upaggs{$urn} = $status{$urn}->{'status'}->{'stamp'};
}
$status{$urn}->{'status'}->{'status'} = "up";
$status{$urn}->{'status'}->{'stamp'} = undef;
}
}
}
#
# And send email.
#
if (keys(%downbosses)) {
my $subject = "Portal Boss Nodes are " .
($dailymail ? "still " : "") . "unreachable";
my $body = "";
foreach my $urn (keys(%downbosses)) {
my $when = $downbosses{$urn};
my $boss = $aggregates->{$urn}->{'boss'};
next $body .= "${boss}: is unreachable since ".
if (time() - $last < $DOWN_THRESHOLD); TBDateStringLocal($when) . "\n";
}
NotifySlack($body);
SENDMAIL($TBOPS, $subject, $body, $TBOPS);
}
if (keys(%upbosses)) {
my $subject = "Portal Boss Nodes are back online";
my $body = "";
# Only once per day or once per event. foreach my $urn (keys(%upbosses)) {
next my $when = $upbosses{$urn};
if (exists($lastmail{$urn}) && !$dailymail); my $boss = $aggregates->{$urn}->{'boss'};
$downmail{$urn} = $aggregate; $body .= "${boss}: is now online " . "\n";
$lastmail{$urn} = time();
} }
else { NotifySlack($body);
if (exists($lastmail{$urn})) { SENDMAIL($TBOPS, $subject, $body, $TBOPS);
$upmail{$urn} = $aggregate; }
delete($lastmail{$urn}); if (keys(%downaggs)) {
} my $subject = "Portal Aggregates are " .
($dailymail ? "still " : "") . "offlne";
my $body = "";
foreach my $urn (keys(%downaggs)) {
my $when = $downaggs{$urn};
my $boss = $aggregates->{$urn}->{'boss'};
$body .= "${boss}: CM is offline since ".
TBDateStringLocal($when) . "\n";
} }
NotifySlack($body);
SENDMAIL($TBOPS, $subject, $body, $TBOPS);
} }
if (keys(%upmail)) { if (keys(%upaggs)) {
my $subject = "Portal Aggregates are alive"; my $subject = "Portal Aggregates are back online";
my $body = "${subject}:\n\n"; my $body = "";
foreach my $urn (keys(%upaggs)) {
my $when = $upaggs{$urn};
my $boss = $aggregates->{$urn}->{'boss'};
foreach my $aggregate (values(%upmail)) { $body .= "${boss}: CM is now online " . "\n";
$body .= $aggregate->name() . ": is now online " . "\n";
} }
NotifySlack($body);
SENDMAIL($TBOPS, $subject, $body, $TBOPS); SENDMAIL($TBOPS, $subject, $body, $TBOPS);
} }
if (keys(%downmail)) { if (keys(%downdns)) {
my $subject = "Portal Aggregates are " . my $subject = "Portal DNS servers are " .
($dailymail ? "still " : "") . "offline"; ($dailymail ? "still " : "") . "offline";
my $body = "${subject}:\n\n"; my $body = "";
foreach my $urn (keys(%downdns)) {
my $when = $downdns{$urn};
my $boss = $aggregates->{$urn}->{'boss'};
$body .= "${boss}: DNS is offline since ".
TBDateStringLocal($when) . "\n";
}
NotifySlack($body);
SENDMAIL($TBOPS, $subject, $body, $TBOPS);
}
if (keys(%updns)) {
my $subject = "Portal DNS servers are back online";
my $body = "";
foreach my $aggregate (values(%downmail)) { foreach my $urn (keys(%updns)) {
my $status = $aggregate->status(); my $when = $updns{$urn};
my $boss = $aggregates->{$urn}->{'boss'};
$body .= "${boss}: DNS is now online " . "\n";
}
NotifySlack($body);
SENDMAIL($TBOPS, $subject, $body, $TBOPS);
}
if (keys(%downnodes)) {
my $subject = "Portal Nodes are " .
($dailymail ? "still " : "") . "unreachable";
my $body = "";
foreach my $urn (keys(%downnodes)) {
foreach my $hostname (keys(%{$downnodes{$urn}})) {
my $when = $downnodes{$urn}->{$hostname};
$body .= $aggregate->name() . ": is $status since ". $body .= "${hostname}: is unreachable since ".
TBDateStringLocal($aggregate->last_success()) . "\n"; TBDateStringLocal($when) . "\n";
}
$body .= "\n";
}
NotifySlack($body);
SENDMAIL($TBOPS, $subject, $body, $TBOPS);
}
if (keys(%upnodes)) {
my $subject = "Portal Nodes are back online";
my $body = "";
foreach my $urn (keys(%upnodes)) {
foreach my $hostname (keys(%{$upnodes{$urn}})) {
my $when = $upnodes{$urn}->{$hostname};