Commit 9f3205c9 authored by Leigh B Stoller

More work on the aggregate monitoring.

1. Split the resource stuff (where we ask for an advertisement and
   process it) into a separate script, since that takes a long time to
   cycle through because of the size of the ads from the big clusters.

2. On the monitor, distinguish offline (nologins) from actually being
   down.

3. Add a table to store changes in status so we can see over time how
   much time the aggregates are usable.
parent 9320782d
...@@ -207,6 +207,21 @@ sub STATUS($$;$) ...@@ -207,6 +207,21 @@ sub STATUS($$;$)
return $self->{'STATUS'}->{$name}; return $self->{'STATUS'}->{$name};
} }
#
# Insert a status (change) event.
#
sub StatusEvent($$)
{
    my ($self, $event) = @_;
    my $urn = $self->urn();

    # Record the event against this aggregate's URN; the database supplies
    # the timestamp via now().
    # NOTE(review): $urn and $event are interpolated into the statement
    # as-is; confirm neither can ever contain a quote character.
    my $result =
	DBQueryWarn("insert into apt_aggregate_events set ".
		    " urn='$urn', event='$event', stamp=now()");

    # Returns 0 when the insert went through, -1 when the query failed.
    return ($result ? 0 : -1);
}
# #
# Lookup all aggregates for a portal. # Lookup all aggregates for a portal.
# #
......
...@@ -36,7 +36,7 @@ BIN_SCRIPTS = manage_profile manage_instance manage_dataset \ ...@@ -36,7 +36,7 @@ BIN_SCRIPTS = manage_profile manage_instance manage_dataset \
manage_images rtecheck checkprofile manage_extensions \ manage_images rtecheck checkprofile manage_extensions \
create_slivers searchip create_slivers searchip
SBIN_SCRIPTS = apt_daemon aptevent_daemon portal_xmlrpc apt_checkup \ SBIN_SCRIPTS = apt_daemon aptevent_daemon portal_xmlrpc apt_checkup \
portal_monitor apt_scheduler portal_monitor apt_scheduler portal_resources
LIB_SCRIPTS = APT_Profile.pm APT_Instance.pm APT_Dataset.pm APT_Geni.pm \ LIB_SCRIPTS = APT_Profile.pm APT_Instance.pm APT_Dataset.pm APT_Geni.pm \
APT_Aggregate.pm APT_Utility.pm APT_Rspec.pm APT_Aggregate.pm APT_Utility.pm APT_Rspec.pm
WEB_BIN_SCRIPTS = webmanage_profile webmanage_instance webmanage_dataset \ WEB_BIN_SCRIPTS = webmanage_profile webmanage_instance webmanage_dataset \
......
...@@ -70,10 +70,8 @@ my $TBOPS = "@TBOPSEMAIL@"; ...@@ -70,10 +70,8 @@ my $TBOPS = "@TBOPSEMAIL@";
my $TBLOGS = "@TBLOGSEMAIL@"; my $TBLOGS = "@TBLOGSEMAIL@";
my $MAINSITE = @TBMAINSITE@; my $MAINSITE = @TBMAINSITE@;
my $LOGFILE = "$TB/log/portal_monitor.log"; my $LOGFILE = "$TB/log/portal_monitor.log";
my $PROTOUSER = "elabman"; my $SLEEP_INTERVAL = 60;
my $SUDO = "/usr/local/bin/sudo"; my $DOWN_THRESHOLD = 120;
my $WGET = "/usr/local/bin/wget";
my $SLEEP_INTERVAL = 300;
my $DAILY_INTERVAL = 24 * 3600; my $DAILY_INTERVAL = 24 * 3600;
# un-taint path # un-taint path
...@@ -123,7 +121,7 @@ use GeniCredential; ...@@ -123,7 +121,7 @@ use GeniCredential;
use GeniXML; use GeniXML;
use POSIX qw(strftime ceil); use POSIX qw(strftime ceil);
if (!$oneshot) { if (! ($oneshot || $impotent)) {
if (CheckDaemonRunning("portal_monitor")) { if (CheckDaemonRunning("portal_monitor")) {
fatal("Not starting another portal_monitor daemon!"); fatal("Not starting another portal_monitor daemon!");
} }
...@@ -141,6 +139,11 @@ my $context = APT_Geni::GeniContext(); ...@@ -141,6 +139,11 @@ my $context = APT_Geni::GeniContext();
fatal("Could not load our XMLRPC context") fatal("Could not load our XMLRPC context")
if (!defined($context)); if (!defined($context));
#
# We want this to be a quick test, not a long timeout.
#
Genixmlrpc->SetTimeout(10);
# #
# Setup a signal handler for newsyslog. # Setup a signal handler for newsyslog.
# #
...@@ -172,53 +175,6 @@ sub CheckAggregates() ...@@ -172,53 +175,6 @@ sub CheckAggregates()
print STDERR "Could not generate credential!\n"; print STDERR "Could not generate credential!\n";
return -1; return -1;
} }
#
# AM V3 API.
#
my @params = ([{"geni_type" => "geni_sfa",
"geni_version" => 3,
"geni_value" => $credential->asString()},
],
# Options array.
{"geni_compressed" => 1,
"geni_rspec_version" => {'type' => 'GENI',
'version' => '3'}}
);
my $markError = sub($$) {
my ($aggregate, $error) = @_;
my $nickname = $aggregate->nickname();
print STDERR "$nickname: $error\n";
#
# Decide if aggregate should be marked as down.
#
my $status;
# Note that this field is not defined. I think I intended to.
if (1 || !defined($aggregate->last_contact())) {
$status = "down";
}
else {
my $last_attempt = str2time($aggregate->last_attempt());
my $last_contact = str2time($aggregate->last_contact());
if ($last_attempt - $last_contact < 600) {
$status = "unknown";
}
else {
$status = "down";
}
}
if ($impotent) {
print STDERR "Would mark $aggregate as $status\n";
}
else {
$aggregate->status($status);
$aggregate->last_error($error);
}
};
while (my ($urn) = $query_result->fetchrow_array()) { while (my ($urn) = $query_result->fetchrow_array()) {
my $aggregate = APT_Aggregate->Lookup($urn); my $aggregate = APT_Aggregate->Lookup($urn);
...@@ -229,14 +185,11 @@ sub CheckAggregates() ...@@ -229,14 +185,11 @@ sub CheckAggregates()
my $nickname = $aggregate->nickname(); my $nickname = $aggregate->nickname();
my $authority = APT_Geni::GetAuthority($urn); my $authority = APT_Geni::GetAuthority($urn);
if (!defined($authority)) { if (!defined($authority)) {
&$markError($aggregate, "Could not lookup authority: $urn"); print STDERR "Could not lookup authority: $urn\n";
next; next;
} }
my $cmurl = $authority->url(); my $cmurl = $authority->url();
# Convert URL.
$cmurl =~ s/\/cm$/\/am/;
$cmurl = devurl($cmurl) if ($usemydevtree); $cmurl = devurl($cmurl) if ($usemydevtree);
$cmurl .= "/3.0";
if ($debug) { if ($debug) {
print "$nickname -> $cmurl\n"; print "$nickname -> $cmurl\n";
...@@ -247,216 +200,54 @@ sub CheckAggregates() ...@@ -247,216 +200,54 @@ sub CheckAggregates()
$aggregate->last_attempt(time()) $aggregate->last_attempt(time())
if (!$impotent); if (!$impotent);
#
# Do a quick test to see if we can even get there.
#
Genixmlrpc->SetTimeout(10);
my $response = my $response =
Genixmlrpc::CallMethod($cmurl, $context, "GetVersion"); Genixmlrpc::CallMethod($cmurl, $context, "GetVersion");
if ($response->code() != GENIRESPONSE_SUCCESS) { if ($response->code() != GENIRESPONSE_SUCCESS) {
&$markError($aggregate, my $nickname = $aggregate->nickname();
"GetVersion error: " . $response->output()); my $reason = $response->output();
next;
}
#
# This can take some time on a big cluster, which is why we
# did the GetVersion above, cause we know that will be fast,
# so its a good initial check.
#
Genixmlrpc->SetTimeout(180);
$response =
Genixmlrpc::CallMethod($cmurl, $context, "ListResources", @params);
if ($response->code() != GENIRESPONSE_SUCCESS) { print STDERR "$nickname: $reason\n";
&$markError($aggregate,
"ListResources error: ". $response->output()); #
next; # Decide how to mark the aggregate.
} #
if ($debug > 1) { my $status;
print $response->value() . "\n";
}
# if ($response->code() == GENIRESPONSE_SERVER_UNAVAILABLE()) {
# Decode and decompress. $status = "offline";
# }
my $decoded = eval { decode_base64($response->value()); }; else {
if ($@) { $status = "down";
&$markError($aggregate, "Could not base64 decode response"); }
next; if ($impotent) {
} print STDERR "Would mark $aggregate as $status\n";
my $xml = eval { uncompress($decoded); }; }
if ($@) { else {
&$markError($aggregate, "Could not uncompress response"); if ($aggregate->status() ne $status) {
next; $aggregate->status($status);
} $aggregate->StatusEvent($status);
if ($debug > 1) { }
print $xml . "\n"; $aggregate->last_error($reason);
} }
my $manifest = GeniXML::Parse($xml);
if (!defined($manifest)) {
&$markError($aggregate, "Could not parse manifest");
next; next;
} }
# #
# Mark that we could get the advertisement. Also mark it as up. # Mark that we could get the status. Also mark it as up.
# Not sure about when to mark it down though.
# #
if ($impotent) { if ($impotent) {
print "Would mark $aggregate as up\n"; print "Would mark $aggregate as up\n";
} }
else { else {
$aggregate->last_success($aggregate->last_attempt()); $aggregate->last_success($aggregate->last_attempt());
$aggregate->status("up");
$aggregate->last_error(""); $aggregate->last_error("");
} if ($aggregate->status() ne "up") {
$aggregate->status("up");
# $aggregate->StatusEvent("up");
# Get the list of reservable types. Need to be backwards compat
# here until all clusters updated with this element. See below.
#
my $reservable_types;
if (my $ref = GeniXML::FindNodesNS("n:reservable_types",
$manifest,
$GeniXML::EMULAB_NS)->pop()) {
$reservable_types = {};
foreach my $t (GeniXML::FindNodesNS("n:type", $ref,
$GeniXML::EMULAB_NS)->get_nodelist()) {
my $typename = GeniXML::GetText("name", $t);
$reservable_types->{$typename} = $typename;
}
}
my $pcount = 0;
my $pavail = 0;
my $vcount = 0;
my $vfree = 0;
my %type_count = ();
my %type_avail = ();
foreach my $ref (GeniXML::FindNodes("n:node",
$manifest)->get_nodelist()) {
my $node_id = GeniXML::GetNodeId($ref);
#
# Need to search the sliver types for raw-pc.
#
foreach my $sref (GeniXML::FindNodes("n:sliver_type",
$ref)->get_nodelist()) {
my $name = GeniXML::GetText("name", $sref);
if (defined($name)) {
if ($name eq "raw-pc") {
$pcount++;
$pavail++
if (GeniXML::IsAvailable($ref));
foreach my $htype (FindNodes("n:hardware_type",
$ref)->get_nodelist()) {
my $hname = GeniXML::GetText("name", $htype);
if (defined($reservable_types)) {
next
if (!exists($reservable_types->{$hname}));
}
else {
# This can be deleted when clusters updated.
next
if (!defined($hname) || $hname eq "" ||
$hname eq "pcvm" || $hname eq "pc" ||
$hname =~ /^delay/ ||
# Protect DB.
$hname !~ /^[-\w]+$/);
}
my $ntype =
GeniXML::FindNodesNS("n:node_type",
$htype,
$GeniXML::EMULAB_NS)->pop();
next
if (!$ntype);
my $slots = GeniXML::GetText("type_slots", $ntype);
next
if (!defined($slots) || $slots !~ /^\d+$/);
next
if ($slots > 1);
if (!exists($type_count{$hname})) {
$type_count{$hname} = 0;
$type_avail{$hname} = 0;
}
$type_count{$hname} += 1;
$type_avail{$hname} += 1
if (GeniXML::IsAvailable($ref));
}
}
elsif ($name eq "emulab-xen") {
my $exclusive = GeniXML::GetExclusive($ref);
#print "$node_id, $exclusive\n";
# Shared nodes are marked as not exclusive.
next
if (!defined($exclusive) || $exclusive);
# And they are available.
next
if (!GeniXML::IsAvailable($ref));
#
# We need the pcvm type to find the slots.
#
foreach my $htype (FindNodes("n:hardware_type",
$ref)->get_nodelist()) {
my $hname = GeniXML::GetText("name", $htype);
next
if (!defined($hname) || $hname ne "pcvm");
my $ntype =
GeniXML::FindNodesNS("n:node_type",
$htype,
$GeniXML::EMULAB_NS)->pop();
next
if (!$ntype);
my $slots = GeniXML::GetText("type_slots", $ntype);
next
if (!defined($slots) || $slots !~ /^\d+$/);
#
# Yuck, we do not get the total available on
# shared node, only how many still avail. Kludge
# it for now.
#
$vcount += 50;
$vfree += $slots;
}
}
}
} }
} }
print "$nickname: pcount:$pcount, pfree:$pavail, ".
"vcount:$vcount vfree:$vfree\n";
foreach my $type (keys(%type_count)) {
my $count = $type_count{$type};
my $avail = $type_avail{$type};
if ($debug || $impotent) {
print "$type $count:$avail\n";
}
if (!$impotent) {
DBQueryWarn("replace into apt_aggregate_nodetypes set ".
" urn='$urn',type='$type',".
" count='$count',free='$avail'");
}
}
if (!$impotent) {
$aggregate->pcount($pcount);
$aggregate->pfree($pavail);
$aggregate->vcount($vcount);
$aggregate->vfree($vfree);
}
} }
return 0; return 0;
} }
...@@ -476,7 +267,7 @@ sub SendEmail() ...@@ -476,7 +267,7 @@ sub SendEmail()
# #
# We send a summary email once every 24 hours. Maybe do this at a # We send a summary email once every 24 hours. Maybe do this at a
# set tim of day? # set time of day?
# #
if (time() - $lastdaily >= (24 * 3600)) { if (time() - $lastdaily >= (24 * 3600)) {
$dailymail = 1; $dailymail = 1;
...@@ -499,16 +290,13 @@ sub SendEmail() ...@@ -499,16 +290,13 @@ sub SendEmail()
# probably knows by the time this message turns up. # probably knows by the time this message turns up.
# #
next next
if ($aggregate->IsLocalCluster()); if ($aggregate->IsLocalCluster() && !$debug);
if ($aggregate->status() eq "down") { if ($aggregate->status() ne "up") {
my $last = str2time($aggregate->last_success()); my $last = str2time($aggregate->last_success());
#
# At least 10 minutes (which is two checks above).
#
next next
if (time() - $last < 600); if (time() - $last < $DOWN_THRESHOLD);
# Only once per day or once per event. # Only once per day or once per event.
next next
...@@ -517,7 +305,7 @@ sub SendEmail() ...@@ -517,7 +305,7 @@ sub SendEmail()
$downmail{$urn} = $aggregate; $downmail{$urn} = $aggregate;
$lastmail{$urn} = time(); $lastmail{$urn} = time();
} }
elsif ($aggregate->status() eq "up") { else {
if (exists($lastmail{$urn})) { if (exists($lastmail{$urn})) {
$upmail{$urn} = $aggregate; $upmail{$urn} = $aggregate;
delete($lastmail{$urn}); delete($lastmail{$urn});
...@@ -529,8 +317,7 @@ sub SendEmail() ...@@ -529,8 +317,7 @@ sub SendEmail()
my $body = "${subject}:\n\n"; my $body = "${subject}:\n\n";
foreach my $aggregate (values(%upmail)) { foreach my $aggregate (values(%upmail)) {
$body .= $aggregate->name() . ": was offline since ". $body .= $aggregate->name() . ": is now online " . "\n";
TBDateStringLocal($aggregate->last_success()) . "\n";
} }
SENDMAIL($TBOPS, $subject, $body, $TBOPS); SENDMAIL($TBOPS, $subject, $body, $TBOPS);
} }
...@@ -540,7 +327,9 @@ sub SendEmail() ...@@ -540,7 +327,9 @@ sub SendEmail()
my $body = "${subject}:\n\n"; my $body = "${subject}:\n\n";
foreach my $aggregate (values(%downmail)) { foreach my $aggregate (values(%downmail)) {
$body .= $aggregate->name() . ": is offline since ". my $status = $aggregate->status();
$body .= $aggregate->name() . ": is $status since ".
TBDateStringLocal($aggregate->last_success()) . "\n"; TBDateStringLocal($aggregate->last_success()) . "\n";
} }
SENDMAIL($TBOPS, $subject, $body, $TBOPS); SENDMAIL($TBOPS, $subject, $body, $TBOPS);
...@@ -548,7 +337,7 @@ sub SendEmail() ...@@ -548,7 +337,7 @@ sub SendEmail()
} }
while (1) { while (1) {
if (NoLogins()) { if (0 && NoLogins()) {
sleep(5); sleep(5);
next; next;
} }
......
#!/usr/bin/perl -w
#
# Copyright (c) 2008-2018 University of Utah and the Flux Group.
#
# {{{GENIPUBLIC-LICENSE
#
# GENI Public License
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and/or hardware specification (the "Work") to
# deal in the Work without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Work, and to permit persons to whom the Work
# is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Work.
#
# THE WORK IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE WORK OR THE USE OR OTHER DEALINGS
# IN THE WORK.
#
# }}}
#
use strict;
use English;
use Getopt::Std;
use Data::Dumper;
use File::Basename;
use Compress::Zlib;
use MIME::Base64;
use Date::Parse;
#
# Contact all clusters and get resource availability, for the web UI.
#
#
# Print the command line synopsis on the default output handle and exit
# with status 1. Does not return.
#
sub usage()
{
    my $synopsis = "Usage: portal_resources [-d] [-s] [-n]\n";

    print $synopsis;
    exit(1);
}
# Option letters handed to getopts(): -d, -n and -s, none taking a value.
my $optlist = "dns";
# Command line flags, all off until the corresponding option is seen.
my ($debug, $impotent, $oneshot) = (0, 0, 0);
# Debugging: when set to a true value, devurl() rewrites the URLs it is given.
my $usemydevtree = 0;

#
# Return the URL to use for the given URL. With $usemydevtree off the
# argument comes back unchanged; with it on, the first "protogeni" in the
# URL becomes "protogeni/stoller".
#
sub devurl($)
{
    my ($url) = @_;

    # Flag is off: hand the URL back untouched.
    return $url
	if (!$usemydevtree);

    # No /g modifier, so only the first occurrence is rewritten.
    $url =~ s/protogeni/protogeni\/stoller/;
    # $url =~ s/12369/12396/;
    return $url;
}
#
# Configure variables
#
# NOTE(review): the @NAME@ tokens below are not plain Perl; presumably they
# are placeholders substituted when the script is generated at build time --
# confirm against the build system. Note that @TBMAINSITE@ is unquoted, so
# whatever replaces it must be a valid Perl expression on its own.
#
my $TB = "@prefix@";
my $TBOPS = "@TBOPSEMAIL@";
my $TBLOGS = "@TBLOGSEMAIL@";
my $MAINSITE = @TBMAINSITE@;
# Log file path, built from $TB.
my $LOGFILE = "$TB/log/portal_resources.log";
# NOTE(review): presumably the number of seconds to sleep between passes --
# confirm at the point of use.
my $SLEEP_INTERVAL = 300;
# 24 * 3600 = 86400, i.e. one day if the unit is seconds.
my $DAILY_INTERVAL = 24 * 3600;
# Un-taint the environment: drop the shell-sensitive variables and pin PATH
# to a fixed list of directories.
delete @ENV{qw(IFS CDPATH ENV BASH_ENV)};
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin:/usr/site/bin';

# Protos: forward declaration so fatal() can be called with parens below
# before its definition has been seen.
sub fatal($);

#
# Turn off output buffering: setting $| forces a flush after every write
# on the currently selected output handle.
#
$| = 1;

# Refuse to go any further unless we are running as root.
fatal("Must be root to run this script\n")
    if ($UID != 0);
#
# Check args early so we get the right DB. Any option getopts() rejects
# sends us to usage(), which exits.
#
my %options = ();
getopts($optlist, \%options)
    or usage();

# -d: debug, -s: oneshot, -n: impotent.
$debug    = 1 if (defined($options{"d"}));
$oneshot  = 1 if (defined($options{"s"}));
$impotent = 1 if (defined($options{"n"}));
# Load the Testbed support stuff.
use lib "@prefix@/lib";
use emdb;
use libtestbed;
use emutil;
use libEmulab;
use APT_Aggregate;
use APT_Geni;
use Genixmlrpc;
use GeniResponse;
use GeniCredential;
use GeniXML;
use POSIX qw(strftime ceil);
if (! ($oneshot || $impotent)) {
if (CheckDaemonRunning("portal_resources")) {
fatal("Not starting another portal_resources daemon!");
}