Commit 9f3205c9 authored by Leigh B Stoller's avatar Leigh B Stoller

More work on the aggregate monitoring.

1. Split the resource stuff (where we ask for an advertisement and
   process it) into a separate script, since that takes a long time to
   cycle through because of the size of the ads from the big clusters.

2. On the monitor, distinguish offline (nologins) from actually being
   down.

3. Add a table to store changes in status so we can see over time how
   much time the aggregates are usable.
parent 9320782d
......@@ -207,6 +207,21 @@ sub STATUS($$;$)
return $self->{'STATUS'}->{$name};
}
#
# Insert a status (change) event.
#
# Adds one row to apt_aggregate_events holding this aggregate's urn, the
# given event name and the current time (stamp=now()).
#
# Arguments: $event - the new status name.
# Returns:   0 on success, -1 if the event name is improper or the
#            insert fails.
#
sub StatusEvent($$)
{
    my ($self, $event) = @_;

    #
    # Protect DB. The event name is interpolated directly into the query
    # text, so refuse anything that is not a plain word before we touch
    # the database.
    #
    if (!defined($event) || $event !~ /\A[-\w]+\z/) {
        print STDERR "StatusEvent: improper event name\n";
        return -1;
    }
    # NOTE(review): the urn is taken from the object itself and is still
    # interpolated unquoted -- confirm it can never contain a quote.
    my $urn = $self->urn();

    DBQueryWarn("insert into apt_aggregate_events set ".
                " urn='$urn', event='$event', stamp=now()")
        or return -1;

    return 0;
}
#
# Lookup all aggregates for a portal.
#
......
......@@ -36,7 +36,7 @@ BIN_SCRIPTS = manage_profile manage_instance manage_dataset \
manage_images rtecheck checkprofile manage_extensions \
create_slivers searchip
SBIN_SCRIPTS = apt_daemon aptevent_daemon portal_xmlrpc apt_checkup \
portal_monitor apt_scheduler
portal_monitor apt_scheduler portal_resources
LIB_SCRIPTS = APT_Profile.pm APT_Instance.pm APT_Dataset.pm APT_Geni.pm \
APT_Aggregate.pm APT_Utility.pm APT_Rspec.pm
WEB_BIN_SCRIPTS = webmanage_profile webmanage_instance webmanage_dataset \
......
......@@ -70,10 +70,8 @@ my $TBOPS = "@TBOPSEMAIL@";
my $TBLOGS = "@TBLOGSEMAIL@";
my $MAINSITE = @TBMAINSITE@;
my $LOGFILE = "$TB/log/portal_monitor.log";
my $PROTOUSER = "elabman";
my $SUDO = "/usr/local/bin/sudo";
my $WGET = "/usr/local/bin/wget";
my $SLEEP_INTERVAL = 300;
my $SLEEP_INTERVAL = 60;
my $DOWN_THRESHOLD = 120;
my $DAILY_INTERVAL = 24 * 3600;
# un-taint path
......@@ -123,7 +121,7 @@ use GeniCredential;
use GeniXML;
use POSIX qw(strftime ceil);
if (!$oneshot) {
if (! ($oneshot || $impotent)) {
if (CheckDaemonRunning("portal_monitor")) {
fatal("Not starting another portal_monitor daemon!");
}
......@@ -141,6 +139,11 @@ my $context = APT_Geni::GeniContext();
fatal("Could not load our XMLRPC context")
if (!defined($context));
#
# We want this to be a quick test, not a long timeout.
#
Genixmlrpc->SetTimeout(10);
#
# Setup a signal handler for newsyslog.
#
......@@ -172,53 +175,6 @@ sub CheckAggregates()
print STDERR "Could not generate credential!\n";
return -1;
}
#
# AM V3 API.
#
my @params = ([{"geni_type" => "geni_sfa",
"geni_version" => 3,
"geni_value" => $credential->asString()},
],
# Options array.
{"geni_compressed" => 1,
"geni_rspec_version" => {'type' => 'GENI',
'version' => '3'}}
);
my $markError = sub($$) {
my ($aggregate, $error) = @_;
my $nickname = $aggregate->nickname();
print STDERR "$nickname: $error\n";
#
# Decide if aggregate should be marked as down.
#
my $status;
# Note that this field is not defined. I think I intended to.
if (1 || !defined($aggregate->last_contact())) {
$status = "down";
}
else {
my $last_attempt = str2time($aggregate->last_attempt());
my $last_contact = str2time($aggregate->last_contact());
if ($last_attempt - $last_contact < 600) {
$status = "unknown";
}
else {
$status = "down";
}
}
if ($impotent) {
print STDERR "Would mark $aggregate as $status\n";
}
else {
$aggregate->status($status);
$aggregate->last_error($error);
}
};
while (my ($urn) = $query_result->fetchrow_array()) {
my $aggregate = APT_Aggregate->Lookup($urn);
......@@ -229,14 +185,11 @@ sub CheckAggregates()
my $nickname = $aggregate->nickname();
my $authority = APT_Geni::GetAuthority($urn);
if (!defined($authority)) {
&$markError($aggregate, "Could not lookup authority: $urn");
print STDERR "Could not lookup authority: $urn\n";
next;
}
my $cmurl = $authority->url();
# Convert URL.
$cmurl =~ s/\/cm$/\/am/;
$cmurl = devurl($cmurl) if ($usemydevtree);
$cmurl .= "/3.0";
if ($debug) {
print "$nickname -> $cmurl\n";
......@@ -247,216 +200,54 @@ sub CheckAggregates()
$aggregate->last_attempt(time())
if (!$impotent);
#
# Do a quick test to see if we can even get there.
#
Genixmlrpc->SetTimeout(10);
my $response =
Genixmlrpc::CallMethod($cmurl, $context, "GetVersion");
if ($response->code() != GENIRESPONSE_SUCCESS) {
&$markError($aggregate,
"GetVersion error: " . $response->output());
next;
}
my $nickname = $aggregate->nickname();
my $reason = $response->output();
print STDERR "$nickname: $reason\n";
#
# This can take some time on a big cluster, which is why we
# did the GetVersion above, cause we know that will be fast,
# so its a good initial check.
# Decide how to mark the aggregate.
#
Genixmlrpc->SetTimeout(180);
$response =
Genixmlrpc::CallMethod($cmurl, $context, "ListResources", @params);
my $status;
if ($response->code() != GENIRESPONSE_SUCCESS) {
&$markError($aggregate,
"ListResources error: ". $response->output());
next;
if ($response->code() == GENIRESPONSE_SERVER_UNAVAILABLE()) {
$status = "offline";
}
if ($debug > 1) {
print $response->value() . "\n";
else {
$status = "down";
}
#
# Decode and decompress.
#
my $decoded = eval { decode_base64($response->value()); };
if ($@) {
&$markError($aggregate, "Could not base64 decode response");
next;
if ($impotent) {
print STDERR "Would mark $aggregate as $status\n";
}
my $xml = eval { uncompress($decoded); };
if ($@) {
&$markError($aggregate, "Could not uncompress response");
next;
else {
if ($aggregate->status() ne $status) {
$aggregate->status($status);
$aggregate->StatusEvent($status);
}
if ($debug > 1) {
print $xml . "\n";
$aggregate->last_error($reason);
}
my $manifest = GeniXML::Parse($xml);
if (!defined($manifest)) {
&$markError($aggregate, "Could not parse manifest");
next;
}
#
# Mark that we could get the advertisement. Also mark it as up.
# Not sure about when to mark it down though.
# Mark that we could get the status. Also mark it as up.
#
if ($impotent) {
print "Would mark $aggregate as up\n";
}
else {
$aggregate->last_success($aggregate->last_attempt());
$aggregate->status("up");
$aggregate->last_error("");
}
#
# Get the list of reservable types. Need to be backwards compat
# here until all clusters updated with this element. See below.
#
my $reservable_types;
if (my $ref = GeniXML::FindNodesNS("n:reservable_types",
$manifest,
$GeniXML::EMULAB_NS)->pop()) {
$reservable_types = {};
foreach my $t (GeniXML::FindNodesNS("n:type", $ref,
$GeniXML::EMULAB_NS)->get_nodelist()) {
my $typename = GeniXML::GetText("name", $t);
$reservable_types->{$typename} = $typename;
}
}
my $pcount = 0;
my $pavail = 0;
my $vcount = 0;
my $vfree = 0;
my %type_count = ();
my %type_avail = ();
foreach my $ref (GeniXML::FindNodes("n:node",
$manifest)->get_nodelist()) {
my $node_id = GeniXML::GetNodeId($ref);
#
# Need to search the sliver types for raw-pc.
#
foreach my $sref (GeniXML::FindNodes("n:sliver_type",
$ref)->get_nodelist()) {
my $name = GeniXML::GetText("name", $sref);
if (defined($name)) {
if ($name eq "raw-pc") {
$pcount++;
$pavail++
if (GeniXML::IsAvailable($ref));
foreach my $htype (FindNodes("n:hardware_type",
$ref)->get_nodelist()) {
my $hname = GeniXML::GetText("name", $htype);
if (defined($reservable_types)) {
next
if (!exists($reservable_types->{$hname}));
}
else {
# This can be deleted when clusters updated.
next
if (!defined($hname) || $hname eq "" ||
$hname eq "pcvm" || $hname eq "pc" ||
$hname =~ /^delay/ ||
# Protect DB.
$hname !~ /^[-\w]+$/);
}
my $ntype =
GeniXML::FindNodesNS("n:node_type",
$htype,
$GeniXML::EMULAB_NS)->pop();
next
if (!$ntype);
my $slots = GeniXML::GetText("type_slots", $ntype);
next
if (!defined($slots) || $slots !~ /^\d+$/);
next
if ($slots > 1);
if (!exists($type_count{$hname})) {
$type_count{$hname} = 0;
$type_avail{$hname} = 0;
}
$type_count{$hname} += 1;
$type_avail{$hname} += 1
if (GeniXML::IsAvailable($ref));
if ($aggregate->status() ne "up") {
$aggregate->status("up");
$aggregate->StatusEvent("up");
}
}
elsif ($name eq "emulab-xen") {
my $exclusive = GeniXML::GetExclusive($ref);
#print "$node_id, $exclusive\n";
# Shared nodes are marked as not exclusive.
next
if (!defined($exclusive) || $exclusive);
# And they are available.
next
if (!GeniXML::IsAvailable($ref));
#
# We need the pcvm type to find the slots.
#
foreach my $htype (FindNodes("n:hardware_type",
$ref)->get_nodelist()) {
my $hname = GeniXML::GetText("name", $htype);
next
if (!defined($hname) || $hname ne "pcvm");
my $ntype =
GeniXML::FindNodesNS("n:node_type",
$htype,
$GeniXML::EMULAB_NS)->pop();
next
if (!$ntype);
my $slots = GeniXML::GetText("type_slots", $ntype);
next
if (!defined($slots) || $slots !~ /^\d+$/);
#
# Yuck, we do not get the total available on
# shared node, only how many still avail. Kludge
# it for now.
#
$vcount += 50;
$vfree += $slots;
}
}
}
}
}
print "$nickname: pcount:$pcount, pfree:$pavail, ".
"vcount:$vcount vfree:$vfree\n";
foreach my $type (keys(%type_count)) {
my $count = $type_count{$type};
my $avail = $type_avail{$type};
if ($debug || $impotent) {
print "$type $count:$avail\n";
}
if (!$impotent) {
DBQueryWarn("replace into apt_aggregate_nodetypes set ".
" urn='$urn',type='$type',".
" count='$count',free='$avail'");
}
}
if (!$impotent) {
$aggregate->pcount($pcount);
$aggregate->pfree($pavail);
$aggregate->vcount($vcount);
$aggregate->vfree($vfree);
}
}
return 0;
}
......@@ -476,7 +267,7 @@ sub SendEmail()
#
# We send a summary email once every 24 hours. Maybe do this at a
# set tim of day?
# set time of day?
#
if (time() - $lastdaily >= (24 * 3600)) {
$dailymail = 1;
......@@ -499,16 +290,13 @@ sub SendEmail()
# probably knows by the time this message turns up.
#
next
if ($aggregate->IsLocalCluster());
if ($aggregate->IsLocalCluster() && !$debug);
if ($aggregate->status() eq "down") {
if ($aggregate->status() ne "up") {
my $last = str2time($aggregate->last_success());
#
# At least $DOWN_THRESHOLD seconds (which is two checks above).
#
next
if (time() - $last < 600);
if (time() - $last < $DOWN_THRESHOLD);
# Only once per day or once per event.
next
......@@ -517,7 +305,7 @@ sub SendEmail()
$downmail{$urn} = $aggregate;
$lastmail{$urn} = time();
}
elsif ($aggregate->status() eq "up") {
else {
if (exists($lastmail{$urn})) {
$upmail{$urn} = $aggregate;
delete($lastmail{$urn});
......@@ -529,8 +317,7 @@ sub SendEmail()
my $body = "${subject}:\n\n";
foreach my $aggregate (values(%upmail)) {
$body .= $aggregate->name() . ": was offline since ".
TBDateStringLocal($aggregate->last_success()) . "\n";
$body .= $aggregate->name() . ": is now online " . "\n";
}
SENDMAIL($TBOPS, $subject, $body, $TBOPS);
}
......@@ -540,7 +327,9 @@ sub SendEmail()
my $body = "${subject}:\n\n";
foreach my $aggregate (values(%downmail)) {
$body .= $aggregate->name() . ": is offline since ".
my $status = $aggregate->status();
$body .= $aggregate->name() . ": is $status since ".
TBDateStringLocal($aggregate->last_success()) . "\n";
}
SENDMAIL($TBOPS, $subject, $body, $TBOPS);
......@@ -548,7 +337,7 @@ sub SendEmail()
}
while (1) {
if (NoLogins()) {
if (0 && NoLogins()) {
sleep(5);
next;
}
......
#!/usr/bin/perl -w
#
# Copyright (c) 2008-2018 University of Utah and the Flux Group.
#
# {{{GENIPUBLIC-LICENSE
#
# GENI Public License
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and/or hardware specification (the "Work") to
# deal in the Work without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Work, and to permit persons to whom the Work
# is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Work.
#
# THE WORK IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE WORK OR THE USE OR OTHER DEALINGS
# IN THE WORK.
#
# }}}
#
use strict;
use English;
use Getopt::Std;
use Data::Dumper;
use File::Basename;
use Compress::Zlib;
use MIME::Base64;
use Date::Parse;
#
# Contact all clusters and get resource availability, for the web UI.
#
#
# Print the command line synopsis and exit with status 1.
#
sub usage()
{
    my $synopsis = "Usage: portal_resources [-d] [-s] [-n]\n";

    print $synopsis;
    exit(1);
}
# Option letters handed to getopts(); none of them takes a value.
my $optlist = "dns";
# Set by -d. When set we stay in the foreground (no TBBackGround) and no
# HUP handler is installed.
my $debug = 0;
# Set by -n. Skips the daemon running check/mark below.
# NOTE(review): presumably also a dry-run mode that avoids DB updates --
# confirm against the rest of the script.
my $impotent = 0;
# Set by -s. Skips the daemon running check/mark and the HUP handler.
my $oneshot = 0;
# Debugging
# Debugging switch: when set, devurl() rewrites URLs; when clear it is
# a no-op.
my $usemydevtree = 0;

#
# Return the given URL, with the first "protogeni" replaced by
# "protogeni/stoller" when $usemydevtree is set. Otherwise the URL is
# returned untouched.
#
sub devurl($)
{
    my ($url) = @_;

    return $url
        unless ($usemydevtree);

    $url =~ s/protogeni/protogeni\/stoller/;
    # $url =~ s/12369/12396/;
    return $url;
}
#
# Configure variables
#
# NOTE(review): the @...@ tokens below look like placeholders that are
# substituted when the script is installed -- confirm. Until then the
# $MAINSITE line is not valid Perl.
my $TB = "@prefix@";
my $TBOPS = "@TBOPSEMAIL@";
my $TBLOGS = "@TBLOGSEMAIL@";
my $MAINSITE = @TBMAINSITE@;
# Log file handed to TBBackGround() and reopened by the HUP handler.
my $LOGFILE = "$TB/log/portal_resources.log";
# NOTE(review): presumably seconds between passes over the aggregates;
# the code that uses these two is not visible here.
my $SLEEP_INTERVAL = 300;
my $DAILY_INTERVAL = 24 * 3600;
# un-taint path: use a fixed PATH and drop the environment variables that
# can change how a shell behaves.
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin:/usr/site/bin';
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};
# Protos
sub fatal($);
#
# Turn off output buffering: with $| set, the currently selected output
# handle is flushed after every write.
#
$| = 1;
# $UID is the real uid (English name for $<); only root may run this.
if ($UID != 0) {
fatal("Must be root to run this script\n");
}
#
# Check args early so we get the right DB.
#
# Parse the command line; any unknown option prints the usage and exits.
my %options = ();
usage()
    if (! getopts($optlist, \%options));

# Each flag simply turns on the matching global.
$debug    = 1 if (defined($options{"d"}));
$oneshot  = 1 if (defined($options{"s"}));
$impotent = 1 if (defined($options{"n"}));
# Load the Testbed support stuff.
use lib "@prefix@/lib";
use emdb;
use libtestbed;
use emutil;
use libEmulab;
use APT_Aggregate;
use APT_Geni;
use Genixmlrpc;
use GeniResponse;
use GeniCredential;
use GeniXML;
use POSIX qw(strftime ceil);
# Daemon bookkeeping is skipped entirely for one-shot (-s) and impotent
# (-n) runs, so those can be started while the real daemon is running.
if (! ($oneshot || $impotent)) {
# Refuse to start a second copy.
if (CheckDaemonRunning("portal_resources")) {
fatal("Not starting another portal_resources daemon!");
}
# Go to ground (detach and log to $LOGFILE), unless debugging.
if (! $debug) {
# NOTE(review): presumably TBBackGround() returns true in the parent,
# which exits here while the child carries on -- confirm.
if (TBBackGround($LOGFILE)) {
exit(0);
}
}
# Done after going to ground, and a non-zero return is fatal.
# NOTE(review): presumably this records the pid of the process that keeps
# running -- confirm.
if (MarkDaemonRunning("portal_resources")) {
fatal("Could not mark daemon as running!");
}
}
# Context used for the XMLRPC calls; its certificate is also what
# CheckAggregates() passes to GenAuthCredential(). Nothing can be done
# without it, so failure to load it is fatal.
my $context = APT_Geni::GeniContext();
fatal("Could not load our XMLRPC context")
if (!defined($context));
#
# Setup a signal handler for newsyslog.
#
sub handler()
{
    # Remember who we are so we can drop back afterwards.
    my $saved_euid = $EUID;

    # Reopen the log file with an effective uid of root.
    $EUID = 0;
    ReOpenLog($LOGFILE);
    $EUID = $saved_euid;
}
# Only install the handler when running as a real daemon.
unless ($debug || $oneshot) {
    $SIG{HUP} = \&handler;
}
#
# Request an advertisement.
#
sub CheckAggregates()
{
my $query_result =
DBQueryWarn("select urn from apt_aggregates ".
"where nomonitor=0");
return 0
if (!$query_result->numrows);
my $credential = APT_Geni::GenAuthCredential($context->certificate());
if (!defined($credential)) {
print STDERR "Could not generate credential!\n";
return -1;
}
#
# AM V3 API.
#
my @params = ([{"geni_type" => "geni_sfa",
"geni_version" => 3,
"geni_value" => $credential->asString()},
],
# Options array.
{"geni_compressed" => 1,
"geni_rspec_version" => {'type' => 'GENI',
'version' => '3'}}
);
my $markError = sub($$) {
my ($aggregate, $error) = @_;
my $nickname = $aggregate->nickname();
print STDERR "$nickname: $error\n";
};
while (my ($urn) = $query_result->fetchrow_array()) {
my $aggregate = APT_Aggregate->Lookup($urn);
if (!defined($aggregate)) {
print STDERR "Could not lookup aggregate: $urn\n";
next;
}
my $nickname = $aggregate->nickname();
my $authority = APT_Geni::GetAuthority($urn);
if (!defined($authority)) {
&$markError($aggregate, "Could not lookup authority: $urn");
next;
}
my $cmurl = $authority->url();
# Convert URL.
$cmurl =~ s/\/cm$/\/am/;
$cmurl = devurl($cmurl) if ($usemydevtree);
$cmurl .= "/3.0";