Commit d7c4230e authored by Leigh Stoller's avatar Leigh Stoller

Bring the cluster monitor "in-house", rather than depending on the jfed

monitoring system.

New portal_monitor daemon does a GetVersion/ListResources call at each
of the clusters every five minutes, and updates the new table in the
DB called apt_aggregate_status. We calculate free/inuse counts for
physical nodes and a free count for VMs. Failure to contact the
aggregate for more than 10 minutes marks the aggregate as down, since
from our perspective if we cannot get to it, the cluster is down.

Unlike the jfed monitoring system, we are not going to try to
instantiate a new experiment or ssh into it. Wait and see if that is
necessary in our context.

On the instantiate page, generate a json structure for each cluster,
similar to the one described in issue #172 by Keith. This way we can easily
switch the existing code over to this new system, but fall back to the
old mechanism if this turns out to be a bust.

Some other related changes to how we hand clusters to the various web
pages.
parent 1fd592b5
......@@ -35,6 +35,7 @@ use vars qw(@ISA @EXPORT $AUTOLOAD);
# Must come after package declaration!
use emdb;
use emutil;
use GeniHRN;
use overload ('""' => 'Stringify');
......@@ -43,17 +44,19 @@ my $TB = "@prefix@";
my $TBOPS = "@TBOPSEMAIL@";
my $OURDOMAIN = "@OURDOMAIN@";
# Protos
sub STATUS($$;$);
#
# Lookup by uuid.
#
sub Lookup($$)
{
my ($class, $token) = @_;
my $safe_urn = DBQuoteSpecial($token);
my $query_result;
if (GeniHRN::IsValid($token)) {
my $safe_urn = DBQuoteSpecial($token);
$query_result =
DBQueryWarn("select * from apt_aggregates where urn=$safe_urn");
}
......@@ -66,6 +69,25 @@ sub Lookup($$)
my $self = {};
$self->{'AGGREGATE'} = $query_result->fetchrow_hashref();
#
# Look to see if there is a status row. Create it if it does not exist.
#
$query_result =
DBQueryWarn("select * from apt_aggregate_status where urn=$safe_urn");
return undef
if (!$query_result);
if (!$query_result->numrows) {
DBQueryWarn("replace into apt_aggregate_status set ".
" urn=$safe_urn, status='down'");
$query_result =
DBQueryWarn("select * from apt_aggregate_status ".
"where urn=$safe_urn");
return undef
if (!$query_result);
}
$self->{'STATUS'} = $query_result->fetchrow_hashref();
bless($self, $class);
return $self;
}
......@@ -80,6 +102,17 @@ AUTOLOAD {
if (exists($self->{'AGGREGATE'}->{$name})) {
return $self->{'AGGREGATE'}->{$name};
}
elsif (exists($self->{'STATUS'}->{$name})) {
#
# We always want to go to the DB for this.
#
if (scalar(@_) == 2) {
return STATUS($self, $name, $_[1]);
}
else {
return STATUS($self, $name);
}
}
carp("No such slot '$name' field in class $type");
return undef;
}
......@@ -125,5 +158,35 @@ sub Stringify($)
return "[APT_Aggregate: $urn]";
}
#
# Read or update one column of this aggregate's apt_aggregate_status row.
#
# Called with ($self, $name) it returns the value cached in the object;
# no query is issued on that path. Called with ($self, $name, $newval)
# it writes the new value to the DB first, and replaces the cached copy
# only when that update succeeded.
#
# Returns the stored value, or undef if the update query failed.
#
sub STATUS($$;$)
{
    my ($self, $name, $newval) = @_;

    if (!defined($newval)) {
        return $self->{'STATUS'}->{$name};
    }
    #
    # Convenience; the timestamp columns may be given as a bare integer
    # (callers pass time()), which is turned into a date string here.
    #
    if (($name eq "last_success" || $name eq "last_attempt") &&
        $newval =~ /^\d+$/) {
        $newval = TBDateStringLocal($newval);
    }
    #
    # Quote the urn the same way Lookup() does instead of interpolating
    # it raw between single quotes.
    #
    my $safe_urn = DBQuoteSpecial($self->urn());
    my $safe_val = DBQuoteSpecial($newval);

    DBQueryWarn("update apt_aggregate_status set ${name}=$safe_val ".
                "where urn=$safe_urn")
        or return undef;

    $self->{'STATUS'}->{$name} = $newval;
    return $self->{'STATUS'}->{$name};
}
# _Always_ make sure that this 1 is at the end of the file...
1;
......@@ -33,7 +33,8 @@ SUBDIRS =
BIN_SCRIPTS = manage_profile manage_instance manage_dataset \
create_instance rungenilib ns2rspec nsgenilib.py \
rspec2genilib ns2genilib manage_reservations
SBIN_SCRIPTS = apt_daemon aptevent_daemon portal_xmlrpc apt_checkup
SBIN_SCRIPTS = apt_daemon aptevent_daemon portal_xmlrpc apt_checkup \
portal_monitor
LIB_SCRIPTS = APT_Profile.pm APT_Instance.pm APT_Dataset.pm APT_Geni.pm \
APT_Aggregate.pm APT_Utility.pm
WEB_BIN_SCRIPTS = webmanage_profile webmanage_instance webmanage_dataset \
......
#!/usr/bin/perl -w
#
# Copyright (c) 2008-2016 University of Utah and the Flux Group.
#
# {{{GENIPUBLIC-LICENSE
#
# GENI Public License
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and/or hardware specification (the "Work") to
# deal in the Work without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Work, and to permit persons to whom the Work
# is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Work.
#
# THE WORK IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE WORK OR THE USE OR OTHER DEALINGS
# IN THE WORK.
#
# }}}
#
use strict;
use English;
use Getopt::Std;
use Data::Dumper;
use File::Basename;
use Compress::Zlib;
use MIME::Base64;
use Date::Parse;
#
# Contact all clusters and get status.
#
sub usage()
{
    # Flags: -d debug, -s single pass (oneshot), -n impotent.
    my $synopsis = "Usage: portal_monitor [-d] [-s] [-n]\n";

    print $synopsis;
    exit(1);
}
# Option letters handed to getopts(); all three are simple flags.
my $optlist = "dns";
# Set by -d.
my $debug = 0;
# Set by -n. NOTE(review): not read anywhere else in the visible code.
my $impotent = 0;
# Set by -s; do a single pass over the aggregates and exit.
my $oneshot = 0;
# Debugging
# When nonzero, devurl() rewrites aggregate URLs to a developer tree.
my $usemydevtree = 0;
sub devurl($)
{
    my ($url) = @_;

    # Only rewrite when the debugging switch is on; otherwise the URL
    # is handed back untouched.
    return $url
        if (!$usemydevtree);

    # Point the first "protogeni" path component at the developer tree.
    $url =~ s{protogeni}{protogeni/stoller};
    # $url =~ s/12369/12396/;
    return $url;
}
#
# Configure variables
#
# NOTE(review): the @...@ tokens look like build-time substitution
# placeholders (note that @TBMAINSITE@ is deliberately unquoted) — confirm.
my $TB = "@prefix@";
my $TBOPS = "@TBOPSEMAIL@";
my $TBLOGS = "@TBLOGSEMAIL@";
my $MAINSITE = @TBMAINSITE@;
# Log file used when the daemon backgrounds itself and on SIGHUP reopen.
my $LOGFILE = "$TB/log/portal_monitor.log";
my $PROTOUSER = "elabman";
my $SUDO = "/usr/local/bin/sudo";
my $WGET = "/usr/local/bin/wget";
# Seconds to sleep between passes of the main loop.
my $SLEEP_INTERVAL = 300;
# NOTE(review): $DAILY_INTERVAL, $PROTOUSER, $SUDO, $WGET, $TBLOGS and
# $MAINSITE are not referenced in the visible part of this script.
my $DAILY_INTERVAL = 24 * 3600;
# un-taint path
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin:/usr/site/bin';
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};
# Protos
sub fatal($);
#
# Turn off line buffering on output
#
$| = 1;
# Refuse to run unless the real uid is root.
if ($UID != 0) {
fatal("Must be root to run this script\n");
}
#
# Check args early so we get the right DB.
#
my %options = ();
if (! getopts($optlist, \%options)) {
usage();
}
# -d: debug; the daemon stays in the foreground, prints each aggregate
# URL, and fatal() does not send mail.
if (defined($options{"d"})) {
$debug = 1;
}
# -s: oneshot; run CheckAggregates() once and exit, skipping the daemon
# bookkeeping.
if (defined($options{"s"})) {
$oneshot = 1;
}
# -n: impotent. NOTE(review): the flag is recorded but nothing in the
# visible code consults it — confirm whether it should suppress DB updates.
if (defined($options{"n"})) {
$impotent = 1;
}
# Load the Testbed support stuff.
use lib "@prefix@/lib";
use emdb;
use libtestbed;
use emutil;
use libEmulab;
use APT_Aggregate;
use APT_Geni;
use Genixmlrpc;
use GeniResponse;
use GeniCredential;
use GeniXML;
use POSIX qw(strftime ceil);
# Daemon bookkeeping is skipped entirely for a oneshot (-s) run.
if (!$oneshot) {
# NOTE(review): fatal() calls MarkDaemonStopped() whenever -s is not
# given, including on this path where another daemon is already
# running — confirm that does not disturb the running daemon's state.
if (CheckDaemonRunning("portal_monitor")) {
fatal("Not starting another portal_monitor daemon!");
}
# Go to ground.
if (! $debug) {
# NOTE(review): presumably TBBackGround() returns true in the parent,
# which exits here while the child carries on with output going to
# $LOGFILE — confirm against libtestbed.
if (TBBackGround($LOGFILE)) {
exit(0);
}
}
if (MarkDaemonRunning("portal_monitor")) {
fatal("Could not mark daemon as running!");
}
}
# Context handed to every XMLRPC call made by CheckAggregates(); its
# certificate is also used there to generate the credential.
my $context = APT_Geni::GeniContext();
fatal("Could not load our XMLRPC context")
if (!defined($context));
#
# Setup a signal handler for newsyslog.
#
sub handler()
{
    # Reopen the log with an effective uid of root, then put the
    # previous effective uid back.
    my $saved_euid = $EUID;

    $EUID = 0;
    ReOpenLog($LOGFILE);
    $EUID = $saved_euid;
}
# Only a backgrounded daemon has a log file to reopen on SIGHUP.
if (!$debug && !$oneshot) {
    $SIG{HUP} = \&handler;
}
#
# Poll every monitored aggregate once and record the outcome in its
# apt_aggregate_status row.
#
# For each aggregate with nomonitor=0 we call GetVersion (short timeout)
# and then ListResources (long timeout), decode the returned
# advertisement, and store three counters:
#
#   pcount - raw-pc sliver types found on the advertised nodes
#   pfree  - how many of those are on nodes that are available now
#   vcount - pcvm slots on available, non-exclusive emulab-xen nodes
#
# Every failure goes through $markError, which stores the message in
# last_error and picks between "unknown" and "down".
#
# Returns 0 when the pass ran (or there was nothing to do), and -1 when
# the aggregate list or our credential could not be obtained.
#
sub CheckAggregates()
{
    # Seconds without a successful contact before we call it "down".
    my $DOWN_THRESHOLD = 600;

    my $query_result =
        DBQueryWarn("select urn from apt_aggregates ".
                    "where nomonitor=0");
    # The query itself can fail; do not dereference a false result.
    return -1
        if (!$query_result);
    return 0
        if (!$query_result->numrows);

    my $credential = APT_Geni::GenAuthCredential($context->certificate());
    if (!defined($credential)) {
        print STDERR "Could not generate credential!\n";
        return -1;
    }
    #
    # AM V3 API.
    #
    my @params = ([{"geni_type"    => "geni_sfa",
                    "geni_version" => 3,
                    "geni_value"   => $credential->asString()},
                  ],
                  # Options array.
                  {"geni_compressed"    => 1,
                   "geni_rspec_version" => {'type'    => 'GENI',
                                            'version' => '3'}}
        );

    #
    # Record an error against an aggregate and decide its status. Takes
    # the aggregate object and the message, in that order.
    #
    my $markError = sub {
        my ($aggregate, $error) = @_;

        $aggregate->last_error($error);
        print STDERR $error . "\n";

        #
        # Decide if aggregate should be marked as down. An aggregate we
        # have never reached is down right away.
        #
        my $last_success = $aggregate->last_success();
        if (!defined($last_success)) {
            $aggregate->status("down");
            return;
        }
        my $attempt_time = str2time($aggregate->last_attempt());
        my $success_time = str2time($last_success);

        # Unparsable timestamps are treated the same as "too long ago".
        if (defined($attempt_time) && defined($success_time) &&
            $attempt_time - $success_time < $DOWN_THRESHOLD) {
            $aggregate->status("unknown");
        }
        else {
            $aggregate->status("down");
        }
    };

  AGGREGATE:
    while (my ($urn) = $query_result->fetchrow_array()) {
        my $aggregate = APT_Aggregate->Lookup($urn);
        if (!defined($aggregate)) {
            print STDERR "Could not lookup aggregate: $urn\n";
            next AGGREGATE;
        }
        my $nickname = $aggregate->nickname();

        #
        # Mark as trying to contact. Done before anything that can fail
        # so that $markError always compares against this attempt.
        #
        $aggregate->last_attempt(time());

        my $authority = APT_Geni::GetAuthority($urn);
        if (!defined($authority)) {
            $markError->($aggregate, "Could not lookup authority: $urn");
            next AGGREGATE;
        }
        my $cmurl = $authority->url();
        # Convert URL.
        $cmurl =~ s/\/cm$/\/am/;
        $cmurl = devurl($cmurl) if ($usemydevtree);
        $cmurl .= "/3.0";

        if ($debug) {
            print "$nickname -> $cmurl\n";
        }

        #
        # Do a quick test to see if we can even get there.
        #
        Genixmlrpc->SetTimeout(10);
        my $response =
            Genixmlrpc::CallMethod($cmurl, $context, "GetVersion");

        if ($response->code() != GENIRESPONSE_SUCCESS) {
            $markError->($aggregate,
                         "$nickname GetVersion error: " .
                         $response->output());
            next AGGREGATE;
        }

        #
        # This can take some time on a big cluster, which is why we
        # did the GetVersion above, cause we know that will be fast,
        # so its a good initial check.
        #
        Genixmlrpc->SetTimeout(180);
        $response =
            Genixmlrpc::CallMethod($cmurl, $context, "ListResources", @params);

        if ($response->code() != GENIRESPONSE_SUCCESS) {
            $markError->($aggregate,
                         "$nickname ListResources error: ".
                         $response->output());
            next AGGREGATE;
        }
        if ($debug > 1) {
            print $response->value() . "\n";
        }

        #
        # Decode and decompress. uncompress() reports failure by
        # returning undef rather than dying, so check both.
        #
        my $decoded = eval { decode_base64($response->value()); };
        if ($@ || !defined($decoded)) {
            $markError->($aggregate,
                         "$nickname: Could not base64 decode response");
            next AGGREGATE;
        }
        my $xml = eval { uncompress($decoded); };
        if ($@ || !defined($xml)) {
            $markError->($aggregate,
                         "$nickname: Could not uncompress response");
            next AGGREGATE;
        }
        if ($debug > 1) {
            print $xml . "\n";
        }
        my $manifest = GeniXML::Parse($xml);
        if (!defined($manifest)) {
            $markError->($aggregate, "$nickname: Could not parse manifest");
            next AGGREGATE;
        }

        #
        # Mark that we could get the advertisement. Also mark it as up.
        #
        $aggregate->last_success($aggregate->last_attempt());
        $aggregate->status("up");
        $aggregate->last_error("");

        my $pcount = 0;
        my $pavail = 0;
        my $vcount = 0;

        foreach my $ref (GeniXML::FindNodes("n:node",
                                            $manifest)->get_nodelist()) {
            # Only needed by the commented-out debugging print below.
            my $node_id = GeniXML::GetNodeId($ref);

            #
            # Need to search the sliver types for raw-pc.
            #
          SLIVER_TYPE:
            foreach my $sref (GeniXML::FindNodes("n:sliver_type",
                                                 $ref)->get_nodelist()) {
                my $name = GeniXML::GetText("name", $sref);
                next SLIVER_TYPE
                    if (!defined($name));

                if ($name eq "raw-pc") {
                    $pcount++;
                    $pavail++
                        if (GeniXML::IsAvailable($ref));
                }
                elsif ($name eq "emulab-xen") {
                    my $exclusive = GeniXML::GetExclusive($ref);
                    #print "$node_id, $exclusive\n";

                    # Shared nodes are marked as not exclusive.
                    next SLIVER_TYPE
                        if (!defined($exclusive) || $exclusive);
                    # And they are available.
                    next SLIVER_TYPE
                        if (!GeniXML::IsAvailable($ref));

                    #
                    # We need the pcvm type to find the slots.
                    #
                  HARDWARE_TYPE:
                    foreach my $htype (GeniXML::FindNodes("n:hardware_type",
                                                $ref)->get_nodelist()) {
                        my $hname = GeniXML::GetText("name", $htype);
                        next HARDWARE_TYPE
                            if (!defined($hname) || $hname ne "pcvm");

                        my $ntype =
                            GeniXML::FindNodesNS("n:node_type",
                                                 $htype,
                                                 $GeniXML::EMULAB_NS)->pop();
                        next HARDWARE_TYPE
                            if (!$ntype);

                        my $slots = GeniXML::GetText("type_slots", $ntype);
                        next HARDWARE_TYPE
                            if (!defined($slots) || $slots !~ /^\d+$/);

                        $vcount += $slots;
                    }
                }
            }
        }
        print "$nickname: pcount:$pcount, pfree:$pavail, vcount:$vcount\n";
        $aggregate->pcount($pcount);
        $aggregate->pfree($pavail);
        $aggregate->vcount($vcount);
    }
    return 0;
}
#
# A oneshot (-s) run does a single pass and exits.
#
if ($oneshot) {
    CheckAggregates();
    exit(0);
}
#
# Daemon mode: one pass every $SLEEP_INTERVAL seconds, forever. While
# NoLogins() is true we poll it every five seconds instead of checking
# the aggregates. The loop never terminates on its own, so nothing
# follows it.
#
while (1) {
    if (NoLogins()) {
        sleep(5);
        next;
    }

    # %Y gives the full year rather than hard-coding the century.
    print "Running at ".
        POSIX::strftime("%Y-%m-%d %H:%M:%S", localtime()) . "\n";

    CheckAggregates();
    sleep($SLEEP_INTERVAL);
}
sub fatal($)
{
    my ($msg) = @_;

    # Interactive runs (oneshot or debug) report on the terminal only.
    my $interactive = ($oneshot || $debug);

    #
    # Send a message to the testbed list.
    #
    SENDMAIL($TBOPS, "portal_monitor died", $msg, $TBOPS)
        if (!$interactive);

    # A oneshot run never marked itself as running.
    if (!$oneshot) {
        MarkDaemonStopped("portal_monitor");
    }
    die("*** $0:\n".
        " $msg\n");
}
......@@ -793,6 +793,20 @@ sub GetVirtualizationSubtype($)
return $result;
}
#
# Return 1 when the node has an n:available element whose "now" text is
# "true" (compared case-insensitively), and 0 in every other case.
#
sub IsAvailable($)
{
    my ($node) = @_;

    my $available = FindFirst("n:available", $node);
    return 0
        if (!defined($available));

    my $now = GetText("now", $available);
    return 0
        if (!defined($now));

    return (lc($now) eq "true" ? 1 : 0);
}
sub GetDiskImage($)
{
my ($node) = @_;
......
......@@ -68,6 +68,23 @@ CREATE TABLE `apt_aggregate_nodetypes` (
PRIMARY KEY (`urn`,`type`)
) ENGINE=MyISAM DEFAULT CHARSET=latin1;
--
-- Table structure for table `apt_aggregate_status`
--
-- One row per aggregate, keyed by urn, holding the latest monitoring
-- results:
--   status        up, down or unknown; new rows default to unknown.
--   last_attempt  time of the most recent attempt; NULL until set.
--   last_success  time of the most recent success; NULL until set.
--   pcount/pfree  node counts, defaulting to 0.
--   vcount        VM count, defaulting to 0.
--   last_error    free-form text of the most recent error.
DROP TABLE IF EXISTS `apt_aggregate_status`;
CREATE TABLE `apt_aggregate_status` (
`urn` varchar(128) NOT NULL default '',
`status` enum("up","down","unknown") NOT NULL default "unknown",
`last_success` datetime default NULL,
`last_attempt` datetime default NULL,
`pcount` int(11) default '0',
`pfree` int(11) default '0',
`vcount` int(11) default '0',
`last_error` text,
PRIMARY KEY (`urn`)
) ENGINE=MyISAM DEFAULT CHARSET=latin1;
--
-- Table structure for table `apt_aggregates`
--
......@@ -82,6 +99,7 @@ CREATE TABLE `apt_aggregates` (
`isfederate` tinyint(1) NOT NULL default '0',
`disabled` tinyint(1) NOT NULL default '0',
`noupdate` tinyint(1) NOT NULL default '0',
`nomonitor` tinyint(1) NOT NULL default '0',
`updated` datetime NOT NULL default '0000-00-00 00:00:00',
`weburl` tinytext,
`has_datasets` tinyint(1) NOT NULL default '0',
......
use strict;
use libdb;
#
# Schema update: add apt_aggregates.nomonitor and create the
# apt_aggregate_status table. Each step is skipped when it has already
# been applied. Always returns 0; a failing query is fatal.
#
sub DoUpdate($$$)
{
    my ($dbhandle, $dbname, $version) = @_;

    my $have_nomonitor = DBSlotExists("apt_aggregates", "nomonitor");
    if (!$have_nomonitor) {
        my $alter_sql = join("",
            "alter table apt_aggregates add ",
            " `nomonitor` tinyint(1) NOT NULL default '0' ",
            " after noupdate");

        DBQueryFatal($alter_sql);
        # Seed the new column from the existing noupdate setting.
        DBQueryFatal("update apt_aggregates set nomonitor=noupdate");
    }

    my $have_status_table = DBTableExists("apt_aggregate_status");
    if (!$have_status_table) {
        my $create_sql = join("",
            "CREATE TABLE `apt_aggregate_status` ( ",
            " `urn` varchar(128) NOT NULL default '', ",
            " `status` enum('up','down','unknown') ",
            " NOT NULL default 'unknown', ",
            " `last_success` datetime default NULL, ",
            " `last_attempt` datetime default NULL, ",
            " `pcount` int(11) default '0', ",
            " `pfree` int(11) default '0', ",
            " `vcount` int(11) default '0', ",
            " `last_error` text, ",
            " PRIMARY KEY (`urn`) ",
            ") ENGINE=MyISAM DEFAULT CHARSET=latin1");

        DBQueryFatal($create_sql);
    }
    return 0;
}
# Local Variables:
# mode:perl
# End:
......@@ -30,6 +30,7 @@ class Aggregate
{
var $aggregate;
var $typeinfo;
var $statusinfo;
#
# Constructor by lookup by urn
......@@ -58,6 +59,16 @@ class Aggregate
$this->typeinfo[$type] = array("count" => $row["count"],
"free" => $row["free"]);
}
#
# And the status info.
#
$query_result =
DBQueryWarn("select * from apt_aggregate_status ".
"where urn='$safe_urn'");
if ($query_result || mysql_num_rows($query_result)) {
$this->statusinfo = mysql_fetch_array($query_result);
}
}
# accessors
function field($name) {
......@@ -73,6 +84,18 @@ class Aggregate
function isfederate() { return $this->field('isfederate'); }
function portals() { return $this->field('portals'); }
# accessors for the status info.
# Fetch one column of the status row; null when no status row was loaded.
function sfield($name) {
    if (is_null($this->statusinfo)) {
        return null;
    }
    return $this->statusinfo[$name];
}
# One thin accessor per status column; each defers to sfield().
function status() {
    return $this->sfield('status');
}
function last_success() {
    return $this->sfield('last_success');
}
function last_attempt() {
    return $this->sfield('last_attempt');
}
function pcount() {
    return $this->sfield('pcount');
}
function pfree() {
    return $this->sfield('pfree');
}
function vcount() {
    return $this->sfield('vcount');
}
function last_error() {
    return $this->sfield('last_error');
}
# Hmm, how does one cause an error in a php constructor?
function IsValid() {
return !is_null($this->aggregate);
......@@ -171,13 +194,16 @@ class Aggregate
while ($row = mysql_fetch_array($query_result)) {
$urn = $row["urn"];
$name = $row["name"];
$adminonly = $row["adminonly"];
if ($adminonly && !(ISADMIN() || STUDLY())) {
continue;
}
$am_array[$name] = $urn;
if (! ($aggregate = Aggregate::Lookup($urn))) {
TBERROR("Aggregate::SupportsReservations: ".
"Could not load aggregate $urn!", 1);
}
$am_array[$urn] = $aggregate;
}
return $am_array;
}
......
......@@ -564,9 +564,16 @@ function CheckStep2()
return -1;
}
$formfields = $ajax_args["formfields"];
$am_array = Instance::DefaultAggregateList();
$amlist = Instance::DefaultAggregateList();
$am_array = array();
$errors = array();
while (list($index, $aggregate) = each($amlist)) {
$urn = $aggregate->urn();
$name = $aggregate->name();
$am_array[$name] = $urn;
}
session_start();
#
# The initial page load did profile checking, this is just a
......@@ -786,11 +793,18 @@ function Do_Submit()
return;
}
$formfields = $ajax_args["formfields"];
$am_array = Instance::DefaultAggregateList();
$amlist = Instance::DefaultAggregateList();
$am_array = array();
$errors = array();
$args = array("portal" => $PORTAL_GENESIS);
$profile = Profile::Lookup($formfields["profile"]);
while (list($index, $aggregate) = each($amlist)) {
$urn = $aggregate->urn();
$name = $aggregate->name();
$am_array[$name] = $urn;
}
#
# SSH keys are optional for guest users; they just have to
# use the web based ssh window.
......
......@@ -48,6 +48,8 @@ elseif (!$ISAPT) {
RedirectLoginPage();
}
error_log("A: " . time());
#
# Verify page arguments.
#
......@@ -150,6 +152,8 @@ else {
$profile_array = array();
$am_array = Instance::DefaultAggregateList();
error_log("B: " . time());
#
# if using the super secret URL, make sure the profile exists, and
# add to the array now since it might not be public or belong to the user.
......@@ -292,6 +296,8 @@ else {
}
}
error_log("C: " . time());
#
# Rebuild the array with extra info for the profile picker.
#
......@@ -314,6 +320,8 @@ while (list ($uuid, $title) = each ($profile_array)) {
"usecount" => $count);
}
}
error_log("D: " . time());
#
# Now we want to order the list.
#
......@@ -334,6 +342,8 @@ else {
});
}
$profile_array = $tmp_array;
error_log("E: " . time());