Commit 5e777be4 authored by Leigh B Stoller's avatar Leigh B Stoller

First try at saved max extensions, so that we can allow an experiment to
be extended when a flagged aggregate is down. Only FEs will be so marked;
since this only allows a single extension, we must reestablish contact
with the aggregate before another extension can be done. Work in
progress.
parent aaed675b
......@@ -437,5 +437,26 @@ sub CanInstantiate($$$$)
return 1;
}
#
# Find the instance, if any, that has an extension push pending on this
# aggregate (a row in apt_instance_aggregates for this URN whose
# extension_needpush is set). Returns whatever APT_Instance->Lookup()
# gives for the first matching uuid, or undef when the query fails or
# nothing is pending.
# Note that there is only one pending push per aggregate.
#
sub ExtensionPushPending($)
{
    my ($self) = @_;
    my $urn    = $self->urn();

    # Loaded at run time rather than with "use".
    require APT_Instance;

    my $pending =
        DBQueryWarn("select uuid from apt_instance_aggregates ".
                    "where aggregate_urn='$urn' ".
                    " and extension_needpush is not null");
    if (!$pending || !$pending->numrows) {
        return undef;
    }

    # Only the first row is looked at.
    my @row = $pending->fetchrow_array();
    return APT_Instance->Lookup($row[0]);
}
# _Always_ make sure that this 1 is at the end of the file...
1;
......@@ -56,6 +56,7 @@ use Brand;
use APT_Profile;
use APT_Aggregate;
use APT_Geni;
use APT_Utility;
use Genixmlrpc;
use GeniResponse;
use GeniCertificate;
......@@ -78,6 +79,7 @@ my $MANAGEDATASET = "$TB/bin/manage_dataset";
my $MANAGEIMAGES = "$TB/bin/manage_images";
my $GENIUSER = "geniuser";
my $MAINSITE = @TBMAINSITE@;
my $MYURN = "urn:publicid:IDN+${OURDOMAIN}+authority+cm";
my $PROTOGENI_LOCALUSER= @PROTOGENI_LOCALUSER@;
# Cache of instances to avoid regenerating them.
......@@ -284,6 +286,7 @@ sub Refresh($)
if (!$query_result || !$query_result->numrows);
$self->{'INSTANCE'} = $query_result->fetchrow_hashref();
$self->{'AGGREGATES'} = APT_Instance::Aggregate->LookupForInstance($self);
return 0;
}
......@@ -1926,6 +1929,191 @@ sub ResolveDefer($)
return 0;
}
#
# Store or clear the saved max extension for this instance.
#
#   $when - value handed to FROM_UNIXTIME(); the timestamp column is set
#           to now() alongside it. When undef, both maxextension and
#           maxextension_timestamp are reset to null.
#
# Returns 0 on success, -1 if the database update fails.
#
sub SetMaxExtension($$)
{
    my ($self, $when) = @_;
    my $uuid = $self->uuid();

    if (defined($when)) {
        my $stored =
            DBQueryWarn("update apt_instances set ".
                        " maxextension=FROM_UNIXTIME($when)," .
                        " maxextension_timestamp=now() ".
                        "where uuid='$uuid'");
        return ($stored ? 0 : -1);
    }

    my $cleared =
        DBQueryWarn("update apt_instances set ".
                    " maxextension=null,maxextension_timestamp=null ".
                    "where uuid='$uuid'");
    return ($cleared ? 0 : -1);
}
#
# Does the experiment do/need precalc. If any aggregates are doing precalc,
# we say the experiment does precalc. HOWEVER; the aggregates that are not
# marked as doing precalc, MUST BE REACHABLE. It is only the aggregates
# marked for precalc that are allowed to be unreachable.
#
# Returns 1 as soon as one aggregate in AggregateList() reports
# precalcmaxext(), 0 if none does (including an empty list).
#
sub DoesPrecalcMaxExtension($)
{
    my ($self) = @_;

    foreach my $sliver ($self->AggregateList()) {
        return 1
            if ($sliver->GetAptAggregate()->precalcmaxext());
    }
    return 0;
}
#
# Invalidate the saved max extension of this instance and of every other
# instance that shares one of its precalc aggregates.
#
# Returns 0 when there is nothing to do or the purge ran, -1 if the
# lookup query fails. The results of the individual SetMaxExtension()
# calls are not checked.
#
sub PurgeMaxExtension($)
{
    my ($self) = @_;
    my %precalcers = ();

    #
    # Collect the clusters this experiment uses that are marked for
    # precalc. If there are none, the experiment does not do precalc
    # and there is nothing to purge.
    #
    foreach my $sliver ($self->AggregateList()) {
        my $aggregate = $sliver->GetAptAggregate();

        $precalcers{$aggregate->urn()} = $aggregate
            if ($aggregate->precalcmaxext());
    }
    return 0
        if (!keys(%precalcers));

    # We can just clear this, the recalc script will regen if needed.
    $self->SetMaxExtension(undef);

    #
    # Now find all experiments using any of the precalcers and clear
    # their max extension too, since it is no longer valid. The daemon
    # will eventually do a new recalc for all experiments that need it.
    #
    my $clause = join(" or ", map("aggregate_urn='$_'", keys(%precalcers)));
    my $query_result =
        DBQueryWarn("select distinct uuid from apt_instance_aggregates ".
                    "where $clause");
    return -1
        if (!$query_result);

    while (my ($other_uuid) = $query_result->fetchrow_array()) {
        my $instance = APT_Instance->Lookup($other_uuid);
        next
            if (!defined($instance));
        $instance->SetMaxExtension(undef);
    }
    return 0;
}
#
# Given an instance, save the provided max extension if it is an instance
# that is doing precalc.
#
#   $newmax - value passed on to SetMaxExtension(); undef clears the
#             saved value.
#
# Returns 0 in every case except the undef path, which returns the result
# of SetMaxExtension(undef).
#
sub SaveMaxExtension($$)
{
    my ($self, $newmax) = @_;

    return 0
        if (!$self->DoesPrecalcMaxExtension());
    return $self->SetMaxExtension(undef)
        if (!defined($newmax));

    #
    # Okay, we cannot save the max extension if any of the aggregates
    # are in the process of provisioning a new experiment. This is a
    # critical section cause of the time it takes to provision an
    # experiment.
    #
    if (APT_Utility::ExtensionLock()) {
        print STDERR "Could not get max extension lock. Clearing to be safe\n";
        $self->SetMaxExtension(undef);
        return 0;
    }
    my $safe = 1;
    foreach my $sliver ($self->AggregateList()) {
        my $aggregate_urn = $sliver->aggregate_urn();
        my $query_result =
            DBQueryWarn("select uuid from apt_instance_aggregates ".
                        "where aggregate_urn='$aggregate_urn' and ".
                        " status='provisioning'");

        # A failed query is treated the same as a provisioning aggregate.
        if (!$query_result || $query_result->numrows) {
            $safe = 0;
            # One unsafe aggregate decides the outcome; stop querying so
            # the lock is not held longer than needed.
            last;
        }
    }
    if (!$safe) {
        print STDERR "Some aggregates are provisioning, clearing to be safe\n";
        $self->SetMaxExtension(undef);
    }
    else {
        $self->SetMaxExtension($newmax);
    }
    # Critical section end.
    APT_Utility::ExtensionUnlock();
    return 0;
}
#
# Check if any aggregates of this instance are marked for an extension
# push. Returns the number of marked aggregates (zero when none are),
# or -1 if the database query fails.
#
sub ExtensionPushPending($)
{
    my ($self) = @_;
    my $uuid = $self->uuid();

    my $marked =
        DBQueryWarn("select aggregate_urn from apt_instance_aggregates ".
                    "where uuid='$uuid' and extension_needpush is not null");

    return ($marked ? $marked->numrows : -1);
}
#
# Class method: clear the precalculated max extension for all instances
# using the provided aggregate. Used from manage_reservations.
#
# Returns 0 once every row has been visited, -1 if the query fails.
# Rows whose instance cannot be looked up are skipped.
#
sub ClearAllMaxExtension($$)
{
    my ($class, $aggregate_urn) = @_;

    my $users =
        DBQueryWarn("select uuid from apt_instance_aggregates ".
                    "where aggregate_urn='$aggregate_urn'");
    if (!$users) {
        return -1;
    }

    while (my ($uuid) = $users->fetchrow_array()) {
        my $instance = APT_Instance->Lookup($uuid);

        if (defined($instance)) {
            $instance->SetMaxExtension(undef);
        }
    }
    return 0;
}
###################################################################
package APT_Instance::History;
use emdb;
......@@ -2225,8 +2413,6 @@ use vars qw($AUTOLOAD);
use overload ('""' => 'Stringify');
# Thin wrapper: forwards its first argument to APT_Instance::devurl() and
# returns whatever that call returns.
sub devurl($) { return APT_Instance::devurl($_[0]); }
my $MYURN = "urn:publicid:IDN+$OURDOMAIN+authority+cm";
#
# Lookup and create a class instance to return.
#
......@@ -2652,6 +2838,34 @@ sub AptAggregateName($)
return $self->GetAptAggregate()->name();
}
#
# Mark this instance/aggregate pair as needing an extension pushed out,
# by setting extension_needpush to now() in apt_instance_aggregates.
# Returns 0 on success, -1 if the update fails.
#
sub MarkExtensionPush($)
{
    my ($self) = @_;
    my $uuid = $self->uuid();
    my $urn  = $self->aggregate_urn();

    my $updated =
        DBQueryWarn("update apt_instance_aggregates set ".
                    " extension_needpush=now() ".
                    "where uuid='$uuid' and aggregate_urn='$urn'");

    return ($updated ? 0 : -1);
}
#
# Remove the pending extension push mark for this instance/aggregate
# pair, by resetting extension_needpush to null in
# apt_instance_aggregates.
# Returns 0 on success, -1 if the update fails.
#
sub ClearExtensionPush($)
{
    my ($self) = @_;
    my $uuid = $self->uuid();
    my $urn  = $self->aggregate_urn();

    my $updated =
        DBQueryWarn("update apt_instance_aggregates set ".
                    " extension_needpush=null ".
                    "where uuid='$uuid' and aggregate_urn='$urn'");

    return ($updated ? 0 : -1);
}
#
# Update the sliverstatus in the webtask.
#
......
......@@ -248,3 +248,20 @@ sub LookupAggregate($)
return undef;
}
#
# The precalculated max extension lock.
#
# Tries to take the named MySQL lock 'maxextlock', waiting up to 15
# seconds. Returns 0 when the lock was obtained and -1 otherwise (query
# failure, timeout, or an error reported by GET_LOCK).
#
sub ExtensionLock()
{
    my $lock_result = DBQueryWarn("select GET_LOCK('maxextlock', 15)");
    return -1
        if (!$lock_result || !$lock_result->numrows);

    #
    # The select yields a row whether or not the lock was granted, so the
    # value itself has to be inspected: 1 means obtained, 0 means the
    # wait timed out, NULL means an error occurred.
    #
    my ($granted) = $lock_result->fetchrow_array();
    return -1
        if (!defined($granted) || $granted != 1);

    return 0;
}
#
# Release the named MySQL lock 'maxextlock'. The value returned is
# whatever DBQueryWarn() returns for the RELEASE_LOCK query.
#
sub ExtensionUnlock()
{
    return DBQueryWarn("select RELEASE_LOCK('maxextlock')");
}
......@@ -38,7 +38,7 @@ BIN_SCRIPTS = manage_profile manage_instance manage_dataset \
SBIN_SCRIPTS = apt_daemon aptevent_daemon portal_xmlrpc apt_checkup \
portal_monitor apt_scheduler portal_resources \
manage_licenses manage_aggregate powder_shutdown \
rfmonitor_daemon aptimage_daemon
rfmonitor_daemon aptimage_daemon aptexpire_daemon recalcmaxext
LIB_SCRIPTS = APT_Profile.pm APT_Instance.pm APT_Dataset.pm APT_Geni.pm \
APT_Aggregate.pm APT_Utility.pm APT_Rspec.pm
WEB_BIN_SCRIPTS = webmanage_profile webmanage_instance webmanage_dataset \
......
......@@ -247,56 +247,6 @@ sub FixFailedImaging()
}
}
#
# Expire instances.
#
# Selects instances in the 'ready' state whose slice expiration time has
# passed, leaving out anything that is admin locked down, slice locked
# down or paniced, and runs manage_instance to terminate each of them.
# Returns nothing; a failed query just ends the pass.
#
sub ExpireInstances()
{
my $query_result =
DBQueryWarn("select a.uuid,s.expires from apt_instances as a ".
"left join geni.geni_slices as s on s.uuid=a.slice_uuid ".
"where (a.status='ready') and ".
" a.admin_lockdown=0 and s.lockdown=0 and ".
" a.paniced=0 and ".
" (UNIX_TIMESTAMP(now()) > ".
" UNIX_TIMESTAMP(s.expires))");
return
if (!$query_result);
while (my ($uuid,$expires) = $query_result->fetchrow_array()) {
my $instance = APT_Instance->Lookup($uuid);
if (!defined($instance)) {
print STDERR "No such instance $uuid\n";
next;
}
print STDERR "$uuid expired at $expires\n";
#
# Try to terminate the instance. We cannot take the lock since
# we are going to call manage_instance to do the termination.
# So, manage_instance might collide with the sa_daemon which
# locks the underlying slice, but if that happens we will just
# try again after a short wait. If it still fails, then
# something is wrong and we will notify.
#
# In impotent mode only report what would have been done.
if ($impotent) {
print STDERR "Would try to terminate $instance\n";
next;
}
# Use debug option to keep it from going into the background.
my $output = emutil::ExecQuiet("$SUDO -u $PROTOUSER ".
"$MANAGEINSTANCE -d terminate $uuid -e");
# Capture the exit status right away, before anything else can change $?.
my $status = $?;
print STDERR "$output\n"
if ($output ne "");
# A failed termination is currently ignored; nothing is sent.
if ($status) {
#
# Need to send email at some point.
#
}
}
}
#
# Warn about locked down instances that have expired.
#
......@@ -676,7 +626,6 @@ while (1) {
KillInstances();
FixFailedImaging();
ExpireInstances();
if ($MAINSITE) {
UpdateAggregateGraphs();
}
......
#!/usr/bin/perl -w
#
# Copyright (c) 2008-2019 University of Utah and the Flux Group.
#
# {{{GENIPUBLIC-LICENSE
#
# GENI Public License
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and/or hardware specification (the "Work") to
# deal in the Work without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Work, and to permit persons to whom the Work
# is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Work.
#
# THE WORK IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE WORK OR THE USE OR OTHER DEALINGS
# IN THE WORK.
#
# }}}
#
use strict;
use English;
use Getopt::Std;
use Data::Dumper;
use JSON;
use File::Basename;
#
# Look for APT things that need to be dealt with.
#
#
# Print the command line synopsis and exit with status 1.
#
#   -d  debug: stay in the foreground
#   -s  one shot: make a single pass and exit
#   -n  impotent: report what would be done without doing it
#
sub usage()
{
    print "Usage: aptexpire_daemon [-d] [-s] [-n]\n";
    exit(1);
}
# Option letters handed to getopts(): -d, -n and -s.
my $optlist = "dns";
# -d: stay in the foreground (skips TBBackGround and the HUP handler).
my $debug = 0;
# -n: only report what would be done instead of running manage_instance.
my $impotent = 0;
# -s: make a single pass and exit instead of looping as a daemon.
my $oneshot = 0;
#
# Configure variables
# (the @...@ tokens are presumably substituted at configure time, before
# the script is installed -- confirm against the build)
#
my $TB = "@prefix@";
my $TBOPS = "@TBOPSEMAIL@";
my $TBLOGS = "@TBLOGSEMAIL@";
my $MAINSITE = @TBMAINSITE@;
# Log file used when going into the background and reopened on SIGHUP.
my $LOGFILE = "$TB/log/aptexpire_daemon.log";
# External programs this daemon runs through sudo as $PROTOUSER.
my $MANAGEINSTANCE = "$TB/bin/manage_instance";
my $RECALC = "$TB/sbin/recalcmaxext";
my $PROTOUSER = "elabman";
my $SUDO = "/usr/local/bin/sudo";
# Seconds to sleep between passes of the main loop.
my $SLEEP_INTERVAL = 60;
# un-taint path
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin:/usr/site/bin';
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};
# Protos
sub fatal($);
#
# Turn off line buffering on output
# (setting $| to 1 makes output get flushed after every write)
#
$| = 1;
# $UID is the English module name for the real user id.
if ($UID != 0) {
fatal("Must be root to run this script\n");
}
#
# Check args early so we get the right DB.
#
my %options = ();
# An option that is not in $optlist prints the usage message and exits.
if (! getopts($optlist, \%options)) {
usage();
}
# -d: debug, stay in the foreground.
if (defined($options{"d"})) {
$debug = 1;
}
# -s: one shot, make a single pass and exit.
if (defined($options{"s"})) {
$oneshot = 1;
}
# -n: impotent, only report what would be done.
if (defined($options{"n"})) {
$impotent = 1;
}
# Load the Testbed support stuff.
use lib "@prefix@/lib";
use emdb;
use libtestbed;
use emutil;
use libEmulab;
use GeniResponse;
use APT_Instance;
use POSIX qw(strftime ceil);
# Daemon setup; all of it is skipped in one-shot (-s) mode.
if (!$oneshot) {
# Refuse to start when CheckDaemonRunning() reports an existing daemon.
if (CheckDaemonRunning("aptexpire_daemon")) {
fatal("Not starting another aptexpire daemon!");
}
# Go to ground.
if (! $debug) {
# A true return from TBBackGround() makes this process exit
# (presumably the parent side of the fork -- confirm in libtestbed).
if (TBBackGround($LOGFILE)) {
exit(0);
}
}
# A true return from MarkDaemonRunning() is treated as failure.
if (MarkDaemonRunning("aptexpire_daemon")) {
fatal("Could not mark daemon as running!");
}
}
#
# Setup a signal handler for newsyslog.
#
# The handler reopens $LOGFILE through ReOpenLog(). The effective uid is
# set to 0 for the duration of that call and restored to its previous
# value afterwards.
#
sub handler()
{
my $SAVEEUID = $EUID;
$EUID = 0;
ReOpenLog($LOGFILE);
$EUID = $SAVEEUID;
}
# Only installed for a background daemon, that is with neither -d nor -s.
$SIG{HUP} = \&handler
if (! ($debug || $oneshot));
#
# Expire instances.
#
# Finds every instance whose slice expiration time has passed, leaving
# out failed, admin locked down, slice locked down and paniced instances,
# and asks manage_instance to terminate each one. With the impotent flag
# set it only reports what it would have done.
#
sub ExpireInstances()
{
    my $expired =
        DBQueryWarn("select a.uuid,s.expires from apt_instances as a ".
                    "left join geni.geni_slices as s on s.uuid=a.slice_uuid ".
                    "where a.status!='failed' and ".
                    " a.admin_lockdown=0 and s.lockdown=0 and ".
                    " a.paniced=0 and ".
                    " (UNIX_TIMESTAMP(now()) > ".
                    " UNIX_TIMESTAMP(s.expires))");
    if (!$expired) {
        return;
    }

    while (my ($uuid,$expires) = $expired->fetchrow_array()) {
        my $instance = APT_Instance->Lookup($uuid);

        unless (defined($instance)) {
            print STDERR "No such instance $uuid\n";
            next;
        }
        print STDERR "$uuid expired at $expires\n";

        #
        # Try to terminate the instance. We cannot take the lock since
        # we are going to call manage_instance to do the termination.
        # So, manage_instance might collide with the sa_daemon which
        # locks the underlying slice, but if that happens we will just
        # try again after a short wait. If it still fails, then
        # something is wrong and we will notify.
        #
        if ($impotent) {
            print STDERR "Would try to terminate $instance\n";
            next;
        }

        # Use debug option to keep it from going into the background.
        my $command = "$SUDO -u $PROTOUSER ".
            "$MANAGEINSTANCE -d terminate $uuid -e";
        my $output      = emutil::ExecQuiet($command);
        my $exit_status = $?;

        if ($output ne "") {
            print STDERR "$output\n";
        }
        if ($exit_status) {
            #
            # Need to send email at some point.
            #
        }
    }
}
#
# Push pending extensions.
#
# For every instance that has at least one aggregate marked with
# extension_needpush, run "manage_instance pushextension". When that
# command exits non-zero, its output is mailed to the experiment creator
# with the brand's ops address as sender and CC.
# NOTE(review): assumes GetGeniUser() and Brand() always return objects
# for an instance that was found -- confirm.
#
sub PushExtensions()
{
    my $pending =
        DBQueryWarn("select distinct uuid from apt_instance_aggregates ".
                    "where extension_needpush is not null");
    if (!$pending) {
        return;
    }

    while (my ($uuid) = $pending->fetchrow_array()) {
        my $instance = APT_Instance->Lookup($uuid);

        unless (defined($instance)) {
            print STDERR "No such instance $uuid\n";
            next;
        }
        print STDERR "$uuid has pending extensions to push\n";

        if ($impotent) {
            print STDERR "Would try to push extensions for $instance\n";
            next;
        }

        my $command = "$SUDO -u $PROTOUSER ".
            "$MANAGEINSTANCE -d pushextension $uuid";
        my $output      = emutil::ExecQuiet($command);
        my $exit_status = $?;

        if ($output ne "") {
            print STDERR "$output\n";
        }
        # Nothing more to do when the push worked.
        next
            if (!$exit_status);

        my $creator = $instance->GetGeniUser();
        my $brand   = $instance->Brand();
        my $pid     = $instance->pid();
        my $name    = $instance->name();

        #
        # Bad news!
        #
        $brand->SendEmail($creator->email(),
                          "Extension failure for experiment $pid/$name",
                          "Could not push the delayed extension to some ".
                          "clusters in experiment $pid/$name\n\n".
                          $output,
                          $brand->OpsEmailAddress(),
                          "CC: " . $brand->OpsEmailAddress());
    }
}
#
# Call out to script that recalcs max extensions for experiments running
# on clusters that support stored max extensions.
#
# Does nothing unless $MAINSITE is set. Otherwise runs $RECALC through
# sudo as $PROTOUSER and returns the value of system(); a non-zero
# status is reported on STDERR.
#
sub RecalcMaxExtensions()
{
    return
        if (!$MAINSITE);

    print "Starting $RECALC\n";

    # List form, so the arguments go straight to sudo without a shell.
    my $status = system($SUDO, "-u", $PROTOUSER, $RECALC);
    if ($status) {
        print STDERR "$RECALC failed with status $status\n";
    }
    return $status;
}
# One-shot mode (-s): make a single pass and exit.
if ($oneshot) {
    # ExpireInstances();
    PushExtensions();
    exit(0);
}
# Wait a bit before making a bunch of noise.
sleep($SLEEP_INTERVAL) if (! $debug);

while (1) {
    # While NoLogins() is true, do no work; check again in five seconds.
    if (NoLogins()) {
        sleep(5);
        next;
    }

    # %Y gives the full four digit year instead of a hard-coded century.
    print "Running at ".
        POSIX::strftime("%Y-%m-%d %H:%M:%S", localtime()) . "\n";

    #ExpireInstances();
    PushExtensions();
    RecalcMaxExtensions();

    print "Waiting for $SLEEP_INTERVAL seconds ...\n";
    sleep($SLEEP_INTERVAL);
}
exit(0);
#
# Report a fatal error and die with the message.
#
# When running as a background daemon (neither one-shot nor debug), the
# message is first mailed to $TBOPS. Unless in one-shot mode, the daemon
# is also marked as stopped before dying.
#
sub fatal($)
{
    my ($msg) = @_;

    if (!$oneshot && !$debug) {
        #
        # Send a message to the testbed list.
        #
        SENDMAIL($TBOPS,
                 "APT Expire daemon died",
                 $msg,
                 $TBOPS);
    }
    if (!$oneshot) {
        MarkDaemonStopped("aptexpire_daemon");
    }
    die("*** $0:\n".
        " $msg\n");
}
......@@ -94,6 +94,7 @@ use APT_Instance;
use APT_Geni;
use APT_Dataset;
use APT_Aggregate;
use APT_Utility;
use Experiment;
use User;
use emutil;
......@@ -799,9 +800,29 @@ sub CreateSliver($)
my $manifest;
$webtask->Refresh();
#
# Clear any stored max extensions for the aggregate between the
# provisioning and provisioned stages. This is a critical section,
# we cannot have another experiment use its saved maxextension until
# the CreateSliver returns.
#
# Critical section start.
if (APT_Utility::ExtensionLock()) {
print STDERR "Could not get max extension lock\n";
$webtask->Exited(GENIRESPONSE_ERROR);
$webtask->output("Internal locking error");
$aggobj->SetStatus("failed");
return -1;
}
# Mark that it is doing something.
$aggobj->SetStatus("provisioning");
# Clear max extensions for any instances using this aggregate
APT_Instance->ClearAllMaxExtension($aggobj->aggregate_urn());
# Critical section end.
APT_Utility::ExtensionUnlock();
# Debugging
$cmurl = APT_Instance::devurl($cmurl);
......
......@@ -212,6 +212,7 @@ sub DoShow()
print "LocalImages: " . ($aggregate->nolocalimages() ? "No" : "Yes")."\n";
print "PrestageImages: " . ($aggregate->prestageimages() ? "Yes":"No")."\n";
print "PanicPowerOff: " . ($aggregate->panicpoweroff() ? "Yes" : "No")."\n";
print "PreCalcMaxExt: " . ($aggregate->precalcmaxext() ? "Yes" : "No")."\n";
print "Portals: " . $aggregate->portals() . "\n";
print "Use Feature: " . ($aggregate->canuse_feature() || "") . "\n";
print "Status: " . $aggregate->status() . "\n";
......@@ -296,6 +297,11 @@ sub DoFlags()
or fatal("Could not update flag");
last;
};
/^precalcmaxext$/ && do {
$aggregate->Update({"precalcmaxext" => $onoff}) == 0
or fatal("Could not update flag");
last;
};
fatal("Unknown flag");
}
return 0;
......
This diff is collapsed.
......@@ -104,6 +104,7 @@ sub DoApprove();
sub DoPrediction();
sub DoCancel();
sub DoHistory();