Commit 6f17de73 authored by Leigh Stoller's avatar Leigh Stoller

Big set of changes for deferred/scheduled/offline aggregates:

* I started out to add just deferred aggregates; those that are offline
  when starting an experiment (and marked in the apt_aggregates table as
  being deferrable). When an aggregate is offline, we add an entry to the
  new apt_deferred_aggregates table, and periodically retry to start the
  missing slivers. In order to accomplish this, I split create_instance
  into two scripts, first part to create the instance in the DB, and the
  second (create_slivers) to create slivers for the instance. The daemon
  calls create_slivers for any instances in the deferred table, until
  all deferred aggregates are resolved.

  On the UI side, there are various changes to deal with allowing
  experiments to be partially created. For example, we used to wait until
  we had all the manifests before showing the topology. Now we show the
  topo on the first manifest, and then add them as they come in. Various
  parts of the UI had to change to deal with missing aggregates, I am
  sure I did not get them all.

* And then once I had that, I realized that "scheduled" experiments were
  an "easy" addition; it is just a degenerate case of deferred. For this I
  added some new slots to the tables to hold the scheduled start time,
  and added a started stamp so we can distinguish between the time it
  was created and the time it was actually started. Lots of data.

  On the UI side, there is a new fourth step on the instantiate page to
  give the user a choice of immediate or scheduled start. I moved the
  experiment duration to this step. I was originally going to add a
  calendar choice for termination, but I did not want to change the
  existing 16 hour max duration policy, yet.
parent cac4744f
......@@ -71,9 +71,12 @@ use overload ('""' => 'Stringify');
#
# Configure variables.
# NOTE(review): the @...@ tokens look like build-time substitution
# placeholders; confirm against the build system.
#
my $TB = "@prefix@";
my $TBOPS = "@TBOPSEMAIL@";
my $TBAUDIT = "@TBAUDITEMAIL@";
my $OURDOMAIN = "@OURDOMAIN@";
# Helper scripts under the install prefix. $MANAGEDATASET is run by
# CreateDatasetCreds() to request dataset credentials.
my $GENEXTENDCRED = "$TB/sbin/protogeni/genextendcred";
my $MANAGEDATASET = "$TB/bin/manage_dataset";
my $GENIUSER = "geniuser";
# Unquoted substitutions: presumably numeric/boolean values -- TODO confirm.
my $MAINSITE = @TBMAINSITE@;
my $PROTOGENI_LOCALUSER= @PROTOGENI_LOCALUSER@;
# Cache of instances to avoid regenerating them.
my %instances = ();
......@@ -81,7 +84,7 @@ BEGIN { use emutil; emutil::AddCache(\%instances); }
my $debug = 0;
# Debugging
my $usemydevtree = 0;
my $usemydevtree = ($MAINSITE ? 0 : 0);
sub devurl($)
{
my ($cmurl) = @_;
......@@ -191,6 +194,18 @@ sub isPowder($) { return $_[0]->Brand()->isPowder() ? 1 : 0; }
# List of the aggregate objects stored under the instance's AGGREGATES slot.
sub AggregateList($)
{
    my ($self) = @_;

    return values(%{ $self->{'AGGREGATES'} });
}
# The AGGREGATES slot itself (a reference), not a copy of it.
sub AggregateHash($)
{
    my ($self) = @_;

    return $self->{'AGGREGATES'};
}
#
# Stamp the instance as started (started=now()) in apt_instances, then
# reload it. Returns -1 if the update fails, otherwise whatever Refresh()
# returns.
#
sub Start($)
{
    my ($self) = @_;
    my $uuid   = $self->uuid();
    my $query  = "update apt_instances set started=now() ".
                 "where uuid='$uuid'";

    if (!DBQueryWarn($query)) {
        return -1;
    }
    return Refresh($self);
}
#
# Grab the webtask. Backwards compat mode, see if there is one associated
# with the object, use that. Otherwise create a new one.
......@@ -427,8 +442,12 @@ sub Delete($)
$agg->Delete() == 0
or return -1;
}
DBQueryWarn("delete from apt_deferred_instances where uuid='$uuid'") or
return -1;
# We do not "own" the webtask until create_sliver exits successfully.
$self->webtask()->Delete() if (defined($self->{'WEBTASK'}));
DBQueryWarn("delete from apt_instances where uuid='$uuid'") or
return -1;
......@@ -485,6 +504,31 @@ sub SetStatus($$)
return 0;
}
#
# The aggregates of this instance whose status is neither "terminated"
# nor "deferred", in the order AggregateList() returns them.
#
sub ActiveAggregateList($)
{
    my ($self) = @_;

    return grep {
        $_->status() ne "terminated" && $_->status() ne "deferred"
    } $self->AggregateList();
}
#
# The aggregates of this instance whose status is "deferred", in the
# order AggregateList() returns them.
#
sub DeferredAggregateList($)
{
    my ($self) = @_;

    return grep { $_->status() eq "deferred" } $self->AggregateList();
}
sub SetPublicURL($$)
{
my ($self,$url) = @_;
......@@ -795,6 +839,9 @@ sub ComputeNodeCounts($)
return -1;
}
foreach my $sliver (@slivers) {
next
if ($sliver->status() eq "deferred");
my $manifest = GeniXML::Parse($sliver->manifest());
if (! defined($manifest)) {
print STDERR "Could not parse manifest for $sliver\n";
......@@ -1382,6 +1429,10 @@ sub CheckProvisioned($)
foreach my $agg ($self->AggregateList()) {
$agg->Refresh();
# Skip deferred for now, not sure how to handle this.
next
if ($agg->status() eq "deferred");
$provisioned = 0
if (!defined($agg->manifest()));
}
......@@ -1396,6 +1447,179 @@ sub CheckProvisioned($)
return 0;
}
#
# Create credentials required by this instance, to access its datasets.
#
# Walks every node in the instance rspec and looks at each blockstore
# that names a dataset (via "dataset", or the deprecated "persistent").
# For image backed datasets known to APT_Dataset, a credential is
# requested with manage_dataset and collected.
#
# Arguments:
#   $self - the instance.
#   $pmsg - scalar reference; receives an error message when we fail on
#           a dataset.
#   $pref - array reference; set to the list of credentials on success.
#
# Returns:
#    0 - success, @$pref holds the credentials (possibly none).
#    1 - error attributable to a dataset, described in $$pmsg.
#   -1 - the rspec could not be parsed, or the credential command failed
#        without a positive exit status.
#
sub CreateDatasetCreds($$$)
{
    my ($self, $pmsg, $pref) = @_;
    my $rspecstr    = $self->rspec();
    my $geniuser    = $self->GetGeniUser();
    my @credentials = ();

    my $rspec = GeniXML::Parse($rspecstr);
    if (! defined($rspec)) {
        print STDERR "CreateDatasetCreds: Could not parse rspec\n";
        return -1;
    }
    foreach my $ref (GeniXML::FindNodes("n:node", $rspec)->get_nodelist()) {
        my $manager_urn = GetManagerId($ref);

        foreach my $blockref (GeniXML::FindNodesNS("n:blockstore", $ref,
                                  $GeniXML::EMULAB_NS)->get_nodelist()) {
            my $dataset_id = GeniXML::GetText("persistent", $blockref);
            if (!defined($dataset_id)) {
                # persistent is deprecated.
                $dataset_id = GeniXML::GetText("dataset", $blockref);
            }
            #
            # We only care about datasets here, we let the backend
            # do the error checking on ephemeral blockstores.
            #
            next
                if (!defined($dataset_id));

            my $class = GeniXML::GetText("class", $blockref);
            if (!defined($class)) {
                $class = "remote";
            }
            # Image backed referenced by URL. No checking since the
            # image has to be global anyway. Needs more thought.
            next
                if ($class eq "local" && $dataset_id =~ /^(http|https):/);

            my $dataset_urn = GeniHRN->new($dataset_id);
            my $dataset     = APT_Dataset->LookupByRemoteURN($dataset_urn);
            if (!defined($dataset)) {
                if ($dataset_urn->domain() eq $OURDOMAIN) {
                    #
                    # Local image backed dataset or lease.
                    #
                    my ($image,$lease);
                    my $pid = $dataset_urn->project();
                    my $id  = $dataset_urn->id();

                    if ($dataset_urn->type() eq "imdataset") {
                        $image = Image->Lookup($pid, $id);
                        if ($image && !$image->isdataset()) {
                            $$pmsg = "$dataset_urn is an image not a dataset ";
                            return 1;
                        }
                        #
                        # Do a partial permission check here to catch
                        # errors early. The CM will do its own check
                        # of course.
                        #
                        # Only when the lookup found an image; a failed
                        # lookup falls through to the "does not exist"
                        # error below instead of calling a method on
                        # an undefined value.
                        #
                        if ($image &&
                            !$image->global() &&
                            $PROTOGENI_LOCALUSER && $geniuser->IsLocal() &&
                            !$image->AccessCheck($geniuser->emulab_user(),
                                                 TB_IMAGEID_ACCESS())) {
                            $$pmsg = "No permission to use $dataset_urn";
                            return 1;
                        }
                    }
                    else {
                        $lease = Lease->Lookup($pid, $id);
                    }
                    # We cannot generate a credential for "legacy" datasets.
                    # So if it is not global, it cannot be transferred. Maybe
                    # this is okay, we will find out. We could generate a
                    # credential if we needed to.
                    next
                        if ($image || $lease);
                }
                $$pmsg = "Dataset '$dataset_urn' does not exist";
                return 1;
            }
            #
            # We do not need a credential for leases, only real users
            # can use those, and so standard emulab permission checks
            # are applied at the CM.
            #
            next
                if ($dataset->type() ne "imdataset");

            #
            # For image backed datasets, we need to send along a credential
            # that allows the remote CM to securely download the dataset if
            # it does not already have it. To do that we need to send it a
            # credential from the CM where the dataset lives. We do that by
            # requesting a credential, and delegating it to the target CM.
            #
            # NOTE(review): $manager_urn comes from the rspec and is
            # interpolated into a command string; confirm it is validated
            # (and defined) before it gets here.
            #
            my $pid    = $dataset->pid();
            my $id     = $dataset->dataset_id();
            my $cmd    = "$MANAGEDATASET getcredential -a $manager_urn $pid/$id";
            my $output = emutil::ExecQuiet($cmd);
            if ($?) {
                $$pmsg = "Could not generate credential for $dataset_urn";
                # A positive exit status is reported to the caller as a
                # dataset error, with the command output when there is any.
                if (($? >> 8) > 0) {
                    if ($output ne "") {
                        $$pmsg = "$dataset_urn" . ": " . $output;
                    }
                    return 1;
                }
                return -1;
            }
            push(@credentials, $output);
        }
    }
    @$pref = @credentials;
    return 0;
}
#
# Defer aggregate setup until missing aggregates come online.
# An optional start time means the entire experiment is being deferred
# until that time; in that case every aggregate row of the instance is
# marked deferred when the deferral is first recorded.
#
# Returns 0 on success, -1 if any query fails.
#
sub Defer($$)
{
    my ($self, $start) = @_;
    my $uuid = $self->uuid();
    my $name = $self->name();

    my $existing =
        DBQueryWarn("select uuid from apt_deferred_instances ".
                    "where uuid='$uuid'");
    if (!defined($existing)) {
        return -1;
    }

    # Already recorded as deferred; just stamp the retry time.
    if ($existing->numrows) {
        DBQueryWarn("update apt_deferred_instances set ".
                    " last_retry=now() ".
                    "where uuid='$uuid'")
            or return -1;
        return 0;
    }

    # First deferral; record it, with the start time when one was given.
    my $start_clause = "";
    if (defined($start)) {
        $start_clause = ", start_at=FROM_UNIXTIME($start)";
    }
    DBQueryWarn("replace into apt_deferred_instances set ".
                " uuid='$uuid', name='$name'" .
                $start_clause)
        or return -1;

    if (defined($start)) {
        DBQueryWarn("update apt_instance_aggregates set status='deferred' ".
                    "where uuid='$uuid'")
            or return -1;
    }
    return 0;
}
#
# Remove this instance's row from apt_deferred_instances.
# Returns 0 on success, -1 if the delete fails.
#
sub ResolveDefer($)
{
    my ($self) = @_;
    my $uuid   = $self->uuid();
    my $query  = "delete from apt_deferred_instances ".
                 "where uuid='$uuid'";

    return (DBQueryWarn($query) ? 0 : -1);
}
###################################################################
package APT_Instance::ExtensionInfo;
use emdb;
......@@ -1543,7 +1767,6 @@ use vars qw($AUTOLOAD);
use overload ('""' => 'Stringify');
sub devurl($) { return APT_Instance::devurl($_[0]); }
my $OURDOMAIN = "@OURDOMAIN@";
my $MYURN = "urn:publicid:IDN+$OURDOMAIN+authority+cm";
#
......@@ -1856,6 +2079,23 @@ sub Update($$)
return Refresh($self);
}
#
# Mark as deferred. Sets the status of this aggregate's row to 'deferred',
# stamps last_retry and bumps retry_count, then reloads the object.
# Returns -1 if the update fails, otherwise whatever Refresh() returns.
#
sub Defer($)
{
    my ($self) = @_;
    my ($uuid, $urn) = ($self->uuid(), $self->aggregate_urn());

    my $updated =
        DBQueryWarn("update apt_instance_aggregates set status='deferred', ".
                    " last_retry=now(),retry_count=retry_count+1 ".
                    "where uuid='$uuid' and aggregate_urn='$urn'");
    return -1
        if (!$updated);

    return Refresh($self);
}
#
# Is the sliver on the local cluster.
#
......
......@@ -1411,6 +1411,40 @@ sub SetSites($$$$$$)
return 0;
}
#
# Does the rspec reference more than one manager in any of the links?
# If so, we need the stitcher.
#
# Returns 1 if some link id ends up with more than one manager, 0 if
# none does, and -1 (with $$perrmsg set) if the rspec cannot be parsed.
#
sub NeedStitcher($$)
{
    my ($rspecstr, $perrmsg) = @_;

    my $rspec = GeniXML::Parse($rspecstr);
    unless (defined($rspec)) {
        $$perrmsg = "Could not parse rspec\n";
        return -1;
    }

    # Link id => { manager urn => 1, ... }
    my %managers_of = ();
    my @links = GeniXML::FindNodes("n:link", $rspec)->get_nodelist();

    for my $link (@links) {
        my $client_id   = GetVirtualId($link);
        my $manager_urn = GetManagerId($link);

        $managers_of{$client_id} = {}
            if (!exists($managers_of{$client_id}));
        $managers_of{$client_id}->{$manager_urn} = 1
            if (defined($manager_urn));
    }
    for my $managers (values(%managers_of)) {
        if (keys(%{$managers}) > 1) {
            return 1;
        }
    }
    return 0;
}
#
# Set the repository for the rspec. This is a top level element. At
# some point we can think about per-node repos.
......
......@@ -154,7 +154,9 @@ sub KillInstances()
"where (a.canceled!=0 or ".
" (a.status='failed' and ".
" (UNIX_TIMESTAMP(now()) - ".
" UNIX_TIMESTAMP(a.created) > 7200))) and ".
" UNIX_TIMESTAMP(IF(a.started is null, ".
" a.created, a.started)) ".
" > 7200))) and ".
# Not locked or corresponding slice does not exist.
" (s.locked is null or s.idx is null)");
return
......
#!/usr/bin/perl -w
#
# Copyright (c) 2008-2018 University of Utah and the Flux Group.
#
# {{{GENIPUBLIC-LICENSE
#
# GENI Public License
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and/or hardware specification (the "Work") to
# deal in the Work without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Work, and to permit persons to whom the Work
# is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Work.
#
# THE WORK IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE WORK OR THE USE OR OTHER DEALINGS
# IN THE WORK.
#
# }}}
#
use strict;
use English;
use Getopt::Std;
use Data::Dumper;
use File::Basename;
#
# Look for more APT things that need to be dealt with.
#
# Print the command line synopsis and exit with status 1.
sub usage()
{
    my $synopsis = "Usage: apt_scheduler [-d] [-s] [-n]\n";

    print $synopsis;
    exit(1);
}
# Command line switches accepted by getopts() below:
#   -d  debug; stay in the foreground and do not email from fatal().
#   -s  oneshot; make one pass over the deferred instances and exit,
#       skipping the daemon bookkeeping.
#   -n  impotent; only report which instances would be started.
my $optlist = "dns";
my $debug = 0;
my $impotent = 0;
my $oneshot = 0;
# Instance uuid => time of the last error email sent about it; the main
# loop uses this to send at most one email per instance every 3 hours.
my %emailedErrors = ();
#
# Configure variables
#
my $TB = "@prefix@";
my $TBOPS = "@TBOPSEMAIL@";
my $TBLOGS = "@TBLOGSEMAIL@";
my $MAINSITE = @TBMAINSITE@;
# Run via sudo as $PROTOUSER for each deferred instance that is due.
my $CREATESLIVERS = "$TB/bin/create_slivers";
my $LOGFILE = "$TB/log/apt_scheduler.log";
my $SUDO = "/usr/local/bin/sudo";
my $PROTOUSER = "elabman";
# Seconds to sleep between passes of the main loop.
my $SLEEP_INTERVAL = 15;
# NOTE(review): $TBLOGS, $MAINSITE and $DAILY_INTERVAL do not appear to be
# referenced anywhere else in this script -- confirm before removing.
my $DAILY_INTERVAL = 24 * 3600;
# un-taint path
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin:/usr/site/bin';
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};
# Protos
sub fatal($);
#
# Turn off line buffering on output
#
$| = 1;
# NOTE(review): the options have not been parsed yet, so $oneshot and
# $debug are still 0 here and fatal() takes its email/daemon path.
if ($UID != 0) {
fatal("Must be root to run this script\n");
}
#
# Check args early so we get the right DB.
#
my %options = ();
if (! getopts($optlist, \%options)) {
usage();
}
if (defined($options{"d"})) {
$debug = 1;
}
if (defined($options{"s"})) {
$oneshot = 1;
}
if (defined($options{"n"})) {
$impotent = 1;
}
# Load the Testbed support stuff.
use lib "@prefix@/lib";
use emdb;
use Experiment;
use Node;
use libtestbed;
use emutil;
use libEmulab;
use GeniResponse;
use APT_Instance;
use POSIX qw(strftime ceil);
#
# Daemon bookkeeping, skipped entirely in oneshot mode.
#
if (!$oneshot) {
# Refuse to start when a scheduler is already marked as running.
# NOTE(review): fatal() calls MarkDaemonStopped() on this path too, even
# though this process never marked itself running -- confirm that does
# not disturb the daemon that is already running.
if (CheckDaemonRunning("apt_scheduler")) {
fatal("Not starting another apt scheduler daemon!");
}
# Go to ground.
# Presumably TBBackGround() returns true in the parent, which exits here,
# leaving the child to log to $LOGFILE -- TODO confirm. Skipped when
# debugging so output stays on the terminal.
if (! $debug) {
if (TBBackGround($LOGFILE)) {
exit(0);
}
}
# A true return is treated as failure.
if (MarkDaemonRunning("apt_scheduler")) {
fatal("Could not mark daemon as running!");
}
}
#
# Signal handler for newsyslog (installed on HUP): reopen the log file.
# The effective uid is switched to root around the reopen and restored
# to its previous value afterwards.
#
sub handler()
{
    my $previous_euid = $EUID;

    $EUID = 0;
    ReOpenLog($LOGFILE);
    $EUID = $previous_euid;
}
# Only install the HUP handler when running as a background daemon.
$SIG{HUP} = \&handler
if (! ($debug || $oneshot));
#
# Main loop. Each pass looks for deferred instances that are due (no
# start time, or start time reached) and calls create_slivers on each.
#
while (1) {
# Hold off while NoLogins() is true. Note this is checked before the
# oneshot exit below, so a oneshot run waits here as well.
if (NoLogins()) {
sleep(5);
next;
}
print "Running at ".
POSIX::strftime("20%y-%m-%d %H:%M:%S", localtime()) . "\n";
my $query_result =
DBQueryWarn("select uuid from apt_deferred_instances ".
"where start_at is null or now() >= start_at");
if ($query_result && $query_result->numrows) {
while (my ($uuid) = $query_result->fetchrow_array()) {
my $instance = APT_Instance->Lookup($uuid);
# No instance object, so nothing to clean up; move on.
next
if (!defined($instance));
# From here on, leave via "goto skip" so the cleanup at the bottom of
# the loop body still runs.
my $genislice = $instance->GetGeniSlice();
goto skip
if (!defined($genislice));
# A locked slice is printed and skipped on this pass; since nothing is
# changed, it is looked at again on a later pass.
print $genislice->locked() . "\n" if ($genislice->locked());
goto skip
if ($genislice->locked());
if ($impotent) {
print "Would try to call create_slivers on $uuid\n";
}
else {
print "Calling create_slivers on $uuid\n";
my $output = emutil::ExecQuiet("$SUDO -u $PROTOUSER ".
"$CREATESLIVERS $uuid");
# Only an exit status of 255 is mapped to -1 and treated as an error
# here; any other exit status takes the else branch below.
my $code = $? >> 8;
$code = -1
if ($code == 255);
if ($code < 0) {
print STDERR $output;
# Email about a given instance at most once every 3 hours.
if (!exists($emailedErrors{$uuid}) ||
time() - $emailedErrors{$uuid} > (3 * 3600)) {
SENDMAIL($TBOPS,
"Could not start scheduled experiment",
"Error starting deferred experiment: ".
$instance . "\n\n" . $output . "\n",
$TBOPS);
$emailedErrors{$uuid} = time();
}
}
else {
# No error this time; forget any earlier email so a new error is
# reported right away.
delete($emailedErrors{$uuid});
}
}
# Slight delay between calling create_slivers.
# NOTE(review): numrows is always nonzero inside this loop, so this
# sleeps after every instance that gets this far, including in impotent
# mode -- confirm that is intended.
if ($query_result->numrows) {
sleep(10);
}
# Common cleanup for every instance that was looked up.
skip:
$genislice->Flush()
if (defined($genislice));
$instance->Purge();
}
}
# Oneshot mode makes a single pass.
exit(0)
if ($oneshot);
sleep($SLEEP_INTERVAL);
}
exit(0);
#
# Report a fatal error and die with $msg. Unless running in oneshot or
# debug mode, the message is first emailed to $TBOPS. Unless running in
# oneshot mode, the daemon is marked as stopped.
#
# NOTE(review): this is also reached before MarkDaemonRunning() has been
# called (e.g. when another scheduler is already running); confirm that
# MarkDaemonStopped() is harmless in that case.
#
sub fatal($)
{
    my ($msg) = @_;
    my $interactive = ($oneshot || $debug);

    #
    # Send a message to the testbed list.
    #
    SENDMAIL($TBOPS, "APT scheduler died", $msg, $TBOPS)
        unless ($interactive);

    if (!$oneshot) {
        MarkDaemonStopped("apt_scheduler");
    }
    die("*** $0:\n".
        " $msg\n");
}
This diff is collapsed.
This diff is collapsed.
......@@ -1282,6 +1282,14 @@ sub DoTerminate()
my $old_status = $instance->status();
$instance->SetStatus("terminating");
#
# If deferred, then no reason to save this in the history, it
# was never started.
#
if ($old_status eq "deferred") {
goto killit;
}
#
# Exit and let caller poll for status.
#
......@@ -1306,10 +1314,12 @@ sub DoTerminate()
}
# Skip terminated aggregates, since we retry later on failure.
# Also skip deferred aggregates, these were never setup.
my @agglist;
foreach my $agg ($instance->AggregateList()) {
push(@agglist, $agg)
if ($agg->status() ne "terminated");
if ($agg->status() ne "terminated" &&
$agg->status() ne "deferred");
}
my $response;
......@@ -1451,7 +1461,7 @@ sub DoExtend()
$instance->AggregateList());
my $pcount = $instance->physnode_count();
my $expires_time = str2time($slice->expires());
my $created_time = str2time($instance->created());
my $created_time = str2time($instance->started());
my $extensions = $instance->Brand()->ExtensionsEmailAddress();
my $granted = 0;
my $needapproval = 0;
......@@ -1554,7 +1564,7 @@ sub DoExtend()
localtime(str2time($slice->expires())+
($howlong * 3600)));
my $created = POSIX::strftime("20%y-%m-%d %H:%M:%S %Z",
localtime(str2time($instance->created())));
localtime(str2time($instance->started())));
$instance->Brand()->SendEmail($extensions,
"Experiment Extension Request: $name",
"A request to extend this experiment was made but requires\n".
......@@ -1772,7 +1782,7 @@ sub DoExtend()
my $expires = POSIX::strftime("20%y-%m-%d %H:%M:%S %Z",
localtime(str2time($slice->expires())));
my $created = POSIX::strftime("20%y-%m-%d %H:%M:%S %Z",
localtime(str2time($instance->created())));
localtime(str2time($instance->started())));
my $now = POSIX::strftime("20%y-%m-%d %H:%M:%S %Z", localtime());
my $before = POSIX::strftime("20%y-%m-%d %H:%M:%S %Z",
localtime($expires_time));
......@@ -2033,10 +2043,23 @@ sub DoMaxExtensionInternal($$)
my $slice = $instance->GetGeniSlice();
my $maxinfo;
my $errmsg;
my $errcode;
my $newmax;
my $blob = {"maxextension" => undef, "reservations" => {}};
my @aggregates = ();
#
# For now, if there are any deferred aggregates (offline), then
# there is no point in asking any of them, we cannot determine
# a max extension, have to wait till later.
#
if ($instance->DeferredAggregateList()) {
$errcode = GENIRESPONSE_SERVER_UNAVAILABLE();
$errmsg = "Cannot determine max extension because some ".
"aggregates are deferred";
goto bad;
}
foreach my $aggregate ($instance->AggregateList()) {
next
if ($aggregate->isAL2S() || $aggregate->isStitch());
......@@ -2048,8 +2071,8 @@ sub DoMaxExtensionInternal($$)
#
my $responses;
my $errcode = CallMethodOnAggregates("MaxExtension", 0, \$responses,
@aggregates);
$errcode = CallMethodOnAggregates("MaxExtension", 0, \$responses,
@aggregates);
if ($errcode) {
$errmsg = $responses;
goto bad;
......@@ -2193,7 +2216,7 @@ sub DoDenyOrMoreInfo($)
my $expires = POSIX::strftime("20%y-%m-%d %H:%M:%S %Z",
localtime(str2time($slice->expires())));
my $created = POSIX::strftime("20%y-%m-%d %H:%M:%S %Z",
localtime(str2time($instance->created())));
localtime(str2time($instance->started())));
my $now = POSIX::strftime("20%y-%m-%d %H:%M:%S %Z", localtime());
my $url = $instance->webURL();
my $pcount = $instance->physnode_count();
......@@ -2296,7 +2319,7 @@ sub DoRefresh()
my $response;
$errcode = CallMethodOnAggregates("SliceStatus", 0, \$response,
$instance->AggregateList());
$instance->ActiveAggregateList());
if ($errcode) {
$errmsg = $response;
goto bad;
......@@ -2309,7 +2332,7 @@ sub DoRefresh()
# user or admin intervention. We do not go in the other direction.
#
my @responses = @{$response};
foreach my $agg ($instance->AggregateList()) {
foreach my $agg ($instance->ActiveAggregateList()) {
my $response = shift(@responses);
if ($response->code() != GENIRESPONSE_SUCCESS) {
......@@ -2335,7 +2358,7 @@ sub DoRefresh()
goto bad
if ($errcode);
if ($readycount == scalar($instance->AggregateList())) {