#!/usr/bin/perl -w
#
# Copyright (c) 2000-2016 University of Utah and the Flux Group.
#
# {{{EMULAB-LICENSE
#
# This file is part of the Emulab network testbed software.
#
# This file is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
#
# This file is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this file. If not, see .
#
# }}}
#
use English;
use strict;
use Getopt::Std;
use XML::Simple;
use Data::Dumper;
use CGI;
use POSIX ":sys_wait_h";
use POSIX qw(setsid strftime ceil floor);
use Date::Parse;
#
# Back-end script to manage APT profiles.
#
sub usage()
{
print("Usage: manage_instance snapshot instance ".
"[-n node_id] [-i imagename] [-u node|all]\n");
print("Usage: manage_instance consoleurl instance node\n");
print("Usage: manage_instance terminate instance\n");
print("Usage: manage_instance refresh instance\n");
print("Usage: manage_instance reboot instance node_id ...\n");
print("Usage: manage_instance reload instance node_id ...\n");
print("Usage: manage_instance deletenodes instance node_id ...\n");
print("Usage: manage_instance monitor instance\n");
print("Usage: manage_instance lockdown instance set|clear user|admin\n");
print("Usage: manage_instance panic instance set|clear\n");
print("Usage: manage_instance linktest instance [-k | level]\n");
print("Usage: manage_instance writecreds instance directory\n");
print("Usage: manage_instance updatekeys instance [uid] \n");
print("Usage: manage_instance extend instance [-m message] days [filename]\n");
print("Usage: manage_instance denyextension instance [-m message] [filename]\n");
print("Usage: manage_instance moreinfo instance [-m message] [filename]\n");
print("Usage: manage_instance extendold instance [-f] seconds\n");
print("Usage: manage_instance utilization instance\n");
print("Usage: manage_instance schedterminate instance [-m message] days [filename]\n");
print("Usage: manage_instance idledata instance\n");
print("Usage: manage_instance openstackstats instance\n");
exit(-1);
}
my $optlist = "dt:s";
my $debug = 0;
my $silent = 0;
my $webtask_id;
my $webtask;
my $this_user;
my $geniuser;
#
# Configure variables
#
my $TB = "@prefix@";
my $TBOPS = "@TBOPSEMAIL@";
my $QUICKVM = "$TB/sbin/protogeni/quickvm";
#
# Untaint the path
#
$ENV{'PATH'} = "$TB/bin:$TB/sbin:/bin:/usr/bin:/usr/bin:/usr/sbin";
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};
#
# Turn off line buffering on output
#
$| = 1;
#
# Load the Testbed support stuff.
#
use lib "@prefix@/lib";
use EmulabConstants;
use emdb;
use emutil;
use libEmulab;
use libtestbed;
use User;
use Project;
use APT_Profile;
use APT_Instance;
use APT_Geni;
use GeniXML;
use GeniHRN;
use Genixmlrpc;
use GeniResponse;
use GeniSlice;
use GeniImage;
use GeniUser;
use WebTask;
use EmulabFeatures;
# Protos
sub fatal($);
sub UserError($);
sub DoSnapshot();
sub DoConsole();
sub DoTerminate();
sub DoSchedTerminate();
sub DoExtend();
sub DoExtendOld();
sub DoDenyOrMoreInfo($);
sub DoRefresh();
sub DoReboot();
sub DoReload();
sub DoLockdown();
sub DoPanic();
sub DoManifests();
sub DoLinktest();
sub DoUpdateKeys();
sub DoDeleteNodes();
sub DoUtilization();
sub DoIdleData();
sub DoOpenstack();
sub WriteCredentials();
sub StartMonitor();
sub StartMonitorInternal(;$@);
sub DoImageTrackerStuff($$$$$$);
sub DenyExtensionInternal($);
sub ExtendInternal($$$$);
#
# Parse command arguments. Once we return from getopts, all that should be
# left are the required arguments.
#
my %options = ();
if (! getopts($optlist, \%options)) {
usage();
}
if (defined($options{"t"})) {
$webtask_id = $options{"t"};
}
if (defined($options{"d"})) {
$debug++;
}
if (defined($options{"s"})) {
$silent = 1;
}
if (@ARGV < 2) {
usage();
}
my $action = shift(@ARGV);
my $uuid = shift(@ARGV);
my $instance = APT_Instance->Lookup($uuid);
if (!defined($instance)) {
$instance = APT_Instance->LookupBySlice($uuid);
}
if (!defined($instance)) {
fatal("No such instance $uuid");
}
if (getpwuid($UID) eq "nobody") {
$this_user = User->ImpliedUser();
}
else {
$this_user = User->ThisUser();
}
# If a guest user, we will not have an actual user, which is okay.
if (defined($this_user)) {
$geniuser = GeniUser->CreateFromLocal($this_user);
}
if ($action eq "snapshot") {
DoSnapshot();
}
if ($action eq "extend") {
DoExtend();
}
if ($action eq "extendold") {
DoExtendOld();
}
elsif ($action eq "denyextension") {
DoDenyOrMoreInfo("deny")
}
elsif ($action eq "moreinfo") {
DoDenyOrMoreInfo("info")
}
elsif ($action eq "consoleurl") {
DoConsole()
}
elsif ($action eq "terminate") {
DoTerminate()
}
elsif ($action eq "schedterminate") {
DoSchedTerminate()
}
elsif ($action eq "refresh") {
DoRefresh()
}
elsif ($action eq "reboot") {
DoReboot()
}
elsif ($action eq "reload") {
DoReload()
}
elsif ($action eq "monitor") {
StartMonitor()
}
elsif ($action eq "lockdown") {
DoLockdown()
}
elsif ($action eq "panic") {
DoPanic()
}
elsif ($action eq "linktest") {
DoLinktest()
}
elsif ($action eq "updatekeys") {
DoUpdateKeys()
}
elsif ($action eq "writecreds") {
WriteCredentials()
}
elsif ($action eq "getmanifests") {
DoManifests()
}
elsif ($action eq "deletenodes") {
DoDeleteNodes()
}
elsif ($action eq "utilization") {
DoUtilization()
}
elsif ($action eq "idledata") {
DoIdleData()
}
elsif ($action eq "openstackstats") {
DoOpenstack()
}
else {
usage();
}
exit(0);
#
# Take a snapshot. Implies a single node instance, for now.
#
sub DoSnapshot()
{
my $errmsg;
my $logfile;
my $errcode = -1;
my $needunlock = 0;
my $old_status = $instance->status();
my $node_id;
my $imagename;
my $cloneprofile;
my $update_profile;
my $copyback_uuid;
my $copyback_urn;
my $update_prepare = 0;
my $doversions = 0;
my $usetracker = 0;
my $optlist = "n:i:u:Uc:";
my %options = ();
if (! getopts($optlist, \%options)) {
usage();
}
if (defined($options{"n"})) {
$node_id = $options{"n"};
}
if (defined($options{"i"})) {
$imagename = $options{"i"};
}
if (defined($options{"c"})) {
$cloneprofile = $options{"c"};
}
if (defined($options{"u"})) {
$update_profile = $options{"u"};
if ($update_profile !~ /^(node|all)$/) {
usage();
}
}
if (defined($options{"U"})) {
$update_prepare = 1;
}
if (defined($cloneprofile) && defined($update_profile)) {
fatal("Not allowed to update profile when cloning a profile");
}
if (defined($cloneprofile) && !defined($imagename)) {
fatal("Must supply image name when cloning a profile");
}
if ($old_status ne "ready") {
fatal("Instance must be in the ready state to take a snapshot");
}
my $slice = $instance->GetGeniSlice();
if (!defined($slice)) {
fatal("No slice for quick VM: $uuid");
}
#
# Might be a clone (manage_profile).
#
my $sliver_urn;
my $aggregate;
my $node;
my $profile;
if (defined($cloneprofile)) {
$profile = APT_Profile->Lookup($cloneprofile);
}
else {
$profile = APT_Profile->Lookup($instance->profile_id());
}
if (!defined($profile)) {
fatal("Could not lookup profile for " .
(defined($cloneprofile) ? "cloning" : "snapshot"));
}
my $project = Project->Lookup($profile->pid_idx());
if (!defined($project)) {
fatal("Could not lookup project for $profile");
}
#
# Sanity checks.
#
my @aggs = $instance->AggregateList();
if (! @aggs) {
fatal("No slivers for instance!");
}
if (!defined($node_id)) {
# We snapshot the one node in the instance.
if (@aggs != 1) {
fatal("Too many aggregates (> 1) to snapshot");
}
my ($agg) = @aggs;
my $manifest = GeniXML::Parse($agg->manifest());
if (! defined($manifest)) {
fatal("Could not parse manifest for $agg");
}
my @nodes = GeniXML::FindNodes("n:node", $manifest)->get_nodelist();
if (@nodes != 1) {
fatal("Too many nodes (> 1) to snapshot");
}
($node) = @nodes;
$sliver_urn = GeniXML::GetSliverId($node);
$node_id = GeniXML::GetVirtualId($node);
$aggregate = $agg;
# Profile Snapshot, always use the profile name. Clone passes in name.
if (!defined($imagename)) {
$imagename = $profile->name();
}
}
else {
my $nodecount = 0;
# Find the node in its manifest.
foreach my $agg (@aggs) {
my $manifest = GeniXML::Parse($agg->manifest());
if (! defined($manifest)) {
fatal("Could not parse manifest for $agg");
}
foreach my $ref (GeniXML::FindNodes("n:node",
$manifest)->get_nodelist()) {
$nodecount++;
my $client_id = GeniXML::GetVirtualId($ref);
my $manager_urn = GetManagerId($ref);
my $urn = GeniXML::GetSliverId($ref);
# No sliver urn or a different aggregate.
next
if (! (defined($urn) &&
defined($manager_urn) &&
$manager_urn eq $agg->aggregate_urn()));
if ($node_id eq $client_id) {
$node = $ref;
$sliver_urn = $urn;
$aggregate = $agg;
last;
}
}
}
if (!defined($sliver_urn)) {
fatal("Could not find node '$node_id' in manifest");
}
#
# So, we want Profile snapshot above (of a single node profile) and
# Node snapshot in a single node profile to behave the same wrt the
# image name, so look at the nodecount to see if need to append the
# nodeid to the imagename.
#
if (!defined($imagename)) {
$imagename = $profile->name();
if ($nodecount > 1) {
$imagename .= "." . $node_id;
}
}
}
#
# Make sure a valid imagename. This a local test of course, but this
# only works on IG aggregates anyway.
#
if (! TBcheck_dbslot($imagename, "images",
"imagename", TBDB_CHECKDBSLOT_ERROR)) {
$imagename = $profile->profileid();
$imagename .= "." . $node_id
if (defined($node_id));
}
#
# Instruct the remote cluster to copy the image back to its origin,
# but we need to ask the IMS for uuid of the image that is running,
# so we can tell the cluster, which then tells the origin cluster.
# We also need to know what the new URN of the image will be, for
# updating the profile.
#
if (GetSiteVar("protogeni/use_imagetracker") &&
EmulabFeatures->FeatureEnabled("APT_UseImageTracker",
$this_user, $project)) {
$usetracker = 1;
#
# When cloning, we use the URN returned by the cluster; it is
# the origin of the new image.
#
if (!defined($cloneprofile)) {
my $rval = DoImageTrackerStuff($aggregate, $node, $project,
\$copyback_uuid, \$copyback_urn,
\$errmsg);
if ($rval) {
if ($rval < 0) {
fatal($errmsg);
}
else {
$errcode = 1;
goto uerror;
}
}
}
}
if (0) {
fatal("$copyback_uuid, $copyback_urn\n");
}
#
# We are not going to allow this if the instance is on a different
# cluster then where the image was originally created, since otherwise
# the image provenancewill look like spaghetti.
#
if (defined($update_profile)) {
my $diskref = GeniXML::GetDiskImage($node);
if (defined($diskref)) {
my $authority = $aggregate->GetGeniAuthority();
my $image_url = GeniXML::GetText("url", $diskref);
if (defined($image_url) && !$usetracker) {
require URI;
# Get the hostname for the image URL.
my $uri = URI->new($image_url);
if (!defined($uri)) {
fatal("Could not parse $image_url");
}
my $image_host = $uri->host();
# Get the hostname for the authority.
$uri = URI->new($authority->url());
if (!defined($uri)) {
fatal("Could not parse authority URL");
}
my $authority_host = $uri->host();
# Compare domains.
$image_host =~ s/^([^.]+\.)//;
$authority_host =~ s/^([^.]+\.)//;
if ($image_host ne $authority_host) {
$errmsg = "Not allowed to take a snapshot on this cluster";
$errcode = 1;
goto uerror;
}
}
}
# Do this here to avoid output to logfile.
$doversions =
EmulabFeatures->FeatureEnabled("APT_ProfileVersions",
$this_user, $project);
}
if ($slice->Lock()) {
$errmsg = "Experiment is busy, please try again later.";
$errcode = 1;
goto uerror;
}
$needunlock = 1;
#
# Create the webtask object, but AFTER locking the slice so we do
# not destroy one in use.
#
if (defined($webtask_id)) {
$webtask = WebTask->LookupOrCreate($instance->uuid(), $webtask_id);
# Convenient.
$webtask->AutoStore(1);
# This is convenience for the web server.
if (defined($webtask)) {
$webtask->aggregate_urn($aggregate->aggregate_urn());
$webtask->client_id($node_id);
}
}
$instance->SetStatus("imaging");
$aggregate->SetStatus("imaging");
#
# This returns pretty fast, and then the imaging takes place in
# the background at the aggregate.
#
my $response =
$aggregate->CreateImage($sliver_urn, $imagename,
$update_prepare, $copyback_uuid);
if (!defined($response)) {
$errmsg = "Internal error creating image";
$instance->SetStatus($old_status);
$aggregate->SetStatus($old_status);
goto uerror;
}
if ($response->code() != GENIRESPONSE_SUCCESS) {
$errmsg = "Could not create image: " . $response->output() . "\n";
$errcode = 1
if ($response->code() == GENIRESPONSE_BUSY ||
$response->code() == GENIRESPONSE_SERVER_UNAVAILABLE ||
$response->code() == GENIRESPONSE_FORBIDDEN);
$instance->SetStatus($old_status);
$aggregate->SetStatus($old_status);
goto uerror;
}
my ($image_urn, $image_url,
$version_urn, $version_url) = @{ $response->value() };
if (!defined($version_urn)) {
$version_urn = $image_urn;
$version_url = $image_url
}
if (defined($webtask)) {
$webtask->image_urn($version_urn);
$webtask->image_url($version_url);
my $image_name;
if ($usetracker) {
# DoImageTrackerStuff determined that we use whatever the cluster
# tells us, cause it is the home of the image.
if (!defined($copyback_urn)) {
$image_name = $version_urn;
}
else {
$image_name = $copyback_urn;
}
}
else {
$image_name = $version_url;
}
$webtask->image_name($image_name);
# We tell the web interface that the image has to be copied
# back,
if (defined($copyback_uuid)) {
$webtask->copyback_uuid($copyback_uuid);
}
}
else {
print "$image_urn,$image_url\n";
}
#
# Exit and leave child to poll.
#
if (! $debug) {
$logfile = TBMakeLogname("snapshot");
if (my $childpid = TBBackGround($logfile)) {
# Parent exits normally, web interface watches.
exit(0);
}
# Let parent exit;
sleep(2);
}
# Bind the process id. This is important when the caller is
# manage_profile, doing a clone.
$webtask->SetProcessID($PID)
if (defined($webtask));
#
# Poll for a reasonable amount of time.
#
my $seconds = 1500;
my $interval = 15;
my $ready = 0;
my $sliver_ready = 0;
my $failed = 0;
while ($seconds > 0) {
sleep($interval);
$seconds -= $interval;
my $response = $aggregate->SliceStatus();
if ($response->code() != GENIRESPONSE_SUCCESS &&
$response->code() != GENIRESPONSE_RPCERROR &&
$response->code() != GENIRESPONSE_SERVER_UNAVAILABLE &&
$response->code() != GENIRESPONSE_BUSY) {
$errmsg = "Sliverstatus failed: ". $response->output() . "\n";
$failed = 1;
last;
}
next
if ($response->code() == GENIRESPONSE_BUSY ||
$response->code() == GENIRESPONSE_SERVER_UNAVAILABLE ||
$response->code() == GENIRESPONSE_RPCERROR);
my $blob = $response->value();
# This is the per-aggregate status, we always set this for web UI.
$aggregate->UpdateWebStatus($blob->{'details'});
if ($blob->{'status'} eq "failed") {
$failed = 1;
last;
}
elsif ($blob->{'status'} eq "ready") {
$sliver_ready = 1;
}
#
# We are watching for the image status to report ready or failed.
#
$response = $aggregate->ImageInfo($image_urn);
if ($response->code() != GENIRESPONSE_SUCCESS &&
$response->code() != GENIRESPONSE_RPCERROR &&
$response->code() != GENIRESPONSE_SERVER_UNAVAILABLE &&
$response->code() != GENIRESPONSE_BUSY) {
$errmsg = "Imageinfo failed: ". $response->output() . "\n";
$failed = 1;
last;
}
next
if ($response->code() == GENIRESPONSE_BUSY ||
$response->code() == GENIRESPONSE_SERVER_UNAVAILABLE ||
$response->code() == GENIRESPONSE_RPCERROR);
my $imageblob = $response->value();
if (defined($webtask)) {
my %blobcopy = %{ $imageblob };
#
# If the image is ready, but needs to be copied back to
# its origin, hold of ready till later. We will wait for
# the copyback to finish, see below.
#
if ($imageblob->{'status'} eq "ready" && defined($copyback_uuid)) {
$blobcopy{'status'} = "copying";
}
# This is also being updated by the event system.
$instance->UpdateImageStatus(\%blobcopy);
}
if ($imageblob->{'status'} eq "ready") {
$ready = 1;
last;
}
elsif ($imageblob->{'status'} eq "failed") {
$failed = 1;
last;
}
}
# Cause of image status events.
$webtask->Refresh()
if (defined($webtask));
if ($failed) {
$errmsg = "Imaging failed"
if (!defined($errmsg));
goto bad;
}
elsif (!$ready) {
$errmsg = "Imaging timed out";
$errcode = -2;
goto bad;
}
elsif (defined($update_profile)) {
#
# If successful, we create a new version of the profile and
# update the rspec to reflect the new image version. Note
# that we expect the CM is doing image versioning, so do not
# bother to check if the image version is actually new.
#
if ($doversions) {
$profile = $profile->NewVersion($this_user);
if (!defined($profile)) {
print STDERR "Could not create new profile version\n";
$webtask->Exited(70)
if (defined($webtask));
exit(1);
}
}
# DoImageTrackerStuff determined that we use whatever the cluster
# tells us, cause it is the home of the image.
$copyback_urn = $version_urn
if ($usetracker && !defined($copyback_urn));
$profile->UpdateDiskImage($node_id,
(defined($copyback_urn) ?
$copyback_urn : $version_url),
($update_profile eq "all" ? 1 : 0));
}
$instance->SetStatus("ready");
$aggregate->SetStatus("ready");
#
# If there is a copyback_uuid, we want to wait for that to finish.
#
if (defined($copyback_uuid)) {
#
# We know the copyback is done when the IMS has the info.
#
my $copied = 0;
$seconds = 1000;
while ($seconds > 0) {
sleep($interval);
$seconds -= $interval;
#
# It would clearly be more more efficient to just look in
# the IMS database.
#
Genixmlrpc->SetContext(APT_Geni::GeniContext());
my $blob = GeniImage::GetImageData($copyback_urn, \$errmsg);
Genixmlrpc->SetContext(undef);
# We get back undefined if the image is not posted yet.
if (defined($blob)) {
$copied = 1;
last;
}
sleep($interval);
}
# Tell the web interface.
if (!$copied) {
$errmsg = "Failed to copy image back to its origin cluster";
$errcode = 1;
goto bad;
}
elsif (defined($webtask)) {
$webtask->image_status("ready");
}
}
# We garbage collect these later, so anyone waiting has a chance
# to see the exit status
$webtask->Exited(0)
if (defined($webtask));
$slice->UnLock();
if (defined($logfile) && -s $logfile) {
SENDMAIL($TBOPS,
"Instance Snapshot Complete",
"Finished taking snapshot of $instance.\n",
$TBOPS, undef, $logfile);
unlink($logfile);
}
if (!$sliver_ready) {
#
# Image is ready, but sliver is not. Start a monitor so that
# web interface is updated.
#
StartMonitorInternal();
}
exit(0);
bad:
if (!$sliver_ready) {
#
# Image is ready, but sliver is not. Start a monitor so that
# web interface is updated.
#
StartMonitorInternal();
}
$instance->SetStatus("ready");
$aggregate->SetStatus("ready");
if (defined($logfile)) {
SENDMAIL($TBOPS,
"Snapshot failed",
"Error taking snapshot of $instance:\n\n".
"$errmsg\n",
$TBOPS, undef, $logfile);
unlink($logfile);
}
uerror:
print STDERR "$errmsg\n";
if (defined($errmsg) && defined($webtask)) {
$webtask->Exited($errcode);
$webtask->output($errmsg);
}
$slice->UnLock()
if ($needunlock);
exit($errcode);
}
sub DoImageTrackerStuff($$$$$$)
{
my ($aggregate, $node, $project, $puuid, $purn, $perrmsg) = @_;
my $node_id = GeniXML::GetVirtualId($node);
my $errmsg;
#
# If we do not have a diskinfo section, we will use the URN we get back
# from the cluster (it is a snapshot of the default image).
#
my $diskinfo = GeniXML::GetDiskImage($node);
return 0
if (!defined($diskinfo));
#
# This one needs more thought, it might be a URL.
#
my $image_token = GeniXML::GetText("name", $diskinfo);
if (!defined($image_token)) {
$image_token = GeniXML::GetText("url", $diskinfo);
return 0
if (!defined($image_token));
}
if (GeniHRN::IsValid($image_token)) {
my ($auth,$ospid) = GeniHRN::ParseImage($image_token);
if (!defined($ospid)) {
$$perrmsg = "Invalid image urn: $image_token";
return 1;
}
}
Genixmlrpc->SetContext(APT_Geni::GeniContext());
my $blob = GeniImage::GetImageData($image_token, \$errmsg);
Genixmlrpc->SetContext(undef);
if (!defined($blob)) {
if (GeniHRN::IsValid($image_token)) {
#
# See if this is for a system image (emulab-ops). If it is,
# and the domain is not the MS, then retry with a MS URN.
#
# This is sorta temporary; at some point there will not be any
# profiles using the URNs that are not in the image tracker.
# Of course a user is free to set the URN to anything the want,
# which is why I expect this code to be here for a while.
#
my $urn;
my $hrn = GeniHRN->new($image_token);
my ($auth,$ospid,$os,$vers) = $hrn->ParseImage();
if ($ospid eq TBOPSPID() && $auth ne "emulab.net") {
$urn = GeniHRN::GenerateImage("emulab.net",
TBOPSPID(), $os, $vers);
Genixmlrpc->SetContext(APT_Geni::GeniContext());
$blob = GeniImage::GetImageData($urn, \$errmsg);
Genixmlrpc->SetContext(undef);
}
}
if (!defined($blob)) {
$$perrmsg = "Could not get info from the image server for ".
"$image_token:\n" . $errmsg;
return 1;
}
}
#
# System Image? We use the URN we get back from CreateSliver().
# The cluster will be the origin for the new image.
#
return 0
if ($blob->{'issystem'});
my $image_urn = $blob->{'urn'};
my $copyback_uuid = $blob->{'version_uuid'};
my $copyback_urn = $image_urn;
my $hrn = GeniHRN->Parse($image_urn);
my (undef,$ospid,$os,$vers) = $hrn->ParseImage();
#
# What happens if the user is doing a snapshot on the cluster where
# the image lives? The copyback (import) makes no sense in that case,
# but what if its the same cluster but different projects? In this case
# we want a standard image clone, and we use whatever URN the cluster
# hands back to us.
#
# Aside; should we allow snapshots (in the web ui) across projects?
#
if (lc($hrn->domain()) eq lc($aggregate->domain())) {
my $projhrn = GeniHRN->Parse($blob->{'project_urn'});
if (!defined($projhrn)) {
$$perrmsg = "Could not parse " . $blob->{'project_urn'} . "\n";
return -1;
}
if (lc($projhrn->subauth()) eq lc($project->pid())) {
# We use the URN we get back from CreateSliver().
return 0;
}
# Ditto
return 0;
}
#
# If we are going to update the profile, we need to know what to
# change the image urn to, and that depends on what version the
# image is currently at, AT THE ORIGIN CLUSTER. The urn we get back
# from the snapshotting cluster is not what we care about, we need
# a urn for the origin cluster. But that depends on what version the
# origin cluster is at (the highest numbered version). But if we are
# doing a snapshot of an earlier version, we cannot generate the
# version here, we have to ask what it will be.
#
if ($blob->{'isversioned'}) {
$copyback_urn = GeniHRN::GenerateImage($hrn->authority(),
$ospid, $os,
$blob->{'maxversion'} + 1);
}
$$puuid = $copyback_uuid;
$$purn = $copyback_urn;
return 0;
}
#
# Ask the console URL for a node in an instance.
#
sub DoConsole()
{
usage()
if (!@ARGV);
my $node_id = shift(@ARGV);
if (defined($webtask_id)) {
$webtask = WebTask->LookupOrCreate(undef, $webtask_id);
if (!defined($webtask)) {
fatal("Could not lookup/create webtask for $webtask_id");
}
# Convenient.
$webtask->AutoStore(1);
}
#
# Sanity check to make sure the node is really in the rspec, since
# we need its sliver urn to ask for the console url.
#
my $sliver_urn;
my $sliver;
foreach my $obj ($instance->AggregateList()) {
my $manifest = GeniXML::Parse($obj->manifest());
if (! defined($manifest)) {
fatal("Could not parse manifest for $obj");
}
my @nodes = GeniXML::FindNodes("n:node", $manifest)->get_nodelist();
foreach my $node (@nodes) {
my $client_id = GeniXML::GetVirtualId($node);
my $urn = GeniXML::GetSliverId($node);
my $manager_urn = GetManagerId($node);
# No sliver urn or a different aggregate.
next
if (! (defined($urn) &&
defined($manager_urn) &&
$manager_urn eq $obj->aggregate_urn()));
if ($node_id eq $client_id) {
$sliver_urn = $urn;
$sliver = $obj;
}
}
}
if (!defined($sliver_urn)) {
fatal("Could not find node '$node_id' in manifest");
}
my $response = $sliver->ConsoleInfo($sliver_urn);
if (!defined($response)) {
fatal("RPC Error calling ConsoleInfo");
}
if ($response->code() == GENIRESPONSE_UNAVAILABLE) {
print STDERR "Server says there is no console for $node_id\n";
if (defined($webtask)) {
$webtask->output("Sorry, $node_id does not have a console line");
$webtask->Exited($response->code());
}
exit($response->code());
}
if ($response->code() == GENIRESPONSE_SEARCHFAILED) {
print STDERR "Server says $node_id has been deallocated\n";
if (defined($webtask)) {
$webtask->output("Sorry, $node_id has been deallocated");
$webtask->Exited($response->code());
}
exit($response->code());
}
if ($response->code() != GENIRESPONSE_SUCCESS) {
$response = $sliver->ConsoleURL($sliver_urn);
if (!defined($response)) {
fatal("RPC Error calling ConsoleURL");
}
if ($response->code() != GENIRESPONSE_SUCCESS) {
if ($response->value()) {
fatal($response->output());
}
fatal("Server returned error: " .
GENIRESPONSE_STRING($response->code));
}
}
my $url;
my $pswd;
my $logurl;
if (ref($response->value())) {
$url = $response->value()->{'url'};
$pswd = $response->value()->{'password'}
if (exists($response->value()->{'password'}));
$logurl = $response->value()->{'logurl'}
if (exists($response->value()->{'logurl'}));
print Dumper($response->value());
}
else {
$url = $response->value();
}
if (defined($webtask)) {
if ($response->code()) {
$webtask->output($response->output());
}
else {
$webtask->url($url);
$webtask->password($pswd) if (defined($pswd));
$webtask->logurl($logurl) if (defined($logurl));
}
$webtask->Exited($response->code());
exit($response->code());
}
# For command line operation too.
if ($response->code()) {
fatal($response->output());
}
print $url . "\n";
print $pswd . "\n" if (defined($pswd));
print $logurl . "\n" if (defined($logurl));
exit(0);
}
#
# Terminate
#
sub DoTerminate()
{
my $errmsg;
my $logfile;
my $expired = $RECORDHISTORY_TERMINATED;
if (@ARGV) {
my $arg = shift(@ARGV);
if ($arg eq "-e") {
$expired = $RECORDHISTORY_EXPIRED;
}
else {
usage();
}
}
my $slice = $instance->GetGeniSlice();
if (!defined($slice)) {
#
# No slice (typically) means we never got far enough to the
# get the sliver created on the backend cluster.
#
goto killit;
}
#
# Lock the slice in case it is doing something else, like taking
# a disk image.
#
if ($slice->Lock()) {
#
# A special case is if the slice is provisioning. This means the
# user is giving up on it, and we want to tell the aggregate to
# kill it. Not all aggregates are going to allow this, so need
# to be able to deal with that.
#
if ($instance->status() ne "provisioned") {
fatal("Slice is busy, cannot lock it");
}
if (!$instance->canceled()) {
print "Marking instance canceled\n";
$instance->MarkCanceled();
}
sleep(1);
# We have an obvious race here since we do not have the lock.
exit(0);
}
my $old_status = $instance->status();
$instance->SetStatus("terminating");
#
# Exit and let caller poll for status.
#
if (!$debug) {
$logfile = TBMakeLogname("terminate");
if (my $childpid = TBBackGround($logfile)) {
my $status = 0;
#
# Wait a couple of seconds to see if there is going to be an
# immediate error. Then return and let it continue to run. This
# allows the web server to see quick errors. Later errors will
# have to be emailed.
#
sleep(3);
my $foo = waitpid($childpid, &WNOHANG);
if ($foo) {
$status = $? >> 8;
}
exit($status);
}
}
my $coderef = sub {
my ($sliver) = @_;
my $urn = $sliver->aggregate_urn();
my $errmsg;
return 0
if ($sliver->status() eq "terminated");
my $response = $sliver->Terminate();
if (!defined($response)) {
$errmsg = "RPC Error calling Terminate";
goto bad;
}
# SEARCHFAILED is success.
if ($response->code() != GENIRESPONSE_SUCCESS &&
$response->code() != GENIRESPONSE_SEARCHFAILED) {
if ($response->code() == GENIRESPONSE_BUSY ||
$response->code() == GENIRESPONSE_SERVER_UNAVAILABLE) {
$errmsg = "Slice was busy for too long; try again later?";
goto bad;
}
$errmsg = "Could not delete slice: ". $response->output();
goto bad;
}
$instance->SetStatus("terminated");
return 0;
bad:
print STDERR "$urn: $errmsg\n";
return -1;
};
#print STDERR Dumper($instance);
my @return_codes = ();
my @agglist = $instance->AggregateList();
if (ParRun({"maxwaittime" => 99999,
"maxchildren" => scalar(@agglist)},
\@return_codes, $coderef, @agglist)) {
#
# The parent caught a signal. Leave things intact so that we can
# kill things cleanly later.
#
$errmsg = "Internal error calling Terminate()";
goto bad;
}
#
# Check the exit codes.
#
foreach my $code (@return_codes) {
if ($code) {
$errmsg = "Some slivers would not terminate";
goto bad;
}
}
$slice->Delete();
$instance->RecordHistory($expired);
killit:
$instance->Delete();
unlink($logfile)
if (defined($logfile));
exit(0);
bad:
print STDERR $errmsg . "\n";
$instance->SetStatus($old_status);
$slice->UnLock();
if (defined($logfile)) {
my $instance_name = $instance->name();
my $slice_uuid = $slice->uuid();
SENDMAIL($TBOPS,
"Unable to terminate instance $uuid",
"Name: $instance_name\n".
"Slice: $slice_uuid\n\n".
"$errmsg\n",
$TBOPS, undef, $logfile)
if (!$silent);
unlink($logfile);
}
exit(1);
}
#
# Request an extension; all this code used to be in PHP, that was silly.
#
sub DoExtend()
{
my $force = 0;
my $lockdown = 0;
my $errcode = 1;
my $autoextend_maximum = GetSiteVar("aptui/autoextend_maximum");
my $autoextend_maxage = GetSiteVar("aptui/autoextend_maxage");
my $autoextend_freedays= 2;
my $creator = $instance->GetGeniUser();
my $slice = $instance->GetGeniSlice();
my $name = $instance->name();
my $url = $instance->webURL();
my $clusters = join(",", map { $_->domain() }
$instance->AggregateList());
my $pcount = $instance->physnode_count();
my $expires_time = str2time($slice->expires());
my $created_time = str2time($instance->created());
my $extensions = $instance->Brand()->ExtensionsEmailAddress();
my $granted = 0;
my $needapproval = 0;
my $message;
my $reason;
my $errmsg;
usage()
if (!@ARGV);
my $wanted = shift(@ARGV);
if (@ARGV == 2) {
my $arg = shift(@ARGV);
if ($arg eq "-m") {
$reason = shift(@ARGV);
}
else {
usage();
}
}
elsif (@ARGV == 1) {
my $filename = shift(@ARGV);
if (! -e $filename) {
fatal("$filename does not exist");
}
open(MSG, $filename) or
fatal("Could not open $filename");
$reason = "";
while () {
$reason .= $_;
}
close(MSG);
}
#
# Create the webtask object; the web interface gave us an anonymous
# webtask, so we can use it before lock.
#
if (defined($webtask_id)) {
$webtask = WebTask->Lookup($webtask_id);
fatal("Could not lookup webtask object")
if (!defined($webtask));
# Convenient.
$webtask->AutoStore(1);
}
#
# Lock the slice in case it is doing something else, like taking
# a disk image.
#
if ($slice->Lock()) {
$errcode = GENIRESPONSE_BUSY;
$errmsg ="Experiment is busy, cannot lock it. Try again later.";
if (defined($webtask)) {
$webtask->output($errmsg);
$webtask->Exited($errcode);
}
print STDERR "$errmsg\n";
exit($errcode);
}
if (defined($reason) &&
!TBcheck_dbslot($reason, "default", "fulltext",
TBDB_CHECKDBSLOT_WARN|TBDB_CHECKDBSLOT_ERROR)) {
$errmsg = "Illegal characters in your reason";
$errcode = 1;
goto bad;
}
if (!TBcheck_dbslot($wanted, "default", "int",
TBDB_CHECKDBSLOT_WARN|TBDB_CHECKDBSLOT_ERROR)) {
$errmsg = "Illegal integer for length";
$errcode = 1;
goto bad;
}
# Helper function.
my $needAdminApproval = sub {
my ($wanted, $granted, $reason, $message) = @_;
# Subtract out the extra free time we added.
my $howlong = $wanted - $granted;
my $new_expires = POSIX::strftime("20%y-%m-%d %H:%M:%S %Z",
localtime(str2time($slice->expires())+
($howlong * 3600 * 24)));
my $created = POSIX::strftime("20%y-%m-%d %H:%M:%S %Z",
localtime(str2time($instance->created())));
$instance->Brand()->SendEmail($extensions,
"Experiment Extension Request: $name",
"A request to extend this experiment was made but requires\n".
"administrator approval" .
($message ? " $message" : "") . ".\n\n" .
"The request was for $wanted days, we granted $granted days, ".
"the reason given is:\n\n".
$reason . "\n\n".
"This experiment was started on $created\n".
"Granting the request would set the expiration to $new_expires\n".
"It is running on $clusters\n".
"\n\n". $url . "\n\n",
$creator->email());
# Flag for the dashboard page.
$instance->ExtensionRequested($reason, $granted);
# Need to return this to the web interface via the webtask.
return "Your request requires admininstrator approval".
($message ? " because $message" : "") . ". " .
"You will receive email if/when your ".
"request is granted (or denied). Thanks!";
};
#
# If no physical nodes (only VMs), double the maximums.
#
if (!$instance->physnode_count()) {
$autoextend_maxage *= 2;
$autoextend_maximum *= 2;
}
#
# Guest users are treated differently.
#
if (!defined($this_user)) {
# Only extend for 24 hours.
$granted = 1;
if ($expires_time > time() + (3600 * 24 * $granted)) {
$errmsg = "You still have a day left. Try again tomorrow";
$errcode = 1;
goto bad;
}
}
#
# Admin user, we do whatever it says to do.
#
elsif ($this_user->IsAdmin()) {
$message = "Your experiment was extended by the site administrator.";
$granted = $wanted;
}
else {
my $diff = $expires_time - time();
my $cdiff = time() - $created_time;
if (! defined($reason)) {
fatal("You must supply a reason for this extension");
}
#
# If admin lockout, we are refusing any more free time.
#
if ($instance->extension_adminonly()) {
$message = "because you are not allowed any more extensions";
$granted = 0;
}
#
# After maxage, all extension requests require admin approval.
#
elsif ($cdiff > (3600 * 24 * $autoextend_maxage)) {
#
# Well, if they asked for less then the free grant, and
# the experiment is going to expire very soon, we give
# them some extra time. This is a nice loophole people will
# probably notice.
#
my $mindiff = $autoextend_freedays * 3600 * 24;
if ($diff < $mindiff) {
$granted = POSIX::ceil(($mindiff - $diff) / (3600 * 24));
}
else {
$granted = 0;
}
if ($wanted > $granted) {
$needapproval = 1;
$message = "because it was started more then ".
"$autoextend_maxage days ago";
}
}
#
# Temporary for GEC23, this should be generalized next time.
#
elsif (0 && (time() + ($wanted * 3600 * 24) >
str2time("2015-06-15 12:00:00"))) {
$granted = 1;
$needapproval = 1;
$message = "because the testbed is mostly reserved for GEC23";
}
#
# Registered users are granted up to the autoextend_maximum
# automatically. Beyond that, requires approval, but we still
# give them whatever the free extension is, since we want to
# give them extra time until the next meeting of the "resource
# management committee."
#
elsif ($wanted > $autoextend_maximum) {
$needapproval = 1;
$message = "because it was for longer then $autoextend_maximum days";
#
# Plenty of time left, no extension just a message.
#
if ($diff > (3600 * 24 * 3)) {
$granted = 0;
}
else {
$granted = $autoextend_maximum;
}
}
elsif ($diff > (3600 * 24 * 7)) {
my $days = POSIX::ceil($diff / (3600 * 24.0));
$errmsg = "You still have $days day(s) left before expiration!";
$errcode = 1;
goto bad;
}
else {
$granted = $wanted;
}
#
# The most we allow is the autoextend_maximum out, no
# matter what they asked for. So, if the autoextend_maximum
# is a week and there are five days left and they asked
# for seven, we give them two.
#
if ($expires_time + ($granted * 3600 * 24) >
time() + (3600 * 24 * $autoextend_maximum)) {
$granted =
POSIX::ceil(((3600 * 24 * $autoextend_maximum) - $diff) /
(3600 * 24.0));
}
}
#
# Do the extension.
#
if ($granted) {
if ($errcode = ExtendInternal($slice,
$granted * 3600 * 24, 0, \$errmsg)) {
goto bad;
}
}
my $expires = POSIX::strftime("20%y-%m-%d %H:%M:%S %Z",
localtime(str2time($slice->expires())));
my $created = POSIX::strftime("20%y-%m-%d %H:%M:%S %Z",
localtime(str2time($instance->created())));
my $now = POSIX::strftime("20%y-%m-%d %H:%M:%S %Z", localtime());
my $before = POSIX::strftime("20%y-%m-%d %H:%M:%S %Z",
localtime($expires_time));
#
# New extension mechanism
#
my $extensionargs = {
"action" => "request",
"wanted" => $wanted,
"granted" => $granted,
"admin" => $this_user->IsAdmin() ? 1 : 0};
if (defined($message)) {
$extensionargs->{"message"} = $message;
}
if (defined($reason)) {
$extensionargs->{"reason"} = $reason;
}
if (defined($this_user)) {
$extensionargs->{"uid"} = $this_user->uid();
$extensionargs->{"uid_idx"} = $this_user->uid_idx();
}
else {
# A guest user, only the creator can request an extension.
$extensionargs->{"uid"} = $instance->creator();
$extensionargs->{"uid_idx"} = $instance->creator_idx();
}
my $extensioninfo =
APT_Instance::ExtensionInfo->Create($instance, $extensionargs);
if (!defined($extensioninfo)) {
print STDERR "Could not create extension info object\n";
}
#
# We store each extension request in an ongoing text field.
#
my $text =
"Date: $now\n".
"Wanted: $wanted, Granted: $granted\n".
"Before: $before\n".
"After $expires\n".
"Reason:\n".
$reason . "\n\n".
"-----------------------------------------------\n";
$instance->AddExtensionHistory($text);
if ($needapproval) {
$errmsg = &$needAdminApproval($wanted, $granted, $reason, $message);
# The web interface (JS code) uses this error code.
$errcode = 2;
goto bad;
}
$instance->Brand()->SendEmail($creator->email(),
"Experiment Extension: $name",
($this_user->IsAdmin() ? "$message\n\n$reason" :
"A request to extend your experiment was made and ".
"granted.\n".
"Your reason was:\n\n". $reason) .
"\n\n".
"Your experiment was started on $created\n".
"Your experiment will now expire at $expires\n".
"You are using $pcount physical nodes.\n".
"It is running on $clusters\n\n".
"$url\n",
"$extensions",
"BCC: $extensions");
if (!$this_user->IsAdmin()) {
#
# We do not want to overwrite the reason in the DB if this
# was an admin extension, we want to keep whatever the user
# has written previously. This currently used by the web interface
# to show the latest reason.
#
$instance->Update({"extension_reason" => $reason});
}
else {
#
# Any time an admin issues an extension, we clear the flag that tells
# the dashboard page there is an oustanding request.
#
$instance->Update({"extension_requested" => 0});
}
$instance->BumpExtensionCount($granted);
if (defined($webtask)) {
$webtask->Exited(0);
}
$slice->UnLock();
exit(0);
bad:
$slice->UnLock();
print STDERR $errmsg . "\n";
if (defined($webtask)) {
$webtask->output($errmsg);
$webtask->Exited($errcode);
}
exit($errcode);
}
sub ExtendInternal($$$$)
{
my ($slice, $seconds, $force, $perrmsg) = @_;
my $lockdown = 0;
my $errcode = -1;
my $errmsg;
# Save in case of error.
my $oldexpires = $slice->expires();
# Lockdown on admin extensions longer then XX days.
if (defined($this_user) && $this_user->IsAdmin() &&
($seconds / (24 * 60 * 60)) > 10) {
$lockdown = 1
}
# Need to update slice before creating new credential.
if ($slice->IsExpired()) {
$slice->SetExpiration(time() + $seconds);
}
else {
$slice->AddToExpiration($seconds);
}
my $new_expires = $slice->ExpirationGMT();
my $coderef = sub {
my ($sliver) = @_;
my $webtask = $sliver->webtask();
my $domain = $sliver->domain();
my $errmsg;
my $response = $sliver->Extend($new_expires, $this_user);
if (!defined($response)) {
$errmsg = "Internal error calling Renew at $domain";
goto bad;
}
if ($response->code() != GENIRESPONSE_SUCCESS) {
$errmsg = "Failed to extend slice at $domain: ".
$response->output();
# This is something the user should see.
if ($response->code() == GENIRESPONSE_REFUSED ||
$response->code() == GENIRESPONSE_SERVER_UNAVAILABLE ||
$response->code() == GENIRESPONSE_BUSY) {
# For web interface.
$webtask->output($errmsg);
$webtask->Exited($response->code());
return 1;
}
goto bad;
}
return 0;
bad:
print STDERR "$errmsg\n";
$webtask->output($errmsg);
$webtask->Exited(-1);
return -1;
};
my @return_codes = ();
my @agglist = $instance->AggregateList();
if (ParRun({"maxwaittime" => 99999,
"maxchildren" => scalar(@agglist)},
\@return_codes, $coderef, @agglist)) {
#
# The parent caught a signal. Leave things intact so that we can
# kill things cleanly later.
#
$errmsg = "Internal error calling Extend\n";
goto bad;
}
#
# Check the exit codes.
#
foreach my $agg (@agglist) {
my $code = shift(@return_codes);
if ($code) {
$agg->webtask()->Refresh();
$errmsg = $agg->webtask()->output();
$errcode = $agg->webtask()->exitcode();
goto bad;
}
}
# Lockdown.
if ($lockdown) {
if (DoLockdownInternal("set", "admin")) {
SENDMAIL($TBOPS,
"Failed to lock down APT Instance",
"Failed to lock down $instance\n".
$instance->webURL() . "\n",
$TBOPS);
}
}
return 0;
bad:
# Reset back to original expiration, sorry.
$slice->SetExpiration($oldexpires);
$$perrmsg = $errmsg;
return $errcode;
}
#
# Deny extension, sending optional email to user (which is also saved in
# the extension history). We used to do this in PHP, which was silly.
#
sub DoDenyOrMoreInfo($)
{
my ($action) = @_;
my $errcode = -1;
my $reason;
if (! $this_user->IsAdmin()) {
fatal("Only administrators can deny extensions or request info");
}
if (@ARGV == 2) {
my $arg = shift(@ARGV);
if ($arg eq "-m") {
$reason = shift(@ARGV);
}
else {
usage();
}
}
elsif (@ARGV == 1) {
my $filename = shift(@ARGV);
if (! -e $filename) {
fatal("$filename does not exist");
}
open(MSG, $filename) or
fatal("Could not open $filename");
$reason = "";
while () {
$reason .= $_;
}
close(MSG);
}
my $creator = $instance->GetGeniUser();
my $slice = $instance->GetGeniSlice();
my $name = $instance->name();
my $expires = POSIX::strftime("20%y-%m-%d %H:%M:%S %Z",
localtime(str2time($slice->expires())));
my $created = POSIX::strftime("20%y-%m-%d %H:%M:%S %Z",
localtime(str2time($instance->created())));
my $now = POSIX::strftime("20%y-%m-%d %H:%M:%S %Z", localtime());
my $url = $instance->webURL();
my $pcount = $instance->physnode_count();
my $extensions= $instance->Brand()->ExtensionsEmailAddress();
my $clusters = join(",", map { $_->domain() }
$instance->AggregateList());
my ($message,$subject);
if ($action eq "deny") {
$message = "Your extension was denied by the site administrator!";
$subject = "Experiment Extension Denied: $name";
}
else {
$message = "Hi, we need more information about your experiment: $name";
$subject = "Information request for Experiment: $name";
}
#
# New extension mechanism
#
my $extensionargs = {
"action" => ($action eq "deny" ? "deny" : "info"),
"uid" => $this_user->uid(),
"uid_idx" => $this_user->uid_idx(),
"message" => $message,
"admin" => $this_user->IsAdmin() ? 1 : 0};
if (defined($reason)) {
$extensionargs->{"reason"} = $reason;
}
my $extensioninfo =
APT_Instance::ExtensionInfo->Create($instance, $extensionargs);
if (!defined($extensioninfo)) {
print STDERR "Could not create extension info object\n";
return -1;
}
#
# We store each extension request in an ongoing text field.
#
my $text =
"Date: $now\n".
"Expires: $expires\n".
"Reason:\n".
$message . "\n\n".
$reason . "\n\n".
"-----------------------------------------------\n";
$instance->Brand()->SendEmail($creator->email(), $subject,
$message . "\n\n" .
$reason . "\n\n".
"Your experiment was started on $created\n".
"Your experiment expires at $expires\n".
"You are using $pcount physical nodes.\n".
"It is running on $clusters\n\n".
"$url\n",
"$extensions",
"BCC: $extensions");
$instance->AddExtensionHistory($text);
# For the dashboard and status page.
if ($action eq "deny") {
$instance->Update({"extension_requested" => 0,
"extension_denied" => 1,
"extension_denied_reason" => $reason});
}
return 0;
}
#
# Old Extend.
#
sub DoExtendOld()
{
my $force = 0;
my $lockdown = 0;
my $errcode = -1;
usage()
if (!@ARGV);
if (@ARGV == 2) {
my $arg = shift(@ARGV);
if ($arg eq "-f") {
$force = 1;
}
else {
usage();
}
}
my $seconds = shift(@ARGV);
if ($seconds !~ /^\d*$/) {
usage();
}
if ($instance->status() eq "failed" && !$force) {
fatal("Cannot extend failed instance!");
}
my $slice = $instance->GetGeniSlice();
if (!defined($slice)) {
fatal("No slice for instance!");
}
#
# Lock the slice in case it is doing something else, like taking
# a disk image. This happens all the time, users are silly. Lets
# stop the email about it.
#
if ($slice->Lock()) {
print STDERR "Experiment is busy, cannot lock it. Try again later.\n";
exit(GENIRESPONSE_BUSY);
}
# Save in case of error.
my $oldexpires = $slice->expires();
# Lockdown on admin extensions longer then XX days.
if (defined($this_user) && $this_user->IsAdmin() &&
($seconds / (24 * 60 * 60)) > 10) {
$lockdown = 1
}
# Need to update slice before creating new credential.
$slice->AddToExpiration($seconds);
my $new_expires = $slice->ExpirationGMT();
my $coderef = sub {
my ($sliver) = @_;
my $webtask = $sliver->webtask();
my $domain = $sliver->domain();
my $errmsg;
my $response = $sliver->Extend($new_expires, $this_user);
if (!defined($response)) {
$errmsg = "Internal error calling Renew at $domain";
goto bad;
}
if ($response->code() != GENIRESPONSE_SUCCESS) {
# This is something the user should see.
if ($response->code() == GENIRESPONSE_REFUSED ||
$response->code() == GENIRESPONSE_SERVER_UNAVAILABLE ||
$response->code() == GENIRESPONSE_BUSY) {
print STDERR $response->output() . "\n";
# For web interface.
$webtask->output($response->output());
$webtask->Exited($response->code());
return 1;
}
$errmsg = "Failed to extend slice at $domain: ".
$response->output();
goto bad;
}
return 0;
bad:
print STDERR "$errmsg\n";
$webtask->output($errmsg);
$webtask->Exited(-1);
return -1;
};
my @return_codes = ();
my @agglist = $instance->AggregateList();
if (ParRun({"maxwaittime" => 99999,
"maxchildren" => scalar(@agglist)},
\@return_codes, $coderef, @agglist)) {
#
# The parent caught a signal. Leave things intact so that we can
# kill things cleanly later.
#
print STDERR "Internal error calling Extend\b";
goto bad;
}
#
# Check the exit codes.
#
foreach my $agg (@agglist) {
my $code = shift(@return_codes);
if ($code) {
$agg->webtask()->Refresh();
print STDERR "Some slivers could not be extended.\n";
$errcode = $agg->webtask()->exitcode();
goto bad;
}
}
# Lockdown.
if ($lockdown) {
if (DoLockdownInternal("set", "admin")) {
SENDMAIL($TBOPS,
"Failed to lock down APT Instance",
"Failed to lock down $instance\n".
$instance->webURL() . "\n",
$TBOPS);
}
}
$slice->UnLock();
exit(0);
bad:
# Reset back to original expiration, sorry.
$slice->SetExpiration($oldexpires);
$slice->UnLock();
exit($errcode);
}
#
# Refresh; ask the aggregate for status and set the instance status
# accordingly.
#
sub DoRefresh()
{
my $errmsg;
my $slice = $instance->GetGeniSlice();
if (!defined($slice)) {
print STDERR "No slice for instance\n";
goto killit;
}
#
# Lock the slice in case it is doing something else, like taking
# a disk image.
#
if ($slice->Lock()) {
$errmsg = "Experiment is busy, cannot lock it. Please try again later";
goto bad;
}
#
# Create the webtask object, but AFTER locking the slice so we do
# not destroy one in use.
#
if (defined($webtask_id)) {
$webtask = WebTask->LookupOrCreate($instance->uuid(), $webtask_id);
# Convenient.
$webtask->AutoStore(1);
}
my $coderef = sub {
my ($sliver) = @_;
my $webtask = $sliver->webtask();
my $errmsg;
my $response = $sliver->SliceStatus();
if (!defined($response)) {
$errmsg = "RPC Error calling SliceStatus";
goto bad;
}
if ($response->code() != GENIRESPONSE_SUCCESS) {
if ($response->code() == GENIRESPONSE_SEARCHFAILED) {
$errmsg = "Slice is gone";
goto bad;
}
if ($response->code() == GENIRESPONSE_BUSY) {
$errmsg = "Slice is busy; try again later";
goto bad;
}
$errmsg = "Could not get status: ". $response->output();
goto bad;
}
my $blob = $response->value();
if ($blob->{'status'} eq "ready") {
$sliver->SetStatus("ready");
}
elsif ($blob->{'status'} eq "failed") {
$sliver->SetStatus("failed");
}
# This is the per-aggregate status, we always set this for web UI.
my $statusblob = $sliver->UpdateWebStatus($blob->{'details'});
if ($debug) {
print STDERR Dumper($statusblob);
}
return 0;
bad:
print STDERR "$errmsg\n";
$webtask->output($errmsg);
$webtask->Exited(1);
return 1;
};
my @return_codes = ();
my @agglist = $instance->AggregateList();
if (ParRun({"maxwaittime" => 99999,
"maxchildren" => scalar(@agglist)},
\@return_codes, $coderef, @agglist)) {
#
# The parent caught a signal. Leave things intact so that we can
# kill things cleanly later.
#
$errmsg = "Internal error calling Refresh";
goto bad;
}
#
# Check the exit codes.
#
foreach my $agg (@agglist) {
my $code = shift(@return_codes);
if ($code) {
$agg->webtask()->Refresh();
$errmsg = "Some slivers could not be refreshed";
if ($agg->webtask()->output()) {
$errmsg .= ": " . $agg->webtask()->output();
}
goto bad;
}
}
$slice->UnLock();
exit(0);
killit:
$instance->RecordHistory($RECORDHISTORY_TERMINATED);
$instance->Delete();
exit(0);
bad:
$slice->UnLock();
print STDERR $errmsg . "\n";
if (defined($webtask)) {
$webtask->output($errmsg);
$webtask->Exited(1);
}
exit(1);
}
#
# Reboot or Reload nodes.
#
sub DoRebootOrReload($)
{
my ($which) = @_;
my $errmsg;
usage()
if (!@ARGV);
my $slice = $instance->GetGeniSlice();
if (!defined($slice)) {
print STDERR "No slice for instance\n";
goto killit;
}
my %sliver_urns = ();
my %node_ids = ();
my @slivers = ();
foreach my $obj ($instance->AggregateList()) {
my $manifest = GeniXML::Parse($obj->manifest());
if (! defined($manifest)) {
fatal("Could not parse manifest");
}
my @nodes = GeniXML::FindNodes("n:node", $manifest)->get_nodelist();
foreach my $node (@nodes) {
my $client_id = GeniXML::GetVirtualId($node);
if (grep {$_ eq $client_id} @ARGV) {
my $sliver_urn = GeniXML::GetSliverId($node);
my $manager_urn = GetManagerId($node);
# No sliver urn or a different aggregate.
next
if (! (defined($sliver_urn) &&
defined($manager_urn) &&
$manager_urn eq $obj->aggregate_urn()));
if (!exists($sliver_urns{$obj->aggregate_urn()})) {
$sliver_urns{$obj->aggregate_urn()} = [];
push(@slivers, $obj);
}
push(@{ $sliver_urns{$obj->aggregate_urn()} }, $sliver_urn);
$node_ids{$sliver_urn} = $client_id;
}
}
}
#
# Lock the slice in case it is doing something else, like taking
# a disk image.
#
if ($slice->Lock()) {
$errmsg = "Experiment is busy, cannot lock it. Please try again later";
goto bad;
}
#
# Create the webtask object, but AFTER locking the slice so we do
# not destroy one in use.
#
if (defined($webtask_id)) {
$webtask = WebTask->LookupOrCreate($instance->uuid(), $webtask_id);
# Convenient.
$webtask->AutoStore(1);
}
my $coderef = sub {
my ($sliver) = @_;
my $webtask = $sliver->webtask();
my @urns = @{ $sliver_urns{$sliver->aggregate_urn()} };
my $errmsg;
my $response = $sliver->SliverAction(\$errmsg, $which, @urns);
if (!defined($response)) {
$errmsg = "RPC Error calling SliverAction";
goto bad;
}
if ($response->code() != GENIRESPONSE_SUCCESS) {
if ($response->code() == GENIRESPONSE_SEARCHFAILED) {
print STDERR "Slice is already gone on $sliver";
goto gone;
}
if ($response->code() == GENIRESPONSE_BUSY) {
$errmsg = "Experiment is busy; try again later";
goto bad;
}
$errmsg = $response->output();
goto bad;
}
gone:
# Tell the web interface something is different. Real status will
# come later when the monitor starts up.
if ($webtask->sliverstatus()) {
my $blob = $webtask->sliverstatus();
foreach my $urn (@urns) {
my $node_id = $node_ids{$urn};
$blob->{$node_id}->{'status'} = "changing";
}
$webtask->sliverstatus($blob);
}
return 0;
bad:
print STDERR "$errmsg\n";
$webtask->output($errmsg);
$webtask->Exited(1);
return 1;
};
my @return_codes = ();
if (ParRun({"maxwaittime" => 99999,
"maxchildren" => scalar(@slivers)},
\@return_codes, $coderef, @slivers)) {
#
# The parent caught a signal. Leave things intact so that we can
# kill things cleanly later.
#
$errmsg = "Internal error calling SliverAction";
goto bad;
}
#
# Check the exit codes.
#
foreach my $code (@return_codes) {
if ($code) {
$errmsg = "Some slivers could not be ${which}'ed";
goto bad;
}
}
$slice->UnLock();
if (defined($webtask)) {
$webtask->Exited(0);
}
#
# Start the monitor so the web interface will see when the node
# has actually come back up.
#
# XXX This will not return unless a monitor is already running.
StartMonitorInternal();
exit(0);
killit:
$instance->RecordHistory($RECORDHISTORY_TERMINATED);
$instance->Delete();
exit(0);
bad:
$slice->UnLock();
print STDERR $errmsg . "\n";
if (defined($webtask)) {
$webtask->output($errmsg);
$webtask->Exited(1);
}
exit(1);
}
sub DoReboot() { return DoRebootOrReload("reboot"); }
sub DoReload() { return DoRebootOrReload("reload"); }
#
#
#
sub DoManifests()
{
my $errmsg;
my $slice = $instance->GetGeniSlice();
if (!defined($slice)) {
print STDERR "No slice for instance\n";
goto killit;
}
my $coderef = sub {
my ($sliver) = @_;
my $webtask = $sliver->webtask();
my $errmsg;
my $response = $sliver->GetManifest();
if (!defined($response)) {
$errmsg = "RPC Error calling GetManifest";
goto bad;
}
return 0;
bad:
return 1;
};
my @return_codes = ();
my @agglist = $instance->AggregateList();
if (ParRun({"maxwaittime" => 99999,
"maxchildren" => scalar(@agglist)},
\@return_codes, $coderef, @agglist)) {
#
# The parent caught a signal. Leave things intact so that we can
# kill things cleanly later.
#
$errmsg = "Internal error calling GetManifest";
goto bad;
}
#
# Check the exit codes.
#
foreach my $code (@return_codes) {
if ($code) {
$errmsg = "Could not get manifest for some slivers";
goto bad;
}
}
exit(0);
bad:
print STDERR $errmsg . "\n";
exit(1);
}
#
# Delete nodes.
#
sub DoDeleteNodes()
{
my $logname;
my $errmsg;
my $errcode = 1;
usage()
if (!@ARGV);
my $slice = $instance->GetGeniSlice();
if (!defined($slice)) {
fatal("No slice for instance");
}
my @aggregates = ();
my %node_ids = ();
my %aggmap = ();
foreach my $obj ($instance->AggregateList()) {
my $manifest = GeniXML::Parse($obj->manifest());
if (! defined($manifest)) {
fatal("Could not parse manifest");
}
my @nodes = GeniXML::FindNodes("n:node", $manifest)->get_nodelist();
foreach my $node (@nodes) {
my $client_id = GeniXML::GetVirtualId($node);
if (grep {$_ eq $client_id} @ARGV) {
my $sliver_urn = GeniXML::GetSliverId($node);
my $manager_urn = GetManagerId($node);
# No sliver urn or a different aggregate.
next
if (! (defined($sliver_urn) &&
defined($manager_urn) &&
$manager_urn eq $obj->aggregate_urn()));
if (!exists($aggmap{$obj->aggregate_urn()})) {
$aggmap{$obj->aggregate_urn()} = [];
push(@aggregates, $obj);
}
push(@{ $aggmap{$obj->aggregate_urn()} }, $client_id);
$node_ids{$sliver_urn} = $client_id;
}
}
}
#
# Lock the slice in case it is doing something else, like taking
# a disk image.
#
if ($slice->Lock()) {
$errmsg = "Experiment is busy, cannot lock it. Please try again later";
goto bad;
}
#
# Create the webtask object, but AFTER locking the slice so we do
# not destroy one in use.
#
if (defined($webtask_id)) {
$webtask = WebTask->LookupOrCreate($instance->uuid(), $webtask_id);
# Convenient.
$webtask->AutoStore(1);
}
my $coderef = sub {
my ($sliver) = @_;
my $webtask = $sliver->webtask();
my @nodes = @{ $aggmap{$sliver->aggregate_urn()} };
my $errcode = -1;
my $errmsg;
$sliver->SetStatus("provisioning");
my $response = $sliver->DeleteNodes(\$errmsg, @nodes);
if (!defined($response)) {
$errmsg = "RPC Error calling DeleteNode";
goto bad;
}
if ($response->code() != GENIRESPONSE_SUCCESS) {
if ($response->code() == GENIRESPONSE_SEARCHFAILED) {
print STDERR "Slice is gone on $sliver";
goto bad;
}
if ($response->code() == GENIRESPONSE_BUSY) {
$errmsg = "Experiment is busy; try again later";
goto bad;
}
$errmsg = $response->output();
$errcode = $response->code();
goto bad;
}
# We get back a new manifest.
my $manifest = $response->value();
$sliver->SetManifest($manifest);
# Delete the nodes from the status blob.
if ($webtask->sliverstatus()) {
my $blob = $webtask->sliverstatus();
foreach my $node_id (@nodes) {
delete($blob->{$node_id});
}
$webtask->sliverstatus($blob);
}
$sliver->SetStatus("provisioned");
return 0;
bad:
$sliver->SetStatus("ready");
$webtask->output($errmsg);
$webtask->Exited($errcode);
print STDERR "Returning $errcode from coderef\n";
return $errcode;
};
#
# Set the status back to provisioning for the web interface.
#
$instance->SetStatus("provisioning");
my @return_codes = ();
if (ParRun({"maxwaittime" => 99999,
"maxchildren" => scalar(@aggregates)},
\@return_codes, $coderef, @aggregates)) {
#
# The parent caught a signal. Leave things intact so that we can
# kill things cleanly later.
#
$errmsg = "Internal error calling DeleteNodes";
goto bad;
}
#
# Check the exit codes.
#
foreach my $aggobj (@aggregates) {
my $code = shift(@return_codes);
# Updated in a forked child, must refresh.
$aggobj->Refresh();
if ($code) {
if ($aggobj->webtask()->output()) {
$errmsg = $aggobj->webtask()->output();
}
else {
$errmsg = "Some nodes could not be deleted";
}
$errcode = $aggobj->webtask()->exitcode();
goto bad;
}
}
#
# Let the web interface continue, we poll now.
#
if (!$debug) {
$logname = TBMakeLogname("deletenode");
if (TBBackGround($logname)) {
exit(0);
}
}
$instance->SetStatus("provisioned");
$instance->ComputeNodeCounts();
@return_codes = ();
if (ParRun({"maxwaittime" => 99999,
"maxchildren" => scalar(@aggregates)}, \@return_codes,
\&APT_Instance::Aggregate::WaitForSliver, @aggregates)) {
#
# The parent caught a signal. Leave things intact so that we can
# kill things cleanly later.
#
$errmsg = "Internal error waiting for slivers";
goto bad;
}
#
# Check the exit codes.
#
foreach my $aggobj (@aggregates) {
my $code = shift(@return_codes);
# Updated in a forked child, must refresh.
$aggobj->Refresh();
if ($code) {
if ($aggobj->webtask()->output()) {
$errmsg = $aggobj->webtask()->output();
}
else {
$errmsg = "WaitforSliver Failure at ".$aggobj->aggregate_urn();
}
$errcode = $aggobj->webtask()->output();
goto bad;
}
}
$slice->UnLock();
$instance->SetStatus("ready");
$webtask->Exited(0);
exit(0);
bad:
$instance->SetStatus("ready");
$slice->UnLock();
print STDERR $errmsg . "\n";
if (defined($webtask)) {
$webtask->output($errmsg);
$webtask->Exited($errcode);
}
exit($errcode);
}
#
# Start up the monitor for an instance. Only one though.
#
sub StartMonitor()
{
my $waitforstartup = 0;
if (@ARGV && $ARGV[0] eq "-w") {
$waitforstartup = 1;
}
if (defined($webtask_id)) {
$webtask = WebTask->LookupOrCreate($instance->uuid(), $webtask_id);
if (!defined($webtask)) {
fatal("Could not lookup/create webtask for $webtask_id");
}
# Convenient.
$webtask->AutoStore(1);
}
return StartMonitorInternal($waitforstartup);
}
sub StartMonitorInternal(;$@)
{
my ($waitforstartup, @aggregatelist) = @_;
my $logfile;
my $signaled = 0;
# Wait for the startup command to finish.
$waitforstartup = 0
if (!defined($waitforstartup));
my $slice = $instance->GetGeniSlice();
if (!defined($slice)) {
fatal("No slice for instance");
}
if ($instance->monitor_pid()) {
my $pid = $instance->monitor_pid();
if (kill(0, $pid)) {
print STDERR "Monitor already running ($pid). ".
"Kill it before starting a new one.\n";
return 1;
}
$instance->Update({"monitor_pid" => 0});
}
if (!$debug) {
$logfile = TBMakeLogname("aptmonitor");
if (TBBackGround($logfile)) {
return $PID;
}
}
$instance->Update({"monitor_pid" => '$PID'});
#
# We just did the operation, no need to ask so soon, and we
# avoid locking the slice in case the user wants to reboot
# another node right away. For reboot/reload, nothing interesting
# is going to be reported for a while.
#
sleep(30);
my $seconds = ($waitforstartup ? 7200 : 900);
my $interval = 15;
# Shorten default timeout now.
Genixmlrpc->SetTimeout(30);
my $coderef = sub {
my ($sliver) = @_;
my $webtask = $sliver->webtask();
my $errmsg;
my $response = $sliver->SliceStatus();
if (!defined($response)) {
print STDERR "RPC Error calling SliceStatus\n";
return GENIRESPONSE_RPCERROR;
}
if (($response->code() != GENIRESPONSE_SUCCESS &&
$response->code() != GENIRESPONSE_SERVER_UNAVAILABLE &&
$response->code() != GENIRESPONSE_BUSY)) {
print STDERR "SliverStatus failed";
print STDERR ": " . $response->output() . "\n";
if (defined($webtask)) {
if ($response->output() =~ /read timeout/) {
$webtask->output("Lost contact with the aggregate. " .
"Possibly a network failure, ".
"please try again later.");
}
else {
$webtask->output($response->output());
}
$webtask->exitcode($response->code());
}
return -1;
}
if ($response->code() == GENIRESPONSE_BUSY) {
# Indicate not done.
return GENIRESPONSE_BUSY;
}
my $blob = $response->value();
# This is the per-aggregate status, we always set this for web UI.
my $statusblob = $sliver->UpdateWebStatus($blob->{'details'});
if ($debug) {
print STDERR Dumper($statusblob);
}
# Look for nodes still executing
my $executing = 0;
if ($waitforstartup) {
foreach my $node_id (keys(%{$statusblob})) {
my $details = $statusblob->{'node_id'};
$executing++
if (exists($details->{'execute_state'}) &&
$details->{'execute_state'} ne "exited");
}
}
#
# We poll until the status goes ready, and if waiting for the
# startup commands to finish, for all of them to no longer be
# executing.
#
if ($blob->{'status'} eq "ready") {
return 0
if (!$executing || !$waitforstartup);
}
# Not done yet.
return 1;
};
while ($seconds > 0) {
$seconds -= $interval;
#
# Lock the slice in case it is doing something else, like taking
# a disk image. Just skip this turn.
#
goto delay
if ($slice->Lock());
my $handler = sub {
# This is so we can catch when Parrun gets signaled, but not
# exit till it exits.
$signaled = 1;
};
local $SIG{TERM} = $handler;
if ($debug) {
local $SIG{INT} = $handler;
}
my @return_codes = ();
my @agglist = $instance->AggregateList() if (! @aggregatelist);
if (ParRun({"maxwaittime" => 99999,
"maxchildren" => scalar(@agglist)},
\@return_codes, $coderef, @agglist)) {
print STDERR "Internal error calling Status()\n";
$slice->UnLock();
last;
}
local $SIG{TERM} = 'DEFAULT';
local $SIG{INT} = 'DEFAULT';
$slice->UnLock();
#
# Check the exit codes.
#
my $done = 1;
foreach my $code (@return_codes) {
if ($code) {
last
if ($code < 0);
$done = 0;
}
}
last
if ($done);
delay:
sleep($interval);
}
unlink($logfile)
if (defined($logfile) && !$debug);
exit($seconds < 0 ? -1 : 0);
}
#
# Experiment lockdown.
#
sub DoLockdownInternal($$)
{
my ($setclr,$which) = @_;
my $slice = $instance->GetGeniSlice();
if (!defined($slice)) {
fatal("No slice for instance");
}
if ($which eq "all") {
if ($instance->SetLockdown("user", ($setclr eq "clear" ? 1 : 0))) {
print STDERR "Could not update instance lockdown\n";
return -1
}
$which = "admin"
}
if ($instance->SetLockdown($which, ($setclr eq "clear" ? 1 : 0))) {
print STDERR "Could not update instance lockdown\n";
return -1
}
my $clear = ($instance->admin_lockdown() ||
$instance->user_lockdown() ? 0 : 1);
#
# Have to set/clear the lockdown on the local slice.
#
if ($slice->SetLockdown($clear)) {
print STDERR "Could not update slice lockdown\n";
return -1
}
#
# And tell the backend clusters to lockdown the slice.
#
my $coderef = sub {
my ($sliver) = @_;
#
# We cannot do lockdown at AL2S. More generally, it only works at
# Emulab based aggregates, but we do not talk to other aggregates.
#
if ($sliver->isAL2S()) {
return 0
if ($clear);
my $project = $instance->GetProject();
$project->SendEmail($this_user->email(),
"Failed to completely lock down APT Instance",
"Failed to completely lock down $instance,\n".
"cause it crosses AL2S.\n\n".
$instance->webURL() . "\n",
$project->OpsEmailAddress(),
"CC: " . $project->OpsEmailAddress());
return 0;
}
my $response = $sliver->Lockdown($clear);
if (!defined($response)) {
print STDERR "RPC Error calling Lockdown\n";
return -1;
}
if ($response->code() != GENIRESPONSE_SUCCESS) {
print STDERR "Could not lockdown sliver: ".
$response->output() . "\n";
return -1;
}
return 0;
};
my @return_codes = ();
my @agglist = $instance->AggregateList();
if (ParRun({"maxwaittime" => 99999,
"maxchildren" => scalar(@agglist)},
\@return_codes, $coderef, @agglist)) {
print STDERR "Internal error calling Lockdown()\n";
return -1;
}
#
# Check the exit codes.
#
foreach my $code (@return_codes) {
if ($code) {
print STDERR "Some slivers could not be locked down.\n";
return -1;
}
}
return 0;
}
sub DoLockdown()
{
usage()
if (@ARGV != 2);
my $setclr = shift(@ARGV);
my $which = shift(@ARGV);
fatal("Must specify either 'admin' or 'user'")
if ($which !~ /^(admin|user|all)$/);
fatal("Must specify either 'set' or 'clear'")
if ($setclr !~ /^(set|clear)$/);
my $slice = $instance->GetGeniSlice();
if (!defined($slice)) {
fatal("No slice for instance");
}
if ($slice->Lock()) {
fatal("Experiment is busy, cannot lock it. Please try again later");
}
if (DoLockdownInternal($setclr, $which)) {
$slice->UnLock();
fatal("Could not lockdown instance!");
}
$slice->UnLock();
exit(0);
}
sub DoPanic()
{
my $emsg;
usage()
if (@ARGV != 1);
my $setclr = shift(@ARGV);
fatal("Must specify either 'set' or 'clear'")
if ($setclr !~ /^(set|clear)$/);
my $slice = $instance->GetGeniSlice();
if (!defined($slice)) {
fatal("No slice for instance");
}
if ($slice->Lock()) {
fatal("Experiment is busy, cannot lock it. Please try again later");
}
#
# Create the webtask object, but AFTER locking the slice so we do
# not destroy one in use.
#
if (defined($webtask_id)) {
$webtask = WebTask->LookupOrCreate($instance->uuid(), $webtask_id);
# Convenient.
$webtask->AutoStore(1);
}
#
# And tell the backend clusters to lockdown the slice.
#
my $coderef = sub {
my ($sliver) = @_;
my $webtask = $sliver->webtask();
my $response = $sliver->Panic(($setclr eq "clear" ? 1 : 0));
if (!defined($response)) {
print STDERR "RPC Error calling Panic\n";
return -1;
}
if ($response->code() != GENIRESPONSE_SUCCESS) {
print STDERR "Could not panic sliver: ".
$response->output() . "\n";
return -1;
}
# Tell the web interface something is different. Real status will
# come later when the monitor starts up.
if ($webtask->sliverstatus()) {
my $blob = $webtask->sliverstatus();
foreach my $node_id (keys(%{ $blob })) {
$blob->{$node_id}->{'status'} = "changing";
}
$webtask->sliverstatus($blob);
}
return 0;
};
my @return_codes = ();
my @agglist = $instance->AggregateList();
if (ParRun({"maxwaittime" => 99999,
"maxchildren" => scalar(@agglist)},
\@return_codes, $coderef, @agglist)) {
$emsg = "Internal error calling Lockdown()";
goto bad;
}
#
# Check the exit codes.
#
foreach my $code (@return_codes) {
if ($code) {
print STDERR "Some slivers could not be paniced";
goto bad;
}
}
if ($instance->SetPanic(($setclr eq "clear" ? 1 : 0))) {
$emsg = "Could not update instance panic flag";
goto bad;
}
StartMonitorInternal();
$slice->UnLock();
exit(0);
bad:
$slice->UnLock();
exit(-1);
}
#
# Linktest
#
sub DoLinktest()
{
my $action = "start";
my $level = 1;
my $errmsg;
my $errcode = 1;
if (@ARGV) {
my $arg = shift(@ARGV);
if ($arg eq "-k") {
$action = "stop";
}
elsif ($arg =~ /^\d$/ && $arg >= 1 && $arg <= 4) {
$level = $arg;
}
else {
usage();
}
}
my $slice = $instance->GetGeniSlice();
if (!defined($slice)) {
fatal("No slice for instance!");
}
#
# Lock the slice in case it is doing something else, like taking
# a disk image.
#
if ($slice->Lock()) {
fatal("Slice is busy, cannot lock it");
}
# Check after lock to prevent concurrent startup.
if ($action eq "start") {
if ($instance->status() ne "ready") {
$slice->UnLock();
fatal("Must be ready to run linktest!");
}
}
else {
if ($instance->status() ne "linktest") {
$slice->UnLock();
fatal("Linktest is not running!");
}
}
#
# Create the webtask object, but AFTER locking the slice so we do
# not destroy one in use.
#
if (defined($webtask_id)) {
$webtask = WebTask->LookupOrCreate($instance->uuid(), $webtask_id);
# Convenient.
$webtask->AutoStore(1);
}
#
# And tell the backend clusters to run linktest
#
my $coderef = sub {
my ($sliver) = @_;
my $webtask = $sliver->webtask();
my $response = $sliver->RunLinktest($action, $level);
if (!defined($response)) {
print STDERR "RPC Error calling linktest on $sliver\n";
return -1;
}
if ($response->code() != GENIRESPONSE_SUCCESS) {
print STDERR "Could not $action linktest on sliver: ".
$response->output() . "\n";
$webtask->output($response->output());
$webtask->Exited($response->code());
return $response->code();
}
my $blob = $response->value();
if ($blob->{'status'} eq "running") {
$webtask->status("running");
$webtask->url($blob->{'url'});
}
elsif ($blob->{'status'} eq "stopped") {
$webtask->status("stopped");
$webtask->results($blob->{'results'});
$webtask->Exited(0);
}
return 0;
};
# Change status now.
my $old_status = $instance->status();
$instance->SetStatus("linktest");
my @return_codes = ();
my @agglist = ();
#
# Cull out any aggregates with no nodes.
#
foreach my $agg ($instance->AggregateList()) {
push(@agglist, $agg)
if ($agg->physnode_count() || $agg->virtnode_count());
}
if (ParRun({"maxwaittime" => 99999,
"maxchildren" => scalar(@agglist)},
\@return_codes, $coderef, @agglist)) {
$errmsg = "Internal error calling Lockdown()";
goto bad;
}
#
# Check the exit codes.
#
foreach my $agg (@agglist) {
my $code = shift(@return_codes);
$agg->webtask()->Refresh();
if ($code) {
$errmsg = "Could not $action linktest on some slivers";
if ($agg->webtask()->output()) {
$errmsg .= ": " . $agg->webtask()->output();
$errcode = $agg->webtask()->exitcode();
}
goto bad;
}
if (!defined($webtask) && $agg->webtask()->results()) {
print $agg->webtask()->results();
}
}
if ($action eq "stop") {
$instance->SetStatus("ready");
$slice->UnLock();
exit(0);
}
#
# Okay, now we want to wait for linktest to finish on all the clusters
# so that we can change the status back to ready.
#
my $logfile = TBMakeLogname("linktest");
if (my $childpid = TBBackGround($logfile)) {
sleep(1);
my $status = 0;
my $foo = waitpid($childpid, &WNOHANG);
if ($foo) {
$status = $? >> 8;
}
# Unlock so user can stop linktest.
$slice->UnLock();
exit($status);
}
#
# Loop, asking each cluster for the linktest status.
#
my $tlimit = 3600;
my $errors = 0;
my %running = map { $_->aggregate_urn() => $_ } @agglist;
while ($tlimit > 0 && keys(%running)) {
foreach my $sliver (values(%running)) {
my $response = $sliver->RunLinktest("status");
if (!defined($response)) {
print STDERR "RPC Error calling linktest on $sliver\n";
next;
}
if ($response->code() != GENIRESPONSE_SUCCESS) {
if ($response->code() == GENIRESPONSE_SERVER_UNAVAILABLE ||
$response->code() == GENIRESPONSE_BUSY) {
next;
}
print STDERR "Could not get linktest status for sliver: ".
$response->output() . "\n";
delete($running{$sliver->aggregate_urn()});
# If the sliver was deleted during linktest, we do not
# consider it an error.
if ($response->code() != GENIRESPONSE_SEARCHFAILED) {
$errors++;
}
next;
}
my $blob = $response->value();
if ($blob->{'status'} eq "stopped") {
delete($running{$sliver->aggregate_urn()});
}
}
$tlimit -= 5;
sleep(5);
}
if ($tlimit <= 0) {
print STDERR "Linktest run timed out!\n";
# Lets generate email for now, still debugging.
$errors++;
}
$instance->SetStatus($old_status);
if ($errors) {
SENDMAIL($TBOPS,
"Error running linktest",
"Error running linktest on $instance.\n",
$TBOPS, undef, $logfile);
}
unlink($logfile);
exit(0);
bad:
$instance->SetStatus($old_status);
$slice->UnLock();
print STDERR $errmsg . "\n";
if (defined($webtask)) {
$webtask->output($errmsg);
$webtask->Exited($errcode);
}
exit(1);
}
#
# Update SSH keys.
#
sub DoUpdateKeys()
{
my $target_user;
my $errmsg;
my $errcode = 1;
if (@ARGV) {
my $uid = shift(@ARGV);
# If a target user, we are operating on that user, not the
# entire instance.
$target_user = User->Lookup($uid);
if (!defined($target_user)) {
fatal("no such target user $uid");
}
$target_user = GeniUser::LocalUser->Create($target_user);
}
my $slice = $instance->GetGeniSlice();
if (!defined($slice)) {
fatal("No slice for instance!");
}
# This returns in CM format.
my $sshkeys;
if ($instance->GetSSHKeys(\$sshkeys, $target_user) < 0 || !@{$sshkeys}) {
fatal("Could not get ssh keys for instance");
}
#
# The AM API uses a different ssh key structure.
#
my $users = [];
foreach my $user (@{$sshkeys}) {
my @tmp = map { $_->{'key'} } @{$user->{'keys'}};
push(@{$users},
{"urn" => $user->{'urn'},
"keys" => [ @tmp ] });
}
#
# Lock the slice in case it is doing something else, like taking
# a disk image.
#
if ($slice->Lock()) {
fatal("Slice is busy, cannot lock it");
}
#
# Create the webtask object, but AFTER locking the slice so we do
# not destroy one in use.
#
if (defined($webtask_id)) {
$webtask = WebTask->LookupOrCreate($instance->uuid(), $webtask_id);
# Convenient.
$webtask->AutoStore(1);
}
#
# And tell the backend clusters to do the update.
#
my $coderef = sub {
my ($sliver) = @_;
my $webtask = $sliver->webtask();
my $response = $sliver->UpdateKeys($users);
if (!defined($response)) {
print STDERR "RPC Error calling updatekeys on $sliver\n";
return -1;
}
if ($response->code() != GENIRESPONSE_SUCCESS) {
print STDERR "Could not update keys on sliver: ".
$response->output() . "\n";
$webtask->output($response->output());
$webtask->Exited($response->code());
return $response->code();
}
return 0;
};
my @return_codes = ();
my @agglist = ();
#
# Cull out any aggregates with no nodes.
#
foreach my $agg ($instance->AggregateList()) {
push(@agglist, $agg)
if ($agg->physnode_count() || $agg->virtnode_count());
}
if (ParRun({"maxwaittime" => 99999,
"maxchildren" => scalar(@agglist)},
\@return_codes, $coderef, @agglist)) {
$errmsg = "Internal error calling UpdateKeys()";
goto bad;
}
#
# Check the exit codes.
#
foreach my $agg (@agglist) {
my $code = shift(@return_codes);
$agg->webtask()->Refresh();
if ($code) {
$errmsg = "Could not update keys on some slivers";
if ($agg->webtask()->output()) {
$errmsg .= ": " . $agg->webtask()->output();
$errcode = $agg->webtask()->output();
}
goto bad;
}
}
$slice->UnLock();
exit(0);
bad:
$slice->UnLock();
print STDERR $errmsg . "\n";
if (defined($webtask)) {
$webtask->output($errmsg);
$webtask->Exited($errcode);
}
exit($errcode);
}
#
# Get utilization info from the the clusters.
#
sub DoUtilization()
{
my $errmsg;
my $errcode = 1;
#
# Create the webtask object; the web interface gave us an anonymous
# webtask, so we can use it before lock.
#
if (defined($webtask_id)) {
$webtask = WebTask->Lookup($webtask_id);
fatal("Could not lookup webtask object")
if (!defined($webtask));
# Convenient.
$webtask->AutoStore(1);
}
#
# Get the nodeid to client id mapping
#
my %client_ids = ();
foreach my $obj ($instance->AggregateList()) {
my $manifest = GeniXML::Parse($obj->manifest());
if (! defined($manifest)) {
fatal("Could not parse manifest");
}
$client_ids{$obj->aggregate_urn()} = {};
my @nodes = GeniXML::FindNodes("n:node", $manifest)->get_nodelist();
foreach my $node (@nodes) {
my $client_id = GeniXML::GetVirtualId($node);
my $node_id = GeniXML::GetVnodeId($node);
$client_ids{$obj->aggregate_urn()}->{$node_id} = $client_id;
}
}
#
# And tell the backend clusters to do the update.
#
my $coderef = sub {
my ($sliver) = @_;
my $webtask = $sliver->webtask();
my $response = $sliver->Utilization();
if (!defined($response)) {
print STDERR "RPC Error calling utilization on $sliver\n";
return -1;
}
if ($response->code() != GENIRESPONSE_SUCCESS) {
print STDERR "Could not get utilization for sliver: ".
$response->output() . "\n";
$webtask->output($response->output());
$webtask->Exited($response->code());
return $response->code();
}
$webtask->results($response->value());
return 0;
};
my @return_codes = ();
my @agglist = ();
#
# Cull out any aggregates with no nodes.
#
foreach my $agg ($instance->AggregateList()) {
push(@agglist, $agg)
if ($agg->physnode_count() || $agg->virtnode_count());
}
if (ParRun({"maxwaittime" => 99999,
"maxchildren" => scalar(@agglist)},
\@return_codes, $coderef, @agglist)) {
$errmsg = "Internal error calling UpdateKeys()";
goto bad;
}
#
# Check the exit codes.
#
foreach my $agg (@agglist) {
my $code = shift(@return_codes);
$agg->webtask()->Refresh();
if ($code) {
$errmsg = "Could not get utilization from some slivers";
if ($agg->webtask()->output()) {
$errmsg .= ": " . $agg->webtask()->output();
$errcode = $agg->webtask()->output();
}
goto bad;
}
#
# Annotate the result with some extra info for the web UI.
#
my $blob = $agg->webtask()->results();
foreach my $node_id (keys(%{ $blob->{'details'}->{'nodes'} })) {
$blob->{'details'}->{'nodes'}->{$node_id}->{"client_id"} =
$client_ids{$agg->aggregate_urn()}->{$node_id};
}
if ($debug) {
print Dumper($agg->webtask()->results());
}
$agg->webtask()->results($blob);
$agg->webtask()->Store();
}
exit(0);
bad:
print STDERR $errmsg . "\n";
if (defined($webtask)) {
$webtask->output($errmsg);
$webtask->Exited($errcode);
}
exit($errcode);
}
#
# Grab the openstack utilization file and stick it into the DB.
#
sub DoOpenstack()
{
my $errmsg;
my $errcode = 1;
#
# Create the webtask object; the web interface gave us an anonymous
# webtask, so we can use it before lock.
#
if (defined($webtask_id)) {
$webtask = WebTask->Lookup($webtask_id);
fatal("Could not lookup webtask object")
if (!defined($webtask));
# Convenient.
$webtask->AutoStore(1);
}
#
# Need to look inside the rspec to find the name of the controller node.
#
if (scalar($instance->AggregateList()) > 1) {
$errmsg = "Too many aggregates, ".
"is this really an Openstack experiment?";
goto bad;
}
my ($aggregate) = $instance->AggregateList();
if (!defined($aggregate->manifest())) {
$errmsg = "Mo manifest for experiment";
goto bad;
}
my $manifest = GeniXML::Parse($aggregate->manifest());
if (! defined($manifest)) {
$errmsg = "Could not parse manifest for $aggregate";
goto bad;
}
#
# We have to look inside the parameters to find the controller node.
#
my $NS = "http://www.protogeni.net/resources/rspec/ext/johnsond/1";
my $controller;
foreach my $param (GeniXML::FindNodesNS("n:profile_parameters/n:parameter",
$manifest, $NS)->get_nodelist()) {
my $value = $param->textContent();
if ($value =~ /^([^=]+)="(.+)"$/) {
if (lc($1) eq "controller") {
$controller = $2;
print "Controller = $controller\n"
if ($debug);
last;
}
}
}
if (!defined($controller)) {
$errmsg = "Could not find the CONTROLLER parameter";
goto bad;
}
#
# So now we can ask the aggregate to grab the file from the proper
# node in the topology; we do not want the cluster to have to figure
# that part out. Hmm, maybe we should tell the cluster what file too?
#
my $response = $aggregate->OpenstackData($controller);
if (!defined($response)) {
$errmsg = "RPC Error calling GetOpenstackStats on $aggregate\n";
$errcode = -1;
goto bad;
}
if ($response->code() != GENIRESPONSE_SUCCESS) {
$errmsg = "Could not get openstack json file for sliver: ".
$response->output();
$errcode = $response->code();
goto bad;
}
if ($debug) {
print $response->value() . "\n";
}
$instance->Update({"openstack_utilization" => $response->value()});
exit(0);
bad:
print STDERR $errmsg . "\n";
if (defined($webtask)) {
$webtask->output($errmsg);
$webtask->Exited($errcode);
}
exit($errcode);
}
#
# Get idledata info from the clusters.
#
sub DoIdleData()
{
my $errmsg;
my $errcode = 1;
#
# Create the webtask object; the web interface gave us an anonymous
# webtask, so we can use it before lock.
#
if (defined($webtask_id)) {
$webtask = WebTask->Lookup($webtask_id);
fatal("Could not lookup webtask object")
if (!defined($webtask));
# Convenient.
$webtask->AutoStore(1);
}
#
# And ask the backend clusters for the data.
#
my $coderef = sub {
my ($sliver) = @_;
my $webtask = $sliver->webtask();
my $response = $sliver->IdleData();
if (!defined($response)) {
print STDERR "RPC Error calling idledata on $sliver\n";
return -1;
}
if ($response->code() != GENIRESPONSE_SUCCESS) {
print STDERR "Could not get idledata for sliver: ".
$response->output() . "\n";
$webtask->output($response->output());
$webtask->Exited($response->code());
return $response->code();
}
if ($debug) {
print Dumper($response->value());
}
$webtask->idledata($response->value());
return 0;
};
my @return_codes = ();
my @agglist = ();
#
# Cull out any aggregates with no nodes.
#
foreach my $agg ($instance->AggregateList()) {
push(@agglist, $agg)
if ($agg->physnode_count() || $agg->virtnode_count());
}
if (ParRun({"maxwaittime" => 99999,
"maxchildren" => scalar(@agglist)},
\@return_codes, $coderef, @agglist)) {
$errmsg = "Internal error calling IdleData()";
goto bad;
}
#
# Check the exit codes.
#
foreach my $agg (@agglist) {
my $code = shift(@return_codes);
$agg->webtask()->Refresh();
if ($code) {
$errmsg = "Could not get idledata from some slivers";
if ($agg->webtask()->output()) {
$errmsg .= ": " . $agg->webtask()->output();
$errcode = $agg->webtask()->output();
}
goto bad;
}
if ($debug) {
print Dumper($agg->webtask()->idledata());
}
}
exit(0);
bad:
print STDERR $errmsg . "\n";
if (defined($webtask)) {
$webtask->output($errmsg);
$webtask->Exited($errcode);
}
exit($errcode);
}
#
# Schedule slice to terminate. This is an admin action. The lockdown bit
# is cleared, and the lockout bit is set (no more free extensions). The
# expiration is set, and we send email.
#
sub DoSchedTerminate()
{
my $errcode = 1;
my $errmsg;
my $days;
my $reason;
my $creator = $instance->GetGeniUser();
my $slice = $instance->GetGeniSlice();
my $name = $instance->name();
my $url = $instance->webURL();
my $clusters = join(",", map { $_->domain() }
$instance->AggregateList());
my $pcount = $instance->physnode_count();
my $expires_time = str2time($slice->expires());
my $created_time = str2time($instance->created());
my $extensions = $instance->Brand()->ExtensionsEmailAddress();
usage()
if (!@ARGV);
$days = shift(@ARGV);
if (@ARGV == 2) {
my $arg = shift(@ARGV);
if ($arg eq "-m") {
$reason = shift(@ARGV);
}
else {
usage();
}
}
elsif (@ARGV == 1) {
my $filename = shift(@ARGV);
if (! -e $filename) {
fatal("$filename does not exist");
}
open(MSG, $filename) or
fatal("Could not open $filename");
$reason = "";
while () {
$reason .= $_;
}
close(MSG);
}
#
# Create the webtask object; the web interface gave us an anonymous
# webtask, so we can use it before lock.
#
if (defined($webtask_id)) {
$webtask = WebTask->Lookup($webtask_id);
fatal("Could not lookup webtask object")
if (!defined($webtask));
# Convenient.
$webtask->AutoStore(1);
}
#
# Lock the slice in case it is doing something else, like taking
# a disk image.
#
if ($slice->Lock()) {
$errcode = GENIRESPONSE_BUSY;
$errmsg ="Experiment is busy, cannot lock it. Try again later.";
if (defined($webtask)) {
$webtask->output($errmsg);
$webtask->Exited($errcode);
}
print STDERR "$errmsg\n";
exit($errcode);
}
if (defined($reason) &&
!TBcheck_dbslot($reason, "default", "fulltext",
TBDB_CHECKDBSLOT_WARN|TBDB_CHECKDBSLOT_ERROR)) {
$errmsg = "Illegal characters in your reason";
$errcode = 1;
goto bad;
}
if (!TBcheck_dbslot($days, "default", "int",
TBDB_CHECKDBSLOT_WARN|TBDB_CHECKDBSLOT_ERROR)) {
$errmsg = "Illegal integer for length";
$errcode = 1;
goto bad;
}
# No free time.
$instance->Update({"extension_adminonly" => 1});
#
# Need to set the new expiration before we clear the lockdown bit,
# else it might get terminated at the cluster. But, if the
# expiration is already beyond the desired termination point,
# leave it alone, all we need to do is set our local expiration,
# the daemon will take care of it. The reason we do this, is cause
# it is unclear if setting the expiration backwards (at the CM) is
# a legal thing to do (although our CM actually permits this).
#
if ($expires_time < time() + ($days * 3600 * 24)) {
my $seconds = (time() + ($days * 3600 * 24)) - $expires_time;
if ($errcode = ExtendInternal($slice, $seconds, 1, \$errmsg)) {
goto bad;
}
}
else {
$slice->SetExpiration(time() + ($days * 3600 * 24));
}
# Now we can clear this.
if ($instance->admin_lockdown()) {
if (DoLockdownInternal("clear", "all")) {
SENDMAIL($TBOPS,
"Failed to clear lock down on APT Instance",
"Failed to clear lock down $instance\n".
$instance->webURL() . "\n",
$TBOPS);
$errmsg = "Failed to clear lockdown";
$errcode = -1;
goto bad;
}
}
my $expires = POSIX::strftime("20%y-%m-%d %H:%M:%S %Z",
localtime(str2time($slice->expires())));
my $created = POSIX::strftime("20%y-%m-%d %H:%M:%S %Z",
localtime(str2time($instance->created())));
my $message = "The site administrator has scheduled this experiment\n".
"to terminate in $days days.";
my $subject = "Experiment Termination Warning: $name";
#
# New extension mechanism
#
my $extensionargs = {
"action" => "request",
"wanted" => $days,
"granted" => $days,
"admin" => 1,
"uid" => $this_user->uid(),
"uid_idx" => $this_user->uid_idx(),
"message" => $message,
};
if (defined($reason)) {
$extensionargs->{"reason"} = $reason;
}
if (!defined(APT_Instance::ExtensionInfo->Create($instance,
$extensionargs))) {
print STDERR "Could not create extension info object\n";
}
$instance->Brand()->SendEmail($creator->email(), $subject,
$message . "\n\n" .
(defined($reason) ? $reason . "\n\n" : "") .
"Your experiment was started on $created\n".
"Your experiment will now expire at $expires\n".
"You are using $pcount physical nodes.\n".
"It is running on $clusters\n\n".
"$url\n",
"$extensions",
"BCC: $extensions");
if (defined($webtask)) {
$webtask->Exited(0);
}
$slice->UnLock();
exit(0);
bad:
print STDERR $errmsg . "\n";
if (defined($webtask)) {
$webtask->output($errmsg);
$webtask->Exited($errcode);
}
exit($errcode);
}
#
# Write instance credentials to files.
#
sub WriteCredentials()
{
usage()
if (!@ARGV);
my $directory = shift(@ARGV);
fatal("$directory does not exist")
if (! -e $directory);
fatal("$directory is not a directory")
if (! -d $directory);
return $instance->WriteCredentials($directory);
}
sub fatal($)
{
my ($mesg) = @_;
if (defined($webtask)) {
$webtask->output($mesg);
$webtask->code(-1);
}
print STDERR "*** $0:\n".
" $mesg\n";
# Exit with negative status so web interface treats it as system error.
exit(-1);
}
sub UserError($)
{
my ($mesg) = @_;
if (defined($webtask)) {
$webtask->output($mesg);
$webtask->code(1);
}
print STDERR "*** $0:\n".
" $mesg\n";
exit(1);
}
sub escapeshellarg($)
{
my ($str) = @_;
$str =~ s/[^[:alnum:]]/\\$&/g;
return $str;
}