#!/usr/bin/perl -w # # Copyright (c) 2000-2016 University of Utah and the Flux Group. # # {{{EMULAB-LICENSE # # This file is part of the Emulab network testbed software. # # This file is free software: you can redistribute it and/or modify it # under the terms of the GNU Affero General Public License as published by # the Free Software Foundation, either version 3 of the License, or (at # your option) any later version. # # This file is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this file. If not, see . # # }}} # use English; use strict; use Getopt::Std; use XML::Simple; use Data::Dumper; use CGI; use POSIX ":sys_wait_h"; use POSIX qw(setsid strftime ceil floor); use Date::Parse; # # Back-end script to manage APT profiles. # sub usage() { print("Usage: manage_instance snapshot instance ". "[-n node_id] [-i imagename] [-u node|all]\n"); print("Usage: manage_instance consoleurl instance node\n"); print("Usage: manage_instance terminate instance\n"); print("Usage: manage_instance refresh instance\n"); print("Usage: manage_instance reboot instance node_id ...\n"); print("Usage: manage_instance reload instance node_id ...\n"); print("Usage: manage_instance deletenodes instance node_id ...\n"); print("Usage: manage_instance monitor instance\n"); print("Usage: manage_instance lockdown instance set|clear user|admin\n"); print("Usage: manage_instance panic instance set|clear\n"); print("Usage: manage_instance linktest instance [-k | level]\n"); print("Usage: manage_instance writecreds instance directory\n"); print("Usage: manage_instance updatekeys instance [uid] \n"); print("Usage: manage_instance extend instance [-m message] days [filename]\n"); print("Usage: manage_instance denyextension instance [-m message] [filename]\n"); print("Usage: manage_instance moreinfo instance [-m message] [filename]\n"); print("Usage: manage_instance extendold instance [-f] seconds\n"); print("Usage: manage_instance utilization instance\n"); print("Usage: manage_instance schedterminate instance [-m message] days [filename]\n"); print("Usage: manage_instance idledata instance\n"); print("Usage: manage_instance openstackstats instance\n"); exit(-1); } my $optlist = "dt:s"; my $debug = 0; my $silent = 0; my $webtask_id; my $webtask; my $this_user; my $geniuser; # # Configure variables # my $TB = "@prefix@"; my $TBOPS = "@TBOPSEMAIL@"; my $QUICKVM = "$TB/sbin/protogeni/quickvm"; # # Untaint the path # $ENV{'PATH'} = "$TB/bin:$TB/sbin:/bin:/usr/bin:/usr/bin:/usr/sbin"; delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'}; # # Turn off line buffering on output # $| = 1; # # Load the Testbed support stuff. # use lib "@prefix@/lib"; use EmulabConstants; use emdb; use emutil; use libEmulab; use libtestbed; use User; use Project; use APT_Profile; use APT_Instance; use APT_Geni; use GeniXML; use GeniHRN; use Genixmlrpc; use GeniResponse; use GeniSlice; use GeniImage; use GeniUser; use WebTask; use EmulabFeatures; # Protos sub fatal($); sub UserError($); sub DoSnapshot(); sub DoConsole(); sub DoTerminate(); sub DoSchedTerminate(); sub DoExtend(); sub DoExtendOld(); sub DoDenyOrMoreInfo($); sub DoRefresh(); sub DoReboot(); sub DoReload(); sub DoLockdown(); sub DoPanic(); sub DoManifests(); sub DoLinktest(); sub DoUpdateKeys(); sub DoDeleteNodes(); sub DoUtilization(); sub DoIdleData(); sub DoOpenstack(); sub WriteCredentials(); sub StartMonitor(); sub StartMonitorInternal(;$@); sub DoImageTrackerStuff($$$$$$); sub DenyExtensionInternal($); sub ExtendInternal($$$$); # # Parse command arguments. Once we return from getopts, all that should be # left are the required arguments. # my %options = (); if (! getopts($optlist, \%options)) { usage(); } if (defined($options{"t"})) { $webtask_id = $options{"t"}; } if (defined($options{"d"})) { $debug++; } if (defined($options{"s"})) { $silent = 1; } if (@ARGV < 2) { usage(); } my $action = shift(@ARGV); my $uuid = shift(@ARGV); my $instance = APT_Instance->Lookup($uuid); if (!defined($instance)) { $instance = APT_Instance->LookupBySlice($uuid); } if (!defined($instance)) { fatal("No such instance $uuid"); } if (getpwuid($UID) eq "nobody") { $this_user = User->ImpliedUser(); } else { $this_user = User->ThisUser(); } # If a guest user, we will not have an actual user, which is okay. if (defined($this_user)) { $geniuser = GeniUser->CreateFromLocal($this_user); } if ($action eq "snapshot") { DoSnapshot(); } if ($action eq "extend") { DoExtend(); } if ($action eq "extendold") { DoExtendOld(); } elsif ($action eq "denyextension") { DoDenyOrMoreInfo("deny") } elsif ($action eq "moreinfo") { DoDenyOrMoreInfo("info") } elsif ($action eq "consoleurl") { DoConsole() } elsif ($action eq "terminate") { DoTerminate() } elsif ($action eq "schedterminate") { DoSchedTerminate() } elsif ($action eq "refresh") { DoRefresh() } elsif ($action eq "reboot") { DoReboot() } elsif ($action eq "reload") { DoReload() } elsif ($action eq "monitor") { StartMonitor() } elsif ($action eq "lockdown") { DoLockdown() } elsif ($action eq "panic") { DoPanic() } elsif ($action eq "linktest") { DoLinktest() } elsif ($action eq "updatekeys") { DoUpdateKeys() } elsif ($action eq "writecreds") { WriteCredentials() } elsif ($action eq "getmanifests") { DoManifests() } elsif ($action eq "deletenodes") { DoDeleteNodes() } elsif ($action eq "utilization") { DoUtilization() } elsif ($action eq "idledata") { DoIdleData() } elsif ($action eq "openstackstats") { DoOpenstack() } else { usage(); } exit(0); # # Take a snapshot. Implies a single node instance, for now. # sub DoSnapshot() { my $errmsg; my $logfile; my $errcode = -1; my $needunlock = 0; my $old_status = $instance->status(); my $node_id; my $imagename; my $cloneprofile; my $update_profile; my $copyback_uuid; my $copyback_urn; my $update_prepare = 0; my $doversions = 0; my $usetracker = 0; my $optlist = "n:i:u:Uc:"; my %options = (); if (! getopts($optlist, \%options)) { usage(); } if (defined($options{"n"})) { $node_id = $options{"n"}; } if (defined($options{"i"})) { $imagename = $options{"i"}; } if (defined($options{"c"})) { $cloneprofile = $options{"c"}; } if (defined($options{"u"})) { $update_profile = $options{"u"}; if ($update_profile !~ /^(node|all)$/) { usage(); } } if (defined($options{"U"})) { $update_prepare = 1; } if (defined($cloneprofile) && defined($update_profile)) { fatal("Not allowed to update profile when cloning a profile"); } if (defined($cloneprofile) && !defined($imagename)) { fatal("Must supply image name when cloning a profile"); } if ($old_status ne "ready") { fatal("Instance must be in the ready state to take a snapshot"); } my $slice = $instance->GetGeniSlice(); if (!defined($slice)) { fatal("No slice for quick VM: $uuid"); } # # Might be a clone (manage_profile). # my $sliver_urn; my $aggregate; my $node; my $profile; if (defined($cloneprofile)) { $profile = APT_Profile->Lookup($cloneprofile); } else { $profile = APT_Profile->Lookup($instance->profile_id()); } if (!defined($profile)) { fatal("Could not lookup profile for " . (defined($cloneprofile) ? "cloning" : "snapshot")); } my $project = Project->Lookup($profile->pid_idx()); if (!defined($project)) { fatal("Could not lookup project for $profile"); } # # Sanity checks. # my @aggs = $instance->AggregateList(); if (! @aggs) { fatal("No slivers for instance!"); } if (!defined($node_id)) { # We snapshot the one node in the instance. if (@aggs != 1) { fatal("Too many aggregates (> 1) to snapshot"); } my ($agg) = @aggs; my $manifest = GeniXML::Parse($agg->manifest()); if (! defined($manifest)) { fatal("Could not parse manifest for $agg"); } my @nodes = GeniXML::FindNodes("n:node", $manifest)->get_nodelist(); if (@nodes != 1) { fatal("Too many nodes (> 1) to snapshot"); } ($node) = @nodes; $sliver_urn = GeniXML::GetSliverId($node); $node_id = GeniXML::GetVirtualId($node); $aggregate = $agg; # Profile Snapshot, always use the profile name. Clone passes in name. if (!defined($imagename)) { $imagename = $profile->name(); } } else { my $nodecount = 0; # Find the node in its manifest. foreach my $agg (@aggs) { my $manifest = GeniXML::Parse($agg->manifest()); if (! defined($manifest)) { fatal("Could not parse manifest for $agg"); } foreach my $ref (GeniXML::FindNodes("n:node", $manifest)->get_nodelist()) { $nodecount++; my $client_id = GeniXML::GetVirtualId($ref); my $manager_urn = GetManagerId($ref); my $urn = GeniXML::GetSliverId($ref); # No sliver urn or a different aggregate. next if (! (defined($urn) && defined($manager_urn) && $manager_urn eq $agg->aggregate_urn())); if ($node_id eq $client_id) { $node = $ref; $sliver_urn = $urn; $aggregate = $agg; last; } } } if (!defined($sliver_urn)) { fatal("Could not find node '$node_id' in manifest"); } # # So, we want Profile snapshot above (of a single node profile) and # Node snapshot in a single node profile to behave the same wrt the # image name, so look at the nodecount to see if need to append the # nodeid to the imagename. # if (!defined($imagename)) { $imagename = $profile->name(); if ($nodecount > 1) { $imagename .= "." . $node_id; } } } # # Make sure a valid imagename. This a local test of course, but this # only works on IG aggregates anyway. # if (! TBcheck_dbslot($imagename, "images", "imagename", TBDB_CHECKDBSLOT_ERROR)) { $imagename = $profile->profileid(); $imagename .= "." . $node_id if (defined($node_id)); } # # Instruct the remote cluster to copy the image back to its origin, # but we need to ask the IMS for uuid of the image that is running, # so we can tell the cluster, which then tells the origin cluster. # We also need to know what the new URN of the image will be, for # updating the profile. # if (GetSiteVar("protogeni/use_imagetracker") && EmulabFeatures->FeatureEnabled("APT_UseImageTracker", $this_user, $project)) { $usetracker = 1; # # When cloning, we use the URN returned by the cluster; it is # the origin of the new image. # if (!defined($cloneprofile)) { my $rval = DoImageTrackerStuff($aggregate, $node, $project, \$copyback_uuid, \$copyback_urn, \$errmsg); if ($rval) { if ($rval < 0) { fatal($errmsg); } else { $errcode = 1; goto uerror; } } } } if (0) { fatal("$copyback_uuid, $copyback_urn\n"); } # # We are not going to allow this if the instance is on a different # cluster then where the image was originally created, since otherwise # the image provenancewill look like spaghetti. # if (defined($update_profile)) { my $diskref = GeniXML::GetDiskImage($node); if (defined($diskref)) { my $authority = $aggregate->GetGeniAuthority(); my $image_url = GeniXML::GetText("url", $diskref); if (defined($image_url) && !$usetracker) { require URI; # Get the hostname for the image URL. my $uri = URI->new($image_url); if (!defined($uri)) { fatal("Could not parse $image_url"); } my $image_host = $uri->host(); # Get the hostname for the authority. $uri = URI->new($authority->url()); if (!defined($uri)) { fatal("Could not parse authority URL"); } my $authority_host = $uri->host(); # Compare domains. $image_host =~ s/^([^.]+\.)//; $authority_host =~ s/^([^.]+\.)//; if ($image_host ne $authority_host) { $errmsg = "Not allowed to take a snapshot on this cluster"; $errcode = 1; goto uerror; } } } # Do this here to avoid output to logfile. $doversions = EmulabFeatures->FeatureEnabled("APT_ProfileVersions", $this_user, $project); } if ($slice->Lock()) { $errmsg = "Experiment is busy, please try again later."; $errcode = 1; goto uerror; } $needunlock = 1; # # Create the webtask object, but AFTER locking the slice so we do # not destroy one in use. # if (defined($webtask_id)) { $webtask = WebTask->LookupOrCreate($instance->uuid(), $webtask_id); # Convenient. $webtask->AutoStore(1); # This is convenience for the web server. if (defined($webtask)) { $webtask->aggregate_urn($aggregate->aggregate_urn()); $webtask->client_id($node_id); } } $instance->SetStatus("imaging"); $aggregate->SetStatus("imaging"); # # This returns pretty fast, and then the imaging takes place in # the background at the aggregate. # my $response = $aggregate->CreateImage($sliver_urn, $imagename, $update_prepare, $copyback_uuid); if (!defined($response)) { $errmsg = "Internal error creating image"; $instance->SetStatus($old_status); $aggregate->SetStatus($old_status); goto uerror; } if ($response->code() != GENIRESPONSE_SUCCESS) { $errmsg = "Could not create image: " . $response->output() . "\n"; $errcode = 1 if ($response->code() == GENIRESPONSE_BUSY || $response->code() == GENIRESPONSE_SERVER_UNAVAILABLE || $response->code() == GENIRESPONSE_FORBIDDEN); $instance->SetStatus($old_status); $aggregate->SetStatus($old_status); goto uerror; } my ($image_urn, $image_url, $version_urn, $version_url) = @{ $response->value() }; if (!defined($version_urn)) { $version_urn = $image_urn; $version_url = $image_url } if (defined($webtask)) { $webtask->image_urn($version_urn); $webtask->image_url($version_url); my $image_name; if ($usetracker) { # DoImageTrackerStuff determined that we use whatever the cluster # tells us, cause it is the home of the image. if (!defined($copyback_urn)) { $image_name = $version_urn; } else { $image_name = $copyback_urn; } } else { $image_name = $version_url; } $webtask->image_name($image_name); # We tell the web interface that the image has to be copied # back, if (defined($copyback_uuid)) { $webtask->copyback_uuid($copyback_uuid); } } else { print "$image_urn,$image_url\n"; } # # Exit and leave child to poll. # if (! $debug) { $logfile = TBMakeLogname("snapshot"); if (my $childpid = TBBackGround($logfile)) { # Parent exits normally, web interface watches. exit(0); } # Let parent exit; sleep(2); } # Bind the process id. This is important when the caller is # manage_profile, doing a clone. $webtask->SetProcessID($PID) if (defined($webtask)); # # Poll for a reasonable amount of time. # my $seconds = 1500; my $interval = 15; my $ready = 0; my $sliver_ready = 0; my $failed = 0; while ($seconds > 0) { sleep($interval); $seconds -= $interval; my $response = $aggregate->SliceStatus(); if ($response->code() != GENIRESPONSE_SUCCESS && $response->code() != GENIRESPONSE_RPCERROR && $response->code() != GENIRESPONSE_SERVER_UNAVAILABLE && $response->code() != GENIRESPONSE_BUSY) { $errmsg = "Sliverstatus failed: ". $response->output() . "\n"; $failed = 1; last; } next if ($response->code() == GENIRESPONSE_BUSY || $response->code() == GENIRESPONSE_SERVER_UNAVAILABLE || $response->code() == GENIRESPONSE_RPCERROR); my $blob = $response->value(); # This is the per-aggregate status, we always set this for web UI. $aggregate->UpdateWebStatus($blob->{'details'}); if ($blob->{'status'} eq "failed") { $failed = 1; last; } elsif ($blob->{'status'} eq "ready") { $sliver_ready = 1; } # # We are watching for the image status to report ready or failed. # $response = $aggregate->ImageInfo($image_urn); if ($response->code() != GENIRESPONSE_SUCCESS && $response->code() != GENIRESPONSE_RPCERROR && $response->code() != GENIRESPONSE_SERVER_UNAVAILABLE && $response->code() != GENIRESPONSE_BUSY) { $errmsg = "Imageinfo failed: ". $response->output() . "\n"; $failed = 1; last; } next if ($response->code() == GENIRESPONSE_BUSY || $response->code() == GENIRESPONSE_SERVER_UNAVAILABLE || $response->code() == GENIRESPONSE_RPCERROR); my $imageblob = $response->value(); if (defined($webtask)) { my %blobcopy = %{ $imageblob }; # # If the image is ready, but needs to be copied back to # its origin, hold of ready till later. We will wait for # the copyback to finish, see below. # if ($imageblob->{'status'} eq "ready" && defined($copyback_uuid)) { $blobcopy{'status'} = "copying"; } # This is also being updated by the event system. $instance->UpdateImageStatus(\%blobcopy); } if ($imageblob->{'status'} eq "ready") { $ready = 1; last; } elsif ($imageblob->{'status'} eq "failed") { $failed = 1; last; } } # Cause of image status events. $webtask->Refresh() if (defined($webtask)); if ($failed) { $errmsg = "Imaging failed" if (!defined($errmsg)); goto bad; } elsif (!$ready) { $errmsg = "Imaging timed out"; $errcode = -2; goto bad; } elsif (defined($update_profile)) { # # If successful, we create a new version of the profile and # update the rspec to reflect the new image version. Note # that we expect the CM is doing image versioning, so do not # bother to check if the image version is actually new. # if ($doversions) { $profile = $profile->NewVersion($this_user); if (!defined($profile)) { print STDERR "Could not create new profile version\n"; $webtask->Exited(70) if (defined($webtask)); exit(1); } } # DoImageTrackerStuff determined that we use whatever the cluster # tells us, cause it is the home of the image. $copyback_urn = $version_urn if ($usetracker && !defined($copyback_urn)); $profile->UpdateDiskImage($node_id, (defined($copyback_urn) ? $copyback_urn : $version_url), ($update_profile eq "all" ? 1 : 0)); } $instance->SetStatus("ready"); $aggregate->SetStatus("ready"); # # If there is a copyback_uuid, we want to wait for that to finish. # if (defined($copyback_uuid)) { # # We know the copyback is done when the IMS has the info. # my $copied = 0; $seconds = 1000; while ($seconds > 0) { sleep($interval); $seconds -= $interval; # # It would clearly be more more efficient to just look in # the IMS database. # Genixmlrpc->SetContext(APT_Geni::GeniContext()); my $blob = GeniImage::GetImageData($copyback_urn, \$errmsg); Genixmlrpc->SetContext(undef); # We get back undefined if the image is not posted yet. if (defined($blob)) { $copied = 1; last; } sleep($interval); } # Tell the web interface. if (!$copied) { $errmsg = "Failed to copy image back to its origin cluster"; $errcode = 1; goto bad; } elsif (defined($webtask)) { $webtask->image_status("ready"); } } # We garbage collect these later, so anyone waiting has a chance # to see the exit status $webtask->Exited(0) if (defined($webtask)); $slice->UnLock(); if (defined($logfile) && -s $logfile) { SENDMAIL($TBOPS, "Instance Snapshot Complete", "Finished taking snapshot of $instance.\n", $TBOPS, undef, $logfile); unlink($logfile); } if (!$sliver_ready) { # # Image is ready, but sliver is not. Start a monitor so that # web interface is updated. # StartMonitorInternal(); } exit(0); bad: if (!$sliver_ready) { # # Image is ready, but sliver is not. Start a monitor so that # web interface is updated. # StartMonitorInternal(); } $instance->SetStatus("ready"); $aggregate->SetStatus("ready"); if (defined($logfile)) { SENDMAIL($TBOPS, "Snapshot failed", "Error taking snapshot of $instance:\n\n". "$errmsg\n", $TBOPS, undef, $logfile); unlink($logfile); } uerror: print STDERR "$errmsg\n"; if (defined($errmsg) && defined($webtask)) { $webtask->Exited($errcode); $webtask->output($errmsg); } $slice->UnLock() if ($needunlock); exit($errcode); } sub DoImageTrackerStuff($$$$$$) { my ($aggregate, $node, $project, $puuid, $purn, $perrmsg) = @_; my $node_id = GeniXML::GetVirtualId($node); my $errmsg; # # If we do not have a diskinfo section, we will use the URN we get back # from the cluster (it is a snapshot of the default image). # my $diskinfo = GeniXML::GetDiskImage($node); return 0 if (!defined($diskinfo)); # # This one needs more thought, it might be a URL. # my $image_token = GeniXML::GetText("name", $diskinfo); if (!defined($image_token)) { $image_token = GeniXML::GetText("url", $diskinfo); return 0 if (!defined($image_token)); } if (GeniHRN::IsValid($image_token)) { my ($auth,$ospid) = GeniHRN::ParseImage($image_token); if (!defined($ospid)) { $$perrmsg = "Invalid image urn: $image_token"; return 1; } } Genixmlrpc->SetContext(APT_Geni::GeniContext()); my $blob = GeniImage::GetImageData($image_token, \$errmsg); Genixmlrpc->SetContext(undef); if (!defined($blob)) { if (GeniHRN::IsValid($image_token)) { # # See if this is for a system image (emulab-ops). If it is, # and the domain is not the MS, then retry with a MS URN. # # This is sorta temporary; at some point there will not be any # profiles using the URNs that are not in the image tracker. # Of course a user is free to set the URN to anything the want, # which is why I expect this code to be here for a while. # my $urn; my $hrn = GeniHRN->new($image_token); my ($auth,$ospid,$os,$vers) = $hrn->ParseImage(); if ($ospid eq TBOPSPID() && $auth ne "emulab.net") { $urn = GeniHRN::GenerateImage("emulab.net", TBOPSPID(), $os, $vers); Genixmlrpc->SetContext(APT_Geni::GeniContext()); $blob = GeniImage::GetImageData($urn, \$errmsg); Genixmlrpc->SetContext(undef); } } if (!defined($blob)) { $$perrmsg = "Could not get info from the image server for ". "$image_token:\n" . $errmsg; return 1; } } # # System Image? We use the URN we get back from CreateSliver(). # The cluster will be the origin for the new image. # return 0 if ($blob->{'issystem'}); my $image_urn = $blob->{'urn'}; my $copyback_uuid = $blob->{'version_uuid'}; my $copyback_urn = $image_urn; my $hrn = GeniHRN->Parse($image_urn); my (undef,$ospid,$os,$vers) = $hrn->ParseImage(); # # What happens if the user is doing a snapshot on the cluster where # the image lives? The copyback (import) makes no sense in that case, # but what if its the same cluster but different projects? In this case # we want a standard image clone, and we use whatever URN the cluster # hands back to us. # # Aside; should we allow snapshots (in the web ui) across projects? # if (lc($hrn->domain()) eq lc($aggregate->domain())) { my $projhrn = GeniHRN->Parse($blob->{'project_urn'}); if (!defined($projhrn)) { $$perrmsg = "Could not parse " . $blob->{'project_urn'} . "\n"; return -1; } if (lc($projhrn->subauth()) eq lc($project->pid())) { # We use the URN we get back from CreateSliver(). return 0; } # Ditto return 0; } # # If we are going to update the profile, we need to know what to # change the image urn to, and that depends on what version the # image is currently at, AT THE ORIGIN CLUSTER. The urn we get back # from the snapshotting cluster is not what we care about, we need # a urn for the origin cluster. But that depends on what version the # origin cluster is at (the highest numbered version). But if we are # doing a snapshot of an earlier version, we cannot generate the # version here, we have to ask what it will be. # if ($blob->{'isversioned'}) { $copyback_urn = GeniHRN::GenerateImage($hrn->authority(), $ospid, $os, $blob->{'maxversion'} + 1); } $$puuid = $copyback_uuid; $$purn = $copyback_urn; return 0; } # # Ask the console URL for a node in an instance. # sub DoConsole() { usage() if (!@ARGV); my $node_id = shift(@ARGV); if (defined($webtask_id)) { $webtask = WebTask->LookupOrCreate(undef, $webtask_id); if (!defined($webtask)) { fatal("Could not lookup/create webtask for $webtask_id"); } # Convenient. $webtask->AutoStore(1); } # # Sanity check to make sure the node is really in the rspec, since # we need its sliver urn to ask for the console url. # my $sliver_urn; my $sliver; foreach my $obj ($instance->AggregateList()) { my $manifest = GeniXML::Parse($obj->manifest()); if (! defined($manifest)) { fatal("Could not parse manifest for $obj"); } my @nodes = GeniXML::FindNodes("n:node", $manifest)->get_nodelist(); foreach my $node (@nodes) { my $client_id = GeniXML::GetVirtualId($node); my $urn = GeniXML::GetSliverId($node); my $manager_urn = GetManagerId($node); # No sliver urn or a different aggregate. next if (! (defined($urn) && defined($manager_urn) && $manager_urn eq $obj->aggregate_urn())); if ($node_id eq $client_id) { $sliver_urn = $urn; $sliver = $obj; } } } if (!defined($sliver_urn)) { fatal("Could not find node '$node_id' in manifest"); } my $response = $sliver->ConsoleInfo($sliver_urn); if (!defined($response)) { fatal("RPC Error calling ConsoleInfo"); } if ($response->code() == GENIRESPONSE_UNAVAILABLE) { print STDERR "Server says there is no console for $node_id\n"; if (defined($webtask)) { $webtask->output("Sorry, $node_id does not have a console line"); $webtask->Exited($response->code()); } exit($response->code()); } if ($response->code() == GENIRESPONSE_SEARCHFAILED) { print STDERR "Server says $node_id has been deallocated\n"; if (defined($webtask)) { $webtask->output("Sorry, $node_id has been deallocated"); $webtask->Exited($response->code()); } exit($response->code()); } if ($response->code() != GENIRESPONSE_SUCCESS) { $response = $sliver->ConsoleURL($sliver_urn); if (!defined($response)) { fatal("RPC Error calling ConsoleURL"); } if ($response->code() != GENIRESPONSE_SUCCESS) { if ($response->value()) { fatal($response->output()); } fatal("Server returned error: " . GENIRESPONSE_STRING($response->code)); } } my $url; my $pswd; my $logurl; if (ref($response->value())) { $url = $response->value()->{'url'}; $pswd = $response->value()->{'password'} if (exists($response->value()->{'password'})); $logurl = $response->value()->{'logurl'} if (exists($response->value()->{'logurl'})); print Dumper($response->value()); } else { $url = $response->value(); } if (defined($webtask)) { if ($response->code()) { $webtask->output($response->output()); } else { $webtask->url($url); $webtask->password($pswd) if (defined($pswd)); $webtask->logurl($logurl) if (defined($logurl)); } $webtask->Exited($response->code()); exit($response->code()); } # For command line operation too. if ($response->code()) { fatal($response->output()); } print $url . "\n"; print $pswd . "\n" if (defined($pswd)); print $logurl . "\n" if (defined($logurl)); exit(0); } # # Terminate # sub DoTerminate() { my $errmsg; my $logfile; my $expired = $RECORDHISTORY_TERMINATED; if (@ARGV) { my $arg = shift(@ARGV); if ($arg eq "-e") { $expired = $RECORDHISTORY_EXPIRED; } else { usage(); } } my $slice = $instance->GetGeniSlice(); if (!defined($slice)) { # # No slice (typically) means we never got far enough to the # get the sliver created on the backend cluster. # goto killit; } # # Lock the slice in case it is doing something else, like taking # a disk image. # if ($slice->Lock()) { # # A special case is if the slice is provisioning. This means the # user is giving up on it, and we want to tell the aggregate to # kill it. Not all aggregates are going to allow this, so need # to be able to deal with that. # if ($instance->status() ne "provisioned") { fatal("Slice is busy, cannot lock it"); } if (!$instance->canceled()) { print "Marking instance canceled\n"; $instance->MarkCanceled(); } sleep(1); # We have an obvious race here since we do not have the lock. exit(0); } my $old_status = $instance->status(); $instance->SetStatus("terminating"); # # Exit and let caller poll for status. # if (!$debug) { $logfile = TBMakeLogname("terminate"); if (my $childpid = TBBackGround($logfile)) { my $status = 0; # # Wait a couple of seconds to see if there is going to be an # immediate error. Then return and let it continue to run. This # allows the web server to see quick errors. Later errors will # have to be emailed. # sleep(3); my $foo = waitpid($childpid, &WNOHANG); if ($foo) { $status = $? >> 8; } exit($status); } } my $coderef = sub { my ($sliver) = @_; my $urn = $sliver->aggregate_urn(); my $errmsg; return 0 if ($sliver->status() eq "terminated"); my $response = $sliver->Terminate(); if (!defined($response)) { $errmsg = "RPC Error calling Terminate"; goto bad; } # SEARCHFAILED is success. if ($response->code() != GENIRESPONSE_SUCCESS && $response->code() != GENIRESPONSE_SEARCHFAILED) { if ($response->code() == GENIRESPONSE_BUSY || $response->code() == GENIRESPONSE_SERVER_UNAVAILABLE) { $errmsg = "Slice was busy for too long; try again later?"; goto bad; } $errmsg = "Could not delete slice: ". $response->output(); goto bad; } $instance->SetStatus("terminated"); return 0; bad: print STDERR "$urn: $errmsg\n"; return -1; }; #print STDERR Dumper($instance); my @return_codes = (); my @agglist = $instance->AggregateList(); if (ParRun({"maxwaittime" => 99999, "maxchildren" => scalar(@agglist)}, \@return_codes, $coderef, @agglist)) { # # The parent caught a signal. Leave things intact so that we can # kill things cleanly later. # $errmsg = "Internal error calling Terminate()"; goto bad; } # # Check the exit codes. # foreach my $code (@return_codes) { if ($code) { $errmsg = "Some slivers would not terminate"; goto bad; } } $slice->Delete(); $instance->RecordHistory($expired); killit: $instance->Delete(); unlink($logfile) if (defined($logfile)); exit(0); bad: print STDERR $errmsg . "\n"; $instance->SetStatus($old_status); $slice->UnLock(); if (defined($logfile)) { my $instance_name = $instance->name(); my $slice_uuid = $slice->uuid(); SENDMAIL($TBOPS, "Unable to terminate instance $uuid", "Name: $instance_name\n". "Slice: $slice_uuid\n\n". "$errmsg\n", $TBOPS, undef, $logfile) if (!$silent); unlink($logfile); } exit(1); } # # Request an extension; all this code used to be in PHP, that was silly. # sub DoExtend() { my $force = 0; my $lockdown = 0; my $errcode = 1; my $autoextend_maximum = GetSiteVar("aptui/autoextend_maximum"); my $autoextend_maxage = GetSiteVar("aptui/autoextend_maxage"); my $autoextend_freedays= 2; my $creator = $instance->GetGeniUser(); my $slice = $instance->GetGeniSlice(); my $name = $instance->name(); my $url = $instance->webURL(); my $clusters = join(",", map { $_->domain() } $instance->AggregateList()); my $pcount = $instance->physnode_count(); my $expires_time = str2time($slice->expires()); my $created_time = str2time($instance->created()); my $extensions = $instance->Brand()->ExtensionsEmailAddress(); my $granted = 0; my $needapproval = 0; my $message; my $reason; my $errmsg; usage() if (!@ARGV); my $wanted = shift(@ARGV); if (@ARGV == 2) { my $arg = shift(@ARGV); if ($arg eq "-m") { $reason = shift(@ARGV); } else { usage(); } } elsif (@ARGV == 1) { my $filename = shift(@ARGV); if (! -e $filename) { fatal("$filename does not exist"); } open(MSG, $filename) or fatal("Could not open $filename"); $reason = ""; while () { $reason .= $_; } close(MSG); } # # Create the webtask object; the web interface gave us an anonymous # webtask, so we can use it before lock. # if (defined($webtask_id)) { $webtask = WebTask->Lookup($webtask_id); fatal("Could not lookup webtask object") if (!defined($webtask)); # Convenient. $webtask->AutoStore(1); } # # Lock the slice in case it is doing something else, like taking # a disk image. # if ($slice->Lock()) { $errcode = GENIRESPONSE_BUSY; $errmsg ="Experiment is busy, cannot lock it. Try again later."; if (defined($webtask)) { $webtask->output($errmsg); $webtask->Exited($errcode); } print STDERR "$errmsg\n"; exit($errcode); } if (defined($reason) && !TBcheck_dbslot($reason, "default", "fulltext", TBDB_CHECKDBSLOT_WARN|TBDB_CHECKDBSLOT_ERROR)) { $errmsg = "Illegal characters in your reason"; $errcode = 1; goto bad; } if (!TBcheck_dbslot($wanted, "default", "int", TBDB_CHECKDBSLOT_WARN|TBDB_CHECKDBSLOT_ERROR)) { $errmsg = "Illegal integer for length"; $errcode = 1; goto bad; } # Helper function. my $needAdminApproval = sub { my ($wanted, $granted, $reason, $message) = @_; # Subtract out the extra free time we added. my $howlong = $wanted - $granted; my $new_expires = POSIX::strftime("20%y-%m-%d %H:%M:%S %Z", localtime(str2time($slice->expires())+ ($howlong * 3600 * 24))); my $created = POSIX::strftime("20%y-%m-%d %H:%M:%S %Z", localtime(str2time($instance->created()))); $instance->Brand()->SendEmail($extensions, "Experiment Extension Request: $name", "A request to extend this experiment was made but requires\n". "administrator approval" . ($message ? " $message" : "") . ".\n\n" . "The request was for $wanted days, we granted $granted days, ". "the reason given is:\n\n". $reason . "\n\n". "This experiment was started on $created\n". "Granting the request would set the expiration to $new_expires\n". "It is running on $clusters\n". "\n\n". $url . "\n\n", $creator->email()); # Flag for the dashboard page. $instance->ExtensionRequested($reason, $granted); # Need to return this to the web interface via the webtask. return "Your request requires admininstrator approval". ($message ? " because $message" : "") . ". " . "You will receive email if/when your ". "request is granted (or denied). Thanks!"; }; # # If no physical nodes (only VMs), double the maximums. # if (!$instance->physnode_count()) { $autoextend_maxage *= 2; $autoextend_maximum *= 2; } # # Guest users are treated differently. # if (!defined($this_user)) { # Only extend for 24 hours. $granted = 1; if ($expires_time > time() + (3600 * 24 * $granted)) { $errmsg = "You still have a day left. Try again tomorrow"; $errcode = 1; goto bad; } } # # Admin user, we do whatever it says to do. # elsif ($this_user->IsAdmin()) { $message = "Your experiment was extended by the site administrator."; $granted = $wanted; } else { my $diff = $expires_time - time(); my $cdiff = time() - $created_time; if (! defined($reason)) { fatal("You must supply a reason for this extension"); } # # If admin lockout, we are refusing any more free time. # if ($instance->extension_adminonly()) { $message = "because you are not allowed any more extensions"; $granted = 0; } # # After maxage, all extension requests require admin approval. # elsif ($cdiff > (3600 * 24 * $autoextend_maxage)) { # # Well, if they asked for less then the free grant, and # the experiment is going to expire very soon, we give # them some extra time. This is a nice loophole people will # probably notice. # my $mindiff = $autoextend_freedays * 3600 * 24; if ($diff < $mindiff) { $granted = POSIX::ceil(($mindiff - $diff) / (3600 * 24)); } else { $granted = 0; } if ($wanted > $granted) { $needapproval = 1; $message = "because it was started more then ". "$autoextend_maxage days ago"; } } # # Temporary for GEC23, this should be generalized next time. # elsif (0 && (time() + ($wanted * 3600 * 24) > str2time("2015-06-15 12:00:00"))) { $granted = 1; $needapproval = 1; $message = "because the testbed is mostly reserved for GEC23"; } # # Registered users are granted up to the autoextend_maximum # automatically. Beyond that, requires approval, but we still # give them whatever the free extension is, since we want to # give them extra time until the next meeting of the "resource # management committee." # elsif ($wanted > $autoextend_maximum) { $needapproval = 1; $message = "because it was for longer then $autoextend_maximum days"; # # Plenty of time left, no extension just a message. # if ($diff > (3600 * 24 * 3)) { $granted = 0; } else { $granted = $autoextend_maximum; } } elsif ($diff > (3600 * 24 * 7)) { my $days = POSIX::ceil($diff / (3600 * 24.0)); $errmsg = "You still have $days day(s) left before expiration!"; $errcode = 1; goto bad; } else { $granted = $wanted; } # # The most we allow is the autoextend_maximum out, no # matter what they asked for. So, if the autoextend_maximum # is a week and there are five days left and they asked # for seven, we give them two. # if ($expires_time + ($granted * 3600 * 24) > time() + (3600 * 24 * $autoextend_maximum)) { $granted = POSIX::ceil(((3600 * 24 * $autoextend_maximum) - $diff) / (3600 * 24.0)); } } # # Do the extension. # if ($granted) { if ($errcode = ExtendInternal($slice, $granted * 3600 * 24, 0, \$errmsg)) { goto bad; } } my $expires = POSIX::strftime("20%y-%m-%d %H:%M:%S %Z", localtime(str2time($slice->expires()))); my $created = POSIX::strftime("20%y-%m-%d %H:%M:%S %Z", localtime(str2time($instance->created()))); my $now = POSIX::strftime("20%y-%m-%d %H:%M:%S %Z", localtime()); my $before = POSIX::strftime("20%y-%m-%d %H:%M:%S %Z", localtime($expires_time)); # # New extension mechanism # my $extensionargs = { "action" => "request", "wanted" => $wanted, "granted" => $granted, "admin" => $this_user->IsAdmin() ? 1 : 0}; if (defined($message)) { $extensionargs->{"message"} = $message; } if (defined($reason)) { $extensionargs->{"reason"} = $reason; } if (defined($this_user)) { $extensionargs->{"uid"} = $this_user->uid(); $extensionargs->{"uid_idx"} = $this_user->uid_idx(); } else { # A guest user, only the creator can request an extension. $extensionargs->{"uid"} = $instance->creator(); $extensionargs->{"uid_idx"} = $instance->creator_idx(); } my $extensioninfo = APT_Instance::ExtensionInfo->Create($instance, $extensionargs); if (!defined($extensioninfo)) { print STDERR "Could not create extension info object\n"; } # # We store each extension request in an ongoing text field. # my $text = "Date: $now\n". "Wanted: $wanted, Granted: $granted\n". "Before: $before\n". "After $expires\n". "Reason:\n". $reason . "\n\n". "-----------------------------------------------\n"; $instance->AddExtensionHistory($text); if ($needapproval) { $errmsg = &$needAdminApproval($wanted, $granted, $reason, $message); # The web interface (JS code) uses this error code. $errcode = 2; goto bad; } $instance->Brand()->SendEmail($creator->email(), "Experiment Extension: $name", ($this_user->IsAdmin() ? "$message\n\n$reason" : "A request to extend your experiment was made and ". "granted.\n". "Your reason was:\n\n". $reason) . "\n\n". "Your experiment was started on $created\n". "Your experiment will now expire at $expires\n". "You are using $pcount physical nodes.\n". "It is running on $clusters\n\n". "$url\n", "$extensions", "BCC: $extensions"); if (!$this_user->IsAdmin()) { # # We do not want to overwrite the reason in the DB if this # was an admin extension, we want to keep whatever the user # has written previously. This currently used by the web interface # to show the latest reason. # $instance->Update({"extension_reason" => $reason}); } else { # # Any time an admin issues an extension, we clear the flag that tells # the dashboard page there is an oustanding request. # $instance->Update({"extension_requested" => 0}); } $instance->BumpExtensionCount($granted); if (defined($webtask)) { $webtask->Exited(0); } $slice->UnLock(); exit(0); bad: $slice->UnLock(); print STDERR $errmsg . "\n"; if (defined($webtask)) { $webtask->output($errmsg); $webtask->Exited($errcode); } exit($errcode); } sub ExtendInternal($$$$) { my ($slice, $seconds, $force, $perrmsg) = @_; my $lockdown = 0; my $errcode = -1; my $errmsg; # Save in case of error. my $oldexpires = $slice->expires(); # Lockdown on admin extensions longer then XX days. if (defined($this_user) && $this_user->IsAdmin() && ($seconds / (24 * 60 * 60)) > 10) { $lockdown = 1 } # Need to update slice before creating new credential. if ($slice->IsExpired()) { $slice->SetExpiration(time() + $seconds); } else { $slice->AddToExpiration($seconds); } my $new_expires = $slice->ExpirationGMT(); my $coderef = sub { my ($sliver) = @_; my $webtask = $sliver->webtask(); my $domain = $sliver->domain(); my $errmsg; my $response = $sliver->Extend($new_expires, $this_user); if (!defined($response)) { $errmsg = "Internal error calling Renew at $domain"; goto bad; } if ($response->code() != GENIRESPONSE_SUCCESS) { $errmsg = "Failed to extend slice at $domain: ". $response->output(); # This is something the user should see. if ($response->code() == GENIRESPONSE_REFUSED || $response->code() == GENIRESPONSE_SERVER_UNAVAILABLE || $response->code() == GENIRESPONSE_BUSY) { # For web interface. $webtask->output($errmsg); $webtask->Exited($response->code()); return 1; } goto bad; } return 0; bad: print STDERR "$errmsg\n"; $webtask->output($errmsg); $webtask->Exited(-1); return -1; }; my @return_codes = (); my @agglist = $instance->AggregateList(); if (ParRun({"maxwaittime" => 99999, "maxchildren" => scalar(@agglist)}, \@return_codes, $coderef, @agglist)) { # # The parent caught a signal. Leave things intact so that we can # kill things cleanly later. # $errmsg = "Internal error calling Extend\n"; goto bad; } # # Check the exit codes. # foreach my $agg (@agglist) { my $code = shift(@return_codes); if ($code) { $agg->webtask()->Refresh(); $errmsg = $agg->webtask()->output(); $errcode = $agg->webtask()->exitcode(); goto bad; } } # Lockdown. if ($lockdown) { if (DoLockdownInternal("set", "admin")) { SENDMAIL($TBOPS, "Failed to lock down APT Instance", "Failed to lock down $instance\n". $instance->webURL() . "\n", $TBOPS); } } return 0; bad: # Reset back to original expiration, sorry. $slice->SetExpiration($oldexpires); $$perrmsg = $errmsg; return $errcode; } # # Deny extension, sending optional email to user (which is also saved in # the extension history). We used to do this in PHP, which was silly. # sub DoDenyOrMoreInfo($) { my ($action) = @_; my $errcode = -1; my $reason; if (! $this_user->IsAdmin()) { fatal("Only administrators can deny extensions or request info"); } if (@ARGV == 2) { my $arg = shift(@ARGV); if ($arg eq "-m") { $reason = shift(@ARGV); } else { usage(); } } elsif (@ARGV == 1) { my $filename = shift(@ARGV); if (! -e $filename) { fatal("$filename does not exist"); } open(MSG, $filename) or fatal("Could not open $filename"); $reason = ""; while () { $reason .= $_; } close(MSG); } my $creator = $instance->GetGeniUser(); my $slice = $instance->GetGeniSlice(); my $name = $instance->name(); my $expires = POSIX::strftime("20%y-%m-%d %H:%M:%S %Z", localtime(str2time($slice->expires()))); my $created = POSIX::strftime("20%y-%m-%d %H:%M:%S %Z", localtime(str2time($instance->created()))); my $now = POSIX::strftime("20%y-%m-%d %H:%M:%S %Z", localtime()); my $url = $instance->webURL(); my $pcount = $instance->physnode_count(); my $extensions= $instance->Brand()->ExtensionsEmailAddress(); my $clusters = join(",", map { $_->domain() } $instance->AggregateList()); my ($message,$subject); if ($action eq "deny") { $message = "Your extension was denied by the site administrator!"; $subject = "Experiment Extension Denied: $name"; } else { $message = "Hi, we need more information about your experiment: $name"; $subject = "Information request for Experiment: $name"; } # # New extension mechanism # my $extensionargs = { "action" => ($action eq "deny" ? "deny" : "info"), "uid" => $this_user->uid(), "uid_idx" => $this_user->uid_idx(), "message" => $message, "admin" => $this_user->IsAdmin() ? 1 : 0}; if (defined($reason)) { $extensionargs->{"reason"} = $reason; } my $extensioninfo = APT_Instance::ExtensionInfo->Create($instance, $extensionargs); if (!defined($extensioninfo)) { print STDERR "Could not create extension info object\n"; return -1; } # # We store each extension request in an ongoing text field. # my $text = "Date: $now\n". "Expires: $expires\n". "Reason:\n". $message . "\n\n". $reason . "\n\n". "-----------------------------------------------\n"; $instance->Brand()->SendEmail($creator->email(), $subject, $message . "\n\n" . $reason . "\n\n". "Your experiment was started on $created\n". "Your experiment expires at $expires\n". "You are using $pcount physical nodes.\n". "It is running on $clusters\n\n". "$url\n", "$extensions", "BCC: $extensions"); $instance->AddExtensionHistory($text); # For the dashboard and status page. if ($action eq "deny") { $instance->Update({"extension_requested" => 0, "extension_denied" => 1, "extension_denied_reason" => $reason}); } return 0; } # # Old Extend. # sub DoExtendOld() { my $force = 0; my $lockdown = 0; my $errcode = -1; usage() if (!@ARGV); if (@ARGV == 2) { my $arg = shift(@ARGV); if ($arg eq "-f") { $force = 1; } else { usage(); } } my $seconds = shift(@ARGV); if ($seconds !~ /^\d*$/) { usage(); } if ($instance->status() eq "failed" && !$force) { fatal("Cannot extend failed instance!"); } my $slice = $instance->GetGeniSlice(); if (!defined($slice)) { fatal("No slice for instance!"); } # # Lock the slice in case it is doing something else, like taking # a disk image. This happens all the time, users are silly. Lets # stop the email about it. # if ($slice->Lock()) { print STDERR "Experiment is busy, cannot lock it. Try again later.\n"; exit(GENIRESPONSE_BUSY); } # Save in case of error. my $oldexpires = $slice->expires(); # Lockdown on admin extensions longer then XX days. if (defined($this_user) && $this_user->IsAdmin() && ($seconds / (24 * 60 * 60)) > 10) { $lockdown = 1 } # Need to update slice before creating new credential. $slice->AddToExpiration($seconds); my $new_expires = $slice->ExpirationGMT(); my $coderef = sub { my ($sliver) = @_; my $webtask = $sliver->webtask(); my $domain = $sliver->domain(); my $errmsg; my $response = $sliver->Extend($new_expires, $this_user); if (!defined($response)) { $errmsg = "Internal error calling Renew at $domain"; goto bad; } if ($response->code() != GENIRESPONSE_SUCCESS) { # This is something the user should see. if ($response->code() == GENIRESPONSE_REFUSED || $response->code() == GENIRESPONSE_SERVER_UNAVAILABLE || $response->code() == GENIRESPONSE_BUSY) { print STDERR $response->output() . "\n"; # For web interface. $webtask->output($response->output()); $webtask->Exited($response->code()); return 1; } $errmsg = "Failed to extend slice at $domain: ". $response->output(); goto bad; } return 0; bad: print STDERR "$errmsg\n"; $webtask->output($errmsg); $webtask->Exited(-1); return -1; }; my @return_codes = (); my @agglist = $instance->AggregateList(); if (ParRun({"maxwaittime" => 99999, "maxchildren" => scalar(@agglist)}, \@return_codes, $coderef, @agglist)) { # # The parent caught a signal. Leave things intact so that we can # kill things cleanly later. # print STDERR "Internal error calling Extend\b"; goto bad; } # # Check the exit codes. # foreach my $agg (@agglist) { my $code = shift(@return_codes); if ($code) { $agg->webtask()->Refresh(); print STDERR "Some slivers could not be extended.\n"; $errcode = $agg->webtask()->exitcode(); goto bad; } } # Lockdown. if ($lockdown) { if (DoLockdownInternal("set", "admin")) { SENDMAIL($TBOPS, "Failed to lock down APT Instance", "Failed to lock down $instance\n". $instance->webURL() . "\n", $TBOPS); } } $slice->UnLock(); exit(0); bad: # Reset back to original expiration, sorry. $slice->SetExpiration($oldexpires); $slice->UnLock(); exit($errcode); } # # Refresh; ask the aggregate for status and set the instance status # accordingly. # sub DoRefresh() { my $errmsg; my $slice = $instance->GetGeniSlice(); if (!defined($slice)) { print STDERR "No slice for instance\n"; goto killit; } # # Lock the slice in case it is doing something else, like taking # a disk image. # if ($slice->Lock()) { $errmsg = "Experiment is busy, cannot lock it. Please try again later"; goto bad; } # # Create the webtask object, but AFTER locking the slice so we do # not destroy one in use. # if (defined($webtask_id)) { $webtask = WebTask->LookupOrCreate($instance->uuid(), $webtask_id); # Convenient. $webtask->AutoStore(1); } my $coderef = sub { my ($sliver) = @_; my $webtask = $sliver->webtask(); my $errmsg; my $response = $sliver->SliceStatus(); if (!defined($response)) { $errmsg = "RPC Error calling SliceStatus"; goto bad; } if ($response->code() != GENIRESPONSE_SUCCESS) { if ($response->code() == GENIRESPONSE_SEARCHFAILED) { $errmsg = "Slice is gone"; goto bad; } if ($response->code() == GENIRESPONSE_BUSY) { $errmsg = "Slice is busy; try again later"; goto bad; } $errmsg = "Could not get status: ". $response->output(); goto bad; } my $blob = $response->value(); if ($blob->{'status'} eq "ready") { $sliver->SetStatus("ready"); } elsif ($blob->{'status'} eq "failed") { $sliver->SetStatus("failed"); } # This is the per-aggregate status, we always set this for web UI. my $statusblob = $sliver->UpdateWebStatus($blob->{'details'}); if ($debug) { print STDERR Dumper($statusblob); } return 0; bad: print STDERR "$errmsg\n"; $webtask->output($errmsg); $webtask->Exited(1); return 1; }; my @return_codes = (); my @agglist = $instance->AggregateList(); if (ParRun({"maxwaittime" => 99999, "maxchildren" => scalar(@agglist)}, \@return_codes, $coderef, @agglist)) { # # The parent caught a signal. Leave things intact so that we can # kill things cleanly later. # $errmsg = "Internal error calling Refresh"; goto bad; } # # Check the exit codes. # foreach my $agg (@agglist) { my $code = shift(@return_codes); if ($code) { $agg->webtask()->Refresh(); $errmsg = "Some slivers could not be refreshed"; if ($agg->webtask()->output()) { $errmsg .= ": " . $agg->webtask()->output(); } goto bad; } } $slice->UnLock(); exit(0); killit: $instance->RecordHistory($RECORDHISTORY_TERMINATED); $instance->Delete(); exit(0); bad: $slice->UnLock(); print STDERR $errmsg . "\n"; if (defined($webtask)) { $webtask->output($errmsg); $webtask->Exited(1); } exit(1); } # # Reboot or Reload nodes. # sub DoRebootOrReload($) { my ($which) = @_; my $errmsg; usage() if (!@ARGV); my $slice = $instance->GetGeniSlice(); if (!defined($slice)) { print STDERR "No slice for instance\n"; goto killit; } my %sliver_urns = (); my %node_ids = (); my @slivers = (); foreach my $obj ($instance->AggregateList()) { my $manifest = GeniXML::Parse($obj->manifest()); if (! defined($manifest)) { fatal("Could not parse manifest"); } my @nodes = GeniXML::FindNodes("n:node", $manifest)->get_nodelist(); foreach my $node (@nodes) { my $client_id = GeniXML::GetVirtualId($node); if (grep {$_ eq $client_id} @ARGV) { my $sliver_urn = GeniXML::GetSliverId($node); my $manager_urn = GetManagerId($node); # No sliver urn or a different aggregate. next if (! (defined($sliver_urn) && defined($manager_urn) && $manager_urn eq $obj->aggregate_urn())); if (!exists($sliver_urns{$obj->aggregate_urn()})) { $sliver_urns{$obj->aggregate_urn()} = []; push(@slivers, $obj); } push(@{ $sliver_urns{$obj->aggregate_urn()} }, $sliver_urn); $node_ids{$sliver_urn} = $client_id; } } } # # Lock the slice in case it is doing something else, like taking # a disk image. # if ($slice->Lock()) { $errmsg = "Experiment is busy, cannot lock it. Please try again later"; goto bad; } # # Create the webtask object, but AFTER locking the slice so we do # not destroy one in use. # if (defined($webtask_id)) { $webtask = WebTask->LookupOrCreate($instance->uuid(), $webtask_id); # Convenient. $webtask->AutoStore(1); } my $coderef = sub { my ($sliver) = @_; my $webtask = $sliver->webtask(); my @urns = @{ $sliver_urns{$sliver->aggregate_urn()} }; my $errmsg; my $response = $sliver->SliverAction(\$errmsg, $which, @urns); if (!defined($response)) { $errmsg = "RPC Error calling SliverAction"; goto bad; } if ($response->code() != GENIRESPONSE_SUCCESS) { if ($response->code() == GENIRESPONSE_SEARCHFAILED) { print STDERR "Slice is already gone on $sliver"; goto gone; } if ($response->code() == GENIRESPONSE_BUSY) { $errmsg = "Experiment is busy; try again later"; goto bad; } $errmsg = $response->output(); goto bad; } gone: # Tell the web interface something is different. Real status will # come later when the monitor starts up. if ($webtask->sliverstatus()) { my $blob = $webtask->sliverstatus(); foreach my $urn (@urns) { my $node_id = $node_ids{$urn}; $blob->{$node_id}->{'status'} = "changing"; } $webtask->sliverstatus($blob); } return 0; bad: print STDERR "$errmsg\n"; $webtask->output($errmsg); $webtask->Exited(1); return 1; }; my @return_codes = (); if (ParRun({"maxwaittime" => 99999, "maxchildren" => scalar(@slivers)}, \@return_codes, $coderef, @slivers)) { # # The parent caught a signal. Leave things intact so that we can # kill things cleanly later. # $errmsg = "Internal error calling SliverAction"; goto bad; } # # Check the exit codes. # foreach my $code (@return_codes) { if ($code) { $errmsg = "Some slivers could not be ${which}'ed"; goto bad; } } $slice->UnLock(); if (defined($webtask)) { $webtask->Exited(0); } # # Start the monitor so the web interface will see when the node # has actually come back up. # # XXX This will not return unless a monitor is already running. StartMonitorInternal(); exit(0); killit: $instance->RecordHistory($RECORDHISTORY_TERMINATED); $instance->Delete(); exit(0); bad: $slice->UnLock(); print STDERR $errmsg . "\n"; if (defined($webtask)) { $webtask->output($errmsg); $webtask->Exited(1); } exit(1); } sub DoReboot() { return DoRebootOrReload("reboot"); } sub DoReload() { return DoRebootOrReload("reload"); } # # # sub DoManifests() { my $errmsg; my $slice = $instance->GetGeniSlice(); if (!defined($slice)) { print STDERR "No slice for instance\n"; goto killit; } my $coderef = sub { my ($sliver) = @_; my $webtask = $sliver->webtask(); my $errmsg; my $response = $sliver->GetManifest(); if (!defined($response)) { $errmsg = "RPC Error calling GetManifest"; goto bad; } return 0; bad: return 1; }; my @return_codes = (); my @agglist = $instance->AggregateList(); if (ParRun({"maxwaittime" => 99999, "maxchildren" => scalar(@agglist)}, \@return_codes, $coderef, @agglist)) { # # The parent caught a signal. Leave things intact so that we can # kill things cleanly later. # $errmsg = "Internal error calling GetManifest"; goto bad; } # # Check the exit codes. # foreach my $code (@return_codes) { if ($code) { $errmsg = "Could not get manifest for some slivers"; goto bad; } } exit(0); bad: print STDERR $errmsg . "\n"; exit(1); } # # Delete nodes. # sub DoDeleteNodes() { my $logname; my $errmsg; my $errcode = 1; usage() if (!@ARGV); my $slice = $instance->GetGeniSlice(); if (!defined($slice)) { fatal("No slice for instance"); } my @aggregates = (); my %node_ids = (); my %aggmap = (); foreach my $obj ($instance->AggregateList()) { my $manifest = GeniXML::Parse($obj->manifest()); if (! defined($manifest)) { fatal("Could not parse manifest"); } my @nodes = GeniXML::FindNodes("n:node", $manifest)->get_nodelist(); foreach my $node (@nodes) { my $client_id = GeniXML::GetVirtualId($node); if (grep {$_ eq $client_id} @ARGV) { my $sliver_urn = GeniXML::GetSliverId($node); my $manager_urn = GetManagerId($node); # No sliver urn or a different aggregate. next if (! (defined($sliver_urn) && defined($manager_urn) && $manager_urn eq $obj->aggregate_urn())); if (!exists($aggmap{$obj->aggregate_urn()})) { $aggmap{$obj->aggregate_urn()} = []; push(@aggregates, $obj); } push(@{ $aggmap{$obj->aggregate_urn()} }, $client_id); $node_ids{$sliver_urn} = $client_id; } } } # # Lock the slice in case it is doing something else, like taking # a disk image. # if ($slice->Lock()) { $errmsg = "Experiment is busy, cannot lock it. Please try again later"; goto bad; } # # Create the webtask object, but AFTER locking the slice so we do # not destroy one in use. # if (defined($webtask_id)) { $webtask = WebTask->LookupOrCreate($instance->uuid(), $webtask_id); # Convenient. $webtask->AutoStore(1); } my $coderef = sub { my ($sliver) = @_; my $webtask = $sliver->webtask(); my @nodes = @{ $aggmap{$sliver->aggregate_urn()} }; my $errcode = -1; my $errmsg; $sliver->SetStatus("provisioning"); my $response = $sliver->DeleteNodes(\$errmsg, @nodes); if (!defined($response)) { $errmsg = "RPC Error calling DeleteNode"; goto bad; } if ($response->code() != GENIRESPONSE_SUCCESS) { if ($response->code() == GENIRESPONSE_SEARCHFAILED) { print STDERR "Slice is gone on $sliver"; goto bad; } if ($response->code() == GENIRESPONSE_BUSY) { $errmsg = "Experiment is busy; try again later"; goto bad; } $errmsg = $response->output(); $errcode = $response->code(); goto bad; } # We get back a new manifest. my $manifest = $response->value(); $sliver->SetManifest($manifest); # Delete the nodes from the status blob. if ($webtask->sliverstatus()) { my $blob = $webtask->sliverstatus(); foreach my $node_id (@nodes) { delete($blob->{$node_id}); } $webtask->sliverstatus($blob); } $sliver->SetStatus("provisioned"); return 0; bad: $sliver->SetStatus("ready"); $webtask->output($errmsg); $webtask->Exited($errcode); print STDERR "Returning $errcode from coderef\n"; return $errcode; }; # # Set the status back to provisioning for the web interface. # $instance->SetStatus("provisioning"); my @return_codes = (); if (ParRun({"maxwaittime" => 99999, "maxchildren" => scalar(@aggregates)}, \@return_codes, $coderef, @aggregates)) { # # The parent caught a signal. Leave things intact so that we can # kill things cleanly later. # $errmsg = "Internal error calling DeleteNodes"; goto bad; } # # Check the exit codes. # foreach my $aggobj (@aggregates) { my $code = shift(@return_codes); # Updated in a forked child, must refresh. $aggobj->Refresh(); if ($code) { if ($aggobj->webtask()->output()) { $errmsg = $aggobj->webtask()->output(); } else { $errmsg = "Some nodes could not be deleted"; } $errcode = $aggobj->webtask()->exitcode(); goto bad; } } # # Let the web interface continue, we poll now. # if (!$debug) { $logname = TBMakeLogname("deletenode"); if (TBBackGround($logname)) { exit(0); } } $instance->SetStatus("provisioned"); $instance->ComputeNodeCounts(); @return_codes = (); if (ParRun({"maxwaittime" => 99999, "maxchildren" => scalar(@aggregates)}, \@return_codes, \&APT_Instance::Aggregate::WaitForSliver, @aggregates)) { # # The parent caught a signal. Leave things intact so that we can # kill things cleanly later. # $errmsg = "Internal error waiting for slivers"; goto bad; } # # Check the exit codes. # foreach my $aggobj (@aggregates) { my $code = shift(@return_codes); # Updated in a forked child, must refresh. $aggobj->Refresh(); if ($code) { if ($aggobj->webtask()->output()) { $errmsg = $aggobj->webtask()->output(); } else { $errmsg = "WaitforSliver Failure at ".$aggobj->aggregate_urn(); } $errcode = $aggobj->webtask()->output(); goto bad; } } $slice->UnLock(); $instance->SetStatus("ready"); $webtask->Exited(0); exit(0); bad: $instance->SetStatus("ready"); $slice->UnLock(); print STDERR $errmsg . "\n"; if (defined($webtask)) { $webtask->output($errmsg); $webtask->Exited($errcode); } exit($errcode); } # # Start up the monitor for an instance. Only one though. # sub StartMonitor() { my $waitforstartup = 0; if (@ARGV && $ARGV[0] eq "-w") { $waitforstartup = 1; } if (defined($webtask_id)) { $webtask = WebTask->LookupOrCreate($instance->uuid(), $webtask_id); if (!defined($webtask)) { fatal("Could not lookup/create webtask for $webtask_id"); } # Convenient. $webtask->AutoStore(1); } return StartMonitorInternal($waitforstartup); } sub StartMonitorInternal(;$@) { my ($waitforstartup, @aggregatelist) = @_; my $logfile; my $signaled = 0; # Wait for the startup command to finish. $waitforstartup = 0 if (!defined($waitforstartup)); my $slice = $instance->GetGeniSlice(); if (!defined($slice)) { fatal("No slice for instance"); } if ($instance->monitor_pid()) { my $pid = $instance->monitor_pid(); if (kill(0, $pid)) { print STDERR "Monitor already running ($pid). ". "Kill it before starting a new one.\n"; return 1; } $instance->Update({"monitor_pid" => 0}); } if (!$debug) { $logfile = TBMakeLogname("aptmonitor"); if (TBBackGround($logfile)) { return $PID; } } $instance->Update({"monitor_pid" => '$PID'}); # # We just did the operation, no need to ask so soon, and we # avoid locking the slice in case the user wants to reboot # another node right away. For reboot/reload, nothing interesting # is going to be reported for a while. # sleep(30); my $seconds = ($waitforstartup ? 7200 : 900); my $interval = 15; # Shorten default timeout now. Genixmlrpc->SetTimeout(30); my $coderef = sub { my ($sliver) = @_; my $webtask = $sliver->webtask(); my $errmsg; my $response = $sliver->SliceStatus(); if (!defined($response)) { print STDERR "RPC Error calling SliceStatus\n"; return GENIRESPONSE_RPCERROR; } if (($response->code() != GENIRESPONSE_SUCCESS && $response->code() != GENIRESPONSE_SERVER_UNAVAILABLE && $response->code() != GENIRESPONSE_BUSY)) { print STDERR "SliverStatus failed"; print STDERR ": " . $response->output() . "\n"; if (defined($webtask)) { if ($response->output() =~ /read timeout/) { $webtask->output("Lost contact with the aggregate. " . "Possibly a network failure, ". "please try again later."); } else { $webtask->output($response->output()); } $webtask->exitcode($response->code()); } return -1; } if ($response->code() == GENIRESPONSE_BUSY) { # Indicate not done. return GENIRESPONSE_BUSY; } my $blob = $response->value(); # This is the per-aggregate status, we always set this for web UI. my $statusblob = $sliver->UpdateWebStatus($blob->{'details'}); if ($debug) { print STDERR Dumper($statusblob); } # Look for nodes still executing my $executing = 0; if ($waitforstartup) { foreach my $node_id (keys(%{$statusblob})) { my $details = $statusblob->{'node_id'}; $executing++ if (exists($details->{'execute_state'}) && $details->{'execute_state'} ne "exited"); } } # # We poll until the status goes ready, and if waiting for the # startup commands to finish, for all of them to no longer be # executing. # if ($blob->{'status'} eq "ready") { return 0 if (!$executing || !$waitforstartup); } # Not done yet. return 1; }; while ($seconds > 0) { $seconds -= $interval; # # Lock the slice in case it is doing something else, like taking # a disk image. Just skip this turn. # goto delay if ($slice->Lock()); my $handler = sub { # This is so we can catch when Parrun gets signaled, but not # exit till it exits. $signaled = 1; }; local $SIG{TERM} = $handler; if ($debug) { local $SIG{INT} = $handler; } my @return_codes = (); my @agglist = $instance->AggregateList() if (! @aggregatelist); if (ParRun({"maxwaittime" => 99999, "maxchildren" => scalar(@agglist)}, \@return_codes, $coderef, @agglist)) { print STDERR "Internal error calling Status()\n"; $slice->UnLock(); last; } local $SIG{TERM} = 'DEFAULT'; local $SIG{INT} = 'DEFAULT'; $slice->UnLock(); # # Check the exit codes. # my $done = 1; foreach my $code (@return_codes) { if ($code) { last if ($code < 0); $done = 0; } } last if ($done); delay: sleep($interval); } unlink($logfile) if (defined($logfile) && !$debug); exit($seconds < 0 ? -1 : 0); } # # Experiment lockdown. # sub DoLockdownInternal($$) { my ($setclr,$which) = @_; my $slice = $instance->GetGeniSlice(); if (!defined($slice)) { fatal("No slice for instance"); } if ($which eq "all") { if ($instance->SetLockdown("user", ($setclr eq "clear" ? 1 : 0))) { print STDERR "Could not update instance lockdown\n"; return -1 } $which = "admin" } if ($instance->SetLockdown($which, ($setclr eq "clear" ? 1 : 0))) { print STDERR "Could not update instance lockdown\n"; return -1 } my $clear = ($instance->admin_lockdown() || $instance->user_lockdown() ? 0 : 1); # # Have to set/clear the lockdown on the local slice. # if ($slice->SetLockdown($clear)) { print STDERR "Could not update slice lockdown\n"; return -1 } # # And tell the backend clusters to lockdown the slice. # my $coderef = sub { my ($sliver) = @_; # # We cannot do lockdown at AL2S. More generally, it only works at # Emulab based aggregates, but we do not talk to other aggregates. # if ($sliver->isAL2S()) { return 0 if ($clear); my $project = $instance->GetProject(); $project->SendEmail($this_user->email(), "Failed to completely lock down APT Instance", "Failed to completely lock down $instance,\n". "cause it crosses AL2S.\n\n". $instance->webURL() . "\n", $project->OpsEmailAddress(), "CC: " . $project->OpsEmailAddress()); return 0; } my $response = $sliver->Lockdown($clear); if (!defined($response)) { print STDERR "RPC Error calling Lockdown\n"; return -1; } if ($response->code() != GENIRESPONSE_SUCCESS) { print STDERR "Could not lockdown sliver: ". $response->output() . "\n"; return -1; } return 0; }; my @return_codes = (); my @agglist = $instance->AggregateList(); if (ParRun({"maxwaittime" => 99999, "maxchildren" => scalar(@agglist)}, \@return_codes, $coderef, @agglist)) { print STDERR "Internal error calling Lockdown()\n"; return -1; } # # Check the exit codes. # foreach my $code (@return_codes) { if ($code) { print STDERR "Some slivers could not be locked down.\n"; return -1; } } return 0; } sub DoLockdown() { usage() if (@ARGV != 2); my $setclr = shift(@ARGV); my $which = shift(@ARGV); fatal("Must specify either 'admin' or 'user'") if ($which !~ /^(admin|user|all)$/); fatal("Must specify either 'set' or 'clear'") if ($setclr !~ /^(set|clear)$/); my $slice = $instance->GetGeniSlice(); if (!defined($slice)) { fatal("No slice for instance"); } if ($slice->Lock()) { fatal("Experiment is busy, cannot lock it. Please try again later"); } if (DoLockdownInternal($setclr, $which)) { $slice->UnLock(); fatal("Could not lockdown instance!"); } $slice->UnLock(); exit(0); } sub DoPanic() { my $emsg; usage() if (@ARGV != 1); my $setclr = shift(@ARGV); fatal("Must specify either 'set' or 'clear'") if ($setclr !~ /^(set|clear)$/); my $slice = $instance->GetGeniSlice(); if (!defined($slice)) { fatal("No slice for instance"); } if ($slice->Lock()) { fatal("Experiment is busy, cannot lock it. Please try again later"); } # # Create the webtask object, but AFTER locking the slice so we do # not destroy one in use. # if (defined($webtask_id)) { $webtask = WebTask->LookupOrCreate($instance->uuid(), $webtask_id); # Convenient. $webtask->AutoStore(1); } # # And tell the backend clusters to lockdown the slice. # my $coderef = sub { my ($sliver) = @_; my $webtask = $sliver->webtask(); my $response = $sliver->Panic(($setclr eq "clear" ? 1 : 0)); if (!defined($response)) { print STDERR "RPC Error calling Panic\n"; return -1; } if ($response->code() != GENIRESPONSE_SUCCESS) { print STDERR "Could not panic sliver: ". $response->output() . "\n"; return -1; } # Tell the web interface something is different. Real status will # come later when the monitor starts up. if ($webtask->sliverstatus()) { my $blob = $webtask->sliverstatus(); foreach my $node_id (keys(%{ $blob })) { $blob->{$node_id}->{'status'} = "changing"; } $webtask->sliverstatus($blob); } return 0; }; my @return_codes = (); my @agglist = $instance->AggregateList(); if (ParRun({"maxwaittime" => 99999, "maxchildren" => scalar(@agglist)}, \@return_codes, $coderef, @agglist)) { $emsg = "Internal error calling Lockdown()"; goto bad; } # # Check the exit codes. # foreach my $code (@return_codes) { if ($code) { print STDERR "Some slivers could not be paniced"; goto bad; } } if ($instance->SetPanic(($setclr eq "clear" ? 1 : 0))) { $emsg = "Could not update instance panic flag"; goto bad; } StartMonitorInternal(); $slice->UnLock(); exit(0); bad: $slice->UnLock(); exit(-1); } # # Linktest # sub DoLinktest() { my $action = "start"; my $level = 1; my $errmsg; my $errcode = 1; if (@ARGV) { my $arg = shift(@ARGV); if ($arg eq "-k") { $action = "stop"; } elsif ($arg =~ /^\d$/ && $arg >= 1 && $arg <= 4) { $level = $arg; } else { usage(); } } my $slice = $instance->GetGeniSlice(); if (!defined($slice)) { fatal("No slice for instance!"); } # # Lock the slice in case it is doing something else, like taking # a disk image. # if ($slice->Lock()) { fatal("Slice is busy, cannot lock it"); } # Check after lock to prevent concurrent startup. if ($action eq "start") { if ($instance->status() ne "ready") { $slice->UnLock(); fatal("Must be ready to run linktest!"); } } else { if ($instance->status() ne "linktest") { $slice->UnLock(); fatal("Linktest is not running!"); } } # # Create the webtask object, but AFTER locking the slice so we do # not destroy one in use. # if (defined($webtask_id)) { $webtask = WebTask->LookupOrCreate($instance->uuid(), $webtask_id); # Convenient. $webtask->AutoStore(1); } # # And tell the backend clusters to run linktest # my $coderef = sub { my ($sliver) = @_; my $webtask = $sliver->webtask(); my $response = $sliver->RunLinktest($action, $level); if (!defined($response)) { print STDERR "RPC Error calling linktest on $sliver\n"; return -1; } if ($response->code() != GENIRESPONSE_SUCCESS) { print STDERR "Could not $action linktest on sliver: ". $response->output() . "\n"; $webtask->output($response->output()); $webtask->Exited($response->code()); return $response->code(); } my $blob = $response->value(); if ($blob->{'status'} eq "running") { $webtask->status("running"); $webtask->url($blob->{'url'}); } elsif ($blob->{'status'} eq "stopped") { $webtask->status("stopped"); $webtask->results($blob->{'results'}); $webtask->Exited(0); } return 0; }; # Change status now. my $old_status = $instance->status(); $instance->SetStatus("linktest"); my @return_codes = (); my @agglist = (); # # Cull out any aggregates with no nodes. # foreach my $agg ($instance->AggregateList()) { push(@agglist, $agg) if ($agg->physnode_count() || $agg->virtnode_count()); } if (ParRun({"maxwaittime" => 99999, "maxchildren" => scalar(@agglist)}, \@return_codes, $coderef, @agglist)) { $errmsg = "Internal error calling Lockdown()"; goto bad; } # # Check the exit codes. # foreach my $agg (@agglist) { my $code = shift(@return_codes); $agg->webtask()->Refresh(); if ($code) { $errmsg = "Could not $action linktest on some slivers"; if ($agg->webtask()->output()) { $errmsg .= ": " . $agg->webtask()->output(); $errcode = $agg->webtask()->exitcode(); } goto bad; } if (!defined($webtask) && $agg->webtask()->results()) { print $agg->webtask()->results(); } } if ($action eq "stop") { $instance->SetStatus("ready"); $slice->UnLock(); exit(0); } # # Okay, now we want to wait for linktest to finish on all the clusters # so that we can change the status back to ready. # my $logfile = TBMakeLogname("linktest"); if (my $childpid = TBBackGround($logfile)) { sleep(1); my $status = 0; my $foo = waitpid($childpid, &WNOHANG); if ($foo) { $status = $? >> 8; } # Unlock so user can stop linktest. $slice->UnLock(); exit($status); } # # Loop, asking each cluster for the linktest status. # my $tlimit = 3600; my $errors = 0; my %running = map { $_->aggregate_urn() => $_ } @agglist; while ($tlimit > 0 && keys(%running)) { foreach my $sliver (values(%running)) { my $response = $sliver->RunLinktest("status"); if (!defined($response)) { print STDERR "RPC Error calling linktest on $sliver\n"; next; } if ($response->code() != GENIRESPONSE_SUCCESS) { if ($response->code() == GENIRESPONSE_SERVER_UNAVAILABLE || $response->code() == GENIRESPONSE_BUSY) { next; } print STDERR "Could not get linktest status for sliver: ". $response->output() . "\n"; delete($running{$sliver->aggregate_urn()}); # If the sliver was deleted during linktest, we do not # consider it an error. if ($response->code() != GENIRESPONSE_SEARCHFAILED) { $errors++; } next; } my $blob = $response->value(); if ($blob->{'status'} eq "stopped") { delete($running{$sliver->aggregate_urn()}); } } $tlimit -= 5; sleep(5); } if ($tlimit <= 0) { print STDERR "Linktest run timed out!\n"; # Lets generate email for now, still debugging. $errors++; } $instance->SetStatus($old_status); if ($errors) { SENDMAIL($TBOPS, "Error running linktest", "Error running linktest on $instance.\n", $TBOPS, undef, $logfile); } unlink($logfile); exit(0); bad: $instance->SetStatus($old_status); $slice->UnLock(); print STDERR $errmsg . "\n"; if (defined($webtask)) { $webtask->output($errmsg); $webtask->Exited($errcode); } exit(1); } # # Update SSH keys. # sub DoUpdateKeys() { my $target_user; my $errmsg; my $errcode = 1; if (@ARGV) { my $uid = shift(@ARGV); # If a target user, we are operating on that user, not the # entire instance. $target_user = User->Lookup($uid); if (!defined($target_user)) { fatal("no such target user $uid"); } $target_user = GeniUser::LocalUser->Create($target_user); } my $slice = $instance->GetGeniSlice(); if (!defined($slice)) { fatal("No slice for instance!"); } # This returns in CM format. my $sshkeys; if ($instance->GetSSHKeys(\$sshkeys, $target_user) < 0 || !@{$sshkeys}) { fatal("Could not get ssh keys for instance"); } # # The AM API uses a different ssh key structure. # my $users = []; foreach my $user (@{$sshkeys}) { my @tmp = map { $_->{'key'} } @{$user->{'keys'}}; push(@{$users}, {"urn" => $user->{'urn'}, "keys" => [ @tmp ] }); } # # Lock the slice in case it is doing something else, like taking # a disk image. # if ($slice->Lock()) { fatal("Slice is busy, cannot lock it"); } # # Create the webtask object, but AFTER locking the slice so we do # not destroy one in use. # if (defined($webtask_id)) { $webtask = WebTask->LookupOrCreate($instance->uuid(), $webtask_id); # Convenient. $webtask->AutoStore(1); } # # And tell the backend clusters to do the update. # my $coderef = sub { my ($sliver) = @_; my $webtask = $sliver->webtask(); my $response = $sliver->UpdateKeys($users); if (!defined($response)) { print STDERR "RPC Error calling updatekeys on $sliver\n"; return -1; } if ($response->code() != GENIRESPONSE_SUCCESS) { print STDERR "Could not update keys on sliver: ". $response->output() . "\n"; $webtask->output($response->output()); $webtask->Exited($response->code()); return $response->code(); } return 0; }; my @return_codes = (); my @agglist = (); # # Cull out any aggregates with no nodes. # foreach my $agg ($instance->AggregateList()) { push(@agglist, $agg) if ($agg->physnode_count() || $agg->virtnode_count()); } if (ParRun({"maxwaittime" => 99999, "maxchildren" => scalar(@agglist)}, \@return_codes, $coderef, @agglist)) { $errmsg = "Internal error calling UpdateKeys()"; goto bad; } # # Check the exit codes. # foreach my $agg (@agglist) { my $code = shift(@return_codes); $agg->webtask()->Refresh(); if ($code) { $errmsg = "Could not update keys on some slivers"; if ($agg->webtask()->output()) { $errmsg .= ": " . $agg->webtask()->output(); $errcode = $agg->webtask()->output(); } goto bad; } } $slice->UnLock(); exit(0); bad: $slice->UnLock(); print STDERR $errmsg . "\n"; if (defined($webtask)) { $webtask->output($errmsg); $webtask->Exited($errcode); } exit($errcode); } # # Get utilization info from the the clusters. # sub DoUtilization() { my $errmsg; my $errcode = 1; # # Create the webtask object; the web interface gave us an anonymous # webtask, so we can use it before lock. # if (defined($webtask_id)) { $webtask = WebTask->Lookup($webtask_id); fatal("Could not lookup webtask object") if (!defined($webtask)); # Convenient. $webtask->AutoStore(1); } # # Get the nodeid to client id mapping # my %client_ids = (); foreach my $obj ($instance->AggregateList()) { my $manifest = GeniXML::Parse($obj->manifest()); if (! defined($manifest)) { fatal("Could not parse manifest"); } $client_ids{$obj->aggregate_urn()} = {}; my @nodes = GeniXML::FindNodes("n:node", $manifest)->get_nodelist(); foreach my $node (@nodes) { my $client_id = GeniXML::GetVirtualId($node); my $node_id = GeniXML::GetVnodeId($node); $client_ids{$obj->aggregate_urn()}->{$node_id} = $client_id; } } # # And tell the backend clusters to do the update. # my $coderef = sub { my ($sliver) = @_; my $webtask = $sliver->webtask(); my $response = $sliver->Utilization(); if (!defined($response)) { print STDERR "RPC Error calling utilization on $sliver\n"; return -1; } if ($response->code() != GENIRESPONSE_SUCCESS) { print STDERR "Could not get utilization for sliver: ". $response->output() . "\n"; $webtask->output($response->output()); $webtask->Exited($response->code()); return $response->code(); } $webtask->results($response->value()); return 0; }; my @return_codes = (); my @agglist = (); # # Cull out any aggregates with no nodes. # foreach my $agg ($instance->AggregateList()) { push(@agglist, $agg) if ($agg->physnode_count() || $agg->virtnode_count()); } if (ParRun({"maxwaittime" => 99999, "maxchildren" => scalar(@agglist)}, \@return_codes, $coderef, @agglist)) { $errmsg = "Internal error calling UpdateKeys()"; goto bad; } # # Check the exit codes. # foreach my $agg (@agglist) { my $code = shift(@return_codes); $agg->webtask()->Refresh(); if ($code) { $errmsg = "Could not get utilization from some slivers"; if ($agg->webtask()->output()) { $errmsg .= ": " . $agg->webtask()->output(); $errcode = $agg->webtask()->output(); } goto bad; } # # Annotate the result with some extra info for the web UI. # my $blob = $agg->webtask()->results(); foreach my $node_id (keys(%{ $blob->{'details'}->{'nodes'} })) { $blob->{'details'}->{'nodes'}->{$node_id}->{"client_id"} = $client_ids{$agg->aggregate_urn()}->{$node_id}; } if ($debug) { print Dumper($agg->webtask()->results()); } $agg->webtask()->results($blob); $agg->webtask()->Store(); } exit(0); bad: print STDERR $errmsg . "\n"; if (defined($webtask)) { $webtask->output($errmsg); $webtask->Exited($errcode); } exit($errcode); } # # Grab the openstack utilization file and stick it into the DB. # sub DoOpenstack() { my $errmsg; my $errcode = 1; # # Create the webtask object; the web interface gave us an anonymous # webtask, so we can use it before lock. # if (defined($webtask_id)) { $webtask = WebTask->Lookup($webtask_id); fatal("Could not lookup webtask object") if (!defined($webtask)); # Convenient. $webtask->AutoStore(1); } # # Need to look inside the rspec to find the name of the controller node. # if (scalar($instance->AggregateList()) > 1) { $errmsg = "Too many aggregates, ". "is this really an Openstack experiment?"; goto bad; } my ($aggregate) = $instance->AggregateList(); if (!defined($aggregate->manifest())) { $errmsg = "Mo manifest for experiment"; goto bad; } my $manifest = GeniXML::Parse($aggregate->manifest()); if (! defined($manifest)) { $errmsg = "Could not parse manifest for $aggregate"; goto bad; } # # We have to look inside the parameters to find the controller node. # my $NS = "http://www.protogeni.net/resources/rspec/ext/johnsond/1"; my $controller; foreach my $param (GeniXML::FindNodesNS("n:profile_parameters/n:parameter", $manifest, $NS)->get_nodelist()) { my $value = $param->textContent(); if ($value =~ /^([^=]+)="(.+)"$/) { if (lc($1) eq "controller") { $controller = $2; print "Controller = $controller\n" if ($debug); last; } } } if (!defined($controller)) { $errmsg = "Could not find the CONTROLLER parameter"; goto bad; } # # So now we can ask the aggregate to grab the file from the proper # node in the topology; we do not want the cluster to have to figure # that part out. Hmm, maybe we should tell the cluster what file too? # my $response = $aggregate->OpenstackData($controller); if (!defined($response)) { $errmsg = "RPC Error calling GetOpenstackStats on $aggregate\n"; $errcode = -1; goto bad; } if ($response->code() != GENIRESPONSE_SUCCESS) { $errmsg = "Could not get openstack json file for sliver: ". $response->output(); $errcode = $response->code(); goto bad; } if ($debug) { print $response->value() . "\n"; } $instance->Update({"openstack_utilization" => $response->value()}); exit(0); bad: print STDERR $errmsg . "\n"; if (defined($webtask)) { $webtask->output($errmsg); $webtask->Exited($errcode); } exit($errcode); } # # Get idledata info from the clusters. # sub DoIdleData() { my $errmsg; my $errcode = 1; # # Create the webtask object; the web interface gave us an anonymous # webtask, so we can use it before lock. # if (defined($webtask_id)) { $webtask = WebTask->Lookup($webtask_id); fatal("Could not lookup webtask object") if (!defined($webtask)); # Convenient. $webtask->AutoStore(1); } # # And ask the backend clusters for the data. # my $coderef = sub { my ($sliver) = @_; my $webtask = $sliver->webtask(); my $response = $sliver->IdleData(); if (!defined($response)) { print STDERR "RPC Error calling idledata on $sliver\n"; return -1; } if ($response->code() != GENIRESPONSE_SUCCESS) { print STDERR "Could not get idledata for sliver: ". $response->output() . "\n"; $webtask->output($response->output()); $webtask->Exited($response->code()); return $response->code(); } if ($debug) { print Dumper($response->value()); } $webtask->idledata($response->value()); return 0; }; my @return_codes = (); my @agglist = (); # # Cull out any aggregates with no nodes. # foreach my $agg ($instance->AggregateList()) { push(@agglist, $agg) if ($agg->physnode_count() || $agg->virtnode_count()); } if (ParRun({"maxwaittime" => 99999, "maxchildren" => scalar(@agglist)}, \@return_codes, $coderef, @agglist)) { $errmsg = "Internal error calling IdleData()"; goto bad; } # # Check the exit codes. # foreach my $agg (@agglist) { my $code = shift(@return_codes); $agg->webtask()->Refresh(); if ($code) { $errmsg = "Could not get idledata from some slivers"; if ($agg->webtask()->output()) { $errmsg .= ": " . $agg->webtask()->output(); $errcode = $agg->webtask()->output(); } goto bad; } if ($debug) { print Dumper($agg->webtask()->idledata()); } } exit(0); bad: print STDERR $errmsg . "\n"; if (defined($webtask)) { $webtask->output($errmsg); $webtask->Exited($errcode); } exit($errcode); } # # Schedule slice to terminate. This is an admin action. The lockdown bit # is cleared, and the lockout bit is set (no more free extensions). The # expiration is set, and we send email. # sub DoSchedTerminate() { my $errcode = 1; my $errmsg; my $days; my $reason; my $creator = $instance->GetGeniUser(); my $slice = $instance->GetGeniSlice(); my $name = $instance->name(); my $url = $instance->webURL(); my $clusters = join(",", map { $_->domain() } $instance->AggregateList()); my $pcount = $instance->physnode_count(); my $expires_time = str2time($slice->expires()); my $created_time = str2time($instance->created()); my $extensions = $instance->Brand()->ExtensionsEmailAddress(); usage() if (!@ARGV); $days = shift(@ARGV); if (@ARGV == 2) { my $arg = shift(@ARGV); if ($arg eq "-m") { $reason = shift(@ARGV); } else { usage(); } } elsif (@ARGV == 1) { my $filename = shift(@ARGV); if (! -e $filename) { fatal("$filename does not exist"); } open(MSG, $filename) or fatal("Could not open $filename"); $reason = ""; while () { $reason .= $_; } close(MSG); } # # Create the webtask object; the web interface gave us an anonymous # webtask, so we can use it before lock. # if (defined($webtask_id)) { $webtask = WebTask->Lookup($webtask_id); fatal("Could not lookup webtask object") if (!defined($webtask)); # Convenient. $webtask->AutoStore(1); } # # Lock the slice in case it is doing something else, like taking # a disk image. # if ($slice->Lock()) { $errcode = GENIRESPONSE_BUSY; $errmsg ="Experiment is busy, cannot lock it. Try again later."; if (defined($webtask)) { $webtask->output($errmsg); $webtask->Exited($errcode); } print STDERR "$errmsg\n"; exit($errcode); } if (defined($reason) && !TBcheck_dbslot($reason, "default", "fulltext", TBDB_CHECKDBSLOT_WARN|TBDB_CHECKDBSLOT_ERROR)) { $errmsg = "Illegal characters in your reason"; $errcode = 1; goto bad; } if (!TBcheck_dbslot($days, "default", "int", TBDB_CHECKDBSLOT_WARN|TBDB_CHECKDBSLOT_ERROR)) { $errmsg = "Illegal integer for length"; $errcode = 1; goto bad; } # No free time. $instance->Update({"extension_adminonly" => 1}); # # Need to set the new expiration before we clear the lockdown bit, # else it might get terminated at the cluster. But, if the # expiration is already beyond the desired termination point, # leave it alone, all we need to do is set our local expiration, # the daemon will take care of it. The reason we do this, is cause # it is unclear if setting the expiration backwards (at the CM) is # a legal thing to do (although our CM actually permits this). # if ($expires_time < time() + ($days * 3600 * 24)) { my $seconds = (time() + ($days * 3600 * 24)) - $expires_time; if ($errcode = ExtendInternal($slice, $seconds, 1, \$errmsg)) { goto bad; } } else { $slice->SetExpiration(time() + ($days * 3600 * 24)); } # Now we can clear this. if ($instance->admin_lockdown()) { if (DoLockdownInternal("clear", "all")) { SENDMAIL($TBOPS, "Failed to clear lock down on APT Instance", "Failed to clear lock down $instance\n". $instance->webURL() . "\n", $TBOPS); $errmsg = "Failed to clear lockdown"; $errcode = -1; goto bad; } } my $expires = POSIX::strftime("20%y-%m-%d %H:%M:%S %Z", localtime(str2time($slice->expires()))); my $created = POSIX::strftime("20%y-%m-%d %H:%M:%S %Z", localtime(str2time($instance->created()))); my $message = "The site administrator has scheduled this experiment\n". "to terminate in $days days."; my $subject = "Experiment Termination Warning: $name"; # # New extension mechanism # my $extensionargs = { "action" => "request", "wanted" => $days, "granted" => $days, "admin" => 1, "uid" => $this_user->uid(), "uid_idx" => $this_user->uid_idx(), "message" => $message, }; if (defined($reason)) { $extensionargs->{"reason"} = $reason; } if (!defined(APT_Instance::ExtensionInfo->Create($instance, $extensionargs))) { print STDERR "Could not create extension info object\n"; } $instance->Brand()->SendEmail($creator->email(), $subject, $message . "\n\n" . (defined($reason) ? $reason . "\n\n" : "") . "Your experiment was started on $created\n". "Your experiment will now expire at $expires\n". "You are using $pcount physical nodes.\n". "It is running on $clusters\n\n". "$url\n", "$extensions", "BCC: $extensions"); if (defined($webtask)) { $webtask->Exited(0); } $slice->UnLock(); exit(0); bad: print STDERR $errmsg . "\n"; if (defined($webtask)) { $webtask->output($errmsg); $webtask->Exited($errcode); } exit($errcode); } # # Write instance credentials to files. # sub WriteCredentials() { usage() if (!@ARGV); my $directory = shift(@ARGV); fatal("$directory does not exist") if (! -e $directory); fatal("$directory is not a directory") if (! -d $directory); return $instance->WriteCredentials($directory); } sub fatal($) { my ($mesg) = @_; if (defined($webtask)) { $webtask->output($mesg); $webtask->code(-1); } print STDERR "*** $0:\n". " $mesg\n"; # Exit with negative status so web interface treats it as system error. exit(-1); } sub UserError($) { my ($mesg) = @_; if (defined($webtask)) { $webtask->output($mesg); $webtask->code(1); } print STDERR "*** $0:\n". " $mesg\n"; exit(1); } sub escapeshellarg($) { my ($str) = @_; $str =~ s/[^[:alnum:]]/\\$&/g; return $str; }