#!/usr/bin/perl -w
#
# Copyright (c) 2000-2015 University of Utah and the Flux Group.
#
# {{{EMULAB-LICENSE
#
# This file is part of the Emulab network testbed software.
#
# This file is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
#
# This file is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this file.  If not, see <http://www.gnu.org/licenses/>.
#
# }}}
#
use English;
use strict;
use Getopt::Std;
use XML::Simple;
use Data::Dumper;
use CGI;
use POSIX ":sys_wait_h";
use POSIX qw(setsid close);

#
# Back-end script to manage APT instances.
#
# Print the command line synopsis and exit with a negative status.
#
sub usage()
{
    print("Usage: manage_instance snapshot instance ".
          "[-n node_id] [-i imagename] [-u node|all]\n");
    print("Usage: manage_instance consoleurl instance node\n");
    print("Usage: manage_instance extend instance [-f] seconds\n");
    print("Usage: manage_instance terminate instance\n");
    print("Usage: manage_instance refresh instance\n");
    print("Usage: manage_instance reboot instance node_id [node_id ...]\n");
    print("Usage: manage_instance reload instance node_id [node_id ...]\n");
    print("Usage: manage_instance monitor instance\n");
    print("Usage: manage_instance lockdown instance set|clear user|admin\n");
    exit(-1);
}
my $optlist = "dt:";
my $debug   = 0;
my $webtask_id;
my $webtask;

#
# Configure variables
#
my $TB      = "@prefix@";
my $TBOPS   = "@TBOPSEMAIL@";
my $QUICKVM = "$TB/sbin/protogeni/quickvm";

# Debugging
my $usemydevtree = 0;

#
# Untaint the path
#
$ENV{'PATH'} = "$TB/bin:$TB/sbin:/bin:/usr/bin:/usr/bin:/usr/sbin";
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

#
# Turn off line buffering on output
#
$| = 1;

#
# Load the Testbed support stuff.
#
use lib "@prefix@/lib";
use EmulabConstants;
use emdb;
use emutil;
use libtestbed;
use User;
use Project;
use APT_Profile;
use APT_Instance;
use APT_Geni;
use GeniXML;
use GeniHRN;
use Genixmlrpc;
use GeniResponse;
use GeniSlice;
use WebTask;
use EmulabFeatures;

# Protos
sub fatal($);
sub DoSnapshot();
sub DoConsole();
sub DoTerminate();
sub DoExtend();
sub DoRefresh();
sub DoReboot();
sub DoReload();
sub DoLockdown();
sub StartMonitor();
# These two are called before their definitions; declare them so the
# prototypes are known at the call sites.
sub DoRebootOrReload($);
sub DoLockdownInternal($$);

#
# Parse command arguments. Once we return from getopts, all that should be
# left are the required arguments.
#
my %options = ();
if (! getopts($optlist, \%options)) {
    usage();
}
if (defined($options{"t"})) {
    $webtask_id = $options{"t"};
}
if (defined($options{"d"})) {
    $debug++;
}
if (@ARGV < 2) {
    usage();
}
my $action   = shift(@ARGV);
my $uuid     = shift(@ARGV);
my $instance = APT_Instance->Lookup($uuid);
if (!defined($instance)) {
    $instance = APT_Instance->LookupBySlice($uuid);
}
if (!defined($instance)) {
    fatal("No such instance $uuid");
}

# Dispatch on the action. All of the handlers exit on their own.
if ($action eq "snapshot") {
    DoSnapshot();
}
elsif ($action eq "extend") {
    DoExtend();
}
elsif ($action eq "consoleurl") {
    DoConsole()
}
elsif ($action eq "terminate") {
    DoTerminate()
}
elsif ($action eq "refresh") {
    DoRefresh()
}
elsif ($action eq "reboot") {
    DoReboot()
}
elsif ($action eq "reload") {
    DoReload()
}
elsif ($action eq "monitor") {
    StartMonitor()
}
elsif ($action eq "lockdown") {
    DoLockdown()
}
else {
    usage();
}
exit(0);

#
# Take a snapshot. Implies a single node instance, for now.
#
# Options (parsed from what is left of @ARGV):
#   -n node_id   snapshot this node instead of the only node
#   -i name      image name (defaults to the profile name, plus ".node_id"
#                when -n is given)
#   -u node|all  update the profile to use the new image when done
#   -U           passed through to CreateImage() as the update_prepare flag
#
# Never returns; exits 0 on success, or with $errcode via the "bad" label.
#
sub DoSnapshot()
{
    my $errmsg;
    my $logfile;
    my $errcode         = 1;
    my $needunlock      = 0;
    # Set once the aggregate has accepted the CreateImage() request. The
    # error path only touches the instance status after that point.
    my $imaging_started = 0;
    # Declared up here so that an early "goto bad" sees an initialized value.
    my $sliver_ready    = 0;
    my $old_status      = $instance->status();
    my $node_id;
    my $imagename;
    my $update_profile;
    my $update_prepare  = 0;

    my $optlist = "n:i:u:U";
    my %options = ();
    if (! getopts($optlist, \%options)) {
        usage();
    }
    if (defined($options{"n"})) {
        $node_id = $options{"n"};
    }
    if (defined($options{"i"})) {
        $imagename = $options{"i"};
    }
    if (defined($options{"u"})) {
        $update_profile = $options{"u"};
        if ($update_profile !~ /^(node|all)$/) {
            usage();
        }
    }
    if (defined($options{"U"})) {
        $update_prepare = 1;
    }
    if ($old_status ne "ready") {
        fatal("Instance must be in the ready state to take a snapshot");
    }
    my $slice = $instance->GetGeniSlice();
    if (!defined($slice)) {
        fatal("No slice for quick VM: $uuid");
    }
    # The web interface (and in the future the xmlrpc interface) sets this.
    my $this_user = User->ImpliedUser();
    if (! defined($this_user)) {
        $this_user = User->ThisUser();
        if (!defined($this_user)) {
            fatal("You ($UID) do not exist!");
        }
    }

    #
    # Might be a clone (manage_profile).
    #
    my $sliver_urn;
    my $aggregate;
    my $node;
    my $profile = APT_Profile->Lookup($instance->profile_id());
    if (!defined($profile)) {
        fatal("Could not lookup profile for instance");
    }
    my $project = Project->Lookup($profile->pid_idx());
    if (!defined($project)) {
        fatal("Could not lookup project for profile");
    }
    if (!defined($imagename)) {
        $imagename = $profile->name();
        $imagename .= "." . $node_id
            if (defined($node_id));
    }

    #
    # Sanity checks.
    #
    my @aggs = @{ $instance->AggregateList() };
    if (! @aggs) {
        fatal("No slivers for instance!");
    }
    if (!defined($node_id)) {
        # We snapshot the one node in the instance.
        if (@aggs != 1) {
            fatal("Too many aggregates (> 1) to snapshot");
        }
        my ($agg)    = @aggs;
        my $manifest = GeniXML::Parse($agg->manifest());
        if (! defined($manifest)) {
            fatal("Could not parse manifest for $agg");
        }
        my @nodes = GeniXML::FindNodes("n:node", $manifest)->get_nodelist();
        if (@nodes != 1) {
            fatal("Too many nodes (> 1) to snapshot");
        }
        ($node)     = @nodes;
        $sliver_urn = GeniXML::GetSliverId($node);
        $aggregate  = $agg;
    }
    else {
        # Find the node in its manifest; stop at the first match.
      AGG:
        foreach my $agg (@aggs) {
            my $manifest = GeniXML::Parse($agg->manifest());
            if (! defined($manifest)) {
                fatal("Could not parse manifest for $agg");
            }
            foreach my $ref (GeniXML::FindNodes("n:node",
                                                $manifest)->get_nodelist()) {
                my $client_id = GeniXML::GetVirtualId($ref);
                my $urn       = GeniXML::GetSliverId($ref);

                # No sliver urn or a different aggregate.
                # NOTE(review): this compares the sliver URN against the
                # aggregate URN; confirm that is the intended test.
                next
                    if (! (defined($urn) && $urn eq $agg->aggregate_urn()));

                if ($node_id eq $client_id) {
                    $node       = $ref;
                    $sliver_urn = $urn;
                    $aggregate  = $agg;
                    last AGG;
                }
            }
        }
        if (!defined($sliver_urn)) {
            fatal("Could not find node '$node_id' in manifest");
        }
    }

    #
    # We are not going to allow this if the instance is on a different
    # cluster than where the image was originally created, since otherwise
    # the image provenance will look like spaghetti.
    #
    if (defined($update_profile)) {
        my $diskref = GeniXML::GetDiskImage($node);
        if (defined($diskref)) {
            my $authority = $aggregate->GetGeniAuthority();
            my $image_url = GeniXML::GetText("url", $diskref);
            if (defined($image_url)) {
                require URI;

                # Get the hostname for the image URL.
                my $uri = URI->new($image_url);
                if (!defined($uri)) {
                    fatal("Could not parse $image_url");
                }
                my $image_host = $uri->host();

                # Get the hostname for the authority.
                $uri = URI->new($authority->url());
                if (!defined($uri)) {
                    fatal("Could not parse authority URL");
                }
                my $authority_host = $uri->host();

                # Compare domains; strip the leading host component.
                $image_host     =~ s/^([^.]+\.)//;
                $authority_host =~ s/^([^.]+\.)//;
                if ($image_host ne $authority_host) {
                    $errmsg  = "Not allowed to take a snapshot on this cluster";
                    $errcode = 1;
                    goto bad;
                }
            }
        }
    }
    if ($slice->Lock()) {
        fatal("Slice is busy, cannot lock it");
    }
    $needunlock = 1;

    #
    # Create the webtask object, but AFTER locking the slice so we do
    # not destroy one in use.
    #
    if (defined($webtask_id)) {
        $webtask = WebTask->LookupOrCreate($instance->uuid(), $webtask_id);
        if (!defined($webtask)) {
            # Not fatal(); we hold the slice lock and must release it.
            $errmsg = "Could not lookup/create webtask for $webtask_id";
            goto bad;
        }
        # Convenient.
        $webtask->AutoStore(1);
    }

    #
    # This returns pretty fast, and then the imaging takes place in
    # the background at the aggregate.
    #
    my $response =
        $aggregate->CreateImage($sliver_urn, $imagename, $update_prepare);
    if (!defined($response)) {
        $errmsg = "Internal error creating image";
        $instance->SetStatus($old_status);
        goto bad;
    }
    if ($response->code() != GENIRESPONSE_SUCCESS) {
        $errmsg = "Could not create image: " . $response->output() . "\n";
        $instance->SetStatus($old_status);
        goto bad;
    }
    $imaging_started = 1;

    my ($image_urn, $image_url, $version_urn, $version_url) =
        @{ $response->value() };
    # Fall back to the unversioned image when no version URN came back.
    if (!defined($version_urn)) {
        $version_urn = $image_urn;
        $version_url = $image_url
    }
    if (defined($webtask)) {
        $webtask->image_urn($version_urn);
        $webtask->image_url($version_url);
    }
    else {
        print "$image_urn,$image_url\n";
    }

    #
    # Exit and leave child to poll.
    #
    if (! $debug) {
        $logfile = TBMakeLogname("snapshot");

        if (my $childpid = TBBackGround($logfile)) {
            # Parent exits normally, web interface watches.
            exit(0);
        }
        # Let parent exit;
        sleep(2);
    }
    # Bind the process id. This is important when the caller is
    # manage_profile, doing a clone.
    $webtask->SetProcessID($PID)
        if (defined($webtask));

    #
    # Poll for a reasonable amount of time.
    #
    my $seconds  = 1500;
    my $interval = 10;
    my $ready    = 0;
    my $failed   = 0;

    while ($seconds > 0) {
        sleep($interval);
        $seconds -= $interval;

        my $response = $aggregate->SliverStatus();
        if (!defined($response)) {
            $errmsg = "RPC Error calling SliverStatus";
            $failed = 1;
            last;
        }
        if ($response->code() != GENIRESPONSE_SUCCESS &&
            $response->code() != GENIRESPONSE_BUSY) {
            $errmsg = "Sliverstatus failed: ". $response->output() . "\n";
            $failed = 1;
            last;
        }
        next
            if ($response->code() == GENIRESPONSE_BUSY);

        my $blob = $response->value();
        if (defined($webtask)) {
            # Special for imaging status display
            foreach my $urn (keys(%{$blob->{'details'}})) {
                my $details = $blob->{'details'}->{$urn};
                if ($urn eq $sliver_urn) {
                    $webtask->state($details->{'state'});
                    $webtask->rawstate($details->{'rawstate'});
                }
            }
        }
        # This is the per-aggregate status, we always set this for web UI.
        my $statusblob = {};
        foreach my $urn (keys(%{$blob->{'details'}})) {
            my $details   = $blob->{'details'}->{$urn};
            my $client_id = $details->{'client_id'};
            $statusblob->{$client_id} = $details;
        }
        $aggregate->webtask()->sliverstatus($statusblob);

        if ($blob->{'status'} eq "failed") {
            $failed = 1;
            last;
        }
        elsif ($blob->{'status'} eq "ready") {
            $sliver_ready = 1;
        }

        #
        # We are watching for the image status to report ready or failed.
        #
        $response = $aggregate->ImageInfo($image_urn);
        if (!defined($response)) {
            $errmsg = "RPC Error calling ImageInfo";
            $failed = 1;
            last;
        }
        if ($response->code() != GENIRESPONSE_SUCCESS &&
            $response->code() != GENIRESPONSE_BUSY) {
            $errmsg = "Imageinfo failed: ". $response->output() . "\n";
            $failed = 1;
            last;
        }
        next
            if ($response->code() == GENIRESPONSE_BUSY);

        $blob = $response->value();
        if (defined($webtask)) {
            $webtask->image_size($blob->{'size'})
                if (exists($blob->{'size'}));
            $webtask->image_status($blob->{'status'})
                if (exists($blob->{'status'}));
        }
        if ($blob->{'status'} eq "ready") {
            $ready = 1;
            last;
        }
        elsif ($blob->{'status'} eq "failed") {
            $failed = 1;
            last;
        }
    }
    if ($failed) {
        $errmsg = "Imaging failed"
            if (!defined($errmsg));
        $errcode = 1;
        goto bad;
    }
    elsif (!$ready) {
        $errmsg  = "Imaging timed out";
        $errcode = 60;
        goto bad;
    }
    elsif (defined($update_profile)) {
        #
        # If successful, we create a new version of the profile and
        # update the rspec to reflect the new image version. Note
        # that we expect the CM is doing image versioning, so do not
        # bother to check if the image version is actually new.
        #
        my $doversions =
            EmulabFeatures->FeatureEnabled("APT_ProfileVersions",
                                           $this_user, $project);
        if ($doversions) {
            $profile = $profile->NewVersion($this_user);
            if (!defined($profile)) {
                print STDERR "Could not create new profile version\n";
                $webtask->Exited(70)
                    if (defined($webtask));
                # Do not leave the slice locked behind us.
                $slice->UnLock();
                exit(1);
            }
        }
        $profile->UpdateDiskImage($node_id, $version_url,
                                  ($update_profile eq "all" ? 1 : 0));
    }
    $instance->SetStatus("ready");
    # We garbage collect these later, so anyone waiting has a chance
    # to see the exit status
    $webtask->Exited(0)
        if (defined($webtask));
    $slice->UnLock();
    unlink($logfile)
        if (defined($logfile));
    exit(0);

  bad:
    #
    # Only touch the instance status when imaging actually got underway;
    # the earlier failure paths either never changed it or have already
    # put the old status back.
    #
    if ($imaging_started) {
        if ($sliver_ready) {
            #
            # If the sliver comes back ready in spite of the imaging
            # failure, then change the instance back to ready. User will
            # already know that the imaging failed.
            #
            $instance->SetStatus("ready");
        }
        else {
            $instance->SetStatus("imaging-failed");
        }
    }
    print STDERR "$errmsg\n";
    # There is no webtask when invoked from the command line.
    if (defined($webtask)) {
        $webtask->output($errmsg);
        $webtask->Exited($errcode);
    }
    $slice->UnLock()
        if ($needunlock);
    if (defined($logfile)) {
        SENDMAIL($TBOPS, "Snapshot failed",
                 "Error taking snapshot of $instance:\n\n".
                 "$errmsg\n",
                 $TBOPS, undef, $logfile);
        unlink($logfile);
    }
    exit($errcode);
}

#
# Ask the console URL for a node in an instance.
#
# Takes the node (client) id from @ARGV. Reports the URL (and password,
# when there is one) through the webtask if -t was given, otherwise on
# stdout. Never returns.
#
sub DoConsole()
{
    usage()
        if (!@ARGV);
    my $node_id = shift(@ARGV);

    if (defined($webtask_id)) {
        $webtask = WebTask->LookupOrCreate(undef, $webtask_id);
        if (!defined($webtask)) {
            fatal("Could not lookup/create webtask for $webtask_id");
        }
        # Convenient.
        $webtask->AutoStore(1);
    }

    #
    # Sanity check to make sure the node is really in the rspec, since
    # we need its sliver urn to ask for the console url.
    #
    my $sliver_urn;
    my $sliver;

  AGG:
    foreach my $obj (@{ $instance->AggregateList() }) {
        my $manifest = GeniXML::Parse($obj->manifest());
        if (! defined($manifest)) {
            fatal("Could not parse manifest for $obj");
        }
        my @nodes = GeniXML::FindNodes("n:node", $manifest)->get_nodelist();
        foreach my $node (@nodes) {
            my $client_id = GeniXML::GetVirtualId($node);
            # Use a distinct name here; re-declaring $sliver_urn in this
            # scope would hide the result from the code after the loop.
            my $urn       = GeniXML::GetSliverId($node);

            # No sliver urn or a different aggregate.
            # NOTE(review): this compares the sliver URN against the
            # aggregate URN; confirm that is the intended test.
            next
                if (! (defined($urn) && $urn eq $obj->aggregate_urn()));

            if ($node_id eq $client_id) {
                $sliver_urn = $urn;
                $sliver     = $obj;
                last AGG;
            }
        }
    }
    if (!defined($sliver_urn)) {
        fatal("Could not find node '$node_id' in manifest");
    }

    # Try ConsoleInfo first and fall back to ConsoleURL if that fails.
    my $response = $sliver->ConsoleInfo($sliver_urn);
    if (!defined($response)) {
        fatal("RPC Error calling ConsoleInfo");
    }
    if ($response->code() != GENIRESPONSE_SUCCESS) {
        $response = $sliver->ConsoleURL($sliver_urn);
        if (!defined($response)) {
            fatal("RPC Error calling ConsoleURL");
        }
        if ($response->code() != GENIRESPONSE_SUCCESS) {
            if ($response->value()) {
                fatal($response->output());
            }
            fatal("Server returned error: " .
                  GENIRESPONSE_STRING($response->code));
        }
    }

    # The value is either a hash (url plus optional password) or a bare URL.
    my $url;
    my $pswd;
    if (ref($response->value())) {
        $url  = $response->value()->{'url'};
        $pswd = $response->value()->{'password'}
            if (exists($response->value()->{'password'}));
    }
    else {
        $url = $response->value();
    }
    if (defined($webtask)) {
        if ($response->code()) {
            $webtask->output($response->output());
        }
        else {
            $webtask->url($url);
            $webtask->password($pswd)
                if (defined($pswd));
        }
        $webtask->Exited($response->code());
        exit($response->code());
    }
    # For command line operation too.
    if ($response->code()) {
        fatal($response->output());
    }
    print $url . "\n";
    print $pswd . "\n"
        if (defined($pswd));
    exit(0);
}

#
# Terminate
#
# Terminates the slivers at every aggregate in parallel, then deletes the
# slice and the instance. Unless debugging, the work happens in the
# background; the parent waits briefly so quick errors reach the caller.
# Never returns.
#
sub DoTerminate()
{
    my $errmsg;
    my $logfile;

    my $slice = $instance->GetGeniSlice();
    if (!defined($slice)) {
        #
        # No slice (typically) means we never got far enough to the
        # get the sliver created on the backend cluster.
        #
        goto killit;
    }

    #
    # Lock the slice in case it is doing something else, like taking
    # a disk image.
    #
    if ($slice->Lock()) {
        fatal("Slice is busy, cannot lock it");
    }
    my $old_status = $instance->status();
    $instance->SetStatus("terminating");

    #
    # Exit and let caller poll for status.
    #
    if (!$debug) {
        $logfile = TBMakeLogname("terminate");

        if (my $childpid = TBBackGround($logfile)) {
            my $status = 0;
            #
            # Wait a couple of seconds to see if there is going to be an
            # immediate error. Then return and let it continue to run. This
            # allows the web server to see quick errors. Later errors will
            # have to be emailed.
            #
            sleep(3);
            # Only a return value equal to the child pid means it has
            # exited; 0 is "still running" and -1 is an error.
            my $reaped = waitpid($childpid, WNOHANG());
            if ($reaped == $childpid) {
                $status = $? >> 8;
            }
            exit($status);
        }
    }

    # Runs once per aggregate, under ParRun.
    my $coderef = sub {
        my ($sliver) = @_;
        my $errmsg;

        my $response = $sliver->Terminate();
        if (!defined($response)) {
            $errmsg = "RPC Error calling Terminate";
            goto bad;
        }
        # SEARCHFAILED is success.
        if ($response->code() != GENIRESPONSE_SUCCESS &&
            $response->code() != GENIRESPONSE_SEARCHFAILED) {
            if ($response->code() == GENIRESPONSE_BUSY) {
                $errmsg = "Slice was busy for too long; try again later?";
                goto bad;
            }
            $errmsg = "Could not delete slice: ". $response->output();
            goto bad;
        }
        return 0;
      bad:
        print STDERR "$errmsg\n";
        return -1;
    };
    my @return_codes = ();
    my @agglist      = @{ $instance->AggregateList() };
    if (ParRun({"maxwaittime" => 99999,
                "maxchildren" => scalar(@agglist)},
               \@return_codes, $coderef, @agglist)) {
        #
        # The parent caught a signal. Leave things intact so that we can
        # kill things cleanly later.
        #
        $errmsg = "Internal error calling Terminate()";
        goto bad;
    }

    #
    # Check the exit codes.
    #
    foreach my $code (@return_codes) {
        if ($code) {
            $errmsg = "Some slivers would not terminate";
            goto bad;
        }
    }
    $slice->Delete();
    $instance->RecordHistory();
  killit:
    $instance->Delete();
    unlink($logfile)
        if (defined($logfile));
    exit(0);

  bad:
    print STDERR $errmsg . "\n";
    $instance->SetStatus($old_status);
    $slice->UnLock();
    if (defined($logfile)) {
        SENDMAIL($TBOPS, "Unable to terminate instance $uuid",
                 "Error terminating $instance:\n\n".
                 "$errmsg\n",
                 $TBOPS, undef, $logfile);
        unlink($logfile);
    }
    exit(1);
}

#
# Extend.
#
# Arguments (from @ARGV): [-f] seconds. Adds the given number of seconds
# to the slice expiration and asks every aggregate to extend. On any
# failure the local expiration is put back. -f permits extending an
# instance in the failed state. Never returns.
#
sub DoExtend()
{
    my $force = 0;

    usage()
        if (!@ARGV);
    if (@ARGV == 2) {
        my $arg = shift(@ARGV);
        if ($arg eq "-f") {
            $force = 1;
        }
        else {
            usage();
        }
    }
    my $seconds = shift(@ARGV);
    # Require at least one digit; an empty argument is not a number.
    if ($seconds !~ /^\d+$/) {
        usage();
    }
    if ($instance->status() eq "failed" && !$force) {
        fatal("Cannot extend failed instance!");
    }
    my $slice = $instance->GetGeniSlice();
    if (!defined($slice)) {
        fatal("No slice for instance!");
    }
    # The web interface (and in the future the xmlrpc interface) sets this.
    my $this_user = User->ImpliedUser();
    if (! defined($this_user)) {
        $this_user = User->ThisUser();
    }

    #
    # Lock the slice in case it is doing something else, like taking
    # a disk image.
    #
    if ($slice->Lock()) {
        fatal("Slice is busy, cannot lock it");
    }
    # Save in case of error.
    my $oldexpires = $slice->expires();

    # Need to update slice before creating new credential.
    $slice->AddToExpiration($seconds);
    my $new_expires = $slice->ExpirationGMT();

    # Runs once per aggregate, under ParRun.
    my $coderef = sub {
        my ($sliver) = @_;
        my $webtask  = $sliver->webtask();
        my $errmsg;

        my $response = $sliver->Extend($new_expires);
        if (!defined($response)) {
            $errmsg = "Internal error calling Extend";
            goto bad;
        }
        if ($response->code() != GENIRESPONSE_SUCCESS) {
            # This is something the user should see.
            if ($response->code() == GENIRESPONSE_REFUSED ||
                $response->code() == GENIRESPONSE_BUSY) {
                print STDERR $response->output() . "\n";
                # For web interface.
                $webtask->output($response->output());
                $webtask->Exited(1);
                return 1;
            }
            $errmsg = "Failed to extend slice: ". $response->output();
            goto bad;
        }
        return 0;
      bad:
        print STDERR "$errmsg\n";
        $webtask->output($errmsg);
        $webtask->Exited(-1);
        return -1;
    };
    my @return_codes = ();
    my @agglist      = @{ $instance->AggregateList() };
    if (ParRun({"maxwaittime" => 99999,
                "maxchildren" => scalar(@agglist)},
               \@return_codes, $coderef, @agglist)) {
        #
        # The parent caught a signal. Leave things intact so that we can
        # kill things cleanly later.
        #
        print STDERR "Internal error calling Extend\n";
        goto bad;
    }

    #
    # Check the exit codes.
    #
    foreach my $code (@return_codes) {
        if ($code) {
            print STDERR "Some slivers could not be extended\n";
            goto bad;
        }
    }

    # Lockdown: an admin extending by more than ten days sets the admin
    # lockdown as well.
    if (defined($this_user) && $this_user->IsAdmin() &&
        ($seconds / (24 * 60 * 60)) > 10) {
        if (DoLockdownInternal("set", "admin")) {
            SENDMAIL($TBOPS, "Failed to lock down APT Instance",
                     "Failed to lock down $instance\n".
                     $instance->webURL() . "\n", $TBOPS);
        }
    }
    $slice->UnLock();
    exit(0);
  bad:
    # Reset back to original expiration, sorry.
    $slice->SetExpiration($oldexpires);
    $slice->UnLock();
    exit(-1);
}

#
# Refresh; ask the aggregate for status and set the instance status
# accordingly.
#
# When there is no slice at all, the instance is recorded in the history
# and deleted. Never returns.
#
sub DoRefresh()
{
    my $errmsg;
    # Only release the slice lock if we are the one who took it.
    my $needunlock = 0;

    my $slice = $instance->GetGeniSlice();
    if (!defined($slice)) {
        print STDERR "No slice for instance\n";
        goto killit;
    }

    #
    # Lock the slice in case it is doing something else, like taking
    # a disk image.
    #
    if ($slice->Lock()) {
        $errmsg = "Experiment is busy, cannot lock it. Please try again later";
        goto bad;
    }
    $needunlock = 1;

    #
    # Create the webtask object, but AFTER locking the slice so we do
    # not destroy one in use.
    #
    if (defined($webtask_id)) {
        $webtask = WebTask->LookupOrCreate($instance->uuid(), $webtask_id);
        if (!defined($webtask)) {
            $errmsg = "Could not lookup/create webtask for $webtask_id";
            goto bad;
        }
        # Convenient.
        $webtask->AutoStore(1);
    }

    # Runs once per aggregate, under ParRun.
    my $coderef = sub {
        my ($sliver) = @_;
        my $webtask  = $sliver->webtask();
        my $errmsg;

        my $response = $sliver->SliceStatus();
        if (!defined($response)) {
            $errmsg = "RPC Error calling SliceStatus";
            goto bad;
        }
        if ($response->code() != GENIRESPONSE_SUCCESS) {
            if ($response->code() == GENIRESPONSE_SEARCHFAILED) {
                $errmsg = "Slice is gone";
                goto bad;
            }
            if ($response->code() == GENIRESPONSE_BUSY) {
                $errmsg = "Slice is busy; try again later";
                goto bad;
            }
            $errmsg = "Could not get status: ". $response->output();
            goto bad;
        }
        my $blob = $response->value();
        if ($blob->{'status'} eq "ready") {
            $sliver->SetStatus("ready");
        }
        elsif ($blob->{'status'} eq "failed") {
            $sliver->SetStatus("failed");
        }

        #
        # Convert to something smaller, with info the web interface
        # cares about.
        #
        my $statusblob = {};
        foreach my $urn (keys(%{$blob->{'details'}})) {
            my $details = $blob->{'details'}->{$urn};
            my $node_id = $details->{'client_id'};
            $statusblob->{$node_id} = $details;
        }
        $webtask->sliverstatus($statusblob);
        if ($debug) {
            print STDERR Dumper($statusblob);
        }
        return 0;
      bad:
        print STDERR "$errmsg\n";
        $webtask->output($errmsg);
        $webtask->Exited(1);
        return 1;
    };
    my @return_codes = ();
    my @agglist      = @{ $instance->AggregateList() };
    if (ParRun({"maxwaittime" => 99999,
                "maxchildren" => scalar(@agglist)},
               \@return_codes, $coderef, @agglist)) {
        #
        # The parent caught a signal. Leave things intact so that we can
        # kill things cleanly later.
        #
        $errmsg = "Internal error calling Refresh";
        goto bad;
    }

    #
    # Check the exit codes.
    #
    foreach my $code (@return_codes) {
        if ($code) {
            $errmsg = "Some slivers could not be refreshed";
            goto bad;
        }
    }
    $slice->UnLock();
    exit(0);
  killit:
    $instance->RecordHistory();
    $instance->Delete();
    exit(0);
  bad:
    # If Lock() failed, somebody else holds the lock; leave it alone.
    $slice->UnLock()
        if ($needunlock);
    print STDERR $errmsg . "\n";
    if (defined($webtask)) {
        $webtask->output($errmsg);
        $webtask->Exited(1);
    }
    exit(1);
}

#
# Reboot or Reload nodes.
#
# $which is the action name handed to SliverAction() ("reboot" or
# "reload"). The node (client) ids come from @ARGV. After the action has
# been issued the monitor is started. Never returns.
#
sub DoRebootOrReload($)
{
    my ($which) = @_;
    my $errmsg;
    # Only release the slice lock if we are the one who took it.
    my $needunlock = 0;

    usage()
        if (!@ARGV);

    my $slice = $instance->GetGeniSlice();
    if (!defined($slice)) {
        print STDERR "No slice for instance\n";
        goto killit;
    }

    # Map each aggregate URN to the sliver URNs of the requested nodes,
    # and remember which aggregates are involved.
    my %sliver_urns = ();
    my @slivers     = ();

    foreach my $obj (@{ $instance->AggregateList() }) {
        my $manifest = GeniXML::Parse($obj->manifest());
        if (! defined($manifest)) {
            fatal("Could not parse manifest");
        }
        my @nodes = GeniXML::FindNodes("n:node", $manifest)->get_nodelist();
        foreach my $node (@nodes) {
            my $client_id = GeniXML::GetVirtualId($node);
            next
                if (! grep {$_ eq $client_id} @ARGV);

            my $sliver_urn = GeniXML::GetSliverId($node);
            # No sliver urn, means a different aggregate.
            next
                if (!defined($sliver_urn));

            if (!exists($sliver_urns{$obj->aggregate_urn()})) {
                $sliver_urns{$obj->aggregate_urn()} = [];
                push(@slivers, $obj);
            }
            push(@{ $sliver_urns{$obj->aggregate_urn()} }, $sliver_urn);
        }
    }

    #
    # Lock the slice in case it is doing something else, like taking
    # a disk image.
    #
    if ($slice->Lock()) {
        $errmsg = "Experiment is busy, cannot lock it. Please try again later";
        goto bad;
    }
    $needunlock = 1;

    #
    # Create the webtask object, but AFTER locking the slice so we do
    # not destroy one in use.
    #
    if (defined($webtask_id)) {
        $webtask = WebTask->LookupOrCreate($instance->uuid(), $webtask_id);
        if (!defined($webtask)) {
            $errmsg = "Could not lookup/create webtask for $webtask_id";
            goto bad;
        }
        # Convenient.
        $webtask->AutoStore(1);
    }

    # Runs once per involved aggregate, under ParRun.
    my $coderef = sub {
        my ($sliver) = @_;
        my $webtask  = $sliver->webtask();
        my @urns     = @{ $sliver_urns{$sliver->aggregate_urn()} };
        my $errmsg;

        # Clear this so that web interface will not update it.
        $webtask->sliverstatus({});

        my $response = $sliver->SliverAction($which, @urns);
        if (!defined($response)) {
            $errmsg = "RPC Error calling SliverAction";
            goto bad;
        }
        if ($response->code() != GENIRESPONSE_SUCCESS) {
            if ($response->code() == GENIRESPONSE_SEARCHFAILED) {
                $errmsg = "Slice is gone";
                goto bad;
            }
            if ($response->code() == GENIRESPONSE_BUSY) {
                $errmsg = "Experiment is busy; try again later";
                goto bad;
            }
            $errmsg = $response->output();
            goto bad;
        }
        return 0;
      bad:
        print STDERR "$errmsg\n";
        $webtask->output($errmsg);
        $webtask->Exited(1);
        return 1;
    };
    my @return_codes = ();
    if (ParRun({"maxwaittime" => 99999,
                "maxchildren" => scalar(@slivers)},
               \@return_codes, $coderef, @slivers)) {
        #
        # The parent caught a signal. Leave things intact so that we can
        # kill things cleanly later.
        #
        $errmsg = "Internal error calling SliverAction";
        goto bad;
    }

    #
    # Check the exit codes.
    #
    foreach my $code (@return_codes) {
        if ($code) {
            $errmsg = "Some slivers could not be ${which}'ed";
            goto bad;
        }
    }
    $slice->UnLock();
    if (defined($webtask)) {
        $webtask->Exited(0);
    }
    #
    # Start the monitor so the web interface will see when the node
    # has actually come back up.
    #
    # XXX This will not return unless a monitor is already running.
    StartMonitor();
    exit(0);
  killit:
    $instance->RecordHistory();
    $instance->Delete();
    exit(0);
  bad:
    # If Lock() failed, somebody else holds the lock; leave it alone.
    $slice->UnLock()
        if ($needunlock);
    print STDERR $errmsg . "\n";
    if (defined($webtask)) {
        $webtask->output($errmsg);
        $webtask->Exited(1);
    }
    exit(1);
}

# Reboot the nodes named on the command line.
sub DoReboot()
{
    return DoRebootOrReload("reboot");
}

# Reload the nodes named on the command line.
sub DoReload()
{
    return DoRebootOrReload("reload");
}

#
# Start up the monitor for an instance. Only one though.
#
# Polls SliceStatus at every aggregate until the slice reports ready, an
# aggregate reports an error, or the time budget runs out, storing the
# per-node status in each aggregate's webtask along the way. Never returns.
#
sub StartMonitor()
{
    my $logfile;
    my $signaled = 0;

    my $slice = $instance->GetGeniSlice();
    if (!defined($slice)) {
        fatal("No slice for instance");
    }
    if ($instance->monitor_pid()) {
        my $pid = $instance->monitor_pid();
        # Signal 0 only tests whether the process is still there.
        if (kill(0, $pid)) {
            print STDERR "Monitor already running ($pid). ".
                "Kill it before starting a new one.\n";
            exit(0);
        }
        $instance->Update({"monitor_pid" => 0});
    }
    if (!$debug) {
        $logfile = TBMakeLogname("aptmonitor");

        if (TBBackGround($logfile)) {
            exit(0);
        }
    }
    # Record our actual process id (interpolated, not the literal text)
    # so that the check above works for the next invocation.
    $instance->Update({"monitor_pid" => $PID});

    my $seconds  = 1500;
    my $interval = 15;

    # Shorten default timeout now.
    Genixmlrpc->SetTimeout(30);

    #
    # Runs once per aggregate, under ParRun. Returns zero when the
    # aggregate reports ready, a negative value on error, and a positive
    # value when it is not done yet.
    #
    my $coderef = sub {
        my ($sliver) = @_;
        my $webtask  = $sliver->webtask();

        my $response = $sliver->SliceStatus();
        if (!defined($response)) {
            print STDERR "RPC Error calling SliceStatus\n";
            return GENIRESPONSE_RPCERROR;
        }
        if (($response->code() != GENIRESPONSE_SUCCESS &&
             $response->code() != GENIRESPONSE_BUSY)) {
            print STDERR "SliverStatus failed";
            print STDERR ": " . $response->output() . "\n";
            if (defined($webtask)) {
                if ($response->output() =~ /read timeout/) {
                    $webtask->output("Lost contact with the aggregate. " .
                                     "Possibly a network failure, ".
                                     "please try again later.");
                }
                else {
                    $webtask->output($response->output());
                }
            }
            return -1;
        }
        if ($response->code() == GENIRESPONSE_BUSY) {
            # Indicate not done.
            return GENIRESPONSE_BUSY;
        }
        my $blob = $response->value();

        #
        # Convert to something smaller, with info the web interface
        # cares about.
        #
        my $statusblob = {};
        foreach my $urn (keys(%{$blob->{'details'}})) {
            my $details = $blob->{'details'}->{$urn};
            my $node_id = $details->{'client_id'};
            $statusblob->{$node_id} = $details;
        }
        if ($debug) {
            print STDERR Dumper($statusblob);
        }
        $webtask->sliverstatus($statusblob);

        #
        # We poll until the status goes ready. Might not be a good idea.
        #
        if ($blob->{'status'} eq "ready") {
            return 0;
        }
        # Not done yet.
        return 1;
    };

    # This is so we can catch when ParRun gets signaled, but not
    # exit till it exits.
    my $handler = sub {
        $signaled = 1;
    };

    while ($seconds > 0) {
        $seconds -= $interval;

        #
        # Lock the slice in case it is doing something else, like taking
        # a disk image. Just skip this turn, but wait out the interval so
        # that a busy slice does not use up the whole time budget at once.
        #
        if ($slice->Lock()) {
            sleep($interval);
            next;
        }

        my @return_codes = ();
        my @agglist      = @{ $instance->AggregateList() };
        my $parrun_failed;
        {
            # The handlers are in place only while ParRun is running;
            # leaving this block restores the previous dispositions.
            local $SIG{TERM} = $handler;
            local $SIG{INT}  = $handler
                if ($debug);

            $parrun_failed =
                ParRun({"maxwaittime" => 99999,
                        "maxchildren" => scalar(@agglist)},
                       \@return_codes, $coderef, @agglist);
        }
        $slice->UnLock();

        if ($parrun_failed) {
            print STDERR "Internal error calling Status()\n";
            last;
        }
        # Asked to stop while the status calls were outstanding.
        last
            if ($signaled);

        #
        # Check the exit codes. A negative code means an error and ends
        # the monitor; any other non-zero code means "not done yet".
        #
        my $done = 1;
        foreach my $code (@return_codes) {
            if ($code) {
                last
                    if ($code < 0);
                $done = 0;
            }
        }
        last
            if ($done);

        sleep($interval);
    }
    # No monitor is running anymore.
    $instance->Update({"monitor_pid" => 0});
    unlink($logfile)
        if (defined($logfile));
    exit(0);
}

#
# Experiment lockdown.
#
# Set or clear ($setclr is "set" or "clear") the "user", "admin" or "all"
# lockdown ($which) on the instance, then bring the local slice and the
# backend clusters in line. The caller is expected to hold the slice lock.
#
# Returns 0 on success and -1 on failure.
#
sub DoLockdownInternal($$)
{
    my ($setclr, $which) = @_;
    my $clearflag = ($setclr eq "clear" ? 1 : 0);

    my $slice = $instance->GetGeniSlice();
    if (!defined($slice)) {
        fatal("No slice for instance");
    }
    # "all" means the user lockdown first, then the admin lockdown below.
    if ($which eq "all") {
        if ($instance->SetLockdown("user", $clearflag)) {
            print STDERR "Could not update instance lockdown\n";
            return -1
        }
        $which = "admin"
    }
    if ($instance->SetLockdown($which, $clearflag)) {
        print STDERR "Could not update instance lockdown\n";
        return -1
    }
    # The slice stays locked down while either lockdown is still set.
    my $clear = ($instance->admin_lockdown() ||
                 $instance->user_lockdown() ? 0 : 1);

    #
    # Have to set/clear the lockdown on the local slice.
    #
    if ($slice->SetLockdown($clear)) {
        print STDERR "Could not update slice lockdown\n";
        return -1
    }

    #
    # And tell the backend clusters to lockdown the slice.
    #
    my $coderef = sub {
        my ($sliver) = @_;

        my $response = $sliver->Lockdown($clear);
        if (!defined($response)) {
            print STDERR "RPC Error calling Lockdown\n";
            return -1;
        }
        if ($response->code() != GENIRESPONSE_SUCCESS) {
            print STDERR "Could not lockdown sliver: ".
                $response->output() . "\n";
            return -1;
        }
        return 0;
    };
    my @return_codes = ();
    my @agglist      = @{ $instance->AggregateList() };
    if (ParRun({"maxwaittime" => 99999,
                "maxchildren" => scalar(@agglist)},
               \@return_codes, $coderef, @agglist)) {
        print STDERR "Internal error calling Lockdown()\n";
        return -1;
    }

    #
    # Check the exit codes.
    #
    foreach my $code (@return_codes) {
        if ($code) {
            print STDERR "Some slivers could not be locked down.\n";
            return -1;
        }
    }
    # Return rather than exit, so that the caller gets to release the
    # slice lock it is holding.
    return 0;
}

#
# Command line entry point for lockdown: set|clear user|admin|all.
# Takes the slice lock around DoLockdownInternal(). Never returns.
#
sub DoLockdown()
{
    usage()
        if (@ARGV != 2);

    my $setclr = shift(@ARGV);
    my $which  = shift(@ARGV);

    fatal("Must specify one of 'admin', 'user' or 'all'")
        if ($which !~ /^(admin|user|all)$/);
    fatal("Must specify either 'set' or 'clear'")
        if ($setclr !~ /^(set|clear)$/);

    my $slice = $instance->GetGeniSlice();
    if (!defined($slice)) {
        fatal("No slice for instance");
    }
    if ($slice->Lock()) {
        fatal("Experiment is busy, cannot lock it. Please try again later");
    }
    if (DoLockdownInternal($setclr, $which)) {
        $slice->UnLock();
        fatal("Could not lockdown instance!");
    }
    $slice->UnLock();
    exit(0);
}

#
# Report a fatal error to the webtask (when there is one) and on STDERR,
# then exit.
#
sub fatal($)
{
    my ($mesg) = @_;

    if (defined($webtask)) {
        $webtask->output($mesg);
        $webtask->code(-1);
    }
    print STDERR "*** $0:\n".
                 "    $mesg\n";
    # Exit with negative status so web interface treats it as system error.
    exit(-1);
}

#
# Backslash-escape every character of $str that is not alphanumeric and
# return the result.
#
sub escapeshellarg($)
{
    my ($str) = @_;

    $str =~ s/[^[:alnum:]]/\\$&/g;
    return $str;
}