Commit bd129835 authored by Leigh Stoller

Beta version of delete node.

parent c7e7ae60
......@@ -75,7 +75,7 @@ my %instances = ();
my $debug = 0;
# Debugging
my $usemydevtree = 0;
my $usemydevtree = 1;
sub devurl($)
{
my ($cmurl) = @_;
......@@ -958,6 +958,7 @@ sub UpdateImageStatus($$)
package APT_Instance::Aggregate;
use emdb;
use WebTask;
use libtestbed;
use Carp;
use POSIX qw(tmpnam);
use English;
......@@ -2145,5 +2146,222 @@ sub UpdateKeys($$)
return $response;
}
#
# Delete some nodes from the sliver at this aggregate.
#
# Arguments:
#   $self    - the aggregate object.
#   $perrmsg - reference to a scalar; receives the response output whenever
#              the RPC comes back with a non-success code.
#   @nodes   - node identifiers, passed through as the "nodes" argument of
#              the DeleteNodes RPC.
#
# Returns the RPC response object (which may carry an error code), or
# undef when the user/authority/slice/context/credential needed for the
# call is missing or the RPC produced no response at all.
#
sub DeleteNodes($$@)
{
    my ($self, $perrmsg, @nodes) = @_;
    my $authority = $self->GetGeniAuthority();
    my $geniuser  = $self->instance()->GetGeniUser();
    my $urn       = $self->aggregate_urn();
    my $slice     = $self->instance()->GetGeniSlice();
    my $context   = APT_Geni::GeniContext();

    return undef
        if (! (defined($geniuser) && defined($authority) &&
               defined($slice) && defined($context)));

    my ($slice_credential, $speaksfor_credential) =
        APT_Geni::GenCredentials($slice, $geniuser, undef, 1);
    return undef
        if (!defined($slice_credential));

    my $credentials = [$slice_credential->asString()];
    if (defined($speaksfor_credential)) {
        $credentials = [@$credentials, $speaksfor_credential->asString()];
    }
    my $args = {
        "slice_urn"   => $slice->urn(),
        "credentials" => $credentials,
        "nodes"       => \@nodes,
    };
    my $cmurl = $authority->url();
    # devurl() is defined ahead of the "package APT_Instance::Aggregate"
    # statement, so call it fully qualified (as WaitForSliver does).
    $cmurl = APT_Instance::devurl($cmurl) if ($usemydevtree);

    my $response;
    my $tries = 5;

    while ($tries > 0) {
        $tries--;
        $response = Genixmlrpc::CallMethod($cmurl, $context,
                                           "DeleteNodes", $args);
        last
            if (defined($response) &&
                $response->code() == GENIRESPONSE_SUCCESS);

        my $busy = (defined($response) &&
                    ($response->code() == GENIRESPONSE_SERVER_UNAVAILABLE ||
                     $response->code() == GENIRESPONSE_BUSY));

        # Only wait when another attempt will actually follow.
        if ($busy && $tries > 0) {
            print STDERR "Server for $urn reports too busy or slice busy, ".
                "waiting a while ...\n";
            sleep(int(rand(20)) + 10);
            next;
        }
        # Hard error, or still busy after the final attempt: hand the
        # error text back to the caller along with the response.
        $$perrmsg = $response->output()
            if (defined($response) && ref($perrmsg));
        return $response;
    }
    return $response;
}
#
# Poll the aggregate until its sliver is ready, has failed, the instance
# is canceled, or we run out of time.
#
# SliceStatus() is called every $interval seconds. The time budget starts
# at 900 seconds and is topped up whenever a node shows up for the first
# time or changes its rawstate/status, so we keep waiting as long as
# progress is being made.
#
# Side effects visible here: the webtask output/exit status is updated,
# the aggregate status is set to "ready" or "failed", and the public URL
# is recorded when the status blob carries one.
#
# Returns 0 when the sliver is ready or the instance was canceled,
# otherwise the (non-zero) exit code stored in the webtask.
#
sub WaitForSliver($)
{
    my ($self) = @_;
    my $aggobj = $self;

    $aggobj->Refresh();
    my $webtask   = $aggobj->webtask();
    my $authority = $aggobj->GetGeniAuthority();
    my $cmurl     = $authority->url();
    my $urn       = $authority->urn();
    $webtask->Refresh();
    $webtask->output("");
    $webtask->exitcode(0);

    # Debugging
    # NOTE(review): $cmurl is not read again in this function.
    $cmurl = APT_Instance::devurl($cmurl);

    my $seconds  = 900;
    my $interval = 15;
    my $ready    = 0;
    my $failed   = 0;
    my $rpcfail  = 0;
    my $public_url;
    my $repblob;
    my $laststatus;

    while ($seconds > 0) {
        sleep($interval);
        $seconds -= $interval;

        my $response = $aggobj->SliceStatus();

        # No response, no value, or an error code we do not retry on.
        if (!defined($response) || !defined($response->value()) ||
            ($response->code() != GENIRESPONSE_SUCCESS &&
             $response->code() != GENIRESPONSE_SERVER_UNAVAILABLE &&
             $response->code() != GENIRESPONSE_BUSY &&
             $response->code() != GENIRESPONSE_RPCERROR)) {
            print STDERR "SliverStatus failed";
            if (defined($response)) {
                print STDERR ": " . $response->output();
                print STDERR Dumper($response);
                if ($response->output() =~ /read timeout/) {
                    $webtask->output("Lost contact with the aggregate. " .
                                     "Possibly a network failure, ".
                                     "please try again later.");
                }
                else {
                    $webtask->output($response->output());
                }
            }
            print STDERR "\n";
            $failed = 1;
            last;
        }

        # RPC errors are tolerated until too many arrive back to back.
        if ($response->code() == GENIRESPONSE_RPCERROR) {
            if ($rpcfail > 10) {
                if ($response->output() =~ /read timeout/) {
                    $webtask->output("Lost contact with the aggregate. " .
                                     "Possibly a network failure, ".
                                     "please try again later.");
                }
                else {
                    $webtask->output($response->output());
                }
                $failed = 1;
                last;
            }
            $rpcfail++;
            next;
        }
        $rpcfail = 0;

        # Busy or unavailable; just try again on the next interval.
        next
            if ($response->code() == GENIRESPONSE_BUSY ||
                $response->code() == GENIRESPONSE_SERVER_UNAVAILABLE);

        $repblob = $response->value();

        #
        # Convert to something smaller, with info the web interface
        # cares about. We get this on each loop, update so the web
        # interface can show changes.
        #
        my $statusblob = $aggobj->UpdateWebStatus($repblob->{'details'});

        foreach my $sliver_urn (keys(%{$repblob->{'details'}})) {
            my $details = $repblob->{'details'}->{$sliver_urn};
            my $node_id = $details->{'client_id'};

            #
            # Look at the last blob. If we changed, view that as progress.
            #
            # The idea is that as long as progress is being made we keep
            # waiting, by resetting the waittime if it gets below a
            # threshold. That way, if we go too long with nothing happening,
            # we will stop. The user can always use the Refresh button on
            # the status page.
            #
            next
                if (!defined($laststatus));

            if (!exists($laststatus->{$node_id})) {
                $seconds = 600
                    if ($seconds < 600);
            }
            elsif (exists($details->{"rawstate"}) &&
                   $laststatus->{$node_id}->{"rawstate"} ne
                   $details->{"rawstate"}) {
                # This is IG specific.
                $seconds = 300
                    if ($seconds < 300);
            }
            elsif ($laststatus->{$node_id}->{"status"} ne
                   $details->{"status"}) {
                $seconds = 450
                    if ($seconds < 450);
            }
        }
        $laststatus = $statusblob;

        if (exists($repblob->{'public_url'})) {
            $public_url = $repblob->{'public_url'};
            $aggobj->SetPublicURL($public_url);
        }
        if ($repblob->{'status'} eq "ready") {
            $ready = 1;
            last;
        }
        elsif ($repblob->{'status'} eq "failed") {
            $failed = 1;
            print STDERR "*** $urn failed\n";
            $webtask->output("Experiment setup on $urn failed");
            last;
        }
        elsif ($aggobj->instance()->IsCanceled()) {
            last;
        }
    }
    if ($aggobj->instance()->IsCanceled()) {
        $webtask->Exited(0);
        return 0;
    }
    if ($failed || !$ready) {
        $aggobj->SetStatus("failed");
        if ($failed) {
            # A real failure; keep the error text recorded above instead
            # of replacing it with the timeout message.
            $webtask->Exited(1);
        }
        else {
            # XXX Need better handling for timeout.
            print STDERR "*** $urn timed out.\n";
            $webtask->output("Experiment setup on $urn timed out");
            $webtask->Exited(GENIRESPONSE_TIMEDOUT);
        }
        return $webtask->exitcode();
    }
    $aggobj->SetStatus("ready");
    $webtask->Exited(0);
    return 0;
}
# _Always_ make sure that this 1 is at the end of the file...
1;
......@@ -849,169 +849,13 @@ $instance->ComputeNodeCounts();
#
Genixmlrpc->SetTimeout(60);
#
# Poll one aggregate until its sliver is ready, has failed, the instance
# is canceled, or we run out of time.
#
# SliceStatus() is called every $interval seconds. The time budget starts
# at 900 seconds and is topped up whenever a node shows up for the first
# time or changes its rawstate/status.
#
# Takes the aggregate object as its only argument and relies on the
# file-level $instance for the cancel check. Returns 0 when the sliver is
# ready or the instance was canceled, otherwise the (non-zero) exit code
# stored in the aggregate's webtask.
#
sub WaitForSliver($)
{
    my ($ref) = @_;
    my $aggobj = $ref;

    $aggobj->Refresh();
    my $webtask   = $aggobj->webtask();
    my $authority = $aggobj->_authority();
    my $cmurl     = $authority->url();
    my $urn       = $authority->urn();
    $webtask->Refresh();

    # Debugging
    # NOTE(review): $cmurl is not read again in this function.
    $cmurl = APT_Instance::devurl($cmurl);

    my $seconds  = 900;
    my $interval = 15;
    my $ready    = 0;
    my $failed   = 0;
    my $rpcfail  = 0;
    my $public_url;
    my $repblob;
    my $laststatus;

    while ($seconds > 0) {
        sleep($interval);
        $seconds -= $interval;

        my $response = $aggobj->SliceStatus();

        # No response, no value, or an error code we do not retry on.
        if (!defined($response) || !defined($response->value()) ||
            ($response->code() != GENIRESPONSE_SUCCESS &&
             $response->code() != GENIRESPONSE_SERVER_UNAVAILABLE &&
             $response->code() != GENIRESPONSE_BUSY &&
             $response->code() != GENIRESPONSE_RPCERROR)) {
            print STDERR "SliverStatus failed";
            if (defined($response)) {
                print STDERR ": " . $response->output();
                print STDERR Dumper($response);
                if ($response->output() =~ /read timeout/) {
                    $webtask->output("Lost contact with the aggregate. " .
                                     "Possibly a network failure, ".
                                     "please try again later.");
                }
                else {
                    $webtask->output($response->output());
                }
            }
            print STDERR "\n";
            $failed = 1;
            last;
        }

        # RPC errors are tolerated until too many arrive back to back.
        if ($response->code() == GENIRESPONSE_RPCERROR) {
            if ($rpcfail > 10) {
                if ($response->output() =~ /read timeout/) {
                    $webtask->output("Lost contact with the aggregate. " .
                                     "Possibly a network failure, ".
                                     "please try again later.");
                }
                else {
                    $webtask->output($response->output());
                }
                $failed = 1;
                last;
            }
            $rpcfail++;
            next;
        }
        $rpcfail = 0;

        # Busy or unavailable; just try again on the next interval.
        next
            if ($response->code() == GENIRESPONSE_BUSY ||
                $response->code() == GENIRESPONSE_SERVER_UNAVAILABLE);

        $repblob = $response->value();

        #
        # Convert to something smaller, with info the web interface
        # cares about. We get this on each loop, update so the web
        # interface can show changes.
        #
        my $statusblob = $aggobj->UpdateWebStatus($repblob->{'details'});

        foreach my $sliver_urn (keys(%{$repblob->{'details'}})) {
            my $details = $repblob->{'details'}->{$sliver_urn};
            my $node_id = $details->{'client_id'};

            #
            # Look at the last blob. If we changed, view that as progress.
            #
            # The idea is that as long as progress is being made we keep
            # waiting, by resetting the waittime if it gets below a
            # threshold. That way, if we go too long with nothing happening,
            # we will stop. The user can always use the Refresh button on
            # the status page.
            #
            next
                if (!defined($laststatus));

            if (!exists($laststatus->{$node_id})) {
                $seconds = 600
                    if ($seconds < 600);
            }
            elsif (exists($details->{"rawstate"}) &&
                   $laststatus->{$node_id}->{"rawstate"} ne
                   $details->{"rawstate"}) {
                # This is IG specific.
                $seconds = 300
                    if ($seconds < 300);
            }
            elsif ($laststatus->{$node_id}->{"status"} ne
                   $details->{"status"}) {
                $seconds = 450
                    if ($seconds < 450);
            }
        }
        $laststatus = $statusblob;

        if (exists($repblob->{'public_url'})) {
            $public_url = $repblob->{'public_url'};
            $aggobj->SetPublicURL($public_url);
        }
        if ($repblob->{'status'} eq "ready") {
            $ready = 1;
            last;
        }
        elsif ($repblob->{'status'} eq "failed") {
            $failed = 1;
            print STDERR "*** $urn failed\n";
            $webtask->output("Experiment setup on $urn failed");
            last;
        }
        elsif ($instance->IsCanceled()) {
            last;
        }
    }
    if ($instance->IsCanceled()) {
        $webtask->Exited(0);
        return 0;
    }
    if ($failed || !$ready) {
        $aggobj->SetStatus("failed");
        if ($failed) {
            # A real failure; keep the error text recorded above instead
            # of replacing it with the timeout message.
            $webtask->Exited(1);
        }
        else {
            # XXX Need better handling for timeout.
            print STDERR "*** $urn timed out.\n";
            $webtask->output("Experiment setup on $urn timed out");
            $webtask->Exited(GENIRESPONSE_TIMEDOUT);
        }
        return $webtask->exitcode();
    }
    $aggobj->SetStatus("ready");
    $webtask->Exited(0);
    return 0;
}
#
# Okay, fire off the waits for each aggregate
#
my @return_codes = ();
if (ParRun({"maxwaittime" => 99999, "maxchildren" => scalar(@aggregate_list)},
\@return_codes, \&WaitForSliver, @aggregate_list)) {
\@return_codes,
\&APT_Instance::Aggregate::WaitForSliver, @aggregate_list)) {
#
# The parent caught a signal. Leave things intact so that we can
# kill things cleanly later.
......
......@@ -41,8 +41,9 @@ sub usage()
print("Usage: manage_instance extend instance [-f] seconds\n");
print("Usage: manage_instance terminate instance\n");
print("Usage: manage_instance refresh instance\n");
print("Usage: manage_instance reboot instance node_id [node_id ...]\n");
print("Usage: manage_instance reload instance node_id [node_id ...]\n");
print("Usage: manage_instance reboot instance node_id ...\n");
print("Usage: manage_instance reload instance node_id ...\n");
print("Usage: manage_instance deletenodes instance node_id ...\n");
print("Usage: manage_instance monitor instance\n");
print("Usage: manage_instance lockdown instance set|clear user|admin\n");
print("Usage: manage_instance panic instance set|clear\n");
......@@ -116,9 +117,10 @@ sub DoPanic();
sub DoManifests();
sub DoLinktest();
sub DoUpdateKeys();
sub DoDeleteNodes();
sub WriteCredentials();
sub StartMonitor();
sub StartMonitorInternal(;$);
sub StartMonitorInternal(;$@);
sub DoImageTrackerStuff($$$$$$);
#
......@@ -203,6 +205,9 @@ elsif ($action eq "writecreds") {
elsif ($action eq "getmanifests") {
DoManifests()
}
elsif ($action eq "deletenodes") {
DoDeleteNodes()
}
else {
usage();
}
......@@ -1625,6 +1630,216 @@ sub DoManifests()
exit(1);
}
#
# Delete nodes.
#
# The node ids to delete are taken from @ARGV. Each aggregate's manifest
# is searched for nodes whose client id was requested and that belong to
# that aggregate; the DeleteNodes RPC is then run in parallel on every
# aggregate that has at least one such node, followed by a parallel
# WaitForSliver. The slice is locked for the duration.
#
# Does not return: exits 0 on success (and in the parent after going into
# the background), or with the error code after reporting the error on
# STDERR and, when one exists, in the webtask.
#
sub DoDeleteNodes()
{
    my $logname;
    my $errmsg;
    my $errcode = 1;
    # Set once we hold the slice lock, so that the error path only
    # releases a lock (and resets a status) that belongs to us.
    my $locked  = 0;

    usage()
        if (!@ARGV);

    my $slice = $instance->GetGeniSlice();
    if (!defined($slice)) {
        fatal("No slice for instance");
    }
    my @aggregates = ();
    my %node_ids   = ();
    my %aggmap     = ();

    # Map each aggregate urn to the requested client ids it hosts.
    foreach my $obj ($instance->AggregateList()) {
        my $manifest = GeniXML::Parse($obj->manifest());
        if (! defined($manifest)) {
            fatal("Could not parse manifest");
        }
        my @nodes = GeniXML::FindNodes("n:node", $manifest)->get_nodelist();
        foreach my $node (@nodes) {
            my $client_id = GeniXML::GetVirtualId($node);

            next
                if (! grep {$_ eq $client_id} @ARGV);

            my $sliver_urn  = GeniXML::GetSliverId($node);
            my $manager_urn = GetManagerId($node);

            # No sliver urn or a different aggregate.
            next
                if (! (defined($sliver_urn) &&
                       defined($manager_urn) &&
                       $manager_urn eq $obj->aggregate_urn()));

            if (!exists($aggmap{$obj->aggregate_urn()})) {
                $aggmap{$obj->aggregate_urn()} = [];
                push(@aggregates, $obj);
            }
            push(@{ $aggmap{$obj->aggregate_urn()} }, $client_id);
            $node_ids{$sliver_urn} = $client_id;
        }
    }

    #
    # Lock the slice in case it is doing something else, like taking
    # a disk image.
    #
    if ($slice->Lock()) {
        $errmsg = "Experiment is busy, cannot lock it. Please try again later";
        goto bad;
    }
    $locked = 1;

    #
    # Create the webtask object, but AFTER locking the slice so we do
    # not destroy one in use.
    #
    if (defined($webtask_id)) {
        $webtask = WebTask->LookupOrCreate($instance->uuid(), $webtask_id);
        # Convenient.
        $webtask->AutoStore(1);
    }

    #
    # Run in a child for each aggregate: issue the DeleteNodes RPC, store
    # the new manifest and prune the deleted nodes from the status blob.
    # Returns 0 on success, otherwise an error code that is also stored
    # in the aggregate's webtask together with the error message.
    #
    my $coderef = sub {
        my ($sliver) = @_;
        my $webtask  = $sliver->webtask();
        my @nodes    = @{ $aggmap{$sliver->aggregate_urn()} };
        my $errcode  = -1;
        my $errmsg;

        $sliver->SetStatus("provisioning");

        my $response = $sliver->DeleteNodes(\$errmsg, @nodes);
        if (!defined($response)) {
            $errmsg = "RPC Error calling DeleteNode";
            goto bad;
        }
        if ($response->code() != GENIRESPONSE_SUCCESS) {
            if ($response->code() == GENIRESPONSE_SEARCHFAILED) {
                print STDERR "Slice is gone on $sliver\n";
                # Make sure the webtask carries a message as well.
                $errmsg = "Slice is gone";
                goto bad;
            }
            if ($response->code() == GENIRESPONSE_BUSY) {
                $errmsg = "Experiment is busy; try again later";
                goto bad;
            }
            $errmsg  = $response->output();
            $errcode = $response->code();
            goto bad;
        }
        # We get back a new manifest.
        my $manifest = $response->value();
        $sliver->SetManifest($manifest);

        # Delete the nodes from the status blob.
        if ($webtask->sliverstatus()) {
            my $blob = $webtask->sliverstatus();
            foreach my $node_id (@nodes) {
                delete($blob->{$node_id});
            }
            $webtask->sliverstatus($blob);
        }
        $sliver->SetStatus("provisioned");
        return 0;

      bad:
        $sliver->SetStatus("ready");
        $webtask->output($errmsg);
        $webtask->Exited($errcode);
        print STDERR "Returning $errcode from coderef\n";
        return $errcode;
    };

    #
    # Set the status back to provisioning for the web interface.
    #
    $instance->SetStatus("provisioning");

    my @return_codes = ();
    if (ParRun({"maxwaittime" => 99999,
                "maxchildren" => scalar(@aggregates)},
               \@return_codes, $coderef, @aggregates)) {
        #
        # The parent caught a signal. Leave things intact so that we can
        # kill things cleanly later.
        #
        $errmsg = "Internal error calling DeleteNodes";
        goto bad;
    }

    #
    # Check the exit codes.
    #
    foreach my $aggobj (@aggregates) {
        my $code = shift(@return_codes);

        # Updated in a forked child, must refresh.
        $aggobj->Refresh();

        if ($code) {
            if ($aggobj->webtask()->output()) {
                $errmsg = $aggobj->webtask()->output();
            }
            else {
                $errmsg = "Some nodes could not be deleted";
            }
            $errcode = $aggobj->webtask()->exitcode();
            goto bad;
        }
    }

    #
    # Let the web interface continue, we poll now.
    #
    if (!$debug) {
        $logname = TBMakeLogname("deletenode");
        if (TBBackGround($logname)) {
            exit(0);
        }
    }
    $instance->SetStatus("provisioned");
    $instance->ComputeNodeCounts();

    @return_codes = ();
    if (ParRun({"maxwaittime" => 99999,
                "maxchildren" => scalar(@aggregates)}, \@return_codes,
               \&APT_Instance::Aggregate::WaitForSliver, @aggregates)) {
        #
        # The parent caught a signal. Leave things intact so that we can
        # kill things cleanly later.
        #
        $errmsg = "Internal error waiting for slivers";
        goto bad;
    }

    #
    # Check the exit codes.
    #
    foreach my $aggobj (@aggregates) {
        my $code = shift(@return_codes);

        # Updated in a forked child, must refresh.
        $aggobj->Refresh();

        if ($code) {
            if ($aggobj->webtask()->output()) {
                $errmsg = $aggobj->webtask()->output();
            }
            else {
                $errmsg = "WaitforSliver Failure at ".$aggobj->aggregate_urn();
            }
            # The exit status comes from exitcode(); output() is the text.
            $errcode = $aggobj->webtask()->exitcode();
            goto bad;
        }
    }
    $slice->UnLock();
    $instance->SetStatus("ready");
    # The webtask only exists when a webtask id was supplied.
    $webtask->Exited(0)
        if (defined($webtask));
    exit(0);

  bad:
    if ($locked) {
        $instance->SetStatus("ready");
        $slice->UnLock();
    }
    print STDERR $errmsg . "\n";
    if (defined($webtask)) {
        $webtask->output($errmsg);
        $webtask->Exited($errcode);
    }
    exit($errcode);
}
#
# Start up the monitor for an instance. Only one though.
#
......@@ -1644,12 +1859,12 @@ sub StartMonitor()
# Convenient.
$webtask->AutoStore(1);
}
return StartMonitorInternal($waitforstartup)
return StartMonitorInternal($waitforstartup);
}
sub StartMonitorInternal(;$)
sub StartMonitorInternal(;$@)
{
my ($waitforstartup) = @_;
my ($waitforstartup, @aggregatelist) = @_;
my $logfile;
my $signaled = 0;
......@@ -1673,7 +1888,7 @@ sub StartMonitorInternal(;$)
if (!$debug) {
$logfile = TBMakeLogname("aptmonitor");
if (TBBackGround($logfile)) {
return 0;
return $PID;
}
}
$instance->Update({"monitor_pid" => '$PID'});
......@@ -1715,6 +1930,7 @@ sub StartMonitorInternal(;$)
else {
$webtask->output($response->output());
}
$webtask->exitcode($response->code());
}
return -1;
}
......@@ -1741,7 +1957,9 @@ sub StartMonitorInternal(;$)
}
}
#
# We poll until the status goes ready. Might not be a good idea.
# We poll until the status goes ready, and if waiting for the
# startup commands to finish, for all of them to no longer be
# executing.
#
if ($blob->{'status'} eq "ready") {
return 0
......@@ -1771,7 +1989,7 @@ sub StartMonitorInternal(;$)
local $SIG{INT} = $handler;
}
my @return_codes = ();
my @agglist = $instance->AggregateList();
my @agglist = $instance->AggregateList() if (! @aggregatelist);
if (ParRun({"maxwaittime" => 99999,
"maxchildren" => scalar(@agglist)},
\@return_codes, $coderef, @agglist)) {
......@@ -1801,7 +2019,7 @@ sub StartMonitorInternal(;$)
}
unlink($logfile)
if (defined($logfile) && !$debug);
exit(0);
exit($seconds < 0 ? -1 : 0);
}
#
......@@ -2123,7 +2341,7 @@ sub DoLinktest()
$errmsg = "Could not $action linktest on some slivers";
if ($agg->webtask()->output()) {
$errmsg .= ": " . $agg->webtask()->output();
$errcode = $code;
$errcode = $agg->webtask()->exitcode();
}
goto bad;
}
......@@ -2321,7 +2539,7 @@ sub DoUpdateKeys()
$errmsg = "Could not update keys on some slivers";
if ($agg->webtask()->output()) {
$errmsg .= ": " . $agg->webtask()->output();
$errcode = $code;
$errcode = $agg->webtask()->output();
}
goto bad;
}
......
......@@ -45,6 +45,7 @@ function (_, sup, moment, marked, UriTemplate, ShowImagingModal,
var consolenodes = {};
var showlinktest = false;
var hidelinktest = false;
var changingtopo = false;
var EMULAB_NS = "http://www.protogeni.net/resources/rspec/ext/emulab/1";