From 32c3d934d1b8ec8f53f721576d8415107199fa37 Mon Sep 17 00:00:00 2001 From: Leigh B Stoller Date: Tue, 1 Dec 2015 17:50:17 -0700 Subject: [PATCH] Add support for cancelation; stopping an experiment setup early, instead of waiting until it finishes setting up (or fails). This is really nice when a 1000 node experiment has gone awry and it is pointless to wait for it to finish. When we do this, we mark the instance as canceled in the DB, and then wait for create_instance() to notice it. When it does, it stops waiting and invokes terminate with a new cancel option at the backend. --- apt/APT_Instance.pm.in | 40 +++++++++++++++++++++++++++++++++++- apt/create_instance.in | 41 ++++++++++++++++++++++++++++++++++++- apt/manage_instance.in | 17 ++++++++++++++- www/aptui/instance_defs.php | 1 + www/aptui/js/status.js | 8 ++++++++ www/aptui/myexperiments.php | 4 ++++ www/aptui/status.ajax | 1 + 7 files changed, 109 insertions(+), 3 deletions(-) diff --git a/apt/APT_Instance.pm.in b/apt/APT_Instance.pm.in index 5d7a2eeff..a43e87bbd 100644 --- a/apt/APT_Instance.pm.in +++ b/apt/APT_Instance.pm.in @@ -406,6 +406,42 @@ sub SetManifest($$) return 0; } +sub MarkCanceled($) +{ + my ($self) = @_; + + # Must be a real reference. + return -1 + if (! ref($self)); + + my $uuid = $self->uuid(); + + DBQueryWarn("update apt_instances set ". + " canceled=1,canceled_timestamp=now() ". + "where uuid='$uuid'") + or return -1; + + $self->{'INSTANCE'}->{'canceled'} = 1; + return 0; +} +# We need to bypass the in memory state for this. +sub IsCanceled($) +{ + my ($self) = @_; + my $uuid = $self->uuid(); + + my $query_result = + DBQueryWarn("select canceled from apt_instances ". + "where uuid='$uuid'"); + + return -1 + if (!$query_result); + return 0 + if (!$query_result->numrows); + my ($canceled) = $query_result->fetchrow_array(); + return $canceled; +} + # # Set to use the logfile. 
# @@ -1205,7 +1241,9 @@ sub Terminate($) $speaksfor_credential->asString()]; } $method = "DeleteSliver"; - @params = ($slice->urn(), $credentials, {"blocking" => 'true'}); + @params = ($slice->urn(), $credentials, + {"blocking" => 'true', + "cancel" => 'true'}); # Convert URL to use AM interface. $cmurl =~ s/\/cm$/\/am/; diff --git a/apt/create_instance.in b/apt/create_instance.in index 14864fc2e..bae618fda 100755 --- a/apt/create_instance.in +++ b/apt/create_instance.in @@ -847,6 +847,7 @@ sub WaitForSliver($) my $interval = 15; my $ready = 0; my $failed = 0; + my $rpcfail = 0; my $public_url; my $repblob; my $laststatus; @@ -859,7 +860,8 @@ sub WaitForSliver($) if (!defined($response) || !defined($response->value()) || ($response->code() != GENIRESPONSE_SUCCESS && $response->code() != GENIRESPONSE_SERVER_UNAVAILABLE && - $response->code() != GENIRESPONSE_BUSY)) { + $response->code() != GENIRESPONSE_BUSY && + $response->code() != GENIRESPONSE_RPCERROR)) { print STDERR "SliverStatus failed"; if (defined($response)) { @@ -878,6 +880,23 @@ sub WaitForSliver($) $failed = 1; last; } + if ($response->code() == GENIRESPONSE_RPCERROR) { + if ($rpcfail > 10) { + if ($response->output() =~ /read timeout/) { + $webtask->output("Lost contact with the aggregate. " . + "Possibly a network failure, ". + "please try again later."); + } + else { + $webtask->output($response->output()); + } + $failed = 1; + last; + } + $rpcfail++; + next; + } + $rpcfail = 0; next if ($response->code() == GENIRESPONSE_BUSY || $response->code() == GENIRESPONSE_SERVER_UNAVAILABLE); @@ -888,6 +907,7 @@ sub WaitForSliver($) # cares about. We get this on each loop, update so the web # interface can show changes. 
# + my $changed = 0; my $statusblob = {}; foreach my $urn (keys(%{$repblob->{'details'}})) { my $details = $repblob->{'details'}->{$urn}; @@ -940,6 +960,13 @@ sub WaitForSliver($) $webtask->output("Experiment setup on $urn failed"); last; } + elsif ($instance->IsCanceled()) { + last; + } + } + if ($instance->IsCanceled()) { + $webtask->Exited(0); + return 0; } if ($failed || !$ready) { $aggobj->SetStatus("failed"); @@ -981,6 +1008,18 @@ print "$slice_urn\n"; # Count up nodes running a startup service. my $startuprunning = 0; +# +# If we were canceled, then none of the stuff below matters, we +# are going to do a terminate. +# +if ($instance->IsCanceled()) { + $instance->SetStatus("canceled"); + $slice->UnLock(); + + system("$MANAGEINSTANCE -t $webtask_id terminate $quickvm_uuid"); + exit(0); +} + # # Check the exit codes; any failure is a total failure (for now). # diff --git a/apt/manage_instance.in b/apt/manage_instance.in index 5c6bb2b10..53d9a4d52 100644 --- a/apt/manage_instance.in +++ b/apt/manage_instance.in @@ -1015,7 +1015,22 @@ sub DoTerminate() # a disk image. # if ($slice->Lock()) { - fatal("Slice is busy, cannot lock it"); + # + # A special case is if the slice is provisioning. This means the + # user is giving up on it, and we want to tell the aggregate to + # kill it. Not all aggregates are going to allow this, so need + # to be able to deal with that. + # + if ($instance->status() ne "provisioned") { + fatal("Slice is busy, cannot lock it"); + } + if (!$instance->canceled()) { + print "Marking instance canceled\n"; + $instance->MarkCanceled(); + } + sleep(1); + # We have an obvious race here since we do not have the lock. 
+ exit(0); } my $old_status = $instance->status(); $instance->SetStatus("terminating"); diff --git a/www/aptui/instance_defs.php b/www/aptui/instance_defs.php index 69adb8508..795cdffee 100644 --- a/www/aptui/instance_defs.php +++ b/www/aptui/instance_defs.php @@ -134,6 +134,7 @@ class Instance function profile_id() { return $this->field('profile_id'); } function profile_version() { return $this->field('profile_version'); } function status() { return $this->field('status'); } + function canceled() { return $this->field('canceled'); } function pid() { return $this->field('pid'); } function pid_idx() { return $this->field('pid_idx'); } function public_url() { return $this->field('public_url'); } diff --git a/www/aptui/js/status.js b/www/aptui/js/status.js index f0b11316c..49e4be4a9 100644 --- a/www/aptui/js/status.js +++ b/www/aptui/js/status.js @@ -379,6 +379,13 @@ function (_, sup, moment, marked, UriTemplate, ShowImagingModal, else if (status == 'provisioned') { $("#status_progress_bar").width("66%"); status_html = "booting"; + if (json.value.canceled) { + status_html += " (but canceled)"; + } + else { + // So the user can cancel. + EnableButton("terminate"); + } } else if (status == 'ready') { bgtype = "panel-success"; @@ -397,6 +404,7 @@ function (_, sup, moment, marked, UriTemplate, ShowImagingModal, $("#status_progress_div").addClass("progress-bar-success"); $("#status_progress_bar").width("100%"); } + $('#error_panel').addClass("hidden"); EnableButtons(); // We should be looking at the node status instead. 
if (lastStatus != "imaging") { diff --git a/www/aptui/myexperiments.php b/www/aptui/myexperiments.php index e8daa582a..776c62679 100644 --- a/www/aptui/myexperiments.php +++ b/www/aptui/myexperiments.php @@ -180,6 +180,7 @@ function SPITROWS($showall, $name, $result) $uuid = $row["uuid"]; $name = $row["name"]; $status = $row["status"]; + $canceled = $row["canceled"]; $created = DateStringGMT($row["created"]); $expires = DateStringGMT($row["expires"]); $creator_idx = $row["creator_idx"]; @@ -206,6 +207,9 @@ function SPITROWS($showall, $name, $result) if ($row["expired"]) { $status = "expired"; } + elseif ($canceled) { + $status = "canceled"; + } $profile = Profile::Lookup($profile_id, $version); if ($profile) { $profile_name = $profile->name(); diff --git a/www/aptui/status.ajax b/www/aptui/status.ajax index 5035f54dd..e7da73d1d 100644 --- a/www/aptui/status.ajax +++ b/www/aptui/status.ajax @@ -120,6 +120,7 @@ function Do_GetInstanceStatus() } $blob = array(); $blob["status"] = $instance->status(); + $blob["canceled"] = $instance->canceled() ? 1 : 0; $blob["sliverstatus"] = array(); $blob["sliverurls"] = array(); -- GitLab