Commit 5bd9ad1a authored by Leigh B Stoller

Add cancel support. The idea is that a DeleteSlice() call with our internal
cancel option will stop a CreateSliver() in its tracks: we stop the
monitor, then clean up the slice. I also added an optimization for tearing
down large numbers of VMs on shared nodes; previously we were doing them
one at a time. Note that only the Portal is going to use this option, since
it loosely depends on code in the XEN clientside (described in another
commit).
parent aacaf7b8
......@@ -430,6 +430,11 @@ sub DeleteSliver()
exists($options->{'blocking'}) && $options->{'blocking'}) {
$delete_args->{'blocking'} = 1;
}
# And cancel
if (defined($options) &&
exists($options->{'cancel'}) && $options->{'cancel'}) {
$delete_args->{'cancel'} = 1;
}
my $response = GeniCMV2::DeleteSlice($delete_args);
if (!ref($response)) {
......
......@@ -1076,6 +1076,7 @@ sub Action($$$;$)
goto bad;
}
$node->_reloaded(0);
$node->_rebooted(0);
# Backpointer used in WaitForNodes().
$node->_sliver($sliver);
$node->_image(undef);
......@@ -1549,6 +1550,9 @@ sub Action($$$;$)
if (keys(%reboots)) {
my @node_ids = keys(%reboots);
# Mark them as being rebooted. See below.
map { $_->_rebooted(1) } values(%reboots);
#
# Should waiting be an option?
#
......@@ -1773,10 +1777,18 @@ sub WaitForNodes($$@)
$canceled = $experiment->canceled();
if ($canceled) {
print STDERR "WaitForNodes canceled; terminating early!\n";
#
# Mark the remaining nodes as canceled (which really means failure).
#
foreach my $node (values(%nodes)) {
$node->_sliver()->SetStatus("canceled")
if (defined($node->_sliver()));
}
# Reset before return; do not want it left.
$slice->LockTables();
$experiment->SetCancelFlag(0);
# Do this first so others know we are reacting to the cancel.
$slice->ClearMonitorPid();
$experiment->SetCancelFlag(0);
$slice->UnLockTables();
return -1;
}
......@@ -1864,17 +1876,18 @@ sub WaitForNodes($$@)
($state eq TBDB_NODESTATE_BOOTING &&
time() - $node->_laststatestamp() > 400))) {
my $giveup = $node->_retried();
my $minutes = int($waittime / 60);
# physnode, give up right away.
$giveup = 1
if (!$node->isvirtnode());
if ($giveup) {
print STDERR
"$node_id still stuck in $state, giving up.\n";
print STDERR "$node_id still stuck in $state at $minutes, ".
"giving up.\n";
}
else {
print STDERR
"$node_id is stuck in $state, restarting it.\n";
print STDERR "$node_id is stuck in $state at $minutes, ".
"restarting it.\n";
$node->_retried(1);
system("$NODEREBOOT $node_id");
$giveup = 1
......@@ -1893,6 +1906,9 @@ sub WaitForNodes($$@)
if ($node->_laststate() ne $state) {
$node->_laststatestamp(time());
$node->_laststate($state);
my $minutes = int($waittime / 60);
print STDERR "$node switches to $state at ".
"$minutes minutes (" . time() . ")\n";
}
if (int($waittime / 60) > $minutes) {
# Changing minutes is why we get this print for just
......@@ -1978,11 +1994,11 @@ sub WaitForNodes($$@)
}
# Too late, but reset before return; do not want it left set.
$slice->LockTables();
# Do this first. See cancel in DeleteSlice().
$slice->ClearMonitorPid();
if ($experiment->canceled()) {
$experiment->SetCancelFlag(0);
}
# Do this last.
$slice->ClearMonitorPid();
$slice->UnLockTables();
return 0;
......@@ -2180,8 +2196,9 @@ sub UnProvision($;$)
# Might be an aggregate that includes link aggregates. Lets do those
# first to avoid work when tearing down the nodes.
#
my @links = ();
my @nodes = ();
my @links = ();
my @nodes = ();
my @shared = ();
foreach my $sliver (@slivers) {
if (ref($sliver) eq "GeniAggregate::Link" ||
......@@ -2193,7 +2210,22 @@ sub UnProvision($;$)
return -1;
}
elsif (ref($sliver) eq "GeniSliver::Node") {
push(@nodes, $sliver);
#
# We need an optimization here, to speed up teardown of
# vnodes on shared nodes, which will happen one at a time
# if we let them go through the sliver Unprovision() call.
#
my $node = Node->Lookup($sliver->resource_id());
if (!defined($node)) {
print STDERR "Could not map $sliver to its a node\n";
next;
}
if ($node->isvirtnode() && $node->sharing_mode()) {
push(@shared, $sliver);
}
else {
push(@nodes, $sliver);
}
}
}
foreach my $sliver (@links) {
......@@ -2210,6 +2242,34 @@ sub UnProvision($;$)
next;
}
}
# Now do the shared nodes in a group.
if (@shared) {
my $experiment = Experiment->Lookup($self->slice_uuid());
if (!defined($experiment)) {
print STDERR "Could not map $self to its experiment\n";
return 0;
}
my $pid = $experiment->pid();
my $eid = $experiment->eid();
my @ids = map { $_->resource_id() } @shared;
system("$VNODESETUP -j -q -m -k $pid $eid @ids");
if ($?) {
print STDERR "$VNODESETUP -k failed on @ids\n";
}
# Mark as stopped to avoid duplicate work later.
foreach my $sliver (@shared) {
$sliver->SetStatus("stopped");
#
# Now we can unprovision, which will be much faster now.
#
if ($sliver->UnProvision($nophysfree) != 0) {
print STDERR "Could not unprovision $sliver in $self\n";
$sliver->SetStatus("broken");
next;
}
}
}
return 0;
}
......
......@@ -715,7 +715,9 @@ sub DeleteSlice($)
my $slice_urn = $argref->{'slice_urn'};
my $credentials = $argref->{'credentials'};
my $impotent = $argref->{'impotent'} || 0;
my $cancel = 0;
my $blocking = 0;
my $canceled = 0;
if (! (defined($credentials) && defined($slice_urn))) {
return GeniResponse->MalformedArgsResponse("Missing arguments");
......@@ -726,6 +728,9 @@ sub DeleteSlice($)
if (exists($argref->{'blocking'}) && $argref->{'blocking'}) {
$blocking = 1;
}
if (exists($argref->{'cancel'}) && $argref->{'cancel'}) {
$cancel = 1;
}
my ($credential,$speaksfor) = GeniStd::CheckCredentials($credentials);
return $credential
if (GeniResponse::IsResponse($credential));
......@@ -767,11 +772,26 @@ sub DeleteSlice($)
#
GeniCM::CheckMonitor($slice);
if ($slice->monitor_pid()) {
$slice->UnLock();
return GeniResponse->MonitorResponse();
if (!$cancel) {
$slice->UnLock();
return GeniResponse->MonitorResponse()
}
#
# But what we can do is set the cancel flag, which the monitor is
# checking each time through the loop. This will cause it to stop
# rebooting timed out nodes, and quit earlier. The caller will not
# have to retry as long.
#
my $slice_experiment = $slice->GetExperiment();
if (defined($slice_experiment)) {
$slice_experiment->SetCancelFlag(1);
}
print STDERR "Canceling the monitor (" . $slice->monitor_pid() . ")\n";
$canceled = 1;
}
# If any slivers are imaging, then we are busy as well.
if (defined($aggregate) &&
elsif (defined($aggregate) &&
$aggregate->CheckSliverStates("imaging")) {
$slice->UnLock();
return GeniResponse->BusyResponse();
......@@ -787,6 +807,16 @@ sub DeleteSlice($)
return GeniResponse->Create(GENIRESPONSE_SUCCESS);
}
}
#
# If we were canceled, we wait for the monitor to stop, instead of
# returning the monitor response, before cleaning up the slice.
#
if ($canceled) {
while ($slice->monitor_pid()) {
sleep(10);
GeniCM::CheckMonitor($slice);
print STDERR "Checking to see if monitor has stopped ...\n";
}
}
my $retval = GeniCM::CleanupDeadSlice($slice, 1);
if ($retval) {
return -1
......
......@@ -1004,18 +1004,24 @@ sub UnProvision($;$)
my $pid = $experiment->pid();
my $eid = $experiment->eid();
if ($node->isremotenode() && $node->isvirtnode()) {
system("$VNODESETUP -p -q -m -k $pid $eid $node_id");
if ($?) {
print STDERR "$VNODESETUP -k failed on $node_id\n";
return -1;
#
# Shared virtnode only for this part, we want to avoid duplicated
# or unnecessary work (no need to tear down on dedicated hosts).
#
if ($self->status() ne "stopped") {
if ($node->isremotenode() && $node->isvirtnode()) {
system("$VNODESETUP -p -q -m -k $pid $eid $node_id");
if ($?) {
print STDERR "$VNODESETUP -k failed on $node_id\n";
return -1;
}
}
}
elsif ($node->sharing_mode()) {
system("$VNODESETUP -j -q -m -k $pid $eid $node_id");
if ($?) {
print STDERR "$VNODESETUP -k failed on $node_id\n";
return -1;
elsif ($node->sharing_mode()) {
system("$VNODESETUP -j -q -m -k $pid $eid $node_id");
if ($?) {
print STDERR "$VNODESETUP -k failed on $node_id\n";
return -1;
}
}
}
......@@ -1311,7 +1317,7 @@ sub ComputeStatus($$)
# If the sliver is "broken" then call it failed. It might be failed,
# but if the node actually came up okay later, we set it back to okay.
#
if ($self->status() eq "broken") {
if ($self->status() eq "broken" || $self->status() eq "canceled") {
$$pref = "failed";
return 0;
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment