All new accounts created on Gitlab now require administrator approval. If you invite any collaborators, please let Flux staff know so they can approve the accounts.

Commit 0199f565 authored by Leigh B Stoller

Some tweaks to DeleteSlice() and cancelation while still setting up,

hopefully fixes the race.
parent b6f5a1c2
...@@ -1579,6 +1579,8 @@ sub ActionStart($$;$) ...@@ -1579,6 +1579,8 @@ sub ActionStart($$;$)
$self->ComputeState(); $self->ComputeState();
$experiment->SetState($expstate); $experiment->SetState($expstate);
$slice->ClearMonitorPid(); $slice->ClearMonitorPid();
# in case we were canceled by DeleteSlice()
$experiment->SetCancelFlag(0);
return 0; return 0;
bad: bad:
...@@ -1599,6 +1601,8 @@ sub ActionStart($$;$) ...@@ -1599,6 +1601,8 @@ sub ActionStart($$;$)
} }
$experiment->SetState($expstate); $experiment->SetState($expstate);
$slice->ClearMonitorPid(); $slice->ClearMonitorPid();
# in case we were canceled by DeleteSlice()
$experiment->SetCancelFlag(0);
return -1; return -1;
} }
......
...@@ -904,6 +904,7 @@ sub DeleteSlice($) ...@@ -904,6 +904,7 @@ sub DeleteSlice($)
if ($slice->Lock() != 0) { if ($slice->Lock() != 0) {
return GeniResponse->BusyResponse(); return GeniResponse->BusyResponse();
} }
my $slice_experiment = $slice->GetExperiment();
# #
# If a monitor process is running, then the slice is busy. # If a monitor process is running, then the slice is busy.
...@@ -928,7 +929,6 @@ sub DeleteSlice($) ...@@ -928,7 +929,6 @@ sub DeleteSlice($)
# rebooting timed out nodes, and quit earlier. The caller will not # rebooting timed out nodes, and quit earlier. The caller will not
# have to retry as long. # have to retry as long.
# #
my $slice_experiment = $slice->GetExperiment();
if (defined($slice_experiment)) { if (defined($slice_experiment)) {
$slice_experiment->SetCancelFlag(1); $slice_experiment->SetCancelFlag(1);
} }
...@@ -953,7 +953,8 @@ sub DeleteSlice($) ...@@ -953,7 +953,8 @@ sub DeleteSlice($)
} }
} }
# #
# If we were canceled, we wait for the monitor to stop, instead of # If we were canceled, we wait for the monitor to stop before
# we can kill it.
# #
if ($canceled) { if ($canceled) {
while ($slice->GetMonitorPid()) { while ($slice->GetMonitorPid()) {
...@@ -961,7 +962,11 @@ sub DeleteSlice($) ...@@ -961,7 +962,11 @@ sub DeleteSlice($)
GeniCM::CheckMonitor($slice); GeniCM::CheckMonitor($slice);
print STDERR "Checking to see if monitor has stopped ...\n"; print STDERR "Checking to see if monitor has stopped ...\n";
} }
#
# The monitor has stopped and we have the lock. Clear the
# cancel flag so we can actually terminate (checked in endexp).
#
$slice_experiment->SetCancelFlag(0);
} }
my $retval = GeniCM::CleanupDeadSlice($slice, 1); my $retval = GeniCM::CleanupDeadSlice($slice, 1);
if ($retval) { if ($retval) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment