Commit 104fe855 authored by Leigh Stoller

Plumb through optional power off when quarantining an experiment.

Part of the Power Panic Button.
parent 66bf14c2
#!/usr/bin/perl -wT
#
# Copyright (c) 2007-2018 University of Utah and the Flux Group.
# Copyright (c) 2007-2019 University of Utah and the Flux Group.
#
# {{{EMULAB-LICENSE
#
......@@ -775,6 +775,17 @@ sub GetGroup($)
return Group->Lookup($self->gid_idx());
}
#
# Map an instance to the local experiment (when it is running locally).
#
# Looks the experiment up by this instance's slice uuid and hands back
# whatever Experiment->Lookup() returns for it.
#
sub LocalExperiment($)
{
    my $self = shift;

    # The Experiment module is pulled in at call time, not at file load.
    require Experiment;

    my @lookup_args = ($self->slice_uuid());
    return Experiment->Lookup(@lookup_args);
}
#
# Warn creator that the experiment is going to expire. This is hooked
# in from the sa_daemon, so we can send a message that is less geni like
......@@ -3145,9 +3156,9 @@ sub Lockdown($$$)
#
# Panic
#
sub Panic($$)
sub Panic($$$)
{
my ($self, $clear) = @_;
my ($self, $clear, $poweroff) = @_;
my $authority = $self->GetGeniAuthority();
my $slice = $self->instance()->GetGeniSlice();
my $context = APT_Geni::GeniContext();
......@@ -3165,6 +3176,8 @@ sub Panic($$)
};
$args->{"clear"} = 1
if ($clear);
$args->{"poweroff"} = 1
if ($poweroff);
my $cmurl = $authority->url();
$cmurl = devurl($cmurl) if ($usemydevtree);
......
......@@ -3259,10 +3259,12 @@ sub DoPanic()
my $errmsg;
my $errcode = -1;
my $exitcode = -1;
my $takelock = 0;
my $response;
my $optlist = "L";
my $optlist = "Lp";
my $takelock = 0;
my $poweroff = 0;
my %options = ();
if (! getopts($optlist, \%options)) {
usage();
......@@ -3270,6 +3272,9 @@ sub DoPanic()
if (defined($options{"L"})) {
$takelock = 1;
}
if (defined($options{"p"})) {
$poweroff = 1;
}
usage()
if (@ARGV != 1);
......@@ -3293,6 +3298,13 @@ sub DoPanic()
$exitcode = 1;
goto bad;
}
# Check the panic flag after locking, in case someone beat us to it.
$instance->Refresh();
if ($instance->paniced()) {
print "Already in a panic mode!\n";
$slice->UnLock();
exit(0);
}
#
# No panic at stitching aggregates, so look to see if we cross any.
......@@ -3316,7 +3328,7 @@ sub DoPanic()
my $coderef = sub {
my ($sliver) = @_;
return $sliver->Panic(($setclr eq "clear" ? 1 : 0));
return $sliver->Panic(($setclr eq "clear" ? 1 : 0), $poweroff);
};
# Invoke on all clusters
......@@ -4458,6 +4470,7 @@ sub DoWarn()
my $freeze = 0;
my $terminate = 0;
my $panic = 0;
my $poweroff = 0;
my $errmsg;
my $reason;
my $logfile;
......@@ -4470,7 +4483,7 @@ sub DoWarn()
my $slice_uuid = $slice->uuid();
my $weburl = $instance->webURL();
my $optlist = "f:FTQ";
my $optlist = "f:FTQP";
my %options = ();
if (! getopts($optlist, \%options)) {
usage();
......@@ -4483,6 +4496,9 @@ sub DoWarn()
}
elsif (defined($options{"Q"})) {
$panic = 1;
if (defined($options{"P"})) {
$poweroff = 1;
}
}
if (defined($options{"f"})) {
my $filename = $options{"f"};
......@@ -4605,7 +4621,7 @@ sub DoWarn()
# We pass the lock through.
system("$MANAGEINSTANCE " .
(defined($webtask) ? "-t $webtask_id " : "").
" -d -- panic $uuid -L set");
" -d -- panic $uuid -L " . ($poweroff ? "-p " : "") . "set");
}
else {
# We pass the lock through.
......@@ -4886,7 +4902,7 @@ sub CallMethodOnAggregates($$$@)
"$sliver");
last;
}
print Dumper($response);
#print Dumper($response);
# We can keep trying for these, but not an RPC error.
last
......
......@@ -5724,6 +5724,7 @@ sub Panic($)
my ($argref) = @_;
my $slice_urn = $argref->{'slice_urn'};
my $clear = $argref->{'clear'};
my $poweroff = $argref->{'poweroff'};
my $credentials = $argref->{'credentials'};
if (! (defined($credentials) && defined($slice_urn))) {
......@@ -5773,7 +5774,8 @@ sub Panic($)
}
my $pid = $experiment->pid();
my $eid = $experiment->eid();
my $command = "$WAP $PANIC -l 1 " . ($clear ? "-r " : "") . "$pid $eid";
my $command = "$WAP $PANIC -l " . ($poweroff ? "3" : "1") . " " .
($clear ? "-r " : "") . "$pid $eid";
#
# The backend script sends a bunch of stuff to stdout, so capture it.
# We want to do this in the background cause it is going to take a long
......@@ -5782,20 +5784,6 @@ sub Panic($)
#
my $mypid = main::WrapperFork();
if ($mypid) {
sleep(3);
my $kid = waitpid($mypid, &WNOHANG);
if ($kid == $mypid) {
my $stat = $?;
if ($stat & 127) {
# died with a signal, return the signal
$stat = $stat & 127;
} else {
# else return the exit code
$stat = $stat >> 8;
}
$slice->UnLock();
return GeniResponse->Create(GENIRESPONSE_ERROR);
}
# We want to unlock it so we can get status, so we set the shutdown
# flag since that will prevent any other changes from happening.
if (!$clear) {
......@@ -5804,10 +5792,10 @@ sub Panic($)
$slice->UnLock();
return GeniResponse->Create(GENIRESPONSE_SUCCESS);
}
sleep(1);
GeniUtil::FlipToElabMan();
my $output = GeniUtil::ExecQuiet($command);
# Not a typical op, so always print debugging info;
print STDERR $output;
print STDERR "Running '$command'\n";
system($command);
if ($?) {
return -1;
}
......
......@@ -1566,6 +1566,7 @@ sub ComputeStatus($;$)
$status = "failed";
}
elsif ($eventstate eq TBDB_NODESTATE_SHUTDOWN() ||
$eventstate eq TBDB_NODESTATE_POWEROFF() ||
$eventstate eq TBDB_NODESTATE_PXEWAIT()) {
$status = "notready";
}
......
#!/usr/bin/perl -w
#
# Copyright (c) 2000-2016 University of Utah and the Flux Group.
# Copyright (c) 2000-2019 University of Utah and the Flux Group.
#
# {{{EMULAB-LICENSE
#
......@@ -37,6 +37,7 @@ sub usage()
"switches and arguments:\n".
"-l level - Level 1; reboot nodes into the admin MFS\n".
" - Level 2; disable the control network\n".
" - Level 3; power off all nodes\n".
"-r - Reset panic state (admin people only)\n".
"-c - Clear panic state but do not do anything else\n".
"-w - From web interface, create a log file.\n".
......@@ -52,6 +53,7 @@ my $dolog = 0;
sub fatal($);
sub DoIt();
sub PowerMode();
#
# Exit codes are important; they tell the web page what has happened so
......@@ -83,6 +85,7 @@ sub ExitWithStatus($$)
my $TB = "@prefix@";
my $TBOPS = "@TBOPSEMAIL@";
my $snmpit = "$TB/bin/snmpit";
my $POWER = "$TB/bin/power";
#
# Testbed Support libraries
......@@ -131,9 +134,9 @@ if (defined($options{"w"})) {
$dolog = 1;
}
if (defined($options{"l"})) {
$level = 1;
$level = $options{"l"};
usage()
if ($level < 1 || $level > 2);
if ($level < 1 || $level > 3);
}
my $this_user = User->ThisUser();
......@@ -212,10 +215,17 @@ $experiment->Lock(($reset || $clear ?
if ($clear) {
$experiment->SetPanicBit(0);
$experiment->Unlock();
print "Panic state has been cleared, but nodes are still in the MFS\n";
print "Panic state has been cleared, but nothing else has been done\n";
exit(0);
}
#
# Deal with level 3.
#
if ($experiment->paniced() == 3 || $level == 3) {
exit(PowerMode());
}
# Force level 2 for firewalled experiments.
if ($experiment->IsFirewalled()) {
$level = 2
......@@ -276,7 +286,7 @@ sub fatal($)
my($mesg) = $_[0];
#
# Send a message to the testbed list. Append the logfile.
# Send a message to the testbed list.
#
SENDMAIL("$user_name <$user_email>",
"Quarantine Failure for Experiment $pid/$eid",
......@@ -291,3 +301,53 @@ sub fatal($)
die("*** $0:\n".
" $mesg\n");
}
#
# Power mode (panic level 3).
#
# Powers every node in the experiment off, or back on when $reset is set,
# then records the new panic state and mails a notification to the user
# (Cc the swapper, Bcc $TBOPS).
#
# Works on the file-level $experiment, $reset, $level and $POWER. The
# experiment is unlocked on every path out of here.
#
# Returns 0 on success. On failure this does not return: fatal() sends
# mail and dies.
#
sub PowerMode()
{
    my $action   = ($reset ? "on" : "off");
    my @failures = ();

    # Need a list of nodes, going to call power individually.
    my @nodes = $experiment->NodeList(1, 0);
    if (!@nodes) {
        $experiment->Unlock();
        fatal("Could not get node list for experiment");
    }

    foreach my $node_id (@nodes) {
        # One power invocation per node, so a failure on one node does
        # not stop us from trying the rest. The list form of system()
        # runs the program directly, so the node id is never parsed by
        # a shell.
        system($POWER, $action, $node_id);
        if ($?) {
            push(@failures, $node_id);
        }
    }

    # Change the mode even if we failed on some nodes, will need to deal
    # with it by hand.
    if ($reset) {
        $experiment->SetPanicBit(0);
    }
    else {
        # NOTE(review): the caller also enters here when the experiment
        # is already at level 3, in which case $level may not be 3.
        # Confirm that recording $level is what is wanted in that case.
        $experiment->SetPanicBit($level);
    }

    if (@failures) {
        $experiment->Unlock();
        fatal("Could not power $action some nodes: @failures");
    }
    $experiment->Unlock();

    print "Panic Button has been ". ($reset ? "cleared" : "pressed") . "\n";

    #
    # Send email notification to user *and* to tbops.
    #
    SENDMAIL("$user_name <$user_email>",
             "Quarantine notification for Experiment $pid/$eid",
             "Experiment $pid/$eid has been ".
             ($reset ? "released from quarantine" : "quarantined") . "\n",
             "$user_name <$user_email>",
             "Cc:  $swapper_name <$swapper_email>\n".
             "Bcc: $TBOPS");

    return 0;
}
......@@ -522,7 +522,7 @@ $(function ()
function DoQuarantine(mode)
{
mode = (mode ? 1 : 0);
var callback = function(json) {
if (json.code) {
sup.HideModal('#waitwait-modal', function () {
......@@ -544,11 +544,49 @@ $(function ()
}
sup.HideModal('#waitwait-modal');
}
sup.ShowModal('#waitwait-modal');
var xmlthing = sup.CallServerMethod(null, "status", "Quarantine",
{"uuid" : window.UUID,
"quarantine" : mode});
xmlthing.done(callback);
// Handler for hide modal, this is the cancel operation.
$('#confirm-quarantine-modal').on('hidden.bs.modal', function (event) {
$(this).unbind(event);
$('#confirm-quarantine').unbind("click.quarantine");
if (mode) {
// Flip the checkbox back.
$('#quarantine-checkbox')
.prop("checked", false);
}
else {
// Flip the checkbox back.
$('#quarantine-checkbox')
.prop("checked", true);
}
});
// Handler for the confirm button,
$('#confirm-quarantine').bind("click.quarantine", function (event) {
// Unbind the handlers.
$('#confirm-quarantine').unbind("click.quarantine");
$('#confirm-quarantine-modal').off('hidden.bs.modal');
sup.HideModal('#confirm-quarantine-modal', function () {
var args = {"uuid" : window.UUID,
"quarantine" : mode};
if (mode &&
$('#quarantine-poweroff-checkbox').is(":checked")) {
args["poweroff"] = 1;
}
sup.ShowModal('#waitwait-modal');
var xmlthing = sup.CallServerMethod(null, "status",
"Quarantine", args);
xmlthing.done(callback);
});
});
if (mode) {
$('#confirm-quarantine-modal .q-on').removeClass("hidden");
$('#confirm-quarantine-modal .q-off').addClass("hidden");
}
else {
$('#confirm-quarantine-modal .q-on').addClass("hidden");
$('#confirm-quarantine-modal .q-off').removeClass("hidden");
}
sup.ShowModal('#confirm-quarantine-modal');
}
//
......
......@@ -450,6 +450,19 @@ $(function ()
if (_.has(json.value, "sliverstatus")) {
ShowSliverURLs(json.value.sliverstatus);
}
if (0) {
console.info("GetStatus", instanceStatus,
expinfo.paniced, json.value.paniced);
}
// Watch for experiment going into or out of panic mode.
if (expinfo.paniced && !json.value.paniced) {
// Left panic mode.
expinfo.paniced = 0;
}
if (expinfo.paniced || json.value.paniced) {
expinfo.paniced = 1;
instanceStatus = "quarantined";
}
if (instanceStatus != lastStatus) {
APT_OPTIONS.updatePage({ 'instance-status': instanceStatus });
console.info("New Status: ", json);
......@@ -541,6 +554,12 @@ $(function ()
status_html = "<font color=red>failed</font>";
ProgressBarUpdate();
}
else if (instanceStatus == 'quarantined') {
bgtype = "panel-warning";
status_message = "Your experiment has been quarantined";
status_html = "<font color=red>quarantined</font>";
ProgressBarUpdate();
}
else if (instanceStatus == 'imaging') {
bgtype = "panel-warning";
status_message = "Your experiment is busy while we " +
......@@ -952,9 +971,10 @@ $(function ()
if (jacksID === undefined) {
return;
}
// Is the node in recovery.
// Is the node in recovery. Panic mode does not count.
var recovery = false;
if (_.has(details, "recovery") && details.recovery != 0) {
if (!expinfo.paniced &&
_.has(details, "recovery") && details.recovery != 0) {
recovery = true;
inrecovery[node_id] = true;
}
......@@ -3392,6 +3412,7 @@ $(function ()
spinwidth = "66";
}
else if (instanceStatus == "ready" || instanceStatus == "failed" ||
instanceStatus == "quarantined" ||
instanceStatus == "pending" || instanceStatus == "deferred") {
spinwidth = null;
}
......@@ -3675,6 +3696,7 @@ $(function ()
var kill = $('#destroy-terminate-checkbox').is(':checked');
var panic = $('#destroy-quarantine-checkbox').is(':checked');
var freeze = $('#destroy-freeze-checkbox').is(':checked');
var poweroff = $('#destroy-poweroff-checkbox').is(':checked');
var args = {"uuid" : uuid};
if (reason != "") {
args["reason"] = reason;
......@@ -3687,6 +3709,9 @@ $(function ()
}
else if (panic) {
args["quarantine"] = true;
if (poweroff) {
args["poweroff"] = true;
}
}
sup.HideModal("#destroy-experiment-modal", function () {
sup.ShowWaitWait();
......
......@@ -100,6 +100,7 @@ function Do_GetInstanceStatus()
$blob = array();
$blob["status"] = $instance->status();
$blob["canceled"] = $instance->canceled() ? 1 : 0;
$blob["paniced"] = $instance->paniced() ? 1 : 0;
$blob["sliverstatus"] = array();
if ($instance->logfileid()) {
......@@ -226,6 +227,11 @@ function Do_TerminateInstance()
if (StatusSetupAjax(0)) {
return;
}
if ($instance->paniced() && !ISADMIN()) {
SPITAJAX_ERROR(1, "This experiment has been quarantined and ".
"must be terminated my an administrator.");
return;
}
if (!(ISADMIN() || $instance->CanTerminate($this_user))) {
SPITAJAX_ERROR(1, "Not enough permission to terminate");
return;
......@@ -689,11 +695,17 @@ function Do_Snapshot()
SPITAJAX_ERROR(1, "Only registered users can snapshot nodes");
return;
}
if ($instance->paniced()) {
SPITAJAX_ERROR(1, "This experiment has been quarantined.");
return;
}
$this_idx = $this_user->uid_idx();
$uuid = $ajax_args["uuid"];
$checkonly = (isset($ajax_args["checkonly"]) &&
$ajax_args["checkonly"] == 1 ? 1 : 0);
$pid = $instance->pid();
#
# As per Rob, if an experiment is locked down, then only the creator,
# project leader, or an administrator.
......@@ -991,6 +1003,12 @@ function Do_RebootOrReload($which)
SPITAJAX_ERROR(1, "Only registered users can reboot/reload nodes");
return;
}
if ($instance->paniced() && !ISADMIN()) {
SPITAJAX_ERROR(1, "This experiment has been quarantined, only ".
"an admininstrator can reboot/reload.");
return;
}
$this_idx = $this_user->uid_idx();
$uuid = $ajax_args["uuid"];
......@@ -1073,6 +1091,10 @@ function Do_DeleteNodes()
SPITAJAX_ERROR(1, "Only registered users can delete nodes");
return;
}
if ($instance->paniced()) {
SPITAJAX_ERROR(1, "This experiment has been quarantined.");
return;
}
$this_idx = $this_user->uid_idx();
$uuid = $ajax_args["uuid"];
......@@ -1405,16 +1427,19 @@ function Do_Quarantine()
}
$which = ($ajax_args["quarantine"] ? "set" : "clear");
$uuid = $ajax_args["uuid"];
$opt = "";
if (isset($ajax_args["poweroff"]) && $ajax_args["poweroff"]) {
$opt = "-p";
}
$instance = Instance::Lookup($uuid);
if (!$instance) {
SPITAJAX_ERROR(1, "Unknown instance uuid");
return;
}
$webtask = WebTask::CreateAnonymous();
$retval = SUEXEC($this_user->uid(), "nobody",
"webmanage_instance -t " . $webtask->task_id() .
" panic $uuid $which",
" panic $uuid $opt $which",
SUEXEC_ACTION_IGNORE);
$webtask->Refresh();
......@@ -1440,6 +1465,10 @@ function Do_Linktest()
if (StatusSetupAjax(0)) {
return;
}
if ($instance->paniced()) {
SPITAJAX_ERROR(1, "This experiment has been quarantined.");
return;
}
if (!isset($ajax_args["action"])) {
SPITAJAX_ERROR(1, "Missing action");
return;
......@@ -1919,6 +1948,9 @@ function Do_WarnExperiment()
}
elseif (isset($ajax_args["quarantine"]) && $ajax_args["quarantine"]) {
$options .= "-Q ";
if (isset($ajax_args["poweroff"]) && $ajax_args["poweroff"]) {
$options .= "-P ";
}
}
# Freeze user
if (isset($ajax_args["freeze"]) && $ajax_args["freeze"]) {
......@@ -1964,6 +1996,10 @@ function Do_Recovery()
if (StatusSetupAjax(1)) {
return;
}
if ($instance->paniced()) {
SPITAJAX_ERROR(1, "This experiment has been quarantined.");
return;
}
if (!isset($ajax_args["node"])) {
SPITAJAX_ERROR(1, "Missing node argument");
return 1;
......
......@@ -402,6 +402,32 @@ pre {
</div>
</div>
</div>
<!-- Confirm Quarantine -->
<div id='confirm-quarantine-modal' class='modal fade'>
<div class='modal-dialog'>
<div class='modal-content'>
<div class='modal-body'>
<button type='button' class='close' data-dismiss='modal'
aria-hidden='true'>&times;</button>
<center>
<div class="q-on hidden">
<h3>Confirm to Quarantine</h3>
<div>
Power Off Nodes? <input type="checkbox"
id="quarantine-poweroff-checkbox">
</div>
</div>
<div class="q-off hidden">
<h3>Confirm to release from Quarantine</h3>
</div>
<button class='btn btn-danger btn-sm'
style="margin-top: 10px;"
id='confirm-quarantine'>Confirm</button>
</center>
</div>
</div>
</div>
</div>
<!-- Finished -->
<div id='success-modal' class='modal fade'>
<div class='modal-dialog'>
......
......@@ -19,7 +19,10 @@
</div>
<input type=checkbox
id='destroy-quarantine-checkbox' value=yes>
Quarantine Experiment or
Quarantine Experiment
(with power off? <input type=checkbox
id='destroy-poweroff-checkbox' value=yes>)
or
<input type=checkbox style="margin-left: 5px"
id='destroy-terminate-checkbox' value=yes>
Terminate Experiment
......
......@@ -269,8 +269,8 @@ pre {
class="exp-running hidden" <% } %>>
<div class='pull-left'
data-toggle='popover'
data-delay='{"hide":500, "show":500}'
data-content="When checked, only administrator can extend
data-delay='{"hide":100, "show":500}'
data-content="When checked, only administrators can extend
this experiment. No free time is granted to
user at all.">
<label class="checkbox-inline" style='margin-right: 10px;'>
......@@ -282,7 +282,7 @@ pre {
class="exp-running hidden" <% } %>>
<div class='pull-left'
data-toggle='popover'
data-delay='{"hide":500, "show":500}'
data-delay='{"hide":100, "show":500}'
data-content="When checked, the experiment
cannot be terminated by the user unless the
user verifies
......@@ -302,7 +302,7 @@ pre {
class="exp-running hidden" <% } %>>
<div class='pull-left'
data-toggle='popover'
data-delay='{"hide":500, "show":500}'
data-delay='{"hide":100, "show":500}'
data-content="When checked, the experiment
cannot be terminated by the user. Only an
admin can do it.">
......@@ -312,20 +312,6 @@ pre {
Admin Lockdown</label>
</div>
</div>
<div <% if (expinfo.status == "deferred") { %>
class="exp-running hidden" <% } %>>
<div class='pull-left'
data-toggle='popover'
data-delay='{"hide":500, "show":500}'
data-content="When checked, the experiment is put into
Quarantine (emulab panic) mode.">
<label class="checkbox-inline" style='margin-right: 10px;'>
<input type="checkbox" id="quarantine_checkbox" disabled
<% if (expinfo.paniced) { %>checked<% } %> >
<span <% if (expinfo.paniced) { %>class="text-danger"<% } %> >
Quarantined</span></label>
</div>
</div>
<div <% if (expinfo.status == "deferred") { %>
class="exp-running hidden" <% } %>>
<div class='pull-left'>
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment