Commit 5c1ced81 authored by Leigh Stoller's avatar Leigh Stoller

Changes to "panic" mode.

* Any experiment can now be paniced; formerly, only firewalled
  experiments could be paniced. There are new options in the admin
  menu section of the Show Experiment page. ProtoGeni experiments can
  also be paniced.

  At the moment, this is an admin-only option. Should it be?

* There are now two levels of panic. The first level (and the default)
  is to throw all the machines into the Admin MFS and reboot them.

  Level 2 (OH NO!) disables the control network. Not sure we need two
  levels, but hey, it was easy.

* Admins can swapout a paniced experiment. There is new code in tbswap
  that watches for a paniced experiment and zaps the disks.

* Add menu option to clear the panic. Also admin-only. 

* Minor tweaks to Show Experiment when experiment is paniced.
parent 3026644a
......@@ -566,3 +566,185 @@ done:
return 0;
}
#
# The guts of "panic", which is also used for non-firewalled experiments.
#
# Flags.
# Action selectors accepted as the third argument of Panic().
sub PANIC_PANIC() { 1 }    # Put the experiment into panic mode.
sub PANIC_CLEAR() { 2 }    # Take the experiment back out of panic mode.
sub PANIC_ZAP()   { 3 }    # Swapout of a paniced experiment.
#
# Panic($experiment, $level, $which)
#
# $experiment - Experiment object to operate on.
# $level      - Panic level; only consulted for PANIC_PANIC. Level 1 boots
#               the nodes into the admin MFS, any other value disables the
#               control network. For PANIC_CLEAR and PANIC_ZAP the level is
#               taken from $experiment->paniced() instead.
# $which      - One of PANIC_PANIC(), PANIC_CLEAR() or PANIC_ZAP().
#
# Returns 0 on success and -1 on any failure (including an unknown $which).
#
sub Panic($$$)
{
    my ($experiment, $level, $which) = @_;
    my $pid        = $experiment->pid();
    my $eid        = $experiment->eid();
    my $firewalled = $experiment->IsFirewalled();
    my ($firewall, $port);

    if ($firewalled) {
        if ($experiment->FirewallAndPort(\$firewall, \$port) != 0) {
            print STDERR "Could not determine firewall port for $experiment\n";
            return -1;
        }
    }
    my @nodes = ();
    $experiment->LocalNodeListNames(\@nodes, 1);

    if ($which == PANIC_ZAP()) {
        $level = $experiment->paniced();
        my $zapped = 0;

        #
        # Every failure leaves this block early via "last", so that all
        # of them share the single cleanup path below.
        #
      ZAP: {
            print STDERR "Powering down paniced nodes.\n";
            system("$POWER off @nodes");
            if ($?) {
                print STDERR "Failed to power off all nodes!\n";
                print STDERR "Nodes NOT switched to admin MFS\n";
                last ZAP;
            }

            #
            # Force all nodes into admin mode.
            #
            my %myargs;
            $myargs{'name'}     = "$0";
            $myargs{'on'}       = 1;
            $myargs{'clearall'} = 1;
            if (TBAdminMfsSelect(\%myargs, undef, @nodes)) {
                print STDERR "Failed to force all nodes into admin mode!\n";
                last ZAP;
            }

            #
            # This code is not used for firewalled experiments, so we only
            # have to worry about the control network. This is the same
            # snmpit call the clear path uses to enable it again.
            #
            if ($level == 2) {
                system("$SNMPIT -R $pid $eid");
                if ($?) {
                    print STDERR "snmpit exited with $?\n";
                    last ZAP;
                }
            }

            #
            # Now we power on the nodes and let them boot into the MFS,
            # where they will run the disk bootblock zapper.
            #
            # If this fails, we power off all the nodes again and get a
            # little edgy in our error messages to emphasize the gravity
            # of the situation. Someday we could just move the failed
            # nodes into a special firewalled holding experiment, and
            # let the experiment swapout finish, freeing up the nodes that
            # did succeed.
            #
            print STDERR
                "Booting nodes into admin MFS and zapping bootblocks.\n";
            my @failed = ();
            %myargs = ();
            $myargs{'name'}    = "$0";
            $myargs{'command'} = "sudo /usr/local/bin/diskzap";
            $myargs{'poweron'} = 1;
            $myargs{'retry'}   = 1;
            if (TBAdminMfsRunCmd(\%myargs, \@failed, @nodes)) {
                print STDERR "Failed to invalidate bootblocks on @failed!\n";
                last ZAP;
            }
            $zapped = 1;
        }

        if (!$zapped) {
            print STDERR "Powering nodes off again ...\n";
            system("$POWER off @nodes");
            if ($?) {
                print STDERR "Some nodes may NOT be powered off.\n";
            }
            else {
                print STDERR "All nodes are powered off.\n";
            }
            print STDERR "MAKE SURE THESE NODES DO NOT BOOT FROM DISK!\n";
            SENDMAIL($TBOPS,
                     "Failed to invalidate boot blocks for $pid/$eid\n",
                     "Failed to invalidate boot blocks while swapping out\n".
                     "paniced experiment $pid/$eid.\n");
            return -1;
        }
    }
    elsif ($which == PANIC_CLEAR()) {
        $level = $experiment->paniced();

        if ($level == 1) {
            #
            # Turn admin mode back off and reboot back to the old OS
            #
            print "Allowing all nodes to reboot out of admin mode ...\n";
            my %myargs;
            $myargs{'name'}     = "$0";
            $myargs{'on'}       = 0;
            $myargs{'clearall'} = 0;
            if (TBAdminMfsSelect(\%myargs, undef, @nodes)) {
                print STDERR "Could not turn admin mode off for nodes\n";
                return -1;
            }
            $myargs{'reboot'} = 1;
            $myargs{'wait'}   = 0;
            if (TBAdminMfsBoot(\%myargs, undef, @nodes)) {
                print STDERR "Failed to reboot nodes out of admin mode\n";
                return -1;
            }
        }
        else {
            print "Enabling the control network ...\n";
            if ($firewalled) {
                system("$SNMPIT -e ${firewall}:${port}");
            }
            else {
                system("$SNMPIT -R $pid $eid");
            }
            if ($?) {
                print STDERR "snmpit exited with $?\n";
                return -1;
            }
        }
        $experiment->SetPanicBit(0);
    }
    elsif ($which == PANIC_PANIC()) {
        if ($level == 1) {
            #
            # Boot into the admin MFS. If either step fails, bump the
            # level to 2 so that the control network gets disabled below.
            #
            print "Booting all nodes into admin mode and waiting ...\n";
            my %myargs;
            $myargs{'name'} = "$0";
            $myargs{'on'}   = 1;
            if (TBAdminMfsSelect(\%myargs, undef, @nodes)) {
                print STDERR "Failed to force all nodes into admin mode!\n";
                print STDERR "Falling back to control network disable\n";
                $level = 2;
            }
            else {
                $myargs{'reboot'} = 1;
                $myargs{'wait'}   = 1;
                $myargs{'retry'}  = 1;
                if (TBAdminMfsBoot(\%myargs, undef, @nodes)) {
                    print STDERR "Failed to boot all nodes into admin mode!\n";
                    print STDERR "Falling back to control network disable\n";
                    $level = 2;
                }
            }
        }
        #
        # Reached either because the caller asked for something other than
        # level 1, or because level 1 failed above and fell back.
        #
        if ($level != 1) {
            print "Disabling the control network ...\n";
            if ($firewalled) {
                system("$SNMPIT -d ${firewall}:${port}");
            }
            else {
                system("$SNMPIT -D $pid $eid");
            }
            if ($?) {
                print STDERR "snmpit exited with $?\n";
                return -1;
            }
        }
        # Record the level that was actually applied.
        $experiment->SetPanicBit($level);
    }
    else {
        print STDERR "Unknown panic action '$which' for $experiment\n";
        return -1;
    }
    return 0;
}
#!/usr/bin/perl -wT
#
# EMULAB-COPYRIGHT
# Copyright (c) 2000-2007 University of Utah and the Flux Group.
# Copyright (c) 2000-2011 University of Utah and the Flux Group.
# All rights reserved.
#
use strict;
......@@ -15,17 +15,22 @@ use POSIX qw(isatty setsid);
#
# Print the command line summary on STDERR and exit with -1.
# Does not return to the caller.
#
sub usage()
{
print(STDERR
"Usage: panicbutton [-r] <pid> <eid>\n".
"Usage: panic [-l level] <pid> <eid>\n".
" panic -r <pid> <eid>\n".
"switches and arguments:\n".
"-l level - Level 1; reboot nodes into the admin MFS\n".
" - Level 2; disable the control network\n".
"-r - Reset panic state (admin people only)\n".
"<pid> - The project the experiment belongs to\n".
"<eid> - The experiment name (id)\n");
exit(-1);
}
my $optlist = "r";
my $optlist = "rl:";
my $reset = 0;
my $level = 1;
sub fatal($);
sub DoIt();
#
# Exit codes are important; they tell the web page what has happened so
......@@ -63,6 +68,7 @@ my $TBOPS = "@TBOPSEMAIL@";
use lib "@prefix@/lib";
use libdb;
use libtestbed;
use Firewall;
use Experiment;
use User;
......@@ -93,6 +99,11 @@ if (@ARGV != 2) {
if (defined($options{"r"})) {
$reset = 1;
}
if (defined($options{"l"})) {
    #
    # Take the level from the command line. Only 1 and 2 are valid, and
    # since we run with taint checking the value is untainted by capture.
    #
    if ($options{"l"} =~ /\A([12])\z/) {
        $level = $1;
    }
    else {
        usage();
    }
}
my $this_user = User->ThisUser();
if (! defined($this_user)) {
......@@ -108,14 +119,6 @@ if (!defined($experiment)) {
my $pid = $experiment->pid();
my $eid = $experiment->eid();
#
# See if the experiment is firewalled. Error if not.
#
if (!$experiment->IsFirewalled()) {
die("*** $0:\n".
" $experiment is not firewalled!\n");
}
#
# Verify that this person is allowed to press the panic button.
# Note that any script down the line has to do an admin check also.
......@@ -123,7 +126,7 @@ if (!$experiment->IsFirewalled()) {
if ($UID && !$this_user->IsAdmin() &&
!$experiment->AccessCheck($this_user, TB_EXPT_MODIFY)) {
die("*** $0:\n".
" You do not have permission to end this experiment!\n");
" You do not have permission for this experiment!\n");
}
#
......@@ -152,7 +155,10 @@ $experiment->LockTables() == 0
#
if ($reset) {
ExitWithStatus(1, "Experiment $pid/$eid is not paniced!\n")
if ($experiment->state() ne EXPTSTATE_PANICED);
if ($experiment->state() ne EXPTSTATE_PANICED &&
$experiment->paniced());
$level = $experiment->paniced();
}
else {
ExitWithStatus(1, "Experiment $pid/$eid is not active!\n")
......@@ -167,42 +173,23 @@ else {
$experiment->Lock(($reset ? EXPTSTATE_ACTIVE : EXPTSTATE_PANICED), 1) == 0
or fatal("Could not lock $experiment");
#
# XXX - At this point a failure is going to leave things in an
# inconsistent state. Be sure to call fatal() only since we are
# going into the background, and we have to send email since no
# one is going to see printed error messages (output goes into the
# log file, which will be sent along in the email).
#
#
# Get firewall node and port info
#
my ($firewall, $port);
if ($experiment->FirewallAndPort(\$firewall, \$port) != 0) {
fatal("Could not determine firewall port for $experiment");
# Force level 2 for firewalled experiments.
# A firewalled experiment is always paniced at level 2; a reset keeps the
# level that was looked up earlier.
if ($experiment->IsFirewalled() && !$reset) {
    $level = 2;
}
#
# Call snmpit.
# XXX - At this point a failure is going to leave things in an
# inconsistent state.
#
if ($reset) {
system("$snmpit -e ${firewall}:${port}");
if ($?) {
fatal("snmpit exited with $?!");
}
$experiment->SetPanicBit(0);
print "Panic situation has been cleared!\n";
}
else {
system("$snmpit -d ${firewall}:${port}");
if ($?) {
fatal("snmpit exited with $?!");
}
$experiment->SetPanicBit(1);
print "Panic Button has been pressed!\n";
#
# Hand the real work to the Firewall library. The flag for undoing a
# panic is PANIC_CLEAR.
#
if (Firewall::Panic($experiment, $level,
		    ($reset ?
		     Firewall::PANIC_CLEAR() : Firewall::PANIC_PANIC()))) {
    fatal("Failure in Firewall::Panic()");
}
$experiment->Unlock();
print "Panic Button has been ". ($reset ? "cleared" : "pressed") . "\n";
#
# Send email notification to user *and* to tbops.
......@@ -215,8 +202,7 @@ SENDMAIL("$user_name <$user_email>",
"$user_name <$user_email>",
"Cc: $swapper_name <$swapper_email>\n".
"Bcc: $TBOPS");
exit 0;
exit(0);
sub fatal($)
{
......
......@@ -649,10 +649,17 @@ sub doSwapout($) {
# Nodes behind a firewall are treated special.
# See undoFWNodes for details.
#
# Non-firewalled experiments can now be paniced, and the swapout
# action is similar.
#
if ($firewalled && undoFWNodes($experiment)) {
tblog_set_cleanup(0);
return 1;
}
elsif ($experiment->paniced() &&
Firewall::Panic($experiment, 0, Firewall::PANIC_ZAP())) {
return 1;
}
#
# Perform swapout time admin actions. Right now there is at most
......
......@@ -731,6 +731,8 @@ class Experiment
$dpdbname = $exprow["dpdbname"];
$dpdbpassword= $exprow["dpdbpassword"];
$uuid = $exprow["eid_uuid"];
$paniced = $exprow["paniced"];
$panic_date = $exprow["panic_date"];
$autoswap_hrs= ($autoswap_timeout/60.0);
$idleswap_hrs= ($idleswap_timeout/60.0);
......@@ -934,6 +936,14 @@ class Experiment
</tr>\n";
}
if ($paniced) {
echo "<tr>
<td>Paniced on: </td>
<td class=left>$panic_date</td>
</tr>\n";
}
if ($linktest_pid) {
$linktest_running = "<b>(Linktest Running)</b>";
}
......
......@@ -18,7 +18,9 @@ $isadmin = ISADMIN();
#
$reqargs = RequiredPageArguments("experiment", PAGEARG_EXPERIMENT);
$optargs = OptionalPageArguments("canceled", PAGEARG_BOOLEAN,
"confirmed", PAGEARG_BOOLEAN);
"confirmed", PAGEARG_BOOLEAN,
"level", PAGEARG_INTEGER,
"clear", PAGEARG_BOOLEAN);
# Need these below.
$pid = $experiment->pid();
......@@ -42,8 +44,19 @@ PAGEHEADER("Press the Panic Button!");
# Verify permissions.
#
if (!$experiment->AccessCheck($this_user, $TB_EXPT_MODIFY)) {
USERERROR("You do not have permission to press the panic button for ".
"experiment $eid!", 1);
USERERROR("You do not have permission to press/clear the panic button.", 1);
}
# No level given means level 1; a given level has to be 1 or 2.
if (!isset($level)) {
    $level = 1;
}
elseif ($level < 1 || $level > 2) {
    USERERROR("Improper level argument", 1);
}
# A missing "clear" argument means the button is being pressed.
if (!isset($clear)) {
    $clear = 0;
}
echo $experiment->PageHeader();
......@@ -58,7 +71,8 @@ echo "<br>\n";
if (!isset($confirmed)) {
echo "<center><h3><br>
Are you <b>REALLY</b>
sure you want to press the panic button for Experiment '$eid?'
sure you want to " . ($clear ? "clear" : "press") .
" the panic button for Experiment '$eid?'
</h3>\n";
$experiment->Show(1);
......@@ -68,6 +82,10 @@ if (!isset($confirmed)) {
echo "<form action='$url' method=post>";
echo "<b><input type=submit name=confirmed value=Confirm></b>\n";
echo "<b><input type=submit name=canceled value=Cancel></b>\n";
echo "<b><input type=hidden name=level value=$level></b>\n";
if ($clear) {
echo "<b><input type=hidden name=clear value=$clear></b>\n";
}
echo "</form>\n";
echo "</center>\n";
......@@ -78,8 +96,19 @@ if (!isset($confirmed)) {
#
# We run a wrapper script that does all the work.
#
STARTBUSY("Pressing the panic button");
$retval = SUEXEC($uid, "$unix_pid,$unix_gid", "webpanic $pid $eid",
# Pick the busy message and the webpanic option for clear versus press.
STARTBUSY($clear ? "Clearing the panic button" : "Pressing the panic button");
$opt = ($clear ? "-r" : "-l $level");
$retval = SUEXEC($uid, "$unix_pid,$unix_gid", "webpanic $opt $pid $eid",
SUEXEC_ACTION_IGNORE);
#
......@@ -105,8 +134,13 @@ if ($retval) {
echo "<blockquote><pre>$suexec_output<pre></blockquote>";
}
else {
echo "<h3>The panic button has been pressed!</h3><br>
You will need to contact testbed operations to continue.\n";
if ($clear) {
echo "<h3>The panic situation has been cleared!</h3><br>\n";
}
else {
echo "<h3>The panic button has been pressed!</h3><br>
You will need to contact testbed operations to continue.\n";
}
}
#
......
<?php
#
# EMULAB-COPYRIGHT
# Copyright (c) 2000-2010 University of Utah and the Flux Group.
# Copyright (c) 2000-2011 University of Utah and the Flux Group.
# All rights reserved.
#
include("defs.php3");
......@@ -556,8 +556,10 @@ if (isset($classes['mote']) && $expstate == $TB_EXPTSTATE_ACTIVE) {
}
if ($isadmin) {
if ($expstate == $TB_EXPTSTATE_ACTIVE) {
if (!$geniflags) {
if ($expstate == $TB_EXPTSTATE_ACTIVE ||
$expstate == $TB_EXPTSTATE_PANICED) {
if ($expstate == $TB_EXPTSTATE_ACTIVE && !$geniflags) {
SUBMENUSECTION("Beta-Test Options");
WRITESUBMENUBUTTON("Restart Experiment",
CreateURL("swapexp", $experiment,
......@@ -568,17 +570,30 @@ if ($isadmin) {
SUBMENUSECTION("Admin Options");
if (!$geniflags) {
if ($expstate == $TB_EXPTSTATE_ACTIVE && !$geniflags) {
WRITESUBMENUBUTTON("Send an Idle Info Request",
CreateURL("request_idleinfo", $experiment));
WRITESUBMENUBUTTON("Send a Swap Request",
CreateURL("request_swapexp", $experiment));
}
WRITESUBMENUBUTTON("Force Swap Out (Idle-Swap)",
CreateURL("swapexp", $experiment,
"inout", "out", "force", 1));
if ($expstate == $TB_EXPTSTATE_PANICED) {
WRITESUBMENUBUTTON("Clear Panic Mode",
CreateURL("panicbutton", $experiment,
"clear", 1));
}
else {
WRITESUBMENUBUTTON("Panic Mode (level 1)",
CreateURL("panicbutton", $experiment,
"level", 1));
WRITESUBMENUBUTTON("Panic Mode (level 2)",
CreateURL("panicbutton", $experiment,
"level", 2));
WRITESUBMENUBUTTON("Force Swap Out (Idle-Swap)",
CreateURL("swapexp", $experiment,
"inout", "out", "force", 1));
}
SUBMENUSECTIONEND();
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment