Commit 69b90e79 authored by Mike Hibler

Attempt to make firewall experiment swapout more robust by addressing a
couple of MFS booting problems:
 * in the RPC power controller, make sure that an "on" command succeeds
   by checking the status, retrying if it failed (we already did this for
   "off")
 * if nodes fail to boot up the MFS after a power on, try again with a
   power cycle.  I have seen "power on" leave pc600s hung, and a power
   cycle seems to cure it.
parent 30ff680b
......@@ -70,6 +70,8 @@ my $sleepwait = 10;
# 'prepare' 1 to run prepare on the way down
# 'wait' 1 to wait for all nodes to reach MFS before returning,
# 0 to return after reboot/power-on
# 'retry' 1 if we should retry once on nodes that do not make it
# into the MFS.
#
# Returns zero if all nodes successfully complete the reboot (wait==0)
# or reach the MFS (wait==1), and non-zero otherwise. If the $failed ref
......@@ -88,6 +90,10 @@ sub TBAdminMfsBoot($$@)
my $reboot = $args->{'reboot'};
my $wait = $args->{'wait'};
my $prepare = $args->{'prepare'};
my $retry = $args->{'retry'};
$retry = 0
if (!defined($retry) || !$wait);
#
# Reboot or power on nodes...
......@@ -201,16 +207,42 @@ sub TBAdminMfsBoot($$@)
}
}
if (@failed) {
print STDERR "*** $me:\n".
" Failed to boot " . ($on ? "MFS" : "regular OS") .
" on: @failed\n";
@$failedref = @failed
if (defined($failedref));
return 1;
if (@failed == 0) {
print STDOUT "All nodes are up.\n";
return 0;
}
print STDOUT "All nodes are up.\n";
print STDERR "*** $me:\n".
" Failed to boot " . ($on ? "MFS" : "regular OS") .
" on: @failed\n";
#
# Failures to boot after powering on may be due to issues with
# power-on and not related to the nodes themselves. So we will
# retry once, trying a power cycle the second time around.
# This addresses cases where I have seen power on leave a machine
# (pc600) in a hung state that can be cured by cycling (I have no
# idea why...).
#
if ($reboot < 0 && $retry) {
print STDERR "Retrying on failed nodes ...\n";
my @myfailed;
my %myargs;
$myargs{'name'} = $me;
$myargs{'on'} = $on;
$myargs{'reboot'} = 1;
$myargs{'wait'} = $wait;
$myargs{'prepare'} = $prepare;
$myargs{'retry'} = 0;
if (TBAdminMfsBoot(\%myargs, \@myfailed, @failed) == 0) {
return 0;
}
@failed = @myfailed;
}
@$failedref = @failed
if (defined($failedref));
return 1;
}
return 0;
......@@ -344,6 +376,8 @@ sub TBAdminMfsSelect($$@)
# a timeout.
# 'timestamp' if timestamps after significant events are desired
# 'prepare' 1 if nodes should be rebooted with "prepare" flag
# 'retry' 1 if we should retry once on nodes that do not make it
# into the MFS.
#
# Returns zero if all nodes successfully run the command.
# If the $failed ref is defined, it is an arrayref in which we return the
......@@ -368,6 +402,7 @@ sub TBAdminMfsRunCmd($$@)
my $pinterval = $args->{'pinterval'};
my $pcookie = $args->{'pcookie'};
my $prepare = $args->{'prepare'};
my $retry = $args->{'retry'};
# we always need a value
$timeout = $commandtimo
......@@ -416,6 +451,7 @@ sub TBAdminMfsRunCmd($$@)
$myargs{'name'} = $me;
$myargs{'on'} = 1;
$myargs{'reboot'} = $poweron ? -1 : 1;
$myargs{'retry'} = $retry;
$myargs{'prepare'} = $prepare;
$myargs{'wait'} = 1;
my @failed = ();
......
......@@ -2,7 +2,7 @@
#
# EMULAB-COPYRIGHT
# Copyright (c) 2000-2002, 2005 University of Utah and the Flux Group.
# Copyright (c) 2000-2002, 2005, 2006 University of Utah and the Flux Group.
# All rights reserved.
#
......@@ -133,11 +133,13 @@ sub rpc27ctrl {
for my $try (1..$ntries) {
$status = syncandsend($controller, $TIP, $command, undef);
#
# Double check that an off command really turned the outlet(s)
# off. We assume reliable power off in our security environment
# and we have seen cases where powering off doesn't.
# Double check that an off or on command really turned the
# outlet(s) off or on. We assume reliable power off in our
# security environment and we have seen cases where powering
# off doesn't. Reliable power on also avoids many unnecessary
# failures during firewalled experiment swapout.
#
if ($status == 0 && $cmd eq "off") {
if ($status == 0 && ($cmd eq "off" || $cmd eq "on")) {
my %stathash;
for my $stry (1..$ntries) {
$status = syncandsend($controller, $TIP, "status",
......@@ -149,7 +151,7 @@ sub rpc27ctrl {
my @noutlets = ();
for my $o (split(",", $outlet)) {
if (!defined($stathash{"outlet$o"}) ||
$stathash{"outlet$o"} !~ /^off$/i) {
$stathash{"outlet$o"} !~ /^$cmd$/i) {
push(@noutlets, $o);
}
}
......@@ -159,12 +161,12 @@ sub rpc27ctrl {
$status = -1;
if ($try == $ntries) {
print STDERR
"*** Failed to turn off $controller $outlet\n";
"*** Failed to turn $cmd $controller $outlet\n";
}
}
} elsif ($status > 0) {
print STDERR
"*** Post-off status command failed on $controller\n";
"*** Post-$cmd status command failed on $controller\n";
}
}
last
......
......@@ -1578,6 +1578,7 @@ sub undoFWNodes($$;@) {
$myargs{'name'} = "tbswap";
$myargs{'command'} = "sudo /usr/local/bin/diskzap";
$myargs{'poweron'} = 1;
$myargs{'retry'} = 1;
if (TBAdminMfsRunCmd(\%myargs, \@failed, @nodes)) {
$fwerr = "Failed to invalidate bootblocks on @failed!";
@fwstate = ("Firewall is NOT in place",
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment