All new accounts created on GitLab now require administrator approval. If you invite any collaborators, please let Flux staff know so they can approve the accounts.

Commit 69b90e79 authored by Mike Hibler

Attempt to make firewall experiment swapout more robust by addressing a
couple of MFS booting problems:
 * in the RPC power controller, make sure that an "on" command succeeds
   by checking the status, retrying if it failed (we already did this for
   "off")
 * if nodes fail to boot up the MFS after a power on, try again with a
   power cycle.  I have seen "power on" leave pc600s hung, and a power
   cycle seems to cure it.
parent 30ff680b
......@@ -70,6 +70,8 @@ my $sleepwait = 10;
# 'prepare' 1 to run prepare on the way down
# 'wait' 1 to wait for all nodes to reach MFS before returning,
# 0 to return after reboot/power-on
# 'retry' 1 if we should retry once on nodes that do not make it
# into the MFS.
#
# Returns zero if all nodes successfully complete the reboot (wait==0)
# or reach the MFS (wait==1), and non-zero otherwise. If the $failed ref
......@@ -88,6 +90,10 @@ sub TBAdminMfsBoot($$@)
my $reboot = $args->{'reboot'};
my $wait = $args->{'wait'};
my $prepare = $args->{'prepare'};
my $retry = $args->{'retry'};
$retry = 0
if (!defined($retry) || !$wait);
#
# Reboot or power on nodes...
......@@ -201,16 +207,42 @@ sub TBAdminMfsBoot($$@)
}
}
if (@failed) {
print STDERR "*** $me:\n".
" Failed to boot " . ($on ? "MFS" : "regular OS") .
" on: @failed\n";
@$failedref = @failed
if (defined($failedref));
return 1;
if (@failed == 0) {
print STDOUT "All nodes are up.\n";
return 0;
}
print STDOUT "All nodes are up.\n";
print STDERR "*** $me:\n".
" Failed to boot " . ($on ? "MFS" : "regular OS") .
" on: @failed\n";
#
# Failures to boot after powering on may be due to issues with
# power-on and not related to the nodes themselves. So we will
# retry once, trying a power cycle the second time around.
# This addresses cases where I have seen power on leave a machine
# (pc600) in a hung state that can be cured by cycling (I have no
# idea why...).
#
if ($reboot < 0 && $retry) {
print STDERR "Retrying on failed nodes ...\n";
my @myfailed;
my %myargs;
$myargs{'name'} = $me;
$myargs{'on'} = $on;
$myargs{'reboot'} = 1;
$myargs{'wait'} = $wait;
$myargs{'prepare'} = $prepare;
$myargs{'retry'} = 0;
if (TBAdminMfsBoot(\%myargs, \@myfailed, @failed) == 0) {
return 0;
}
@failed = @myfailed;
}
@$failedref = @failed
if (defined($failedref));
return 1;
}
return 0;
......@@ -344,6 +376,8 @@ sub TBAdminMfsSelect($$@)
# a timeout.
# 'timestamp' if timestamps after significant events are desired
# 'prepare' 1 if nodes should be rebooted with "prepare" flag
# 'retry' 1 if we should retry once on nodes that do not make it
# into the MFS.
#
# Returns zero if all nodes successfully run the command.
# If the $failed ref is defined, it is an arrayref in which we return the
......@@ -368,6 +402,7 @@ sub TBAdminMfsRunCmd($$@)
my $pinterval = $args->{'pinterval'};
my $pcookie = $args->{'pcookie'};
my $prepare = $args->{'prepare'};
my $retry = $args->{'retry'};
# we always need a value
$timeout = $commandtimo
......@@ -416,6 +451,7 @@ sub TBAdminMfsRunCmd($$@)
$myargs{'name'} = $me;
$myargs{'on'} = 1;
$myargs{'reboot'} = $poweron ? -1 : 1;
$myargs{'retry'} = $retry;
$myargs{'prepare'} = $prepare;
$myargs{'wait'} = 1;
my @failed = ();
......
......@@ -2,7 +2,7 @@
#
# EMULAB-COPYRIGHT
# Copyright (c) 2000-2002, 2005 University of Utah and the Flux Group.
# Copyright (c) 2000-2002, 2005, 2006 University of Utah and the Flux Group.
# All rights reserved.
#
......@@ -133,11 +133,13 @@ sub rpc27ctrl {
for my $try (1..$ntries) {
$status = syncandsend($controller, $TIP, $command, undef);
#
# Double check that an off command really turned the outlet(s)
# off. We assume reliable power off in our security environment
# and we have seen cases where powering off doesn't.
# Double check that an off or on command really turned the
# outlet(s) off or on. We assume reliable power off in our
# security environment and we have seen cases where powering
# off doesn't. Reliable power on also avoids many unnecessary
# failures during firewalled experiment swapout.
#
if ($status == 0 && $cmd eq "off") {
if ($status == 0 && ($cmd eq "off" || $cmd eq "on")) {
my %stathash;
for my $stry (1..$ntries) {
$status = syncandsend($controller, $TIP, "status",
......@@ -149,7 +151,7 @@ sub rpc27ctrl {
my @noutlets = ();
for my $o (split(",", $outlet)) {
if (!defined($stathash{"outlet$o"}) ||
$stathash{"outlet$o"} !~ /^off$/i) {
$stathash{"outlet$o"} !~ /^$cmd$/i) {
push(@noutlets, $o);
}
}
......@@ -159,12 +161,12 @@ sub rpc27ctrl {
$status = -1;
if ($try == $ntries) {
print STDERR
"*** Failed to turn off $controller $outlet\n";
"*** Failed to turn $cmd $controller $outlet\n";
}
}
} elsif ($status > 0) {
print STDERR
"*** Post-off status command failed on $controller\n";
"*** Post-$cmd status command failed on $controller\n";
}
}
last
......
......@@ -1578,6 +1578,7 @@ sub undoFWNodes($$;@) {
$myargs{'name'} = "tbswap";
$myargs{'command'} = "sudo /usr/local/bin/diskzap";
$myargs{'poweron'} = 1;
$myargs{'retry'} = 1;
if (TBAdminMfsRunCmd(\%myargs, \@failed, @nodes)) {
$fwerr = "Failed to invalidate bootblocks on @failed!";
@fwstate = ("Firewall is NOT in place",
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment