Commit 73437a5c authored by Leigh B. Stoller

Another Shark hack. Well, maybe not. Batch node_reboots in groups of 8
to avoid a blizzard of reboots all at once. This might solve the
problem of sharks rebooting okay, but failing to become proper members
of the testbed. A good thing to do in any event, especially with
people trying to run 50-node experiments. The reason for 8, of course,
is that I want to isolate each shelf (after sorting the list). I pause
15 seconds between each shelf, and 10 seconds between each batch of 8
PCs.
parent 58798153
...@@ -28,10 +28,11 @@ my $optlist = "dfe:"; ...@@ -28,10 +28,11 @@ my $optlist = "dfe:";
my $TB = "@prefix@"; my $TB = "@prefix@";
# #
# Load the Testbed support stuff. # Testbed Support libraries
# #
push(@INC, "$TB/lib"); use lib "@prefix@/lib";
require libdb; use libdb;
use libtestbed;
my $ssh = "$TB/bin/sshtb -n"; my $ssh = "$TB/bin/sshtb -n";
my $power = "$TB/bin/power"; my $power = "$TB/bin/power";
...@@ -141,27 +142,85 @@ if ($UID && !TBAdmin($UID)) { ...@@ -141,27 +142,85 @@ if ($UID && !TBAdmin($UID)) {
} }
# #
# In force mode, just call the power program for the whole bunch and # Another shark hack. Well, perhaps not. We really don't want 50 nodes
# be done with it. # all rebooting at the same time, PCs *or* sharks. Let's order them
# so that the shelves are grouped together at least, and issue the reboots
# in batches.
# #
if ($force) { my @sortednodes = sort(@nodes);
system("$power cycle @nodes");
exit $? >> 8; while (@sortednodes) {
my @batch = ();
my $i = 0;
my $lastshelf = 0;
while ($i < 8 && @sortednodes > 0) {
my $node = shift(@sortednodes);
my $shelf;
my $unit;
#
# The point of this silliness is to stop at each shelf transition.
#
if (IsShelved($node, \$shelf, \$unit)) {
if ($lastshelf && $lastshelf ne $shelf) {
unshift(@sortednodes, $node);
last;
}
$lastshelf = $shelf;
}
push(@batch, $node);
$i++;
}
if ($force) {
#
# In force mode, call the power program for the whole batch, and
# continue on. We don't wait for them to go down or reboot.
#
system("$power cycle @batch");
if ($?) {
exit ($? >> 8);
}
}
else {
#
# Fire off a reboot process so that we can overlap them all.
# We need the pid so we can wait for them all before proceeding.
#
foreach my $node ( @batch ) {
$mypid = RebootNode($node);
$pids{$node} = $mypid;
}
}
#
# If there are more nodes to go, then let's pause a bit so that we
# do not get a flood of machines coming up all at the same exact
# moment.
#
if (@sortednodes) {
print STDOUT "Pausing to give some nodes time to reboot ...\n";
if ($lastshelf) {
sleep(15);
}
sleep(10);
else
}
} }
# #
# Fire off a reboot process so that we can overlap them all. # In force mode, we are done.
# We need the pid so we can wait for them all before proceeding.
# #
foreach my $node ( @nodes ) { if ($force) {
$mypid = RebootNode($node); exit 0;
$pids{$node} = $mypid;
} }
# #
# Wait for all the reboot children to exit before continuing. # Wait for all the reboot children to exit before continuing.
# #
foreach my $node ( @nodes ) { foreach my $node ( sort(@nodes) ) {
my $mypid = $pids{$node}; my $mypid = $pids{$node};
waitpid($mypid, 0); waitpid($mypid, 0);
...@@ -234,7 +293,7 @@ sub RebootNode { ...@@ -234,7 +293,7 @@ sub RebootNode {
# The ssh can return non-zero exit status, but still have worked. # The ssh can return non-zero exit status, but still have worked.
# FreeBSD for example. # FreeBSD for example.
# #
print STDERR "reboot of $pc returned $?.\n"; print STDERR "reboot of $pc returned $?.\n" if $debug;
# #
# If either ssh is not running or it timed out, # If either ssh is not running or it timed out,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment