Commit 10973d41 authored by Leigh B. Stoller's avatar Leigh B. Stoller
Browse files

Run reboots in parallel by forking a subprocess in Perl, and then

wait for all of them. Also protect against an ssh hang with a
surrounding alarm. Also reduce output for web page.
parent b600feaf
......@@ -28,13 +28,14 @@ my $rsh = "sshtb -q";
my $ssh = "sshtb -q";
my $power = "$TB/bin/power";
my $ping = "/sbin/ping";
my $dbg = 1;
my $dbg = 0;
my %imagepaths = ();
my %imageparts = ();
my %nodeos = ();
my %nodepath = ();
my %nodepart = ();
my %waitfor = ();
my %pids = ();
my $SAVEUID = $UID;
my @row;
......@@ -245,65 +246,24 @@ foreach my $node ( keys %nodeos ) {
}
#
# See if the machine is pingable. If its not pingable, then
# we just power cycle the machine rather than wait for a bunch
# of ssh/rsh commands to time out.
#
print STDERR "Pinging $pc ... \n" if $dbg;
if (-e $ping) {
open(PING, "$ping -c 4 $pc 2>&1 |");
}
else {
die("PING command $ping not found!\n");
}
do {
}
until ( <PING> =~ /transmitted, (\d*) packets received/ );
close(PING);
print STDERR "Got back $1 ping packets from $pc.\n" if $dbg;
#
# Power cycle if the machine is dead. It will come back up with the
# proper OS, cause we modified the database above.
#
if ( $1 == 0 ) {
print STDERR "$pc appears to be dead. Power cycling ...\n";
PowerCycle($pc);
next;
}
#
# Machine is pingable at least. Try to reboot it gracefully,
# or power cycle anyway if that does not work. To this, we must
# change our real UID to root so that ssh will work.
# Fire off a reboot process so that we can overlap them all.
# We need the pid so we can wait for them all before proceeding.
#
print STDERR "Rebooting $pc ...\n";
$UID = 0;
if (system("$ssh -l root $pc /sbin/reboot") == 0) {
$UID = $SAVEUID;
print STDERR "$pc appears to be rebooting\n" if $dbg;
next;
}
$UID = $SAVEUID;
$pid = RebootNode($pc);
$pids{$pc} = $pid;
}
#
# Okay, before we power cycle lets really make sure. On FreeBSD, it might
# have rebooted, but since the connection is terminated, system returns
# an error status. So, lets ping it again and if its pingable, the
# reboot must have failed. If it is not pingable, I assume that the
# reboot really worked, and the exit value can be ignored.
#
my $exit_value = $? >> 8;
print STDERR "reboot returned $exit_value. Lets make sure it dies\n"
if $dbg;
#
# Wait for all the reboot children to exit before continuing.
#
foreach my $node ( keys %nodeos ) {
my $pc = $node;
my $pid = $pids{$pc};
if (WaitTillDead($pc) == 0) {
next;
waitpid($pid, 0);
if ($?) {
die("Reboot of node $pc failed!");
}
print STDERR "$pc appears to still be running Power cycling ...\n";
PowerCycle($pc);
}
print STDOUT "Waiting for testbed nodes to finish rebooting ...\n";
......@@ -354,8 +314,8 @@ sub WaitTillAlive {
# packets are sent from all the pings, before it will exit. So,
# loop doing a bunch of shorter pings.
#
for ($i = 0; $i < 30; $i++) {
open(PING, "$ping -c 5 $pc 2>&1 |");
for ($i = 0; $i < 40; $i++) {
open(PING, "$ping -c 4 $pc 2>&1 |");
do {
$_ = <PING>;
if ( $_ =~ /bytes from/ ) {
......@@ -366,10 +326,108 @@ sub WaitTillAlive {
until ( $_ =~ /transmitted, (\d*) packets received/ );
}
close(PING);
print STDERR "$pc is not responding. Better check into it\n";
print STDERR "$pc is not responding. Better check into it\n" if $dbg;
return 1;
}
#
# Reboot a node in a child process. Return the pid to the parent so
# that it can wait on all the children later.
#
sub RebootNode {
    #
    # Usage: $pid = RebootNode($pc)
    #
    # Forks a child process that reboots node $pc and returns the child's
    # pid to the caller, so the caller can waitpid() on it later. The
    # child never returns from this function: it leaves via exit(0) once
    # the node has been rebooted or power cycled, or via die()/exit(1)
    # on an internal failure, which gives the waiting parent a non-zero
    # status.
    #
    # In the child: ping the node and power cycle it if no packets come
    # back. Otherwise run /sbin/reboot on it through $ssh under a 15
    # second alarm, and power cycle if ssh had to be killed or if
    # WaitTillDead() says the node is still answering pings afterwards.
    #
    my ($pc) = @_;

    print STDOUT "Rebooting $pc ...\n";

    #
    # fork() returns undef on failure. Without this check the parent
    # would fall through into the child-only code below and end up
    # calling exit(0) on the whole script.
    #
    my $pid = fork();
    if (!defined($pid)) {
        die("Could not fork a reboot process for $pc: $!\n");
    }
    if ($pid) {
        return $pid;
    }

    #
    # Child from here on.
    #
    # See if the machine is pingable. If it is not pingable, then
    # we just power cycle the machine rather than wait for a bunch
    # of ssh/rsh commands to time out.
    #
    print STDERR "Pinging $pc ... \n" if $dbg;

    if (! -e $ping) {
        die("PING command $ping not found!\n");
    }
    open(PING, "$ping -c 4 $pc 2>&1 |")
        or die("Could not run $ping on $pc: $!\n");

    #
    # Look for the summary line. Stop at EOF too: if ping exits without
    # printing a summary, <PING> returns undef forever and an unguarded
    # match loop would never terminate. No summary is treated the same
    # as zero packets received.
    #
    my $received = 0;
    while (defined(my $line = <PING>)) {
        if ($line =~ /transmitted, (\d*) packets received/) {
            $received = $1 || 0;
            last;
        }
    }
    close(PING);
    print STDERR "Got back $received ping packets from $pc.\n" if $dbg;

    #
    # Power cycle if the machine is dead. It will come back up with the
    # proper OS, cause we modified the database above.
    #
    if ($received == 0) {
        print STDERR "$pc appears to be dead. Power cycling ...\n" if $dbg;
        PowerCycle($pc);
        exit(0);
    }

    #
    # Machine is pingable at least. Try to reboot it gracefully,
    # or power cycle anyway if that does not work.
    #
    print STDERR "Rebooting $pc with ssh command ...\n" if $dbg;

    #
    # Run the ssh command in its own child process, protected by an
    # alarm to ensure that the ssh is not hung up forever if the machine
    # is in some funky state.
    #
    my $syspid = fork();
    if (!defined($syspid)) {
        die("Could not fork ssh process for $pc: $!\n");
    }
    if ($syspid == 0) {
        # To do this, we must change our real UID to root so that ssh
        # will work.
        $UID = 0;
        { exec("$ssh -l root $pc /sbin/reboot") };

        # Only reached when exec itself failed; do not report success.
        print STDERR "Could not exec $ssh for $pc: $!\n" if $dbg;
        exit(1);
    }

    #
    # Wait for the ssh child. This must be $syspid; $pid is 0 here
    # because we are ourselves the child of the first fork. Save the
    # wait status right away so nothing later can disturb it.
    #
    my $status;
    {
        local $SIG{ALRM} = sub { kill("TERM", $syspid); };
        alarm 15;
        waitpid($syspid, 0);
        $status = $?;
        alarm 0;
    }

    #
    # If ssh times out (was terminated by the SIGTERM sent from the
    # alarm handler), just punch the button. The low 7 bits of the wait
    # status hold the terminating signal number.
    #
    if (($status & 127) == 15) {
        print STDERR "$pc appears to be wedged. Power cycling ...\n"
            if $dbg;
        PowerCycle($pc);
        exit(0);
    }

    #
    # Okay, before we power cycle lets really make sure. On FreeBSD, it
    # might have rebooted, but since the connection is terminated, ssh
    # returns an error status. So, lets ping it again and if its
    # pingable, the reboot must have failed. If it is not pingable, I
    # assume that the reboot really worked, and the exit value can be
    # ignored.
    #
    my $exit_value = $status >> 8;
    print STDERR "reboot returned $exit_value. Lets make sure it dies\n"
        if $dbg;

    if (WaitTillDead($pc) == 0) {
        exit(0);
    }

    print STDERR "$pc appears to still be running. Power cycling ...\n"
        if $dbg;
    PowerCycle($pc);
    exit(0);
}
sub WaitTillDead {
local($pc) = @_;
......@@ -379,8 +437,8 @@ sub WaitTillDead {
# packets are sent from all the pings, before it will exit. So,
# loop doing a bunch of shorter pings.
#
for ($i = 0; $i < 12; $i++) {
open(PING, "$ping -c 5 $pc 2>&1 |");
for ($i = 0; $i < 15; $i++) {
open(PING, "$ping -c 4 $pc 2>&1 |");
do {
}
until ( <PING> =~ /transmitted, (\d*) packets received/ );
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment