Commit fb805ce3 authored by Mike Hibler's avatar Mike Hibler

bug: make sure it recognizes ssh "failing" with a "connection reset by
     remote host" error, which commonly happens if the machine reboots
     before the ssh connection finishes.
enhancement: attempt to tighten up the timing by doing 8 pings in 2
     seconds (using the root-only -i option)
parent 4b8cbd97
......@@ -5,13 +5,13 @@ use Getopt::Std;
#
# Reboot a node. Will power cycle the node as a last resort.
#
# usage: node_reboot [-d] node [node ...]
# usage: node_reboot [-d] [-f] node [node ...]
# Exit value is 0 if all nodes reboot okay, or the number of nodes
# that could not be rebooted.
#
sub usage()
{
print STDOUT "Usage: node_reboot [-d] node [node ...]\n" .
print STDOUT "Usage: node_reboot [-d] [-f] node [node ...]\n" .
"Use the -d option to turn on debugging\n" .
"Use the -f option to shoot the node in the head\n";
exit(-1);
......@@ -177,14 +177,7 @@ sub RebootNode {
#
# ping returns 0 if any packets make it through.
#
system("$ping -q -c 4 -t 4 $pc >/dev/null 2>&1");
$status = $? >> 8;
print STDERR "Ping $pc returned $status.\n" if $debug;
#
# Power cycle if the machine is dead.
#
if ($status) {
if (! DoesPing($pc)) {
print STDERR "$pc appears to be dead. Power cycling ...\n" if $debug;
if (PowerCycle($pc)) {
exit(-1);
......@@ -196,7 +189,7 @@ sub RebootNode {
# Machine is pingable at least. Try to reboot it gracefully,
# or power cycle anyway if that does not work.
#
print STDERR "Rebooting $pc with ssh command ...\n" if $debug;
print STDERR "Trying ssh reboot of $pc ...\n" if $debug;
#
# Run an ssh command in a child process, protected by an alarm to
......@@ -217,10 +210,17 @@ sub RebootNode {
print STDERR "reboot returned $?.\n" if $debug;
#
# Did the ssh time out? Send it a ping of death.
# If either ssh is not running or it timed out,
# send it a ping of death.
#
if ($? == 15) {
print STDERR "$pc is wedged. Sending a POD.\n" if $debug;
if ($? == 256 || $? == 15) {
if ($? == 256) {
print STDERR "$pc is not running sshd.\n" if $debug;
} else {
print STDERR "$pc is wedged.\n" if $debug;
}
print STDERR "Trying Ping-of-Death on $pc ...\n" if $debug;
system("$ipod $pc");
}
}
......@@ -271,16 +271,9 @@ sub WaitTillDead {
# packets are sent from all the pings, before it will exit. So,
# loop doing a bunch of shorter pings.
#
for ($i = 0; $i < 15; $i++) {
system("$ping -q -c 4 -t 4 $pc >/dev/null 2>&1");
$status = $? >> 8;
#
# Returns 0 if any packets are returned. Returns 2 if pingable
# but no packets are returned. Other non-zero error codes indicate
# other problems. Assume that these other problems do not matter.
#
if ($status) {
for ($i = 0; $i < 30; $i++) {
if (! DoesPing($pc)) {
print STDERR "$pc is rebooting.\n" if $debug;
return 0;
}
}
......@@ -288,6 +281,32 @@ sub WaitTillDead {
return 1;
}
#
# DoesPing(host): report whether a host answers pings.
#
# Runs "$ping -q -i 0.25 -c 8 -t 2 <host>" with all output discarded
# and looks at the exit code of that command.
#
# Returns 1 if host is responding to pings, 0 otherwise.
#
sub DoesPing {
    my($pc) = @_;

    #
    # The sub-second -i interval is a root-only ping option, so set
    # $UID to 0 around the command and put the saved value back after.
    # NOTE(review): presumably $UID is the English.pm alias for the
    # real uid ($<) -- confirm "use English" is in effect in this file.
    #
    my $saveuid = $UID;
    $UID = 0;
    system("$ping -q -i 0.25 -c 8 -t 2 $pc >/dev/null 2>&1");

    #
    # Capture the exit code immediately, before any other statement
    # runs, so nothing in between can disturb $?.
    #
    my $status = $? >> 8;
    $UID = $saveuid;

    #
    # Returns 0 if any packets are returned. Returns 2 if pingable
    # but no packets are returned. Other non-zero error codes indicate
    # other problems. Any non-zero return indicates "not pingable" to us.
    #
    # NOTE(review): only the exit-code byte is examined. A ping that is
    # killed by a signal leaves that byte 0 and is therefore reported as
    # pingable -- confirm that this is the intended outcome.
    #
    print STDERR "$ping $pc returned $status\n" if $debug;

    return $status ? 0 : 1;
}
sub DBquery($)
{
my($query) = $_[0];
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment