Commit d8ed2605 authored by Robert Ricci

Added code to clear out the %retried and %warned hashes, so that it will
retry (or warn about) nodes that get stuck more than once.
parent f711dfc8
......@@ -101,6 +101,12 @@ while (1) {
#
sleep(10);
#
# We use this to figure out when to delete nodes from the retried and
# warned hashes
#
my $time = time();
#
# First, look for nodes that have been in the reloading experiment for
# longer than $retry_time, and try rebooting them
......@@ -115,10 +121,21 @@ while (1) {
fatal("$node was wedged, but could not be power cycled.");
}
print "\nReload appears wedged. Power cycling and trying once more!\n";
$retried{$node} = 1;
}
$retried{$node} = $time;
}
#
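# Every node matched by the select above was stamped with this pass's $time.
# Any entry still carrying an older stamp was not 'touched' during this pass,
# so we can pull it out; if it gets stuck again later, it will be retried.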
#
foreach $node (keys %retried) {
if ($retried{$node} != $time) {
delete $retried{$node};
}
}
#
# Next, we do the same thing for nodes in the reloading experiment for
# longer than $warn_time, and warn the admins.
......@@ -131,7 +148,17 @@ while (1) {
if (!$warned{$node}) {
notify("Node $node has been in $RELOADPID/$RELOADEID for " .
"more than $warn_time minutes");
$warned{$node} = 1;
}
$warned{$node} = $time;
}
#
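# Every node matched by the select above was stamped with this pass's $time.
# Any entry still carrying an older stamp was not 'touched' during this pass,
# so we can pull it out; if it gets stuck again later, we will warn again.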
#
foreach $node (keys %warned) {
if ($warned{$node} != $time) {
delete $warned{$node};
}
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment