Commit 8d5b66eb authored by Robert Ricci's avatar Robert Ricci

Now watches for nodes that have been reloading for too long. "Too long" is
currently defined as 30 minutes, to keep false positives to a minimum. Sends
mail to testbed-ops if/when it finds any. The timing is not precise, as it
only polls in between loading machines, but this is fine for our purposes.
parent b33ca16b
......@@ -46,6 +46,7 @@ my $sched_reload= "$TB/sbin/sched_reload";
# Path to the node_reboot program under $TB/bin.
my $reboot = "$TB/bin/node_reboot";
# Path to the reloadlog file under $TB/log.
my $logfile = "$TB/log/reloadlog";
# Presumably enables debugging output when non-zero; off by default.
# NOTE(review): its consumers are not visible here - confirm.
my $debug = 0;
# Number of seconds a node may remain in the reloading experiment before
# the admins are notified about it.
my $warn_time = 30 * 60; # 30 minutes
#
# Turn off line buffering on output (dots ...).
......@@ -89,6 +90,55 @@ while (1) {
# Partial delay between loops in case of an error.
#
sleep(10);
#
# Check for nodes in the reloading experiment. Notify the admins if
# any node stays in the reloading experiment for longer than $warn_time
#
#
# Take a snapshot of the reloading experiment: %this_run ends up with one
# key per node currently reserved to $RELOADPID/$RELOADEID, and $time
# records when the snapshot was taken.
#
$query_result = DBQueryWarn(
    "select node_id from reserved " .
    "where pid='$RELOADPID' and eid='$RELOADEID'");
$time     = time();
%this_run = ();
$this_run{$node} = 1
    while (($node) = $query_result->fetchrow());
#
# Walk the nodes currently in the reloading experiment. A node seen for
# the first time just gets its arrival time recorded in %node_times; a
# node that is already being tracked is reported once it has been there
# for longer than $warn_time.
#
foreach $node (keys %this_run) {
    if (!exists $node_times{$node}) {
        $node_times{$node} = $time;
        next;
    }
    #
    # %warned remembers the nodes already reported, so that the admins
    # are not warned about the same node twice
    #
    next if ($warned{$node});
    next unless (($time - $node_times{$node}) > $warn_time);
    notify("Node $node has been in $RELOADPID/$RELOADEID for " .
           "more than $warn_time seconds");
    $warned{$node} = 1;
}
#
# Forget every node that was not in the reloading experiment this time.
#
# The entry has to be removed from %node_times: deleting it from
# %this_run does nothing, because the node is not in %this_run to begin
# with, and the stale timestamp would make the node look overdue as soon
# as it is put back into the reloading experiment. %warned is cleared as
# well, so that a later reload of the same node that takes too long is
# reported again.
#
# Deleting while looping is safe here, since keys() returns its list
# before the loop body runs.
#
foreach $node (keys %node_times) {
    if (!$this_run{$node}) {
        delete $node_times{$node};
        delete $warned{$node};
    }
}
#
# Find all of the free nodes that have not been reloaded (no pid entry
......@@ -295,7 +345,7 @@ sub notify($)
# Print the message given as the first argument on stdout and mail the
# same text to $TBOPS.
my($mesg) = $_[0];
print "$mesg\n";
# The body passed to SENDMAIL must be $mesg, the variable unpacked above.
# Only one mail is sent per notification.
SENDMAIL($TBOPS, "TESTBED: Reload Daemon Message", $mesg);
}
#
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment