Commit e4dd337c authored by Mike Hibler

Send mail if a power controller isn't responding or is failing.

parent 4b6bad6f
......@@ -42,7 +42,8 @@ use POSIX qw(strftime);
sub usage() {
print << "END";
Usage: $0 [-TPC] [ <type> ... ]
Usage: $0 [-w] [-TPC] [ <type> ... ]
-w whiner flag, send mail to $TBOPS about errors
-T turn on temperature (in degrees F) monitoring
-P turn on power (watts) consumption monitoring
-C turn on current (amps) used monitoring
......@@ -55,10 +56,19 @@ Usage: $0 [-TPC] [ <type> ... ]
END
}
#
# Script-wide settings and state.
#
# Option letters handed to getopts().
# NOTE(review): $optlist is declared twice below; the later declaration
# (the one including 'w') masks the earlier one. This looks like old and
# new diff lines shown together -- confirm against the real file.
my $optlist = "ANTPC";
my $optlist = "wANTPC";
# What to monitor. Temperature defaults to on, power and current to off;
# the -A option sets all three to 1.
my $dotemps = 1;
my $dopower = 0;
my $docurrent = 0;
# Set to 1 only when -w was given AND the whine file could be opened.
my $whiner = 0;
# Per-process scratch file that collects this run's log messages so they
# can be attached to the warning mail; removed again in the END block.
my $whinefile = "/tmp/powermon.$$";
# Result of the fork() in dostatus: stays -1 until the fork, is 0 in the
# child, and holds the child's pid in the parent.
my $childpid = -1;
# Un-taint the environment: use a fixed PATH and drop variables that can
# influence how a shell behaves.
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};
$| = 1; # Autoflush the currently selected output handle (STDOUT by default)
#
# Parse command arguments. Once we return from getopts, all that should be
......@@ -68,6 +78,13 @@ my %options = ();
if (! getopts($optlist, \%options)) {
    usage();
}
#
# -w: collect this run's log messages in $whinefile so they can be mailed
# if something goes wrong. Whining is only enabled if the file can be
# opened; otherwise we say so and carry on without it.
#
if (defined($options{"w"})) {
    # Three-argument open so the file name can never be taken as a mode.
    # NOTE(review): $whinefile is a predictable name under /tmp; consider
    # an exclusive create if untrusted local users are a concern.
    if (open(WHINE, ">", $whinefile)) {
        #
        # Make WHINE unbuffered. The status-fetching child inherits this
        # handle and is killed with SIGTERM when it times out; a process
        # killed that way never flushes its buffers, so without autoflush
        # the mailed log would be missing everything the child had logged.
        #
        my $oldfh = select(WHINE);
        $| = 1;
        select($oldfh);
        $whiner = 1;
    } else {
        print "*** Could not open $whinefile, $TBOPS whining disabled\n";
    }
}
# -A: monitor everything (temperature, power and current).
if (defined($options{"A"})) {
    $dotemps = $dopower = $docurrent = 1;
}
......@@ -137,33 +154,52 @@ sub dostatus(@) {
# We do this in a child process so we can time it out if
# one of the controllers is not responding.
#
my $syspid = fork();
if ($syspid) {
$childpid = fork();
if ($childpid) {
#
# Allow 5 seconds per controller
#
local $SIG{ALRM} = sub { kill("TERM", $syspid); };
local $SIG{ALRM} = sub { kill("TERM", $childpid); };
my $to = 5 * scalar(@wanted);
alarm $to;
waitpid($syspid, 0);
waitpid($childpid, 0);
alarm 0;
if ($?) {
logit("*** Status fetch failed after $to seconds with exit val $?");
my $ecode = $?;
if ($ecode) {
my $cause;
if (($ecode & 0xff) > 0) {
$cause = "timed out after $to seconds";
} else {
$cause = "failed with exit value ". ($ecode >> 8);
}
logit("*** Status fetch $cause");
#
# If pissin' and moanin' send the last run log to TBOPS
#
if ($whiner) {
SENDMAIL($TBOPS,
"WARNING: power controller(s) $cause",
"There were errors getting status from one or more ".
"of the power controllers,\nthe log is appended.",
$TBOPS, undef,
($whinefile));
}
}
return $?;
return $ecode;
}
for my $ctrl (@wanted) {
my %status;
if (!defined($ctrls{$ctrl})) {
warn "No such power controller '$ctrl', ignored\n";
logit("No such power controller '$ctrl', ignored");
$errors++;
next;
}
if ($ctrls{$ctrl} =~ /^RPC/) {
if (rpc27status($ctrl,\%status)) {
warn "Could not get status for $ctrl.\n";
logit("Could not get status for $ctrl.");
$errors++;
next;
}
......@@ -182,11 +218,12 @@ sub dostatus(@) {
}
logit($msg);
} elsif (!$doall) {
warn "Cannot get status for $ctrl (type " .
$ctrls{$ctrl} . ") yet\n";
logit("Cannot get status for $ctrl (type " .
$ctrls{$ctrl} . ") yet");
$errors++;
}
}
close(WHINE);
exit($errors);
}
......@@ -194,4 +231,16 @@ sub logit($) {
my ($msg) = @_;
print strftime("%b %e %H:%M:%S", localtime)." powermon[$$]: $msg\n";
if ($whiner && $childpid == 0) {
print WHINE $msg, "\n";
}
}
#
# Exit-time cleanup. Only a process that is not the forked child
# ($childpid != 0) and that has whining enabled removes the whine file.
# The exit status is saved and restored around the unlink so the cleanup
# cannot change the value the script exits with.
#
END {
    if ($childpid != 0 && $whiner) {
        my $saved_status = $?;
        unlink($whinefile);
        $? = $saved_status;
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment