Commit da5e8604 authored by Timothy Stack's avatar Timothy Stack

Some power-by-mail hacking:

  - Bump the timeout for waiting for the operators to flip the switch
    to 20 minutes.

  - Fail fast if the node is in hwdown.  This case is intended to make
    an os_load fail for a robot-mounted mote whose robot is in hwdown.

  - Fail if the robotlab is not open, since no one is around to do
    anything about it anyway.

  - Assume success if the event state for a node was updated
    "recently."  This is a fallback in case the powertime web page
    isn't used to notify the system that the node was powered
    on/cycled.  Also, do not send the SHUTDOWN event in this case.

  - Add a TBNodeEventStateUpdated() function to libdb.pm that returns
    true if the eventstate for a node was updated within N seconds
    from the current time.
parent d767c345
......@@ -132,6 +132,7 @@ use vars qw(@ISA @EXPORT);
TBDB_EXPT_WORKDIR
TBSetNodeEventState TBGetNodeEventState
TBNodeEventStateUpdated
TBSetNodeAllocState TBGetNodeAllocState
TBSetNodeOpMode TBGetNodeOpMode TBSetNodeNextOpMode
TB_OSID_MBKERNEL TB_OSID_PXEBOOT TB_OSID_FRISBEE
......@@ -2474,6 +2475,29 @@ sub TBGetNodeEventState($$)
return 1;
}
#
# Check if the event state for a node was updated recently.
#
# usage: TBNodeEventStateUpdated(char *node, int tolerance)
#        node      - node_id to look up in the nodes table.
#        tolerance - size of the "recent" window in seconds, measured
#                    back from the database server's current time.
#        Returns 1 if the node's state_timestamp is less than
#        tolerance seconds old.
#        Returns 0 otherwise: no row for the node, no timestamp
#        recorded, timestamp too old, or an unusable tolerance.
#
sub TBNodeEventStateUpdated($$)
{
    my ($node, $tol) = @_;

    # The tolerance is spliced directly into the SQL text below, so only
    # let a plain non-negative number through.  Anything else (undef,
    # text, a negative value) cannot describe a valid window; report
    # "not updated" instead of sending a malformed query to the server.
    return 0
        unless (defined($node) && defined($tol) &&
                $tol =~ /\A\d+(?:\.\d+)?\z/);

    # NOTE(review): $node is interpolated unescaped; this assumes callers
    # only pass already-validated node ids -- confirm against callers.
    my $query_result =
        DBQueryFatal("select UNIX_TIMESTAMP(now()) - state_timestamp < $tol ".
                     "from nodes where node_id='$node'");

    # No such node in the table.
    return 0
        if ($query_result->numrows == 0);

    # The comparison comes back as 1, 0, or NULL (when state_timestamp is
    # NULL).  Collapse that to a strict 1/0 so callers never see undef.
    my ($under) = $query_result->fetchrow_array();
    return ($under ? 1 : 0);
}
#
# Check if a node has timed out in its current state. If it has, it gets
# stated involved to handle the situation.
......
......@@ -309,6 +309,7 @@ foreach my $power_id (keys %outlets) {
print "Control of $nodestr failed.\n"; $exitval++;
$errors++;
}
$sendevent = 0; # power_mail sends this itself.
} else {
print "power: Unknown power type '$type'\n";
$errors++;
......
......@@ -20,8 +20,9 @@ use libtestbed;
my $WWW = "@WWW@";
my $TBOPS = "@TBOPSEMAIL@";
my $default_tries = 10;
my $time_tolerance = 2 * 60;
my $default_tries = 40;
my $time_tolerance = 2 * 60; # seconds
my $state_update_tolerance = 45; # seconds
# Turn off line buffering on output
$| = 1;
......@@ -36,20 +37,38 @@ sub mailctrl($@) {
my ($cmd, @nodes) = @_;
my %actual = ();
my $open = 1;
# XXX Hack so that we only send mail if the robotlab is open, which ought
# to be the only time this script gets run. Otherwise, no one is around to
# do anything about it.
TBGetSiteVar("robotlab/open", \$open);
if (!$open) {
print "Lab not open, no operators available to power $cmd nodes.\n";
return 1;
}
# Check to see if we have to send mail first.
foreach my $node (@nodes) {
my $dbres = DBQueryFatal(
"select (UNIX_TIMESTAMP(NOW()) - UNIX_TIMESTAMP(last_power)) ".
" < $time_tolerance from outlets where node_id='$node'");
" < $time_tolerance,r.pid,r.eid from outlets as o ".
"left join reserved as r on r.node_id=o.node_id ".
"where o.node_id='$node'");
if ($dbres->num_rows() == 0) {
print "Unknown node $node";
next;
}
($ok) = $dbres->fetchrow();
my ($ok, $pid, $eid) = $dbres->fetchrow();
if (defined($pid) && defined($eid) &&
$pid eq NODEDEAD_PID() && $eid eq NODEDEAD_EID()) {
print "Can't power nodes that are dead.\n";
return 1;
}
if (!$ok) {
$actual{$node} = 1;
......@@ -86,7 +105,21 @@ sub mailctrl($@) {
($ok) = $dbres->fetchrow();
if ($tries == 0) {
if (($cmd eq "on" || $cmd eq "cycle") &&
TBNodeEventStateUpdated($node, $state_update_tolerance)) {
# This is something of a hack... We don't want to wait
# forever if someone forgets to update the webpage, so we
# check if the event state was updated recently. And, we
# DO NOT send the shutdown event since the thing is already
# going.
$ok = 1;
}
elsif ($ok) {
# The operator notified via the web page.
my $state = TBDB_NODESTATE_SHUTDOWN;
TBSetNodeEventState($node,$state);
}
elsif ($tries == 0) {
print "No more tries left for $node...";
return 1;
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment