Commit 2c03632f authored by Mac Newbold's avatar Mac Newbold
Browse files

Big changes.

Added default behavior: when no pid/eid is given, the script looks up
all the nodes that haven't reported slothd data in the last 2 hours and
runs the check on those nodes.
Also added magic around the ssh calls: fork a child to run ssh and set an
alarm in the parent; if the child doesn't come back in time, kill it and
declare the node unreachable.
(This checkin is a checkpoint; next I'm going to add this script to configure
so that it properly uses the devel setup instead of being hardcoded to the
real one.)
parent 5379f6e9
......@@ -3,6 +3,7 @@
use lib '/usr/testbed/lib';
use libdb;
#use Mysql;
use English;
use Getopt::Long;
use strict;
......@@ -10,44 +11,116 @@ use strict;
# Turn off line buffering
my $t = 120;
# Report the command-line synopsis and abort the script.
#
# The message is raised with die(), so it goes to STDERR and the program
# ends with a non-zero status; this sub never returns to its caller (the
# `exit &usage;` call sites therefore never reach their exit).
# Interpolates the file-level $t, the reporting window in minutes.
#
# Only the current synopsis is emitted: pid/eid are optional, so the older
# "<pid> <eid>" (mandatory) text is no longer printed ahead of it.
sub usage() {
die("Usage: $0 [-s] [<pid> <eid>]
-s start slothd if not running.
If no pid/eid are given, do all nodes that haven't reported slothd
data in the last $t minutes.\n"); }
my %opts = ();
if ($opts{"h"}) {
exit &usage;
if (@ARGV < 1) {
exit &usage;
my @nodes = ();
if (@ARGV > 0) {
my ($pid, $eid) = @ARGV;
@nodes = ExpNodes($pid, $eid);
} else {
DBQueryFatal("drop table if exists idletemp2");
#(print "create temporary table idletemp2
DBQueryFatal("create temporary table idletemp2
select pid,eid,r.node_id,max(tstamp) as t from reserved as r
left join node_idlestats as n on r.node_id=n.node_id
where r.node_id not like \"sh%\" and r.node_id not like \"wireless%\"
and r.node_id not like \"%ron%\"
group by pid,eid,node_id
having t is null or (unix_timestamp(now())-unix_timestamp(t) >= $t*60)
order by pid,eid,node_id");
# We now have a table that says the last time each node reported
# for all nodes that haven't reported in last $t minutes.
# (Note: Don't change group by above to pid,eid!)
my $r = DBQueryFatal("select node_id from idletemp2");
while (my %row = $r->fetchhash) {
my ($pid, $eid) = @ARGV;
# Sort comparator for node names (used as `sort pcnum @list`).
# Compares numerically on whatever follows the first two characters of
# each name -- presumably the number in names such as "pc12".
# Reads the package variables $a and $b that sort() supplies.
sub pcnum {
    my $lhs = substr($a, 2);
    my $rhs = substr($b, 2);
    return $lhs <=> $rhs;
}
print "pid: $pid\neid: $eid\n";
foreach my $n (sort pcnum @nodes) {
print "checking slothd on $n: ";
#print "\n"; next;
print check($n);
print "\n";
my $j;
exit 0;
foreach $j (ExpNodes($pid, $eid)) {
my $sld;
print "checking slothd on $j: ";
$sld = `sudo ssh -q $j ps auxwww | grep slothd | grep -v grep`;
if ($sld) {
print "running.\n";
else {
print "not running. ";
if ($opts{"s"}) {
`sudo ssh -q $j /etc/testbed/slothd -f`;
print "started..";
print "\n";
# check(NODE): probe one node for a running slothd over ssh, with a timeout.
#
# Takes the node name as its only argument and returns a status string
# ($str; it starts out empty).  Reads the file-level %opts: when $opts{"s"}
# is set and no slothd process is found, the child also tries to start it.
#
# The ssh work is done in a forked child; the parent waits for it under a
# 5 second alarm and translates the child's wait status into a message.
sub check {
#my $ssh="sshtb -o \"FallBackToRsh no\" -o \"ConnectionAttempts 3\" -q";
my $ssh="sshtb -q";
my $node = shift;
# $cmd1 lists slothd processes; $cmd2 starts slothd.
# Both are interpolated into a backtick command line below, so the local
# shell parses them: the "| grep ..." stages of $cmd1 run locally, filtering
# the output of the ssh command.
my $cmd1 = "ps auxwww | grep slothd | grep -v grep";
my $cmd2 = "/etc/testbed/slothd -f";
# Run an ssh command in a child process, protected by an alarm to
# ensure that the ssh is not hung up forever if the machine is in
# some funky state.
my $str = "";
# NOTE(review): fork() returns undef on failure, which takes the same
# (false) branch as the child's 0 -- confirm that case is acceptable.
my $syspid = fork();
if ($syspid) {
# parent
# If the child has not finished when the alarm fires, send it TERM so
# the waitpid() below can return.
local $SIG{ALRM} = sub { kill("TERM", $syspid); };
alarm 5;
#print "$syspid - Alarm set.\n";
waitpid($syspid, 0);
# $? is the raw wait status: the terminating signal number is in the low
# 7 bits and the exit code is in the high byte ($? >> 8).
my $rv = $?;
#print "$syspid - Done waiting. Got '$rv'\n";
alarm 0;
# 15: child died from signal 15 (presumably the TERM sent by the alarm
# handler above); no message is assigned in this branch.
if ($rv == 15) {
#print "Node is wedged.\n";
# 256: child exit code 1.
} elsif ($rv == 256) {
#print "Node is not running sshd.\n";
$str="SSH not available";
# 512: child exit code 2.
} elsif ($rv == 512) {
$str="not running, couldn't start slothd";
# 0: child exited cleanly; $str is left as it was.
} elsif ($rv == 0) {
# NOTE(review): a raw status of 1 or 2 means death by signal 1 or 2, not
# exit code 1 or 2 (those are 256 and 512 above) -- confirm these two
# branches are what was intended.
} elsif ($rv == 1) {
$str="not running";
} elsif ($rv == 2) {
$str="not running, started";
} else {
$str="I don't know what happened...$rv";
} else {
# child
# Non-empty output means a slothd process line survived the grep filters.
$str = `sudo $ssh $node $cmd1`;
#print "$syspid - ssh succeeded:'$str'\n";
if ($str) {
#print "running.\n";
} else {
#print "not running. ";
# Only attempt a start when -s was requested.
if ($opts{"s"}) {
$str = `sudo $ssh $node $cmd2`;
if ($str) {
#print "(start returned '$str') ";
#print "started...";
#print "\n";
# NOTE(review): no exit with a status code is visible in the child branch,
# although the parent decodes specific exit codes -- confirm the child
# terminates here rather than returning into the caller's loop.
return $str;
exit 0;
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment