Commit 89d5a8de authored by Leigh B. Stoller's avatar Leigh B. Stoller
Browse files

The rest of node status changes. All old uses of nodes.status should

be flushed (fingers crossed!).
parent 97e1c98c
#!/usr/bin/perl -w
#
# EMULAB-COPYRIGHT
# Copyright (c) 2000-2002 University of Utah and the Flux Group.
# Copyright (c) 2000-2003 University of Utah and the Flux Group.
# All rights reserved.
#
#
# node_status - Updates the 'status' column in the nodes table to indicate
# whether nodes are pingable, etc.
# Intended to be run as a cronjob
# Requires 'fping' to be installed
#
############################## Defines and includes
my $fping = "/usr/local/sbin/fping"; # Path to fping
use strict;
use English;
use IPC::Open2;
# Configure variables
use lib '@prefix@/lib';
use libdb;
#
# Only root and admins are allowed to use this script
# node_status - Updates the 'status' column in the node_status table.
# Currently run as a cron job, but is probably better as a testbed
# daemon.
#
if (($UID != 0) && (!TBAdmin())) {
die "Only root and admins are allowed to use this script\n";
}
#
# Node list. We only care about local, non virtual nodes.
# Remote nodes handled below.
# Configure variables
#
my $query =
"select n.node_id, n.status, os_info.osfeatures FROM nodes as n ".
"left join node_types as nt on n.type=nt.type ".
"left join os_info ON n.def_boot_osid = os_info.osid ".
"where nt.isvirtnode=0 and nt.isremotenode=0 and nt.class!='shark'";
my $TB = "@prefix@";
my $TBOPS = "@TBOPSEMAIL@";
my $TBLOGS = "@TBLOGSEMAIL@";
my $result = DBQueryFatal($query);
# un-taint path
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin:/usr/site/bin';
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};
############################## Determine pingable/unpingable nodes
my (@newlyUp, @newlyDown, @newlyPD, @newlyUnpingable) = ();
my $fpingPID = &open2("FOUT","FIN","$fping 2>&1"); # Throws an exception on failure
my %oldStatus = (); # Status of node from the DB, so we can tell if it changed
while (my %row = $result->fetchhash) {
my $node = $row{node_id};
$oldStatus{$node} = $row{status};
if ($row{osfeatures} && ($row{osfeatures} =~ /ping/)) {
# We have a node that should be capable of returning pings
print FIN $node,"\n"; # Give fping another node to ping
} else {
# This node can't return pings
if ((!$row{status}) || ($row{status} ne 'unpingable')) {
push @newlyUnpingable, $node;
}
}
}
close(FIN); # Tell fping we're done giving it nodes
############################## Determine status changes
while (<FOUT>) { # Read fping results
chomp;
my ($node,$status) = split /\s+/,$_,2;
# Skip ICMP messages
next if ($node eq "ICMP");
if ($status eq "is alive") {
if ($oldStatus{$node} ne "up") {
push @newlyUp,$node;
}
} else { # Node must not have returned a ping
if ($oldStatus{$node} eq "possibly down") {
push @newlyDown, $node;
} elsif ($oldStatus{$node} ne "down") {
push @newlyPD, $node;
}
}
}
close(FOUT);
#
# Turn off line buffering on output
#
$| = 1;
############################## Write back changes
# Each entry pairs a new status value with the list of nodes that moved
# to it. One UPDATE is issued per non-empty list, in the order given here.
my @statusChanges = (
    [ 'down',          \@newlyDown ],
    [ 'up',            \@newlyUp ],
    [ 'possibly down', \@newlyPD ],
    [ 'unpingable',    \@newlyUnpingable ],
);

for my $change (@statusChanges) {
    my ($newstatus, $nodelist) = @$change;

    # Nothing moved to this status; skip the query entirely.
    next unless (@$nodelist);

    # Match every affected node with an OR'ed list of node_id tests.
    my $where = join(" OR ", map { "node_id='$_'" } @$nodelist);
    DBQueryFatal("UPDATE nodes SET status='$newstatus' WHERE " . $where);
}
# Load the Testbed support stuff.
use lib "@prefix@/lib";
use libdb;
#
# Now look at widearea nodes.
# Only root and admins are allowed to use this script
#
$result =
DBQueryFatal("SELECT n.node_id,n.status, ".
" UNIX_TIMESTAMP(n.status_timestamp) ".
"from nodes as n ".
"left join node_types as nt on n.type=nt.type ".
"left join reserved as r on n.node_id=r.node_id ".
"where nt.isvirtnode=0 and nt.isremotenode=1 and ".
"(UNIX_TIMESTAMP(now()) - ".
" UNIX_TIMESTAMP(n.status_timestamp)) > 90");
while (my ($node, $status, $stamp) = $result->fetchrow_array) {
#
# Repeat the time check to avoid dropping a node that just came up.
#
DBQueryFatal("update nodes set status='down' ".
"where node_id='$node' and ".
"(UNIX_TIMESTAMP(now()) - ".
" UNIX_TIMESTAMP(status_timestamp)) > 90");
if (($UID != 0) && (!TBAdmin())) {
die("*** $0:\n".
" You do not have permission to run this script!\n");
}
########################################################################
#
# New Stuff. The idea is simple; any nodes that have not reported in
# (isalive in tmcd) within the last XX seconds are moved to the down
# category.
# The idea is simple; any nodes that have not reported in (isalive in
# tmcd) within the last XX seconds are moved to the down category.
#
my $query_result =
DBQueryFatal("SELECT n.node_id,ns.status,r.pid,nt.isremotenode, ".
" UNIX_TIMESTAMP(now()) - ".
......@@ -154,7 +67,9 @@ while (my ($node,$status,$pid,$remote,$timediff) = $result->fetchrow_array) {
#
# If it's reserved and not reporting isalive, then it's a user
# image not doing what it is supposed to. Mark as possibly
# down since we do not really know what's up.
# down since we do not really know what's up. This includes old
# images as well, but that would only happen when the node is
# reserved since free nodes run the default image and report in.
#
if (defined($pid)) {
$newstatus = "possibly down";
......
......@@ -137,12 +137,12 @@ $result =
DBQueryFatal("select a.node_id,a.type from nodes as a ".
"left join reserved as b on a.node_id=b.node_id ".
"left join reserved as m on a.phys_nodeid=m.node_id ".
"left join nodes as n on a.phys_nodeid=n.node_id ".
"left join node_status as ns on a.phys_nodeid=ns.node_id ".
"left join node_types as t on t.type=a.type ".
"where $free_condition and $pcvmhack ".
" $pc601hack_none ".
" (a.role='testnode' or ".
" (a.role='virtnode' and n.status='up' and ".
" (a.role='virtnode' and ns.status='up' and ".
" (m.node_id is null or ".
" m.pid!='$DEADPID' or m.eid!='$DEADEID')))");
......
......@@ -2,7 +2,7 @@
#
# EMULAB-COPYRIGHT
# Copyright (c) 2000-2002 University of Utah and the Flux Group.
# Copyright (c) 2000-2003 University of Utah and the Flux Group.
# All rights reserved.
#
......@@ -448,9 +448,10 @@ if ($typecount || $classcount == $fixedcount) {
DBQueryFatal("select a.node_id from nodes as a ".
"left join reserved as b on a.node_id=b.node_id ".
"left join reserved as m on a.phys_nodeid=m.node_id ".
"left join nodes as n on a.phys_nodeid=n.node_id ".
"left join node_status as ns on ".
" a.phys_nodeid=ns.node_id ".
"where b.node_id is null and a.type='$type' and ".
" (n.status='up' and ".
" (ns.status='up' and ".
" (m.node_id is null or ".
" m.pid!='$DEADPID' or m.eid!='$DEADEID')) ".
"$omit ".
......
......@@ -2,7 +2,7 @@
#
# EMULAB-COPYRIGHT
# Copyright (c) 2000-2002 University of Utah and the Flux Group.
# Copyright (c) 2000-2003 University of Utah and the Flux Group.
# All rights reserved.
#
......@@ -131,10 +131,10 @@ $result =
DBQueryFatal("select a.node_id,a.phys_nodeid,count(*) from nodes as a ".
"left join reserved as b on a.node_id=b.node_id ".
"left join reserved as m on a.phys_nodeid=m.node_id ".
"left join nodes as n on a.phys_nodeid=n.node_id ".
"left join node_status as ns on a.phys_nodeid=ns.node_id ".
"left join node_types as nt on a.type=nt.type ".
"where b.node_id is null and ".
" (nt.isremotenode=1 and n.status='up' and ".
" (nt.isremotenode=1 and ns.status='up' and ".
" (m.node_id is null or ".
" m.pid!='$DEADPID' or m.eid!='$DEADEID')) ".
"group by a.phys_nodeid");
......
......@@ -3587,18 +3587,8 @@ COMMAND_PROTOTYPE(doisalive)
char buf[MYBUFSIZE];
/*
* Only for remote nodes now; local node status determined
* with fping via db/node_status script. Need to replace that.
*/
if (! reqp->islocal) {
mydb_update("update nodes "
"set status='up',status_timestamp=now() "
"where node_id='%s' or phys_nodeid='%s'",
reqp->nodeid, reqp->nodeid);
}
/*
* New stuff. Old stuff above will come out when node_status
* table completely implemented.
* See db/node_status script, which uses this info (timestamps)
* to determine when nodes are down.
*/
mydb_update("replace delayed into node_status "
" (node_id, status, status_timestamp) "
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment