Commit 89d5a8de authored by Leigh Stoller's avatar Leigh Stoller

The rest of the node status changes. All old uses of nodes.status should
be flushed (fingers crossed!).
parent 97e1c98c
#!/usr/bin/perl -w
#
# EMULAB-COPYRIGHT
# Copyright (c) 2000-2002 University of Utah and the Flux Group.
# Copyright (c) 2000-2003 University of Utah and the Flux Group.
# All rights reserved.
#
#
# node_status - Updates the 'status' column in the nodes table to indicate
# whether nodes are pingable, etc.
# Intended to be run as a cronjob
# Requires 'fping' to be installed
#
############################## Defines and includes
my $fping = "/usr/local/sbin/fping"; # Path to fping
use strict;
use English;
use IPC::Open2;
# Configure variables
use lib '@prefix@/lib';
use libdb;
#
# Only root and admins are allowed to use this script
# node_status - Updates the 'status' column in the node_status table.
# Currently run as a cron job, but is probably better as a testbed
# daemon.
#
if (($UID != 0) && (!TBAdmin())) {
die "Only root and admins are allowed to use this script\n";
}
#
# Node list. We only care about local, non virtual nodes.
# Remote nodes handled below.
# Configure variables
#
my $query =
"select n.node_id, n.status, os_info.osfeatures FROM nodes as n ".
"left join node_types as nt on n.type=nt.type ".
"left join os_info ON n.def_boot_osid = os_info.osid ".
"where nt.isvirtnode=0 and nt.isremotenode=0 and nt.class!='shark'";
my $TB = "@prefix@";
my $TBOPS = "@TBOPSEMAIL@";
my $TBLOGS = "@TBLOGSEMAIL@";
my $result = DBQueryFatal($query);
# un-taint path
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin:/usr/site/bin';
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};
############################## Determine pingable/unpingable nodes
my (@newlyUp, @newlyDown, @newlyPD, @newlyUnpingable) = ();
my $fpingPID = &open2("FOUT","FIN","$fping 2>&1"); # Throws an exception on failure
my %oldStatus = (); # Status of node from the DB, so we can tell if it changed
while (my %row = $result->fetchhash) {
my $node = $row{node_id};
$oldStatus{$node} = $row{status};
if ($row{osfeatures} && ($row{osfeatures} =~ /ping/)) {
# We have a node that should be capable of returning pings
print FIN $node,"\n"; # Give fping another node to ping
} else {
# This node can't return pings
if ((!$row{status}) || ($row{status} ne 'unpingable')) {
push @newlyUnpingable, $node;
}
}
}
close(FIN); # Tell fping we're done giving it nodes
############################## Determine status changes
while (<FOUT>) { # Read fping results
chomp;
my ($node,$status) = split /\s+/,$_,2;
# Skip ICMP messages
next if ($node eq "ICMP");
if ($status eq "is alive") {
if ($oldStatus{$node} ne "up") {
push @newlyUp,$node;
}
} else { # Node must not have returned a ping
if ($oldStatus{$node} eq "possibly down") {
push @newlyDown, $node;
} elsif ($oldStatus{$node} ne "down") {
push @newlyPD, $node;
}
}
}
close(FOUT);
#
# Turn off line buffering on output
#
$| = 1;
############################## Write back changes
# When I started this section, it seemed a clever way to avoid code
# duplication. Now, I'm not so sure :)
foreach (['down', @newlyDown], ['up', @newlyUp],
['possibly down', @newlyPD], ['unpingable', @newlyUnpingable]) {
my $status = shift @$_;
my @nodes = @$_;
if (@nodes) {
my $query = "UPDATE nodes SET status='$status' WHERE " .
join " OR ", map("node_id='$_'",@nodes);
DBQueryFatal($query);
}
}
# Load the Testbed support stuff.
use lib "@prefix@/lib";
use libdb;
#
# Now look at widearea nodes.
# Only root and admins are allowed to use this script
#
$result =
DBQueryFatal("SELECT n.node_id,n.status, ".
" UNIX_TIMESTAMP(n.status_timestamp) ".
"from nodes as n ".
"left join node_types as nt on n.type=nt.type ".
"left join reserved as r on n.node_id=r.node_id ".
"where nt.isvirtnode=0 and nt.isremotenode=1 and ".
"(UNIX_TIMESTAMP(now()) - ".
" UNIX_TIMESTAMP(n.status_timestamp)) > 90");
while (my ($node, $status, $stamp) = $result->fetchrow_array) {
#
# Repeat the time check to avoid dropping a node that just came up.
#
DBQueryFatal("update nodes set status='down' ".
"where node_id='$node' and ".
"(UNIX_TIMESTAMP(now()) - ".
" UNIX_TIMESTAMP(status_timestamp)) > 90");
if (($UID != 0) && (!TBAdmin())) {
die("*** $0:\n".
" You do not have permission to run this script!\n");
}
########################################################################
#
# New Stuff. The idea is simple; any nodes that have not reported in
# (isalive in tmcd) within the last XX seconds are moved to the down
# category.
# The idea is simple; any nodes that have not reported in (isalive in
# tmcd) within the last XX seconds are moved to the down category.
#
my $query_result =
DBQueryFatal("SELECT n.node_id,ns.status,r.pid,nt.isremotenode, ".
" UNIX_TIMESTAMP(now()) - ".
......@@ -154,7 +67,9 @@ while (my ($node,$status,$pid,$remote,$timediff) = $result->fetchrow_array) {
#
# If it's reserved and not reporting isalive, then it's a user
# image not doing what it is supposed to. Mark as possibly
# down since we do not really know what's up.
# down since we do not really know what's up. This includes old
# images as well, but that would only happen when the node is
# reserved since free nodes run the default image and report in.
#
if (defined($pid)) {
$newstatus = "possibly down";
......
......@@ -137,12 +137,12 @@ $result =
DBQueryFatal("select a.node_id,a.type from nodes as a ".
"left join reserved as b on a.node_id=b.node_id ".
"left join reserved as m on a.phys_nodeid=m.node_id ".
"left join nodes as n on a.phys_nodeid=n.node_id ".
"left join node_status as ns on a.phys_nodeid=ns.node_id ".
"left join node_types as t on t.type=a.type ".
"where $free_condition and $pcvmhack ".
" $pc601hack_none ".
" (a.role='testnode' or ".
" (a.role='virtnode' and n.status='up' and ".
" (a.role='virtnode' and ns.status='up' and ".
" (m.node_id is null or ".
" m.pid!='$DEADPID' or m.eid!='$DEADEID')))");
......
......@@ -2,7 +2,7 @@
#
# EMULAB-COPYRIGHT
# Copyright (c) 2000-2002 University of Utah and the Flux Group.
# Copyright (c) 2000-2003 University of Utah and the Flux Group.
# All rights reserved.
#
......@@ -448,9 +448,10 @@ if ($typecount || $classcount == $fixedcount) {
DBQueryFatal("select a.node_id from nodes as a ".
"left join reserved as b on a.node_id=b.node_id ".
"left join reserved as m on a.phys_nodeid=m.node_id ".
"left join nodes as n on a.phys_nodeid=n.node_id ".
"left join node_status as ns on ".
" a.phys_nodeid=ns.node_id ".
"where b.node_id is null and a.type='$type' and ".
" (n.status='up' and ".
" (ns.status='up' and ".
" (m.node_id is null or ".
" m.pid!='$DEADPID' or m.eid!='$DEADEID')) ".
"$omit ".
......
......@@ -2,7 +2,7 @@
#
# EMULAB-COPYRIGHT
# Copyright (c) 2000-2002 University of Utah and the Flux Group.
# Copyright (c) 2000-2003 University of Utah and the Flux Group.
# All rights reserved.
#
......@@ -131,10 +131,10 @@ $result =
DBQueryFatal("select a.node_id,a.phys_nodeid,count(*) from nodes as a ".
"left join reserved as b on a.node_id=b.node_id ".
"left join reserved as m on a.phys_nodeid=m.node_id ".
"left join nodes as n on a.phys_nodeid=n.node_id ".
"left join node_status as ns on a.phys_nodeid=ns.node_id ".
"left join node_types as nt on a.type=nt.type ".
"where b.node_id is null and ".
" (nt.isremotenode=1 and n.status='up' and ".
" (nt.isremotenode=1 and ns.status='up' and ".
" (m.node_id is null or ".
" m.pid!='$DEADPID' or m.eid!='$DEADEID')) ".
"group by a.phys_nodeid");
......
......@@ -3587,18 +3587,8 @@ COMMAND_PROTOTYPE(doisalive)
char buf[MYBUFSIZE];
/*
* Only for remote nodes now; local node status determined
* with fping via db/node_status script. Need to replace that.
*/
if (! reqp->islocal) {
mydb_update("update nodes "
"set status='up',status_timestamp=now() "
"where node_id='%s' or phys_nodeid='%s'",
reqp->nodeid, reqp->nodeid);
}
/*
* New stuff. Old stuff above will come out when node_status
* table completely implemented.
* See db/node_status script, which uses this info (timestamps)
* to determine when nodes are down.
*/
mydb_update("replace delayed into node_status "
" (node_id, status, status_timestamp) "
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment