#!/usr/bin/perl -w
#
# EMULAB-COPYRIGHT
# Copyright (c) 2000-2003 University of Utah and the Flux Group.
# All rights reserved.
#
use English;

#
# node_status - Updates the 'status' column in the node_status table.
# Currently run as a cron job, but it would probably be better as a
# testbed daemon.
#

#
# Configure variables
#
# Install prefix plus the ops and logs e-mail addresses. The placeholder
# values are presumably substituted when this template is configured.
my ($TB, $TBOPS, $TBLOGS) = ("@prefix@", "@TBOPSEMAIL@", "@TBLOGSEMAIL@");

# Un-taint the environment: use a fixed PATH and remove the variables
# that can change how a child shell behaves.
$ENV{'PATH'} = join(':', qw(/bin /usr/bin /usr/local/bin /usr/site/bin));
foreach my $unsafe_var ('IFS', 'CDPATH', 'ENV', 'BASH_ENV') {
    # Deleting a variable that is not set is harmless.
    delete $ENV{$unsafe_var};
}

#
# Turn off output buffering: flush the selected output handle (STDOUT
# by default) after every write.
#
$OUTPUT_AUTOFLUSH = 1;	# The English.pm name for $|.

# Load the Testbed support library.
use lib "@prefix@/lib";
use libdb;
#
# Only root and admins are allowed to use this script. TBAdmin() is
# consulted only when the caller is not root.
#
die("*** $0:\n".
    "    You do not have permission to run this script!\n")
    if ($UID != 0 && !TBAdmin());

#
# The idea is simple; any nodes that have not reported in (isalive in
# tmcd) within the last XX seconds are moved to the down category.
#
# Select every non-virtual node whose status timestamp is more than 90
# seconds old. Each row carries the node id, its current status, the pid
# of its reservation (NULL when there is none), the remote-node flag and
# the age of the timestamp in seconds.
#
my $query_result =
    DBQueryFatal(join("",
		      "SELECT n.node_id,ns.status,r.pid,nt.isremotenode, ",
		      "  UNIX_TIMESTAMP(now()) - ",
		      "    UNIX_TIMESTAMP(ns.status_timestamp) ",
		      " from nodes as n ",
		      "left join node_types as nt on n.type=nt.type ",
		      "left join node_status as ns on ns.node_id=n.node_id ",
		      "left join reserved as r on r.node_id=n.node_id ",
		      "where nt.isvirtnode=0 and ",
		      # We ignore plab nodes, because the physical plab
		      # nodes do not do the keepalive.
		      " n.type != 'pcplabphys' and ",
		      " (UNIX_TIMESTAMP(now()) - ",
		      "   UNIX_TIMESTAMP(ns.status_timestamp)) > 90"));

#
# Walk the stale nodes returned in $query_result and record a new status
# for each: "down", or "possibly down" for a local node that is reserved.
# Local nodes get a longer grace period than remote ones.
#
while (my ($node, $status, $pid, $remote, $timediff) =
       $query_result->fetchrow_array) {
    my $newstatus = "down";
    # Seconds of silence after which the node is considered stale. The
    # select that produced this row already applied the 90 second limit.
    my $timeout   = 90;

    if (! $remote) {
	# This time is hardwired onto the client, which reports isalive
	# every 3 minutes locally, and every 60 seconds remotely.
	$timeout = 180;
	next
	    if ($timediff <= $timeout);

	#
	# If it is reserved and not reporting isalive, then it is a user
	# image not doing what it is supposed to. Mark as possibly
	# down since we do not really know what is up. This includes old
	# images as well, but that would only happen when the node is
	# reserved since free nodes run the default image and report in.
	#
	if (defined($pid)) {
	    $newstatus = "possibly down";
	}
    }

    # Nothing to write if the node already carries this status.
    next
	if (defined($status) && $status eq $newstatus);

    #
    # Repeat the time check inside the update to avoid dropping a node
    # that just came up: if the node reported in after the select ran,
    # its timestamp is fresh again and the row no longer matches.
    #
    DBQueryFatal("update node_status set status='$newstatus' ".
		 "where node_id='$node' and ".
		 "      (UNIX_TIMESTAMP(now()) - ".
		 "       UNIX_TIMESTAMP(status_timestamp)) > $timeout");
}