node_status.in 4.99 KB
Newer Older
1
#!/usr/bin/perl -w
Leigh Stoller's avatar
Leigh Stoller committed
2
#
3
# Copyright (c) 2000-2018 University of Utah and the Flux Group.
4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
# 
# {{{EMULAB-LICENSE
# 
# This file is part of the Emulab network testbed software.
# 
# This file is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
# 
# This file is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public
# License for more details.
# 
# You should have received a copy of the GNU Affero General Public License
# along with this file.  If not, see <http://www.gnu.org/licenses/>.
# 
# }}}
Leigh Stoller's avatar
Leigh Stoller committed
23
#
24 25 26
use English;

#
27 28 29
# node_status - Updates the 'status' column in the node_status table.
# Currently run as a cron job, but is probably better as a testbed
# daemon. 
30
#
31
#
32
# Configure variables
33
#
34 35 36
# The @...@ placeholders are presumably filled in when this .in template
# is configured.
# NOTE(review): $TB, $TBOPS and $TBLOGS do not appear to be referenced in
# the visible part of this script - confirm before removing.
my $TB		= "@prefix@";
my $TBOPS       = "@TBOPSEMAIL@";
my $TBLOGS      = "@TBLOGSEMAIL@";
37
my $BOSSNODE    = "@BOSSNODE@";
38

39 40 41
# Untaint the environment: drop the shell-related variables that perlsec
# recommends removing, and pin PATH to a fixed list of directories.
delete @ENV{qw(IFS CDPATH ENV BASH_ENV)};
$ENV{PATH} = '/bin:/usr/bin:/usr/local/bin:/usr/site/bin';
42

43
my $verbose = 0;
Mike Hibler's avatar
Mike Hibler committed
44 45 46
my $debug = 0;
$verbose = 1
    if ($debug);
47

48 49 50 51
#
# Turn off output buffering (autoflush after every write)
#
$| = 1; 
52

53 54 55
# Load the Testbed support stuff.
use lib "@prefix@/lib";
use libdb;
56 57
use EmulabConstants;
use event;
58
use Node;
59 60

#
61
# Only root and admins are allowed to use this script
62
#
63 64 65
if (($UID != 0) && (!TBAdmin())) {
    die("*** $0:\n".
	"    You do not have permission to run this script!\n");
66
}
67 68

#
69
# Part 1: Set the node_status.status column for nodes.
70 71
# The idea is simple; any nodes that have not reported in (isalive in
# tmcd) within the last XX seconds are moved to the down category.
72
#
73 74 75 76

#
# Grab the reporting intervals for the various types of nodes - we convert
# them into seconds, and make the timeout twice as big as the reporting
Mike Hibler's avatar
Mike Hibler committed
77
# interval.
78 79 80
#
# Each sitevar holds a reporting interval, presumably in minutes (hence
# the factor of 60); the timeout is twice that interval, in seconds.
my $vnode_timeout = 2 * 60 * TBGetSiteVar("watchdog/isalive/vnode");
my $local_timeout = 2 * 60 * TBGetSiteVar("watchdog/isalive/local");
Mike Hibler's avatar
Mike Hibler committed
81 82

print "\n=== node_status ".
83
    "(local=$local_timeout, vnode=$vnode_timeout) ".
Mike Hibler's avatar
Mike Hibler committed
84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102
    "running at " . `date`
    if ($verbose);

#
# A zero timeout value means no timeout: replace it with a value so large
# that the elapsed-time comparison in the query effectively never fires.
#
foreach my $slot ([\$vnode_timeout, " WARNING: no timeout on local vnodes\n"],
		  [\$local_timeout, " WARNING: no timeout on local nodes\n"]) {
    my ($timeout_ref, $warning) = @{$slot};

    next
	if (${$timeout_ref} != 0);
    print $warning
	if ($verbose);
    ${$timeout_ref} = 999999999;
}

#
# Fun facts:
103
#	isvirtnode     -> vnode
Mike Hibler's avatar
Mike Hibler committed
104 105 106 107
# so:
#	local phys node == !isremotenode && !isvirtnode
#	local virt node == !isremotenode && isvirtnode
#
108
my $query_result =
109 110
    DBQueryFatal("SELECT n.node_id,ns.status,r.pid, ".
		 "       nt.isvirtnode, ".
111
                 "       unix_timestamp()-unix_timestamp(ns.status_timestamp) ".
Kirk Webb's avatar
Kirk Webb committed
112
		 "from nodes as n ".
113 114 115
		 "left join node_types as nt on n.type=nt.type ".
		 "left join node_status as ns on ns.node_id=n.node_id ".
		 "left join reserved as r on r.node_id=n.node_id ".
116
		 "where ".
117
		 " ns.status != 'down' && " .
Mike Hibler's avatar
Mike Hibler committed
118 119
		 # Local phys nodes:
		 " ((nt.isremotenode=0 && nt.isvirtnode=0 && ".
120
		 "   ((unix_timestamp()-unix_timestamp(ns.status_timestamp)) > $local_timeout)) || ".
Mike Hibler's avatar
Mike Hibler committed
121 122 123 124
		 # Local virtual node:
		 # must be allocated to an experiment to be considered.
		 "  (nt.isremotenode=0 && nt.isvirtnode=1 && ".
		 "   r.pid is not null && ".
125 126
		 "   ((unix_timestamp()-unix_timestamp(ns.status_timestamp)) ".
		 "     > $vnode_timeout)))");
127

128
while (my ($node_id,$status,$pid,$isvirt,$timediff) =
129
       $query_result->fetchrow_array) {
130 131
    my $newstatus = "down";

132 133 134 135 136 137 138 139 140 141 142 143 144 145 146
    #
    # Extra-verbose trace of the raw row. r.pid comes from a left join and
    # is NULL (undef) for unreserved nodes, so map it to the empty string
    # explicitly; the output is unchanged but no uninitialized-value
    # warning is raised under -w.
    #
    if ($verbose > 1) {
	my $pidstr = (defined($pid) ? $pid : "");

	print "$node_id, $status, $pidstr, $isvirt, $timediff\n";
    }

    #
    # If it's reserved and not reporting isalive, then it's a user
    # image not doing what it is supposed to. Mark as possibly
    # down since we do not really know what's up. This includes old
    # images as well, but that would only happen when the node is
    # reserved since free nodes run the default image and report in.
    #
    if (defined($pid)) {
	$newstatus = "possibly down";
    }
    else {
147
	#
148
	# We do not mess with nodes that are running an ALWAYSUP osid.
149
	#
150 151 152 153 154 155 156 157 158 159 160 161
	# Unreserved node: fetch its Node object so the image it is running
	# can be inspected. Skip rows for which the lookup returns nothing.
	my $node = Node->Lookup($node_id);
	next unless (defined($node));

	my $image = $node->RunningOsImage();

	if (defined($image)) {
	    # Nodes whose running image has op_mode ALWAYSUP are left
	    # untouched: move on to the next row without changing status.
	    if ($image->op_mode() && $image->op_mode() eq "ALWAYSUP") {
		print "Skipping ALWAYSUP up node $node_id\n"
		    if ($verbose);
		next;
	    }
162 163
	}
    }
Mike Hibler's avatar
Mike Hibler committed
164 165 166
    # Nothing to do if the node already carries the status we would set.
    # (The query excludes 'down', so in practice this is a reserved node
    # that is already marked 'possibly down'.)
    next if ($status eq $newstatus);

167
    print "  $node_id: $status to $newstatus after $timediff\n"
Mike Hibler's avatar
Mike Hibler committed
168 169
	if ($verbose);

170
    #
Mike Hibler's avatar
Mike Hibler committed
171
    # XXX: Should we repeat the time check here, to avoid marking down a
    # node that has just come up since the query ran?
172
    #
Mike Hibler's avatar
Mike Hibler committed
173 174
    if (!$debug) {
	DBQueryFatal("update node_status set status='$newstatus' ".
175
		     "where node_id='$node_id'");
176 177 178 179
	#
	# Generate local event.
	#
	event::EventSendWarn(objtype   => TBDB_TBEVENT_NODESTATUS,
180
			     objname   => $node_id,
181 182
			     eventtype => $newstatus,
			     host      => $BOSSNODE);	
183
    }
184
}
185
exit(0);