Commit cbf598ef authored by Leigh Stoller

Do not move ALWAYSUP nodes into the down state; they are not real
images and do not send isalive. This stuff is pretty old and crufty!

Remove a bunch of obsolete code while I was here.
parent 37c5e50b
#!/usr/bin/perl -w
#
# Copyright (c) 2000-2016 University of Utah and the Flux Group.
# Copyright (c) 2000-2018 University of Utah and the Flux Group.
#
# {{{EMULAB-LICENSE
#
......@@ -55,6 +55,7 @@ use lib "@prefix@/lib";
use libdb;
use EmulabConstants;
use event;
use Node;
#
# Only root and admins are allowed to use this script
......@@ -75,25 +76,17 @@ if (($UID != 0) && (!TBAdmin())) {
# them into seconds, and make the timeout twice as big as the reporting
# interval.
#
my $plab_timeout = TBGetSiteVar("watchdog/isalive/plab") * 60 * 2;
my $vnode_timeout = TBGetSiteVar("watchdog/isalive/vnode") * 60 * 2;
my $local_timeout = TBGetSiteVar("watchdog/isalive/local") * 60 * 2;
my $wa_timeout = TBGetSiteVar("watchdog/isalive/wa") * 60 * 2;
print "\n=== node_status ".
"(local=$local_timeout, vnode=$vnode_timeout, ".
"plab=$plab_timeout, wa=$wa_timeout) ".
"(local=$local_timeout, vnode=$vnode_timeout) ".
"running at " . `date`
if ($verbose);
#
# A zero timeout value means no timeout
#
if ($plab_timeout == 0) {
print " WARNING: no timeout on plab nodes\n"
if ($verbose);
$plab_timeout = 999999999;
}
if ($vnode_timeout == 0) {
print " WARNING: no timeout on local vnodes\n"
if ($verbose);
......@@ -104,28 +97,17 @@ if ($local_timeout == 0) {
if ($verbose);
$local_timeout = 999999999;
}
if ($wa_timeout == 0) {
print " WARNING: no timeout on widearea nodes\n"
if ($verbose);
$wa_timeout = 999999999;
}
#
# Fun facts:
# isremotenode -> widearea, plabvirt, plabphys
# isvirtnode -> vnode, plabvirt
# isplabdslice -> plabvirt
# isplabphysnode -> plabphys
# isvirtnode -> vnode
# so:
# local phys node == !isremotenode && !isvirtnode
# local virt node == !isremotenode && isvirtnode
# widearea node == isremotenode && !isplabdslice && !isplabphysnode
# plab phys node == isplabphysnode
# plab virt node == isplabdslice
#
my $query_result =
DBQueryFatal("SELECT n.node_id,ns.status,r.pid,nt.isremotenode, ".
" nt.isvirtnode, nt.isplabdslice, ".
DBQueryFatal("SELECT n.node_id,ns.status,r.pid, ".
" nt.isvirtnode, ".
" unix_timestamp()-unix_timestamp(ns.status_timestamp) ".
"from nodes as n ".
"left join node_types as nt on n.type=nt.type ".
......@@ -140,42 +122,49 @@ my $query_result =
# must be allocated to an experiment to be considered.
" (nt.isremotenode=0 && nt.isvirtnode=1 && ".
" r.pid is not null && ".
" ((unix_timestamp()-unix_timestamp(ns.status_timestamp)) > $vnode_timeout)) || ".
# Widearea nodes:
" (nt.isremotenode=1 && nt.isplabdslice=0 && ".
" nt.isplabphysnode=0 && nt.isfednode=0 && ".
" ((unix_timestamp()-unix_timestamp(ns.status_timestamp)) > $wa_timeout)) || ".
# Plab physical nodes:
" (nt.isplabphysnode=1 && ".
" ((unix_timestamp()-unix_timestamp(ns.status_timestamp)) > $plab_timeout)) || ".
# Plab virtual nodes:
# must be allocated to an experiment to be considered.
" (nt.isplabdslice=1 && ".
" r.pid is not null && ".
" ((unix_timestamp()-unix_timestamp(ns.status_timestamp)) > $plab_timeout)))");
" ((unix_timestamp()-unix_timestamp(ns.status_timestamp)) ".
" > $vnode_timeout)))");
while (my ($node,$status,$pid,$remote,$isvirt,$isplab,$timediff) =
while (my ($node_id,$status,$pid,$isvirt,$timediff) =
$query_result->fetchrow_array) {
my $newstatus = "down";
if (! $remote) {
if ($verbose > 1) {
print "$node_id, $status, $pid, $isvirt, $timediff\n";
}
#
# If it's reserved and not reporting isalive, then it's a user
# image not doing what it is supposed to. Mark as possibly
# down since we do not really know what's up. This includes old
# images as well, but that would only happen when the node is
# reserved since free nodes run the default image and report in.
#
if (defined($pid)) {
$newstatus = "possibly down";
}
else {
#
# If it's reserved and not reporting isalive, then it's a user
# image not doing what it is supposed to. Mark as possibly
# down since we do not really know what's up. This includes old
# images as well, but that would only happen when the node is
# reserved since free nodes run the default image and report in.
# We do not mess with nodes that are running an ALWAYSUP osid.
#
if (defined($pid)) {
$newstatus = "possibly down";
my $node = Node->Lookup($node_id);
next
if (!defined($node));
my $image = $node->RunningOsImage();
if (defined($image)) {
if ($image->op_mode() && $image->op_mode() eq "ALWAYSUP") {
if ($verbose) {
print "Skipping ALWAYSUP up node $node_id\n";
}
next;
}
}
}
next
if ($status eq $newstatus);
print " $node: $status to $newstatus after $timediff\n"
print " $node_id: $status to $newstatus after $timediff\n"
if ($verbose);
#
......@@ -183,39 +172,14 @@ while (my ($node,$status,$pid,$remote,$isvirt,$isplab,$timediff) =
#
if (!$debug) {
DBQueryFatal("update node_status set status='$newstatus' ".
"where node_id='$node'");
"where node_id='$node_id'");
#
# Generate local event.
#
event::EventSendWarn(objtype => TBDB_TBEVENT_NODESTATUS,
objname => $node,
objname => $node_id,
eventtype => $newstatus,
host => $BOSSNODE);
}
}
#
# Part 2: Push nodes into hwdown that have been down for quite a while.
# For now, we only do this for planetlab physnodes, but we may want to do
# it for others, too.
#
# The cutoff is the "watchdog/isalive/dead_time" site variable multiplied
# by 60; the node log entry written below reports the result as seconds.
# NOTE(review): unlike the isalive timeouts handled earlier in this script,
# a zero value is not special-cased here, so a dead_time of 0 would match
# every node whose status timestamp is in the past -- confirm.
#
my $timeout = TBGetSiteVar("watchdog/isalive/dead_time") * 60;
#
# Select nodes of type 'pcplabphys' whose node_status timestamp is older
# than $timeout, excluding nodes reserved to NODEDEAD_PID/NODEDEAD_EID.
# NOTE(review): for nodes with no row in reserved, r.pid and r.eid come
# back NULL from the LEFT JOIN, so the negated comparison is NULL rather
# than true and those nodes appear to be filtered out as well -- confirm
# that this is intended.
#
$query_result =
DBQueryFatal("SELECT n.node_id from nodes as n " .
"left join node_status as ns on ns.node_id=n.node_id ".
"left join reserved as r on r.node_id=n.node_id " .
"where n.type='pcplabphys' " .
" and unix_timestamp()-unix_timestamp(ns.status_timestamp) > $timeout " .
" and !(r.pid='" . NODEDEAD_PID . "' and r.eid='" .
NODEDEAD_EID . "')");
#
# For each selected node: report it when $verbose is set, and unless
# $debug is set, call MarkPhysNodeDown() on it and record a node log
# entry (as "root") stating how long it failed to report in.
#
while (my ($node) = $query_result->fetchrow_array) {
print " $node: moved to hwdown\n"
if ($verbose);
# In debug mode nothing is changed; the loop only reports.
if (!$debug) {
MarkPhysNodeDown($node);
TBSetNodeLogEntry($node, "root", TB_DEFAULT_NODELOGTYPE(),
"'Moved to hwdown by node_status; ".
"did not report in for more than $timeout seconds.'");
}
}
exit(0);
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment