Commit bf4217c4 authored by Kirk Webb

Changes to allow plab pnodes to check in normally (via the rusage/isalive mechanism in the service sliver).
parent 19479ec6
......@@ -47,15 +47,14 @@ if (($UID != 0) && (!TBAdmin())) {
#
my $query_result =
DBQueryFatal("SELECT n.node_id,ns.status,r.pid,nt.isremotenode, ".
" nt.isvirtnode, now() - ns.status_timestamp ".
" nt.isvirtnode, nt.isplabdslice".
"now() - ns.status_timestamp ".
" from nodes as n ".
"left join node_types as nt on n.type=nt.type ".
"left join node_status as ns on ns.node_id=n.node_id ".
"left join reserved as r on r.node_id=n.node_id ".
"where ".
# We ignore plab physical nodes, because the physical nodes
# do not do keepalive; handled by plab daemon instead.
" n.type != 'pcplabphys' && ns.status != 'down' && " .
" ns.status != 'down' && " .
# Jailed and PLAB virtnodes report every 600 seconds.
# Must be allocated to an experiment to be considered.
" ((nt.isvirtnode=1 && r.pid is not null && ".
......@@ -68,7 +67,7 @@ my $query_result =
" ((now() - ns.status_timestamp) > 100)))");
while (my ($node,$status,$pid,$remote,$isvirt,$timediff) =
while (my ($node,$status,$pid,$remote,$isvirt,$isplab,$timediff) =
$query_result->fetchrow_array) {
my $newstatus = "down";
......@@ -89,6 +88,20 @@ while (my ($node,$status,$pid,$remote,$isvirt,$timediff) =
$newstatus = "possibly down";
}
}
#
# If the plab management sliver hasn't checked in for a couple of
# hours, move the pnode to hwdown.
#
if ($isplab && !$isvirt) {
if ($timediff > 7200) {
MarkNodeDown($node);
TBSetNodeLogEntry($node, $UID, TB_DEFAULT_NODELOGTYPE(),
"'Moved to hwdown; ".
"$node has not reported in for a long time.'");
}
}
#
# Repeat the time check to avoid dropping a node that just came up.
#
......
......@@ -741,22 +741,26 @@ class Plab:
print "Got known pnodes:"
print known
#
# Disable up/down marking - we now do this normally via isalive from
# the emulab service slice. Eventually, this code should be removed.
#
# Mark known nodes that are not available as down and make sure
# those that are available are marked as up
todown = [] # List of nodeid's
toup = [] # List of nodeid's
for ip in known.keys():
if not ip in avail:
todown.append(known[ip])
else:
toup.append(known[ip])
if verbose:
print "%d known Plab nodes not available" % len(todown)
print "%d known Plab nodes available" % len(toup)
self.__setVnodesStatus(todown, "down")
self.__setPnodesStatus(todown, "down")
self.__setVnodesStatus(toup, "up")
self.__setPnodesStatus(toup, "up")
#todown = [] # List of nodeid's
#toup = [] # List of nodeid's
#for ip in known.keys():
# if not ip in avail:
# todown.append(known[ip])
# else:
# toup.append(known[ip])
#if verbose:
# print "%d known Plab nodes not available" % len(todown)
# print "%d known Plab nodes available" % len(toup)
#self.__setVnodesStatus(todown, "down")
#self.__setPnodesStatus(todown, "down")
#self.__setVnodesStatus(toup, "up")
#self.__setPnodesStatus(toup, "up")
# Add new nodes
toadd = [] # List of IP's
......
......@@ -39,7 +39,7 @@ my $action = "start";
my $logname = "$LOGDIR/rusaged.debug";
my $pidfile = "/var/run/emulab-rusaged.pid";
my $debug = 0;
my $isalivewait = 300; # Seconds to wait.
my $isalivewait = 60; # Seconds to wait.
my $svcslice = "utah_elab_svc";
#
......
......@@ -4889,9 +4889,8 @@ COMMAND_PROTOTYPE(dorusage)
* See db/node_status script, which uses this info (timestamps)
* to determine when nodes are down.
*
* XXX: Plab physnodes do not report at all; we copy the virtnode
* info to the physnode. Do not update status field though; that
* is handled from one of the plab daemons.
* XXX: Plab physnode status is reported from the management slice.
*
*/
mydb_update("replace delayed into node_rusage "
" (node_id, status_timestamp, "
......@@ -4901,11 +4900,10 @@ COMMAND_PROTOTYPE(dorusage)
if (reqp->isplabdslice) {
mydb_update("replace delayed into node_status "
" (node_id, status_timestamp, "
" load_1min, load_5min, load_15min, disk_used) "
" values ('%s', now(), %f, %f, %f, %f)",
reqp->pnodeid, la1, la5, la15, dused);
}
" (node_id, status, status_timestamp "
" values ('%s', 'up', now())",
reqp->pnodeid);
}
/*
* At some point, maybe what we will do is have the client
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment