Commit bf4217c4 authored by Kirk Webb

Changes to allow plab pnodes to check in normally (via the rusage/isalive
mechanism in the service sliver).
parent 19479ec6
...@@ -47,15 +47,14 @@ if (($UID != 0) && (!TBAdmin())) { ...@@ -47,15 +47,14 @@ if (($UID != 0) && (!TBAdmin())) {
# #
my $query_result = my $query_result =
DBQueryFatal("SELECT n.node_id,ns.status,r.pid,nt.isremotenode, ". DBQueryFatal("SELECT n.node_id,ns.status,r.pid,nt.isremotenode, ".
" nt.isvirtnode, now() - ns.status_timestamp ". " nt.isvirtnode, nt.isplabdslice".
"now() - ns.status_timestamp ".
" from nodes as n ". " from nodes as n ".
"left join node_types as nt on n.type=nt.type ". "left join node_types as nt on n.type=nt.type ".
"left join node_status as ns on ns.node_id=n.node_id ". "left join node_status as ns on ns.node_id=n.node_id ".
"left join reserved as r on r.node_id=n.node_id ". "left join reserved as r on r.node_id=n.node_id ".
"where ". "where ".
# We ignore plab physical nodes, because the physical nodes " ns.status != 'down' && " .
# do not do keepalive; handled by plab daemon instead.
" n.type != 'pcplabphys' && ns.status != 'down' && " .
# Jailed and PLAB virtnodes report every 600 seconds. # Jailed and PLAB virtnodes report every 600 seconds.
# Must be allocated to an experiment to be considerd. # Must be allocated to an experiment to be considerd.
" ((nt.isvirtnode=1 && r.pid is not null && ". " ((nt.isvirtnode=1 && r.pid is not null && ".
...@@ -68,7 +67,7 @@ my $query_result = ...@@ -68,7 +67,7 @@ my $query_result =
" ((now() - ns.status_timestamp) > 100)))"); " ((now() - ns.status_timestamp) > 100)))");
while (my ($node,$status,$pid,$remote,$isvirt,$timediff) = while (my ($node,$status,$pid,$remote,$isvirt,$isplab,$timediff) =
$query_result->fetchrow_array) { $query_result->fetchrow_array) {
my $newstatus = "down"; my $newstatus = "down";
...@@ -89,6 +88,20 @@ while (my ($node,$status,$pid,$remote,$isvirt,$timediff) = ...@@ -89,6 +88,20 @@ while (my ($node,$status,$pid,$remote,$isvirt,$timediff) =
$newstatus = "possibly down"; $newstatus = "possibly down";
} }
} }
#
# If the plab management sliver hasn't checked in for a couple of
# hours, move the pnode to hwdown.
#
if ($isplab && !$isvirt) {
if ($timediff > 7200) {
MarkNodeDown($node);
TBSetNodeLogEntry($node, $UID, TB_DEFAULT_NODELOGTYPE(),
"'Moved to hwdown; ".
"$node has not reported in for a long time.'");
}
}
# #
# Repeat the time check to avoid dropping a node that just came up. # Repeat the time check to avoid dropping a node that just came up.
# #
......
...@@ -741,22 +741,26 @@ class Plab: ...@@ -741,22 +741,26 @@ class Plab:
print "Got known pnodes:" print "Got known pnodes:"
print known print known
#
# Disable up/down marking - we now do this normally via isalive from
# the emulab service slice. Eventually, this code should be removed.
#
# Mark known nodes that are not available as down and make sure # Mark known nodes that are not available as down and make sure
# those that are available are marked as up # those that are available are marked as up
todown = [] # List of nodeid's #todown = [] # List of nodeid's
toup = [] # List of nodeid's #toup = [] # List of nodeid's
for ip in known.keys(): #for ip in known.keys():
if not ip in avail: # if not ip in avail:
todown.append(known[ip]) # todown.append(known[ip])
else: # else:
toup.append(known[ip]) # toup.append(known[ip])
if verbose: #if verbose:
print "%d known Plab nodes not available" % len(todown) # print "%d known Plab nodes not available" % len(todown)
print "%d known Plab nodes available" % len(toup) # print "%d known Plab nodes available" % len(toup)
self.__setVnodesStatus(todown, "down") #self.__setVnodesStatus(todown, "down")
self.__setPnodesStatus(todown, "down") #self.__setPnodesStatus(todown, "down")
self.__setVnodesStatus(toup, "up") #self.__setVnodesStatus(toup, "up")
self.__setPnodesStatus(toup, "up") #self.__setPnodesStatus(toup, "up")
# Add new nodes # Add new nodes
toadd = [] # List of IP's toadd = [] # List of IP's
......
...@@ -39,7 +39,7 @@ my $action = "start"; ...@@ -39,7 +39,7 @@ my $action = "start";
my $logname = "$LOGDIR/rusaged.debug"; my $logname = "$LOGDIR/rusaged.debug";
my $pidfile = "/var/run/emulab-rusaged.pid"; my $pidfile = "/var/run/emulab-rusaged.pid";
my $debug = 0; my $debug = 0;
my $isalivewait = 300; # Seconds to wait. my $isalivewait = 60; # Seconds to wait.
my $svcslice = "utah_elab_svc"; my $svcslice = "utah_elab_svc";
# #
......
...@@ -4889,9 +4889,8 @@ COMMAND_PROTOTYPE(dorusage) ...@@ -4889,9 +4889,8 @@ COMMAND_PROTOTYPE(dorusage)
* See db/node_status script, which uses this info (timestamps) * See db/node_status script, which uses this info (timestamps)
* to determine when nodes are down. * to determine when nodes are down.
* *
* XXX: Plab physnodes do not report at all; we copy the virtnode * XXX: Plab physnode status is reported from the management slice.
* info to the physnode. Do not update status field though; that *
* is handled from one of the plab daemons.
*/ */
mydb_update("replace delayed into node_rusage " mydb_update("replace delayed into node_rusage "
" (node_id, status_timestamp, " " (node_id, status_timestamp, "
...@@ -4901,11 +4900,10 @@ COMMAND_PROTOTYPE(dorusage) ...@@ -4901,11 +4900,10 @@ COMMAND_PROTOTYPE(dorusage)
if (reqp->isplabdslice) { if (reqp->isplabdslice) {
mydb_update("replace delayed into node_status " mydb_update("replace delayed into node_status "
" (node_id, status_timestamp, " " (node_id, status, status_timestamp "
" load_1min, load_5min, load_15min, disk_used) " " values ('%s', 'up', now())",
" values ('%s', now(), %f, %f, %f, %f)", reqp->pnodeid);
reqp->pnodeid, la1, la5, la15, dused); }
}
/* /*
* At some point, maybe what we will do is have the client * At some point, maybe what we will do is have the client
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment