diff --git a/db/node_status.in b/db/node_status.in index d41ce86ec988fbd5b1c10f58a86f63c5310e49a9..f848b47868b244f5371da0b63ae80a6a78b947c2 100755 --- a/db/node_status.in +++ b/db/node_status.in @@ -47,15 +47,14 @@ if (($UID != 0) && (!TBAdmin())) { # my $query_result = DBQueryFatal("SELECT n.node_id,ns.status,r.pid,nt.isremotenode, ". - " nt.isvirtnode, now() - ns.status_timestamp ". + " nt.isvirtnode, nt.isplabdslice, ". + "now() - ns.status_timestamp ". " from nodes as n ". "left join node_types as nt on n.type=nt.type ". "left join node_status as ns on ns.node_id=n.node_id ". "left join reserved as r on r.node_id=n.node_id ". "where ". - # We ignore plab physical nodes, because the physical nodes - # do not do keepalive; handled by plab daemon instead. - " n.type != 'pcplabphys' && ns.status != 'down' && " . + " ns.status != 'down' && " . # Jailed and PLAB virtnodes report every 600 seconds. # Must be allocated to an experiment to be considerd. " ((nt.isvirtnode=1 && r.pid is not null && ". @@ -68,7 +67,7 @@ my $query_result = " ((now() - ns.status_timestamp) > 100)))"); -while (my ($node,$status,$pid,$remote,$isvirt,$timediff) = +while (my ($node,$status,$pid,$remote,$isvirt,$isplab,$timediff) = $query_result->fetchrow_array) { my $newstatus = "down"; @@ -89,6 +88,20 @@ while (my ($node,$status,$pid,$remote,$isvirt,$timediff) = $newstatus = "possibly down"; } } + + # + # If the plab management sliver hasn't checked in for a couple of + # hours, move the pnode to hwdown. + # + if ($isplab && !$isvirt) { + if ($timediff > 7200) { + MarkNodeDown($node); + TBSetNodeLogEntry($node, $UID, TB_DEFAULT_NODELOGTYPE(), + "'Moved to hwdown; ". + "$node has not reported in for a long time.'"); + } + } + + # # Repeat the time check to avoid dropping a node that just came up. 
# diff --git a/tbsetup/plab/libplab.py.in b/tbsetup/plab/libplab.py.in index 564520634b458fe98e2d7ccf05801c1fb2f42c7f..88a089d2886e18843e479198b50089fd7a5d34bf 100644 --- a/tbsetup/plab/libplab.py.in +++ b/tbsetup/plab/libplab.py.in @@ -741,22 +741,26 @@ class Plab: print "Got known pnodes:" print known +# +# Disable up/down marking - we now do this normally via isalive from +# the emulab service slice. Eventually, this code should be removed. +# # Mark known nodes that are not available as down and make sure # those that are available are marked as up - todown = [] # List of nodeid's - toup = [] # List of nodeid's - for ip in known.keys(): - if not ip in avail: - todown.append(known[ip]) - else: - toup.append(known[ip]) - if verbose: - print "%d known Plab nodes not available" % len(todown) - print "%d known Plab nodes available" % len(toup) - self.__setVnodesStatus(todown, "down") - self.__setPnodesStatus(todown, "down") - self.__setVnodesStatus(toup, "up") - self.__setPnodesStatus(toup, "up") + #todown = [] # List of nodeid's + #toup = [] # List of nodeid's + #for ip in known.keys(): + # if not ip in avail: + # todown.append(known[ip]) + # else: + # toup.append(known[ip]) + #if verbose: + # print "%d known Plab nodes not available" % len(todown) + # print "%d known Plab nodes available" % len(toup) + #self.__setVnodesStatus(todown, "down") + #self.__setPnodesStatus(todown, "down") + #self.__setVnodesStatus(toup, "up") + #self.__setPnodesStatus(toup, "up") # Add new nodes toadd = [] # List of IP's diff --git a/tmcd/plab/rusaged b/tmcd/plab/rusaged index 4cbf587a4321343c5f797d03a2cd25bc80996b9f..730e91b1e384c7b57faf4bdb90ce440cb8f6d7ac 100644 --- a/tmcd/plab/rusaged +++ b/tmcd/plab/rusaged @@ -39,7 +39,7 @@ my $action = "start"; my $logname = "$LOGDIR/rusaged.debug"; my $pidfile = "/var/run/emulab-rusaged.pid"; my $debug = 0; -my $isalivewait = 300; # Seconds to wait. +my $isalivewait = 60; # Seconds to wait. 
my $svcslice = "utah_elab_svc"; # diff --git a/tmcd/tmcd.c b/tmcd/tmcd.c index e514544b3d312236200866819ff0fe823ae07cbd..80beb24590b8fe204f9c36b4f829320b5cc35ccc 100644 --- a/tmcd/tmcd.c +++ b/tmcd/tmcd.c @@ -4889,9 +4889,8 @@ COMMAND_PROTOTYPE(dorusage) * See db/node_status script, which uses this info (timestamps) * to determine when nodes are down. * - * XXX: Plab physnodes do not report at all; we copy the virtnode - * info to the physnode. Do not update status field though; that - * is handled from one of the plab daemons. + * XXX: Plab physnode status is reported from the management slice. + * */ mydb_update("replace delayed into node_rusage " " (node_id, status_timestamp, " @@ -4901,11 +4900,10 @@ COMMAND_PROTOTYPE(dorusage) if (reqp->isplabdslice) { mydb_update("replace delayed into node_status " - " (node_id, status_timestamp, " - " load_1min, load_5min, load_15min, disk_used) " - " values ('%s', now(), %f, %f, %f, %f)", - reqp->pnodeid, la1, la5, la15, dused); - } + " (node_id, status, status_timestamp) " + " values ('%s', 'up', now())", + reqp->pnodeid); + } /* * At some point, maybe what we will do is have the client