From 6d205dc5e5deffd13c704fe8b5dc00569ca2930d Mon Sep 17 00:00:00 2001 From: Kirk Webb Date: Wed, 31 Dec 2003 01:34:14 +0000 Subject: [PATCH] Commit to usher in the new PLC regime. Added a config variable to vnode_setup for the timeout on waiting for child processes. I've set it to 10 minutes since all ancillary setup programs have their own time bounds (I think - the plab ones do anyway). The function of plabmonitord has changed slightly. Instead of setting up and tearing down vnodes, its job is to just set up the emulab management sliver on plab nodes in hwdown. Once the vserver comes up and reports isalive, it moves the node out of hwdown. Currently, it first tries to tear down the vserver before reinstantiating it. In the future, we could get fancier and try interacting with the service sliver directly before simply tearing it down. All new plab nodes now start life in hwdown, and must be summoned forth into production by plabmonitord. This commit does NOT include support for the node-local httpd. That will come soon. --- tbsetup/plab/libplab.py.in | 11 +++++++++-- tbsetup/plab/plabdaemon.in | 19 +++++++++++-------- tbsetup/plab/plabmonitord.in | 14 ++++++++------ tbsetup/plab/plabnode.in | 2 +- tbsetup/vnode_setup.in | 4 +++- 5 files changed, 32 insertions(+), 18 deletions(-) diff --git a/tbsetup/plab/libplab.py.in b/tbsetup/plab/libplab.py.in index 81c859e1c..564520634 100644 --- a/tbsetup/plab/libplab.py.in +++ b/tbsetup/plab/libplab.py.in @@ -922,7 +922,9 @@ class Plab: defosid, controliface = self.__getNodetypeInfo() id, priority = self.__nextFreeNodeid() nodeid = "plab%d" % id - hostname = string.replace(hostname, ".", "-") + hostonly = hostname.replace(".", "-") + site = hostname[(hostname.find(".")+1):] + print "Creating pnode %s as %s, priority %d." 
% (ip, nodeid, priority) try: @@ -939,6 +941,11 @@ class Plab: 'FREE_CLEAN', 'ISUP')) + DBQueryFatal("insert into widearea_nodeinfo" + " (node_id, contact_uid, hostname, site)" + " values (%s, %s, %s, %s)", + (nodeid, 'bnc', hostname, site)) + DBQueryFatal("insert into interfaces" " (node_id, card, port, IP, interface_type," " iface, role)" @@ -948,7 +955,7 @@ class Plab: DBQueryFatal("insert into reserved" " (node_id, pid, eid, rsrv_time, vname)" " values (%s, %s, %s, now(), %s)", - (nodeid, RESERVED_PID, RESERVED_EID, hostname)) + (nodeid, RESERVED_PID, RESERVED_EID, hostonly)) DBQueryFatal("insert into node_auxtypes" " (node_id, type, count)" diff --git a/tbsetup/plab/plabdaemon.in b/tbsetup/plab/plabdaemon.in index 8f2451576..f3b08c012 100755 --- a/tbsetup/plab/plabdaemon.in +++ b/tbsetup/plab/plabdaemon.in @@ -15,9 +15,10 @@ RENEW_PERIOD = 60*60 MAXCONSECEXC = 3 def usage(me): - print "Usage: %s [ -vd ] { getfree [-u] | renew }" % me - print " Passing -u to getfree will cause it to only update" - print " existing data and not fork off as a daemon" + print "Usage: %s [ -vd ] { getfree [-o [-i]] | renew }" % me + print " Passing -o to getfree will cause it to only run once." + print " Passing -i to getfree will cause if to only update using" + print " existing node info (no new nodes will be added)." 
sys.exit(1) class logfile: @@ -125,15 +126,17 @@ def main(args): plab = libplab.Plab() if command == "getfree": - opts, args = getopt.getopt(args[1:], "i") - update = False + opts, args = getopt.getopt(args[1:], "io") + updateOnly = False for o, a in opts: if o == "-i": - update = True + updateOnly = True + elif o == "-o": + runonce = True if len(args): usage(me) - if update: - plab.getFree(True) + if runonce: + plab.getFree(updateOnly) else: doDaemon(plab.getFree, GETFREE_PERIOD, "plabgetfree") elif command == "renew": diff --git a/tbsetup/plab/plabmonitord.in b/tbsetup/plab/plabmonitord.in index a4ed6a2c2..d8a2ae463 100644 --- a/tbsetup/plab/plabmonitord.in +++ b/tbsetup/plab/plabmonitord.in @@ -145,6 +145,12 @@ while (1) { sleep(5); + + # * Try full vnode_setup on node - mgmt sliver. + # - If (multiple?) fail, try to delete node/sliver from mgmt slice. + # (can we push out this action?) + # + # # Make sure the node is still in $NODEDEAD_* # @@ -165,8 +171,9 @@ while (1) { print "\n\#\#\# Checking $vnode on $pnode at " . TimeStamp() . "\n"; # - # Try to set it up, wait for ISUP, then tear it down. + # Try to tear it down, set it up, and wait for ISUP. # + system("vnode_setup -f -k -d $PLABMOND_PID $PLABMOND_EID $vnode"); system("vnode_setup -f -d $PLABMOND_PID $PLABMOND_EID $vnode"); if ($?) { print "Failed to allocate $vnode on $pnode\n"; @@ -175,11 +182,6 @@ while (1) { $revive = 1; } } - system("vnode_setup -f -k -d $PLABMOND_PID $PLABMOND_EID $vnode"); - if ($?) { - print "Failed to teardown $vnode on $pnode\n"; - $revive = 0; - } # # That all worked. 
Move the pnode out of hwdown and back into diff --git a/tbsetup/plab/plabnode.in b/tbsetup/plab/plabnode.in index 76768339a..a9c1818d8 100755 --- a/tbsetup/plab/plabnode.in +++ b/tbsetup/plab/plabnode.in @@ -33,7 +33,7 @@ def main(args): if command == "alloc": node = slice.createNode(nodeid) -# node.addKey("/root/.ssh/identity.pub") + node.addKey("/root/.ssh/identity.pub") while 1: try: diff --git a/tbsetup/vnode_setup.in b/tbsetup/vnode_setup.in index 79002f9c9..54c49df4f 100644 --- a/tbsetup/vnode_setup.in +++ b/tbsetup/vnode_setup.in @@ -49,6 +49,8 @@ my $killmode = 0; my $dbuid; my $MAX_CHILDREN = 8; +# Try to let auxiliary setup programs timeout naturally first. +my $CHILD_TIMEOUT = 600; # ten minutes # # Load the Testbed support stuff. @@ -381,7 +383,7 @@ while (1) { # back around the loop # my $now = time(); - my $waittime = ($oldest + 120) - time(); + my $waittime = ($oldest + $CHILD_TIMEOUT) - time(); # # Kill of the oldest if he gets too old while we're waiting -- GitLab