Commit 6d205dc5 authored by Kirk Webb

Commit to usher in the new PLC regime.  Added a config variable to
vnode_setup for the timeout on waiting for child processes.  I've
set it to 10 minutes since all ancillary setup programs have their own
time bounds (I think - the plab ones do anyway).

The function of plabmonitord has changed slightly.  Instead of setting
up and tearing down vnodes, its job is to just set up the emulab management
sliver on plab nodes in hwdown.  Once the vserver comes up and reports isalive,
plabmonitord moves the node out of hwdown.  Currently, it first tries to tear
down the vserver before reinstantiating it.  In the future, we could get fancier
and try interacting with the service sliver directly before simply tearing it down.

All new plab nodes now start life in hwdown, and must be summoned forth
into production by plabmonitord.

This commit does NOT include support for the node-local httpd.  That will
come soon.
parent 1e2dba15
......@@ -922,7 +922,9 @@ class Plab:
defosid, controliface = self.__getNodetypeInfo()
id, priority = self.__nextFreeNodeid()
nodeid = "plab%d" % id
hostname = string.replace(hostname, ".", "-")
hostonly = hostname.replace(".", "-")
site = hostname[(hostname.find(".")+1):]
print "Creating pnode %s as %s, priority %d." % (ip, nodeid, priority)
try:
......@@ -939,6 +941,11 @@ class Plab:
'FREE_CLEAN',
'ISUP'))
DBQueryFatal("insert into widearea_nodeinfo"
" (node_id, contact_uid, hostname, site)"
" values (%s, %s, %s, %s)",
(nodeid, 'bnc', hostname, site))
DBQueryFatal("insert into interfaces"
" (node_id, card, port, IP, interface_type,"
" iface, role)"
......@@ -948,7 +955,7 @@ class Plab:
DBQueryFatal("insert into reserved"
" (node_id, pid, eid, rsrv_time, vname)"
" values (%s, %s, %s, now(), %s)",
(nodeid, RESERVED_PID, RESERVED_EID, hostname))
(nodeid, RESERVED_PID, RESERVED_EID, hostonly))
DBQueryFatal("insert into node_auxtypes"
" (node_id, type, count)"
......
......@@ -15,9 +15,10 @@ RENEW_PERIOD = 60*60
MAXCONSECEXC = 3
def usage(me):
print "Usage: %s [ -vd ] { getfree [-u] | renew }" % me
print " Passing -u to getfree will cause it to only update"
print " existing data and not fork off as a daemon"
print "Usage: %s [ -vd ] { getfree [-o [-i]] | renew }" % me
print " Passing -o to getfree will cause it to only run once."
print " Passing -i to getfree will cause if to only update using"
print " existing node info (no new nodes will be added)."
sys.exit(1)
class logfile:
......@@ -125,15 +126,17 @@ def main(args):
plab = libplab.Plab()
if command == "getfree":
opts, args = getopt.getopt(args[1:], "i")
update = False
opts, args = getopt.getopt(args[1:], "io")
updateOnly = False
for o, a in opts:
if o == "-i":
update = True
updateOnly = True
elif o == "-o":
runonce = True
if len(args):
usage(me)
if update:
plab.getFree(True)
if runonce:
plab.getFree(updateOnly)
else:
doDaemon(plab.getFree, GETFREE_PERIOD, "plabgetfree")
elif command == "renew":
......
......@@ -145,6 +145,12 @@ while (1) {
sleep(5);
# * Try full vnode_setup on node - mgmt sliver.
# - If (multiple?) fail, try to delete node/sliver from mgmt slice.
# (can we push out this action?)
#
#
# Make sure the node is still in $NODEDEAD_*
#
......@@ -165,8 +171,9 @@ while (1) {
print "\n\#\#\# Checking $vnode on $pnode at " . TimeStamp() . "\n";
#
# Try to set it up, wait for ISUP, then tear it down.
# Try to tear it down, set it up, and wait for ISUP.
#
system("vnode_setup -f -k -d $PLABMOND_PID $PLABMOND_EID $vnode");
system("vnode_setup -f -d $PLABMOND_PID $PLABMOND_EID $vnode");
if ($?) {
print "Failed to allocate $vnode on $pnode\n";
......@@ -175,11 +182,6 @@ while (1) {
$revive = 1;
}
}
system("vnode_setup -f -k -d $PLABMOND_PID $PLABMOND_EID $vnode");
if ($?) {
print "Failed to teardown $vnode on $pnode\n";
$revive = 0;
}
#
# That all worked. Move the pnode out of hwdown and back into
......
......@@ -33,7 +33,7 @@ def main(args):
if command == "alloc":
node = slice.createNode(nodeid)
# node.addKey("/root/.ssh/identity.pub")
node.addKey("/root/.ssh/identity.pub")
while 1:
try:
......
......@@ -49,6 +49,8 @@ my $killmode = 0;
my $dbuid;
my $MAX_CHILDREN = 8;
# Try to let auxiliary setup programs timeout naturally first.
my $CHILD_TIMEOUT = 600; # ten minutes
#
# Load the Testbed support stuff.
......@@ -381,7 +383,7 @@ while (1) {
# back around the loop
#
my $now = time();
my $waittime = ($oldest + 120) - time();
my $waittime = ($oldest + $CHILD_TIMEOUT) - time();
#
# Kill of the oldest if he gets too old while we're waiting
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment