Commit 56e67515 authored by Kirk Webb's avatar Kirk Webb

Several updates to libplab.py and plabnode.in

- getfree daemon doesn't die anymore when communication with the plab dslice
  agent fails.

- the link classifier logic has been changed slightly to allow nodes
  to be classified as inet2 even if they don't reverse resolve.  The problem
  here is that intl nodes that don't resolve but do go through Abilene
  will look like inet2 nodes, which is wrong.  Manual verification of the
  node_auxtypes table is still recommended.

- The fping verifier has been disabled for now (since some plab nodes
  block ICMP traffic).

- made some error messages more descriptive

- plabnodes script now handles more agent communication errors gracefully
  (retries when it encounters them).

- rearranged plabnode's retry loops to be a little easier to read, and
  more general.
parent 7a1d2b86
......@@ -216,17 +216,26 @@ class Plab:
"""
print "Getting free Plab nodes ..."
agent = self._createAgentProxy(insecure = True)
avail = agent.getads()
try:
avail = agent.getads()
except:
extype, exval, extrace = sys.exc_info()
print "Error talking to dslice agent: %s: %s" % (extype, exval)
if debug:
print extrace
print "Going back to sleep until next scheduled poll"
return
if debug:
print "Got advertisement list:"
print avail
if debug:
print "Pinging nodes in advertisement list"
avail = self.__getPingStatus(avail)
if debug:
print "Refined node list after ping verification:"
print avail
# if debug:
# print "Pinging nodes in advertisement list"
# avail = self.__getPingStatus(avail)
# if debug:
# print "Refined node list after ping verification:"
# print avail
known = self.__getKnownPnodes()
if debug:
......@@ -365,22 +374,23 @@ class Plab:
This can't detect DSL links, but those are probably rare for
Plab nodes.
"""
# Is host international?
# Is host international (or flux/emulab local)?
from socket import gethostbyaddr, getfqdn, herror
hostname = ip
try:
hostname, aliaslist, ipaddrlist = gethostbyaddr(ip)
hostname = getfqdn(hostname)
tld = hostname.split(".")[-1].lower()
if not tld in ("edu", "org", "net", "com", "gov", "us", "ca"):
return "pcplabintl", hostname
# Is it us?
if hostname.endswith(LOCAL_PLAB_DOMAIN):
return LOCAL_PLAB_LINKTYPE, hostname
except herror:
hostname = ip
print "WARNING: Failed to get hostname for %s" % ip
return "pcplabinet", ip
tld = hostname.split(".")[-1]
if not tld in ("edu", "org", "net", "com", "gov", "us"):
return "pcplabintl", hostname
# Is it us?
if hostname.endswith(LOCAL_PLAB_DOMAIN):
return LOCAL_PLAB_LINKTYPE, hostname
# Is host on I2?
traceroute = os.popen("traceroute -nm 10 -q 1 %s" % ip)
......@@ -390,6 +400,7 @@ class Plab:
for gw in MAGIC_INET2_GATEWAYS:
if trace.find(gw) != -1:
return "pcplabinet2", hostname
for gw in MAGIC_INET_GATEWAYS:
if trace.find(gw) != -1:
break
......@@ -841,8 +852,8 @@ class Node:
self.__perform("sudo wget -q -nH -P /tmp " +
ROOTBALL_HTTP_URLPATH + tgzname)
except RuntimeError:
print "Warning: couldn't get tarball via local service:"
print "\tFalling back to remote transfer."
print "Warning: couldn't get tarball via local service on %s: " \
"Falling back to remote transfer." % self.nodeid
self.__copy(tgzpath + tgzname, "/tmp/" + tgzname)
self.__perform("sudo tar -xzf /tmp/" + tgzname + " -C %s" % destpath)
......
......@@ -7,6 +7,7 @@ import getopt
import libplab
import xmlrpclib
import time
import socket
TRIES = 3
SLEEPINT = 5
......@@ -35,17 +36,24 @@ def main(args):
try:
node = slice.createNode(nodeid)
except xmlrpclib.Fault, e:
print "XML-RPC Fault happened while attempting node alloc:"
print "Code: %s, Error: %s" % (e.faultCode, e.faultString)
alloctries = alloctries - 1
if alloctries > 0:
print "Sleeping %s and trying again" % SLEEPINT
time.sleep(SLEEPINT)
else:
print "Giving up after %s tries" % TRIES
sys.exit(1)
print "XML-RPC Fault happened while attempting " \
"node alloc for %s" % nodeid
print "\tCode: %s, Error: %s" % (e.faultCode, e.faultString)
except (socket.error, xmlrpclib.ProtocolError):
print "Encountered problem communicating with an agent " \
"while setting up plab vnode %s" % nodeid
else:
break
alloctries = alloctries - 1
if alloctries > 0:
print "Sleeping for %s seconds, then retrying alloc on %s" % \
(SLEEPINT, nodeid)
time.sleep(SLEEPINT)
else:
print "Giving up after %s tries" % TRIES
raise
while 1:
try:
......@@ -55,19 +63,21 @@ def main(args):
# node.putConfig("/etc/vnodeid", nodeid)
# Note that vnode_setup boots the node
except:
print "Node setup failed."
setuptries = setuptries - 1
if setuptries > 0:
print "Sleeping for %s seconds and retrying" % \
SLEEPINT
time.sleep(SLEEPINT)
else:
print "Giving up after %s tries" % TRIES
node.free()
raise
print "Node setup failed on %s" % nodeid
else:
break
setuptries = setuptries - 1
if setuptries > 0:
print "Sleeping for %s seconds and then retrying " \
"setup on %s" % (SLEEPINT, nodeid)
time.sleep(SLEEPINT)
else:
print "Giving up after %s setup attempts on %s" % \
(TRIES, nodeid)
node.free()
raise
elif command == "free":
try:
node = slice.loadNode(nodeid)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment