Commit e1a2fabc authored by Kirk Webb

Some PLAB dslice manager updates:

- In addition to asking the dslice agent (on plab) for a list of available
  nodes, we now also fping them all to weed out unresponsive ones.  One problem
  here is that several plab nodes block ICMP; this could be solved by pinging
  with nmap (which tries both an ICMP and a TCP ping).  This affects the
  plabdaemon getfree command, and consequently which plab nodes appear as "up"
  in the DB.

- Changed slice naming scheme:  we now append the experiment index onto the
  slice name to try to ensure uniqueness (emulab_<pid>_<eid>_<idx>)

- Modified plabnode to try to cope with flaky nodes - there is some retry
  code in there now

- Added the "fixsudo" shell script, which is run first, as root (via the
  cumbersome "su" command), to fix sudoers for later sudo use on plab nodes.
parent 7848e357
#!/bin/sh
#
# fixsudo - patch /etc/sudoers so a later "sudo" needs no password.
#
# Feeds a small ed(1) script to "ed /etc/sudoers", run through "su -c".
# The ed script searches for a line beginning with "emulab_" and, on that
# line, rewrites a trailing "ALL" into "NOPASSWD: ALL", then writes the
# file and quits.
#
# All output is discarded; the exit status is that of the su command.

# Use a private, unpredictable temp file rather than a fixed /tmp name:
# the file's contents are consumed by a root-run editor of /etc/sudoers,
# so a pre-created file or symlink at a well-known path must not be
# honoured.
edscript=$(mktemp /tmp/edscript.XXXXXX 2>/dev/null) || exit 1

# Remove the temp file however the script terminates.
trap 'rm -f "$edscript"' 0

{
# The delimiter is quoted so the shell passes the ed commands through
# untouched (no parameter or backslash processing).
cat >"$edscript" <<'EOF'
/^emulab_/s/ALL[ \t]*$/NOPASSWD: ALL/
w
q
EOF
su -c "ed /etc/sudoers < '$edscript'"
} > /dev/null 2>&1
exit $?
......@@ -214,6 +214,13 @@ class Plab:
print "Got advertisement list:"
print avail
if debug:
print "Pinging nodes in advertisement list"
avail = self.__getPingStatus(avail)
if debug:
print "Refined node list after ping verification:"
print avail
known = self.__getKnownPnodes()
if debug:
print "Got known pnodes:"
......@@ -276,6 +283,28 @@ class Plab:
ret[ip] = nodeid
return ret
def __getPingStatus(self, ips):
    """
    getFree helper function.  Uses fping to test the reachability of
    the PLAB nodes returned from the dslice agent.  If a node doesn't
    respond, it is not returned (i.e., not available).

    ips is a sequence of IP address strings; they are written to
    fping's standard input, one per line.

    Returns a new list containing every whitespace-separated token of
    fping's standard output that parses as an IPv4 address, in the
    order fping printed them.  Tokens that do not parse are dropped
    (and reported when the module-level verbose flag is set).
    """
    from socket import inet_aton, error as socket_error

    fpin, fpout = os.popen2("fping -aA")
    fpin.write("\n".join(ips))
    # Closing stdin signals end of the target list to fping.
    fpin.close()
    results = fpout.read().split()
    fpout.close()

    # Get rid of potential junk in output.  Build a fresh list instead of
    # removing from the list being iterated: removing in place makes the
    # iterator skip the element following each removed one, so adjacent
    # junk tokens would slip through.
    alive = []
    for ip in results:
        try:
            inet_aton(ip)
        except (socket_error, TypeError, ValueError):
            # Not a parseable address; only parse failures are treated
            # as junk, anything else (e.g. KeyboardInterrupt) propagates.
            if verbose:
                print("Removing junk from fping output:")
                print("\t%s" % ip)
            continue
        alive.append(ip)
    return alive
def __setVnodesStatus(self, pnodeids, status):
"""
getFree helper function. Sets the status of all vnodes that are
......@@ -527,7 +556,15 @@ class Slice:
Creates a new slice that initially contains no nodes. Don't call
this directly, use Plab.createSlice instead.
"""
self.slicename = "emulab_%s_%s" % (self.pid, self.eid)
# self.slicename = "emulab_%s_%s" % (self.pid, self.eid)
res = DBQueryFatal("select idx from experiments "
"where pid=%s "
"and eid=%s",
(self.pid, self.eid))
if !len(res):
raise RuntimeError, "Didn't get any results while looking for idx"
((eindex, ), ) = res
self.slicename = "emulab_%s_%s_%s" % (self.pid, self.eid, eindex)
print "Creating Plab slice %s." % self.slicename
self.privkey, self.pubkey = self.__genKeypair()
try:
......@@ -667,6 +704,7 @@ class Node:
if debug:
print "Obtained ticket:"
print self.ticketdata
nodemgr = self._createNodemgrProxy()
try:
self.leasedata = nodemgr.newleasevm(self.ticketdata,
......
......@@ -5,6 +5,11 @@ import sys
sys.path.append("@prefix@/lib")
import getopt
import libplab
import xmlrpclib
import time
TRIES = 3
SLEEPINT = 5
def usage(me):
print "Usage: %s [ -vd ] { alloc | free } pid eid nodeid" % me
......@@ -12,6 +17,9 @@ def usage(me):
def main(args):
me = args[0]
alloctries = TRIES
setuptries = TRIES
try:
command, pid, eid, nodeid = libplab.handleArgs(args[1:])
except getopt.GetoptError:
......@@ -21,21 +29,54 @@ def main(args):
plab = libplab.Plab()
slice = plab.loadSlice(pid, eid)
if command == "alloc":
node = slice.createNode(nodeid)
while 1:
try:
node = slice.createNode(nodeid)
except xmlrpclib.Fault, e:
print "XML-RPC Fault happened while attempting node alloc:"
print "Code: %s, Error: %s" % (e.faultCode, e.faultString)
alloctries = alloctries - 1
if alloctries > 0:
print "Sleeping %s and trying again" % SLEEPINT
time.sleep(SLEEPINT)
else:
print "Giving up after %s tries" % TRIES
sys.exit(1)
else:
break
while 1:
try:
node.addKey("/root/.ssh/identity.pub")
node.emulabify()
# XXX This file is redundant
# node.putConfig("/etc/vnodeid", nodeid)
# Note that vnode_setup boots the node
except:
print "Node setup failed."
setuptries = setuptries - 1
if setuptries > 0:
print "Sleeping for %s seconds and retrying" % \
SLEEPINT
time.sleep(SLEEPINT)
else:
print "Giving up after %s tries" % TRIES
node.free()
raise
else:
break
elif command == "free":
try:
node.addKey("/root/.ssh/identity.pub")
node.emulabify()
# XXX This file is redundant
#node.putConfig("/etc/vnodeid", nodeid)
# Note that vnode_setup boots the node
node = slice.loadNode(nodeid)
except:
print "Node setup failed. Cleaning up"
node.free()
raise
elif command == "free":
node = slice.loadNode(nodeid)
print "Node %s wasn't really allocated" % nodeid
sys.exit(0)
node.free()
else:
usage(me)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment