Commit ae2eec76 authored by Kirk Webb
Browse files

Kirk takes the weed whacker to the plab code. This is the first pass result.

I'll come along for a closer cut in the future.

* Modularized the plab communications 'adaptor' interface and moved the
  dslice- and PLC-specific code into their own modules.

* Wrote an API definition README

* Separated out generic routines from libplab into their own library modules
  (libtestbed.py and libdb.py)

Functionally, not much has changed - this was just a massive re-org with some
other cleanup.  Should be much easier to code up new PLAB interfaces as the
plab folks flail around in their attempt to standardize on something.

XXX: may want to re-think where the generic library modules should go.  If
more python code enters Elab, we'll probably want to move 'em to more standard
locations.

This isn't the end of the cleanup - I would eventually like to go back and
rethink the class structures, beef up the comments, and extend the API.
parent 515b0d7a
......@@ -1431,6 +1431,8 @@ outfiles="$outfiles Makeconf GNUmakefile \
tbsetup/tarfiles_setup tbsetup/webtarfiles_setup \
tbsetup/fetchtar.proxy tbsetup/webfrisbeekiller \
tbsetup/plab/GNUmakefile tbsetup/plab/libplab.py \
tbsetup/plab/libtestbed.py tbsetup/plab/libdb.py \
tbsetup/plab/mod_dslice.py tbsetup/plab/mod_PLC.py \
tbsetup/plab/plabslice tbsetup/plab/plabnode tbsetup/plab/plabdaemon \
tbsetup/plab/plabmetrics tbsetup/plab/plabstats \
tbsetup/plab/plabmonitord tbsetup/plab/plablinkdata \
......
......@@ -476,6 +476,8 @@ outfiles="$outfiles Makeconf GNUmakefile \
tbsetup/tarfiles_setup tbsetup/webtarfiles_setup \
tbsetup/fetchtar.proxy tbsetup/webfrisbeekiller \
tbsetup/plab/GNUmakefile tbsetup/plab/libplab.py \
tbsetup/plab/libtestbed.py tbsetup/plab/libdb.py \
tbsetup/plab/mod_dslice.py tbsetup/plab/mod_PLC.py \
tbsetup/plab/plabslice tbsetup/plab/plabnode tbsetup/plab/plabdaemon \
tbsetup/plab/plabmetrics tbsetup/plab/plabstats \
tbsetup/plab/plabmonitord tbsetup/plab/plablinkdata \
......
# -*- python -*-
#
# Database functions
#
import sys
import os
import pwd
import MySQLdb
# XXX: inherit!
# When true, TBDBConnect and DBQueryFatal print what they are doing.
debug = 1
# Name of the database to connect to.  The "@TBDBNAME@" token is a
# configure-time placeholder (this file is listed in configure's outfiles).
__dbName = "@TBDBNAME@"
# Number of times DBQueryFatal attempts a query before giving up.
__dbQueryMaxtries = 1
# Number of times TBDBConnect attempts to connect before giving up.
__dbConnMaxtries = 5
# Cached connection object; stays None until TBDBConnect succeeds.
__dbConnection = None
def TBDBConnect():
    """
    Open the module-wide database connection if it is not already open.

    The connection is cached in the module global __dbConnection, so
    calling this again once connected returns immediately.

    ARGS:
      None
    RETURNS:
      None
    SIDE EFFECTS:
      Sets the module global __dbConnection on success.
      Sleeps one second after every failed connection attempt.
      Raises RuntimeError when __dbConnMaxtries attempts have all failed.
    """
    global __dbConnection
    # Imported locally so that this function does not depend on a
    # module-level import of time being present.
    import time

    if __dbConnection:
        return

    # Create a DB username for accounting purposes
    uid = os.getuid()
    try:
        name = pwd.getpwuid(uid)[0]
    except KeyError:
        # No passwd entry for this uid; use a synthetic name instead.
        name = "uid%d" % uid
    dbuser = "%s:%s:%d" % (sys.argv[0], name, os.getpid())

    if debug:
        print("Connecting to db %s as %s" % (__dbName, dbuser))

    # Connect, with retries
    for _attempt in range(__dbConnMaxtries):
        try:
            __dbConnection = MySQLdb.connect(db = __dbName, user = dbuser)
        except MySQLdb.MySQLError:
            # Only database errors are worth retrying; anything else
            # (including KeyboardInterrupt) propagates to the caller.
            time.sleep(1)
        else:
            break
    else:
        # The loop ran out of attempts without ever reaching 'break'.
        raise RuntimeError("Cannot connect to DB after several attempts!")
def DBQueryFatal(queryPat, querySub = (), asDict = False):
    """
    Execute a query on the shared connection and return all result rows.

    ARGS:
      queryPat: query string passed to cursor.execute().
      querySub: substitution parameters passed to cursor.execute()
                together with queryPat (optional).
      asDict:   <boolean> when true, a MySQLdb.cursors.DictCursor is used
                instead of the connection's default cursor (optional).
    RETURNS:
      Whatever cursor.fetchall() returns for the query.
    SIDE EFFECTS:
      Calls TBDBConnect() first, so it may open the DB connection.
      Makes up to __dbQueryMaxtries attempts, sleeping one second and
      pinging the connection between attempts.  When the last attempt
      fails, the MySQLdb.MySQLError from that attempt is re-raised.
    """
    # Imported locally: both modules are used below, and this keeps the
    # function independent of what the module imports at top level.
    import time
    import traceback

    TBDBConnect()

    if asDict:
        cursor = __dbConnection.cursor(MySQLdb.cursors.DictCursor)
    else:
        cursor = __dbConnection.cursor()

    if debug:
        print("Executing DB query %s" % queryPat)

    tries = __dbQueryMaxtries
    while tries:
        try:
            cursor.execute(queryPat, querySub)
            ret = cursor.fetchall()
            if debug:
                # Keep debug output short: show at most 60 characters.
                rs = repr(ret)
                if len(rs) > 60:
                    rs = rs[:60] + "..."
                print("Result: %s" % rs)
            return ret
        except MySQLdb.MySQLError:
            tries -= 1
            if tries == 0:
                # Out of attempts: hand the DB error to the caller.
                raise
            time.sleep(1)
            try:
                # Best effort; presumably this prompts a reconnect if the
                # server went away -- confirm against the MySQLdb docs.
                __dbConnection.ping()
            except MySQLdb.MySQLError:
                pass

    # The loop above always returns or raises once it has run, so this
    # point is reached only if the loop body never executes (that is,
    # __dbQueryMaxtries is 0).
    # NOTE(review): the failure mail is therefore never sent for a real
    # query error -- confirm whether it was meant to precede the re-raise.
    # NOTE(review): SENDMAIL and TBOPS must be in scope at module level
    # for these lines to work -- confirm.
    tbmsg = "".join(traceback.format_exception(*sys.exc_info()))
    SENDMAIL(TBOPS, "DB query failed", "DB query failed:\n\n%s" % tbmsg, TBOPS)
    raise RuntimeError("Aah! Escaped DBQueryFatal loop")
def DBQuery(*args, **kwargs):
    """
    Non-fatal front end to DBQueryFatal().

    ARGS:
      Same positional and keyword arguments as DBQueryFatal(); they are
      passed through unchanged.
    RETURNS:
      The result of DBQueryFatal(), or None if it raised
      MySQLdb.MySQLError.
    SIDE EFFECTS:
      Those of DBQueryFatal().  Exceptions other than MySQLdb.MySQLError
      are not caught.
    """
    try:
        # Return the rows to the caller; previously the result was
        # assigned and then dropped, so None was returned on success too.
        return DBQueryFatal(*args, **kwargs)
    except MySQLdb.MySQLError:
        return None
# -*- python -*-
import sys
import os
import time
import signal
import xmlrpclib
#
# Constants
#
# The "@...@" tokens below are configure-time placeholders (this file is
# listed in configure's outfiles).
# Address used by callers as the recipient/sender of operational mail.
# Any backslashes in the substituted value are stripped; presumably they
# are escapes around the '@' of the address -- confirm.
TBOPS = "@TBOPSEMAIL@".replace("\\","")
# Upper-cased by SENDMAIL and prepended to every subject line.
MAILTAG = "@THISHOMEBASE@"
# Name of the running script: everything after the last "/" in argv[0].
SCRIPTNAME = sys.argv[0][sys.argv[0].rfind("/")+1:]
DEFAULT_DATA_PATH = "@prefix@/etc/plab/" # ensure this ends in a slash
#
# How many seconds to sleep between failures and how many times to try
# commands to both the dslice agent, and individual node managers.
#
DEF_SLEEPINT = 5
DEF_TRIES = 3
# Default for ForkCmd's 'timeout' argument, in seconds.
DEF_TIMEOUT = 1*60 # default timeout interval
#
# informational output control variables
#
# Both are set to 1 by handleArgs() when -d / -v are given.
debug = 0
verbose = 0
#
# Handle generic arguments.
#
def handleArgs(args):
    """
    Consume the library's own options from the front of an argument list.

    ARGS:
      args: <list> command-line arguments.
    RETURNS:
      The arguments that getopt left unconsumed.
    SIDE EFFECTS:
      Sets the global 'verbose' to 1 if -v was given and the global
      'debug' to 1 if -d was given.
      getopt raises its own exception for an unrecognized option.
    """
    global verbose, debug
    import getopt

    parsed, remaining = getopt.getopt(args, "vd")
    flags = [flag for flag, _value in parsed]
    if "-v" in flags:
        verbose = 1
    if "-d" in flags:
        debug = 1
    return remaining
def SENDMAIL(To, Subj, Msg, From = None, Headers = None, Files = ()):
    """
    Sends email to someone about something :)

    This function is similar to its perl library counterpart.

    ARGS:
      To:      <string> Email address of recipient.
      Subj:    <string> Subject of email.
      Msg:     <string> Message text.
      From:    <string> Email address of sender (optional).
      Headers: <string> Extra header strings (must newline terminate all but
                        the last one) (optional).
      Files:   <tuple>  List of files to append to message body (optional).
    RETURNS:
      Always returns 1
    SIDE EFFECTS:
      Pipes the message to "/usr/sbin/sendmail -t".
      Files that cannot be opened are skipped silently.
      Can raise exceptions via called methods/functions; the pipe and any
      open attachment file are closed even then.
    """
    Tag = MAILTAG.upper()

    # damn, no good way to tell if this fails
    sm = os.popen("/usr/sbin/sendmail -t", "w")
    try:
        #
        # Sendmail will figure this out if not given.
        #
        if From:
            sm.write("From: %s\n" % From)
        if Headers:
            sm.write("%s\n" % Headers)
        sm.write("X-NetBed: %s\n" % SCRIPTNAME)
        sm.write("To: %s\n" % To)
        sm.write("Subject: %s: %s\n" % (Tag, Subj))
        # Blank line separates the headers from the body.
        sm.write("\n")
        sm.write("%s\n" % Msg)
        sm.write("\n")

        for fname in Files:
            try:
                infile = open(fname)
            except IOError:
                continue
            try:
                sm.write("\n--------- %s --------\n" % fname)
                for line in infile:
                    sm.write(line)
            finally:
                # Do not leak the descriptor if a write to the pipe fails.
                infile.close()

        sm.write("\n")
    finally:
        # Always close the pipe so the sendmail child is not left behind
        # when one of the writes above raises.
        sm.close()
    return 1
#
# General library functions
#
#
# Print out a timestamp with optional message
#
def TIMESTAMP(msgstr = ""):
    """
    Print a "TIMESTAMP:" line holding the current time as HH:MM:SS,
    followed by the optional message string.
    """
    now = time.strftime("%H:%M:%S")
    print("TIMESTAMP: %s %s" % (now, msgstr))
#
# Termination signals, and global var to track if we got one when
# they are disabled with disable_sigs
#
# The signals this library treats as termination signals.
TERMSIGS = (signal.SIGTERM, signal.SIGHUP, signal.SIGINT)
# Number of the last signal recorded by localSigHandler, or 0 when none
# is pending.  enable_sigs() resets it to 0 before raising SignalInterrupt.
gotsig = 0
class SignalInterrupt(Exception):
    """
    Raised by enable_sigs() when a signal was recorded while the signal
    watcher was installed.

    The number of the recorded signal is available as the 'signum'
    attribute.
    """
    def __init__(self, signum):
        # Initialize the base class too, so that str(), repr() and .args
        # carry the signal number instead of being empty.
        Exception.__init__(self, signum)
        self.signum = signum
#
# Keep track of last terminal signal received
#
def localSigHandler(signum, frame):
    """
    Signal handler that only remembers which signal arrived.

    The signal number is stored in the global 'gotsig'; when 'verbose'
    is set the signal is also reported on stdout.
    """
    global gotsig
    gotsig = signum
    if not verbose:
        return
    print("Caught signal %s" % signum)
def disable_sigs(sigs):
    """
    Install localSigHandler for each of the given signals.

    Python offers no way to temporarily block (rather than ignore)
    signals, so they are recorded by the watcher instead.

    ARGS:
      sigs: sequence of signal numbers.
    RETURNS:
      Dictionary mapping each signal number to the handler that was
      installed before; pass it to enable_sigs() to undo this call.
    """
    return dict([(signum, signal.signal(signum, localSigHandler))
                 for signum in sigs])
def enable_sigs(osigs):
    """
    Undo a disable_sigs() call.

    ARGS:
      osigs: dictionary of signal number -> handler, as returned by
             disable_sigs().
    SIDE EFFECTS:
      Reinstalls every handler in osigs.  If a signal was recorded in
      the global 'gotsig' in the meantime, clears 'gotsig' and raises
      SignalInterrupt carrying that signal number.
    """
    global gotsig
    for signum in osigs.keys():
        signal.signal(signum, osigs[signum])

    if not gotsig:
        return
    caught = gotsig
    gotsig = 0
    raise SignalInterrupt(caught)
#
# Local timeout error class and generic alarm handler
# Also listed are a couple of state saving vars for the alarm handler
# when the local one is installed. The *_alarm calls are nestable
#
class TimeoutError(Exception):
    """
    Raised by alrmhandler() when the SIGALRM timer fires.

    Derives from Exception so that it is a proper exception class;
    existing 'except TimeoutError' clauses keep working unchanged.
    """
    pass
def alrmhandler(signum, frame):
    """
    SIGALRM handler: converts the alarm into a TimeoutError exception,
    announcing it first when 'debug' is set.
    """
    if debug:
        print("Timeout! Raising TimeoutError.")
    raise TimeoutError()
# Both stacks are pushed by enable_alarm() and popped by disable_alarm(),
# one entry each per call, which is what makes those calls nestable.
# Handler entries are the values returned by signal.signal(); timeout
# entries are absolute deadlines on the time.time() clock, or 0 when no
# alarm was pending.
oalrmhandlerstk = [] # alarm handler stack
oalrmtmostk = [] # alarm timeout stack
def enable_alarm():
    """
    Install the local alarm handler, saving what it replaces.

    SIDE EFFECTS:
      Pushes the previous SIGALRM handler onto oalrmhandlerstk.
      Cancels any pending alarm and pushes its deadline (as an absolute
      time.time() value, or 0 if none was pending) onto oalrmtmostk.
    """
    previous = signal.signal(signal.SIGALRM, alrmhandler)
    oalrmhandlerstk.append(previous)

    # alarm(0) cancels the pending alarm and reports the seconds it had left.
    remaining = signal.alarm(0)
    deadline = remaining
    if remaining:
        deadline = remaining + time.time()
    oalrmtmostk.append(deadline)
def disable_alarm():
    """
    Undo the most recent enable_alarm() call.

    SIDE EFFECTS:
      Pops and reinstalls the saved SIGALRM handler.
      Pops the saved deadline; if one was pending and is still in the
      future the alarm is re-armed for the time remaining, otherwise a
      warning is written to stderr and SIGALRM is sent to this process
      immediately.
      Raises IndexError if there is no matching enable_alarm() call.
    """
    signal.signal(signal.SIGALRM, oalrmhandlerstk.pop())
    deadline = oalrmtmostk.pop()
    if not deadline:
        # No alarm was pending when enable_alarm() was called.
        return

    remaining = deadline - time.time()
    if remaining > 0:
        # signal.alarm() takes whole seconds.  Round up rather than
        # truncate: a sub-second remainder would otherwise become
        # alarm(0), which cancels the alarm instead of arming it.
        secs = int(remaining)
        if secs < remaining:
            secs += 1
        signal.alarm(secs)
    else:
        # No warn() helper is defined in this module, so report on
        # stderr directly.
        sys.stderr.write("missed a timeout deadline, "
                         "sending SIGALRM immediately!\n")
        os.kill(os.getpid(), signal.SIGALRM)
def ForkCmd(cmd, args=(), timeout=DEF_TIMEOUT,
            disable_sigs_parent=(), disable_sigs_child=()):
    """
    Fork and run the given command, and optionally timeout in the parent.

    ARGS:
      cmd:     <bound method | function> command to run.
      args:    <tuple> arguments to the above command.  A non-tuple
                       value is passed to cmd as its single argument.
      timeout: <int> seconds to wait for child/command to complete
                     before killing it off and returning
      disable_sigs_parent: <tuple of ints> signals to disable in parent
      disable_sigs_child:  <tuple of ints> signals to disable in child
    RETURNS:
      two element tuple.  The first element is a boolean, indicating whether
      or not an exception was caught while executing the command.  The second
      element is the return code from the command (which could be meaningless
      if an exception was caught).  It is os.WEXITSTATUS() of the child's
      wait status.
    SIDE EFFECTS:
      Forks child process to run provided command.  Blocks signals
      if instructed to with disable_sigs() (does an enable_sigs() before
      returning, which may raise SignalInterrupt).
      The child never returns from this function: it leaves through
      os._exit() with the command's return value (1 if the command raised).
    """
    # Imported locally: both are needed below, and this keeps the function
    # independent of what the module imports at top level.
    import errno
    import traceback

    # Name used in diagnostics.  Falls back to repr() so that reporting an
    # error can never itself fail on a callable lacking these attributes.
    cmdname = (getattr(cmd, "func_name", None) or
               getattr(cmd, "__name__", None) or repr(cmd))

    osigs = None
    childpid = os.fork()

    # parent
    if childpid:
        gotexc = 0
        excinfo = (None, None)
        # Placeholder wait status; os.WEXITSTATUS(256) is 1.
        exval = 256
        if disable_sigs_parent:
            osigs = disable_sigs(disable_sigs_parent)
        enable_alarm()
        signal.alarm(timeout)
        while 1:
            try:
                exval = os.waitpid(childpid, 0)[1]
            except TimeoutError:
                excinfo = sys.exc_info()[:2]
                print("Timeout waiting for command completion: %s" %
                      cmdname)
                gotexc = 1
                break
            except OSError:
                excinfo = sys.exc_info()[:2]
                # Interrupted syscall: just jump back on it.
                if excinfo[1].errno == errno.EINTR:
                    continue
                gotexc = 1
                break
            except:
                # Deliberately broad: whatever interrupted the wait, the
                # child still has to be cleaned up below.
                excinfo = sys.exc_info()[:2]
                gotexc = 1
                break
            else:
                break
        signal.alarm(0)
        if gotexc:
            # Use the exception captured inside the handler; it is not
            # reliably available from sys.exc_info() out here.
            tb = "".join(traceback.format_exception_only(*excinfo))
            print("Exception caught while trying to "
                  "run command %s\n%s" % (cmdname, tb))
            # Ask the child to bail out (see sigusrexit below); it may
            # already be gone, so failures here are ignored.
            try:
                os.kill(childpid, signal.SIGUSR1)
            except OSError:
                pass
            # Reap this specific child; a plain wait() could collect some
            # unrelated child of the process instead.
            try:
                exval = os.waitpid(childpid, 0)[1]
            except OSError:
                exval = 256
        else:
            if debug:
                if os.WIFEXITED(exval):
                    print("Process complete, exit value: %d" %
                          os.WEXITSTATUS(exval))
                if os.WIFSIGNALED(exval):
                    print("Process signalled: %d" %
                          os.WTERMSIG(exval))
        disable_alarm()
        if osigs:
            enable_sigs(osigs)
        return (gotexc, os.WEXITSTATUS(exval))

    # child
    def sigusrexit(signum, frame):
        if debug:
            print("Received SIGUSR1, bailing out")
        os._exit(1)

    # Exit status used when the command raises instead of returning.
    retval = 1
    if disable_sigs_child:
        osigs = disable_sigs(disable_sigs_child)
    signal.signal(signal.SIGUSR1, sigusrexit)
    try:
        if isinstance(args, tuple):
            retval = cmd(*args)
        else:
            retval = cmd(args)
    except:
        # Deliberately broad: the child must report and exit, never
        # propagate into the caller's code.
        traceback.print_exception(*sys.exc_info())
    try:
        os._exit(retval)
    except (TypeError, OverflowError):
        # The command returned something os._exit() cannot use as a
        # status.  Exit with the failure status rather than letting the
        # child fall out of this function and run the caller's code.
        os._exit(1)
def tryXmlrpcCmd(cmd, args = (),
                 inittries = DEF_TRIES,
                 sleepint = DEF_SLEEPINT,
                 raisefault = False):
    """
    This helper/wrapper function's job is to invoke the commands to the
    central agent, or local node manager, taking steps to retry and
    recover from failure.

    ARGS:
      cmd:        <bound method | function> command to try.
      args:       <tuple> arguments to pass to the above command.  A
                          non-tuple value is passed as a single argument;
                          an empty/false value means "no arguments".
      inittries:  <int> number of attempts before the function gives up
                        and reraises the last caught exception.
      sleepint:   <int> how long to sleep (in seconds) between retries.
      raisefault: <boolean> indicates whether or not to reraise an
                            xmlrpclib Fault exception when caught.  When
                            true it also adds a new 'triesleft' member to
                            the Fault instance containing the number of
                            attempts this function had remaining when the
                            Fault exception was encountered.
    RETURNS:
      This function returns the result returned by the passed in command.
    SIDE EFFECTS:
      Invokes the passed in command with the passed in arguments.
      Catches protocol/socket exceptions for command retry.
      Catches xmlrpclib.Fault exceptions for command retry unless
      raisefault is true.
      Understands TimeoutError exceptions: adds a 'triesleft' member and
      reraises them immediately.
      Any other exception raised by the command propagates unchanged.
    """
    # Imported locally so that socket.error in the except clause below
    # always resolves; without it every unexpected error from cmd would
    # turn into a NameError and hide the real failure.
    import socket

    # Name used in diagnostics.  Falls back to repr() so that reporting an
    # error can never itself fail on a callable lacking these attributes.
    cmdname = (getattr(cmd, "func_name", None) or
               getattr(cmd, "__name__", None) or repr(cmd))

    tries = inittries

    if debug:
        print("About to perform command %s with args:\n\t%s" %
              (cmd, args))

    while 1:
        tries = tries - 1
        try:
            if args:
                # have to differentiate since the '*' operator wants
                # a tuple - throws an exception if its operand isn't
                if isinstance(args, tuple):
                    return cmd(*args)
                else:
                    return cmd(args)
            else:
                return cmd()
        except xmlrpclib.Fault:
            e = sys.exc_info()[1]
            print("XML-RPC Fault happened while executing agent "
                  "command: %s" % cmdname)
            print("\tCode: %s, Error: %s" % (e.faultCode, e.faultString))
            if raisefault:
                e.triesleft = tries
                # Bare raise keeps the original traceback.
                raise
            if tries <= 0:
                # XXX: perhaps this should raise its own, new type of
                #      exception.
                print("Giving up after %s tries" % inittries)
                raise
        except TimeoutError:
            e = sys.exc_info()[1]
            if debug:
                print("Caught a timeout error, setting triesleft and raising.")
            e.triesleft = tries
            raise
        except (socket.error, xmlrpclib.ProtocolError):
            e = sys.exc_info()[1]
            print("Encountered problem communicating with agent "
                  "while executing command: %s" % cmdname)
            if debug:
                print("Exception is of type: %s" % (e,))
            if tries <= 0:
                # XXX: perhaps this should raise its own, new type of
                #      exception.
                print("Giving up after %s tries" % inittries)
                raise

        # Only reached after a retriable failure with attempts remaining.
        print("Sleeping for %s seconds, then retrying %s command" %
              (sleepint, cmdname))
        time.sleep(sleepint)
......@@ -17,7 +17,7 @@ SUBDIRS = libdslice etc
SBIN_STUFF = plabslice plabnode plabdaemon plabmetrics plabstats \
plabmonitord plablinkdata plabdist plabhttpd
LIB_STUFF = libplab.py
LIB_STUFF = libplab.py libtestbed.py libdb.py mod_dslice.py mod_PLC.py
LIBEXEC_STUFF = webplabstats
......
This document describes the Planetlab<->Elab adapter module API.
Modules that wish to function as adapters between Emulab and Planetlab
must implement all the functions in this document that are not marked
"optional".
** Design principles:
* Separation of concerns
To the greatest extent possible, Elab should not be aware of Plab
specifics, and vice versa. libplab is the Emulab-centric side of this
separation, while the Plab-specific access/adapter modules are in
separate files which represent different ways of interacting with
Planetlab (e.g., mod_PLC uses the PLC interface, while mod_dslice
uses the dslice interface).
* Modular architecture
Nodes and slices are encapsulated in classes. Some function
prototypes below specify these objects as parameters.
Node objects have the following accessible elements:
nodeid - string naming the node this object represents (Elab specific)
nodemeta - string of data specific to adapter module, related to node.
slice - slice object this node belongs to (or will belong to).
IP - IP address of this node
Slice objects have the following accessible elements:
slicename - string naming the slice this object represents
slicemeta - string of data specific to adapter module, related to slice.
Adapter modules should not rely on the presence of any other object
members.
** API:
CONSTANT: modname - text string naming this module - just informational.
FUNCTION: getFree
ARGS: None
RETURNS: list of IP addresses representing the available set of plab nodes.
SIDE EFFECTS: None
FUNCTION: createSlice
ARGS: slicename - REQUIRED string naming the slice to create
RETURNS: 2 element tuple (<int1>, <string1>)
<int1> - integer representing success (1) or failure (0)
<string1> - string containing module-specific data. 'None'
may be returned. This data will be fed back to
other module functions and will be saved off.
SIDE EFFECTS: Causes the named plab slice to be created.
FUNCTION: destroySlice
ARGS: slicename - REQUIRED string naming the slice to destroy
RETURNS: integer representing success (1) or failure (0)
SIDE EFFECTS: Causes the named plab slice to be destroyed/removed.
FUNCTION: createNode
ARGS: node - REQUIRED node object representing the node to create
RETURNS: 3 element tuple (<int1>, <string1>, <string2>)
<int1> - integer representing success (1) or failure (0)
<string1> - string containing node expiration date/time in
SQL time format: 'YYYY-MM-DD HH:MM:SS'
<string2> - string containing module-specific data for this node.
This string will be fed back to other module functions
and will be saved off.
SIDE EFFECTS: Allocates the given node to the given slice, and ensures
"emulabman" has ssh access to it. Should not
successfully return until the plab vnode has been
allocated and instantiated (i.e., is ready to interact
with). May be interrupted externally if it takes too long.
FUNCTION: freeNode
ARGS: node - REQUIRED node object representing the node to free
RETURNS: integer representing success (1) or failure (0)
SIDE EFFECTS: Causes the given node to be removed from the slice, and its
resources freed.
FUNCTION: renewNode
ARGS: node - REQUIRED node object representing the node to renew
length - OPTIONAL length of the renewal, in seconds.
RETURNS: 3 element tuple (<int1>, <string1>, <string2>)
<int1> - integer representing success (1) or failure (0)
<string1> - string containing node expiration date/time in
SQL time format: 'YYYY-MM-DD HH:MM:SS'
<string2> - string containing module-specific data for this node.
This string will be fed back to other module functions
and will be saved off.
SIDE EFFECTS: Causes this node's 'lease' or 'lifetime' to be renewed. This
function should strive to add as much time to the lease as
possible if 'length' is not specified.
# -*- python -*-
#
# Database functions
#
import sys
import os
import pwd