Commit d4a887e5 authored by Kirk Webb

More plab updates/changes

* implemented PLC slice renewal
* restructured daemon code/startup
  - removed getfree daemon (replaced by plabdiscover; run from cron)
  - moved generic daemonizing code into libtestbed (class)
  - created plabrenewd - small script that utilizes daemonizing class
  - removed plabdaemon file.
  - updated bossnode startup scripts
* changed slice prefix - PLC denies permission w/ anything other than "utah"
* Minor semantic changes to module API to be more consistent with other parts.
* Some bug fixes.
parent e1871b51
......@@ -910,7 +910,7 @@ IPBASE=10
SFSSUPPORT=1
PLABSUPPORT=0
PLAB_ROOTBALL="plabroot-9.tar.bz2"
PLAB_SLICEPREFIX="emulab"
PLAB_SLICEPREFIX="utah"
TBLOGFACIL="local5"
LINKTEST_NSPATH="/share/linktest-ns"
......@@ -1433,12 +1433,12 @@ outfiles="$outfiles Makeconf GNUmakefile \
tbsetup/plab/GNUmakefile tbsetup/plab/libplab.py \
tbsetup/plab/libtestbed.py tbsetup/plab/libdb.py \
tbsetup/plab/mod_dslice.py tbsetup/plab/mod_PLC.py \
tbsetup/plab/plabslice tbsetup/plab/plabnode tbsetup/plab/plabdaemon \
tbsetup/plab/plabslice tbsetup/plab/plabnode tbsetup/plab/plabrenewd \
tbsetup/plab/plabmetrics tbsetup/plab/plabstats \
tbsetup/plab/plabmonitord tbsetup/plab/plablinkdata \
tbsetup/plab/libdslice/GNUmakefile tbsetup/plab/etc/GNUmakefile \
tbsetup/plab/plabdist tbsetup/plab/plabhttpd \
tbsetup/plab/etc/netbed_files/GNUmakefile \
tbsetup/plab/plabdiscover tbsetup/plab/etc/netbed_files/GNUmakefile \
tbsetup/ipassign/GNUmakefile tbsetup/ipassign/src/GNUmakefile \
tbsetup/ipassign/ipassign_wrapper tbsetup/assign_prepass \
tip/GNUmakefile \
......
......@@ -102,7 +102,7 @@ IPBASE=10
SFSSUPPORT=1
PLABSUPPORT=0
PLAB_ROOTBALL="plabroot-9.tar.bz2"
PLAB_SLICEPREFIX="emulab"
PLAB_SLICEPREFIX="utah"
TBLOGFACIL="local5"
LINKTEST_NSPATH="/share/linktest-ns"
......@@ -478,12 +478,12 @@ outfiles="$outfiles Makeconf GNUmakefile \
tbsetup/plab/GNUmakefile tbsetup/plab/libplab.py \
tbsetup/plab/libtestbed.py tbsetup/plab/libdb.py \
tbsetup/plab/mod_dslice.py tbsetup/plab/mod_PLC.py \
tbsetup/plab/plabslice tbsetup/plab/plabnode tbsetup/plab/plabdaemon \
tbsetup/plab/plabslice tbsetup/plab/plabnode tbsetup/plab/plabrenewd \
tbsetup/plab/plabmetrics tbsetup/plab/plabstats \
tbsetup/plab/plabmonitord tbsetup/plab/plablinkdata \
tbsetup/plab/libdslice/GNUmakefile tbsetup/plab/etc/GNUmakefile \
tbsetup/plab/plabdist tbsetup/plab/plabhttpd \
tbsetup/plab/etc/netbed_files/GNUmakefile \
tbsetup/plab/plabdiscover tbsetup/plab/etc/netbed_files/GNUmakefile \
tbsetup/ipassign/GNUmakefile tbsetup/ipassign/src/GNUmakefile \
tbsetup/ipassign/ipassign_wrapper tbsetup/assign_prepass \
tip/GNUmakefile \
......
......@@ -4,11 +4,9 @@
#
case "$1" in
start)
if [ -x @prefix@/sbin/plabdaemon ]; then
echo -n " plabrenew"
@prefix@/sbin/plabdaemon -v renew
echo -n " plabgetfree"
@prefix@/sbin/plabdaemon -v getfree
if [ -x @prefix@/sbin/plabrenewd ]; then
echo -n " plabrenewd"
@prefix@/sbin/plabrenewd -v
fi
if [ -x @prefix@/sbin/plabmonitord ]; then
......
......@@ -4,9 +4,12 @@ import sys
import os
import time
import signal
import syslog
import traceback
import xmlrpclib
from warnings import warn
#
# Constants
#
......@@ -23,6 +26,8 @@ DEF_SLEEPINT = 5
DEF_TRIES = 3
DEF_TIMEOUT = 1*60 # default timeout interval
MAXCONSECEXC = 3
#
# informational output control variables
#
......@@ -389,3 +394,138 @@ def tryXmlrpcCmd(cmd, args = (),
# exception.
print "Giving up after %s tries" % inittries
raise
class logger:
    """
    Minimal write-only, file-like object that forwards text to syslog.

    Text handed to write() is buffered and emitted one complete line at a
    time (syslog entries are line oriented).  An instance can be assigned
    to sys.stdout / sys.stderr so that print output ends up in syslog.
    """

    def __init__(self, logname):
        """
        Open the syslog connection, tagging entries with logname and the
        process id, using the LOG_USER facility.
        """
        syslog.openlog(logname, syslog.LOG_PID, syslog.LOG_USER)
        # Text received so far that has not been terminated by a newline.
        self.buf = ""
        return

    def close(self):
        """
        Emit any trailing partial line, then close the syslog connection.
        """
        # Without this, output that was not newline terminated would be
        # silently lost when the log is closed.
        if self.buf:
            syslog.syslog(self.buf)
            self.buf = ""
        syslog.closelog()
        return

    def flush(self):
        """
        No-op; present only to satisfy the file-like interface.  A partial
        line is deliberately kept buffered so that one logical line is not
        split across several syslog entries.
        """
        pass

    def write(self, str):
        """
        Buffer str and send every complete (newline terminated) line to
        syslog.  Whatever follows the last newline stays buffered.
        """
        self.buf += str
        lines = self.buf.split("\n")
        # The last element is the unterminated remainder ("" if the
        # buffer ended with a newline).
        self.buf = lines.pop()
        for line in lines:
            syslog.syslog(line)
        return # XXX: need to return # bytes written?
class pydaemon:
SYSLOG = "__SysLog__"
def __init__(self, logname = ""):
self.logname = logname
return
def daemonize(self):
"""
Fork off into a daemon process, redirecting stdout and stderr to
logfile.
Based on code from the ASPN Python Cookbook.
"""
# First fork
if os.fork():
sys.exit(0)
pass
# Decouple from parent environment.
os.chdir("/")
os.umask(0)
os.setsid()
# Second fork
if os.fork():
sys.exit(0)
pass
# Redirect standard fd's
si = open("/dev/null", 'r')
so = open("/dev/null", 'a+', 0)
os.dup2(si.fileno(), sys.stdin.fileno())
os.dup2(so.fileno(), sys.stdout.fileno())
os.dup2(so.fileno(), sys.stderr.fileno())
# Redirect output
outfile = None
if self.logname == self.SYSLOG:
outfile = logger(SCRIPTNAME)
pass
elif self.logname:
outfile = open(logname, "a+")
pass
else:
return
if outfile:
sys.stdout = sys.stderr = outfile
pass
else:
print "Couldn't open output log"
pass
return
def daemonLoop(self, func, period, maxconsecexc = MAXCONSECEXC):
"""
Forks off into a daemon process with output directed to logfile, and
calls the given func every period seconds.
"""
import time
import traceback
consecexc = maxconsecexc
while True:
start = time.clock()
try:
func()
pass
except SignalInterrupt, e:
print "Received signal %s in daemon loop, exiting." % e.signum
sys.exit(0)
pass
except KeyboardInterrupt:
print "Received keyboard interrupt in daemon loop, exiting."
sys.exit(1)
pass
except:
print "Exception caught in plab daemon loop:"
print "".join(traceback.format_exception(*sys.exc_info()))
consecexc -= 1
if consecexc > 0:
print "Going back to sleep until next scheduled run"
else:
print "Too many consecutive exceptions seen, bailing out!"
SENDMAIL(TBOPS, "%s Exiting",
"The plab %s daemon has seen too many "
"consecutive exceptions and is bailing out."
"Someone needs to check the log!" %
(SCRIPTNAME, func.func_name), TBOPS)
raise
pass
else:
consecexc = maxconsecexc
pass
end = time.clock()
if end - start < period:
wait = period - (end - start)
print "Sleeping %g seconds" % wait
time.sleep(wait)
pass
pass
return # NOTREACHED
......@@ -14,8 +14,8 @@ include $(OBJDIR)/Makeconf
SUBDIRS = libdslice etc
SBIN_STUFF = plabslice plabnode plabdaemon plabmetrics plabstats \
plabmonitord plablinkdata plabdist plabhttpd
SBIN_STUFF = plabslice plabnode plabrenewd plabmetrics plabstats \
plabmonitord plablinkdata plabdist plabhttpd plabdiscover
LIB_STUFF = libplab.py libtestbed.py libdb.py mod_dslice.py mod_PLC.py
......
......@@ -43,7 +43,6 @@ agents = {'PLC' : mod_PLC,
#
DEF_AGENT = "PLC";
LEASELEN = 14*24*60*60 # Two weeks (maximum lease length)
RENEW_TIME = 2*24*60*60 # Renew two days before lease expires
RENEW_TIMEOUT = 1*60 # give the node manager a minute to respond to renew
......@@ -58,7 +57,8 @@ MONITOR_PID = "emulab-ops"
MONITOR_EID = "plab-monitor"
MAGIC_INET2_GATEWAYS = ("205.124.237.10", )
MAGIC_INET_GATEWAYS = ("205.124.249.123", "205.124.249.113")
MAGIC_INET_GATEWAYS = ("205.124.249.123", "205.124.249.113",
"205.124.249.121", "205.124.249.115")
LOCAL_PLAB_DOMAIN = ".flux.utah.edu"
LOCAL_PLAB_LINKTYPE = "pcplabinet2"
# right now these are the only 2.0 machines running the new slice interface:
......@@ -408,7 +408,7 @@ class Plab:
osigs = disable_sigs(TERMSIGS)
defosid, controliface = self.__getNodetypeInfo()
nodeid = NODEPREFIX + nodeent['NODEID']
priority = PLABBASEPRIO + int(nodeid)
priority = PLABBASEPRIO + int(nodeent['NODEID'])
hostonly = nodeent['HNAME'].replace(".", "-")
site = BADSITECHARS.sub("-", nodeent['SITE'])
......@@ -559,65 +559,124 @@ class Plab:
return nodeid, priority
# XXX: Review..
# XXX: might want to just call into slice.renew and let module specific
# code deal with individual node renew if it tracks at that level.
def renew(self):
"""
Renews all of the Plab leases that are going to expire soon.
"""
print "Renewing Plab leases ..."
# Ugh, MySQL doesn't know UTC until v4.1.1, and unix_timestamp()
# returns the local time
import time
now = int(time.mktime(time.gmtime()))
endtime = now + RENEW_TIME
res = DBQueryFatal("select node_id, pid, eid,"
" unix_timestamp(leaseend) from plab_slice_nodes"
" where %s > unix_timestamp(leaseend)",
(endtime, ))
res = DBQueryFatal("select NULL, pid, eid,"
" unix_timestamp(leaseend) from plab_slices"
" where leaseend != NULL and"
" %s > unix_timestamp(leaseend)",
(endtime, ))
res += DBQueryFatal("select node_id, pid, eid,"
" unix_timestamp(leaseend) from plab_slice_nodes"
" where leaseend != NULL and"
" %s > unix_timestamp(leaseend)",
(endtime, ))
loadedSlices = {}
global failedrenew
global failedrenew # XXX
newfail = []
failsoon = []
ret = 0
for entry in res:
nodeid, pid, eid, tstamp = entry
if tstamp <= now:
if nodeid:
print "WARNING: Node lease for %s (%s/%s) has expired!" % \
(nodeid, pid, eid)
pass
else:
print "WARNING: Slice lease for %s/%s has expired!" % \
(pid, eid)
pass
continue
try:
slice = loadedSlices[(pid, eid)]
pass
except KeyError:
slice = self.loadSlice(pid, eid)
loadedSlices[(pid, eid)] = slice
node = slice.loadNode(nodeid)
if tstamp <= now:
print "WARNING: Lease for %s %s/%s has expired!" % entry[:3]
continue
pass
if node.renew():
print "Failed to renew lease for %s %s/%s" % entry[:3]
if nodeid:
node = slice.loadNode(nodeid)
ret = node.renew()
pass
else:
try:
res, slicemeta, leaselen = \
self.agent.renewSlice(slice.slicename)
ret = 1
pass
except:
ret = 0
if not ret:
if nodeid:
print "Failed to renew lease for %s (%s/%s)" % \
(nodeid, pid, eid)
pass
else:
print "Failed to renew lease for %s/%s" % \
(pid, eid)
pass
if entry not in failedrenew:
newfail.append(entry)
pass
if (tstamp - now) < (2*3600):
failsoon.append(entry)
pass
pass
else:
if entry in failedrenew:
failedrenew.remove(entry)
if newfail:
failedrenew += newfail
failstr = ""
for n in newfail:
failstr += "%s %s/%s\n" % n[:3]
if n[0]:
failstr += "%s (%s/%s)\n" % n[:3]
pass
else:
failstr += "%s/%s\n" % n[1:3]
pass
pass
SENDMAIL(TBOPS, "Lease renewal(s) failed",
"Failed to renew lease on the following nodes:\n%s" %
failstr + "\n\nPlease check the plabrenew log", TBOPS)
pass
if failsoon:
failstr = ""
for n in failsoon:
failstr += "%s %s/%s: expires: %s\n" % \
(n[:3] + (time.ctime(n[3]),))
if n[0]:
failstr += "%s (%s/%s): expires: %s\n" % \
(n[:3] + (time.ctime(n[3]),))
pass
else:
failstr += "%s/%s: expires: %s\n" % \
(n[1:3] + (time.ctime(n[3]),))
pass
SENDMAIL(TBOPS, "WARNING: PLAB leases are about to expire",
"The following plab leases are about to expire:\n%s" %
failstr + "\n\nPlease look into it!", TBOPS)
pass
return
pass # end class Plab
#
......@@ -646,20 +705,18 @@ class Slice:
self.slicename = "%s_%s" % (SLICEPREFIX, eindex)
print "Creating Plab slice %s." % self.slicename
try:
res, self.slicemeta = \
self.plab.agent.createSlice(self.slicename)
if res:
DBQueryFatal("insert into plab_slices"
" (pid, eid, slicename, slicemeta) "
" values (%s, %s, %s, %s)",
(self.pid, self.eid, self.slicename,
self.slicemeta))
pass
res, self.slicemeta, self.leaseend = \
self.plab.agent.createSlice(self)
DBQueryFatal("insert into plab_slices"
" values (%s, %s, %s, %s, FROM_UNIXTIME(%s))",
(self.pid, self.eid, self.slicename,
self.slicemeta, self.leaseend))
pass
except:
self.plab.agent.deleteSlice(self.slicename)
self.plab.agent.deleteSlice(self)
DBQueryFatal("delete from plab_slices where slicename=%s",
(self.slicename,))
raise
......@@ -686,6 +743,31 @@ class Slice:
((self.slicename, self.slicemeta), ) = res
pass
def renew(self):
"""
Renews slice lease, if applicable to selected backend agent.
"""
print "Renewing lease for slice %s" % self.slicename
ret = 0
try:
res, self.slicemeta, self.leaseend = \
self.plab.agent.renewSlice(self)
DBQueryFatal("replace into plab_slices"
" values (%s, %s, %s, %s, FROM_UNIXTIME(%s))",
(self.pid, self.eid, self.slicename,
self.slicemeta, self.leaseend))
ret = 1
pass
except:
print "Slice renewal failed!"
traceback.print_exc()
ret = 0
pass
return ret
def destroy(self):
"""
Frees all nodes in this slice and destroys the slice. Note
......@@ -706,7 +788,7 @@ class Slice:
osigs = disable_sigs(TERMSIGS)
try:
self.plab.agent.deleteSlice(self.slicename)
self.plab.agent.deleteSlice(self)
pass
except:
print "Failed to delete slice!"
......@@ -786,7 +868,7 @@ class Node:
print "Creating Plab node %s on %s." % (self.nodeid, self.IP)
res, self.nodemeta, self.leaselen = \
self.slice.plab.agent.createNode(self)
self.slice.plab.agent.createNode(self)
DBQueryFatal("replace into plab_slice_nodes"
" (pid, eid, slicename, node_id,"
......@@ -865,7 +947,7 @@ class Node:
"""
res = ForkCmd(self._renew, timeout = RENEW_TIMEOUT,
disable_sigs_parent = TERMSIGS)
return res[0] | res[1]
return not (res[0] | res[1])
def _renew(self):
res, self.nodemeta, self.leaseend = \
......
......@@ -4,9 +4,12 @@ import sys
import os
import time
import signal
import syslog
import traceback
import xmlrpclib
from warnings import warn
#
# Constants
#
......@@ -23,6 +26,8 @@ DEF_SLEEPINT = 5
DEF_TRIES = 3
DEF_TIMEOUT = 1*60 # default timeout interval
MAXCONSECEXC = 3
#
# informational output control variables
#
......@@ -389,3 +394,138 @@ def tryXmlrpcCmd(cmd, args = (),
# exception.
print "Giving up after %s tries" % inittries
raise
class logger:
    """
    Minimal write-only, file-like object that forwards text to syslog.

    Text handed to write() is buffered and emitted one complete line at a
    time (syslog entries are line oriented).  An instance can be assigned
    to sys.stdout / sys.stderr so that print output ends up in syslog.
    """

    def __init__(self, logname):
        """
        Open the syslog connection, tagging entries with logname and the
        process id, using the LOG_USER facility.
        """
        syslog.openlog(logname, syslog.LOG_PID, syslog.LOG_USER)
        # Text received so far that has not been terminated by a newline.
        self.buf = ""
        return

    def close(self):
        """
        Emit any trailing partial line, then close the syslog connection.
        """
        # Without this, output that was not newline terminated would be
        # silently lost when the log is closed.
        if self.buf:
            syslog.syslog(self.buf)
            self.buf = ""
        syslog.closelog()
        return

    def flush(self):
        """
        No-op; present only to satisfy the file-like interface.  A partial
        line is deliberately kept buffered so that one logical line is not
        split across several syslog entries.
        """
        pass

    def write(self, str):
        """
        Buffer str and send every complete (newline terminated) line to
        syslog.  Whatever follows the last newline stays buffered.
        """
        self.buf += str
        lines = self.buf.split("\n")
        # The last element is the unterminated remainder ("" if the
        # buffer ended with a newline).
        self.buf = lines.pop()
        for line in lines:
            syslog.syslog(line)
        return # XXX: need to return # bytes written?
class pydaemon:
SYSLOG = "__SysLog__"
def __init__(self, logname = ""):
self.logname = logname
return
def daemonize(self):
"""
Fork off into a daemon process, redirecting stdout and stderr to
logfile.
Based on code from the ASPN Python Cookbook.
"""
# First fork
if os.fork():
sys.exit(0)
pass
# Decouple from parent environment.
os.chdir("/")
os.umask(0)
os.setsid()
# Second fork
if os.fork():
sys.exit(0)
pass
# Redirect standard fd's
si = open("/dev/null", 'r')
so = open("/dev/null", 'a+', 0)
os.dup2(si.fileno(), sys.stdin.fileno())
os.dup2(so.fileno(), sys.stdout.fileno())
os.dup2(so.fileno(), sys.stderr.fileno())
# Redirect output
outfile = None
if self.logname == self.SYSLOG:
outfile = logger(SCRIPTNAME)
pass
elif self.logname:
outfile = open(logname, "a+")
pass
else:
return
if outfile:
sys.stdout = sys.stderr = outfile
pass
else:
print "Couldn't open output log"
pass
return
def daemonLoop(self, func, period, maxconsecexc = MAXCONSECEXC):
"""
Forks off into a daemon process with output directed to logfile, and
calls the given func every period seconds.
"""
import time
import traceback
consecexc = maxconsecexc
while True:
start = time.clock()
try:
func()
pass
except SignalInterrupt, e:
print "Received signal %s in daemon loop, exiting." % e.signum
sys.exit(0)
pass
except KeyboardInterrupt:
print "Received keyboard interrupt in daemon loop, exiting."
sys.exit(1)
pass
except:
print "Exception caught in plab daemon loop:"
print "".join(traceback.format_exception(*sys.exc_info()))
consecexc -= 1
if consecexc > 0:
print "Going back to sleep until next scheduled run"
else:
print "Too many consecutive exceptions seen, bailing out!"
SENDMAIL(TBOPS, "%s Exiting",
"The plab %s daemon has seen too many "
"consecutive exceptions and is bailing out."
"Someone needs to check the log!" %
(SCRIPTNAME, func.func_name), TBOPS)
raise
pass
else:
consecexc = maxconsecexc
pass
end = time.clock()
if end - start < period:
wait = period - (end - start)
print "Sleeping %g seconds" % wait
time.sleep(wait)
pass
pass
return # NOTREACHED
......@@ -86,6 +86,7 @@ class PLCagent:
def listSlice(self):
return self.__server.listSlice(self.__auth)
pass # end of PLCagent class
class mod_PLC:
......@@ -93,9 +94,10 @@ class mod_PLC:
self.modname = "mod_PLC"
pass
def createSlice(self, slicename):
def createSlice(self, slice):
agent = PLCagent(slicename)
agent = PLCagent(slice.slicename)
res = None
try:
res = tryXmlrpcCmd(agent.createSlice)
......@@ -104,7 +106,7 @@ class mod_PLC:
pass
pass
except:
print "Failed to create slice %s" % slicename
print "Failed to create slice %s" % slice.slicename
raise
try:
......@@ -115,7 +117,7 @@ class mod_PLC:
pass
pass
except:
print "Failed to assign emulabman to slice %s" % slicename
print "Failed to assign emulabman to slice %s" % slice.slicename
raise
try:
......@@ -127,16 +129,36 @@ class mod_PLC:
pass
pass
except:
print "Failed to assign shares to slice %s" % slicename
print "Failed to assign shares to slice %s" % slice.slicename
raise
return (res, None)
leaseend = time.time() + DEF_PLC_LEASELEN
return (res, None, leaseend)
def deleteSlice(self, slice):
    """
    Delete the PLC slice named by slice.slicename.

    Any exception raised by the XML-RPC call is propagated to the caller.
    """
    tryXmlrpcCmd(PLCagent(slice.slicename).deleteSlice)
def renewSlice(self, slice):
agent = PLCagent(slice.slicename)
res = None
try:
res = tryXmlrpcCmd(agent.AssignShares,
(DEF_PLC_LEASELEN,
DEF_PLC_SHARES))
if debug:
print res
pass
pass
except:
print "Failed to assign shares to slice %s" % slice.slicename
raise
leaseend = time.time() + DEF_PLC_LEASELEN
return (res, None, leaseend)
def createNode(self, node):
# add the node to the PLC slice.
agent = PLCagent(node.slice.slicename)
......@@ -184,10 +206,10 @@ class mod_PLC:
except:
print "Failed to instantiate sliver %s on slice %s" % \
(node.nodeid, node.slice.slicename)
self.freeNode(node)
raise
leaselen = time.time() + DEF_PLC_LEASELEN
return (res, None, leaselen)
return (res, None, None)