Commit 1b6ef602 authored by David Johnson

These are the rest of the changes that have been accumulating in my dev
tree for v4 PlanetLab node support.  Currently, we support both v3 and
v4 NMs via a small wrapper, and we distribute different versions of the
rootball depending on the NM version.  Also updated various parts of libplab
to log the success or failure of interactions with PlanetLab nodes to the
database; plabmonitord.in has the beginnings of support for that as well.
parent c7999da3
...@@ -143,6 +143,31 @@ DEF_SLICE_DESC = "Slice created by Emulab" ...@@ -143,6 +143,31 @@ DEF_SLICE_DESC = "Slice created by Emulab"
PLABEXPIREWARN = 1*WEEK # one week advance warning for slice expiration. PLABEXPIREWARN = 1*WEEK # one week advance warning for slice expiration.
NODEEXPIREWARN = 2*WEEK+2*DAY # about two weeks advance warning for slivers. NODEEXPIREWARN = 2*WEEK+2*DAY # about two weeks advance warning for slivers.
#
# This is a trigger table for dist'ing out multiple rootballs depending on
# which NM is running on the node. Since we don't know which version is
# running on a node until we call into (DEF|COMPAT)_AGENT and actually connect
# to the node, it's based off classnames.
#
# The table specifies a prefix to the default rootball name; this rootball
# should be placed in the normal location.
#
# If the plcagent class and nodeagent class are both not in the trigger table,
# we simply push the default rootball.
#
# Note that we could store version info in the database, but that doesn't
# really have long-term value. Plus, we have to be checking sites.xml all the
# time to catch version changes during rollout. This way, plabmonitord and
# the web interface know exactly what is the version whenever a node is
# contacted for setup.
#
# For now, only NM4agent has a custom tarball, since v4 nodes are in the
# distinct minority right now.
#
# Trigger table: { plcagent class : { nodeagent class : rootball prefix } }.
# NOTE(review): the import below binds only NM4agent; the outer key
# `mod_PLC4` must already be in scope from elsewhere in this file.  The table
# is consulted with `self.slice.plab.agent.__class__` as the outer key, so
# confirm that `mod_PLC4` here names the PLC agent class and not the module.
from mod_PLC4 import NM4agent
rootball_triggers = { mod_PLC4 : { NM4agent : 'NM4-' } }
# #
# var to track failed renewals # var to track failed renewals
# #
...@@ -1393,7 +1418,6 @@ class Slice: ...@@ -1393,7 +1418,6 @@ class Slice:
wrap_around(Slice._create, timeAdvice) wrap_around(Slice._create, timeAdvice)
wrap_around(Slice.destroy, timeAdvice) wrap_around(Slice.destroy, timeAdvice)
# #
# Node abstraction # Node abstraction
# #
...@@ -1401,12 +1425,33 @@ class Node: ...@@ -1401,12 +1425,33 @@ class Node:
def __init__(self, slice, nodeid, pollNode = False): def __init__(self, slice, nodeid, pollNode = False):
self.slice = slice self.slice = slice
self.nodeid = nodeid self.nodeid = nodeid
self.IP = self.__findIP() #self.IP = self.__findIP()
#self.hostname = self.__findHostname()
#self.phys_nodeid = self.__findPhysNodeID()
(self.IP,self.hostname,self.phys_nodeid) = self.__findHostInfo()
self.leaseend = 0 self.leaseend = 0
self.nodemeta = None self.nodemeta = None
self.pollNode = pollNode self.pollNode = pollNode
# must be set in mod_<PLCAGENT>.createNode if you want to use
# multiple rootball and triggering support
self.nmagent = None
return return
def __logNodeHist(self,component,operation,status,msg):
try:
DBQueryFatal("insert into plab_nodehist values "
"(NULL,%s,%s,%s,%s,%s,%s,%s)",
(self.nodeid,self.phys_nodeid,
time.strftime("%Y-%m-%d %H:%M:%S",
time.localtime(time.time())),
component,operation,status,str(msg)))
except:
# do nothing
print "WARNING: could not log (%s,%s,%s,%s) into plab_nodehist!" % \
(component,operation,status,msg)
pass
pass
# XXX: may want to rethink signal handling here. # XXX: may want to rethink signal handling here.
def _create(self, force=False): def _create(self, force=False):
""" """
...@@ -1430,8 +1475,15 @@ class Node: ...@@ -1430,8 +1475,15 @@ class Node:
pass pass
print "Creating Plab node %s on %s." % (self.nodeid, self.IP) print "Creating Plab node %s on %s." % (self.nodeid, self.IP)
res, self.nodemeta, self.leaseend = \ res = None
self.slice.plab.agent.createNode(self) try:
res, self.nodemeta, self.leaseend = \
self.slice.plab.agent.createNode(self)
self.__logNodeHist('node','create','success','')
except:
self.__logNodeHist('node','create','failure',
traceback.format_exception(*sys.exc_info()))
raise
DBQueryFatal("replace into plab_slice_nodes" DBQueryFatal("replace into plab_slice_nodes"
" (exptidx, pid, eid, slicename, node_id," " (exptidx, pid, eid, slicename, node_id,"
...@@ -1521,7 +1573,14 @@ class Node: ...@@ -1521,7 +1573,14 @@ class Node:
"\n\n%s" % (self.nodeid, tbstr), TBOPS) "\n\n%s" % (self.nodeid, tbstr), TBOPS)
pass pass
deleted = self.slice.plab.agent.freeNode(self) deleted = 0
try:
deleted = self.slice.plab.agent.freeNode(self)
self.__logNodeHist('node','free','success','')
except:
self.__logNodeHist('node','free','failure',
traceback.format_exception(*sys.exc_info()))
raise
TIMESTAMP("freenode %s finished." % self.nodeid) TIMESTAMP("freenode %s finished." % self.nodeid)
return not deleted return not deleted
...@@ -1537,8 +1596,16 @@ class Node: ...@@ -1537,8 +1596,16 @@ class Node:
return res[0] | res[1] return res[0] | res[1]
def _renew(self): def _renew(self):
res, self.nodemeta, self.leaseend = \ res = None
self.slice.plab.agent.renewNode(self) try:
res, self.nodemeta, self.leaseend = \
self.slice.plab.agent.renewNode(self)
self.__logNodeHist('node','renew','success','')
except:
self.__logNodeHist('node','renew','failure',
traceback.format_exception(*sys.exc_info()))
raise
DBQueryFatal("update plab_slice_nodes" DBQueryFatal("update plab_slice_nodes"
" set nodemeta = %s, leaseend = %s" " set nodemeta = %s, leaseend = %s"
" where node_id = %s", " where node_id = %s",
...@@ -1556,6 +1623,41 @@ class Node: ...@@ -1556,6 +1623,41 @@ class Node:
Emulab/Plab node. Primarily, this unpacks the magic files on to Emulab/Plab node. Primarily, this unpacks the magic files on to
the node. the node.
""" """
# check to see if we should use a custom rootball
rrootballname = rootballname
try:
if (not self.nmagent == None) \
and (not self.slice.plab.agent == None):
tpc = self.slice.plab.agent.__class__
tnc = self.nmagent.getAgentClass()
if rootball_triggers.has_key(tpc) \
and rootball_triggers[tpc].has_key(tnc):
# found a valid prefix for the rootball; update the name
rrootballname = rootball_triggers[tpc][tnc] + rootballname
if debug:
print "Updated rootball name from %s to %s" % (rootballname,
rrootballname)
pass
pass
else:
if debug:
print "rbtriggers: %s; looking for %s/%s" % (str(rootball_triggers),
str(tpc),
str(tnc))
pass
pass
pass
else:
if debug:
print "One of the agents was null!"
pass
pass
pass
except:
print "WARNING: exception while trying to update rootball name"
traceback.print_exc()
pass
TIMESTAMP("emulabify started on %s." % self.nodeid) TIMESTAMP("emulabify started on %s." % self.nodeid)
print "Overlaying Emulab files on %s ..." % self.nodeid print "Overlaying Emulab files on %s ..." % self.nodeid
try: try:
...@@ -1573,7 +1675,13 @@ class Node: ...@@ -1573,7 +1675,13 @@ class Node:
print "Adding slice user to 'root' group on %s failed; " \ print "Adding slice user to 'root' group on %s failed; " \
"attempting to carry on anyway." % self.nodeid "attempting to carry on anyway." % self.nodeid
pass pass
self.unpackRootball(rootballpath, rootballname) try:
self.unpackRootball(rootballpath, rrootballname)
self.__logNodeHist('node','emulabify','success','')
except:
self.__logNodeHist('node','emulabify','failure',
traceback.format_exception(*sys.exc_info()))
raise
TIMESTAMP("emulabify finished on %s." % self.nodeid) TIMESTAMP("emulabify finished on %s." % self.nodeid)
def addToGroup(self, user, group): def addToGroup(self, user, group):
...@@ -1711,6 +1819,59 @@ class Node: ...@@ -1711,6 +1819,59 @@ class Node:
if debug: if debug:
print "IP is %s for node %s" % (IP, self.nodeid) print "IP is %s for node %s" % (IP, self.nodeid)
return IP return IP
def __findHostname(self):
"""
Grabs the publicly-routable hostname of the remote node.
"""
res = DBQueryFatal("select pm.hostname,i.IP from nodes as nv"
" left join interfaces as i"
" on nv.phys_nodeid=i.node_id"
" left join plab_mapping as pm"
" on i.IP=pm.IP"
" where nv.node_id='%s'" % (self.nodeid))
if (not res or len(res) == 0):
print "Warning: no hostname found for nodeid %s" % self.nodeid
hostname = None
pass
else:
((hostname,IP),) = res
pass
if debug:
print "hostname is %s for node %s" % (hostname,IP)
pass
return hostname
#
# Returns (IP,hostname,phys_nodeid).
#
def __findHostInfo(self):
"""
Grabs the publicly-routable IP and hostname of the remote node,
and also our phys_nodeid for it.
"""
res = DBQueryFatal("select i.IP,pm.hostname,nv.phys_nodeid "
" from nodes as nv"
" left join interfaces as i"
" on nv.phys_nodeid=i.node_id"
" left join plab_mapping as pm"
" on i.IP=pm.IP"
" where nv.node_id='%s'" % (self.nodeid))
if (not res or len(res) == 0):
print "Warning: no hostinfo found for nodeid %s" % self.nodeid
(IP,hostname,phys_nodeid) = (None,None,None)
pass
else:
((IP,hostname,phys_nodeid),) = res
pass
if debug:
print "hostname is %s for node %s" % (hostname,IP)
pass
return (IP,hostname,phys_nodeid)
pass # end of class Node pass # end of class Node
# AOP wrappers for class Node # AOP wrappers for class Node
wrap_around(Node._create, timeAdvice) wrap_around(Node._create, timeAdvice)
......
...@@ -23,12 +23,14 @@ import time ...@@ -23,12 +23,14 @@ import time
import calendar import calendar
import cPickle import cPickle
import os import os
import socket
from libtestbed import * from libtestbed import *
from aspects import wrap_around from aspects import wrap_around
from timer_advisories import timeAdvice from timer_advisories import timeAdvice
#import sshhttp #from sshhttp import *
import popen2
# #
# output control vars # output control vars
...@@ -46,8 +48,8 @@ DEF_PLC_USER = "" ...@@ -46,8 +48,8 @@ DEF_PLC_USER = ""
DEF_PLC_PASS = "" DEF_PLC_PASS = ""
DEF_PLC_PASS_FILE = "@prefix@/etc/plab/plc.pw" DEF_PLC_PASS_FILE = "@prefix@/etc/plab/plc.pw"
#DEF_NM_PORT = "812" DEF_NM_PORT = "812"
DEF_NM_PORT = "814" DEF_NM_LEGACY_PORT = "814"
# #
# A bunch of time constants / intervals (in seconds) # A bunch of time constants / intervals (in seconds)
...@@ -67,12 +69,13 @@ INSTMETHOD_PLCINST = "plc-instantiated" ...@@ -67,12 +69,13 @@ INSTMETHOD_PLCINST = "plc-instantiated"
DEF_EMULAB_INSTMETHOD = INSTMETHOD_DELEGATED DEF_EMULAB_INSTMETHOD = INSTMETHOD_DELEGATED
# XXX: need to figure out what these are in the new NM's context # XXX: need to figure out what these are in the new NM's context
# hm, doesn't seem to have a version() call
MAJOR_VERS = 1 MAJOR_VERS = 1
MINOR_VERS = 0 MINOR_VERS = 0
MIN_REV = 10 MIN_REV = 10
class NMagent: class NM3agent:
def __init__(self, IP, nodeid, nmport = DEF_NM_PORT): def __init__(self, IP, nodeid, nmport = DEF_NM_LEGACY_PORT):
self.__server = xmlrpclib.ServerProxy("http://" + IP + ":" + self.__server = xmlrpclib.ServerProxy("http://" + IP + ":" +
nmport + "/") nmport + "/")
self.__vers = [0,0,0] self.__vers = [0,0,0]
...@@ -81,10 +84,16 @@ class NMagent: ...@@ -81,10 +84,16 @@ class NMagent:
pass pass
def create_sliver(self, ticket): def create_sliver(self, ticket):
return self.__server.create_sliver(xmlrpclib.Binary(ticket)) res = self.__server.create_sliver(xmlrpclib.Binary(ticket))
if debug:
print "NM3: create_sliver: res = %s" % str(res)
return res
def delete_sliver(self, rcap): def delete_sliver(self, rcap):
return self.__server.delete_sliver(rcap) res = self.__server.delete_sliver(rcap)
if debug:
print "NM3: delete_sliver: res = %s" % str(res)
return res
def version(self): def version(self):
if self.__vers == [0,0,0]: if self.__vers == [0,0,0]:
...@@ -97,18 +106,235 @@ class NMagent: ...@@ -97,18 +106,235 @@ class NMagent:
pass pass
pass pass
return self.__vers return self.__vers
def getAgentClass(self):
return self.__class__
pass pass
#wrap_around(NMagent.create_sliver, timeAdvice)
#wrap_around(NMagent.delete_sliver, timeAdvice)
# Defaults used by NM4agent to reach a node over ssh.
# Default login name (ssh -l) for the delegate account.
DEF_NM_DELEGATE_ACCT = "utah_elab_delegate"
# Default identity file (ssh -i) for the delegate account.
DEF_NM_DELEGATE_KEY = "/root/.ssh/id_rsa.plabdel"
# ssh command template; the three %s slots are filled with
# (account, key file, node IP) in NM4agent._open.
DEF_NM_SSHCMD = "/usr/bin/ssh -oStrictHostKeyChecking=no -l %s -i %s %s"
class NM4agent:
def __init__(self,IP,nodeid,nmport=DEF_NM_PORT,
del_acct=DEF_NM_DELEGATE_ACCT,
del_key=DEF_NM_DELEGATE_KEY):
#self._sPipeMethod = SshConnection(DEF_NM_SSHCMD % (del_acct,
# del_key,
# IP))
# Instead of ssh xmlrpc transport, we use xmlrpclib load/dumps.
self._isopen = False
self.delacct = del_acct
self.delkey = del_key
self.__vers = [4,0,0]
self.IP = IP
self.nodeid = nodeid
pass
def _open(self):
if not self._isopen:
try:
self.__agentconn = popen2.Popen3(DEF_NM_SSHCMD % (self.delacct,
self.delkey,
self.IP))
self._isopen = True
except:
raise
pass
pass
def _close(self):
if self._isopen:
# Nothing else we can do except wait for the connection to die,
# and that's silly... or kill the pid ourself---but the connection
# will naturally die after the response.
self._isopen = False
pass
def _xcall(self,cmd,args=()):
self._open()
if debug:
print "NM4agent: sending xmlrpc request (%s,%s)" % (cmd,str(args))
print >>self.__agentconn.tochild, xmlrpclib.dumps(args,cmd)
self.__agentconn.tochild.close()
if debug:
print "NM4agent: waiting for response"
retval = xmlrpclib.loads(self.__agentconn.fromchild.read())
if debug:
print "NM4agent: response = '%s'" % str(retval)
self.__agentconn.wait()
if debug:
print "NM4agent: _xcall complete"
self._close()
# XXX: we whack the retval to be compat with NMv3
rret = retval[0][0]
if rret == 1:
rret = 0
pass
retval = [ rret, [''] ]
return retval
def deliver_ticket(self,ticket):
if debug:
print "NM4agent: delivering ticket '%s'" % str(ticket)
return self._xcall('Ticket',(ticket,))
def create_sliver(self,slice_name):
if debug:
print "NM4agent: creating sliver for slice %s" % slice_name
return self._xcall('Create',(slice_name,))
def delete_sliver(self,slice_name):
if debug:
print "NM4agent: destroying sliver for slice %s" % slice_name
return self._xcall('Destroy',(slice_name,))
def start_sliver(self,slice_name):
return self._xcall('Start',(slice_name,))
def stop_sliver(self,slice_name):
return self._xcall('Stop',(slice_name,))
# NM v4 does not have a version method...
def version(self):
return self.__vers
def getAgentClass(self):
return self.__class__
pass
# Agent classes used by NMagent_wrapper: the legacy agent is probed first
# (the probe is skipped when it is None); the default agent is the fallback.
DEF_NM_LEGACY_AGENT = NM3agent
DEF_NM_AGENT = NM4agent
class NMagent_wrapper:
def __init__(self,IP,nodeid):
# first try a tcp connection to the legacy NM port; if timeout,
# try default agent right away.
self.__agent = None
if DEF_NM_LEGACY_AGENT != None:
legacy_nm = False
#sock = socket.socket(socket.AF_INET,socket.SOCK_STREAM)
#sock.settimeout(10.0)
try:
#sock.connect((str(IP),int(DEF_NM_LEGACY_PORT)))
tmp_agent = DEF_NM_LEGACY_AGENT(IP,nodeid)
tmp_agent.version()
legacy_nm = True
if debug:
print "NM %s (%s) IS legacy" % (IP,nodeid)
pass
except:
print "NM on %s (%s) does not appear to be legacy." % (IP,
nodeid)
traceback.print_exc()
pass
if legacy_nm:
self.__agent = DEF_NM_LEGACY_AGENT(IP,nodeid)
pass
else:
self.__agent = DEF_NM_AGENT(IP,nodeid)
pass
pass
else:
self.__agent = DEF_NM_AGENT(IP,nodeid)
pass
pass
def getAgent(self):
return self.__agent
def getAgentClass(self):
return self.__agent.__class__
def create_sliver(self,slicename,ticketdata):
arg = None
if self.__agent.__class__ == NM4agent:
arg = slicename
pass
else:
arg = ticketdata
pass
# if NM4agent, try delivering the ticket first, to maximize our
# chances:
if self.__agent.__class__ == NM4agent:
try:
res = tryXmlrpcCmd(self.__agent.deliver_ticket,ticketdata)
if res[0] == 0:
print "WARNING: while trying to deliver ticket for slice %s: %s" % (slicename,str(res[1]))
else:
if debug:
print "Ticket delivery succeeded for slice %s" % slicename
pass
pass
pass
except:
print "WARNING: exception while delivering ticket for slice %s" % slicename
traceback.print_exc()
pass
pass
else:
if debug:
print "DEBUG: type(agent) = %s %s" % (str(type(self.__agent)),
str(type(NM4agent)))
pass
res = tryXmlrpcCmd(self.__agent.create_sliver,arg)
if self.__agent.__class__ == NM4agent:
#
# XXX - fix later
#
# return the new way -- we don't want to store a ticket
# until rollout is mostly done.
return (res,'')
else:
if debug:
print "res is %s" % str(res)
pass
retval = (res,None)
try:
retval = (res,cPickle.dumps(res[1][0]))
except:
pass
return retval
return None
def delete_sliver(self,slicename,ticketdata):
arg = None
if self.__agent.__class__ == NM4agent:
arg = slicename
pass
else:
arg = ticketdata
pass
return tryXmlrpcCmd(self.__agent.delete_sliver,arg)
def update_sliver(self,slicename,ticketdata):
arg = None
if self.__agent.__class__ == NM4agent:
return tryXmlrpcCmd(self.__agent.deliver_ticket,ticketdata)
else:
return tryXmlrpcCmd(self.__agent.create_sliver,ticketdata)
return None
pass
# #
# The real PLC agent. Wraps up standard arguments to the # The real PLC agent. Wraps up standard arguments to the
# PLC XMLRPC interface. # PLC XMLRPC interface.
# #
class PLCagent: class PLCagent:
def __init__(self, slicename, def __init__(self, slicename,
uri = DEF_PLC_URI, uri = DEF_PLC_URI,
...@@ -340,6 +566,8 @@ class mod_PLC4: ...@@ -340,6 +566,8 @@ class mod_PLC4:
raise raise
try: try:
# XXX - fix for new NM and rollout.
#PLCticket = tryXmlrpcCmd(agent.SliceGetTicket)
PLCticket = tryXmlrpcCmd(agent.SliceGetTicketLegacy) PLCticket = tryXmlrpcCmd(agent.SliceGetTicketLegacy)
if debug: if debug:
print PLCticket print PLCticket
...@@ -424,7 +652,7 @@ class mod_PLC4: ...@@ -424,7 +652,7 @@ class mod_PLC4:
return ret return ret
def getSliceMeta(self, slice): def getSliceMetaLegacy(self, slice):
agent = self.__getAgent(slice.slicename) agent = self.__getAgent(slice.slicename)