Commit 1b6ef602 authored by David Johnson

These are the rest of the changes that have been accumulating in my dev
tree for v4 PlanetLab node support.  Currently, we support both v3 and
v4 NMs via a little wrapper, and we dist out different versions of the
rootball depending on the NM version.  Also updated various parts of
libplab to log success and failure from interactions with PlanetLab
nodes to the db; there are the beginnings of support for that in
plabmonitord.in.
parent c7999da3
@@ -143,6 +143,31 @@ DEF_SLICE_DESC = "Slice created by Emulab"
PLABEXPIREWARN  = 1*WEEK        # one week advance warning for slice expiration.
NODEEXPIREWARN  = 2*WEEK+2*DAY  # about two weeks advance warning for slivers.
#
# This is a trigger table for dist'ing out multiple rootballs depending on
# which NM is running on the node.  Since we don't know which version is
# running on a node until we call into (DEF|COMPAT)_AGENT and actually connect
# to the node, the table is keyed on agent class names.
#
# The table specifies a prefix to the default rootball name; this rootball
# should be placed in the normal location.
#
# If the plcagent class and nodeagent class are both not in the trigger table,
# we simply push the default rootball.
#
# Note that we could store version info in the database, but that doesn't
# really have long-term value.  Plus, we would have to keep checking sites.xml
# to catch version changes during rollout.  This way, plabmonitord and the
# web interface know exactly which version is running whenever a node is
# contacted for setup.
#
# For now, only NM4agent has a custom tarball, since v4 nodes are in the
# distinct minority right now.
#
import mod_PLC4
from mod_PLC4 import NM4agent
rootball_triggers = { mod_PLC4 : { NM4agent : 'NM4-' } }
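#
# Illustrative sketch (not part of this change): the lookup that
# Node.emulabify() performs against this table amounts to something like
# the hypothetical helper below, where 'plcagent_key' and 'nmagent_key'
# stand in for the plcagent/nodeagent class keys used at runtime.
#
#   def resolve_rootball(plcagent_key, nmagent_key, rootballname):
#       prefix = rootball_triggers.get(plcagent_key, {}).get(nmagent_key, '')
#       return prefix + rootballname
#
#   # e.g. resolve_rootball(mod_PLC4, NM4agent, "plabroot.tar.gz")
#   #      would yield "NM4-plabroot.tar.gz"  (rootball name hypothetical)
#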
#
# var to track failed renewals
#
@@ -1393,7 +1418,6 @@ class Slice:
wrap_around(Slice._create, timeAdvice)
wrap_around(Slice.destroy, timeAdvice)
#
# Node abstraction
#
@@ -1401,12 +1425,33 @@ class Node:
    def __init__(self, slice, nodeid, pollNode = False):
        self.slice = slice
        self.nodeid = nodeid
        #self.IP = self.__findIP()
        #self.hostname = self.__findHostname()
        #self.phys_nodeid = self.__findPhysNodeID()
        (self.IP,self.hostname,self.phys_nodeid) = self.__findHostInfo()
        self.leaseend = 0
        self.nodemeta = None
        self.pollNode = pollNode
        # must be set in mod_<PLCAGENT>.createNode if you want to use
        # multiple rootball and triggering support
        self.nmagent = None
        return
    def __logNodeHist(self,component,operation,status,msg):
        try:
            DBQueryFatal("insert into plab_nodehist values "
                         "(NULL,%s,%s,%s,%s,%s,%s,%s)",
                         (self.nodeid,self.phys_nodeid,
                          time.strftime("%Y-%m-%d %H:%M:%S",
                                        time.localtime(time.time())),
                          component,operation,status,str(msg)))
        except:
            # don't let a logging failure propagate; just warn
            print "WARNING: could not log (%s,%s,%s,%s) into plab_nodehist!" % \
                  (component,operation,status,msg)
            pass
        pass
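    # Illustrative summary (not code from this change): the 'create', 'free',
    # 'renew', and 'emulabify' operations below all wrap their agent call in
    # the same logging pattern:
    #
    #   try:
    #       <agent call>
    #       self.__logNodeHist('node', <op>, 'success', '')
    #   except:
    #       self.__logNodeHist('node', <op>, 'failure',
    #                          traceback.format_exception(*sys.exc_info()))
    #       raise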
    # XXX: may want to rethink signal handling here.
    def _create(self, force=False):
        """
@@ -1430,8 +1475,15 @@ class Node:
            pass
        print "Creating Plab node %s on %s." % (self.nodeid, self.IP)
        res = None
        try:
            res, self.nodemeta, self.leaseend = \
                 self.slice.plab.agent.createNode(self)
            self.__logNodeHist('node','create','success','')
        except:
            self.__logNodeHist('node','create','failure',
                               traceback.format_exception(*sys.exc_info()))
            raise
        DBQueryFatal("replace into plab_slice_nodes"
                     " (exptidx, pid, eid, slicename, node_id,"
@@ -1521,7 +1573,14 @@ class Node:
                     "\n\n%s" % (self.nodeid, tbstr), TBOPS)
            pass
        deleted = 0
        try:
            deleted = self.slice.plab.agent.freeNode(self)
            self.__logNodeHist('node','free','success','')
        except:
            self.__logNodeHist('node','free','failure',
                               traceback.format_exception(*sys.exc_info()))
            raise
        TIMESTAMP("freenode %s finished." % self.nodeid)
        return not deleted
@@ -1537,8 +1596,16 @@ class Node:
        return res[0] | res[1]
    def _renew(self):
        res = None
        try:
            res, self.nodemeta, self.leaseend = \
                 self.slice.plab.agent.renewNode(self)
            self.__logNodeHist('node','renew','success','')
        except:
            self.__logNodeHist('node','renew','failure',
                               traceback.format_exception(*sys.exc_info()))
            raise
        DBQueryFatal("update plab_slice_nodes"
                     " set nodemeta = %s, leaseend = %s"
                     " where node_id = %s",
@@ -1556,6 +1623,41 @@ class Node:
        Emulab/Plab node. Primarily, this unpacks the magic files on to
        the node.
        """
        # check to see if we should use a custom rootball
        rrootballname = rootballname
        try:
            if (not self.nmagent == None) \
               and (not self.slice.plab.agent == None):
                tpc = self.slice.plab.agent.__class__
                tnc = self.nmagent.getAgentClass()
                if rootball_triggers.has_key(tpc) \
                   and rootball_triggers[tpc].has_key(tnc):
                    # found a valid prefix for the rootball; update the name
                    rrootballname = rootball_triggers[tpc][tnc] + rootballname
                    if debug:
                        print "Updated rootball name from %s to %s" % (rootballname,
                                                                       rrootballname)
                        pass
                    pass
                else:
                    if debug:
                        print "rbtriggers: %s; looking for %s/%s" % (str(rootball_triggers),
                                                                     str(tpc),
                                                                     str(tnc))
                        pass
                    pass
                pass
            else:
                if debug:
                    print "One of the agents was null!"
                    pass
                pass
            pass
        except:
            print "WARNING: exception while trying to update rootball name"
            traceback.print_exc()
            pass
TIMESTAMP("emulabify started on %s." % self.nodeid) TIMESTAMP("emulabify started on %s." % self.nodeid)
print "Overlaying Emulab files on %s ..." % self.nodeid print "Overlaying Emulab files on %s ..." % self.nodeid
try: try:
...@@ -1573,7 +1675,13 @@ class Node: ...@@ -1573,7 +1675,13 @@ class Node:
print "Adding slice user to 'root' group on %s failed; " \ print "Adding slice user to 'root' group on %s failed; " \
"attempting to carry on anyway." % self.nodeid "attempting to carry on anyway." % self.nodeid
pass pass
self.unpackRootball(rootballpath, rootballname) try:
self.unpackRootball(rootballpath, rrootballname)
self.__logNodeHist('node','emulabify','success','')
except:
self.__logNodeHist('node','emulabify','failure',
traceback.format_exception(*sys.exc_info()))
raise
TIMESTAMP("emulabify finished on %s." % self.nodeid) TIMESTAMP("emulabify finished on %s." % self.nodeid)
def addToGroup(self, user, group): def addToGroup(self, user, group):
...@@ -1711,6 +1819,59 @@ class Node: ...@@ -1711,6 +1819,59 @@ class Node:
if debug: if debug:
print "IP is %s for node %s" % (IP, self.nodeid) print "IP is %s for node %s" % (IP, self.nodeid)
return IP return IP
    def __findHostname(self):
        """
        Grabs the publicly-routable hostname of the remote node.
        """
        res = DBQueryFatal("select pm.hostname,i.IP from nodes as nv"
                           " left join interfaces as i"
                           " on nv.phys_nodeid=i.node_id"
                           " left join plab_mapping as pm"
                           " on i.IP=pm.IP"
                           " where nv.node_id='%s'" % (self.nodeid))
        if (not res or len(res) == 0):
            print "Warning: no hostname found for nodeid %s" % self.nodeid
            (hostname,IP) = (None,None)
            pass
        else:
            ((hostname,IP),) = res
            pass
        if debug:
            print "hostname is %s for node %s" % (hostname,IP)
            pass
        return hostname
    #
    # Returns (IP,hostname,phys_nodeid).
    #
    def __findHostInfo(self):
        """
        Grabs the publicly-routable IP and hostname of the remote node,
        and also our phys_nodeid for it.
        """
        res = DBQueryFatal("select i.IP,pm.hostname,nv.phys_nodeid "
                           " from nodes as nv"
                           " left join interfaces as i"
                           " on nv.phys_nodeid=i.node_id"
                           " left join plab_mapping as pm"
                           " on i.IP=pm.IP"
                           " where nv.node_id='%s'" % (self.nodeid))
        if (not res or len(res) == 0):
            print "Warning: no hostinfo found for nodeid %s" % self.nodeid
            (IP,hostname,phys_nodeid) = (None,None,None)
            pass
        else:
            ((IP,hostname,phys_nodeid),) = res
            pass
        if debug:
            print "hostname is %s for node %s" % (hostname,IP)
            pass
        return (IP,hostname,phys_nodeid)
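    # Illustrative only: with the schema implied by the query above, a
    # successful lookup returns a single joined row, e.g. (values
    # hypothetical):
    #
    #   res = (('155.98.32.70', 'plab1.example.org', 'plab10'),)
    #
    # which __findHostInfo() unpacks into its (IP,hostname,phys_nodeid)
    # return tuple.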
    pass # end of class Node

# AOP wrappers for class Node
wrap_around(Node._create, timeAdvice)
...
@@ -17,11 +17,16 @@ use POSIX ":sys_wait_h";
#
sub usage()
{
    print STDERR "Usage: plabmonitor [-dS]\n";
    print STDERR "  -d  Debug mode.\n";
    print STDERR "  -S  Run WITHOUT reading monitor state from database;\n";
    print STDERR "      new state will still be written (default is to" .
        " load state).\n";
    exit(-1);
}
my $optlist = "dS";
my $debug = 0;
my $stateful = 1;
#
# Only real root can call this.
@@ -80,12 +85,16 @@ if (@ARGV) {
if (defined($options{"d"})) {
    $debug = 1;
}
if (defined($options{'S'})) {
    $stateful = 0;
}
#
# Function prototypes
#
sub updatenodepool($);
sub fatal($);
#sub loadstate($$);
#
# Global vars
@@ -101,6 +110,11 @@ my $NEVER = 0; # "Never" in seconds since the Epoch.
my $MAXLA = 20;  # Don't let the system load get out of hand.
# Keep only this many (success,fail) sequences in memory at a
# time (they get saved in the db anyway... we just need to minimize cost
# across fork()s)
my $MAX_STATE_SEQUENCES = 10;
#
# daemonize
#
@@ -242,6 +256,94 @@ while (1) {
    sleep($CHILLTIME);
}
#
# Load monitor state from the plab_nodehist table for the given pnode.
#
sub loadstate($$) {
    my $pool = shift;
    my $pnode = shift;

    if (!defined($pool) || !defined($pnode)) {
        return;
    }

    my $poolpid = $pool->{'PID'};
    my $pooleid = $pool->{'EID'};
    my $poolpnodes = $pool->{'PNODES'};

    my $res =
        DBQueryFatal("select unix_timestamp(timestamp),component,operation," .
                     "status " .
                     " from plab_nodehist where phys_node_id='$pnode' " .
                     " order by timestamp asc");

    # calculate sequence lengths; keep only the most recent few
    my @sequences = ();
    my ($lastseqstat,$lastts) = ('',0);
    if ($res && $res->num_rows()) {
        while (my @row = $res->fetchrow_array()) {
            my ($ts,$com,$op,$status) = @row;

            # filter out entries older than two weeks
            # NOTE: after we create the sequence for this node, we will
            # check the last (most recent) timestamp; if it's older than
            # three days, we reject all sequence state for this node
            # and start clean.  The reason for this is pretty obvious:
            # if a node was good for two weeks, and then we didn't check
            # for two weeks (i.e., because plabmonitord wasn't running),
            # we might use the old state to choose new check intervals,
            # and that could be bad.

            if ($lastts == 0) {
                # first row; start the initial sequence
                # @entry = (up/down,seqtime,conseccount)
                my @entry = ((($status eq 'success') ? 'up' : 'down'),0,1);
                unshift @sequences, \@entry;
                $lastseqstat = $status;
                $lastts = $ts;
                next;
            }

            # add this row's elapsed time onto the current (newest) sequence
            $sequences[0][1] += ($ts - $lastts);
            if ($lastseqstat eq $status) {
                ++$sequences[0][2];
            }
            else {
                # status flipped; start a new sequence at the front
                my @entry = ();
                if ($status eq 'success') {
                    $entry[0] = 'up';
                }
                else {
                    $entry[0] = 'down';
                }
                $entry[1] = 0;
                $entry[2] = 1;
                # save off this sequence
                unshift @sequences, \@entry;
            }
            $lastseqstat = $status;
            $lastts = $ts;
        }
    }

    # toss the oldest sequences beyond the cap
    if (scalar(@sequences) > $MAX_STATE_SEQUENCES) {
        splice(@sequences, $MAX_STATE_SEQUENCES);
    }

    $poolpnodes->{$pnode}->{'history'} = \@sequences;
    Log(STATUSLOG, "plabmonitord, $pnode, loadstate, ".
        (scalar(@{$poolpnodes->{$pnode}->{'history'}})).
        " sequences added.");
    return;
}
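#
# Illustrative only (not code from this change): after loadstate(), a
# pnode's 'history' is a list of sequence records, newest first, each of
# the form [ up|down, seqtime, conseccount ].  For example (values
# hypothetical):
#
#   $poolpnodes->{$pnode}->{'history'} =
#       [ [ 'up',   86400, 12 ],   # newest: up ~1 day, 12 straight successes
#         [ 'down',  7200,  2 ] ]; # older: down ~2 hours, 2 straight failures
#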
#
# Go through the PID/EID associated with the pool and grab any new nodes
# that have appeared.
#
...
@@ -62,6 +62,10 @@ def main(args):
    if command == "alloc":
        try:
            node = slice.createNode(nodeid, force=parser.values.force)
            # With the v4 NM, we have to sleep a couple seconds to give
            # the slice a chance to get keys/acct stuff straightened out
            # so we can actually slogin and copy fixsudo.sh to the node.
            time.sleep(2)
            node.emulabify()
            # Note that vnode_setup boots the node
            pass
...