Commit 63ca25ad authored by David Johnson's avatar David Johnson

Turn off the backwards-compatibility functionality (for NM v3). Fix up logging
in libplab.  Parallelize plabhttpd and make it take command-line arguments.
Increase parallelization in plabdist.

Also make plabmonitord dist out the rootball and restart thttpd on plab
service slivers that it has just resurrected.
parent 6b8f4d08
......@@ -167,7 +167,8 @@ NODEEXPIREWARN = 2*WEEK+2*DAY # about two weeks advance warning for slivers.
#
from mod_PLC4 import NM4agent
rootball_triggers = { mod_PLC4 : { NM4agent : 'NM4-' } }
#rootball_triggers = { mod_PLC4 : { NM4agent : 'NM4-' } }
rootball_triggers = dict({})
#
# var to track failed renewals
......@@ -2152,10 +2153,14 @@ class Node:
success = True
except:
if rtries == 0:
print "Warning: perform %s on %s failed after %d tries!" \
% (command,self.nodeid,tries)
raise
else:
print "Warning: perform %s on %s failed, try %d of %d" \
% (command,self.nodeid,tries-rtries,tries)
if debug:
print "Warning: perform %s on %s failed, try %d of %d" \
% (command,self.nodeid,tries-rtries,tries)
pass
try:
time.sleep(interval)
except:
......@@ -2165,7 +2170,7 @@ class Node:
pass
return retval
def __copy(self, localfile, remotefile, tries=1, interval=5):
def __copy(self, localfile, remotefile, quiet=False, tries=1, interval=5):
"""
Copies a file from the local system to the remote node, doing so
as the slice user.
......@@ -2185,11 +2190,15 @@ class Node:
success = True
except:
if rtries == 0:
print "Warning: copy %s to %s on %s failed after %d tries!" \
% (localfile,remotefile,self.nodeid,tries)
raise
else:
print "Warning: copy %s to %s on %s failed, try %d of %d" \
% (localfile,remotefile,self.nodeid,
tries-rtries,tries)
if debug:
print "Warning: copy %s to %s on %s failed, try %d of %d" \
% (localfile,remotefile,self.nodeid,
tries-rtries,tries)
pass
try:
time.sleep(interval)
except:
......
......@@ -131,9 +131,11 @@ class NM3agent:
DEF_NM_DELEGATE_ACCT = "utah_nmcontrol"
DEF_NM_DELEGATE_KEY = "/root/.ssh/id_rsa"
TRY_SECONDARY_DEL_ACCT = True
# A hook for connecting to old plab NMs during an upgrade.
TRY_SECONDARY_DEL_ACCT = False
SECONDARY_NM_DELEGATE_ACCT = "utah_elab_delegate"
SECONDARY_NM_DELEGATE_KEY = "/root/.ssh/id_rsa.plabdel"
DEF_NM_SSHCMD = "/usr/bin/ssh -q -o StrictHostKeyChecking=no" \
" -o PasswordAuthentication=no -o NumberOfPasswordPrompts=0" \
" -l %s -i %s %s"
......@@ -346,7 +348,7 @@ class NM4agent:
return self.__class__
pass
DEF_NM_LEGACY_AGENT = NM3agent
DEF_NM_LEGACY_AGENT = None
DEF_NM_AGENT = NM4agent
class NMagent_wrapper:
......
......@@ -34,7 +34,7 @@ my $REMOTE_SYNC_DIR = "netbed_files";
# Maximum number of children to run at once - we'll keep this low, since it
# consumes bandwidth on boss
#
my $max_children = 4;
my $max_children = 10;
#
# Die unless the rootball exists - this serves two purposes:
......
#!/usr/bin/perl -w
#
# EMULAB-COPYRIGHT
# Copyright (c) 2003 University of Utah and the Flux Group.
# Copyright (c) 2003-2007 University of Utah and the Flux Group.
# All rights reserved.
#
......@@ -10,9 +10,6 @@
# restarts the server gracefully, so that any downloads already going on will
# get to complete
#
# NOTE: This script currently requires YOUR keys, because it logs in as utah1
# on the planetlab nodes
#
use lib '@prefix@/lib';
use libdb;
......@@ -27,24 +24,79 @@ my $PLAB_USER = "utah_svc_slice";
#
my $THTTPD_START = "netbed_files/sbin/thttpd.restart";
#
# max parallelization
#
my $MAX_CHILDREN = 10;
#
# SSH command
#
my $SSH = "ssh -q -oBatchMode=yes -oStrictHostKeyChecking=no -l $PLAB_USER";
my @nodes = ();
if (@ARGV) {
@nodes = @ARGV;
}
if (!scalar(@nodes)) {
#
# Get a list of planetlab nodes that are up
#
my $query_result =
DBQueryFatal("SELECT n.node_id FROM nodes as n " .
"LEFT JOIN node_status AS s ON n.node_id=s.node_id " .
"LEFT JOIN reserved AS r ON n.node_id = r.node_id " .
"WHERE n.type=\"pcplabphys\" AND s.status=\"up\" AND " .
"!(r.pid=\"" . NODEDEAD_PID .
"\" AND r.eid=\"" . NODEDEAD_EID . "\") " .
"order by n.node_id");
while (my ($node) = $query_result->fetchrow()) {
push @nodes, $node;
}
}
#
# Get a list of planetlab nodes that are up
# Run up to $MAX_CHILDREN ssh commands at a time
#
my $query_result =
DBQueryFatal("SELECT n.node_id FROM nodes as n " .
"LEFT JOIN node_status AS s ON n.node_id=s.node_id " .
"LEFT JOIN reserved AS r ON n.node_id = r.node_id " .
"WHERE n.type=\"pcplabphys\" AND s.status=\"up\" AND " .
"!(r.pid=\"" . NODEDEAD_PID .
"\" AND r.eid=\"" . NODEDEAD_EID . "\") " .
"order by n.node_id");
#
# Run "$SSH <node> $THTTPD_START" once for every node in @nodes, keeping at
# most $MAX_CHILDREN child processes alive at any moment.
#
#   $current_children - number of children forked but not yet reaped
#   %children         - maps child pid => node it was started for
#   @failed           - nodes whose child returned a non-zero wait status,
#                       or for which no child could be forked at all
#
my $current_children = 0;
my @failed = ();
my %children = ();
while (@nodes || $current_children) {
    if (($current_children < $MAX_CHILDREN) && @nodes) {
        #
        # If we have room for another child, start one up
        #
        my $node = pop @nodes;
        my $pid = fork();
        if (!defined($pid)) {
            #
            # fork() failed, so we are still the parent.  We must not fall
            # into the exec below (that would replace this whole script);
            # record the node as failed and carry on with the rest.
            #
            warn "*** Could not fork for $node: $!\n";
            push @failed, $node;
        } elsif ($pid) {
            # Parent: remember which node this child is handling.
            $current_children++;
            $children{$pid} = $node;
        } else {
            # Child: become the ssh command.
            print "Starting up webserver on $node\n\n";
            #
            # exec only returns on failure.  Die here so the child cannot
            # continue running the parent's loop and fork children of its
            # own; the non-zero exit makes the parent record the failure.
            #
            exec "$SSH $node $THTTPD_START" or
                die "*** Could not exec ssh for $node: $!\n";
        }
    } else {
        #
        # Wait for a child to die, and see if it failed
        #
        my $childpid = wait();
        if ($childpid < 0) {
            die "Bad return value from wait(): $childpid\n";
        }
        # Ignore pids we did not start ourselves.
        if (exists($children{$childpid})) {
            my $node = delete $children{$childpid};
            $current_children--;
            if ($?) {
                push @failed, $node;
            }
        }
    }
}
while (my ($node) = $query_result->fetchrow()) {
print "Starting up webserver on $node ...\n";
system "$SSH $node $THTTPD_START";
if (@failed) {
print "Some nodes failed: \n";
print map {"$_\n"} @failed;
}
exit scalar @failed;
......@@ -37,6 +37,8 @@ my $BIGINT = 9999999999;
my $CLIENT_BIN = "@CLIENT_BINDIR@";
my $SSH = "@prefix@/bin/sshtb -n";
my $PLABNODE = "@prefix@/sbin/plabnode";
my $PLABDIST = "@prefix@/sbin/plabdist";
my $PLABHTTPD = "@prefix@/sbin/plabhttpd";
# XXX - testing
#my $PLABNODE = "/home/kwebb/bin/randsleep.pl";
......@@ -130,8 +132,33 @@ sub checknextnode($) {
exit($BADINST);
}
exec "$SSH -host $vnode $CLIENT_BIN/vnodesetup -p $vnode" or
die "Yike! Can't exec command!\n";
# Do this before rootball/httpd to avoid wasted bandwidth/time.
# However, wastes resources on boss by not exec'ing this; we have a
# whole extra process hanging around just so we can do the rootball
# and httpd stuff after. XXX!
my $vres = TBForkCmd("$SSH -host $vnode $CLIENT_BIN/vnodesetup" .
" -p $vnode");
if ($vres) {
print "*** Vnodesetup failed: $vnode\n";
exit($vres);
}
# Try to dist out the rootball and other files, but don't make
# setup conditional on success/failure.
if (TBForkCmd("$PLABDIST $vnode")) {
print "*** Could not dist out rootball on $vnode, ignoring.\n";
}
# Try to start the thttpd server in the service sliver, but don't make
# setup conditional on success/failure
if (TBForkCmd("$PLABHTTPD $vnode")) {
print "*** Could not start thttpd on $vnode, ignoring.\n";
}
exit($vres);
#exec "$SSH -host $vnode $CLIENT_BIN/vnodesetup -p $vnode" or
# die "Yike! Can't exec command!\n";
}
# NOTREACHED
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment