Commit 87eed168 authored by Mike Hibler's avatar Mike Hibler
Browse files

Performance improvements to the vnode startup path.

The biggest improvement happened on day one when I took out the 20 second sleep
between vnode starts in bootvnodes. That appears to have been an artifact of
an older time and an older Xen. Or, someone smarter than me saw the potential
of getting bogged down for, oh say three weeks, trying to micro-optimize the
process and instead just went for the conservative fix!

Following day one, the ensuing couple of weeks was a long strange trip to
find the maximum number of simultaneous vnode creations that could be done
without failure. In that time I tried a lot of things, generated a lot of
graphs, produced and tweaked a lot of new constants, and in the end, wound
up with the same two magic numbers (3 and 5) that were in the original code!
To distinguish myself, I added a third magic number (1, the loneliest of
them all).

All I can say is that now, the choice of 3 or 5 (or 1), is based on more
solid evidence than before. Previously it was 5 if you had a thin-provisioning
LVM, 3 otherwise. Now it is based more directly on host resources, as
described in a long comment in the code, the important part of which is:

 #
 # if (dom0 physical RAM < 1GB) MAX = 1;
 # if (any swap activity) MAX = 1;
 #
 #    This captures pc3000s/other old machines and overloaded (RAM) machines.
 #
 # if (# physical CPUs <= 2) MAX = 3;
 # if (# physical spindles == 1) MAX = 3;
 # if (dom0 physical RAM <= 2GB) MAX = 3;
 #
 #    This captures d710s, Apt r320, and Cloudlab m510s. We may need to
 #    reconsider the latter since its single drive is an NVMe device.
 #    But first we have to get Xen working with them (UEFI issues)...
 #
 # else MAX = 5;

In my defense, I did fix some bugs and stuff too (and did I mention
the cool graphs?) See comments in the code and gitlab emulab/emulab-devel
issue #148.
parent ee854767
#!/usr/bin/perl -w
#
# Copyright (c) 2000-2015 University of Utah and the Flux Group.
# Copyright (c) 2000-2016 University of Utah and the Flux Group.
#
# {{{EMULAB-LICENSE
#
......@@ -35,10 +35,10 @@ use POSIX qw(strftime);
#
sub usage()
{
# Emit the command-line synopsis and terminate the script with exit
# status 1. This never returns to the caller.
print "Usage: bootvnodes [-d] [-f] [-k | -h | -r | -b | -c]\n";
print STDERR "Usage: bootvnodes [-w sec] [-d] [-f] [-k|-h|-r|-b|-c]\n";
exit(1);
}
my $optlist = "kdfhrcb";
my $optlist = "kdfhrcbw:";
#
# Turn off line buffering on output
......@@ -62,6 +62,7 @@ my $vndir = "$VARDIR/jails";
my $debug = 0;
my $daemon = 1;
my $reconfig= 0;
my $waittime = 0;
my $action;
# Prototypes
......@@ -105,6 +106,13 @@ if (defined($options{"b"})) {
if (defined($options{"c"})) {
$reconfig = 1;
}
if (defined($options{"w"})) {
if ($options{"w"} !~ /^(\d+)$/) {
print STDERR "Invalid wait time for -w\n";
usage();
}
$waittime = $1;
}
if (@ARGV) {
usage();
}
......@@ -153,6 +161,20 @@ if (GENVNODE() &&
exit($? >> 8);
}
#
# Sort by vnode number if it follows our convention. We need this because
# we don't use a fixed field width for the vnode number.
# XXX I am sure there are more perl-ish ways to do this.
#
sub byvnode {
    #
    # Sort comparator, for use as "sort byvnode @names"; reads the sort
    # variables $a and $b and returns -1, 0 or 1.
    #
    # Names of the form <prefix>-<num> that share the same prefix are
    # ordered numerically by <num>. Anything else is ordered by plain
    # string comparison.
    #
    # The captures are taken in list context into lexicals and tested with
    # defined() rather than for truth: a vnode number of "0" (or a prefix
    # of "" or "0") is a perfectly good match but is false in boolean
    # context, and must not drop us into the string comparison.
    #
    my ($apre, $anum) = ($a =~ /^(.*)-(\d+)$/);
    my ($bpre, $bnum) = ($b =~ /^(.*)-(\d+)$/);

    if (defined($anum) && defined($bnum) && $apre eq $bpre) {
	return $anum <=> $bnum;
    }
    return $a cmp $b;
}
#
# This applies to whatever vnodes are running. Do it and exit.
#
......@@ -162,11 +184,23 @@ if (defined($action) && !$reconfig) {
my @files = readdir(DIR);
closedir(DIR);
#
# XXX make a feeble effort to weed out random files.
#
# We used to do this by making sure the name was of the form
# pcvm<foo>-<num>, but not all clusters use the "pc" naming scheme.
# So now, the best we can do is ensure that the name is like
# <foo>vm<bar>-<num> and is a directory.
#
my @vnfiles = ();
foreach my $file (@files) {
if ($file =~ /^((?:pc|homenet)vm[-\w]*)$/) {
bootvnode($1, $action, (-e "$vndir/$file/fakejail" ? 0 : 1));
if ($file =~ /^([-\w]+vm[-\w]*-\d+)$/ && -d "$vndir/$file") {
push(@vnfiles, $1);
}
}
foreach my $file (sort byvnode @vnfiles) {
bootvnode($file, $action, (-e "$vndir/$file/fakejail" ? 0 : 1));
}
exit(0);
}
......@@ -190,22 +224,8 @@ foreach my $str (@tmccresults) {
if ($2 eq "0");
}
else {
warn("*** WARNING: Skipping bad subnodeid: '$str'\n");
}
}
#
# Sort by vnode number if it follows our convention. We need this because
# we don't use a fixed field width for the vnode number.
# XXX I am sure there are more perl-ish ways to do this.
#
sub byvnode {
if ($a =~ /^(.*)-(\d+)$/ && ($apre = $1) && ($anum = $2) &&
$b =~ /^(.*)-(\d+)$/ && ($bpre = $1) && ($bnum = $2) &&
$apre eq $bpre) {
return $anum <=> $bnum;
warn("*** WARNING: Skipping bad VNODEID: '$str'\n");
}
return $a cmp $b;
}
#
......@@ -218,14 +238,19 @@ if ($reconfig) {
my @files = readdir(DIR);
closedir(DIR);
# XXX make a feeble effort to weed out random files.
my @vnfiles = ();
foreach my $file (@files) {
if ($file =~ /^((?:pc|homenet)vm[-\w]*)$/) {
if (-e "$vndir/$file/fakejail") {
$fakejails++;
$curvnodelist{$1} = 0;
} else {
$curvnodelist{$1} = 1;
}
if ($file =~ /^([-\w]+vm[-\w]*-\d+)$/ && -d "$vndir/$file") {
push(@vnfiles, $1);
}
}
foreach my $file (sort byvnode @vnfiles) {
if (-e "$vndir/$file/fakejail") {
$fakejails++;
$curvnodelist{$file} = 0;
} else {
$curvnodelist{$file} = 1;
}
}
......@@ -360,7 +385,7 @@ sub bootvnode($$$)
my ($vnode, $action, $jailed) = @_;
my $opt;
my $act;
my $extrawait;
my $extrawait = $waittime;
if ($action eq "halt") {
$opt = "-h";
......@@ -369,8 +394,8 @@ sub bootvnode($$$)
elsif ($action eq "reboot") {
$opt = "-r";
$act = "Rebooting";
$extrawait = 20
if (GENVNODETYPE() eq "xen");
## XXX should no longer be needed
#$extrawait = 20 if (GENVNODETYPE() eq "xen");
}
elsif ($action eq "kill") {
$opt = "-k";
......@@ -379,8 +404,8 @@ sub bootvnode($$$)
else {
$opt = "-b";
$act = "Booting";
$extrawait = 20
if (GENVNODETYPE() eq "xen");
## XXX should no longer be needed
#$extrawait = 20 if (GENVNODETYPE() eq "xen");
}
$opt .= ($jailed ? " -jVt" : " -i");
......@@ -390,6 +415,6 @@ sub bootvnode($$$)
return($?)
if ($?);
sleep($extrawait)
if (defined($extrawait));
if ($extrawait > 0);
return 0;
}
......@@ -628,7 +628,7 @@ if (! -e "$VNDIR/vnode.info") {
($ret,$err) = safeLibOp('vnodeCreate',0,0);
if ($err) {
MyFatal("vnodeCreate failed");
MyFatal("vnodeCreate failed: $err");
}
$vmid = $ret;
......
......@@ -356,6 +356,7 @@ sub handler ($) {
my ($signame) = @_;
print STDERR "vnodesetup ($PID) caught a SIG${signame}!\n";
TBDebugTimeStampWithDate("vnodesetup shutting down ...");
$SIG{USR1} = 'IGNORE';
$SIG{USR2} = 'IGNORE';
......
......@@ -795,9 +795,14 @@ sub restartDHCP()
if (mysystem2("/sbin/initctl restart $dhcpd_service") != 0) {
mysystem2("/sbin/initctl start $dhcpd_service");
}
} else {
#sysvinit
} elsif (-x '/bin/systemctl') {
# systemd
mysystem2("/bin/systemctl restart $dhcpd_service.service");
} elsif (-x "/etc/init.d/$dhcpd_service") {
# sysvinit
# NB: the path in the -x test must be double-quoted. In single quotes
# $dhcpd_service is not interpolated, so the test looks for a file
# literally named '$dhcpd_service', never succeeds, and this branch
# can never run.
mysystem2("/etc/init.d/$dhcpd_service restart");
} else {
print STDERR "restartDHCP: could not restart dhcpd!\n";
}
}
......
......@@ -27,6 +27,7 @@ use English;
use Data::Dumper;
use POSIX qw(setsid);
use POSIX ":sys_wait_h";
use POSIX ":signal_h";
use Socket;
#
......@@ -302,6 +303,9 @@ sub Online()
else {
POSIX::setsid();
# XXX make sure we can kill the proxy when done
local $SIG{TERM} = 'DEFAULT';
exec("$BINDIR/tmcc.bin -d -t 15 -n $vnode_id ".
" -X $host_ip:$local_tmcd_port -s $boss_ip -p $TMCD_PORT ".
" -o $LOGDIR/tmccproxy.$vnode_id.log");
......
......@@ -128,6 +128,16 @@ my $IMAGEDUMP = "/usr/local/bin/imagedump";
my $XM = "/usr/sbin/xm";
my $debug = 0;
my $lockdebug = 0;
my $sleepdebug = 0;
#
# Set to enable vnodesetup to exit before vnode is completely up
# (see vnodesetup::hackwaitandexit). Allows more parallelism during
# boot-time vnode setup. Note that concurrency may still be constrained
# by $MAXCONCURRENT (defined below) which limits how many new VMs can
# be created at once.
#
my $vsrelease = "immediate"; # or "early" or "none"
#
# Some commands/subsystems have evolved in incompatible ways over time,
......@@ -136,6 +146,18 @@ my $lockdebug = 0;
my $newsfdisk = 0;
my $newlvm = 0;
#
# Image wait time.
#
# How long (seconds) we will wait when trying to grab a lock on
# an image. Should be set to the max time you think it could take frisbee
# to download the largest (compressed) OS image you will support in a VM.
# Also consider that there could be multiple frisbees running at once for
# multiple images (currently limited by the vnode create lock concurrency,
# $MAXCONCURRENT, below).
#
my $MAXIMAGEWAIT = 1800;
#
# Serial console handling. We fire up a capture per active vnode.
# We use a fine assortment of capture options:
......@@ -308,10 +330,6 @@ my $VIFROUTING = ((-e "$ETCDIR/xenvifrouting") ? 1 : 0);
my $TMCD_PORT = 7777;
# Number of concurrent containers set up in parallel. We bump this up
# a bit down in doingThinLVM().
my $MAXCONCURRENT = 3;
#
# Information about the running Xen hypervisor
#
......@@ -369,6 +387,118 @@ sub getXenInfo()
close XM;
}
#
# Things that matter:
#
# - RAM in dom0.
# Swapping is deadly. Looks like 1024MB is NOT enough based on experience
# noted below. 4096MB is plenty and seems to override most of the other
# concerns.
#
# - Number of CPUs.
# Have not seen any appreciable difference with 32 CPUs vs. 4. Other
# things cause problems well before this.
#
# - The number of disks in the VG.
# LVM performance is generally unpredictable. More than one disk is
# good, but haven't seen much improvement with, e.g., 6 instead of 2.
# The killer is concurrent frisbees (write to LVM) and even more so,
# imageunzips (read from and write to LVM).
#
# - The BW from the frisbee server.
# Possibly an issue if nothing else stands in the way, due to subboss
# disk speed that tops out at about 150MB/sec. Given random I/O and
# multiple images, probably going to get less than 50MB/sec.
#
# Random proposal based on tests run on Emulab d710/d820/d430 nodes and
# Apt c6220 nodes:
#
# * Change the arbitrary 164MB write buf memory to an equally arbitrary,
# but more aesthetically pleasing, 128MB (where the hell did 164 come from?)
#
# * Adjust concurrency based on:
#
# if (dom0 physical RAM < 1GB) MAX = 1;
# if (any swap activity) MAX = 1;
#
# This captures pc3000s/other old machines and overloaded (RAM) machines.
#
# if (# physical CPUs <= 2) MAX = 3;
# if (# physical spindles == 1) MAX = 3;
# if (dom0 physical RAM <= 2GB) MAX = 3;
#
# This captures d710s, Apt r320, and Cloudlab m510s. We may need to
# reconsider the latter since its single drive is an NVMe device.
# But first we have to get Xen working with them (UEFI issues)...
#
# MAX = 5;
#
# This captures Emulab d430/d820s, Apt c6220s, and probably all
# Clemson and Wisconsin Cloudlab nodes.
#
# Random observations based on waaay too much time spent on d710s:
#
# Observation: d710 with 5 vnodes and all different images does not
# boot first time with MAXCONCURRENT==5. 4 vnodes appear to be downloading
# their disk image when the BSD domU tries to boot--qemu times out.
# Restarting the vnode later works fine. The reason for this is that dom0
# starts swapping due to imageunzip processes running. 164MB of write
# buffering per imageunzip is too much for 3 imageunzips and 1GB of dom0 mem.
# Even dropping to 128MB of write buffering is not enough. Single-threading
# imageunzip (-n -W 1) works fine, but things go really slow. We need a
# buffering based on the available dom0 RAM and the max number of concurrent
# imageunzips (MAXCONCURRENT) independent of how the latter is calculated.
#
# Observation: qemu processes blow up huge when they first start (500MB) but
# don't require that much afterward (20MB). So most of our d710 problems
# stem from qemu blasting off while imageunzips are running. This happens
# because qemu is outside of the MAXCONCURRENT lock.
#
# Empirically, based on a d710 with 1GB of dom0 RAM, we can pull off 3
# imageunzips + 1 qemu with just a tad of swapping--not enough to cause
# the qemu to timeout. Full imageunzips seem to be about 32MB + writebuf
# memory. No swapping until the qemu starts. The number of qemus we launch
# will be implicitly constrained by this limit as qemu startups (vnodeBoot)
# take less time than vnodeCreate so we should not have more than MAXCONCURRENT
# vnodes in vnodeBoot at once.
#
#
# Historic concurrency value. Should get overwritten in setConcurrency.
#
my $MAXCONCURRENT = 3;
#
# Number of concurrent containers set up in parallel. See the big, long
# navel-gazing comment just above...
#
sub setConcurrency($)
{
    my ($maxval) = @_;

    #
    # Choose how many vnode creations may run at once and store the
    # result in $MAXCONCURRENT.
    #
    # A true $maxval means the caller wants the top value (5) without any
    # probing. Otherwise the value is derived from the dom0 RAM and CPU
    # count reported by domain0Resources(), the stripe count and whether
    # domain0Swapping() reports swap activity. If any of the resource
    # numbers is not positive, $MAXCONCURRENT keeps whatever value it
    # already had.
    #
    if (!$maxval) {
	my ($ram,$cpus) = domain0Resources();
	my $disks       = $STRIPE_COUNT;
	my $hasswapped  = domain0Swapping();

	if ($debug) {
	    print STDERR "setConcurrency: cpus=$cpus, ram=$ram, disks=$disks hasswapped=$hasswapped\n";
	}

	# Only trust the heuristics if every probe gave a sane answer.
	if ($cpus > 0 && $disks > 0 && $ram > 0) {
	    # Start from the most generous setting and throttle down.
	    my $limit = 5;

	    if ($ram < 1024 || (!SHAREDHOST() && $hasswapped)) {
		# Very little dom0 memory, or swapping on a non-shared host.
		$limit = 1;
	    }
	    elsif ($cpus <= 2 || $disks == 1 || $ram <= 2048) {
		# Few CPUs, a single disk or modest dom0 memory.
		$limit = 3;
	    }
	    $MAXCONCURRENT = $limit;
	}
    }
    else {
	$MAXCONCURRENT = 5;
    }

    print STDERR "Limiting to $MAXCONCURRENT concurrent vnode creations.\n";
}
sub init($)
{
my ($pnode_id,) = @_;
......@@ -416,6 +546,10 @@ sub setDebug($)
{
$debug = shift;
libvnode::setDebug($debug);
$lockdebug = 1;
if ($debug > 1) {
$sleepdebug = 1;
}
print "libvnode_xen: debug=$debug\n"
if ($debug);
}
......@@ -855,6 +989,11 @@ sub rootPreConfigNetwork($$$$)
TBDebugTimeStamp(" releasing global lock")
if ($lockdebug);
TBScriptUnlock();
# XXX let vnodesetup exit early
if ($vsrelease eq "immediate") {
TBDebugTimeStamp("rootPreConfigNetwork: touching $VMS/$vnode_id/running");
mysystem2("touch $VMS/$vnode_id/running");
}
return 0;
bad:
TBScriptUnlock();
......@@ -893,6 +1032,27 @@ sub vnodeCreate($$$$)
}
$vninfo->{'vmid'} = $vmid;
#
# XXX future optimization possibility.
#
# Try to be smart about holding the vnode creation lock which is not
# a single lock, but rather a small set of locks intended to limit
# concurrency in the vnode creation process. Specifically, if we grab
# a create_vnode lock and then block waiting for our image lock, then
# we might prevent someone else (using a different image) from making
# progress. So we could instead: grab a create_vnode lock, make a short
# attempt (5-10 seconds) to grab the image lock and, failing that, back
# off of the create_vnode lock, wait and then try the whole process again.
#
# The problem is that we may block again down in downloadOneImage when
# we try to grab the image lock exclusively. Not sure we can back all
# the way out easily in that case!
#
# This is also a bit of a de-optimization when we have a set of vnodes
# all using the same image. We just cause a bit of excess context
# switching in that (probably more common) case.
#
if (CreateVnodeLock() != 0) {
fatal("CreateVnodeLock()");
}
......@@ -907,7 +1067,7 @@ sub vnodeCreate($$$$)
if ($lockdebug);
if (TBScriptLock($imagelockname,
TBSCRIPTLOCK_INTERRUPTIBLE()|TBSCRIPTLOCK_SHAREDLOCK(),
1800) != TBSCRIPTLOCK_OKAY()) {
$MAXIMAGEWAIT) != TBSCRIPTLOCK_OKAY()) {
fatal("Could not get $imagelockname lock!");
}
TBDebugTimeStamp(" got image lock")
......@@ -942,8 +1102,8 @@ sub vnodeCreate($$$$)
TBScriptUnlock();
TBDebugTimeStamp("grabbing image lock $imagelockname exclusive")
if ($lockdebug);
if (TBScriptLock($imagelockname, TBSCRIPTLOCK_INTERRUPTIBLE(), 1800)
!= TBSCRIPTLOCK_OKAY()) {
if (TBScriptLock($imagelockname, TBSCRIPTLOCK_INTERRUPTIBLE(),
$MAXIMAGEWAIT) != TBSCRIPTLOCK_OKAY()) {
fatal("Could not get $imagelockname write lock!");
}
TBDebugTimeStamp(" got image lock")
......@@ -962,7 +1122,7 @@ sub vnodeCreate($$$$)
if ($lockdebug);
if (TBScriptLock($imagelockname,
TBSCRIPTLOCK_INTERRUPTIBLE()|
TBSCRIPTLOCK_SHAREDLOCK(), 1800)
TBSCRIPTLOCK_SHAREDLOCK(), $MAXIMAGEWAIT)
!= TBSCRIPTLOCK_OKAY()) {
fatal("Could not get $imagelockname lock back ".
"after a long time!");
......@@ -1000,6 +1160,8 @@ sub vnodeCreate($$$$)
libutil::setState("RELOADING");
if (createImageDisk($imagename, $vnode_id, $raref, $dothinlv)) {
# XXX not strictly necessary since our caller will send TBFAILED
libutil::setState("RELOADFAILED");
TBScriptUnlock();
fatal("xen_vnodeCreate: ".
"cannot create logical volume for $imagename");
......@@ -1036,6 +1198,9 @@ sub vnodeCreate($$$$)
}
if ($inreload) {
libutil::setState("RELOADDONE");
# XXX why do we need to wait for this to take effect?
print "waiting 4 sec after asserting RELOADDONE...\n"
if ($sleepdebug);
sleep(4);
}
......@@ -1201,6 +1366,7 @@ sub vnodeCreate($$$$)
#
# Create the snapshot LVM.
#
my $mustsleep = 0;
if (!lvmFindVolume($vnode_id)) {
#
# Need to create a new disk for the container. But lets see
......@@ -1295,22 +1461,24 @@ okay:
#
if ($loadslice == 0 && !exists($imagemetadata->{'BOOTPART'})) {
my @tmp;
my $gotit = 0;
#
# XXX It may take a while for the state change above to
# take effect and set the bootwhat info. Sleep a short
# time and try. If that fails, sleep longer and try
# one more time.
# time and try a couple of times as necessary.
#
sleep(1);
my $rv = getbootwhat(\@tmp);
if ($rv || !scalar(@tmp) || !exists($tmp[0]->{"WHAT"})) {
sleep(4);
$rv = getbootwhat(\@tmp);
foreach my $sl (1, 2, 2) {
print "waiting $sl sec to make getbootwhat call...\n"
if ($sleepdebug);
sleep($sl);
my $rv = getbootwhat(\@tmp);
if (!$rv && @tmp > 0 && exists($tmp[0]->{"WHAT"})) {
$gotit = 1;
last;
}
}
if ($rv || !scalar(@tmp) || !exists($tmp[0]->{"WHAT"}) ||
$tmp[0]->{"WHAT"} !~ /^\d*$/) {
if (!$gotit || $tmp[0]->{"WHAT"} !~ /^\d*$/) {
print STDERR Dumper(\@tmp);
TBScriptUnlock();
fatal("libvnode_xen: could not get bootwhat info");
......@@ -1335,8 +1503,9 @@ okay:
TBScriptUnlock();
fatal("libvnode_xen: could not add /dev/mapper entries");
}
# Hmm, some kind of kpartx race ...
sleep(2);
$mustsleep = 2;
}
# Need to tell slicefix where to find the root partition.
# Naming convention is a pain.
......@@ -1357,6 +1526,13 @@ okay:
TBScriptUnlock();
CreateVnodeUnlock();
# Sleep outside of the vnode/image locks
if ($mustsleep) {
print "waiting $mustsleep sec after kpartx call...\n"
if ($sleepdebug);
sleep($mustsleep);
}
#
# Extract kernel and ramdisk.
#
......@@ -1648,15 +1824,27 @@ sub vnodePreConfig($$$$$){
}
#
# We rely on the UFS module (with write support compiled in) to
# deal with FBSD filesystems.
# XXX because of the squirrelly nature of the write-enabled UFS module
# in Linux, we try to avoid write mounting the FS as much as possible.
# So we first mount RO and see if we have already been customized.
#
if ($vninfo->{'os'} eq "FreeBSD") {
mysystem2("mount -t ufs -o ufstype=44bsd $dev $vnoderoot >/dev/null 2>&1");
my $utype = "44bsd";
mysystem2("mount -t ufs -o ro,ufstype=$utype $dev $vnoderoot ".
">/dev/null 2>&1");
if ($?) {
# try UFS2
mysystem("mount -t ufs -o ufstype=ufs2 $dev $vnoderoot");
$utype = "ufs2";
mysystem("mount -t ufs -o ro,ufstype=$utype $dev $vnoderoot");
}
if (-e "$vnoderoot/etc/emulab/genvmtype") {
print STDERR "vnodePreConfig: $vnode_id root already localized\n";
goto done;
}
# needs to be customized, remount RW
mysystem("umount $dev");
mysystem("mount -t ufs -o ufstype=$utype $dev $vnoderoot");
}
else {
mysystem("mount $dev $vnoderoot");
......@@ -1671,8 +1859,7 @@ sub vnodePreConfig($$$$$){
print STDERR
"vnodePreConfig: WARNING: $vnode_id appears to be a configured ".
"elabinelab server; skipping localizations\n";
mysystem("umount $dev");
return 0;
goto done;
}
# XXX We need to get rid of this or get it from tmcd!
......@@ -1882,6 +2069,11 @@ sub vnodePreConfig($$$$$){
$retval = &$callback($vnoderoot);
done:
mysystem("umount $dev");
# XXX let vnodesetup exit early
if ($vsrelease eq "early" && $retval == 0) {
TBDebugTimeStamp("vnodePreConfig: touching $VMS/$vnode_id/running");
mysystem2("touch $VMS/$vnode_id/running");
}
return $retval;
bad:
mysystem("umount $dev");
......@@ -2429,6 +2621,14 @@ sub vnodeBoot($$$$)
# stated to do its thing, this state name is treated specially.
libutil::setState("BOOTING");