#!/usr/bin/perl -T
#
# Copyright (c) 2008-2018 University of Utah and the Flux Group.
# 
# {{{EMULAB-LICENSE
# 
# This file is part of the Emulab network testbed software.
# 
# This file is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
# 
# This file is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public
# License for more details.
# 
# You should have received a copy of the GNU Affero General Public License
# along with this file.  If not, see <http://www.gnu.org/licenses/>.
# 
# }}}
#
# Implements the libvnode API for Docker support in Emulab.
#
# Note that there is no distinguished first or last call of this library
# in the current implementation.  Every vnode creation (through mkvnode.pl)
# will invoke all the root* and vnode* functions.  It is up to us to make
# sure that "one time" operations really are executed only once.
#
package libvnode_docker;
use Exporter;
@ISA    = "Exporter";
# Names exported into the caller's namespace by default.  VGNAME is the
# accessor sub defined further down in this file.
@EXPORT = qw( init setDebug rootPreConfig
              rootPreConfigNetwork rootPostConfig
	      vnodeCreate vnodeDestroy vnodeState vnodePoll vnodePollCleanup
	      vnodeBoot vnodePreBoot vnodeHalt vnodeReboot
	      vnodeUnmount
	      vnodePreConfig vnodePreConfigControlNetwork
              vnodePreConfigExpNetwork vnodeConfigResources
              vnodeConfigDevices vnodePostConfig vnodeExec vnodeTearDown VGNAME
	    );
# $VGNAME is a package global (not a lexical) so code outside this file
# can reach it.
use vars qw($VGNAME);

# Table mapping each libvnode operation name to the code ref that
# implements it in this package.  Declared before "use strict", so it is
# a package variable (%libvnode_docker::ops).
# NOTE(review): vnodePreBoot is exported above but has no entry here --
# confirm whether that is intentional.
%ops = ( 'init' => \&init,
         'setDebug' => \&setDebug,
         'rootPreConfig' => \&rootPreConfig,
         'rootPreConfigNetwork' => \&rootPreConfigNetwork,
         'rootPostConfig' => \&rootPostConfig,
         'vnodeCreate' => \&vnodeCreate,
         'vnodeDestroy' => \&vnodeDestroy,
	 'vnodeTearDown' => \&vnodeTearDown,
         'vnodeState' => \&vnodeState,
	 'vnodePoll' => \&vnodePoll,
	 'vnodePollCleanup' => \&vnodePollCleanup,
         'vnodeBoot' => \&vnodeBoot,
         'vnodeHalt' => \&vnodeHalt,
         'vnodeUnmount' => \&vnodeUnmount,
         'vnodeReboot' => \&vnodeReboot,
         'vnodeExec' => \&vnodeExec,
         'vnodePreConfig' => \&vnodePreConfig,
         'vnodePreConfigControlNetwork' => \&vnodePreConfigControlNetwork,
         'vnodePreConfigExpNetwork' => \&vnodePreConfigExpNetwork,
         'vnodeConfigResources' => \&vnodeConfigResources,
         'vnodeConfigDevices' => \&vnodeConfigDevices,
         'vnodePostConfig' => \&vnodePostConfig,
       );


use strict;
71
use warnings;
72 73 74 75 76 77 78 79
use English;
use Data::Dumper;
use Socket;
use IO::Handle;
use IO::Select;
use File::Basename;
use File::Path;
use File::Copy;
80
use File::Temp qw(tempdir);
81 82
use POSIX;
use JSON::PP;
83
use Digest::SHA qw(sha1_hex);
84
use LWP::Simple;
85
use MIME::Base64;
86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130

# Pull in libvnode
BEGIN { require "/etc/emulab/paths.pm"; import emulabpaths; }
use libutil;
use libgenvnode;
use libvnode;
use libtestbed;
use libsetup;
use libtmcc;
use liblocsetup;

#
# Turn off line buffering on output
#
$| = 1;

#
# Load the OS independent support library. It will load the OS dependent
# library and initialize itself. 
# 

##
## Standard utilities and files section
##

# Paths of the external programs this library refers to.  All are
# absolute except $BRCTL, which is a bare command name.
my $DOCKER = "/usr/bin/docker";
my $CURL = "/usr/bin/curl";
my $BRCTL = "brctl";
my $IP = "/sbin/ip";
my $IFCONFIG = "/sbin/ifconfig";
my $ETHTOOL = "/sbin/ethtool";
my $ROUTE = "/sbin/route";
my $SYSCTL = "/sbin/sysctl";
my $VLANCONFIG = "/sbin/vconfig";
my $MODPROBE = "/sbin/modprobe";
my $IPTABLES	= "/sbin/iptables";
my $NETSTAT     = "/bin/netstat";
# Emulab disk-image tools.
my $IMAGEZIP    = "/usr/local/bin/imagezip";
my $IMAGEUNZIP  = "/usr/local/bin/imageunzip";
my $IMAGEDUMP   = "/usr/local/bin/imagedump";

##
## Runtime configuration options.
##
# General debug level; changed at run time via setDebug().
my $debug  = 0;
# Debug level handed to the Docker API client object in getClient().
my $apidebug = 5;
# When set, lock acquisition/release is timestamped via TBDebugTimeStamp.
my $lockdebug = 0;
my $sleepdebug = 0;

#
# Set to enable vnodesetup to exit before vnode is completely up
# (see vnodesetup::hackwaitandexit). Allows more parallelism during
# boot-time vnode setup. Note that concurrency may still be constrained
# by $MAXCONCURRENT (defined below) which limits how many new VMs can
# be created at once.
#
my $vsrelease = "immediate";	# or "early" or "none"

#
# If Docker is not already installed, which one should we use?  If it's
# not installed, we default to the community edition.  This is a
# runtime-checked param, so we'll use whatever is installed by default,
# not necessarily what is specified here.
#
# You really don't want to use docker.io <= 1.12, because it will take
# too many liberties with the control net bridge.  For instance, if you
# attempt a `systemctl restart docker.service`, you may be SOL and no
# longer on the control net!  docker-ce has patches against this rolled
# in already.
#
my $USE_DOCKER_CE = 1;
#
# Should we use LVM for extra storage space?  This should remain set.
#
my $USE_LVM = 1;
#
# Which docker storage driver should we use; see rootPreConfig().  Note,
# if you change this, you should change USE_DOCKER_LVM to 0 if
# !devicemapper; 1 if devicemapper.
#
my $DOCKER_STORAGE_DRIVER = 'overlay2';
#
# Should we use the Docker devicemapper direct-lvm storage backend?
# This should remain set, so that it is used for shared hosts.  User
# should be able to change to the default AUFS backend on dedicated
# hosts.
#
# NOTE(review): the text above says this "should remain set", but the
# value is 0, which matches the non-devicemapper storage driver chosen
# above -- confirm which statement is current.
#
my $USE_DOCKER_LVM = 0;
#
# Whether NFS mounts are made read-only so that nothing in the
# container can blow them away accidentally.
#
# NOTE(review): the original comment said mounts default to read-only
# "for now", but the value below is 0 -- confirm the intended default.
#
my $NFS_MOUNTS_READONLY = 0;
#
# Should we use libvnode's network data structure caching/indexing
# powers.  We do not by default, because we can effectively use quick
# operations in /sys for everything we need -- no need to index to avoid
# slow calls to brctl or whatnot.
#
my $USE_LIBVNODE_NETCACHE = 0;
#
# Should we log packets the firewall rejects?
#
my $IPTABLES_PACKET_LOG = 1;
#
# Defaults for the default docker bridge (not our control net bridge).
#
my $DOCKER_DEFAULT_BRIDGE_IP = '192.168.254.1';
my $DOCKER_DEFAULT_BRIDGE_CIDR = '192.168.254.1/24';
#
# Docker supports both macvlan and regular bridging, but we use regular
# bridges because we need to impose traffic control on the host context
# half of the veth.
#
my $USE_MACVLAN = 0;
#
# We support macvlans on the control net, but we don't use them because
# we need to apply iptables rules outside the containers, so we need the
# host context half of the veth to use as a source interface.  It is
# tempting to use a cgroup ID plus net_cls, but apparently the markings
# only hold within the container's netns, and don't make it into the
# root (i.e. https://github.com/docker/docker/issues/19802).  So we're
# really stuck with real bridges -- and thus this should not be enabled,
# unless someone else can find a way around this.
#
my $USE_MACVLAN_CNET = 0;
#
# We try to use $IP instead of $BRCTL.
#
my $USE_BRCTL = 0;
#
# Attempt to replace simple COPY instructions from Dockerfile- fragments
# in image augmentation/emulabization with a single COPY.
#
my $COPY_OPTIMIZE = 1;
221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245

##
## Detected configuration variables.
##

#
# Is this our customized version of Docker?
#
# Set to 1 by ensureDockerInstalled() when the dockerd binary contains
# the PrivatePoolId marker.
my $ISOURDOCKER = 0;
#
# Some commands/subsystems have evolved in incompatible ways over time,
# these vars keep track of such things.
#
my $NEW_LVM = 0;

##
## Various constants.
##

#
# Image wait time.  How long (seconds) we will wait when trying to
# grab a lock on an image. Should be set to the max time you think it
# could take to pull a large Docker image.  This is a wild guess, obviously.
#
my $MAXIMAGEWAIT = 1800;

#
# Serial console handling. We fire up a capture per active vnode.
# We use a fine assortment of capture options:
#
#	-i: standalone mode, don't try to contact capserver directly
#	-l: (added later) set directory where log, ACL, and pid files are kept.
#	-C: use a circular buffer to capture activity while no user
#	    is connected. This gets dumped to the user when they connect.
#	-T: Put out a timestamp if there has been no previous output
#	    for at least 10 seconds.
#	-L: In conjunction with -T, the timestamp message includes how
#	    long it has been since the last output.
#	-R: Retry interval of 1 second. When capture is disconnected
#	    from the pty (due to container reboot/shutdowns), this is how
#	    long we wait between attempts to reconnect.
#       -y: When capture disconnects from the pty, we retry forever to reopen.
#       -A: tell capture not to prepend '/dev' to the device path we supply.
#
my $CAPTURE     = "/usr/local/sbin/capture-nossl";
my $CAPTUREOPTS	= "-i -C -L -T 10 -R 1000 -y -1 -A";
my $C2P = "/usr/local/etc/emulab/container2pty.py";

#
# Create a thin pool with the name $POOL_NAME using not more
# than $POOL_FRAC of any disk.
#
my $USE_THIN_LVM = 1;
my $POOL_NAME = "disk-pool";
my $POOL_FRAC = 0.75;
#
# Minimum acceptable size (in GB) of LVM VG for containers.
#
# XXX we used to calculate this in terms of anticipated maximum number
# of vnodes and minimum vnode images size, blah, blah. Now we just pick
# a value that allows us to use a pc3000 node with a single 144GB disk!
#
my $DOCKER_MIN_VGSIZE = 120;
# Striping
my $STRIPE_COUNT   = 1;
# Avoid using SSDs unless there are only SSDs
my $LVM_AVOIDSSD = 1;
# Whether or not to use only unpartitioned (unused) disks to form the
# container VG.
my $LVM_FULLDISKONLY = 0;
# Whether or not to use partitions only when they are big.
my $LVM_ONLYLARGEPARTS = 1;
my $LVM_LARGEPARTPCT = 10;
# In general, you only want to use one partition per disk since we stripe.
my $LVM_ONEPARTPERDISK = 1;
#
# Flags for allocating LVs
#
# Each flag is a constant sub taking no arguments.  All four carry the
# empty prototype so they parse uniformly as constants; the last two
# previously lacked it.
sub ALLOC_NOPOOL()		{ return 0; }
sub ALLOC_INPOOL()		{ return 1; }
sub ALLOC_PREFERNOPOOL()	{ return 2; }
sub ALLOC_PREFERINPOOL()	{ return 3; }

##
## Randomly chosen convention section
##

# Locks.
my $GLOBAL_CONF_LOCK = "emulabdockerconf";
my $GLOBAL_MOUNT_LOCK = "emulabmounts";
my $SSHD_EXEC_LOCK = "sshdockerexec";

# Files making up the private "docker exec" sshd configuration: the
# generated config, the head portion copied from the system sshd_config,
# and the directory of per-vnode Port/Match fragments.
my $DOCKER_EXEC_SSHD_CONFIGFILE = "/etc/ssh/sshd_config-docker-exec";
my $DOCKER_EXEC_SSHD_CONFIGFILE_HEAD = "/etc/ssh/sshd_config-docker-exec.head";
my $DOCKER_EXEC_SSHD_CONFIGDIR = "/etc/ssh/docker-exec.conf.d";

# Config done file.
my $READYFILE = "/var/run/emulab.docker.ready";

# default image to load on logical disks
# Just symlink /boot/vmlinuz-xenU and /boot/initrd-xenU
# to the kernel and ramdisk you want to use by default.
# NOTE(review): the symlink remark above looks like a leftover from the
# Xen library; nothing in this hash refers to those paths -- confirm.
my %defaultImage = (
    'name'      => "ubuntu:16.04",
#    'hub'    => "",
);

# Where we store all our config files.
my $VMS    = "/var/emulab/vms";
my $VMDIR  = "$VMS/vminfo";
# Extra space for VM info.
my $EXTRAFS = "/vms";
# Extra space for vminfo (/var/emulab/vms) between reloads.
my $INFOFS = "/vminfo";

# Docker LVM volume group name. Accessible outside this file.
$VGNAME = "docker";
# So we can ask this from outside;
sub VGNAME()  { return $VGNAME; }

my $CTRLIPFILE = "/var/emulab/boot/myip";
# XXX needs lifting up
my $JAILCTRLNET = "172.16.0.0";
my $JAILCTRLNETMASK = "255.240.0.0";

#
# NB: Total hack.  Docker doesn't give you control over default gateway
# for a multi-homed container, other than to ensure that virtual NICs
# are added in lexical order of name, and to promise that the default
# gateway set by the first-added network will remain.  So make sure the
# control net has a lexical name at the beginning of everything.
#
my $DOCKERCNET = "_dockercnet";

#
# Some of the core dirs for Emulabization of existing Docker images.
#
my $EMULABSRC = "$EXTRAFS/emulab-devel";
my $PUBSUBSRC = "$EXTRAFS/pubsub";
my $RUNITSRC = "$EXTRAFS/runit";
my $CONTEXTDIR = "$EXTRAFS/contexts";
my $DOCKERFILES = "/etc/emulab/docker/dockerfiles";

# IFBs
my $IFBDB      = "/var/emulab/db/ifbdb";

# Use openvswitch for gre tunnels.
# Use a custom version if present, the standard version otherwise.
my $OVSCTL   = "/usr/local/bin/ovs-vsctl";
my $OVSSTART = "/usr/local/share/openvswitch/scripts/ovs-ctl";
if (! -x "$OVSCTL") {
    $OVSCTL   = "/usr/bin/ovs-vsctl";
    $OVSSTART = "/usr/share/openvswitch/scripts/ovs-ctl";
}

my $ISREMOTENODE = REMOTEDED();

##
## Emulab constants.
##
my $TMCD_PORT	 = 7777;
my $SLOTHD_PORT  = 8509;
my $EVPROXY_PORT = 16505;

384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441
##
## Docker constants.
##
#
# The options as far as what to install in an image to support its use
# in Emulab.
#
#   none: we do not alter the image at all!
#   basic: install only sshd and syslogd, and whatever init the user wants
#   core: basic + install a custom-build of the clientside, using a buildenv of
#     the image, but only installing the DESTDIR clientside binaries/fs stuff;
#     also install a whole bunch of packages the clientside stuff needs.
#   buildenv: basic + full + install all build tools for clientside, and
#     install the clientside.
#   full: buildenv + packages to make the image identical to a normal Emulab
#     disk image.
#
use constant {
    DOCKER_EMULABIZE_NONE     => "none",
    DOCKER_EMULABIZE_BASIC    => "basic",
    DOCKER_EMULABIZE_CORE     => "core",
    DOCKER_EMULABIZE_BUILDENV => "buildenv",
    DOCKER_EMULABIZE_FULL     => "full",
};
#
# Most of the Linux images that users will use will be generic images
# whose startup command is sh or bash.  We need something that (at
# minimum) runs infinitely, reaps processes like init, and allows remote
# logins via ssh, syslogs, etc.  Users are free to specify no
# emulabization to cover the cases where the image runs a bona fide
# daemon or pre-configured init.  But we cannot help them with those
# cases automatically.
#
#use constant DOCKER_EMULABIZE_DEFAULT => DOCKER_EMULABIZE_BASIC();
use constant DOCKER_EMULABIZE_DEFAULT => DOCKER_EMULABIZE_NONE();

#
# On modern (i.e. 2016) Linux images, systemd is already installed (on
# Ubuntu/Debian, and Fedora/CentOS).  We really want to let people use
# it if it's there, instead of falling back to runit (which we install
# during Emulabization).  However, the problem is that we cannot use
# systemd as the init on shared nodes -- systemd requires at least
# read-only access to /sys/fs/cgroup, and docker as of 1.26 does not
# virtualize the cgroup mount (although it's in kernels >= 4.4) -- even
# if Docker did, it might not work; I don't know what systemd wants out
# of /sys/fs/cgroup.
#
# Thus, we must default to runit so that users have images that work on
# both shared and dedicated container hosts.  Ugh!
#
use constant {
    DOCKER_INIT_INSTALLED => "installed",
    DOCKER_INIT_RUNIT     => "runit",
};

#
# Either we always pull the reference image when setting up a new
# container, or we only pull the first time.  Simple.
#
use constant {
    DOCKER_PULLPOLICY_LATEST => "latest",
    DOCKER_PULLPOLICY_CACHED => "cached",
};

442 443 444 445 446 447 448 449 450
# Local functions
# Forward declarations (with prototypes) so these subs can be called
# before the point in the file where they are defined.
sub findRoot();
sub copyRoot($$);
sub replace_hacks($);
sub disk_hacks($);
sub hostMemory();
sub hostResources();
sub hostIP($);
sub fixupMac($);
sub lvmVGSize($);
sub checkForInterrupt();
sub genhostspairlist($$);
sub addMounts($$);
sub removeMounts($);
sub bindNetNS($$);
sub moveNetDeviceToNetNS($$$);
sub moveNetDeviceFromNetNS($$$);
sub unbindNetNS($$);
sub setupImage($$$$$$$$$$$);
sub pullImage($$$$;$);
sub emulabizeImage($;$$$$$$$$$);
sub analyzeImage($$;$);
sub AllocateIFBs($$$);
sub ReleaseIFBs($$);
sub CreateShapingScripts($$$$;$);
sub RunShapingScripts($$);
sub CreateRoutingScripts($$);
sub RunRoutingScripts($$);
sub RunWithSignalsBlocked($@);
sub RunProxies($$);
sub AreProxiesRunning($$);
sub KillProxies($$);
sub InsertPostBootIptablesRules($$$$);
sub RemovePostBootIptablesRules($$$$);
sub captureRunning($);
sub captureStart($$);
478 479 480 481 482 483 484 485 486 487 488

#
# A single client object per load of this file is safe.
#
my $_CLIENT;

#
# Return the shared Docker API client object, creating it on first use.
# The object is cached in $_CLIENT, so every later call returns the
# same instance.
#
sub getClient()
{
    if (!defined($_CLIENT)) {
	# Load late, because this requires a bunch of deps we might have
	# installed in ensureDeps().
	require dockerclient;
	$_CLIENT = dockerclient->new();
	$_CLIENT->debug($apidebug);
    }
    return $_CLIENT;
}

496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522
#
# Historic concurrency value. Should get overwritten in setConcurrency.
#
my $MAXCONCURRENT = 5;

#
# Number of concurrent containers set up in parallel.  Lifted from
# libvnode_xen; will be changed later.
#
#
# Set $MAXCONCURRENT, the number of vnodes that may be created at once.
#
# A true $maxval selects the fixed value 5.  Otherwise the limit is
# derived from the host: 3 when RAM is under 1024 or a non-shared host
# has swapped; 5 when CPUs <= 2, there is a single stripe, or RAM is
# <= 2048; 16 in all other cases.  If any of cpus/disks/ram is not
# positive, the current value is left untouched.
#
# NOTE(review): $maxval is only tested for truth; its value is never
# assigned to $MAXCONCURRENT -- confirm that is intended.
#
sub setConcurrency($)
{
    my ($maxval) = @_;

    if ($maxval) {
	$MAXCONCURRENT = 5;
    } else {
	my ($ram,$cpus) = hostResources();
	my $disks = $STRIPE_COUNT;
	my $hasswapped = hostSwapping();

	print STDERR "setConcurrency: cpus=$cpus, ram=$ram, disks=$disks".
	    " hasswapped=$hasswapped\n"
	    if ($debug);

	if ($cpus > 0 && $disks > 0 && $ram > 0) {
	    if ($ram < 1024 || (!SHAREDHOST() && $hasswapped)) {
		$MAXCONCURRENT = 3;
	    } elsif ($cpus <= 2 || $disks == 1 || $ram <= 2048) {
		$MAXCONCURRENT = 5;
	    } else {
		$MAXCONCURRENT = 16;
	    }
	}
    }
    print STDERR "Limiting to $MAXCONCURRENT concurrent vnode creations.\n";
}

#
# Set this library's debug level and pass the same level on to
# libvnode.  Lock debugging is switched on unconditionally; a level
# above 1 also enables sleep debugging and sets the API client debug
# level to 5.
#
sub setDebug($)
{
    $debug = shift;
    libvnode::setDebug($debug);
    $lockdebug = 1;
    if ($debug > 1) {
	$sleepdebug = 1;
	$apidebug = 5;
    }
    print "libvnode_docker: debug=$debug, apidebug=$apidebug\n"
	if ($debug);
}

#
# Build the lock name for an image: "dockerimage." followed by the
# image name (or the default image's name when $imagename is undef),
# with every "/" replaced by "-".
#
sub ImageLockName($)
{
    my ($imagename) = @_;

    my $ln = "dockerimage." .
	(defined($imagename) ? $imagename : $defaultImage{'name'});
    $ln =~ tr/\//-/;

    return $ln;
}

#
# Build the logical volume name for an image: the image name with the
# prefix "image+".
#
sub ImageLVName($)
{
    my $imagename = shift;
    my $lvname = "image+";

    $lvname .= $imagename;
    return $lvname;
}

#
# Apt constants and helper functions.
#
my $APTGET = "/usr/bin/apt-get";
my $APTGETINSTALL = "$APTGET -o Dpkg::Options::='--force-confold'".
    " -o Dpkg::Options::='--force-confdef' install -y ";
my $APTLOCK = "emulab.apt.running";
my $APTLOCK_REF;
my $APTUPDATEDFILE = "/var/run/emulab.apt.updated";

#
# Take the global apt lock, waiting up to 900 seconds.  The lock handle
# is stored in $APTLOCK_REF for aptUnlock().
#
# Returns 0 when TBScriptLock reports OKAY or IGNORE, and -1 (after
# printing a message to STDERR) for any other result.
#
sub aptLock()
{
    if ($lockdebug) {
	TBDebugTimeStamp("aptLock: grabbing global lock $APTLOCK");
    }

    my $status = TBScriptLock($APTLOCK,
			      TBSCRIPTLOCK_GLOBALWAIT(),900,\$APTLOCK_REF);

    if ($status == TBSCRIPTLOCK_OKAY()) {
	if ($lockdebug) {
	    TBDebugTimeStamp("  got global lock $APTLOCK");
	}
	return 0;
    }
    if ($status == TBSCRIPTLOCK_IGNORE()) {
	return 0;
    }

    print STDERR "Could not get the apt-get lock after a long time!\n";
    return -1;
}

# Release the lock whose handle aptLock() stored in $APTLOCK_REF and
# return whatever TBScriptUnlock returns.
sub aptUnlock()
{
    return TBScriptUnlock($APTLOCK_REF);
}

#
# Run "apt-get update" under the apt lock.  Only run once per boot: if
# $APTUPDATEDFILE already exists this returns 0 without doing anything,
# and the file is touched after a successful update.
#
# Returns 0 on success (or when already done), -1 if the apt lock could
# not be obtained, otherwise the exit status ($?) of apt-get update.
#
sub aptGetUpdate()
{
    return 0
	if (-f $APTUPDATEDFILE);

    # Do not run apt-get unserialized if we failed to get the lock.
    return -1
	if (aptLock() != 0);

    mysystem2("$APTGET update");
    # Save the status now so that the touch below cannot overwrite it.
    my $rc = $?;
    if (!$rc) {
	mysystem("touch $APTUPDATEDFILE");
    }
    aptUnlock();
    return $rc;
}

#
# Count how many of the named packages are not installed.  Returns 0
# when every package is installed (or when no names are given).
#
sub aptNotInstalled(@)
{
    my $missing = 0;

    for my $pkg (@_) {
	# Stdout is discarded; only what dpkg-query writes to stderr is
	# captured, and any such output counts the package as missing.
	my $errout = `dpkg-query -L $pkg 2>&1 >/dev/null`;
	$missing++
	    if ($errout);
    }

    return $missing;
}

#
# Install each named package with a separate apt-get invocation, after
# making sure the package lists have been updated.  The installs run
# under the apt lock with DEBIAN_FRONTEND=noninteractive.
#
# Returns the number of packages whose install command exited non-zero.
#
sub aptGetInstall(@)
{
    my @packages = @_;
    my $rc = 0;

    aptGetUpdate();

    # local restores the caller's DEBIAN_FRONTEND (or its absence) when
    # we return, instead of leaving the key behind with an undef value.
    local $ENV{DEBIAN_FRONTEND} = 'noninteractive';
    aptLock();
    foreach my $P (@packages) {
	mysystem2("$APTGETINSTALL $P");
	if ($?) {
	    ++$rc;
	}
    }
    aptUnlock();

    return $rc;
}

#
# Install whichever of the named packages aptNotInstalled() reports as
# missing, one at a time.  Returns the sum of the aptGetInstall()
# results, so 0 means nothing failed.
#
sub aptGetEnsureInstalled(@)
{
    my $failures = 0;

    for my $pkg (@_) {
	next
	    unless (aptNotInstalled($pkg));
	$failures += aptGetInstall($pkg);
    }

    return $failures;
}

665
#
# Rebuild the interface maps, plus either the bridge maps or the
# macvlan maps depending on $USE_MACVLAN.  Does nothing unless
# $USE_LIBVNODE_NETCACHE is set.
#
sub refreshLibVnodeNetCache()
{
    return
	if (!$USE_LIBVNODE_NETCACHE);

    makeIfaceMaps();
    if (!$USE_MACVLAN) {
	makeBridgeMaps();
    }
    else {
	makeMacvlanMaps();
    }
}

679
#
# Make sure the Perl and Python packages this library relies on are
# installed, installing any that are missing via apt (or cpan for
# LWP::Protocol::http::SocketUnixAlt).
#
sub ensureDeps()
{
    foreach my $pkg ("libwww-perl", "liburi-perl", "libhash-merge-perl",
		     "libmime-base64-urlsafe-perl") {
	if (aptNotInstalled($pkg)) {
	    aptGetInstall($pkg);
	}
    }

    #
    # This must be a run-time require.  A "use" statement is executed
    # when the file is compiled, no matter where it appears, so wrapping
    # one in eval {} cannot catch a missing module and the fallback
    # below would never run.
    #
    eval {
	require LWP::Protocol::http::SocketUnixAlt;
	LWP::Protocol::http::SocketUnixAlt->import();
    };
    if ($@) {
	mysystem("cpan -i LWP::Protocol::http::SocketUnixAlt");
    }

    if (aptNotInstalled("python-docker")) {
	aptGetInstall("python-docker");
    }
}

704
#
# Set $ISOURDOCKER if the installed dockerd binary contains the string
# PrivatePoolId.
#
sub _detectOurDocker()
{
    my $rc = system('grep -q PrivatePoolId `which dockerd`');
    if ($rc == 0) {
	$ISOURDOCKER = 1;
	TBDebugTimeStamp("init: ISOURDOCKER=1");
    }
}

#
# Make sure a Docker engine is installed, running, and configured the
# way we need it.
#
#  1. Use docker.io or docker-ce if either is already installed;
#     otherwise install the flavour selected by $USE_DOCKER_CE (adding
#     the Docker CE apt repository first when needed).
#  2. Wait up to 900 seconds for "docker info" to succeed.
#  3. Make sure /etc/docker/daemon.json carries our 'bip', 'iptables'
#     and 'ip-masq' settings; if anything had to change, rewrite the
#     file and restart Docker.
#
# Dies (or calls fatal) when installation fails or daemon.json cannot
# be read or written.  Returns 0 otherwise.
#
# (Must be called only after refreshLibVnodeNetCache() is called for
# the first time in init.)
#
sub ensureDockerInstalled()
{
    if (!aptNotInstalled("docker.io")) {
	TBDebugTimeStamp("docker.io installed; using that");
	$USE_DOCKER_CE = 0;
    }
    elsif (!aptNotInstalled("docker-ce")) {
	TBDebugTimeStamp("docker-ce installed; using that");
	$USE_DOCKER_CE = 1;
    }

    if (!$USE_DOCKER_CE) {
	TBDebugTimeStamp("Ensuring docker.io installed...");
	if (aptNotInstalled("docker.io")) {
	    TBDebugTimeStamp("Installing docker.io...");
	    if (aptGetInstall("docker.io")) {
		die("Failed to install docker.io; aborting!\n");
	    }

	    mysystem2("service docker restart");

	    # Remap, cause Docker creates some ifaces.
	    refreshLibVnodeNetCache();
	}

	#
	# Check which docker this is.
	#
	_detectOurDocker();
    }
    else {
	TBDebugTimeStamp("Ensuring docker-ce installed...");
	# Ensure the Docker CE repo is configured.  -r is needed because
	# sources.list.d is a directory.
	system("grep -qr docker.com /etc/apt/sources.list /etc/apt/sources.list.d");
	if ($?) {
	    TBDebugTimeStamp("Installing docker-ce Apt repos...");
	    aptGetEnsureInstalled("apt-transport-https","ca-certificates",
				  "curl","software-properties-common");
	    mysystem("curl -fsSL https://download.docker.com/linux/ubuntu/gpg".
		     " | sudo apt-key add -");
	    my $release = `lsb_release -cs`;
	    chomp($release);
	    my $arch = `uname -m`;
	    chomp($arch);
	    if ($arch eq 'x86_64' || $arch eq 'amd64') {
		$arch = "amd64";
	    }
	    elsif ($arch eq 'armhf') {
		;
	    }
	    else {
		fatal("currently docker CE is only available on amd64/armhf!");
	    }
	    mysystem("add-apt-repository".
		     " \"deb [arch=$arch] https://download.docker.com/linux/ubuntu $release stable\"");
	    #
	    # aptGetUpdate() only runs once per boot, and the package
	    # installs above have normally triggered it already.  Drop
	    # the stamp file so the new repository really gets fetched.
	    #
	    unlink($APTUPDATEDFILE);
	    aptGetUpdate();
	}

	if (aptNotInstalled("docker-ce")) {
	    TBDebugTimeStamp("Installing docker-ce...");
	    if (aptGetInstall("docker-ce")) {
		warn("Failed to install docker-ce; retrying in 8 seconds!\n");
		sleep(8);
		system("systemctl restart docker.service");
		sleep(2);
		system("apt-get install -y docker-ce");
		if ($?) {
		    fatal("Failed to install docker-ce; aborting!\n");
		}
	    }

	    mysystem2("service docker restart");

	    # Remap, cause Docker creates some ifaces.
	    refreshLibVnodeNetCache();
	}

	#
	# Check which docker this is.
	#
	_detectOurDocker();
    }

    #
    # Wait for docker to be running and responding; this may take awhile if
    # we are running hundreds of containers.
    #
    my @lines = `systemctl is-active docker.service 2>&1`;
    my $needrestart = 0;
    if (!($? == 0 || (@lines > 0 && $lines[0] =~ /^activ/))) {
	$needrestart = 1;
    }
    if ($needrestart) {
	mysystem2("systemctl try-restart docker.service");
    }
    my $startwaittime = time();
    my $dockerup = 0;
    while ((time() - $startwaittime) < 900) {
	my $rc = system("docker info");
	if (!$rc) {
	    TBDebugTimeStamp("docker appears to be running");
	    $dockerup = 1;
	    last;
	}
	else {
	    TBDebugTimeStamp("docker is not yet running; waiting...");
	    sleep(1);
	}
    }
    if (!$dockerup) {
	# We still carry on, but leave a trace of the timeout.
	print STDERR "docker did not respond to 'docker info' within".
	    " 900 seconds; continuing anyway\n";
    }

    #if (aptNotInstalled("systemd-container")
    #	&& aptGetInstall("systemd-container")) {
    #	die("Failed to install systemd-container; aborting!\n");
    #}

    #
    # Check or create the Docker config file; if we have to modify it,
    # restart Docker.
    #
    mkdir("/etc")
	if (! -d "/etc");
    mkdir("/etc/docker")
	if (! -d "/etc/docker");
    my $origjsontext = '';
    my $json = {};
    my $changed = 0;
    if (-e "/etc/docker/daemon.json") {
	open(my $rfh,'<',"/etc/docker/daemon.json")
	    or die("could not open /etc/docker/daemon.json: $!");
	my @jlines = <$rfh>;
	close($rfh);
	$origjsontext = join("",@jlines);
	# An empty (or whitespace-only) file is treated as "no settings"
	# rather than handed to decode_json, which would die on it.
	$json = decode_json($origjsontext)
	    if ($origjsontext =~ /\S/);
    }

    # Check to ensure the docker iface has a non-172.16 subnet, unless
    # we already fixed that:
    if (!exists($json->{'bip'})
	|| $json->{'bip'} ne $DOCKER_DEFAULT_BRIDGE_CIDR) {
	TBDebugTimeStamp("Moving docker0 to $DOCKER_DEFAULT_BRIDGE_CIDR");

	# Blast our docker opts into the right place:
	$json->{'bip'} = $DOCKER_DEFAULT_BRIDGE_CIDR;
	$changed = 1;
    }

    # Check to ensure we're doing the right thing w.r.t. iptables:
    my $have_ipt_docker_user = (mysystem("$IPTABLES -L | grep DOCKER-USER") == 0);
    my $iptval = ($have_ipt_docker_user) ? JSON::PP::true : JSON::PP::false;
    my $ichanged = 0;
    if (!defined($json) || !exists($json->{"iptables"})
	|| $json->{'iptables'} != $iptval) {
	$json->{'iptables'} = $iptval;
	$changed = 1;
	$ichanged = 1;
    }
    if (!defined($json) || !exists($json->{"ip-masq"})
	|| $json->{'ip-masq'} != $iptval) {
	$json->{'ip-masq'} = $iptval;
	$changed = 1;
	$ichanged = 1;
    }

    if ($changed) {
	TBDebugTimeStamp("Updating /etc/docker/daemon.json");

	my $newjsontext = encode_json($json);

	open(my $wfh,'>',"/etc/docker/daemon.json")
	    or die("could not write /etc/docker/daemon.json: $!");
	print $wfh $newjsontext;
	# Buffered write errors only show up at close time.
	close($wfh)
	    or die("could not write /etc/docker/daemon.json: $!");

	mysystem2("service docker stop");

	if ($ichanged && !$have_ipt_docker_user) {
	    #
	    # Make sure all the Docker stuff is undone, if this is not
	    # our Docker.
	    #
	    mysystem("$IPTABLES -P FORWARD ACCEPT");
	    mysystem("$IPTABLES -F INPUT");
	    mysystem("$IPTABLES -F OUTPUT");
	    mysystem("$IPTABLES -F FORWARD");
	    mysystem("$IPTABLES -F DOCKER");
	    mysystem2("$IPTABLES -X DOCKER");
	    mysystem2("$IPTABLES -F DOCKER-ISOLATION");
	    mysystem2("$IPTABLES -X DOCKER-ISOLATION");
	    mysystem2("$IPTABLES -F DOCKER-ISOLATION-STAGE-1");
	    mysystem2("$IPTABLES -X DOCKER-ISOLATION-STAGE-1");
	    mysystem2("$IPTABLES -F DOCKER-ISOLATION-STAGE-2");
	    mysystem2("$IPTABLES -X DOCKER-ISOLATION-STAGE-2");
	}

	mysystem2("service docker start");

	# Remap, cause Docker creates some ifaces.
	refreshLibVnodeNetCache();
    }

    return 0;
}

915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031
#
# Prepare the host for the private "docker exec" sshd: write the head
# config file and create the per-vnode fragment directory.
#
# Returns 0 on success, -1 (with a message on STDERR) if the system
# sshd_config cannot be read or the head file cannot be written.
#
sub setupDockerExecSSH() {
    #
    # We need to read the default sshd config; comment out any Port or
    # ListenAddress lines; and write it out to the head config file.
    # Note, we blow away the head file when first configuring the phost
    # to support docker.
    #
    my $sshdconfig = "/etc/ssh/sshd_config";
    my $infh;
    if (!open($infh,'<',$sshdconfig)) {
	print STDERR "setupDockerExecSSH: could not open $sshdconfig: $!\n";
	return -1;
    }
    my @newlines = ();
    while (my $line = <$infh>) {
	if ($line =~ /^\s*(Port|ListenAddress)/) {
	    $line = "#$line";
	}
	push(@newlines,$line);
    }
    close($infh);

    my $outfh;
    if (!open($outfh,'>',$DOCKER_EXEC_SSHD_CONFIGFILE_HEAD)) {
	print STDERR "setupDockerExecSSH: could not write".
	    " $DOCKER_EXEC_SSHD_CONFIGFILE_HEAD: $!\n";
	return -1;
    }
    print $outfh @newlines;
    if (!close($outfh)) {
	print STDERR "setupDockerExecSSH: could not write".
	    " $DOCKER_EXEC_SSHD_CONFIGFILE_HEAD: $!\n";
	return -1;
    }

    #
    # Then make the dir where we put the per-vhost sshd config bits.
    #
    mysystem("mkdir -p $DOCKER_EXEC_SSHD_CONFIGDIR");

    return 0;
}

#
# Regenerate $DOCKER_EXEC_SSHD_CONFIGFILE from the head file plus the
# per-vnode fragments, then restart (or stop) the sshd-docker-exec
# service.  The caller must already hold the $SSHD_EXEC_LOCK lock.
#
# Returns 0 on success, -1 if the fragments or head file cannot be read
# or the config file cannot be written.
#
sub _rebuildDockerExecSSHLocked()
{
    #
    # Our private Docker Exec sshd listens on the private VM ports and
    # when a user authenticates, we use the ForceCommand directive in a
    # Match block to gateway them into the container that is supposed to
    # be reachable via ssh on that port.  However, only Match blocks may
    # follow other Match blocks -- in particular, a Port directive (to
    # listen on) must precede the Match blocks.  Thus, for each
    # container, we create one file in the configdir like
    # 0.$vnode_id.port with the Port line, and another like
    # 1.$vnode_id.match with the match and command directives).
    #
    # Thus, we need an rcsorted order of files in $DOCKER_EXEC_SSHD_CONFIGDIR.
    #
    my @pmlines = ();
    if (sortedreadallfilesindir($DOCKER_EXEC_SSHD_CONFIGDIR,\@pmlines)) {
	return -1;
    }

    my $headfh;
    if (!open($headfh,'<',$DOCKER_EXEC_SSHD_CONFIGFILE_HEAD)) {
	print STDERR "Could not open $DOCKER_EXEC_SSHD_CONFIGFILE_HEAD: $!\n";
	return -1;
    }
    my @hlines = <$headfh>;
    close($headfh);

    my $conffh;
    if (!open($conffh,'>',$DOCKER_EXEC_SSHD_CONFIGFILE)) {
	print STDERR "Could not write $DOCKER_EXEC_SSHD_CONFIGFILE: $!\n";
	return -1;
    }
    print $conffh "".join('',@hlines)."\n".join('',@pmlines)."\n";
    if (!close($conffh)) {
	print STDERR "Could not write $DOCKER_EXEC_SSHD_CONFIGFILE: $!\n";
	return -1;
    }

    #
    # But, if there were no port/match lines, *stop* the service instead of
    # restarting -- because it would probably try to start on port 22, which
    # of course will just fail it.
    #
    if (@pmlines == 0) {
	TBDebugTimeStamp("No more ports/commands in sshd_config-docker-exec;".
			 " stopping service!");
	mysystem2("systemctl stop sshd-docker-exec.service");
    }
    else {
	TBDebugTimeStamp("Restarting sshd-docker-exec.service for changes to".
			 " sshd_config-docker-exec");
	mysystem2("systemctl restart sshd-docker-exec.service");
    }
    return 0;
}

#
# Take the $SSHD_EXEC_LOCK lock (waiting up to 900 seconds), rebuild the
# docker-exec sshd config and reload the service, then release the lock.
#
# Returns 0 on success or when TBScriptLock reports IGNORE, and -1 if
# the lock cannot be obtained or the rebuild fails.
#
sub rebuildAndReloadDockerExecSSH() {
    TBDebugTimeStamp("rebuildAndReloadDockerExecSSH: grabbing sshd lock".
		     " $SSHD_EXEC_LOCK")
	if ($lockdebug);
    my $locked = TBScriptLock($SSHD_EXEC_LOCK,TBSCRIPTLOCK_GLOBALWAIT(), 900);
    if ($locked != TBSCRIPTLOCK_OKAY()) {
	return 0
	    if ($locked == TBSCRIPTLOCK_IGNORE());
	print STDERR "Could not get the $SSHD_EXEC_LOCK lock".
	    " after a long time!\n";
	return -1;
    }

    my $retval = _rebuildDockerExecSSHLocked();

    TBScriptUnlock();
    return $retval;
}

#
# Register a container with the private "docker exec" sshd.
#
# Writes two snippets into $DOCKER_EXEC_SSHD_CONFIGDIR:
#   0.<vnode_id>.port   -- "Port <port>"
#   1.<vnode_id>.match  -- a Match block whose ForceCommand runs
#                          "docker exec -it <vnode_id> <shell>"
# and then rebuilds the merged config and reloads the service.
#
# Arguments: $vnode_id (container name), $port (port sshd listens on for
# this container), $shell (command to exec inside the container).
#
# Returns the result of rebuildAndReloadDockerExecSSH(), or -1 if either
# snippet could not be written.  In that case both snippets are removed,
# so a Port line is never left behind without its Match/ForceCommand.
#
sub addContainerToDockerExecSSH($$$) {
    my ($vnode_id,$port,$shell)  = @_;

    my $portfile  = "$DOCKER_EXEC_SSHD_CONFIGDIR/0.${vnode_id}.port";
    my $matchfile = "$DOCKER_EXEC_SSHD_CONFIGDIR/1.${vnode_id}.match";
    my @snippets  = (
	[ $portfile,  "Port $port\n" ],
	[ $matchfile, "Match LocalPort=$port\n".
	  "ForceCommand /usr/bin/sudo /usr/bin/docker exec -it $vnode_id $shell\n" ],
    );

    foreach my $snippet (@snippets) {
	my ($path,$text) = @{$snippet};
	my $fd;

	# close() is checked too: buffered write errors surface there.
	if (!open($fd,">",$path) || !(print {$fd} $text) || !close($fd)) {
	    print STDERR "addContainerToDockerExecSSH: could not write".
		" $path: $!\n";
	    unlink($portfile,$matchfile);
	    return -1;
	}
    }

    return rebuildAndReloadDockerExecSSH();
}

#
# Unregister a container from the private "docker exec" sshd: remove its
# two snippets from $DOCKER_EXEC_SSHD_CONFIGDIR and rebuild/reload.
#
# The snippets are the ones addContainerToDockerExecSSH writes:
# 0.<vnode_id>.port and 1.<vnode_id>.match.  (This used to unlink
# 0.<vnode_id>.match, which never exists, so the Match block was never
# removed.)
#
# Returns the result of rebuildAndReloadDockerExecSSH().  A missing
# snippet is not an error.
#
sub removeContainerFromDockerExecSSH($) {
    my ($vnode_id,) = @_;

    unlink("$DOCKER_EXEC_SSHD_CONFIGDIR/0.${vnode_id}.port");
    unlink("$DOCKER_EXEC_SSHD_CONFIGDIR/1.${vnode_id}.match");

    return rebuildAndReloadDockerExecSSH();
}

1032 1033 1034 1035
#
# Return the ids of the containers attached to the Docker network
# $netname, as reported by the client's network_inspect call.
#
# Returns undef if the inspect call reports an error or its content is
# not a hash (after unwrapping a one-element array response); an empty
# list if the network has no "Containers" key; otherwise the keys of
# "Containers" whose entries have a "Name" field.
#
sub getDockerNetMemberIds($)
{
    my ($netname,) = @_;

    my ($code,$content,$resp) = getClient()->network_inspect($netname);
    return undef
	if ($code);

    # The inspect result may arrive wrapped in an array; use its first item.
    $content = $content->[0]
	if (ref($content) eq 'ARRAY');
    return undef
	if (ref($content) ne 'HASH');
    return ()
	if (!exists($content->{"Containers"}));

    my $members = $content->{"Containers"};
    return grep { exists($members->{$_}{"Name"}) } keys(%{$members});
}

#
# Prepare LVM on this host for vnode storage.
#
# Steps, in order:
#  - load the dm-snapshot kernel module;
#  - ask libvnode::findSpareDisks for spare disks/partitions of at least
#    32 * 1024 (the minimum piece size passed to it), first honoring
#    $LVM_AVOIDSSD and then retrying without it if nothing was found;
#  - turn on write caching (hdparm -W1) for the devices we may use;
#  - if the volume group $VGNAME does not exist yet, pick block devices
#    according to $LVM_FULLDISKONLY, $LVM_ONLYLARGEPARTS/$LVM_LARGEPARTPCT
#    and $LVM_ONEPARTPERDISK, then pvcreate/vgcreate them;
#  - recompute $STRIPE_COUNT and activate the volume group.
#
# Returns 0 on success, -1 if the snapshot module cannot be loaded or no
# usable block devices were found.
#
sub setupLVM()
{

    print "Enabling LVM...\n"
	if ($debug);

    # We assume our kernels support this.
    mysystem2("$MODPROBE dm-snapshot");
    if ($?) {
	print STDERR "ERROR: could not load snaphot module!\n";
	return -1;
    }

    #
    # Make sure pieces are at least 32 GiB.
    #
    my $minpsize = 32 * 1024;
    my %devs = libvnode::findSpareDisks($minpsize, $LVM_AVOIDSSD);

    # if ignoring SSDs but came up with nothing, we have to use them!
    if ($LVM_AVOIDSSD && keys(%devs) == 0) {
	%devs = libvnode::findSpareDisks($minpsize, 0);
    }

    #
    # Turn on write caching. Hacky. 
    # XXX note we do not use the returned "path" here as we need to
    # change the setting on all devices, not just the whole disk devices.
    #
    # Hash keys are unique, so each device is visited exactly once.
    #
    foreach my $dev (keys(%devs)) {
	# only mess with the disks we are going to use
	if (exists($devs{$dev}{"size"}) || $LVM_FULLDISKONLY == 0) {
	    mysystem2("hdparm -W1 /dev/$dev");
	}
    }

    #
    # See if our LVM volume group for VMs exists and create it if not.
    #
    # grep may return several lines (any VG whose name merely contains
    # $VGNAME), so the check must look at every line (/m), not just the
    # first one, and must match the name literally (\Q..\E).
    #
    my $vg = `vgs | grep $VGNAME`;
    if ($vg !~ /^\s+\Q${VGNAME}\E\s/m) {
	print "Creating volume group...\n"
	    if ($debug);

	#
	# Total up potential maximum size.
	# Also determine mix of SSDs and non-SSDs if required.
	#
	my $maxtotalSize = 0;
	my $sizeThreshold = 0;
	foreach my $dev (keys(%devs)) {
	    if (defined($devs{$dev}{"size"})) {
		$maxtotalSize += $devs{$dev}{"size"};
	    } else {
		foreach my $part (keys(%{$devs{$dev}})) {
		    $maxtotalSize += $devs{$dev}{$part}{"size"};
		}
	    }
	}
	if ($maxtotalSize > 0) {
	    $sizeThreshold = int($maxtotalSize * $LVM_LARGEPARTPCT / 100.0);
	}

	#
	# Find available devices of sufficient size, prepare them,
	# and incorporate them into a volume group.
	#
	my $totalSize = 0;
	my @blockdevs = ();
	foreach my $dev (sort keys(%devs)) {
	    #
	    # Whole disk is available, use it.
	    #
	    if (defined($devs{$dev}{"size"})) {
		push(@blockdevs, $devs{$dev}{"path"});
		$totalSize += $devs{$dev}{"size"};
		next;
	    }

	    #
	    # Disk contains partitions that are available.
	    #
	    my ($lpsize,$lppath);
	    foreach my $part (keys(%{$devs{$dev}})) {
		my $psize = $devs{$dev}{$part}{"size"};
		my $ppath = $devs{$dev}{$part}{"path"};

		#
		# XXX one way to avoid using the system disk, just ignore
		# all partition devices. However, in cases where the
		# remainder of the system disk represents the majority of
		# the available space (e.g., Utah d710s), this is a bad
		# idea.
		#
		if ($LVM_FULLDISKONLY) {
		    print STDERR
			"WARNING: not using partition $ppath for LVM\n";
		    next;
		}

		#
		# XXX Another heuristic to try to weed out the system
		# disk whenever feasible: if a partition device represents
		# less than some percentage of the max possible space,
		# avoid it. At Utah this one is tuned (10%) to avoid using
		# left over space on the system disk of d820s (which have
		# six other larger drives) or d430s (which have two large
		# disks) while using it on the pc3000s and d710s.
		#
		if ($LVM_ONLYLARGEPARTS && $psize < $sizeThreshold) {
		    print STDERR "WARNING: not using $ppath for LVM (too small)\n";
		    next;
		}

		#
		# XXX If we are only going to use one partition per disk,
		# record the largest one we find here. This check will
		# filter out the small "other OS" partition (3-6GB) in
		# favor of the larger "rest of the disk" partition.
		#
		if ($LVM_ONEPARTPERDISK) {
		    if (!defined($lppath) || $psize > $lpsize) {
			$lppath = $ppath;
			$lpsize = $psize;
		    }
		    next;
		}

		#
		# It ran the gauntlet of feeble filters, use it!
		#
		push(@blockdevs, $ppath);
		$totalSize += $psize;
	    }
	    if ($LVM_ONEPARTPERDISK && defined($lppath)) {
		push(@blockdevs, $lppath);
		$totalSize += $lpsize;
	    }
	}
	if (@blockdevs == 0) {
	    print STDERR "ERROR: findSpareDisks found no disks for LVM!\n";
	    return -1;
	}

	my $blockdevstr = join(' ', sort @blockdevs);
	mysystem("pvcreate $blockdevstr");
	mysystem("vgcreate $VGNAME $blockdevstr");

	my $size = lvmVGSize($VGNAME);
	if ($size < $DOCKER_MIN_VGSIZE) {
	    print STDERR "WARNING: physical disk space below the desired ".
		" minimum value ($size < $DOCKER_MIN_VGSIZE), expect trouble.\n";
	}
    }
    $STRIPE_COUNT = computeStripeSize($VGNAME);

    #
    # Make sure our volumes are active -- they seem to become inactive
    # across reboots
    #
    mysystem("vgchange -a y $VGNAME");

    return 0;
}

#
# Bridge stuff
#
#
# Create the bridge device $br, via brctl when $USE_BRCTL is set and
# via "ip link" otherwise.  Returns the status from system(); callers
# may also inspect $?.
#
sub addbr($)
{
    my ($br) = @_;

    my $cmd = ($USE_BRCTL
	       ? "$BRCTL addbr $br"
	       : "$IP link add $br type bridge");
    return system($cmd);
}
#
# Delete the bridge device $br.  With $USE_BRCTL the link is first set
# down and then removed with brctl; otherwise "ip link del" is used.
# Returns the value of the last mysystem2() call.
#
sub delbr($)
{
    my ($br) = @_;

    return mysystem2("$IP link del $br")
	if (!$USE_BRCTL);

    mysystem2("$IP link set $br down");
    return mysystem2("$BRCTL delbr $br");
}
#
# Attach interface $if to bridge $br, via brctl when $USE_BRCTL is set
# and via "ip link set ... master" otherwise.  Returns the status from
# system().
#
sub addbrif($$)
{
    my ($br,$if) = @_;

    my $cmd = ($USE_BRCTL
	       ? "$BRCTL addif $br $if"
	       : "$IP link set $if master $br");
    return system($cmd);
}
#
# Detach interface $if from bridge $br, via brctl when $USE_BRCTL is
# set and via "ip link set ... nomaster" otherwise.  Returns the status
# from system().
#
sub delbrif($$)
{
    my ($br,$if) = @_;

    my $cmd = ($USE_BRCTL
	       ? "$BRCTL delif $br $if"
	       : "$IP link set $if nomaster");
    return system($cmd);
}

#
# Network support.
#

#
# Look up information about interface $iface through libvnode, using
# the cached lookup when $USE_LIBVNODE_NETCACHE is set and the uncached
# one otherwise.  Returns whatever the chosen libvnode routine returns.
#
sub ifaceInfo($) {
    my ($iface) = @_;

    return libvnode::getIfaceInfoNoCache($iface)
	if (!$USE_LIBVNODE_NETCACHE);

    return libvnode::getIfaceInfo($iface);
}

#
# Return the name of the interface whose MAC address is $mac, or undef
# if none is found.
#
# With $USE_LIBVNODE_NETCACHE set this defers to libvnode::findIface.
# Otherwise it scans the output of "ip -br link": a MAC given without
# colons is first passed through fixupMac(), and the name returned is
# the first field of the first matching line, with any "@parent" suffix
# dropped.
#
# The scan runs "ip" without a shell and matches the MAC literally and
# case-insensitively.  An empty MAC matches nothing (it used to match
# the first interface listed).
#
sub findIfaceByMAC($) {
    my ($mac) = @_;

    if ($USE_LIBVNODE_NETCACHE) {
	return libvnode::findIface($mac);
    }
    else {
	if ($mac !~ /:/) {
	    $mac = fixupMac($mac);
	}
	return undef
	    if (!defined($mac) || $mac eq '');

	# List-form pipe open: no shell sees $mac.
	open(my $ipfd,"-|","ip","-br","link")
	    or return undef;
	my $found;
	while (my $line = <$ipfd>) {
	    next
		if ($line !~ /\Q$mac\E/i);
	    if ($line =~ /^([^\@\s]+)/) {
		$found = $1;
	    }
	    last;
	}
	close($ipfd);
	return $found;
    }
}

#
# Return 1 if interface $iface is a member of bridge $bridge, else 0.
#
# With $USE_LIBVNODE_NETCACHE set, this compares $bridge against
# libvnode::findBridge($iface).  Otherwise it checks sysfs directly: a
# bridge port carries an "upper_<bridge>" link in its own directory
# (the same link getBridgeForIface looks for), and the bridge carries
# the matching "lower_<iface>" link.  The previous check looked for
# "lower_<bridge>" under the port, which is the wrong direction and so
# never matched.
#
sub isIfaceInBridge($$) {
    my ($iface,$bridge) = @_;

    if ($USE_LIBVNODE_NETCACHE) {
	my $br = libvnode::findBridge($iface);
	return 1
	    if (defined($br) && $br eq $bridge);
	return 0;
    }
    else {
	if (-e "/sys/class/net/$iface/upper_$bridge"
	    || -e "/sys/class/net/$bridge/lower_$iface") {
	    return 1;
	}
	return 0;
    }
}

#
# Return the name of the device stacked on top of $iface (normally the
# bridge it is a port of), or undef if there is none or $iface does not
# exist.
#
# With $USE_LIBVNODE_NETCACHE set this defers to libvnode::findBridge.
# Otherwise it scans /sys/class/net/$iface for the first "upper_<name>"
# entry and returns <name>.
# NOTE(review): any stacked device (not only a bridge) appears as an
# upper_ link, so the first one found is returned -- confirm callers
# only use this on plain bridge ports.
#
sub getBridgeForIface($) {
    my ($iface) = @_;

    if ($USE_LIBVNODE_NETCACHE) {
	return libvnode::findBridge($iface);
    }
    else {
	opendir(my $dh,"/sys/class/net/$iface")
	    or return undef;
	my $upper;
	while (my $dir = readdir($dh)) {
	    if ($dir =~ /^upper_(.+)$/) {
		$upper = $1;
		last;
	    }
	}
	# Always release the directory handle, found or not.
	closedir($dh);
	return $upper;
    }
}

#
# Return the list of interfaces underneath bridge $brname.
#
# With $USE_LIBVNODE_NETCACHE set this defers to
# libvnode::findBridgeIfaces.  Otherwise it collects <name> from every
# "lower_<name>" entry in /sys/class/net/$brname/.  Returns undef if
# that directory cannot be opened (e.g. the device does not exist).
#
sub getBridgeIfaces($) {
    my ($brname) = @_;

    if ($USE_LIBVNODE_NETCACHE) {
	return libvnode::findBridgeIfaces($brname);
    }
    else {
	my @ret = ();
	opendir(my $dh,"/sys/class/net/$brname/")
	    or return undef;
	while (my $dir = readdir($dh)) {
	    if ($dir =~ /^lower_(.+)$/) {
		push(@ret,$1);
	    }
	}
	# Release the directory handle once the scan is done.
	closedir($dh);
	return @ret;
    }
}

#
# Return the list of interfaces related to device $brname for the
# macvlan case.
#
# With $USE_LIBVNODE_NETCACHE set this defers to
# libvnode::findMacvlanIfaces.  Otherwise it collects <name> from every
# "lower_<name>" entry in /sys/class/net/$brname/, exactly as
# getBridgeIfaces does.  Returns undef if that directory cannot be
# opened.
# NOTE(review): if $brname is the parent device of the macvlans, its
# children would appear as upper_ rather than lower_ links -- confirm
# what callers pass here.
#
sub getMacvlanIfaces($) {
    my ($brname) = @_;

    if ($USE_LIBVNODE_NETCACHE) {
	return libvnode::findMacvlanIfaces($brname);
    }
    else {
	my @ret = ();
	opendir(my $dh,"/sys/class/net/$brname/")
	    or return undef;
	while (my $dir = readdir($dh)) {
	    if ($dir =~ /^lower_(.+)$/) {
		push(@ret,$1);
	    }
	}
	# Release the directory handle once the scan is done.
	closedir($dh);
	return @ret;
    }
}

#
# Describe the control network of this host.
#
# Reads the first line of each of /var/emulab/boot/controlif, routerip
# and myip (in that order), looks up the control interface with
# getIfaceInfoNoCache(), and checks that its IP equals the recorded
# "myip".
#
# Returns (iface, ip, mask, maskbits, network, mac, gateway) on success,
# or undef if any file is missing or empty, the interface lookup fails,
# or the addresses disagree.
#
sub getControlNet() {
    my %boot = ();

    foreach my $name ("controlif","routerip","myip") {
	open(my $fh,"<","/var/emulab/boot/$name")
	    or return undef;
	my $value = <$fh>;
	close($fh);
	chomp($value);
	return undef
	    if ($value eq '');
	$boot{$name} = $value;
    }

    my $info = getIfaceInfoNoCache($boot{"controlif"});
    if (!defined($info)) {
	return undef;
    }
    # The interface must actually carry the address recorded at boot.
    if ($info->{'ip'} ne $boot{"myip"}) {
	return undef;
    }

    return ($info->{'iface'},$info->{'ip'},$info->{'mask'},
	    $info->{'maskbits'},$info->{'network'},$info->{'mac'},
	    $boot{"routerip"});
}

##
## libvnode API implementation
##

#
# libvnode API: per-invocation initialization.
#
# - When $USE_LVM is set, sets $NEW_LVM if "lvm version" reports
#   2.2.99 or later, and recomputes $STRIPE_COUNT if $READYFILE exists.
# - Sets $ISOURDOCKER if the dockerd binary found by "which" contains
#   the string PrivatePoolId.
#
# $pnode_id is accepted for the libvnode API but not used here.
# Always returns 0.
#
sub init($)
{
    my ($pnode_id,) = @_;

    if ($USE_LVM) {
	# See what version of LVM we have. Again, some commands are different.
	my $out = `lvm version | grep 'LVM version'`;
	if (defined($out) && $out =~ /LVM version:\s+(\d+)\.(\d+)\.(\d+)/) {
	    if (int($1) > 2 ||
		(int($1) == 2 && int($2) > 2) ||
		(int($1) == 2 && int($2) == 2 && int($3) >= 99)) {
		$NEW_LVM = 1;
	    }
	}

	# Compute the strip size for new lvms.
	if (-e "$READYFILE") {
	    $STRIPE_COUNT = computeStripeSize($VGNAME);
	}
    }

    #
    # Check which docker this is.
    #
    # If dockerd is not installed, `which` prints nothing and grep is
    # left with no file argument, so it would read our inherited stdin
    # and could block.  Feeding it /dev/null makes that case a miss.
    #
    my $rc = system('grep -q PrivatePoolId `which dockerd` < /dev/null');
    if ($rc == 0) {
	$ISOURDOCKER = 1;
	TBDebugTimeStamp("init: ISOURDOCKER=1");
    }

    return 0;
}

#
# Called on each vnode, but should only be executed once per boot.
# We use a file in /var/run (cleared on reboots) to ensure this.
#
1460
sub rootPreConfig($;$)
1461
{
1462
    my ($bossip,$hostattributes) = @_;
1463
    my ($code,$content,$resp);
1464 1465 1466 1467 1468 1469 1470 1471 1472

    #
    # Haven't been called yet, grab the lock and double check that someone
    # didn't do it while we were waiting.
    #
    if (! -e "$READYFILE") {
	TBDebugTimeStamp("rootPreConfig: grabbing global lock $GLOBAL_CONF_LOCK")
	    if ($lockdebug);
	my $locked = TBScriptLock($GLOBAL_CONF_LOCK,
1473
				  TBSCRIPTLOCK_GLOBALWAIT(), 1200);
1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492
	if ($locked != TBSCRIPTLOCK_OKAY()) {
	    return 0
		if ($locked == TBSCRIPTLOCK_IGNORE());
	    print STDERR "Could not get the $GLOBAL_CONF_LOCK lock".
		" after a long time!\n";
	    return -1;
	}
    }
    TBDebugTimeStamp("  got global lock")
	if ($lockdebug);
    if (-e "$READYFILE") {
	TBDebugTimeStamp("  releasing global lock")
	    if ($lockdebug);
        TBScriptUnlock();
        return 0;
    }
    
    TBDebugTimeStamp("Configuring root vhost context");

1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511
    #
    # Check if we are using an alternate storage driver.
    #
    if (defined($hostattributes)
	&& exists($hostattributes->{"DOCKER_STORAGE_DRIVER"})) {
	my $driver = $hostattributes->{"DOCKER_STORAGE_DRIVER"};
	if ($driver eq 'overlay2' || $driver eq 'aufs') {
	    $DOCKER_STORAGE_DRIVER = $driver;
	    $USE_DOCKER_LVM = 0;
	}
	elsif ($driver eq 'devicemapper') {
	    $DOCKER_STORAGE_DRIVER = $driver;
	    $USE_DOCKER_LVM = 1;
	}
	else {
	    warn("bogus storage driver $driver; ignoring!\n");
	}
    }

1512 1513 1514
    #
    # Ensure we have the latest bridge/iface state!
    #
1515
    refreshLibVnodeNetCache();
1516

1517 1518 1519 1520 1521
    #
    # Make sure we actually have Docker.
    #
    ensureDockerInstalled();

1522 1523 1524
    #
    # Make sure we have all our Perl deps.
    #
1525
    ensureDeps();
1526

1527 1528 1529 1530 1531 1532
    #
    # Make sure we have a bunch of other common tools.
    #
    aptGetEnsureInstalled("lvm2","thin-provisioning-tools",
			  "bridge-utils","iproute2","vlan");

1533 1534 1535 1536 1537
    #
    # Set up the docker exec sshd service.
    #
    setupDockerExecSSH();

1538 1539 1540
    #
    # Setup our control net device if not already up.
    #
1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552
    if ($USE_MACVLAN_CNET || $USE_MACVLAN) {
	#
	# If we build dummy shortbridge nets atop either a physical
	# device, or atop a dummy device, load these!
	#
	mysystem("$MODPROBE macvlan");
	mysystem("$MODPROBE dummy");
    }
    if (!$USE_MACVLAN_CNET || !$USE_MACVLAN) {
	mysystem("$MODPROBE bridge");
    }

1553
    my ($cnet_iface,$cnet_ip,$cnet_mask,
1554 1555 1556 1557 1558 1559
	$cnet_maskbits,$cnet_net,$cnet_mac,$cnet_gw) = getControlNet();
    if (!defined($cnet_iface) || !defined($cnet_ip)) {
	print STDERR "ERROR: failed to detect control network interface!\n";
	return -1;
    }
    my ($alias_ip,$alias_mask,$vmac) = hostControlNet($cnet_ip,$cnet_mask);
1560 1561 1562
    my ($VCNET_NET,undef,$VCNET_GW,$VCNET_SLASHMASK) = findVirtControlNet();
    my $nettype = ($USE_MACVLAN_CNET) ? "macvlan" : "bridge";

1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598
    #
    # NB: in the case of !$USE_MACVLAN_CNET (i.e. using bridges for
    # control net) and !$ISREMOTENODE, we place the real routable
    # control net addr on the bridge and put the real control net dev in
    # the bridge.  So we want to track the orig_cnet_iface.  Once we
    # shuffle that dev into the bridge, we reset the
    # /var/emulab/boot/controlif file to point to the bridge -- and thus
    # if this gets re-run, it won't get the real control net dev as in
    # arg to this function.  So the code that handles this case is
    # careful to use orig_cnet_iface instead of cnet_iface!  None of the
    # other cases care, since they don't re-write
    # /var/emulab/boot/controlif.
    #
    my $orig_cnet_iface;
    #
    # Assume if this is not present, this is the first time running.  If
    # so, the real control net device must have the real control net IP;
    # not $DOCKERCNET!  So if you wipe this file out to retry, make sure
    # to reset the real controlif with proper info from dhclient.
    #
    if (! -e "/var/run/emulab-controlif-orig") {
	$orig_cnet_iface = $cnet_iface;
	open(FD,">/var/run/emulab-controlif-orig")
	    or fatal("could not open /var/run/emulab-controlif-orig: $!");
	print FD "$cnet_iface";
	close(FD);
    }
    else {
	open(FD,"/var/run/emulab-controlif-orig")
	    or fatal("could not open /var/run/emulab-controlif-orig: $!");
	$orig_cnet_iface = <FD>;
	chomp($orig_cnet_iface);
	close(FD);
    }

    my $dcnexists = 0;
1599 1600 1601
    TBDebugTimeStamp("checking for docker network $DOCKERCNET...");
    ($code,$content,$resp) = getClient()->network_inspect($DOCKERCNET);
    if ($code == 0) {
1602 1603 1604
	$dcnexists = 1;
    }

1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618
    if ($USE_MACVLAN_CNET && ! -e "/sys/class/net/$DOCKERCNET") {
	my $alias_net =
	    inet_ntoa(inet_aton($alias_ip) & inet_aton($alias_mask));

	if (!$ISREMOTENODE) {
            #
            # We first add a macvlan "alias" to the control net device
            # so that we (the physical host) are in the same subnet as
            # the vnodes.  With the macvlan interfaces, you cannot
            # directly alias the parent device and talk to/from the
            # other macvlan children on the parent.
            #
	    print "Creating $DOCKERCNET macvlan on $cnet_iface".
		" ($alias_ip,$alias_mask)...\n";
1619
	    mysystem("$IP link add link $cnet_iface name $DOCKERCNET".
1620
		     " address $vmac type macvlan mode bridge");
1621 1622
	    mysystem("$IP addr replace $alias_ip/$alias_mask dev $DOCKERCNET");
	    mysystem("$IP link set up $DOCKERCNET");
1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649

	    #my $isroutable = isRoutable($alias_ip);
	    ## Add a route to reach the vnodes. Do it for the entire
	    ## network, and no need to remove it.
	    #if (!$ISREMOTENODE && !$isroutable
	    #	&& system("$NETSTAT -r | grep -q $alias_net")) {
	    #	mysystem2("$ROUTE add -net $alias_net netmask $alias_mask dev $cnet_iface");
	    #	if ($?) {
	    #	    warn("could not add non-routable local virt control net route!");
	    #	    #return -1;
	    #	}
	    #}
	}
	else {
	    #
	    # XXX will this actually work? macvlan children can't talk to host?
	    # XXX probably need to add a dummy device to back the docker
	    # macvlan network!
	    # $alias_ip = $cnet_ip;
            #
            # Ok, since that won't work, in this case, we add a dummy
            # device to host our control net macvlan devices atop; we
            # don't want anything bridged to the outside world in the
            # remoteded case.  Then we add our control net alias like
            # above.
            #
	    $cnet_iface = "dummycnet";
1650
	    mysystem2("$IP link add dummycnet type dummy");
1651 1652
	    print "Creating $DOCKERCNET macvlan on $cnet_iface".
		" ($alias_ip,$alias_mask)...\n";
1653
	    mysystem("$IP link add link $cnet_iface".
1654
		     " name $DOCKERCNET address $vmac type macvlan mode bridge");
1655 1656
	    mysystem("$IP addr replace $alias_ip/$alias_mask dev $DOCKERCNET");
	    mysystem("$IP link set up $DOCKERCNET");
1657 1658
	}
    }
1659
    elsif (!$USE_MACVLAN_CNET
1660
	   && (!$dcnexists || !isIfaceInBridge($orig_cnet_iface,$DOCKERCNET))) {
1661 1662 1663
	my $alias_net =
	    inet_ntoa(inet_aton($alias_ip) & inet_aton($alias_mask));

1664 1665 1666
	#
	# If the bridge doesn't exist, add it first.
	#
1667 1668 1669 1670 1671 1672
	if (! -e "/sys/class/net/$DOCKERCNET") {
	    addbr($DOCKERCNET);
	    if ($?) {
		fatal("failed to create $DOCKERCNET bridge!");
		return -1;
	    }
1673 1674
	}

1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705
	#
	# The $ISREMOTENODE case is easy, because the real control net
	# device doesn't go into the bridge, and we and Docker expect
	# the bridge to have the fake virtual control net address.  So
	# harmony ensues.
	#
	# The !$ISREMOTENODE case is very, very tricky.  The first time
	# we boot, the docker network doesn't exist; the bridge doesn't
	# exist; all the control net state is as dhclient left it.  The
	# correct order there is create bridge; flush control net ip
	# addr; move control net dev into bridge; add control net as
	# docker network; flush bridge ip addr Docker set; set our
	# proper public control net IP as the bridge ip addr; and add
	# the unroutable virtual control net addr (the docker network
	# gateway) as an alias.  NB: Docker will not accept or add the
	# virtual control net IP as an alias; it will error, or force
	# the IP to the virtual addr.  That is why we must fix it up
	# after creating the Docker network.
	#
	# On subsequent boots, the control net already exists as a
	# Docker network, and Docker will create the control net device
	# before we run.  However, Docker doesn't put the real control
	# net device into that bridge (it doesn't know that kind of
	# thing); but it does give the bridge the virtual control IP as
	# its primary IP.  So, we have to flush the bridge IP, and *not*
	# remake the Docker cnet.
	#
	# What a pain, all because Docker cannot just leave an existing
	# bridge alone (i.e.,
	# https://github.com/docker/docker/issues/20758).
	#
1706
	if (!$ISREMOTENODE) {
1707 1708 1709 1710
	    my $ipandmaskbits = "$cnet_ip/$cnet_maskbits";

	    # First grab the default gateway.
	    my ($defroute,$defrouteiface);
1711
	    open(ROUTEOUTPUT,"$IP route list |")
1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726
		or fatal("unable to get route list via 'ip'!");
	    while (!eof(ROUTEOUTPUT)) {
		my $line = <ROUTEOUTPUT>;
		chomp($line);
		if ($line =~ /^default via (\d+\.\d+\.\d+\.\d+)/) {
		    $defroute = $1;
		}
		if ($line =~ /^default via [\w\.\/]+\s+dev\s+([\w\.]+)/) {
		    $defrouteiface = $1;
		}
	    }
	    if (!$defroute) {
		fatal("could not find default route!");
	    }

1727
	    #
1728 1729 1730 1731
	    # Undo the existing control net config we obtained on boot,
	    # and move that interface into our $DOCKERCNET bridge, IFF
	    # it's not in the bridge already.  If it's already in the
	    # bridge, no need to do any of this.
1732
	    #
1733 1734 1735 1736
	    if (!isIfaceInBridge($orig_cnet_iface,$DOCKERCNET)) {
		mysystem2("$IP link set down $orig_cnet_iface");
		mysystem2("$IP addr del $ipandmaskbits dev $orig_cnet_iface");
		mysystem2("$IP addr flush dev $orig_cnet_iface");
1737 1738
		addbrif($DOCKERCNET,$orig_cnet_iface);
	    }
1739

1740
	    #
1741 1742 1743 1744 1745
	    # If the Docker network does not exist in Docker itself, but
	    # it *does* exist as a device, flush its IP addr since
	    # Docker insists on setting that itself.
	    #
	    if (!$dcnexists && -e "/sys/class/net/$DOCKERCNET") {
1746
		mysystem2("$IP addr flush dev $DOCKERCNET");
1747 1748
	    }

1749
	    #
1750 1751 1752
	    # If the docker network isn't yet built, do that now.
	    #
	    if (!$dcnexists) {
1753 1754 1755 1756 1757 1758 1759 1760
		TBDebugTimeStamp("creating bridged docker network $DOCKERCNET");
		($code,$content) = getClient()->network_create_bridge(
		    $DOCKERCNET,"${VCNET_NET}/${VCNET_SLASHMASK}",$alias_ip,
		    $DOCKERCNET);
		if ($code) {
		    fatal("failed to create bridged Docker $DOCKERCNET control net:".
			  " $content");
		}
1761 1762 1763 1764 1765 1766 1767
		$dcnexists = 1;
	    }

	    #
	    # Always flush the bridge's Docker-imposed addr immediately,
	    # whether it existed or we created it.
	    #
1768
	    mysystem("$IP addr flush dev $DOCKERCNET");
1769 1770 1771 1772 1773

	    #
	    # Set the $DOCKERCNET configuration to one that both we and
	    # Docker are happy with.
	    #
1774
	    mysystem2("$IP addr add $ipandmaskbits dev $DOCKERCNET");
1775
	    if ($?) {
1776
		mysystem("$IP addr replace $ipandmaskbits dev $DOCKERCNET");
1777
	    }
1778 1779
	    mysystem("$IP link set up $DOCKERCNET");
	    mysystem("$IP link set up $orig_cnet_iface");
1780 1781
	    if ($defrouteiface eq $cnet_iface
		|| $defrouteiface eq $orig_cnet_iface) {
1782
		mysystem("$IP route replace default via $defroute");
1783
	    }
1784
	    mysystem("$IP addr add $alias_ip/$alias_mask dev $DOCKERCNET".
1785
		     " label $DOCKERCNET:1");
1786

1787 1788 1789
	    #
	    # Save the bridge as the real control net iface.
	    #
1790 1791 1792 1793 1794 1795 1796 1797 1798
	    open(CONTROLIF,">$BOOTDIR/controlif");
	    print CONTROLIF "$DOCKERCNET\n";
	    close(CONTROLIF);
	}
	else {
	    #
	    # If this node is remote, then it gets a bridge without the
	    # control net.
	    #
1799 1800
	    mysystem("$IP addr replace $alias_ip/$alias_mask dev $DOCKERCNET");
	    mysystem("$IP link set up $DOCKERCNET");
1801 1802 1803 1804
	}
    }

    #
1805
    # Now if the Docker control net still doesn't exist, create that.
1806
    #
1807
    if (!$dcnexists) {
1808 1809 1810 1811 1812
	if ($USE_MACVLAN_CNET) {
	    #
	    # Next, we create a docker macvlan network to front for the
	    # virt control net.
	    #
1813 1814 1815 1816 1817 1818 1819 1820
	    TBDebugTimeStamp("creating macvlan docker network $DOCKERCNET");
	    ($code,$content) = getClient()->network_create_macvlan(
		$DOCKERCNET,"${VCNET_NET}/${VCNET_SLASHMASK}",$alias_ip,
		$cnet_iface);
	    if ($code) {
		fatal("failed to create bridged Docker $DOCKERCNET control net:".
		      " $content");
	    }
1821 1822
	}
	else {
1823 1824 1825 1826 1827 1828 1829 1830
	    TBDebugTimeStamp("creating bridged docker network $DOCKERCNET");
	    ($code,$content) = getClient()->network_create_bridge(
		$DOCKERCNET,"${VCNET_NET}/${VCNET_SLASHMASK}",$alias_ip,
		$DOCKERCNET);
	    if ($code) {
		fatal("failed to create bridged Docker $DOCKERCNET control net:".
		      " $content");
	    }
1831 1832 1833 1834 1835 1836 1837
	}
    }

    #
    # Mesh our iptables setup with docker's.  This is nontrivial because
    # Docker does one nasty thing: it continually forces its -j
    # DOCKER-ISOLATION rule into the top of the FORWARD chain on
1838 1839 1840 1841 1842 1843 1844 1845 1846
    # significant operations (like creating a container).  This has
    # since been fixed in more recent versions (there is a DOCKER-USER
    # chain at the top of the forward chain that we hook into), so we
    # have two strategies.  First, if DOCKER-USER exists, we hook it;
    # second, if that is not available, we disable its use of iptables
    # and do all the stuff Docker would normally do that we actually
    # need (a subset of what Docker normally does).  However, in this
    # latter case, iptables won't behave as expected for regular
    # containers.  Nothing we can do about that.
1847 1848 1849 1850 1851 1852 1853 1854 1855
    #
    # We use the same basic strategy in either case: what we want to do
    # is flow all packets on the control net bridge through our
    # EMULAB-ISOLATION chain.  But we do return to the DOCKER-ISOLATION
    # chain so that Docker rules can affect other Docker networks.
    #
    mysystem2("$IPTABLES -N EMULAB-ISOLATION");
    mysystem("$IPTABLES -F EMULAB-ISOLATION");
    mysystem("$IPTABLES -A EMULAB-ISOLATION -j RETURN");
1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869
    if (mysystem("$IPTABLES -L | grep DOCKER-USER") == 0) {
	mysystem("$IPTABLES -F DOCKER-USER");
	mysystem("$IPTABLES -A DOCKER-USER -j EMULAB-ISOLATION");
	#
	# In more recent versions of Docker, by default, bridge networks
	# are not allowed to leave the host (i.e. via masquerading).
	# So, fix that.
	#
	mysystem("$IPTABLES -A DOCKER-USER -o docker0 -j ACCEPT");
	mysystem("$IPTABLES -A DOCKER-USER -o _dockercnet -j ACCEPT");
    }
    else {
	mysystem("$IPTABLES -I FORWARD -j EMULAB-ISOLATION");
    }
1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890

    #
    # Also, Docker handles MASQUERADING for us by default.  We don't
    # want to turn off Docker's iptables (it's on or off) functionality,
    # because people should be able to bring up Docker VMs manually if
    # they want, using the default Docker host network (or one of the
    # experiment networks, if they safely manage IP addr assignment).
    # However, as discussed above, we have to turn it off if it's not
    # our modified version.  So we have to add the MASQ rule if iptables
    # is off in Docker.
    #
    # If this is a local testbed node, we want to allow unroutable
    # packets on the control net.  So we have to add local control net
    # exceptions ahead of Docker's default MASQ-all rules.
    #
    if (!$ISREMOTENODE) {
	mysystem("$IPTABLES -t nat -I POSTROUTING".
		 " -s ${VCNET_NET}/${VCNET_SLASHMASK}".
		 " -d ${VCNET_NET}/${VCNET_SLASHMASK} -j ACCEPT");
	mysystem("$IPTABLES -t nat -I POSTROUTING".
		 " -s ${VCNET_NET}/${VCNET_SLASHMASK}".
1891
		 " -d ${cnet_net}/${cnet_mask} -j ACCEPT");
1892 1893 1894
	# NB: Ok, more recent versions of Docker no longer seem to allow
	# default outbound masquerading -- so always do it.
	if (1 || !$ISOURDOCKER) {
1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926
	    mysystem("$IPTABLES -t nat -A POSTROUTING".
		     " -s ${VCNET_NET}/${VCNET_SLASHMASK}".
		     " -j MASQUERADE");
	    # Also do the default docker0 bridge CIDR, since Docker
	    # won't be doing it and we want temp user containers to
	    # work.
	    mysystem("$IPTABLES -t nat -A POSTROUTING".
		     " -s $DOCKER_DEFAULT_BRIDGE_CIDR".
		     " -j MASQUERADE");
	}
    }

    #
    # XXX: antispoofing!  Can't do it with macvlan control net though.
    #
    # We also choose not to use the style here; instead, we are
    # draconian and drop everything that comes from the vnode that does
    # not have its IP.  We do that later.
    #
    # We want to change the below code not to DROP on the FORWARD chain
    # by default, but rather to drop anything that comes from a vnode's
    # cnet iface that is not sourced from its assigned control net IP.
    #
    if (0) {
	mysystem("$IPTABLES -P FORWARD DROP");
	mysystem("$IPTABLES -F FORWARD");
	# This says to forward traffic across the bridge.
	mysystem("$IPTABLES -A FORWARD ".
		 "-m physdev --physdev-in $cnet_iface -j ACCEPT");
    }

    # For tunnels
1927
    mysystem("$MODPROBE ip_gre");
1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952

    # For VLANs
    mysystem("$MODPROBE 8021q");

    # We need this stuff for traffic shaping -- only root context can
    # modprobe.
    mysystem("$MODPROBE sch_netem");
    mysystem("$MODPROBE sch_htb");

    # For bandwidth constraints.
    mysystem("$MODPROBE ifb");

    # Create a DB to manage them. 
    my %MDB;
    if (!dbmopen(%MDB, $IFBDB, 0660)) {
	print STDERR "*** Could not create $IFBDB\n";
	TBScriptUnlock();
	return -1;
    }
    dbmclose(%MDB);
    
    #
    # Ensure that LVM is loaded in the kernel and ready.
    #
    if ($USE_LVM) {
1953 1954 1955 1956 1957 1958 1959 1960
	# There are several reasons we might need a Docker restart in
	# this LVM setup bit; they will be noted along the way, and we
	# will restart if necessary.
	my $needdockerrestart = 0;

	#
	# Sets up our PVs and VG ($VGNAME).
	#
1961 1962
	setupLVM();

1963 1964 1965 1966 1967 1968 1969 1970 1971 1972
	#
	# Figure out how big various volumes should be.
	#
	# If we are using the aufs storage backend for Docker, we want
	# most of our space in $EXTRAFS (since /var/lib/docker gets
	# symlinked there, our heaviest space usage may be there); in
	# that case, we save a ~10%VG buffer of free space.  Wild guess.
	#
	# If we are instead using the devicemapper direct-lvm backend,
	# we need both $EXTRAFS and $INFOFS, but we also need a beefy
1973
	# thinpool for Docker.  In this case, we use min(32GB,15%VG) LV
1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989
	# for $INFOFS; use min(32GB,15%remainingVG) for the $EXTRAFS;
	# then we provision the thin pool with 90% of the remaining
	# space (i.e., 0.90*(totalVG - sizeof($EXTRAFS) -
	# sizeof($INFOFS))).  This results in at least some spare space
	# in case some heavy usage happens, for autoextension of the
	# thinpool.  And we could even consider garbage-collecting
	# context build dirs in $EXTRAFS and downsizing that so that the
	# thin pool can grow more, for instance on a shared host, if
	# necessary.
	#
	my ($extrasize,$infosize,$thinpoolsize) = (0,0,0);
	my $vgsize = lvmVGSize($VGNAME);
	my $remaining = $vgsize;

	if (!$USE_DOCKER_LVM) {
	    # We will only create $EXTRAFS and $INFOFS.
1990 1991
	    if (0.15 * $remaining < 32) {
		$infosize = 0.15 * $remaining;
1992 1993
	    }
	    else {
1994
		$infosize = 32;
1995 1996 1997 1998
	    }
	    $remaining -= $infosize;
	    $extrasize = 0.90 * $remaining;
	    $remaining -= $extrasize;
1999
	}
2000 2001 2002
	else {
	    # We will create $EXTRAFS and $INFOFS, as well as the Docker
	    # thin pool.
2003 2004
	    if (0.15 * $remaining < 32) {
		$infosize = 0.15 * $remaining;
2005 2006
	    }
	    else {
2007
		$infosize = 32;
2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025
	    }
	    $remaining -= $infosize;
	    if (0.15 * $remaining < 32) {
		$extrasize = 0.15 * $remaining;
	    }
	    else {
		$extrasize = 32;
	    }
	    $remaining -= $extrasize;
	    $thinpoolsize = 0.90 * $remaining;
	    $remaining -= $thinpoolsize;
	}

	my $tmplvname;
	if ($INFOFS =~ /\/(.*)$/) {
	    $tmplvname = $1;
	}
	if (!libvnode::lvExists($VGNAME,$tmplvname)) {
2026 2027 2028 2029 2030 2031 2032 2033
	    print "Creating container info FS $tmplvname ...\n";
	}
	else {
	    print "Mounting container info FS $tmplvname ...\n";
	}
	if (createExtraFS($INFOFS, $VGNAME, "${infosize}G")) {
	    TBScriptUnlock();
	    return -1;
2034 2035 2036 2037
	}
	if ($EXTRAFS =~ /\/(.*)$/) {
	    $tmplvname = $1;
	}
2038
	my $already = 0;
2039
	if (!libvnode::lvExists($VGNAME,$tmplvname)) {
2040
	    print "Creating scratch FS $tmplvname ...\n";
2041 2042 2043 2044
	    if (-d $EXTRAFS) {
		$already = 1;
		mysystem("mv $EXTRAFS ${EXTRAFS}.bak");
	    }
2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058
	}
	else {
	    print "Mounting scratch FS $tmplvname ...\n";
	}
	if (createExtraFS($EXTRAFS, $VGNAME, "${extrasize}G")) {
	    TBScriptUnlock();
	    return -1;
	}
	if ($already) {
	    my @files = glob("${EXTRAFS}.bak/*");
	    foreach my $file (@files) {
		my $base = basename($file);
		mysystem("/bin/mv $file $EXTRAFS")
		    if (! -e "$EXTRAFS/$base");
2059
	    }
2060
	    mysystem("/bin/rm -rf ${EXTRAFS}.bak");
2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077
	}
	if ($USE_DOCKER_LVM && !libvnode::lvExists($VGNAME,"thinpool")) {
	    print "Creating Docker Thin Pool...\n";
	    #
	    # Docker wants a thinpool and a metadata pool.  Size of the
	    # metadata pool cannot exceed 16GB.  So we create that as
	    # min(16,0.01*$thinpoolsize).
	    #
	    my ($tps,$tpms) = (0,0);
	    if (0.01 * $thinpoolsize < 16) {
		$tpms = 0.01 * $thinpoolsize;
	    }
	    else {
		$tpms = 16;
	    }
	    $tps = $thinpoolsize - $tpms;
	    # XXX: --wipesignatures y ?
2078 2079
	    mysystem("lvcreate -n thinpool $VGNAME -L ${tps}G");
	    mysystem("lvcreate -n thinpoolmeta $VGNAME -L ${tpms}G");
2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093
	    mysystem("lvconvert -y --zero n -c 512K".
		     " --thinpool $VGNAME/thinpool".
		     " --poolmetadata $VGNAME/thinpoolmeta");
	    mkdir("/etc/lvm/profile");
	    open(FD,">/etc/lvm/profile/$VGNAME-thinpool.profile")
		or fatal("could not open /etc/lvm/profile/$VGNAME-thinpool.profile: $@");
	    print FD "activation {\n".
		"  thin_pool_autoextend_threshold=90\n".
		"  thin_pool_autoextend_percent=10\n".
		"}\n";
	    close(FD);
	    mysystem("lvchange --metadataprofile $VGNAME-thinpool".
		     " $VGNAME/thinpool");
	    mysystem("lvs -o+seg_monitor");
2094 2095
	}
	if (defined($DOCKER_STORAGE_DRIVER)) {
2096
	    #
2097 2098
	    # Setup the Docker storage backend.
	    # If using the devicemapper direct-lvm storage backend, it looks like
2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126
	    # { "storage-driver": "devicemapper",
	    #   "storage-opts": [
	    #     "dm.thinpooldev=/dev/mapper/docker-thinpool",
	    #     "dm.use_deferred_removal=true",
	    #     "dm.use_deferred_deletion=true" ] }
	    #
	    my $origjsontext = '';
	    my $json = {};
	    if (-e "/etc/docker/daemon.json") {
		open(FD,"/etc/docker/daemon.json")
		    or die("could not open /etc/docker/daemon.json: $!");
		my @lines = <FD>;
		close(FD);
		$origjsontext = join("",@lines);
		$json = decode_json($origjsontext);
	    }

	    # If it exists, just delete it; we only want valid stuff in here.
	    if (defined($json->{"storage-driver"})) {
		delete($json->{"storage-driver"});
	    }
	    if (defined($json->{"storage-opts"})) {
		delete($json->{"storage-opts"});
	    }

	    # Write our config.
	    # Don't restart docker; that happens at the end of $USE_LVM.
	    $needdockerrestart = 1;
2127 2128 2129 2130 2131 2132 2133 2134
	    $json->{"storage-driver"} = "$DOCKER_STORAGE_DRIVER";
	    if ($DOCKER_STORAGE_DRIVER eq 'devicemapper') {
		$json->{"storage-opts"} = [
		    "dm.thinpooldev=/dev/mapper/${VGNAME}-thinpool",
		    "dm.use_deferred_removal=true",
		    "dm.use_deferred_deletion=true"
		    ];
	    }
2135 2136 2137 2138 2139 2140 2141 2142 2143

	    TBDebugTimeStamp("Updating /etc/docker/daemon.json");

	    my $newjsontext = encode_json($json);

	    open(FD,">/etc/docker/daemon.json")
		or die("could not write /etc/docker/daemon.json: $!");
	    print FD $newjsontext;
	    close(FD);
2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159
	}
	if (! -l $VMS) {
	    #
	    # We need this stuff to be sticky across reloads, so move it
	    # into an lvm. If we lose the lvm, well then we are screwed.
	    #
	    my @files = glob("$VMS/*");
	    foreach my $file (@files) {
		my $base = basename($file);
		mysystem("/bin/mv $file $INFOFS")
		    if (! -e "$INFOFS/$base");
	    }
	    mysystem("/bin/rm -rf $VMS");
	    mysystem("/bin/ln -s $INFOFS $VMS");
	}
	if (! -l '/var/lib/docker') {
2160 2161
	    # Make sure Docker is stopped before we do this, if it
	    # wasn't stopped above already!
2162
	    mysystem2("systemctl stop docker.service");
2163
	    $needdockerrestart = 1;
2164 2165 2166 2167 2168 2169
	    if ($?) {
		warn("could not stop docker service before moving".
		     " /var/lib/docker to LVM; aborting!");
		TBScriptUnlock();
		return -1;
	    }
2170 2171
	    my $rca = mysystem2("mount -t aufs | grep /var/lib/docker/");
	    my $rco = mysystem2("mount -t overlay2 | grep /var/lib/docker/");
2172 2173 2174 2175 2176
	    if ($? == 0) {
		warn("filesystems still mounted in /var/lib/docker; aborting!");
		TBScriptUnlock();
		return -1;
	    }
2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188
	    if (! -d "$EXTRAFS/var.lib.docker") {
		mkdir("$EXTRAFS/var.lib.docker");
		#
		# We need this stuff to be sticky across reloads, so move it
		# into an lvm. If we lose the lvm, well then we are screwed.
		#
		my @files = glob("/var/lib/docker/*");
		foreach my $file (@files) {
		    my $base = basename($file);
		    mysystem("/bin/mv $file $EXTRAFS/var.lib.docker")
			if (! -e "$EXTRAFS/var.lib.docker/$base");
		}
2189 2190 2191
	    }
	    mysystem("/bin/rm -rf /var/lib/docker");
	    mysystem("/bin/ln -s $EXTRAFS/var.lib.docker /var/lib/docker");
2192
	}
2193