libvnode_docker.pm 188 KB
Newer Older
1
#!/usr/bin/perl -T
2
#
3
# Copyright (c) 2008-2018 University of Utah and the Flux Group.
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# 
# {{{EMULAB-LICENSE
# 
# This file is part of the Emulab network testbed software.
# 
# This file is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
# 
# This file is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public
# License for more details.
# 
# You should have received a copy of the GNU Affero General Public License
# along with this file.  If not, see <http://www.gnu.org/licenses/>.
# 
# }}}
#
# Implements the libvnode API for Docker support in Emulab.
#
# Note that there is no distinguished first or last call of this library
# in the current implementation.  Every vnode creation (through mkvnode.pl)
# will invoke all the root* and vnode* functions.  It is up to us to make
# sure that "one time" operations really are executed only once.
#
package libvnode_docker;
use Exporter;
@ISA    = "Exporter";
@EXPORT = qw( init setDebug rootPreConfig
              rootPreConfigNetwork rootPostConfig
	      vnodeCreate vnodeDestroy vnodeState vnodePoll vnodePollCleanup
	      vnodeBoot vnodePreBoot vnodeHalt vnodeReboot
	      vnodeUnmount
	      vnodePreConfig vnodePreConfigControlNetwork
              vnodePreConfigExpNetwork vnodeConfigResources
              vnodeConfigDevices vnodePostConfig vnodeExec vnodeTearDown VGNAME
	    );
use vars qw($VGNAME);

#
# Dispatch table: maps each libvnode API operation name to the sub in
# this package that implements it.
# NOTE(review): this is a package global (assigned before "use strict"),
# so it is presumably looked up by name from outside this file -- confirm
# against the generic vnode driver.
#
%ops = ( 'init' => \&init,
         'setDebug' => \&setDebug,
         'rootPreConfig' => \&rootPreConfig,
         'rootPreConfigNetwork' => \&rootPreConfigNetwork,
         'rootPostConfig' => \&rootPostConfig,
         'vnodeCreate' => \&vnodeCreate,
         'vnodeDestroy' => \&vnodeDestroy,
	 'vnodeTearDown' => \&vnodeTearDown,
         'vnodeState' => \&vnodeState,
	 'vnodePoll' => \&vnodePoll,
	 'vnodePollCleanup' => \&vnodePollCleanup,
         'vnodeBoot' => \&vnodeBoot,
         'vnodeHalt' => \&vnodeHalt,
         'vnodeUnmount' => \&vnodeUnmount,
         'vnodeReboot' => \&vnodeReboot,
         'vnodeExec' => \&vnodeExec,
         'vnodePreConfig' => \&vnodePreConfig,
         'vnodePreConfigControlNetwork' => \&vnodePreConfigControlNetwork,
         'vnodePreConfigExpNetwork' => \&vnodePreConfigExpNetwork,
         'vnodeConfigResources' => \&vnodeConfigResources,
         'vnodeConfigDevices' => \&vnodeConfigDevices,
         'vnodePostConfig' => \&vnodePostConfig,
       );


use strict;
71
use warnings;
72
73
74
75
76
77
78
79
use English;
use Data::Dumper;
use Socket;
use IO::Handle;
use IO::Select;
use File::Basename;
use File::Path;
use File::Copy;
80
use File::Temp qw(tempdir);
81
82
use POSIX;
use JSON::PP;
83
use Digest::SHA qw(sha1_hex);
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129

# Pull in libvnode
BEGIN { require "/etc/emulab/paths.pm"; import emulabpaths; }
use libutil;
use libgenvnode;
use libvnode;
use libtestbed;
use libsetup;
use libtmcc;
use liblocsetup;

#
# Turn off line buffering on output
#
$| = 1;

#
# Load the OS independent support library. It will load the OS dependent
# library and initialize itself. 
# 

##
## Standard utilities and files section
##

# Paths of the host tools this library shells out to.
my $DOCKER = "/usr/bin/docker";
my $CURL = "/usr/bin/curl";
# NOTE: unlike the others, brctl is not an absolute path; it is resolved
# through PATH at run time.
my $BRCTL = "brctl";
my $IP = "/sbin/ip";
my $IFCONFIG = "/sbin/ifconfig";
my $ETHTOOL = "/sbin/ethtool";
my $ROUTE = "/sbin/route";
my $SYSCTL = "/sbin/sysctl";
my $VLANCONFIG = "/sbin/vconfig";
my $MODPROBE = "/sbin/modprobe";
my $IPTABLES	= "/sbin/iptables";
# NOTE: $IPBIN names the same binary as $IP above.
my $IPBIN	= "/sbin/ip";
my $NETSTAT     = "/bin/netstat";
# Emulab disk-image tools.
my $IMAGEZIP    = "/usr/local/bin/imagezip";
my $IMAGEUNZIP  = "/usr/local/bin/imageunzip";
my $IMAGEDUMP   = "/usr/local/bin/imagedump";

##
## Runtime configuration options.
##
my $debug  = 0;
130
my $apidebug = 5;
131
132
133
my $lockdebug = 0;
my $sleepdebug = 0;

134
135
136
137
138
139
140
141
142
#
# Set to enable vnodesetup to exit before vnode is completely up
# (see vnodesetup::hackwaitandexit). Allows more parallelism during
# boot-time vnode setup. Note that concurrency may still be constrained
# by $MAXCONCURRENT (defined below) which limits how many new VMs can
# be created at once.
#
my $vsrelease = "immediate";	# or "early" or "none"

143
144
145
146
147
148
149
150
151
152
153
154
155
#
# If Docker is not already installed, which one should we use?  If it's
# not installed, we default to the community edition.  This is a
# runtime-checked param, so we'll use whatever is installed by default,
# not necessarily what is specified here.
#
# You really don't want to use docker.io <= 1.12, because it will take
# too many liberties with the control net bridge.  For instance, if you
# attempt a `systemctl restart docker.service`, you may be SOL and no
# longer on the control net!  docker-ce has patches against this rolled
# in already.
#
my $USE_DOCKER_CE = 1;
156
157
158
#
# Should we use LVM for extra storage space?  This should remain set.
#
159
160
161
162
163
164
165
166
my $USE_LVM = 1;
#
# Should we use the Docker devicemapper direct-lvm storage backend?
# This should remain set, so that it is used for shared hosts.  User
# should be able to change to the default AUFS backend on dedicated
# hosts.
#
my $USE_DOCKER_LVM = 1;
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
#
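# Whether NFS mounts default to read-only, so that nothing in the
# container can blow them away accidentally.
# NOTE(review): the flag below is currently 0 (i.e. not read-only), which
# contradicts the stated "read-only for now" intent -- confirm which is
# wanted.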
#
my $NFS_MOUNTS_READONLY = 0;
#
# Should we log packets the firewall rejects?
#
my $IPTABLES_PACKET_LOG = 1;
#
# Defaults for the default docker bridge (not our control net bridge).
#
my $DOCKER_DEFAULT_BRIDGE_IP = '192.168.254.1';
my $DOCKER_DEFAULT_BRIDGE_CIDR = '192.168.254.1/24';
#
# Docker supports both macvlan and regular bridging, but we use regular
# bridges because we need to impose traffic control on the host context
# half of the veth.
#
# There *is* a Docker plugin for openvswitch, but we don't want to use
# that yet; $BR_USE_OPENVSWITCH is simply for existing code that could
# enable this feature.
#
my $USE_MACVLAN = 0;
#
# We support macvlans on the control net, but we don't use them because
# we need to apply iptables rules outside the containers, so we need the
# host context half of the veth to use as a source interface.  It is
# tempting to use a cgroup ID plus net_cls, but apparently the markings
# only hold within the container's netns, and don't make it into the
# root (i.e. https://github.com/docker/docker/issues/19802).  So we're
# really stuck with real bridges -- and thus this should not be enabled,
# unless someone else can find a way around this.
#
my $USE_MACVLAN_CNET = 0;
my $USE_OPENVSWITCH = 0;
#
# This flag controls whether we use OVS for GRE tunnels (i.e. for EGRE),
# or if we use Linux kernel GRE + routing + veths.
#
my $TUN_USE_OPENVSWITCH = 0;

##
## Detected configuration variables.
##

#
# Is this our customized version of Docker?
#
my $ISOURDOCKER = 0;
#
# Some commands/subsystems have evolved in incompatible ways over time,
# these vars keep track of such things.
#
my $NEW_LVM = 0;

##
## Various constants.
##

#
# Image wait time.  How long (in seconds) we will wait when trying to
# grab a lock on an image.  Should be set to the max time you think it
# could take to pull a large Docker image.  This is a wild guess, obviously.
#
my $MAXIMAGEWAIT = 1800;
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255

#
# Serial console handling. We fire up a capture per active vnode.
# We use a fine assortment of capture options:
#
#	-i: standalone mode, don't try to contact capserver directly
#	-l: (added later) set directory where log, ACL, and pid files are kept.
#	-C: use a circular buffer to capture activity while no user
#	    is connected. This gets dumped to the user when they connect.
#	-T: Put out a timestamp if there has been no previous output
#	    for at least 10 seconds.
#	-L: In conjunction with -T, the timestamp message includes how
#	    long it has been since the last output.
#	-R: Retry interval of 1 second. When capture is disconnected
#	    from the pty (due to container reboot/shutdowns), this is how
#	    long we wait between attempts to reconnect.
#       -y: When capture disconnects from the pty, we retry forever to reopen.
#       -A: tell capture not to prepend '/dev' to the device path we supply.
#
my $CAPTURE     = "/usr/local/sbin/capture-nossl";
my $CAPTUREOPTS	= "-i -C -L -T 10 -R 1000 -y -1 -A";
my $C2P = "/usr/local/etc/emulab/container2pty.py";

256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
#
# Create a thin pool with the name $POOL_NAME using not more
# than $POOL_FRAC of any disk.
# 
my $USE_THIN_LVM = 1;
my $POOL_NAME = "disk-pool";
my $POOL_FRAC = 0.75;
#
# Minimum acceptable size (in GB) of LVM VG for containers.
#
# XXX we used to calculate this in terms of anticipated maximum number
# of vnodes and minimum vnode images size, blah, blah. Now we just pick
# a value that allows us to use a pc3000 node with a single 144GB disk!
#
my $DOCKER_MIN_VGSIZE = 120;
# Striping
my $STRIPE_COUNT   = 1;
# Avoid using SSDs unless there are only SSDs
my $LVM_AVOIDSSD = 1;
# Whether or not to use only unpartitioned (unused) disks to form the Docker VG.
my $LVM_FULLDISKONLY = 0;
# Whether or not to use partitions only when they are big.
my $LVM_ONLYLARGEPARTS = 1;
my $LVM_LARGEPARTPCT = 10;
# In general, you only want to use one partition per disk since we stripe.
my $LVM_ONEPARTPERDISK = 1;
#
# Flags for allocating LVs
#
#
# LV allocation flag values.  All four carry an empty prototype so that
# they parse as constants; without it, an expression such as
# "ALLOC_PREFERNOPOOL + 1" is parsed as "ALLOC_PREFERNOPOOL(+1)".
#
sub ALLOC_NOPOOL()       { return 0; }
sub ALLOC_INPOOL()       { return 1; }
sub ALLOC_PREFERNOPOOL() { return 2; }
sub ALLOC_PREFERINPOOL() { return 3; }

##
## Randomly chosen convention section
##

# Locks.
my $GLOBAL_CONF_LOCK = "emulabdockerconf";
my $GLOBAL_MOUNT_LOCK = "emulabmounts";
297
298
299
300
301
my $SSHD_EXEC_LOCK = "sshdockerexec";

my $DOCKER_EXEC_SSHD_CONFIGFILE = "/etc/ssh/sshd_config-docker-exec";
my $DOCKER_EXEC_SSHD_CONFIGFILE_HEAD = "/etc/ssh/sshd_config-docker-exec.head";
my $DOCKER_EXEC_SSHD_CONFIGDIR = "/etc/ssh/docker-exec.conf.d";
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322

# Config done file.
my $READYFILE = "/var/run/emulab.docker.ready";

# default image to load on logical disks
# Just symlink /boot/vmlinuz-xenU and /boot/initrd-xenU
# to the kernel and ramdisk you want to use by default.
my %defaultImage = (
    'name'      => "ubuntu:16.04",
#    'hub'    => "",
);

# Where we store all our config files.
my $VMS    = "/var/emulab/vms";
my $VMDIR  = "$VMS/vminfo";
# Extra space for VM info.
my $EXTRAFS = "/vms";
# Extra space for vminfo (/var/emulab/vms) between reloads.
my $INFOFS = "/vminfo";

# Docker LVM volume group name. Accessible outside this file.
323
$VGNAME = "docker";
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
# So we can ask this from outside;
sub VGNAME()  { return $VGNAME; }
    
my $CTRLIPFILE = "/var/emulab/boot/myip";
# XXX needs lifting up
my $JAILCTRLNET = "172.16.0.0";
my $JAILCTRLNETMASK = "255.240.0.0";

#
# NB: Total hack.  Docker doesn't give you control over default gateway
# for a multi-homed container, other than to ensure that virtual NICs
# are added in lexical order of name, and to promise that the default
# gateway set by the first-added network will remain.  So make sure the
# control net has a lexical name at the beginning of everything.
#
my $DOCKERCNET = "_dockercnet";

#
# Some of the core dirs for Emulabizing existing Docker images.
#
my $EMULABSRC = "$EXTRAFS/emulab-devel";
my $PUBSUBSRC = "$EXTRAFS/pubsub";
346
my $RUNITSRC = "$EXTRAFS/runit";
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
my $CONTEXTDIR = "$EXTRAFS/contexts";
my $DOCKERFILES = "/etc/emulab/docker/dockerfiles";

# IFBs
my $IFBDB      = "/var/emulab/db/ifbdb";

# Use openvswitch for gre tunnels.
# Use a custom version if present, the standard version otherwise.
my $OVSCTL   = "/usr/local/bin/ovs-vsctl";
my $OVSSTART = "/usr/local/share/openvswitch/scripts/ovs-ctl";
if (! -x "$OVSCTL") {
    $OVSCTL   = "/usr/bin/ovs-vsctl";
    $OVSSTART = "/usr/share/openvswitch/scripts/ovs-ctl";
}

my $ISREMOTENODE = REMOTEDED();

##
## Emulab constants.
##
my $TMCD_PORT	 = 7777;
my $SLOTHD_PORT  = 8509;
my $EVPROXY_PORT = 16505;

371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
##
## Docker constants.
##
#
# The options as far as what to install in an image to support its use
# in Emulab.
#
#   none: we do not alter the image at all!
#   basic: install only sshd and syslogd, and whatever init the user wants
#   core: basic + install a custom-build of the clientside, using a buildenv of
#     the image, but only installing the DESTDIR clientside binaries/fs stuff;
#     also install a whole bunch of packages the clientside stuff needs.
#   buildenv: basic + full + install all build tools for clientside, and
#     install the clientside.
#   full: buildenv + packages to make the image identical to a normal Emulab
#     disk image.
#
sub DOCKER_EMULABIZE_NONE()
{
    return "none";
}
sub DOCKER_EMULABIZE_BASIC()
{
    return "basic";
}
sub DOCKER_EMULABIZE_CORE()
{
    return "core";
}
sub DOCKER_EMULABIZE_BUILDENV()
{
    return "buildenv";
}
sub DOCKER_EMULABIZE_FULL()
{
    return "full";
}
#
# Default Emulabization level.
#
# Most Linux images users pick are generic ones whose startup command is
# sh or bash.  At minimum we need something that runs forever, reaps
# processes like init, and allows remote logins via ssh, syslogs, etc.
# Users may specify no emulabization for images that already run a bona
# fide daemon or a pre-configured init, but we cannot help them with
# those cases automatically.
#
#sub DOCKER_EMULABIZE_DEFAULT() { return DOCKER_EMULABIZE_BASIC(); }
sub DOCKER_EMULABIZE_DEFAULT()
{
    return DOCKER_EMULABIZE_NONE();
}

#
# On modern (i.e., 2016) Linux images, systemd is already installed (on
# Ubuntu/Debian, and Fedora/CentOS).  We really want to let people use
# it if it's there, instead of falling back to runit (which we install
# during Emulabization).  However, the problem is that we cannot use
# systemd as the init on shared nodes -- systemd requires at least
# read-only access to /sys/fs/cgroup, and docker as of 1.26 does not
# virtualize the cgroup mount (although it's in kernels >= 4.4) -- even
# if Docker did, it might not work; I don't know what systemd wants out
# of /sys/fs/cgroup.
#
# Thus, we must default to runit so that users have images that work on
# both shared and dedicated container hosts.  Ugh!
#
sub DOCKER_INIT_INSTALLED()
{
    return "installed";
}
sub DOCKER_INIT_RUNIT()
{
    return "runit";
}

#
# Image pull policy: either always pull the reference image when setting
# up a new container ("latest"), or pull only the first time ("cached").
#
sub DOCKER_PULLPOLICY_LATEST()
{
    return "latest";
}
sub DOCKER_PULLPOLICY_CACHED()
{
    return "cached";
}

429
430
431
432
433
434
435
436
437
# Local functions
#
# Forward declarations (with prototypes) for subs used in this file.
# The stray numeric lines that were interleaved with these declarations
# have been removed; the prototypes themselves are unchanged.
#
sub findRoot();
sub copyRoot($$);
sub replace_hacks($);
sub disk_hacks($);
sub hostMemory();
sub hostResources();
sub hostIP($);
sub fixupMac($);
sub lvmVGSize($);
sub checkForInterrupt();
sub genhostspairlist($$);
sub addMounts($$);
sub removeMounts($);
sub bindNetNS($$);
sub moveNetDeviceToNetNS($$$);
sub moveNetDeviceFromNetNS($$$);
sub unbindNetNS($$);
sub setupImage($$$$$$$$$$);
sub pullImage($$$$;$);
sub emulabizeImage($;$$$$$$$$);
sub analyzeImage($$);
sub AllocateIFBs($$$);
sub ReleaseIFBs($$);
sub CreateShapingScripts($$$$;$);
sub RunShapingScripts($$);
sub CreateRoutingScripts($$);
sub RunRoutingScripts($$);
sub RunWithSignalsBlocked($@);
sub RunProxies($$);
sub KillProxies($$);
sub InsertPostBootIptablesRules($$$$);
sub RemovePostBootIptablesRules($$$$);
sub captureRunning($);
sub captureStart($$);
464
465
466
467
468
469
470
471
472
473
474

#
# A single client object per load of this file is safe.
#
my $_CLIENT;

#
# Return the cached dockerclient object, constructing it on first use
# with its debug level set from $apidebug.
#
# (Stray numeric lines that had been interleaved with this body, and made
# it unparseable, have been removed; the logic is unchanged.)
#
sub getClient()
{
    return $_CLIENT
        if (defined($_CLIENT));

    # Load late, because this requires a bunch of deps we might have
    # installed in ensureDeps().
    require dockerclient;
    $_CLIENT = dockerclient->new();
    $_CLIENT->debug($apidebug);
    return $_CLIENT;
}

482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
#
# Historic concurrency value. Should get overwritten in setConcurrency.
#
my $MAXCONCURRENT = 5;

#
# Number of concurrent containers set up in parallel.  Lifted from
# libvnode_xen; will be changed later.
#
#
# Pick $MAXCONCURRENT, the number of vnodes created in parallel.
#
# A true argument selects the fixed value 5.  Otherwise the value is
# derived from what hostResources() and hostSwapping() report, together
# with $STRIPE_COUNT; if any of cpus/disks/ram is not positive the
# current $MAXCONCURRENT is left alone.  The chosen limit is always
# reported on STDERR.
#
sub setConcurrency($)
{
    my ($maxval) = @_;

    if (!$maxval) {
        my ($ram,$cpus) = hostResources();
        my $disks = $STRIPE_COUNT;
        my $hasswapped = hostSwapping();

        if ($debug) {
            print STDERR "setConcurrency: cpus=$cpus, ram=$ram, disks=$disks".
                " hasswapped=$hasswapped\n";
        }

        if ($cpus > 0 && $disks > 0 && $ram > 0) {
            # Tiny or already-swapping (non-shared) hosts: one at a time.
            if ($ram < 1024 || (!SHAREDHOST() && $hasswapped)) {
                $MAXCONCURRENT = 1;
            }
            # Modest hosts.
            elsif ($cpus <= 2 || $disks == 1 || $ram <= 2048) {
                $MAXCONCURRENT = 3;
            }
            else {
                $MAXCONCURRENT = 5;
            }
        }
    }
    else {
        $MAXCONCURRENT = 5;
    }
    print STDERR "Limiting to $MAXCONCURRENT concurrent vnode creations.\n";
}

#
# Set the debug level for this library and pass it on to libvnode.
#
# Lock debugging is switched on unconditionally; a level above 1 also
# enables sleep debugging and sets the Docker API debug level to 5.
#
# (Stray numeric lines that had been interleaved with this body, and made
# it unparseable, have been removed; the logic is unchanged.)
#
sub setDebug($)
{
    $debug = shift;
    libvnode::setDebug($debug);
    $lockdebug = 1;
    if ($debug > 1) {
        $sleepdebug = 1;
        $apidebug = 5;
    }
    print "libvnode_docker: debug=$debug, apidebug=$apidebug\n"
        if ($debug);
}

#
# Build the lock name for an image: "dockerimage." followed by the image
# name (or the default image's name when none is given), with every "/"
# replaced by "-".
#
# (Stray numeric lines that had been interleaved with this body, and made
# it unparseable, have been removed; the logic is unchanged.)
#
sub ImageLockName($)
{
    my ($imagename) = @_;

    my $ln = "dockerimage." .
        (defined($imagename) ? $imagename : $defaultImage{'name'});
    $ln =~ tr/\//-/;

    return $ln;
}

#
# Name of the logical volume that backs the given image: the image name
# prefixed with "image+".
#
sub ImageLVName($)
{
    my $imagename = shift;

    return "image+" . $imagename;
}

#
# Apt constants and helper functions.
#
my $APTGET = "/usr/bin/apt-get";
my $APTGETINSTALL = "$APTGET -o Dpkg::Options::='--force-confold'".
    " -o Dpkg::Options::='--force-confdef' install -y ";
my $APTLOCK = "emulab.apt.running";
my $APTLOCK_REF;
my $APTUPDATEDFILE = "/var/run/emulab.apt.updated";

#
# Take the global apt lock ($APTLOCK), waiting up to 900 seconds, and ask
# TBScriptLock to store the lock handle in $APTLOCK_REF.
#
# Returns 0 when the lock was obtained or TBScriptLock reported
# TBSCRIPTLOCK_IGNORE; returns -1 (after complaining on STDERR) otherwise.
#
sub aptLock()
{
    if ($lockdebug) {
        TBDebugTimeStamp("aptLock: grabbing global lock $APTLOCK");
    }

    my $status = TBScriptLock($APTLOCK,TBSCRIPTLOCK_GLOBALWAIT(),900,
                              \$APTLOCK_REF);

    if ($status == TBSCRIPTLOCK_OKAY()) {
        if ($lockdebug) {
            TBDebugTimeStamp("  got global lock $APTLOCK");
        }
        return 0;
    }
    if ($status == TBSCRIPTLOCK_IGNORE()) {
        return 0;
    }

    print STDERR "Could not get the apt-get lock after a long time!\n";
    return -1;
}

#
# Release the apt lock by handing $APTLOCK_REF (the handle aptLock() asks
# TBScriptLock to fill in) to TBScriptUnlock, and return whatever
# TBScriptUnlock returns.
#
sub aptUnlock()
{
    return TBScriptUnlock($APTLOCK_REF);
}

# Only run once per boot.
#
# Run "apt-get update" under the apt lock unless the marker file
# $APTUPDATEDFILE already exists; on success the marker is touched so
# later calls return immediately.
#
# Returns 0 if the marker already existed, otherwise the value of $?
# after the update (and, when the update succeeded, after the touch).
#
sub aptGetUpdate()
{
    return 0
        if (-f $APTUPDATEDFILE);

    aptLock();
    mysystem2("apt-get update");
    mysystem("touch $APTUPDATEDFILE")
        unless ($?);
    my $status = $?;
    aptUnlock();

    return $status;
}

#
# Returns 0 if all packages are installed; else the number of
# non-installed packages.
#
#
# Count how many of the named packages dpkg-query complains about.
# A package counts as "not installed" when "dpkg-query -L" writes
# anything to stderr for it (stdout is discarded).
#
sub aptNotInstalled(@)
{
    my $missing = 0;

    for my $pkg (@_) {
        my $errtext = `dpkg-query -L $pkg 2>&1 >/dev/null`;
        $missing++
            if ($errtext);
    }

    return $missing;
}

#
# Install each named package with apt-get (one invocation per package),
# holding the apt lock while doing so.  aptGetUpdate() is called first.
#
# Returns the number of packages whose install command exited non-zero.
#
sub aptGetInstall(@)
{
    my @packages = @_;
    my $rc = 0;

    aptGetUpdate();

    #
    # Run apt non-interactively for the duration of this call only.
    # local() puts back whatever was there before (including "not set")
    # when we leave, rather than leaving the variable assigned undef.
    #
    local $ENV{DEBIAN_FRONTEND} = 'noninteractive';

    aptLock();
    foreach my $P (@packages) {
        mysystem2("$APTGETINSTALL $P");
        if ($?) {
            ++$rc;
        }
    }
    aptUnlock();

    return $rc;
}

#
# For each named package that aptNotInstalled() reports as missing, try
# to install it.  Returns the sum of the aptGetInstall() results, i.e.
# the number of failed installs.
#
sub aptGetEnsureInstalled(@)
{
    my $failed = 0;

    for my $pkg (@_) {
        next
            if (!aptNotInstalled($pkg));
        $failed += aptGetInstall($pkg);
    }

    return $failed;
}

#
# Rebuild the cached network device maps: always the interface maps, then
# either the macvlan maps or the bridge maps depending on $USE_MACVLAN.
#
sub refreshNetworkDeviceMaps()
{
    makeIfaceMaps();
    if ($USE_MACVLAN) {
        makeMacvlanMaps();
    }
    else {
        makeBridgeMaps();
    }
}

662
#
# Make sure the Perl and Python packages this library needs at run time
# are installed, installing any that are missing.
#
sub ensureDeps()
{
    foreach my $pkg ("libwww-perl","liburi-perl","libhash-merge-perl",
                     "libmime-base64-urlsafe-perl") {
        aptGetInstall($pkg)
            if (aptNotInstalled($pkg));
    }

    #
    # Probe for the module at run time.  This must be require, not use:
    # "use" is executed at compile time, before the eval is ever entered,
    # so a missing module would abort compilation of this file instead of
    # setting $@ and letting us fall through to the cpan install.
    #
    eval {
        require LWP::Protocol::http::SocketUnixAlt;
        LWP::Protocol::http::SocketUnixAlt->import();
    };
    if ($@) {
        mysystem("cpan -i LWP::Protocol::http::SocketUnixAlt");
    }

    if (aptNotInstalled("python-docker")) {
        aptGetInstall("python-docker");
    }
}

687
688
689
690
# (Must be called only after refreshNetworkDeviceMaps() is called for
# the first time in init.)
#
# Make sure a Docker package is installed and that the daemon is
# configured the way we need it.
#
#  1. Use whichever of docker.io / docker-ce is already installed;
#     otherwise install the one selected by $USE_DOCKER_CE (adding the
#     docker-ce apt repository first if it is not configured).
#  2. Set $ISOURDOCKER if the package ships our EMULAB.md marker.
#  3. Ensure /etc/docker/daemon.json puts docker0 on
#     $DOCKER_DEFAULT_BRIDGE_CIDR and sets "iptables"/"ip-masq" to match
#     $ISOURDOCKER; if anything changed, rewrite the file and restart
#     Docker (flushing Docker's iptables state when this is not our
#     Docker and the iptables settings changed).
#
# Returns 0; dies or calls fatal() on unrecoverable errors.
#
sub ensureDockerInstalled()
{
    if (!aptNotInstalled("docker.io")) {
        TBDebugTimeStamp("docker.io installed; using that");
        $USE_DOCKER_CE = 0;
    }
    elsif (!aptNotInstalled("docker-ce")) {
        TBDebugTimeStamp("docker-ce installed; using that");
        $USE_DOCKER_CE = 1;
    }

    if (!$USE_DOCKER_CE) {
        TBDebugTimeStamp("Ensuring docker.io installed...");
        if (aptNotInstalled("docker.io")) {
            TBDebugTimeStamp("Installing docker.io...");
            if (aptGetInstall("docker.io")) {
                die("Failed to install docker.io; aborting!\n");
            }

            mysystem2("service docker restart");

            # Remap, cause Docker creates some ifaces.
            refreshNetworkDeviceMaps();
        }

        #
        # Check which docker this is.
        #
        if (-e "/usr/share/docker.io/EMULAB.md") {
            $ISOURDOCKER = 1;
        }
    }
    else {
        TBDebugTimeStamp("Ensuring docker-ce installed...");
        #
        # Ensure the Docker CE repo is configured.  sources.list.d is a
        # directory, so grep has to recurse (-r) to see the files in it;
        # -s keeps it quiet if either path does not exist.
        #
        system("grep -rqs docker.com".
               " /etc/apt/sources.list /etc/apt/sources.list.d");
        if ($?) {
            TBDebugTimeStamp("Installing docker-ce Apt repos...");
            aptGetEnsureInstalled("apt-transport-https","ca-certificates",
                                  "curl","software-properties-common");
            mysystem("curl -fsSL https://download.docker.com/linux/ubuntu/gpg".
                     " | sudo apt-key add -");
            my $release = `lsb_release -cs`;
            chomp($release);
            my $arch = `uname -m`;
            chomp($arch);
            if ($arch eq 'x86_64' || $arch eq 'amd64') {
                $arch = "amd64";
            }
            elsif ($arch eq 'armhf') {
                ;
            }
            else {
                fatal("currently docker CE is only available on amd64/armhf!");
            }
            mysystem("add-apt-repository".
                     " \"deb [arch=$arch] https://download.docker.com/linux/ubuntu $release stable\"");
            aptGetUpdate();
        }

        if (aptNotInstalled("docker-ce")) {
            TBDebugTimeStamp("Installing docker-ce...");
            if (aptGetInstall("docker-ce")) {
                # One retry after kicking the service.
                warn("Failed to install docker-ce; retrying in 8 seconds!\n");
                sleep(8);
                system("systemctl restart docker.service");
                sleep(2);
                system("apt-get install -y docker-ce");
                if ($?) {
                    fatal("Failed to install docker-ce; aborting!\n");
                }
            }

            mysystem2("service docker restart");

            # Remap, cause Docker creates some ifaces.
            refreshNetworkDeviceMaps();
        }

        #
        # Check which docker this is.
        #
        if (-e "/usr/share/docker-ce/EMULAB.md") {
            $ISOURDOCKER = 1;
        }
    }

    #if (aptNotInstalled("systemd-container")
    #	&& aptGetInstall("systemd-container")) {
    #	die("Failed to install systemd-container; aborting!\n");
    #}

    #
    # Check or create the Docker config file; if we have to modify it,
    # restart Docker.
    #
    mkdir("/etc")
        if (! -d "/etc");
    mkdir("/etc/docker")
        if (! -d "/etc/docker");
    my $origjsontext = '';
    my $json = {};
    my $changed = 0;
    if (-e "/etc/docker/daemon.json") {
        open(my $rfd,'<',"/etc/docker/daemon.json")
            or die("could not open /etc/docker/daemon.json: $!");
        my @lines = <$rfd>;
        close($rfd);
        $origjsontext = join("",@lines);
        $json = decode_json($origjsontext);
    }

    # Check to ensure the docker iface has a non-172.16 subnet:
    my $diface = getIfaceInfo("docker0");
    if (!defined($diface)) {
        fatal("Could not find default docker network interface; aborting!");
    }
    if ($diface->{'ip'} ne $DOCKER_DEFAULT_BRIDGE_IP
        || !defined($json) || !exists($json->{'bip'})
        || $json->{'bip'} ne $DOCKER_DEFAULT_BRIDGE_CIDR) {
        TBDebugTimeStamp("Moving docker0 to $DOCKER_DEFAULT_BRIDGE_CIDR");

        # Blast our docker opts into the right place:
        $json->{'bip'} = $DOCKER_DEFAULT_BRIDGE_CIDR;
        $changed = 1;
    }

    # Check to ensure we're doing the right thing w.r.t. iptables:
    my $iptval = ($ISOURDOCKER) ? JSON::PP::true : JSON::PP::false;
    my $ichanged = 0;
    if (!defined($json) || !exists($json->{"iptables"})
        || $json->{'iptables'} != $iptval) {
        $json->{'iptables'} = $iptval;
        $changed = 1;
        $ichanged = 1;
    }
    if (!defined($json) || !exists($json->{"ip-masq"})
        || $json->{'ip-masq'} != $iptval) {
        $json->{'ip-masq'} = $iptval;
        $changed = 1;
        $ichanged = 1;
    }

    if ($changed) {
        TBDebugTimeStamp("Updating /etc/docker/daemon.json");

        my $newjsontext = encode_json($json);

        open(my $wfd,'>',"/etc/docker/daemon.json")
            or die("could not write /etc/docker/daemon.json: $!");
        print $wfd $newjsontext;
        close($wfd);

        mysystem2("service docker stop");

        if ($ichanged && !$ISOURDOCKER) {
            #
            # Make sure all the Docker stuff is undone, if this is not
            # our Docker.
            #
            mysystem("$IPTABLES -P FORWARD ACCEPT");
            mysystem("$IPTABLES -F INPUT");
            mysystem("$IPTABLES -F OUTPUT");
            mysystem("$IPTABLES -F FORWARD");
            mysystem("$IPTABLES -F DOCKER");
            mysystem2("$IPTABLES -X DOCKER");
            mysystem("$IPTABLES -F DOCKER-ISOLATION");
            mysystem2("$IPTABLES -X DOCKER-ISOLATION");
        }

        mysystem2("service docker start");

        # Remap, cause Docker creates some ifaces.
        refreshNetworkDeviceMaps();
    }

    return 0;
}

868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
#
# Prepare the host for the private "docker exec" sshd.
#
# Copies /etc/ssh/sshd_config to $DOCKER_EXEC_SSHD_CONFIGFILE_HEAD with
# every Port/ListenAddress line commented out (overwriting any previous
# head file), then creates $DOCKER_EXEC_SSHD_CONFIGDIR, where the
# per-vnode config fragments live.
#
# Returns 0 on success, -1 if the head file could not be written.  An
# unreadable sshd_config is reported but tolerated (the head file is then
# written empty).
#
sub setupDockerExecSSH() {
    my $sshdconfig = "/etc/ssh/sshd_config";
    my $retval = 0;
    my @lines = ();

    if (open(my $in,'<',$sshdconfig)) {
        @lines = <$in>;
        close($in);
    }
    else {
        print STDERR "setupDockerExecSSH: could not read $sshdconfig: $!\n";
    }

    # Our sshd gets its Port lines from the per-vnode fragments instead.
    my @newlines =
        map { ($_ =~ /^\s*(Port|ListenAddress)/) ? "#$_" : $_ } @lines;

    if (open(my $out,'>',$DOCKER_EXEC_SSHD_CONFIGFILE_HEAD)) {
        print $out @newlines;
        if (!close($out)) {
            print STDERR "setupDockerExecSSH: could not finish writing".
                " $DOCKER_EXEC_SSHD_CONFIGFILE_HEAD: $!\n";
            $retval = -1;
        }
    }
    else {
        print STDERR "setupDockerExecSSH: could not write".
            " $DOCKER_EXEC_SSHD_CONFIGFILE_HEAD: $!\n";
        $retval = -1;
    }

    #
    # Then make the dir where we put the per-vhost sshd config bits.
    #
    mysystem("mkdir -p $DOCKER_EXEC_SSHD_CONFIGDIR");

    return $retval;
}

#
# Helper for rebuildAndReloadDockerExecSSH(); must be called with the
# $SSHD_EXEC_LOCK held.  Regenerates $DOCKER_EXEC_SSHD_CONFIGFILE from the
# head file plus the per-vnode fragments, then restarts (or stops) the
# sshd-docker-exec service.  Returns 0 on success, -1 on failure.
#
sub _rebuildDockerExecSSHConfig
{
    #
    # Our private Docker Exec sshd listens on the private VM ports and
    # when a user authenticates, we use the ForceCommand directive in a
    # Match block to gateway them into the container that is supposed to
    # be reachable via ssh on that port.  However, only Match blocks may
    # follow other Match blocks -- in particular, a Port directive (to
    # listen on) must precede the Match blocks.  Thus, for each
    # container, we create one file in the configdir like
    # 0.$vnode_id.port with the Port line, and another like
    # 1.$vnode_id.match with the match and command directives.
    #
    # Thus, we need a sorted order of files in $DOCKER_EXEC_SSHD_CONFIGDIR.
    #
    my @pmlines = ();
    if (sortedreadallfilesindir($DOCKER_EXEC_SSHD_CONFIGDIR,\@pmlines)) {
        return -1;
    }

    # A missing head file is reported but tolerated (empty head).
    my @hlines = ();
    if (open(my $in,'<',$DOCKER_EXEC_SSHD_CONFIGFILE_HEAD)) {
        @hlines = <$in>;
        close($in);
    }
    else {
        print STDERR "Could not read $DOCKER_EXEC_SSHD_CONFIGFILE_HEAD: $!\n";
    }

    # Do not bounce sshd if we could not write the new config.
    my $out;
    if (!open($out,'>',$DOCKER_EXEC_SSHD_CONFIGFILE)) {
        print STDERR "Could not write $DOCKER_EXEC_SSHD_CONFIGFILE: $!\n";
        return -1;
    }
    print $out "".join('',@hlines)."\n".join('',@pmlines)."\n";
    if (!close($out)) {
        print STDERR "Could not finish writing".
            " $DOCKER_EXEC_SSHD_CONFIGFILE: $!\n";
        return -1;
    }

    #
    # But, if there were no port/match lines, *stop* the service instead of
    # restarting -- because it would probably try to start on port 22, which
    # of course will just fail it.
    #
    if (@pmlines == 0) {
        TBDebugTimeStamp("No more ports/commands in sshd_config-docker-exec;".
                         " stopping service!");
        mysystem2("systemctl stop sshd-docker-exec.service");
    }
    else {
        TBDebugTimeStamp("Restarting sshd-docker-exec.service for changes to".
                         " sshd_config-docker-exec");
        mysystem2("systemctl restart sshd-docker-exec.service");
    }
    return 0;
}

#
# Rebuild the docker-exec sshd config from its fragments and reload the
# service, serialized by the global $SSHD_EXEC_LOCK (waits up to 900s).
#
# Returns 0 on success (or when TBScriptLock says to ignore), -1 if the
# lock could not be taken or the rebuild failed.
#
sub rebuildAndReloadDockerExecSSH() {
    TBDebugTimeStamp("rebuildAndReloadDockerExecSSH: grabbing sshd lock".
                     " $SSHD_EXEC_LOCK")
        if ($lockdebug);
    my $locked = TBScriptLock($SSHD_EXEC_LOCK,TBSCRIPTLOCK_GLOBALWAIT(), 900);
    if ($locked != TBSCRIPTLOCK_OKAY()) {
        return 0
            if ($locked == TBSCRIPTLOCK_IGNORE());
        print STDERR "Could not get the $SSHD_EXEC_LOCK lock".
            " after a long time!\n";
        return -1;
    }

    my $retval = _rebuildDockerExecSSHConfig();

    TBScriptUnlock();
    return $retval;
}

#
# Register a container with the docker-exec sshd.
#
# Writes two fragments into $DOCKER_EXEC_SSHD_CONFIGDIR -- 0.<vnode>.port
# (the Port to listen on) and 1.<vnode>.match (a Match block forcing
# "docker exec" of $shell in the container) -- and then rebuilds and
# reloads the sshd config.
#
# Returns the result of rebuildAndReloadDockerExecSSH(), or -1 if either
# fragment could not be written.  On a write failure both fragments are
# removed, so that a later rebuild can never pick up a Port line that has
# no matching ForceCommand.
#
sub addContainerToDockerExecSSH($$$) {
    my ($vnode_id,$port,$shell)  = @_;

    my @fragments = (
        [ "$DOCKER_EXEC_SSHD_CONFIGDIR/0.${vnode_id}.port",
          "Port $port\n" ],
        [ "$DOCKER_EXEC_SSHD_CONFIGDIR/1.${vnode_id}.match",
          "Match LocalPort=$port\n".
          "ForceCommand /usr/bin/sudo /usr/bin/docker exec -it $vnode_id $shell\n" ],
    );

    foreach my $fragment (@fragments) {
        my ($path,$text) = @$fragment;
        my $ok = 0;

        if (open(my $fh,'>',$path)) {
            print $fh $text;
            $ok = close($fh);
        }
        if (!$ok) {
            print STDERR "addContainerToDockerExecSSH: could not write".
                " $path: $!\n";
            unlink(map { $_->[0] } @fragments);
            return -1;
        }
    }

    return rebuildAndReloadDockerExecSSH();
}

#
# Unregister a container from the docker-exec sshd: remove both of its
# config fragments, then rebuild and reload the sshd config.
#
# The fragment names must mirror the ones addContainerToDockerExecSSH()
# writes: 0.<vnode>.port and 1.<vnode>.match.  (The Match fragment is
# prefixed "1.", not "0."; removing the wrong name would leave a stale
# Match/ForceCommand block behind for the next user of that port.)
#
# Returns the result of rebuildAndReloadDockerExecSSH().
#
sub removeContainerFromDockerExecSSH($) {
    my ($vnode_id,) = @_;

    unlink("$DOCKER_EXEC_SSHD_CONFIGDIR/0.${vnode_id}.port");
    unlink("$DOCKER_EXEC_SSHD_CONFIGDIR/1.${vnode_id}.match");

    return rebuildAndReloadDockerExecSSH();
}

#
# Helpers for inspecting bridge and Docker network membership
#
#
# Return the names of the interfaces enslaved to bridge $brname, as
# reported by `$BRCTL show $brname`.
#
# Returns undef if the command exits non-zero; otherwise a (possibly
# empty) list of interface names.
#
sub getBridgeInterfaces($)
{
    my ($brname,) = @_;

    my @output = `$BRCTL show $brname`;
    if ($?) {
	return undef;
    }

    my @retval = ();
    foreach my $line (@output) {
	#
	# The bridge's own row has four columns: name, id, STP flag and
	# the first member.  The header row has more words than that and
	# a memberless bridge has fewer, so neither matches.
	#
	if ($line =~ /^\S+\s+\S+\s+\S+\s+(\S+)\s*$/) {
	    push(@retval,$1);
	}
	#
	# Any further members are printed one per row, indented under
	# the interfaces column.
	#
	elsif ($line =~ /^\s+(\S+)\s*$/) {
	    push(@retval,$1);
	}
    }
    return @retval;
}

#
# Return the IDs of the containers attached to Docker network $netname.
#
# Returns undef if the network inspect call reports an error, an empty
# list if the reply carries no usable "Containers" map, and otherwise
# the key of every "Containers" entry that has a "Name" field.
#
sub getDockerNetMemberIds($)
{
    my ($netname,) = @_;

    my ($code,$content,$resp) = getClient()->network_inspect($netname);
    if ($code) {
	return undef;
    }

    #
    # A missing, null, or non-hash "Containers" value means there is
    # nothing to enumerate; do not try to dereference it.
    #
    if (!exists($content->{"Containers"})
	|| ref($content->{"Containers"}) ne 'HASH') {
	return ();
    }
    my $members = $content->{"Containers"};

    my @retval = ();
    foreach my $cid (keys(%{$members})) {
	next
	    if (ref($members->{$cid}) ne 'HASH'
		|| !exists($members->{$cid}{"Name"}));
	push(@retval,$cid);
    }
    return @retval;
}

#
# Prepare LVM on this host: load the snapshot module, pick spare
# disks/partitions, create the $VGNAME volume group from them if it does
# not exist yet, compute $STRIPE_COUNT, and activate the volume group.
#
# Returns 0 on success; -1 if the dm-snapshot module cannot be loaded or
# if no usable block devices were found for a new volume group.
# NOTE(review): failures of the pvcreate/vgcreate/vgchange commands are
# left to mysystem(), whose error behavior is not visible here.
#
sub setupLVM()
{

    print "Enabling LVM...\n"
	if ($debug);

    # We assume our kernels support this.
    mysystem2("$MODPROBE dm-snapshot");
    if ($?) {
	print STDERR "ERROR: could not load snaphot module!\n";
	return -1;
    }

    #
    # Make sure pieces are at least 32 GiB.
    # (presumably findSpareDisks takes the size in MiB -- TODO confirm)
    #
    my $minpsize = 32 * 1024;
    my %devs = libvnode::findSpareDisks($minpsize, $LVM_AVOIDSSD);

    # If ignoring SSDs but came up with nothing, we have to use them!
    if ($LVM_AVOIDSSD && keys(%devs) == 0) {
	%devs = libvnode::findSpareDisks($minpsize, 0);
    }

    #
    # Turn on write caching. Hacky.
    # XXX note we do not use the returned "path" here as we need to
    # change the setting on all devices, not just the whole disk devices.
    #
    foreach my $dev (keys(%devs)) {
	# only mess with the disks we are going to use
	if (exists($devs{$dev}{"size"}) || $LVM_FULLDISKONLY == 0) {
	    mysystem2("hdparm -W1 /dev/$dev");
	}
    }

    #
    # See if our LVM volume group for VMs exists and create it if not.
    # grep may return several rows (any VG whose row merely contains
    # $VGNAME), so test every row (/m) and match the name literally.
    #
    my $vg = `vgs | grep $VGNAME`;
    if ($vg !~ /^\s+\Q${VGNAME}\E\s/m) {
	print "Creating volume group...\n"
	    if ($debug);

	#
	# Total up potential maximum size.
	# Also determine mix of SSDs and non-SSDs if required.
	#
	my $maxtotalSize = 0;
	my $sizeThreshold = 0;
	foreach my $dev (keys(%devs)) {
	    if (defined($devs{$dev}{"size"})) {
		$maxtotalSize += $devs{$dev}{"size"};
	    } else {
		foreach my $part (keys(%{$devs{$dev}})) {
		    $maxtotalSize += $devs{$dev}{$part}{"size"};
		}
	    }
	}
	if ($maxtotalSize > 0) {
	    $sizeThreshold = int($maxtotalSize * $LVM_LARGEPARTPCT / 100.0);
	}

	#
	# Find available devices of sufficient size, prepare them,
	# and incorporate them into a volume group.
	#
	my $totalSize = 0;
	my @blockdevs = ();
	foreach my $dev (sort keys(%devs)) {
	    #
	    # Whole disk is available, use it.
	    #
	    if (defined($devs{$dev}{"size"})) {
		push(@blockdevs, $devs{$dev}{"path"});
		$totalSize += $devs{$dev}{"size"};
		next;
	    }

	    #
	    # Disk contains partitions that are available.  Walk them in
	    # sorted order so that the choice among equally-sized
	    # partitions below is the same on every run.
	    #
	    my ($lpsize,$lppath);
	    foreach my $part (sort keys(%{$devs{$dev}})) {
		my $psize = $devs{$dev}{$part}{"size"};
		my $ppath = $devs{$dev}{$part}{"path"};

		#
		# XXX one way to avoid using the system disk, just ignore
		# all partition devices. However, in cases where the
		# remainder of the system disk represents the majority of
		# the available space (e.g., Utah d710s), this is a bad
		# idea.
		#
		if ($LVM_FULLDISKONLY) {
		    print STDERR
			"WARNING: not using partition $ppath for LVM\n";
		    next;
		}

		#
		# XXX Another heuristic to try to weed out the system
		# disk whenever feasible: if a partition device represents
		# less than some percentage of the max possible space,
		# avoid it. At Utah this one is tuned (10%) to avoid using
		# left over space on the system disk of d820s (which have
		# six other larger drives) or d430s (which have two large
		# disks) while using it on the pc3000s and d710s.
		#
		if ($LVM_ONLYLARGEPARTS && $psize < $sizeThreshold) {
		    print STDERR "WARNING: not using $ppath for LVM (too small)\n";
		    next;
		}

		#
		# XXX If we are only going to use one partition per disk,
		# record the largest one we find here. This check will
		# filter out the small "other OS" partition (3-6GB) in
		# favor of the larger "rest of the disk" partition.
		#
		if ($LVM_ONEPARTPERDISK) {
		    if (!defined($lppath) || $psize > $lpsize) {
			$lppath = $ppath;
			$lpsize = $psize;
		    }
		    next;
		}

		#
		# It ran the gauntlet of feeble filters, use it!
		#
		push(@blockdevs, $ppath);
		$totalSize += $psize;
	    }
	    if ($LVM_ONEPARTPERDISK && defined($lppath)) {
		push(@blockdevs, $lppath);
		$totalSize += $lpsize;
	    }
	}
	if (@blockdevs == 0) {
	    print STDERR "ERROR: findSpareDisks found no disks for LVM!\n";
	    return -1;
	}

	my $blockdevstr = join(' ', sort @blockdevs);
	mysystem("pvcreate $blockdevstr");
	mysystem("vgcreate $VGNAME $blockdevstr");

	my $size = lvmVGSize($VGNAME);
	if ($size < $DOCKER_MIN_VGSIZE) {
	    print STDERR "WARNING: physical disk space below the desired ".
		" minimum value ($size < $DOCKER_MIN_VGSIZE), expect trouble.\n";
	}
    }
    $STRIPE_COUNT = computeStripeSize($VGNAME);

    #
    # Make sure our volumes are active -- they seem to become inactive
    # across reboots
    #
    mysystem("vgchange -a y $VGNAME");

    return 0;
}

#
# Bridge stuff
#
#
# Create bridge $bridge, using Open vSwitch or brctl depending on
# $USE_OPENVSWITCH.  Returns whatever system() returns, and leaves $?
# set for the caller to inspect.
#
sub addbr($)
{
    my ($bridge) = @_;
    my $tool;

    if ($USE_OPENVSWITCH) {
	$tool = "$OVSCTL add-br";
    }
    else {
	$tool = "$BRCTL addbr";
    }

    return system("$tool $bridge");
}
#
# Delete bridge $bridge.  With Open vSwitch a single del-br is enough;
# otherwise the device is first brought down and then removed with brctl.
# Returns the result of the last mysystem2() call.
#
sub delbr($)
{
    my ($bridge) = @_;

    return mysystem2("$OVSCTL del-br $bridge")
	if ($USE_OPENVSWITCH);

    mysystem2("$IFCONFIG $bridge down");
    return mysystem2("$BRCTL delbr $bridge");
}
#
# Attach interface $iface to bridge $bridge, using Open vSwitch or brctl
# depending on $USE_OPENVSWITCH.  Returns whatever system() returns.
#
sub addbrif($$)
{
    my ($bridge,$iface) = @_;
    my $tool;

    if ($USE_OPENVSWITCH) {
	$tool = "$OVSCTL add-port";
    }
    else {
	$tool = "$BRCTL addif";
    }

    return system("$tool $bridge $iface");
}
#
# Detach interface $iface from bridge $bridge, using Open vSwitch or
# brctl depending on $USE_OPENVSWITCH.  Returns whatever system() returns.
#
sub delbrif($$)
{
    my ($bridge,$iface) = @_;
    my $tool;

    if ($USE_OPENVSWITCH) {
	$tool = "$OVSCTL del-port";
    }
    else {
	$tool = "$BRCTL delif";
    }

    return system("$tool $bridge $iface");
}

##
## libvnode API implementation
##

#
# Library initialization.  When LVM is in use, detect whether the
# installed LVM is version 2.02.99 or later (sets $NEW_LVM) and, if
# $READYFILE already exists, compute $STRIPE_COUNT for $VGNAME.  Also
# sets $ISOURDOCKER when the EMULAB.md marker file is present.
# Always returns 0.
#
sub init($)
{
    my ($pnode_id,) = @_;

    if ($USE_LVM) {
	# Some LVM commands differ between versions, so find out which
	# one we have.
	my $lvmver = `lvm version | grep 'LVM version'`;
	if (defined($lvmver)
	    && $lvmver =~ /LVM version:\s+(\d+)\.(\d+)\.(\d+)/) {
	    my ($major,$minor,$patch) = (int($1),int($2),int($3));
	    my $isnew = 0;

	    if ($major > 2) {
		$isnew = 1;
	    }
	    elsif ($major == 2) {
		$isnew = 1
		    if ($minor > 2 || ($minor == 2 && $patch >= 99));
	    }
	    $NEW_LVM = 1
		if ($isnew);
	}

	# Compute the stripe size for new lvms.
	$STRIPE_COUNT = computeStripeSize($VGNAME)
	    if (-e "$READYFILE");
    }

    #
    # Check which docker this is.
    #
    $ISOURDOCKER = 1
	if (-e "/usr/share/docker.io/EMULAB.md");

    return 0;
}

#
# Called on each vnode, but should only be executed once per boot.
# We use a file in /var/run (cleared on reboots) to ensure this.
#
sub rootPreConfig($)
{
    my $bossip = shift;
1275
    my ($code,$content,$resp);
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304

    #
    # Haven't been called yet, grab the lock and double check that someone
    # didn't do it while we were waiting.
    #
    if (! -e "$READYFILE") {
	TBDebugTimeStamp("rootPreConfig: grabbing global lock $GLOBAL_CONF_LOCK")
	    if ($lockdebug);
	my $locked = TBScriptLock($GLOBAL_CONF_LOCK,
				  TBSCRIPTLOCK_GLOBALWAIT(), 900);
	if ($locked != TBSCRIPTLOCK_OKAY()) {
	    return 0
		if ($locked == TBSCRIPTLOCK_IGNORE());
	    print STDERR "Could not get the $GLOBAL_CONF_LOCK lock".
		" after a long time!\n";
	    return -1;
	}
    }
    TBDebugTimeStamp("  got global lock")
	if ($lockdebug);
    if (-e "$READYFILE") {
	TBDebugTimeStamp("  releasing global lock")
	    if ($lockdebug);
        TBScriptUnlock();
        return 0;
    }
    
    TBDebugTimeStamp("Configuring root vhost context");

1305
1306
1307
1308
1309
    #
    # Ensure we have the latest bridge/iface state!
    #
    refreshNetworkDeviceMaps();

1310
1311
1312
1313
1314
    #
    # Make sure we actually have Docker.
    #
    ensureDockerInstalled();

1315
1316
1317
    #
    # Make sure we have all our Perl deps.
    #
1318
    ensureDeps();
1319

1320
1321
1322
1323
1324
1325
    #
    # Make sure we have a bunch of other common tools.
    #
    aptGetEnsureInstalled("lvm2","thin-provisioning-tools",
			  "bridge-utils","iproute2","vlan");

1326
1327
1328
1329
1330
    #
    # Set up the docker exec sshd service.
    #
    setupDockerExecSSH();

1331
1332
1333
    #
    # Setup our control net device if not already up.
    #
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
    if ($USE_MACVLAN_CNET || $USE_MACVLAN) {
	#
	# If we build dummy shortbridge nets atop either a physical
	# device, or atop a dummy device, load these!
	#
	mysystem("$MODPROBE macvlan");
	mysystem("$MODPROBE dummy");
    }
    if (!$USE_MACVLAN_CNET || !$USE_MACVLAN) {
	mysystem("$MODPROBE bridge");
    }

1346
1347
    my ($cnet_iface,$cnet_ip,$cnet_mask,
	$cnet_maskbits,$cnet_net,$cnet_mac,$cnet_gw) = findControlNet();
1348
1349
1350
1351
    my ($alias_ip,$alias_mask,$vmac) = hostControlNet();
    my ($VCNET_NET,undef,$VCNET_GW,$VCNET_SLASHMASK) = findVirtControlNet();
    my $nettype = ($USE_MACVLAN_CNET) ? "macvlan" : "bridge";

1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
    #
    # NB: in the case of !$USE_MACVLAN_CNET (i.e. using bridges for
    # control net) and !$ISREMOTENODE, we place the real routable
    # control net addr on the bridge and put the real control net dev in
    # the bridge.  So we want to track the orig_cnet_iface.  Once we
    # shuffle that dev into the bridge, we reset the
    # /var/emulab/boot/controlif file to point to the bridge -- and thus
    # if this gets re-run, it won't get the real control net dev as in
    # arg to this function.  So the code that handles this case is
    # careful to use orig_cnet_iface instead of cnet_iface!  None of the
    # other cases care, since they don't re-write
    # /var/emulab/boot/controlif.
    #
    my $orig_cnet_iface;
    #
    # Assume if this is not present, this is the first time running.  If
    # so, the real control net device must have the real control net IP;
    # not $DOCKERCNET!  So if you wipe this file out to retry, make sure
    # to reset the real controlif with proper info from dhclient.
    #
    if (! -e "/var/run/emulab-controlif-orig") {
	$orig_cnet_iface = $cnet_iface;
	open(FD,">/var/run/emulab-controlif-orig")
	    or fatal("could not open /var/run/emulab-controlif-orig: $!");
	print FD "$cnet_iface";
	close(FD);
    }
    else {
	open(FD,"/var/run/emulab-controlif-orig")
	    or fatal("could not open /var/run/emulab-controlif-orig: $!");
	$orig_cnet_iface = <FD>;
	chomp($orig_cnet_iface);
	close(FD);
    }

    my $dcnexists = 0;
1388
1389
1390
    TBDebugTimeStamp("checking for docker network $DOCKERCNET...");
    ($code,$content,$resp) = getClient()->network_inspect($DOCKERCNET);
    if ($code == 0) {
1391
1392
1393
	$dcnexists = 1;
    }

1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
    if ($USE_MACVLAN_CNET && ! -e "/sys/class/net/$DOCKERCNET") {
	my $alias_net =
	    inet_ntoa(inet_aton($alias_ip) & inet_aton($alias_mask));

	if (!$ISREMOTENODE) {
            #
            # We first add a macvlan "alias" to the control net device
            # so that we (the physical host) are in the same subnet as
            # the vnodes.  With the macvlan interfaces, you cannot
            # directly alias the parent device and talk to/from the
            # other macvlan children on the parent.
            #
	    print "Creating $DOCKERCNET macvlan on $cnet_iface".
		" ($alias_ip,$alias_mask)...\n";
	    mysystem("ip link add link $cnet_iface name $DOCKERCNET".
		     " address $vmac type macvlan mode bridge");
	    mysystem("ip addr replace $alias_ip/$alias_mask dev $DOCKERCNET");
	    mysystem("ip link set up $DOCKERCNET");

	    #my $isroutable = isRoutable($alias_ip);
	    ## Add a route to reach the vnodes. Do it for the entire
	    ## network, and no need to remove it.
	    #if (!$ISREMOTENODE && !$isroutable
	    #	&& system("$NETSTAT -r | grep -q $alias_net")) {
	    #	mysystem2("$ROUTE add -net $alias_net netmask $alias_mask dev $cnet_iface");
	    #	if ($?) {
	    #	    warn("could not add non-routable local virt control net route!");
	    #	    #return -1;
	    #	}
	    #}
	}
	else {
	    #
	    # XXX will this actually work? macvlan children can't talk to host?
	    # XXX probably need to add a dummy device to back the docker
	    # macvlan network!
	    # $alias_ip = $cnet_ip;
            #
            # Ok, since that won't work, in this case, we add a dummy
            # device to host our control net macvlan devices atop; we
            # don't want anything bridged to the outside world in the
            # remote-node case.  Then we add our control net alias like
            # above.
            #
	    $cnet_iface = "dummycnet";
	    mysystem2("ip link add dummycnet type dummy");
	    print "Creating $DOCKERCNET macvlan on $cnet_iface".
		" ($alias_ip,$alias_mask)...\n";
	    mysystem("ip link add link $cnet_iface".
		     " name $DOCKERCNET address $vmac type macvlan mode bridge");
	    mysystem("ip addr replace $alias_ip/$alias_mask dev $DOCKERCNET");
	    mysystem("ip link set up $DOCKERCNET");
	}
    }
1448
    elsif (!$USE_MACVLAN_CNET
1449
1450
1451
1452
	   && (!$dcnexists
	       || ! -e "/sys/class/net/$DOCKERCNET"
	       || !defined(findBridge($orig_cnet_iface))
	       || findBridge($orig_cnet_iface) ne $DOCKERCNET)) {
1453
1454
1455
	my $alias_net =
	    inet_ntoa(inet_aton($alias_ip) & inet_aton($alias_mask));

1456
1457
1458
	#
	# If the bridge doesn't exist, add it first.
	#
1459
1460
1461
1462
1463
1464
	if (! -e "/sys/class/net/$DOCKERCNET") {
	    addbr($DOCKERCNET);
	    if ($?) {
		fatal("failed to create $DOCKERCNET bridge!");
		return -1;
	    }
1465
1466
	}

1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
	#
	# The $ISREMOTENODE case is easy, because the real control net
	# device doesn't go into the bridge, and we and Docker expect
	# the bridge to have the fake virtual control net address.  So
	# harmony ensues.
	#
	# The !$ISREMOTENODE case is very, very tricky.  The first time
	# we boot, the docker network doesn't exist; the bridge doesn't
	# exist; all the control net state is as dhclient left it.  The
	# correct order there is create bridge; flush control net ip
	# addr; move control net dev into bridge; add control net as
	# docker network; flush bridge ip addr Docker set; set our
	# proper public control net IP as the bridge ip addr; and add
	# the unroutable virtual control net addr (the docker network
	# gateway) as an alias.  NB: Docker will not accept or add the
	# virtual control net IP as an alias; it will error, or force
	# the IP to the virtual addr.  That is why we must fix it up
	# after creating the Docker network.
	#
	# On subsequent boots, the control net already exists as a
	# Docker network, and Docker will create the control net device
	# before we run.  However, Docker doesn't put the real control
	# net device into that bridge (it doesn't know that kind of
	# thing); but it does give the bridge the virtual control IP as
	# its primary IP.  So, we have to flush the bridge IP, and *not*
	# remake the Docker cnet.
	#
	# What a pain, all because Docker cannot just leave an existing
	# bridge alone (i.e.,
	# https://github.com/docker/docker/issues/20758).
	#
1498
	if (!$ISREMOTENODE) {
1499
1500
1501
1502
	    my $ipandmaskbits = "$cnet_ip/$cnet_maskbits";

	    # First grab the default gateway.
	    my ($defroute,$defrouteiface);
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
	    open(ROUTEOUTPUT,"ip route list |")
		or fatal("unable to get route list via 'ip'!");
	    while (!eof(ROUTEOUTPUT)) {
		my $line = <ROUTEOUTPUT>;
		chomp($line);
		if ($line =~ /^default via (\d+\.\d+\.\d+\.\d+)/) {
		    $defroute = $1;
		}
		if ($line =~ /^default via [\w\.\/]+\s+dev\s+([\w\.]+)/) {
		    $defrouteiface = $1;
		}
	    }
	    if (!$defroute) {
		fatal("could not find default route!");
	    }

1519
	    #
1520
1521
1522
1523
	    # Undo the existing control net config we obtained on boot,
	    # and move that interface into our $DOCKERCNET bridge, IFF
	    # it's not in the bridge already.  If it's already in the
	    # bridge, no need to do any of this.
1524
	    #
1525
1526
1527
1528
1529
1530
1531
	    if (!defined(findBridge($orig_cnet_iface))
		|| findBridge($orig_cnet_iface) ne $DOCKERCNET) {
		mysystem2("ip link set down $orig_cnet_iface");
		mysystem2("ip addr del $ipandmaskbits dev $orig_cnet_iface");
		mysystem2("ip addr flush dev $orig_cnet_iface");
		addbrif($DOCKERCNET,$orig_cnet_iface);
	    }
1532

1533
	    #
1534
1535
1536
1537
1538
1539
1540
1541
	    # If the Docker network does not exist in Docker itself, but
	    # it *does* exist as a device, flush its IP addr since
	    # Docker insists on setting that itself.
	    #
	    if (!$dcnexists && -e "/sys/class/net/$DOCKERCNET") {
		mysystem2("ip addr flush dev $DOCKERCNET");
	    }

1542
	    #
1543
1544
1545
	    # If the docker network isn't yet built, do that now.
	    #
	    if (!$dcnexists) {
1546
1547
1548
1549
1550
1551
1552
1553
		TBDebugTimeStamp("creating bridged docker network $DOCKERCNET");
		($code,$content) = getClient()->network_create_bridge(
		    $DOCKERCNET,"${VCNET_NET}/${VCNET_SLASHMASK}",$alias_ip,
		    $DOCKERCNET);
		if ($code) {
		    fatal("failed to create bridged Docker $DOCKERCNET control net:".
			  " $content");
		}
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
		$dcnexists = 1;
	    }

	    #
	    # Always flush the bridge's Docker-imposed addr immediately,
	    # whether it existed or we created it.
	    #
	    mysystem("ip addr flush dev $DOCKERCNET");

	    #
	    # Set the $DOCKERCNET configuration to one that both we and
	    # Docker are happy with.
	    #
	    mysystem2("ip addr add $ipandmaskbits dev $DOCKERCNET");
1568
	    if ($?) {
1569
		mysystem("ip addr replace $ipandmaskbits dev $DOCKERCNET");
1570
	    }
1571
	    mysystem("ip link set up $DOCKERCNET");
1572
1573
1574
1575
	    mysystem("ip link set up $orig_cnet_iface");
	    if ($defrouteiface eq $cnet_iface
		|| $defrouteiface eq $orig_cnet_iface) {
		mysystem("ip route replace default via $defroute");
1576
	    }
1577
1578
	    mysystem("ip addr add $alias_ip/$alias_mask dev $DOCKERCNET".
		     " label $DOCKERCNET:1");
1579

1580
1581
1582
	    #
	    # Save the bridge as the real control net iface.
	    #
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
	    open(CONTROLIF,">$BOOTDIR/controlif");
	    print CONTROLIF "$DOCKERCNET\n";
	    close(CONTROLIF);
	}
	else {
	    #
	    # If this node is remote, then it gets a bridge without the
	    # control net.
	    #
	    mysystem("ip addr replace $alias_ip/$alias_mask dev $DOCKERCNET");
	    mysystem("ip link set up $DOCKERCNET");
	}
    }

    #
1598
    # Now if the Docker control net still doesn't exist, create that.
1599
    #
1600
    if (!$dcnexists) {
1601
1602
1603
1604
1605
	if ($USE_MACVLAN_CNET) {
	    #
	    # Next, we create a docker macvlan network to front for the
	    # virt control net.
	    #
1606
1607
1608
1609
1610
1611
1612
1613
	    TBDebugTimeStamp("creating macvlan docker network $DOCKERCNET");
	    ($code,$content) = getClient()->network_create_macvlan(
		$DOCKERCNET,"${VCNET_NET}/${VCNET_SLASHMASK}",$alias_ip,
		$cnet_iface);
	    if ($code) {
		fatal("failed to create bridged Docker $DOCKERCNET control net:".
		      " $content");
	    }
1614
1615
	}
	else {
1616
1617
1618
1619
1620
1621
1622
1623
	    TBDebugTimeStamp("creating bridged docker network $DOCKERCNET");
	    ($code,$content) = getClient()->network_create_bridge(
		$DOCKERCNET,"${VCNET_NET}/${VCNET_SLASHMASK}",$alias_ip,
		$DOCKERCNET);
	    if ($code) {
		fatal("failed to create bridged Docker $DOCKERCNET control net:".
		      " $content");
	    }
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
	}
    }

    #
    # Mesh our iptables setup with docker's.  This is nontrivial because
    # Docker does one nasty thing: it continually forces its -j
    # DOCKER-ISOLATION rule into the top of the FORWARD chain on
    # significant operations (like creating a container).  This has been
    # much discussed but not fixed, so we have two strategies.  First,
    # we have a patched version of Docker that does not do this crazy
    # crap; second, if that is not available, we disable its use of
    # iptables and do all the stuff Docker would normally do that we
    # actually need (a subset of what Docker normally does).
    #
    # We use the same basic strategy in either case: what we want to do
    # is flow all packets on the control net bridge through our
    # EMULAB-ISOLATION chain.  But we do return to the DOCKER-ISOLATION
    # chain so that Docker rules can affect other Docker networks.
    #
    mysystem2("$IPTABLES -N EMULAB-ISOLATION");
    mysystem("$IPTABLES -F EMULAB-ISOLATION");
    mysystem("$IPTABLES -A EMULAB-ISOLATION -j RETURN");
    mysystem("$IPTABLES -I FORWARD -j EMULAB-ISOLATION");

    #
    # Also, Docker handles MASQUERADING for us by default.  We don't
    # want to turn off Docker's iptables (it's on or off) functionality,
    # because people should be able to bring up Docker VMs manually if
    # they want, using the default Docker host network (or one of the
    # experiment networks, if they safely manage IP addr assignment).
    # However, as discussed above, we have to turn it off if it's not
    # our modified version.  So we have to add the MASQ rule if iptables
    # is off in Docker.
    #
    # If this is a local testbed node, we want to allow unroutable
    # packets on the control net.  So we have to add local control net
    # exceptions ahead of Docker's default MASQ-all rules.
    #
    if (!$ISREMOTENODE) {
	my (undef,undef,$ctlmask,undef,$ctlnet,undef,undef) = findControlNet();
	mysystem("$IPTABLES -t nat -I POSTROUTING".
		 " -s ${VCNET_NET}/${VCNET_SLASHMASK}".
		 " -d ${VCNET_NET}/${VCNET_SLASHMASK} -j ACCEPT");
	mysystem("$IPTABLES -t nat -I POSTROUTING".
		 " -s ${VCNET_NET}/${VCNET_SLASHMASK}".
		 " -d ${ctlnet}/${ctlmask} -j ACCEPT");
	if (!$ISOURDOCKER) {
	    mysystem("$IPTABLES -t nat -A POSTROUTING".
		     " -s ${VCNET_NET}/${VCNET_SLASHMASK}".
		     " -j MASQUERADE");
	    # Also do the default docker0 bridge CIDR, since Docker
	    # won't be doing it and we want temp user containers to
	    # work.
	    mysystem("$IPTABLES -t nat -A POSTROUTING".
		     " -s $DOCKER_DEFAULT_BRIDGE_CIDR".
		     " -j MASQUERADE");
	}
    }

    #
    # XXX: antispoofing!  Can't do it with macvlan control net though.
    #
    # We also choose not to use the style here; instead, we are
    # draconian and drop everything that comes from the vnode that does
    # not have its IP.  We do that later.
    #
    # We want to change the below code not to DROP on the FORWARD chain
    # by default, but rather to drop anything that comes from a vnode's
    # cnet iface that is not sourced from its assigned control net IP.
    #
    if (0) {
	mysystem("$IPTABLES -P FORWARD DROP");
	mysystem("$IPTABLES -F FORWARD");
	# This says to forward traffic across the bridge.
	mysystem("$IPTABLES -A FORWARD ".
		 "-m physdev --physdev-in $cnet_iface -j ACCEPT");
    }

    # For tunnels
    if ($USE_OPENVSWITCH) {
	mysystem("$MODPROBE openvswitch");
    }
    else {
	mysystem("$MODPROBE ip_gre");
    }

    # For VLANs
    mysystem("$MODPROBE 8021q");

    # We need this stuff for traffic shaping -- only root context can
    # modprobe.
    mysystem("$MODPROBE sch_netem");
    mysystem("$MODPROBE sch_htb");

    # start up open vswitch stuff.
    if ($USE_OPENVSWITCH) {
        # For tunnels
	mysystem("$OVSSTART --delete-bridges start");
    }

    # For bandwidth constraints.
    mysystem("$MODPROBE ifb");

    # Create a DB to manage them. 
    my %MDB;
    if (!dbmopen(%MDB, $IFBDB, 0660)) {
	print STDERR "*** Could not create $IFBDB\n";
	TBScriptUnlock();
	return -1;
    }
    dbmclose(%MDB);
    
    #
    # Ensure that LVM is loaded in the kernel and ready.
    #
    if ($USE_LVM) {
1740
1741
1742
1743
1744
1745
1746
1747
	# There are several reasons we might need a Docker restart in
	# this LVM setup bit; they will be noted along the way, and we
	# will restart if necessary.
	my $needdockerrestart = 0;

	#
	# Sets up our PVs and VG ($VGNAME).
	#
1748
1749
	setupLVM();

1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
	#
	# Figure out how big various volumes should be.
	#
	# If we are using the aufs storage backend for Docker, we want
	# most of our space in $EXTRAFS (since /var/lib/docker gets
	# symlinked there, our heaviest space usage may be there); in
	# that case, we save a ~10%VG buffer of free space.  Wild guess.
	#
	# If we are instead using the devicemapper direct-lvm backend,
	# we need both $EXTRAFS and $INFOFS, but we also need a beefy
	# thinpool for Docker.  In this case, we use max(5GB,3%VG) LV
	# for $INFOFS; use min(32GB,15%remainingVG) for the $EXTRAFS;
	# then we provision the thin pool with 90% of the remaining
	# space (i.e., 0.90*(totalVG - sizeof($EXTRAFS) -
	# sizeof($INFOFS))).  This results in at least some spare space
	# in case some heavy usage happens, for autoextension of the
	# thinpool.  And we could even consider garbage-collecting
	# context build dirs in $EXTRAFS and downsizing that so that the
	# thin pool can grow more, for instance on a shared host, if
	# necessary.
	#
	my ($extrasize,$infosize,$thinpoolsize) = (0,0,0);
	my $vgsize = lvmVGSize($VGNAME);
	my $remaining = $vgsize;

	if (!$USE_DOCKER_LVM) {
	    # We will only create $EXTRAFS and $INFOFS.
	    if (0.03 * $remaining < 5) {
		$infosize = 0.03 * $remaining;
	    }
	    else {
		$infosize = 5;
	    }
	    $remaining -= $infosize;
	    $extrasize = 0.90 * $remaining;
	    $remaining -= $extrasize;
1786
	}
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
	else {
	    # We will create $EXTRAFS and $INFOFS, as well as the Docker
	    # thin pool.
	    if (0.03 * $remaining < 5) {
		$infosize = 0.03 * $remaining;
	    }
	    else {
		$infosize = 5;
	    }
	    $remaining -= $infosize;
	    if (0.15 * $remaining < 32) {
		$extrasize = 0.15 * $remaining;
	    }
	    else {
		$extrasize = 32;
	    }
	    $remaining -= $extrasize;
	    $thinpoolsize = 0.90 * $remaining;
	    $remaining -= $thinpoolsize;
	}

	my $tmplvname;
	if ($INFOFS =~ /\/(.*)$/) {
	    $tmplvname = $1;
	}
	if (!libvnode::lvExists($VGNAME,$tmplvname)) {
	    print "Creating container info FS ...\n";
	    if (createExtraFS($INFOFS, $VGNAME, "${infosize}G")) {
		TBScriptUnlock();
		return -1;
	    }
	}
	if ($EXTRAFS =~ /\/(.*)$/) {
	    $tmplvname = $1;
	}
	if (!libvnode::lvExists($VGNAME,$tmplvname)) {
	    print "Creating scratch FS ...\n";
	    my $already = 0;
	    if (-d $EXTRAFS) {
		$already = 1;
		mysystem("mv $EXTRAFS ${EXTRAFS}.bak");
	    }
	    if (createExtraFS($EXTRAFS, $VGNAME, "${extrasize}G")) {
		TBScriptUnlock();
		return -1;
	    }
	    if ($already) {
		my @files = glob("${EXTRAFS}.bak/*");
		foreach my $file (@files) {
		    my $base = basename($file);
		    mysystem("/bin/mv $file $EXTRAFS")
			if (! -e "$EXTRAFS/$base");
		}
		mysystem("/bin/rm -rf ${EXTRAFS}.bak");
	    }
	}
	if ($USE_DOCKER_LVM && !libvnode::lvExists($VGNAME,"thinpool")) {
	    print "Creating Docker Thin Pool...\n";
	    #
	    # Docker wants a thinpool and a metadata pool.  Size of the
	    # metadata pool cannot exceed 16GB.  So we create that as
	    # min(16,0.01*$thinpoolsize).
	    #
	    my ($tps,$tpms) = (0,0);
	    if (0.01 * $thinpoolsize < 16) {
		$tpms = 0.01 * $thinpoolsize;
	    }
	    else {
		$tpms = 16;
	    }
	    $tps = $thinpoolsize - $tpms;
	    # XXX: --wipesignatures y ?
1859
1860
	    mysystem("lvcreate -n thinpool $VGNAME -L ${tps}G");
	    mysystem("lvcreate -n thinpoolmeta $VGNAME -L ${tpms}G");
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
	    mysystem("lvconvert -y --zero n -c 512K".
		     " --thinpool $VGNAME/thinpool".
		     " --poolmetadata $VGNAME/thinpoolmeta");
	    mkdir("/etc/lvm/profile");
	    open(FD,">/etc/lvm/profile/$VGNAME-thinpool.profile")
		or fatal("could not open /etc/lvm/profile/$VGNAME-thinpool.profile: $@");
	    print FD "activation {\n".
		"  thin_pool_autoextend_threshold=90\n".
		"  thin_pool_autoextend_percent=10\n".
		"}\n";
	    close(FD);
	    mysystem("lvchange --metadataprofile $VGNAME-thinpool".
		     " $VGNAME/thinpool");
	    mysystem("lvs -o+seg_monitor");

	    #
	    # Setup the Docker devicemapper direct-lvm storage backend.
	    # { "storage-driver": "devicemapper",
	    #   "storage-opts": [
	    #     "dm.thinpooldev=/dev/mapper/docker-thinpool",
	    #     "dm.use_deferred_removal=true",
	    #     "dm.use_deferred_deletion=true" ] }
	    #
	    my $origjsontext = '';
	    my $json = {};
	    if (-e "/etc/docker/daemon.json") {
		open(FD,"/etc/docker/daemon.json")
		    or die("could not open /etc/docker/daemon.json: $!");
		my @lines = <FD>;
		close(FD);
		$origjsontext = join("",@lines);
		$json = decode_json($origjsontext);
	    }

	    # If it exists, just delete it; we only want valid stuff in here.
	    if (defined($json->{"storage-driver"})) {
		delete($json->{"storage-driver"});
	    }
	    if (defined($json->{"storage-opts"})) {
		delete($json->{"storage-opts"});
	    }

	    # Write our config.
	    # Don't restart docker; that happens at the end of $USE_LVM.
	    $needdockerrestart = 1;
	    $json->{"storage-driver"} = "devicemapper";
	    $json->{"storage-opts"} = [
		"dm.thinpooldev=/dev/mapper/${VGNAME}-thinpool",
		"dm.use_deferred_removal=true",
		"dm.use_deferred_deletion=true"
		];

	    TBDebugTimeStamp("Updating /etc/docker/daemon.json");

	    my $newjsontext = encode_json($json);

	    open(FD,">/etc/docker/daemon.json")
		or die("could not write /etc/docker/daemon.json: $!");
	    print FD $newjsontext;
	    close(FD);
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
	}
	if (! -l $VMS) {
	    #
	    # We need this stuff to be sticky across reloads, so move it
	    # into an lvm. If we lose the lvm, well then we are screwed.
	    #
	    my @files = glob("$VMS/*");
	    foreach my $file (@files) {
		my $base = basename($file);
		mysystem("/bin/mv $file $INFOFS")
		    if (! -e "$INFOFS/$base");
	    }
	    mysystem("/bin/rm -rf $VMS");
	    mysystem("/bin/ln -s $INFOFS $VMS");
	}
	if (! -l '/var/lib/docker') {
1937
1938
	    # Make sure Docker is stopped before we do this, if it
	    # wasn't stopped above already!
1939
	    mysystem2("systemctl stop docker.service");
1940
	    $needdockerrestart = 1;
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
	    if ($?) {
		warn("could not stop docker service before moving".
		     " /var/lib/docker to LVM; aborting!");
		TBScriptUnlock();
		return -1;
	    }
	    mysystem2("mount -t aufs | grep /var/lib/docker/");
	    if ($? == 0) {
		warn("filesystems still mounted in /var/lib/docker; aborting!");
		TBScriptUnlock();
		return -1;
	    }
	    mkdir("$EXTRAFS/var.lib.docker");
	    #
	    # We need this stuff to be sticky across reloads, so move it
	    # into an lvm. If we lose the lvm, well then we are screwed.
	    #
	    my @files = glob("/var/lib/docker/*");
	    foreach my $file (@files) {
		my $base = basename($file);
1961
		mysystem("/bin/mv $file $EXTRAFS/var.lib.docker")
1962
1963
1964
1965
		    if (! -e "$EXTRAFS/var.lib.docker/$base");
	    }
	    mysystem("/bin/rm -rf /var/lib/docker");
	    mysystem("/bin/ln -s $EXTRAFS/var.lib.docker /var/lib/docker");
1966
	}
1967

1968
1969
	if ($needdockerrestart) {
	    mysystem2("systemctl restart docker.service");
1970
	    if ($?) {
1971
		warn("could not restart docker service after LVM setup; aborting!");
1972
1973
1974
1975
		TBScriptUnlock();
		return -1;
	    }
	}
1976
1977
1978
1979
1980
1981

	#
	# Check the $DOCKERCNET again after LVM setup... if the move of
	# /var/lib/docker fails, all Docker state (including
	# $DOCKERCNET) will appear to have vanished!
	#
1982
1983
1984
1985
1986
1987
	TBDebugTimeStamp("checking docker network $DOCKERCNET after LVM move");
	($code,$content,$resp) = getClient()->network_inspect($DOCKERCNET);
	if ($code) {
	    fatal("$DOCKERCNET still does not appear as a Docker network;".
		  " something must have gone wrong in LVM setup!\n");
	}
1988
1989
1990
    }
    else {
	mkdir($VMS);
1991
1992
	mkdir($INFOFS);
	mkdir($EXTRAFS);
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
    }

    #
    # Make sure IP forwarding is enabled on the host
    #
    mysystem2("$SYSCTL -w net.ipv4.conf.all.forwarding=1");

    #
    # Increase socket buffer size for frisbee download of images.
    #
    mysystem2("$SYSCTL -w net.core.rmem_max=1048576");
    mysystem2("$SYSCTL -w net.core.wmem_max=1048576");

    #
    # Need these to avoid overflowing the NAT tables.
    #
    mysystem2("$MODPROBE nf_conntrack");
    if ($?) {
	print STDERR "ERROR: could not load nf_conntrack module!\n";
	TBScriptUnlock();
	return -1;
    }
    mysystem2("$SYSCTL -w ".
	     "  net.netfilter.nf_conntrack_generic_timeout=120");
    mysystem2("$SYSCTL -w ".
	     "  net.netfilter.nf_conntrack_tcp_timeout_established=54000");
    mysystem2("$SYSCTL -w ".
	     "  net.netfilter.nf_conntrack_max=131071");
    mysystem2("echo 16384 > /sys/module/nf_conntrack/parameters/hashsize");
 
    # These might fail on new kernels.  
    mysystem2("$SYSCTL -w ".
	      " net.ipv4.netfilter.ip_conntrack_generic_timeout=120");
    mysystem2("$SYSCTL -w ".
	      " net.ipv4.netfilter.ip_conntrack_tcp_timeout_established=54000");

    #
    # Clone the emulab and pubsub src repos.  Make other dirs.
    #
    mkdir($CONTEXTDIR);
    if (! -d $EMULABSRC) {
	mysystem("git clone https://gitlab.flux.utah.edu/emulab/emulab-devel".
		 " $EMULABSRC");
    }
    if (! -d $PUBSUBSRC) {
	mysystem("git clone https://gitlab.flux.utah.edu/emulab/pubsub".
		 " $PUBSUBSRC");
    }
2041
2042
2043
2044
    if (! -d $RUNITSRC) {
	mysystem("git clone https://gitlab.flux.utah.edu/emulab/runit".
		 " $RUNITSRC");
    }
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064

    # We're done; mark it.
    mysystem("touch $READYFILE");
    TBDebugTimeStamp("  releasing global lock")
	if ($lockdebug);
    TBScriptUnlock();
    return 0;
}

#
# Prepare any network stuff in the root context for a specific vnode.
# Run once at boot/create, or at reconfigure.  For Docker, this consists
# of creating bridges and/or macvlans, configuring them as necessary,
# and binding them to Docker networks.
#
# NOTE: This function must clean up any side effects if it fails partway.
#
sub rootPreConfigNetwork($$$$)
{
    my ($vnode_id, undef, $vnconfig, $private) = @_;
2065
    my ($code,$content,$resp);
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179

    TBDebugTimeStamp("rootPreConfigNetwork: grabbing global lock".
		     " $GLOBAL_CONF_LOCK")
	if ($lockdebug);
    if (TBScriptLock($GLOBAL_CONF_LOCK,
		     TBSCRIPTLOCK_INTERRUPTIBLE(), 900) != TBSCRIPTLOCK_OKAY()){
	print STDERR "Could not get the global lock!\n";
	return -1;
    }
    TBDebugTimeStamp("  got global lock")
	if ($lockdebug);

    #
    # If we blocked, it would be because vnodes have come or gone,
    # so we need to rebuild the maps.
    #
    # It is important that we do this once we have the global lock!  Our
    # cleanup code at the bad: label depends on us having the lock before
    # we call this.
    #
    refreshNetworkDeviceMaps();

    my $vmid;
    if ($vnode_id =~ /^[-\w]+\-(\d+)$/) {
	$vmid = $1;
    }
    else {
	print STDERR "vz_rootPreConfigNetwork: bad vnode_id $vnode_id, aborting!";
	goto badbad;
    }
    
    my @node_ifs = @{ $vnconfig->{'ifconfig'} };
    my @node_lds = @{ $vnconfig->{'ldconfig'} };

    #
    # If we're using veths, figure out what bridges we need to make:
    # we need a bridge for each physical iface that is a multiplex pipe,
    # and one for each VTAG given PMAC=none (i.e., host containing both sides
    # of a link, or an entire lan).
    #
    my %brs = ();
    my $prefix;
    if ($USE_MACVLAN) {
	$prefix = "mv";
    }
    else {
	$prefix = "br";
    }

    foreach my $ifc (@node_ifs) {
	# XXX
	#next if (!$ifc->{ISVIRT});

	print "$vnode_id interface " . Dumper($ifc) . "\n"
	    if ($debug > 1);

	#
	# In the era of shared nodes, we cannot name the bridges
	# using experiment local names (e.g., the link name).
	# Bridges are now named after either the physical interface
	# they are associated with or the "tag" if there is no physical
	# interface.
	#
	my $brname;
	my $physdev;

	if ($ifc->{ITYPE} eq "loop") {
	    my $vtag  = $ifc->{VTAG};

	    #
	    # No physical device. It's a loopback (trivial) link/lan
	    # All we need is a common bridge to put the veth ifaces into,
	    # or a dummy device to host the macvlan devices on.
	    #
	    $physdev = $brname = "${prefix}$vtag";
	    $brs{$brname}{ENCAP} = 0;
	    $brs{$brname}{SHORT} = 0;
	}
	elsif ($ifc->{ITYPE} eq "vlan") {
	    my $iface = $ifc->{IFACE};
	    my $vtag  = $ifc->{VTAG};
	    my $vdev  = "${iface}.${vtag}";

	    if (! -d "/sys/class/net/$vdev") {
		mysystem2("$VLANCONFIG set_name_type DEV_PLUS_VID_NO_PAD");
		mysystem2("$VLANCONFIG add $iface $vtag");
		goto bad
		    if ($?);
		mysystem2("$VLANCONFIG set_name_type VLAN_PLUS_VID_NO_PAD");

		#
		# We do not want the vlan device to have the same
		# mac as the physical device, since that will confuse
		# findif later.
		#
		my $bmac = fixupMac(GenFakeMac());
		mysystem2("$IP link set $vdev address $bmac");
		goto bad
		    if ($?);

		mysystem2("$IFCONFIG $vdev up");
		# XXX
		#mysystem2("$ETHTOOL -K $vdev tso off gso off");
		refreshNetworkDeviceMaps();

		# XXX
		# Another thing that seems to screw up, causing the ciscos
		# to drop packets with an undersize error.
		#mysystem2("$ETHTOOL -K $iface txvlan off");
	    }
	    # XXX
	    # Temporary, to get existing devices after upgrade.
	    #mysystem2("$ETHTOOL -K $vdev tso off gso off");

2180
2181
2182
	    $physdev =  $vdev;
	    $brname  = $prefix . $vdev;

2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
	    # We save this so we can garbage-collect it in vnodeDestroy.
	    # But we don't remove it here if there's a failure.
	    $private->{'vlandevs'}->{$brname} = $vdev;
	    $brs{$brname}{ENCAP} = 1;
	    $brs{$brname}{SHORT} = 0;
	    $brs{$brname}{PHYSDEV} = $vdev;
	    $brs{$brname}{IFC} = $ifc;
	}
	#
	# These final two cases should only be ITYPE==veth .
	# We will never see a veth on a shared node, thus they
	# have already been created during the physnode config.
	#
	elsif ($ifc->{PMAC} eq "none") {
	    $physdev = $brname = $prefix . $ifc->{VTAG};
	    # if no PMAC, we don't need encap on the bridge
	    $brs{$brname}{ENCAP} = 0;
	    # count members below so we can figure out if this is a shorty
	    $brs{$brname}{MEMBERS} = 0;
	}
	else {
	    my $iface = findIface($ifc->{PMAC});
	    $physdev = $iface;
	    $brname  = $prefix . $iface;
	    $brs{$brname}{ENCAP} = 1;
	    $brs{$brname}{SHORT} = 0;
	    $brs{$brname}{IFC} = $ifc;
	    $brs{$brname}{PHYSDEV} = $iface;
	}
	# Stash for later phase.
	$ifc->{'PHYSDEV'} = $physdev
	    if (defined($physdev));
	$ifc->{'BRIDGE'} = $brname
	    if (defined($brname));

	#
	# Docker networks require a subnet (and a gateway; i.e.
	# https://github.com/docker/libnetwork/issues/1447#issuecomment-247368397).
	# This gateway assumption appears builtin to Docker at abstract
	# levels, and thus would take significant patching to
	# workaround.  So we don't do that.  Instead we have a hack, see below.
	#
	# Anyway, we have to extract and save off the cidr/gateway bits so that
	# when we create the Docker network, we have what we need.
	#
	# XXX: this of course won't work for shared nodes with
	# overlapping exp net subnets!  Docker/libnetwork has an
	# incredibly limited network model; it's ridiculous.
	#
	if (exists($ifc->{IPMASK}) && exists($ifc->{IPADDR})) {
	    # Figure out the subnet for this network:
	    my $netaddr = inet_aton($ifc->{IPADDR}) & inet_aton($ifc->{IPMASK});
	    my $maskbits = 0;
	    foreach my $octet (split(/\./,$ifc->{IPMASK})) {
		my $cval = int($octet);
		for (my $i = 0; $i < 8; ++$i) {
		    $maskbits += $cval & 1;
		    $cval = $cval >> 1;
		}
	    }
	    $brs{$brname}{CIDR} = inet_ntoa($netaddr) . "/$maskbits";

	    #
	    # NB XXX: Use the final address in the subnet as the
	    # gateway.  (I considered using the penultimate address, to
	    # assume that some manually-assigning users will take the
	    # final non-broadcast address, but that is just a grosser
	    # hack -- we'll just document this). Obviously, if this
	    # address was used/assigned by Emulab or the user, that
	    # container will fail to boot!  I could check this in the
	    # single experiment case, but I'm not sure how to check for
	    # a shared LAN.  Anyway, we'll just document this too...
	    #
	    my $bcast = ~inet_aton($ifc->{IPMASK});
	    $brs{$brname}{GW} =
		inet_ntoa($netaddr | pack("N",unpack("N",$bcast) - 1));
	}
	else {
	    warn("Fatal: all Docker network interfaces *must* have an".
		 " IP address and subnet; aborting!");
	    goto bad;
	}
    }

    #
    # Make bridges and add phys ifaces.
    #
    # Or, in the macvlan case, create a dummy device if there is no
    # underlying physdev to "host" the macvlan.
    #
    foreach my $k (keys(%brs)) {
	my $cidr = $brs{$k}{CIDR};
	my $gw = $brs{$k}{GW};

	if (!$USE_MACVLAN) {
	    #
	    # This bridge might be shared with other containers, so difficult
	    # to delete. This really only matters on shared nodes though, where
	    # bridges and vlans could stack up forever (or just a long time).
	    #
	    if (! -d "/sys/class/net/$k/bridge") {
		addbr($k);
		goto bad
		    if ($?);

		#
		# Bad feature of bridges; they take on the lowest numbered
		# mac of the added interfaces (and it changes as interfaces
		# are added and removed!). But the main point is that we end
		# up with a bridge that has the same mac as a physical device
		# and that screws up findIface(). But if we "assign" a mac
		# address, it does not change and we know it will be unique.
		#
		my $bmac = fixupMac(GenFakeMac());
		mysystem2("$IP link set $k address $bmac");
		goto bad
		    if ($?);
	    }
	    # record bridge used
	    $private->{'physbridges'}->{$k} = $k;

	    # repetitions of this should not hurt anything
	    mysystem2("$IFCONFIG $k 0 up");

	    #
	    # Add a physical interface to the bridge if necessary.
	    #
	    if (exists($brs{$k}{PHYSDEV})) {
		my $physdev = $brs{$k}{PHYSDEV};

		#
		# This interface should not be a member of another bridge.
		# If it is, it is an error.
		#
		# Continuing the comment above, this bridge and this interface
		# might be shared with other containers, so we cannot remove it
		# unless it is the only one left. 
		#
		my $obr = findBridge($physdev);
		if (defined($obr) && $obr ne $k) {
		    # Avoid removing the device from the bridge if it
		    # is in the correct bridge. 
		    delbrif($obr, $physdev);
		    goto bad
			if ($?);
		    $obr = undef;
		}
		if (!defined($obr)) {
		    addbrif($k, $physdev);
		    goto bad
			if ($?);
		    # rebuild hashes
		    makeBridgeMaps();
		}

		$private->{'physbridgeifaces'}->{$k}->{$physdev} = $physdev;
	    }

	    #
	    # Now that the bridge exists, make the Docker network atop it.
	    #