#!/usr/bin/perl -T
#
# Copyright (c) 2008-2018 University of Utah and the Flux Group.
#
# {{{EMULAB-LICENSE
#
# This file is part of the Emulab network testbed software.
#
# This file is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
#
# This file is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this file. If not, see <http://www.gnu.org/licenses/>.
#
# }}}
#
# Implements the libvnode API for Docker support in Emulab.
#
# Note that there is no distinguished first or last call of this library
# in the current implementation. Every vnode creation (through mkvnode.pl)
# will invoke all the root* and vnode* functions. It is up to us to make
# sure that "one time" operations really are executed only once.
#
package libvnode_docker;
use Exporter;
@ISA = "Exporter";
@EXPORT = qw( init setDebug rootPreConfig
rootPreConfigNetwork rootPostConfig
vnodeCreate vnodeDestroy vnodeState vnodePoll vnodePollCleanup
vnodeBoot vnodePreBoot vnodeHalt vnodeReboot
vnodeUnmount
vnodePreConfig vnodePreConfigControlNetwork
vnodePreConfigExpNetwork vnodeConfigResources
vnodeConfigDevices vnodePostConfig vnodeExec vnodeTearDown VGNAME
);
use vars qw($VGNAME);
%ops = ( 'init' => \&init,
'setDebug' => \&setDebug,
'rootPreConfig' => \&rootPreConfig,
'rootPreConfigNetwork' => \&rootPreConfigNetwork,
'rootPostConfig' => \&rootPostConfig,
'vnodeCreate' => \&vnodeCreate,
'vnodeDestroy' => \&vnodeDestroy,
'vnodeTearDown' => \&vnodeTearDown,
'vnodeState' => \&vnodeState,
'vnodePoll' => \&vnodePoll,
'vnodePollCleanup' => \&vnodePollCleanup,
'vnodeBoot' => \&vnodeBoot,
'vnodeHalt' => \&vnodeHalt,
'vnodeUnmount' => \&vnodeUnmount,
'vnodeReboot' => \&vnodeReboot,
'vnodeExec' => \&vnodeExec,
'vnodePreConfig' => \&vnodePreConfig,
'vnodePreConfigControlNetwork' => \&vnodePreConfigControlNetwork,
'vnodePreConfigExpNetwork' => \&vnodePreConfigExpNetwork,
'vnodeConfigResources' => \&vnodeConfigResources,
'vnodeConfigDevices' => \&vnodeConfigDevices,
'vnodePostConfig' => \&vnodePostConfig,
);
use strict;
use warnings;
use English;
use Data::Dumper;
use Socket;
use IO::Handle;
use IO::Select;
use File::Basename;
use File::Path;
use File::Copy;
use File::Temp qw(tempdir);
use POSIX;
use JSON::PP;
use Digest::SHA qw(sha1_hex);
# Pull in libvnode
BEGIN { require "/etc/emulab/paths.pm"; import emulabpaths; }
use libutil;
use libgenvnode;
use libvnode;
use libtestbed;
use libsetup;
use libtmcc;
use liblocsetup;
#
# Turn off line buffering on output
#
$| = 1;
#
# Load the OS independent support library. It will load the OS dependent
# library and initialize itself.
#
##
## Standard utilities and files section
##
my $DOCKER = "/usr/bin/docker";
my $CURL = "/usr/bin/curl";
my $BRCTL = "brctl";
my $IP = "/sbin/ip";
my $IFCONFIG = "/sbin/ifconfig";
my $ETHTOOL = "/sbin/ethtool";
my $ROUTE = "/sbin/route";
my $SYSCTL = "/sbin/sysctl";
my $VLANCONFIG = "/sbin/vconfig";
my $MODPROBE = "/sbin/modprobe";
my $IPTABLES = "/sbin/iptables";
my $IPBIN = "/sbin/ip";
my $NETSTAT = "/bin/netstat";
my $IMAGEZIP = "/usr/local/bin/imagezip";
my $IMAGEUNZIP = "/usr/local/bin/imageunzip";
my $IMAGEDUMP = "/usr/local/bin/imagedump";
##
## Runtime configuration options.
##
my $debug = 0;
my $apidebug = 5;
my $lockdebug = 0;
my $sleepdebug = 0;
#
# Set to enable vnodesetup to exit before vnode is completely up
# (see vnodesetup::hackwaitandexit). Allows more parallelism during
# boot-time vnode setup. Note that concurrency may still be constrained
# by $MAXCONCURRENT (defined below) which limits how many new VMs can
# be created at once.
#
my $vsrelease = "immediate"; # or "early" or "none"
#
# If Docker is not already installed, which one should we use? If it's
# not installed, we default to the community edition. This is a
# runtime-checked param, so we'll use whatever is installed by default,
# not necessarily what is specified here.
#
# You really don't want to use docker.io <= 1.12, because it will take
# too many liberties with the control net bridge. For instance, if you
# attempt a `systemctl restart docker.service`, you may be SOL and no
# longer on the control net! docker-ce has patches against this rolled
# in already.
#
my $USE_DOCKER_CE = 1;
#
# Should we use LVM for extra storage space? This should remain set.
#
my $USE_LVM = 1;
#
# Should we use the Docker devicemapper direct-lvm storage backend?
# This should remain set, so that it is used for shared hosts. User
# should be able to change to the default AUFS backend on dedicated
# hosts.
#
my $USE_DOCKER_LVM = 1;
#
# Default NFS mounts to read-only for now so that nothing in the
# container can blow them away accidentally!
#
my $NFS_MOUNTS_READONLY = 0;
#
# Should we log packets the firewall rejects?
#
my $IPTABLES_PACKET_LOG = 1;
#
# Defaults for the default docker bridge (not our control net bridge).
#
my $DOCKER_DEFAULT_BRIDGE_IP = '192.168.254.1';
my $DOCKER_DEFAULT_BRIDGE_CIDR = '192.168.254.1/24';
#
# Docker supports both macvlan and regular bridging, but we use regular
# bridges because we need to impose traffic control on the host context
# half of the veth.
#
# There *is* a Docker plugin for openvswitch, but we don't want to use
# that yet; $BR_USE_OPENVSWITCH is simply for existing code that could
# enable this feature.
#
my $USE_MACVLAN = 0;
#
# We support macvlans on the control net, but we don't use them because
# we need to apply iptables rules outside the containers, so we need the
# host context half of the veth to use as a source interface. It is
# tempting to use a cgroup ID plus net_cls, but apparently the markings
# only hold within the container's netns, and don't make it into the
# root (i.e. https://github.com/docker/docker/issues/19802). So we're
# really stuck with real bridges -- and thus this should not be enabled,
# unless someone else can find a way around this.
#
my $USE_MACVLAN_CNET = 0;
my $USE_OPENVSWITCH = 0;
#
# This flag controls whether we use OVS for GRE tunnels (i.e. for EGRE),
# or if we use Linux kernel GRE + routing + veths.
#
my $TUN_USE_OPENVSWITCH = 0;
##
## Detected configuration variables.
##
#
# Is this our customized version of Docker?
#
my $ISOURDOCKER = 0;
#
# Some commands/subsystems have evolved in incompatible ways over time,
# these vars keep track of such things.
#
my $NEW_LVM = 0;
##
## Various constants.
##
#
# Image wait time. How long (seconds) we will wait when trying to
# grab a lock on an image. Should be set to the max time you think it
# could take to pull a large Docker image. This is a wild guess, obviously.
#
my $MAXIMAGEWAIT = 1800;
#
# Serial console handling. We fire up a capture per active vnode.
# We use a fine assortment of capture options:
#
# -i: standalone mode, don't try to contact capserver directly
# -l: (added later) set directory where log, ACL, and pid files are kept.
# -C: use a circular buffer to capture activity while no user
# is connected. This gets dumped to the user when they connect.
# -T: Put out a timestamp if there has been no previous output
# for at least 10 seconds.
# -L: In conjunction with -T, the timestamp message includes how
# long it has been since the last output.
# -R: Retry interval of 1 second. When capture is disconnected
# from the pty (due to container reboot/shutdowns), this is how
# long we wait between attempts to reconnect.
# -y: When capture disconnects from the pty, we retry forever to reopen.
# -A: tell capture not to prepend '/dev' to the device path we supply.
#
my $CAPTURE = "/usr/local/sbin/capture-nossl";
my $CAPTUREOPTS = "-i -C -L -T 10 -R 1000 -y -1 -A";
my $C2P = "/usr/local/etc/emulab/container2pty.py";
#
# Create a thin pool with the name $POOL_NAME using not more
# than $POOL_FRAC of any disk.
#
my $USE_THIN_LVM = 1;
my $POOL_NAME = "disk-pool";
my $POOL_FRAC = 0.75;
#
# Minimum acceptable size (in GB) of LVM VG for containers.
#
# XXX we used to calculate this in terms of anticipated maximum number
# of vnodes and minimum vnode image size, blah, blah. Now we just pick
# a value that allows us to use a pc3000 node with a single 144GB disk!
#
my $DOCKER_MIN_VGSIZE = 120;
# Striping
my $STRIPE_COUNT = 1;
# Avoid using SSDs unless there are only SSDs
my $LVM_AVOIDSSD = 1;
# Whether or not to use only unpartitioned (unused) disks to form the Xen VG.
my $LVM_FULLDISKONLY = 0;
# Whether or not to use partitions only when they are big.
my $LVM_ONLYLARGEPARTS = 1;
my $LVM_LARGEPARTPCT = 10;
# In general, you only want to use one partition per disk since we stripe.
my $LVM_ONEPARTPERDISK = 1;
#
# Flags for allocating LVs
#
sub ALLOC_NOPOOL() { return 0; }
sub ALLOC_INPOOL() { return 1; }
sub ALLOC_PREFERNOPOOL() { return 2; }
sub ALLOC_PREFERINPOOL() { return 3; }
##
## Randomly chosen convention section
##
# Locks.
my $GLOBAL_CONF_LOCK = "emulabdockerconf";
my $GLOBAL_MOUNT_LOCK = "emulabmounts";
my $SSHD_EXEC_LOCK = "sshdockerexec";
my $DOCKER_EXEC_SSHD_CONFIGFILE = "/etc/ssh/sshd_config-docker-exec";
my $DOCKER_EXEC_SSHD_CONFIGFILE_HEAD = "/etc/ssh/sshd_config-docker-exec.head";
my $DOCKER_EXEC_SSHD_CONFIGDIR = "/etc/ssh/docker-exec.conf.d";
# Config done file.
my $READYFILE = "/var/run/emulab.docker.ready";
# default image to load on logical disks
# Just symlink /boot/vmlinuz-xenU and /boot/initrd-xenU
# to the kernel and ramdisk you want to use by default.
my %defaultImage = (
'name' => "ubuntu:16.04",
# 'hub' => "",
);
# Where we store all our config files.
my $VMS = "/var/emulab/vms";
my $VMDIR = "$VMS/vminfo";
# Extra space for VM info.
my $EXTRAFS = "/vms";
# Extra space for vminfo (/var/emulab/vms) between reloads.
my $INFOFS = "/vminfo";
# Docker LVM volume group name. Accessible outside this file.
$VGNAME = "docker";
# So we can ask this from outside;
sub VGNAME() { return $VGNAME; }
my $CTRLIPFILE = "/var/emulab/boot/myip";
# XXX needs lifting up
my $JAILCTRLNET = "172.16.0.0";
my $JAILCTRLNETMASK = "255.240.0.0";
#
# NB: Total hack. Docker doesn't give you control over default gateway
# for a multi-homed container, other than to ensure that virtual NICs
# are added in lexical order of name, and to promise that the default
# gateway set by the first-added network will remain. So make sure the
# control net has a lexical name at the beginning of everything.
#
my $DOCKERCNET = "_dockercnet";
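#
# A minimal illustration (interface names other than $DOCKERCNET are
# hypothetical): the experiment-network bridges/macvlans created below
# get "br"/"mv" prefixes, so a lexical sort always places the control
# net first, which is the ordering Docker's default-gateway behavior
# relies on:
#
#   my @nets = sort ("breth2.1234", "_dockercnet", "mvdummy7");
#   # @nets is ("_dockercnet", "breth2.1234", "mvdummy7")
#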
#
# Some of the core dirs for Emulabization existing Docker images.
#
my $EMULABSRC = "$EXTRAFS/emulab-devel";
my $PUBSUBSRC = "$EXTRAFS/pubsub";
my $RUNITSRC = "$EXTRAFS/runit";
my $CONTEXTDIR = "$EXTRAFS/contexts";
my $DOCKERFILES = "/etc/emulab/docker/dockerfiles";
# IFBs
my $IFBDB = "/var/emulab/db/ifbdb";
# Use openvswitch for gre tunnels.
# Use a custom version if present, the standard version otherwise.
my $OVSCTL = "/usr/local/bin/ovs-vsctl";
my $OVSSTART = "/usr/local/share/openvswitch/scripts/ovs-ctl";
if (! -x "$OVSCTL") {
$OVSCTL = "/usr/bin/ovs-vsctl";
$OVSSTART = "/usr/share/openvswitch/scripts/ovs-ctl";
}
my $ISREMOTENODE = REMOTEDED();
##
## Emulab constants.
##
my $TMCD_PORT = 7777;
my $SLOTHD_PORT = 8509;
my $EVPROXY_PORT = 16505;
##
## Docker constants.
##
#
# The options as far as what to install in an image to support its use
# in Emulab.
#
# none: we do not alter the image at all!
# basic: install only sshd and syslogd, and whatever init the user wants
# core: basic + install a custom-build of the clientside, using a buildenv of
# the image, but only installing the DESTDIR clientside binaries/fs stuff;
# also install a whole bunch of packages the clientside stuff needs.
# buildenv: basic + full + install all build tools for clientside, and
# install the clientside.
# full: buildenv + packages to make the image identical to a normal Emulab
# disk image.
#
sub DOCKER_EMULABIZE_NONE() { return "none"; }
sub DOCKER_EMULABIZE_BASIC() { return "basic"; }
sub DOCKER_EMULABIZE_CORE() { return "core"; }
sub DOCKER_EMULABIZE_BUILDENV() { return "buildenv"; }
sub DOCKER_EMULABIZE_FULL() { return "full"; }
#
# Most of the Linux images that users will use will be generic images
# whose startup command is sh or bash. We need something that (at
# minimum) runs infinitely, reaps processes like init, and allows remote
# logins via ssh, syslogs, etc. Users are free to specify no
# emulabization to cover the cases where the image runs a bona fide
# daemon or pre-configured init. But we cannot help them with those
# cases automatically.
#
#sub DOCKER_EMULABIZE_DEFAULT() { return DOCKER_EMULABIZE_BASIC(); }
sub DOCKER_EMULABIZE_DEFAULT() { return DOCKER_EMULABIZE_NONE(); }
#
# On modern (i.e., 2016) Linux images, systemd is already installed (on
# Ubuntu/Debian, and Fedora/CentOS). We really want to let people use
# it if it's there, instead of falling back to runit (which we install
# during Emulabization). However, the problem is that we cannot use
# systemd as the init on shared nodes -- systemd requires at least
# read-only access to /sys/fs/cgroup, and docker as of 1.26 does not
# virtualize the cgroup mount (although it's in kernels >= 4.4) -- even
# if Docker did, it might not work; I don't know what systemd wants out
# of /sys/fs/cgroup.
#
# Thus, we must default to runit so that users have images that work on
# both shared and dedicated container hosts. Ugh!
#
sub DOCKER_INIT_INSTALLED() { return "installed"; }
sub DOCKER_INIT_RUNIT() { return "runit"; }
#
# Either we always pull the reference image when setting up a new
# container, or we only pull the first time. Simple.
#
sub DOCKER_PULLPOLICY_LATEST() { return "latest"; }
sub DOCKER_PULLPOLICY_CACHED() { return "cached"; }
# Local functions
sub findRoot();
sub copyRoot($$);
sub replace_hacks($);
sub disk_hacks($);
sub hostMemory();
sub hostResources();
sub hostIP($);
sub fixupMac($);
sub lvmVGSize($);
sub checkForInterrupt();
sub genhostspairlist($$);
sub addMounts($$);
sub removeMounts($);
sub bindNetNS($$);
sub moveNetDeviceToNetNS($$$);
sub moveNetDeviceFromNetNS($$$);
sub unbindNetNS($$);
sub setupImage($$$$$$$$$$);
sub pullImage($$$$;$);
sub emulabizeImage($;$$$$$$$$);
sub analyzeImage($$);
sub AllocateIFBs($$$);
sub ReleaseIFBs($$);
sub CreateShapingScripts($$$$;$);
sub RunShapingScripts($$);
sub CreateRoutingScripts($$);
sub RunRoutingScripts($$);
sub RunWithSignalsBlocked($@);
sub RunProxies($$);
sub KillProxies($$);
sub InsertPostBootIptablesRules($$$$);
sub RemovePostBootIptablesRules($$$$);
sub captureRunning($);
sub captureStart($$);
#
# A single client object per load of this file is safe.
#
my $_CLIENT;
sub getClient()
{
return $_CLIENT
if (defined($_CLIENT));
# Load late, because this requires a bunch of deps we might have
# installed in ensureDeps().
require dockerclient;
$_CLIENT = dockerclient->new();
$_CLIENT->debug($apidebug);
return $_CLIENT;
}
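#
# A usage sketch (mirroring calls made later in this file): the shared
# client handle is fetched lazily wherever the Docker HTTP API is
# needed, e.g.
#
#   my ($code,$content,$resp) = getClient()->network_inspect($DOCKERCNET);
#   # $code is zero on success; $content holds the (decoded) response.
#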
#
# Historic concurrency value. Should get overwritten in setConcurrency.
#
my $MAXCONCURRENT = 5;
#
# Number of concurrent containers set up in parallel. Lifted from
# libvnode_xen; will be changed later.
#
sub setConcurrency($)
{
my ($maxval) = @_;
if ($maxval) {
$MAXCONCURRENT = 5;
} else {
my ($ram,$cpus) = hostResources();
my $disks = $STRIPE_COUNT;
my $hasswapped = hostSwapping();
print STDERR "setConcurrency: cpus=$cpus, ram=$ram, disks=$disks".
" hasswapped=$hasswapped\n"
if ($debug);
if ($cpus > 0 && $disks > 0 && $ram > 0) {
if ($ram < 1024 || (!SHAREDHOST() && $hasswapped)) {
$MAXCONCURRENT = 1;
} elsif ($cpus <= 2 || $disks == 1 || $ram <= 2048) {
$MAXCONCURRENT = 3;
} else {
$MAXCONCURRENT = 5;
}
}
}
print STDERR "Limiting to $MAXCONCURRENT concurrent vnode creations.\n";
}
sub setDebug($)
{
$debug = shift;
libvnode::setDebug($debug);
$lockdebug = 1;
if ($debug > 1) {
$sleepdebug = 1;
$apidebug = 5;
}
print "libvnode_docker: debug=$debug, apidebug=$apidebug\n"
if ($debug);
}
sub ImageLockName($)
{
my ($imagename) = @_;
my $ln = "dockerimage." .
(defined($imagename) ? $imagename : $defaultImage{'name'});
$ln =~ tr/\//-/;
return $ln;
}
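# E.g. (image name is illustrative), ImageLockName("library/ubuntu:16.04")
# yields "dockerimage.library-ubuntu:16.04"; slashes are mapped to dashes
# so the result is usable as a lock file name.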
sub ImageLVName($)
{
my ($imagename) = @_;
return "image+" . $imagename;
}
#
# Apt constants and helper functions.
#
my $APTGET = "/usr/bin/apt-get";
my $APTGETINSTALL = "$APTGET -o Dpkg::Options::='--force-confold'".
" -o Dpkg::Options::='--force-confdef' install -y ";
my $APTLOCK = "emulab.apt.running";
my $APTLOCK_REF;
my $APTUPDATEDFILE = "/var/run/emulab.apt.updated";
sub aptLock()
{
TBDebugTimeStamp("aptLock: grabbing global lock $APTLOCK")
if ($lockdebug);
my $locked = TBScriptLock($APTLOCK,
TBSCRIPTLOCK_GLOBALWAIT(),900,\$APTLOCK_REF);
if ($locked != TBSCRIPTLOCK_OKAY()) {
return 0
if ($locked == TBSCRIPTLOCK_IGNORE());
print STDERR "Could not get the apt-get lock after a long time!\n";
return -1;
}
TBDebugTimeStamp(" got global lock $APTLOCK")
if ($lockdebug);
return 0;
}
sub aptUnlock()
{
return TBScriptUnlock($APTLOCK_REF);
}
# Only run once per boot.
sub aptGetUpdate()
{
if (-f $APTUPDATEDFILE) {
return 0;
}
aptLock();
mysystem2("apt-get update");
if (!$?) {
mysystem("touch $APTUPDATEDFILE");
}
my $rc = $?;
aptUnlock();
return $rc;
}
#
# Returns 0 if all packages are installed; else the number of
# non-installed packages.
#
sub aptNotInstalled(@)
{
my @packages = @_;
my $rc = 0;
foreach my $P (@packages) {
my $pstat = `dpkg-query -L $P 2>&1 >/dev/null`;
if ($pstat) {
++$rc;
}
}
return $rc;
}
sub aptGetInstall(@)
{
my @packages = @_;
my $rc = 0;
aptGetUpdate();
$ENV{DEBIAN_FRONTEND} = 'noninteractive';
aptLock();
foreach my $P (@packages) {
mysystem2("$APTGETINSTALL $P");
if ($?) {
++$rc;
}
}
aptUnlock();
$ENV{DEBIAN_FRONTEND} = undef;
return $rc;
}
sub aptGetEnsureInstalled(@)
{
my @packages = @_;
my $rc = 0;
foreach my $P (@packages) {
$rc += aptGetInstall($P)
if (aptNotInstalled($P));
}
return $rc;
}
sub refreshNetworkDeviceMaps()
{
makeIfaceMaps();
if (!$USE_MACVLAN) {
makeBridgeMaps();
}
else {
makeMacvlanMaps();
}
}
sub ensureDeps()
{
if (aptNotInstalled("libwww-perl")) {
aptGetInstall("libwww-perl");
}
if (aptNotInstalled("liburi-perl")) {
aptGetInstall("liburi-perl");
}
if (aptNotInstalled("libhash-merge-perl")) {
aptGetInstall("libhash-merge-perl");
}
if (aptNotInstalled("libmime-base64-urlsafe-perl")) {
aptGetInstall("libmime-base64-urlsafe-perl");
}
eval {
require LWP::Protocol::http::SocketUnixAlt;
};
if ($@) {
mysystem("cpan -i LWP::Protocol::http::SocketUnixAlt");
}
if (aptNotInstalled("python-docker")) {
aptGetInstall("python-docker");
}
}
# (Must be called only after refreshNetworkDeviceMaps() is called for
# the first time in init.)
sub ensureDockerInstalled()
{
if (!aptNotInstalled("docker.io")) {
TBDebugTimeStamp("docker.io installed; using that");
$USE_DOCKER_CE = 0;
}
elsif (!aptNotInstalled("docker-ce")) {
TBDebugTimeStamp("docker-ce installed; using that");
$USE_DOCKER_CE = 1;
}
if (!$USE_DOCKER_CE) {
TBDebugTimeStamp("Ensuring docker.io installed...");
if (aptNotInstalled("docker.io")) {
TBDebugTimeStamp("Installing docker.io...");
if (aptGetInstall("docker.io")) {
die("Failed to install docker.io; aborting!\n");
}
mysystem2("service docker restart");
# Remap, cause Docker creates some ifaces.
refreshNetworkDeviceMaps();
}
#
# Check which docker this is.
#
if (-e "/usr/share/docker.io/EMULAB.md") {
$ISOURDOCKER = 1;
}
}
else {
TBDebugTimeStamp("Ensuring docker-ce installed...");
# Ensure the Docker CE repo is configured.
system("grep -q docker.com /etc/apt/sources.list /etc/apt/sources.list.d");
if ($?) {
TBDebugTimeStamp("Installing docker-ce Apt repos...");
aptGetEnsureInstalled("apt-transport-https","ca-certificates",
"curl","software-properties-common");
mysystem("curl -fsSL https://download.docker.com/linux/ubuntu/gpg".
" | sudo apt-key add -");
my $release = `lsb_release -cs`;
chomp($release);
my $arch = `uname -m`;
chomp($arch);
if ($arch eq 'x86_64' || $arch eq 'amd64') {
$arch = "amd64";
}
elsif ($arch eq 'armhf') {
;
}
else {
fatal("currently docker CE is only available on amd64/armhf!");
}
mysystem("add-apt-repository".
" \"deb [arch=$arch] https://download.docker.com/linux/ubuntu $release stable\"");
aptGetUpdate();
}
if (aptNotInstalled("docker-ce")) {
TBDebugTimeStamp("Installing docker-ce...");
if (aptGetInstall("docker-ce")) {
warn("Failed to install docker-ce; retrying in 8 seconds!\n");
sleep(8);
system("systemctl restart docker.service");
sleep(2);
system("apt-get install -y docker-ce");
if ($?) {
fatal("Failed to install docker-ce; aborting!\n");
}
}
mysystem2("service docker restart");
# Remap, cause Docker creates some ifaces.
refreshNetworkDeviceMaps();
}
#
# Check which docker this is.
#
if (-e "/usr/share/docker-ce/EMULAB.md") {
$ISOURDOCKER = 1;
}
}
#if (aptNotInstalled("systemd-container")
# && aptGetInstall("systemd-container")) {
# die("Failed to install systemd-container; aborting!\n");
#}
#
# Check or create the Docker config file; if we have to modify it,
# restart Docker.
#
mkdir("/etc")
if (! -d "/etc");
mkdir("/etc/docker")
if (! -d "/etc/docker");
my $origjsontext = '';
my $json = {};
my $changed = 0;
if (-e "/etc/docker/daemon.json") {
open(FD,"/etc/docker/daemon.json")
or die("could not open /etc/docker/daemon.json: $!");
my @lines = <FD>;
close(FD);
$origjsontext = join("",@lines);
$json = decode_json($origjsontext);
}
# Check to ensure the docker iface has a non-172.16 subnet:
my $diface = getIfaceInfo("docker0");
if (!defined($diface)) {
fatal("Could not find default docker network interface; aborting!");
}
if ($diface->{'ip'} ne $DOCKER_DEFAULT_BRIDGE_IP
|| !defined($json) || !exists($json->{'bip'})
|| $json->{'bip'} ne $DOCKER_DEFAULT_BRIDGE_CIDR) {
TBDebugTimeStamp("Moving docker0 to $DOCKER_DEFAULT_BRIDGE_CIDR");
# Blast our docker opts into the right place:
$json->{'bip'} = $DOCKER_DEFAULT_BRIDGE_CIDR;
$changed = 1;
}
# Check to ensure we're doing the right thing w.r.t. iptables:
my $iptval = ($ISOURDOCKER) ? JSON::PP::true : JSON::PP::false;
my $ichanged = 0;
if (!defined($json) || !exists($json->{"iptables"})
|| $json->{'iptables'} != $iptval) {
$json->{'iptables'} = $iptval;
$changed = 1;
$ichanged = 1;
}
if (!defined($json) || !exists($json->{"ip-masq"})
|| $json->{'ip-masq'} != $iptval) {
$json->{'ip-masq'} = $iptval;
$changed = 1;
$ichanged = 1;
}
if ($changed) {
TBDebugTimeStamp("Updating /etc/docker/daemon.json");
my $newjsontext = encode_json($json);
open(FD,">/etc/docker/daemon.json")
or die("could not write /etc/docker/daemon.json: $!");
print FD $newjsontext;
close(FD);
mysystem2("service docker stop");
if ($ichanged && !$ISOURDOCKER) {
#
# Make sure all the Docker stuff is undone, if this is not
# our Docker.
#
mysystem("$IPTABLES -P FORWARD ACCEPT");
mysystem("$IPTABLES -F INPUT");
mysystem("$IPTABLES -F OUTPUT");
mysystem("$IPTABLES -F FORWARD");
mysystem("$IPTABLES -F DOCKER");
mysystem2("$IPTABLES -X DOCKER");
mysystem("$IPTABLES -F DOCKER-ISOLATION");
mysystem2("$IPTABLES -X DOCKER-ISOLATION");
}
mysystem2("service docker start");
# Remap, cause Docker creates some ifaces.
refreshNetworkDeviceMaps();
}
return 0;
}
sub setupDockerExecSSH() {
#
# We need to read the default sshd config; comment out any Port or
# ListenAddress lines; and write it out to the head config file.
# Note, we blow away the head file when first configuring the phost
# to support docker.
#
my @newlines = ();
open(FD,"/etc/ssh/sshd_config");
my @lines = <FD>;
close(FD);
foreach my $line (@lines) {
if ($line =~ /^\s*(Port|ListenAddress)/) {
$line = "#$line";
}
push(@newlines,$line);
}
open(FD,">$DOCKER_EXEC_SSHD_CONFIGFILE_HEAD");
print FD @newlines;
close(FD);
#
# Then make the dir where we put the per-vhost sshd config bits.
#
mysystem("mkdir -p $DOCKER_EXEC_SSHD_CONFIGDIR");
return 0;
}
sub rebuildAndReloadDockerExecSSH() {
my $retval;
TBDebugTimeStamp("rebuildAndReloadDockerExecSSH: grabbing sshd lock".
" $SSHD_EXEC_LOCK")
if ($lockdebug);
my $locked = TBScriptLock($SSHD_EXEC_LOCK,TBSCRIPTLOCK_GLOBALWAIT(), 900);
if ($locked != TBSCRIPTLOCK_OKAY()) {
return 0
if ($locked == TBSCRIPTLOCK_IGNORE());
print STDERR "Could not get the $SSHD_EXEC_LOCK lock".
" after a long time!\n";
return -1;
}
#
# Our private Docker Exec sshd listens on the private VM ports and
# when a user authenticates, we use the ForceCommand directive in a
# Match block to gateway them into the container that is supposed to
# be reachable via ssh on that port. However, only Match blocks may
# follow other Match blocks -- in particular, a Port directive (to
# listen on) must precede the Match blocks. Thus, for each
# container, we create one file in the configdir like
# 0.$vnode_id.port with the Port line, and another like
# 1.$vnode_id.match with the match and command directives).
#
# Thus, we need an rcsorted order of files in $DOCKER_EXEC_SSHD_CONFIGDIR.
#
my @pmlines = ();
if (sortedreadallfilesindir($DOCKER_EXEC_SSHD_CONFIGDIR,\@pmlines)) {
$retval = -1;
goto out;
}
open(FD,"$DOCKER_EXEC_SSHD_CONFIGFILE_HEAD");
my @hlines = <FD>;
close(FD);
open(FD,">$DOCKER_EXEC_SSHD_CONFIGFILE");
print FD "".join('',@hlines)."\n".join('',@pmlines)."\n";
close(FD);
#
# But, if there were no port/match lines, *stop* the service instead of
# restarting -- because it would probably try to start on port 22, which
# of course will just fail it.
#
if (@pmlines == 0) {
TBDebugTimeStamp("No more ports/commands in sshd_config-docker-exec;".
" stopping service!");
mysystem2("systemctl stop sshd-docker-exec.service");
}
else {
TBDebugTimeStamp("Restarting sshd-docker-exec.service for changes to".
" sshd_config-docker-exec");
mysystem2("systemctl restart sshd-docker-exec.service");
}
$retval = 0;
out:
TBScriptUnlock();
return $retval;
}
sub addContainerToDockerExecSSH($$$) {
my ($vnode_id,$port,$shell) = @_;
open(FD,">$DOCKER_EXEC_SSHD_CONFIGDIR/0.${vnode_id}.port");
print FD "Port $port\n";
close(FD);
open(FD,">$DOCKER_EXEC_SSHD_CONFIGDIR/1.${vnode_id}.match");
print FD "Match LocalPort=$port\n";
print FD "ForceCommand /usr/bin/sudo /usr/bin/docker exec -it $vnode_id $shell\n";
close(FD);
return rebuildAndReloadDockerExecSSH();
}
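#
# For example (values are illustrative only), a call like
# addContainerToDockerExecSSH("pcvm10-1", 30010, "/bin/bash") leaves two
# fragments that rebuildAndReloadDockerExecSSH() concatenates after the
# head file:
#
#   0.pcvm10-1.port:   Port 30010
#   1.pcvm10-1.match:  Match LocalPort=30010
#                      ForceCommand /usr/bin/sudo /usr/bin/docker exec -it pcvm10-1 /bin/bash
#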
sub removeContainerFromDockerExecSSH($) {
my ($vnode_id,) = @_;
unlink("$DOCKER_EXEC_SSHD_CONFIGDIR/0.${vnode_id}.port");
unlink("$DOCKER_EXEC_SSHD_CONFIGDIR/0.${vnode_id}.match");
return rebuildAndReloadDockerExecSSH();
}
sub getBridgeInterfaces($)
{
my ($brname,) = @_;
my @output = `$BRCTL show $brname`;
if ($?) {
return undef;
}
my @retval = ();
foreach my $line (@output) {
if ($line =~ /^[^\s]+\s+[^\s]+\s+[^\s]+\s+([^\s]+)$/) {
push(@retval,$1);
}
}
return @retval;
}
sub getDockerNetMemberIds($)
{
my ($netname,) = @_;
my ($code,$content,$resp) = getClient()->network_inspect($netname);
if ($code) {
return undef;
}
if (!exists($content->{"Containers"})) {
return ();
}
my @retval = ();
foreach my $cid (keys(%{$content->{"Containers"}})) {
next
if (!exists($content->{"Containers"}{$cid}{"Name"}));
push(@retval,$cid);
}
return @retval;
}
sub setupLVM()
{
print "Enabling LVM...\n"
if ($debug);
# We assume our kernels support this.
mysystem2("$MODPROBE dm-snapshot");
if ($?) {
print STDERR "ERROR: could not load snaphot module!\n";
return -1;
}
#
# Make sure pieces are at least 32 GiB.
#
my $minpsize = 32 * 1024;
my %devs = libvnode::findSpareDisks($minpsize, $LVM_AVOIDSSD);
# if ignoring SSDs but came up with nothing, we have to use them!
if ($LVM_AVOIDSSD && keys(%devs) == 0) {
%devs = libvnode::findSpareDisks($minpsize, 0);
}
#
# Turn on write caching. Hacky.
# XXX note we do not use the returned "path" here as we need to
# change the setting on all devices, not just the whole disk devices.
#
my %diddev = ();
foreach my $dev (keys(%devs)) {
# only mess with the disks we are going to use
if (!exists($diddev{$dev}) &&
(exists($devs{$dev}{"size"}) || $LVM_FULLDISKONLY == 0)) {
mysystem2("hdparm -W1 /dev/$dev");
$diddev{$dev} = 1;
}
}
undef %diddev;
#
# See if our LVM volume group for VMs exists and create it if not.
#
my $vg = `vgs | grep $VGNAME`;
if ($vg !~ /^\s+${VGNAME}\s/) {
print "Creating volume group...\n"
if ($debug);
#
# Total up potential maximum size.
# Also determine mix of SSDs and non-SSDs if required.
#
my $maxtotalSize = 0;
my $sizeThreshold = 0;
foreach my $dev (keys(%devs)) {
if (defined($devs{$dev}{"size"})) {
$maxtotalSize += $devs{$dev}{"size"};
} else {
foreach my $part (keys(%{$devs{$dev}})) {
$maxtotalSize += $devs{$dev}{$part}{"size"};
}
}
}
if ($maxtotalSize > 0) {
$sizeThreshold = int($maxtotalSize * $LVM_LARGEPARTPCT / 100.0);
}
#
# Find available devices of sufficient size, prepare them,
# and incorporate them into a volume group.
#
my $totalSize = 0;
my @blockdevs = ();
foreach my $dev (sort keys(%devs)) {
#
# Whole disk is available, use it.
#
if (defined($devs{$dev}{"size"})) {
push(@blockdevs, $devs{$dev}{"path"});
$totalSize += $devs{$dev}{"size"};
next;
}
#
# Disk contains partitions that are available.
#
my ($lpsize,$lppath);
foreach my $part (keys(%{$devs{$dev}})) {
my $psize = $devs{$dev}{$part}{"size"};
my $ppath = $devs{$dev}{$part}{"path"};
#
# XXX one way to avoid using the system disk, just ignore
# all partition devices. However, in cases where the
# remainder of the system disk represents the majority of
# the available space (e.g., Utah d710s), this is a bad
# idea.
#
if ($LVM_FULLDISKONLY) {
print STDERR
"WARNING: not using partition $ppath for LVM\n";
next;
}
#
# XXX Another heuristic to try to weed out the system
# disk whenever feasible: if a partition device represents
# less than some percentage of the max possible space,
# avoid it. At Utah this one is tuned (10%) to avoid using
# left over space on the system disk of d820s (which have
# six other larger drives) or d430s (which have two large
# disks) while using it on the pc3000s and d710s.
#
if ($LVM_ONLYLARGEPARTS && $psize < $sizeThreshold) {
print STDERR "WARNING: not using $ppath for LVM (too small)\n";
next;
}
#
# XXX If we are only going to use one partition per disk,
# record the largest one we find here. This check will
# filter out the small "other OS" partition (3-6GB) in
# favor of the larger "rest of the disk" partition.
#
if ($LVM_ONEPARTPERDISK) {
if (!defined($lppath) || $psize > $lpsize) {
$lppath = $ppath;
$lpsize = $psize;
}
next;
}
#
# It ran the gauntlet of feeble filters, use it!
#
push(@blockdevs, $ppath);
$totalSize += $psize;
}
if ($LVM_ONEPARTPERDISK && defined($lppath)) {
push(@blockdevs, $lppath);
$totalSize += $lpsize;
}
}
if (@blockdevs == 0) {
print STDERR "ERROR: findSpareDisks found no disks for LVM!\n";
return -1;
}
my $blockdevstr = join(' ', sort @blockdevs);
mysystem("pvcreate $blockdevstr");
mysystem("vgcreate $VGNAME $blockdevstr");
my $size = lvmVGSize($VGNAME);
if ($size < $DOCKER_MIN_VGSIZE) {
print STDERR "WARNING: physical disk space below the desired ".
" minimum value ($size < $DOCKER_MIN_VGSIZE), expect trouble.\n";
}
}
$STRIPE_COUNT = computeStripeSize($VGNAME);
#
# Make sure our volumes are active -- they seem to become inactive
# across reboots
#
mysystem("vgchange -a y $VGNAME");
return 0;
}
#
# Bridge stuff
#
sub addbr($)
{
my $br = $_[0];
my $cmd = ($USE_OPENVSWITCH ? "$OVSCTL add-br" : "$BRCTL addbr") . " $br";
system($cmd);
}
sub delbr($)
{
my $br = $_[0];
if ($USE_OPENVSWITCH) {
mysystem2("$OVSCTL del-br $br");
}
else {
mysystem2("$IFCONFIG $br down");
mysystem2("$BRCTL delbr $br");
}
}
sub addbrif($$)
{
my $br = $_[0];
my $if = $_[1];
my $cmd = ($USE_OPENVSWITCH ? "$OVSCTL add-port" : "$BRCTL addif") .
" $br $if";
system($cmd);
}
sub delbrif($$)
{
my $br = $_[0];
my $if = $_[1];
my $cmd = ($USE_OPENVSWITCH ? "$OVSCTL del-port" : "$BRCTL delif") .
" $br $if";
system($cmd);
}
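#
# Usage sketch (matching calls made later in this file): the control net
# bridge is assembled with these helpers, regardless of whether the
# openvswitch or brctl backend is in use, e.g.
#
#   addbr($DOCKERCNET);
#   addbrif($DOCKERCNET, $orig_cnet_iface);
#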
##
## libvnode API implementation
##
sub init($)
{
my ($pnode_id,) = @_;
if ($USE_LVM) {
# See what version of LVM we have. Again, some commands are different.
my $out = `lvm version | grep 'LVM version'`;
if (defined($out) && $out =~ /LVM version:\s+(\d+)\.(\d+)\.(\d+)/) {
if (int($1) > 2 ||
(int($1) == 2 && int($2) > 2) ||
(int($1) == 2 && int($2) == 2 && int($3) >= 99)) {
$NEW_LVM = 1;
}
}
# Compute the stripe size for new lvms.
if (-e "$READYFILE") {
$STRIPE_COUNT = computeStripeSize($VGNAME);
}
}
#
# Check which docker this is.
#
if (-e "/usr/share/docker.io/EMULAB.md") {
$ISOURDOCKER = 1;
}
return 0;
}
#
# Called on each vnode, but should only be executed once per boot.
# We use a file in /var/run (cleared on reboots) to ensure this.
#
sub rootPreConfig($)
{
my $bossip = shift;
my ($code,$content,$resp);
#
# Haven't been called yet, grab the lock and double check that someone
# didn't do it while we were waiting.
#
if (! -e "$READYFILE") {
TBDebugTimeStamp("rootPreConfig: grabbing global lock $GLOBAL_CONF_LOCK")
if ($lockdebug);
my $locked = TBScriptLock($GLOBAL_CONF_LOCK,
TBSCRIPTLOCK_GLOBALWAIT(), 900);
if ($locked != TBSCRIPTLOCK_OKAY()) {
return 0
if ($locked == TBSCRIPTLOCK_IGNORE());
print STDERR "Could not get the $GLOBAL_CONF_LOCK lock".
" after a long time!\n";
return -1;
}
}
TBDebugTimeStamp(" got global lock")
if ($lockdebug);
if (-e "$READYFILE") {
TBDebugTimeStamp(" releasing global lock")
if ($lockdebug);
TBScriptUnlock();
return 0;
}
TBDebugTimeStamp("Configuring root vhost context");
#
# Ensure we have the latest bridge/iface state!
#
refreshNetworkDeviceMaps();
#
# Make sure we actually have Docker.
#
ensureDockerInstalled();
#
# Make sure we have all our Perl deps.
#
ensureDeps();
#
# Make sure we have a bunch of other common tools.
#
aptGetEnsureInstalled("lvm2","thin-provisioning-tools",
"bridge-utils","iproute2","vlan");
#
# Set up the docker exec sshd service.
#
setupDockerExecSSH();
#
# Setup our control net device if not already up.
#
if ($USE_MACVLAN_CNET || $USE_MACVLAN) {
#
# If we build dummy shortbridge nets atop either a physical
# device, or atop a dummy device, load these!
#
mysystem("$MODPROBE macvlan");
mysystem("$MODPROBE dummy");
}
if (!$USE_MACVLAN_CNET || !$USE_MACVLAN) {
mysystem("$MODPROBE bridge");
}
my ($cnet_iface,$cnet_ip,$cnet_mask,
$cnet_maskbits,$cnet_net,$cnet_mac,$cnet_gw) = findControlNet();
my ($alias_ip,$alias_mask,$vmac) = hostControlNet();
my ($VCNET_NET,undef,$VCNET_GW,$VCNET_SLASHMASK) = findVirtControlNet();
my $nettype = ($USE_MACVLAN_CNET) ? "macvlan" : "bridge";
#
# NB: in the case of !$USE_MACVLAN_CNET (i.e. using bridges for
# control net) and !$ISREMOTENODE, we place the real routable
# control net addr on the bridge and put the real control net dev in
# the bridge. So we want to track the orig_cnet_iface. Once we
# shuffle that dev into the bridge, we reset the
# /var/emulab/boot/controlif file to point to the bridge -- and thus
# if this gets re-run, it won't get the real control net dev as in
# arg to this function. So the code that handles this case is
# careful to use orig_cnet_iface instead of cnet_iface! None of the
# other cases care, since they don't re-write
# /var/emulab/boot/controlif.
#
my $orig_cnet_iface;
#
# Assume if this is not present, this is the first time running. If
# so, the real control net device must have the real control net IP;
# not $DOCKERCNET! So if you wipe this file out to retry, make sure
# to reset the real controlif with proper info from dhclient.
#
if (! -e "/var/run/emulab-controlif-orig") {
$orig_cnet_iface = $cnet_iface;
open(FD,">/var/run/emulab-controlif-orig")
or fatal("could not open /var/run/emulab-controlif-orig: $!");
print FD "$cnet_iface";
close(FD);
}
else {
open(FD,"/var/run/emulab-controlif-orig")
or fatal("could not open /var/run/emulab-controlif-orig: $!");
$orig_cnet_iface = <FD>;
chomp($orig_cnet_iface);
close(FD);
}
my $dcnexists = 0;
TBDebugTimeStamp("checking for docker network $DOCKERCNET...");
($code,$content,$resp) = getClient()->network_inspect($DOCKERCNET);
if ($code == 0) {
$dcnexists = 1;
}
if ($USE_MACVLAN_CNET && ! -e "/sys/class/net/$DOCKERCNET") {
my $alias_net =
inet_ntoa(inet_aton($alias_ip) & inet_aton($alias_mask));
if (!$ISREMOTENODE) {
#
# We first add a macvlan "alias" to the control net device
# so that we (the physical host) are in the same subnet as
# the vnodes. With the macvlan interfaces, you cannot
# directly alias the parent device and talk to/from the
# other macvlan children on the parent.
#
print "Creating $DOCKERCNET macvlan on $cnet_iface".
" ($alias_ip,$alias_mask)...\n";
mysystem("ip link add link $cnet_iface name $DOCKERCNET".
" address $vmac type macvlan mode bridge");
mysystem("ip addr replace $alias_ip/$alias_mask dev $DOCKERCNET");
mysystem("ip link set up $DOCKERCNET");
#my $isroutable = isRoutable($alias_ip);
## Add a route to reach the vnodes. Do it for the entire
## network, and no need to remove it.
#if (!$ISREMOTENODE && !$isroutable
# && system("$NETSTAT -r | grep -q $alias_net")) {
# mysystem2("$ROUTE add -net $alias_net netmask $alias_mask dev $cnet_iface");
# if ($?) {
# warn("could not add non-routable local virt control net route!");
# #return -1;
# }
#}
}
else {
#
# XXX will this actually work? macvlan children can't talk to host?
# XXX probably need to add a dummy device to back the docker
# macvlan network!
# $alias_ip = $cnet_ip;
#
# Ok, since that won't work, in this case, we add a dummy
# device to host our control net macvlan devices atop; we
# don't want anything bridged to the outside world in the
# remoteded case. Then we add our control net alias like
# above.
#
$cnet_iface = "dummycnet";
mysystem2("ip link add dummycnet type dummy");
print "Creating $DOCKERCNET macvlan on $cnet_iface".
" ($alias_ip,$alias_mask)...\n";
mysystem("ip link add link $cnet_iface".
" name $DOCKERCNET address $vmac type macvlan mode bridge");
mysystem("ip addr replace $alias_ip/$alias_mask dev $DOCKERCNET");
mysystem("ip link set up $DOCKERCNET");
}
}
elsif (!$USE_MACVLAN_CNET
&& (!$dcnexists
|| ! -e "/sys/class/net/$DOCKERCNET"
|| !defined(findBridge($orig_cnet_iface))
|| findBridge($orig_cnet_iface) ne $DOCKERCNET)) {
my $alias_net =
inet_ntoa(inet_aton($alias_ip) & inet_aton($alias_mask));
#
# If the bridge doesn't exist, add it first.
#
if (! -e "/sys/class/net/$DOCKERCNET") {
addbr($DOCKERCNET);
if ($?) {
fatal("failed to create $DOCKERCNET bridge!");
return -1;
}
}
#
# The $ISREMOTENODE case is easy, because the real control net
# device doesn't go into the bridge, and we and Docker expect
# the bridge to have the fake virtual control net address. So
# harmony ensues.
#
# The !$ISREMOTENODE case is very, very tricky. The first time
# we boot, the docker network doesn't exist; the bridge doesn't
# exist; all the control net state is as dhclient left it. The
# correct order there is create bridge; flush control net ip
# addr; move control net dev into bridge; add control net as
# docker network; flush bridge ip addr Docker set; set our
# proper public control net IP as the bridge ip addr; and add
# the unroutable virtual control net addr (the docker network
# gateway) as an alias. NB: Docker will not accept or add the
# virtual control net IP as an alias; it will error, or force
# the IP to the virtual addr. That is why we must fix it up
# after creating the Docker network.
#
# On subsequent boots, the control net already exists as a
# Docker network, and Docker will create the control net device
# before we run. However, Docker doesn't put the real control
# net device into that bridge (it doesn't know that kind of
# thing); but it does give the bridge the virtual control IP as
# its primary IP. So, we have to flush the bridge IP, and *not*
# remake the Docker cnet.
#
# What a pain, all because Docker cannot just leave an existing
# bridge alone (i.e.,
# https://github.com/docker/docker/issues/20758).
#
if (!$ISREMOTENODE) {
my $ipandmaskbits = "$cnet_ip/$cnet_maskbits";
# First grab the default gateway.
my ($defroute,$defrouteiface);
open(ROUTEOUTPUT,"ip route list |")
or fatal("unable to get route list via 'ip'!");
while (!eof(ROUTEOUTPUT)) {
my $line = <ROUTEOUTPUT>;
chomp($line);
if ($line =~ /^default via (\d+\.\d+\.\d+\.\d+)/) {
$defroute = $1;
}
if ($line =~ /^default via [\w\.\/]+\s+dev\s+([\w\.]+)/) {
$defrouteiface = $1;
}
}
if (!$defroute) {
fatal("could not find default route!");
}
#
# Undo the existing control net config we obtained on boot,
# and move that interface into our $DOCKERCNET bridge, IFF
# it's not in the bridge already. If it's already in the
# bridge, no need to do any of this.
#
if (!defined(findBridge($orig_cnet_iface))
|| findBridge($orig_cnet_iface) ne $DOCKERCNET) {
mysystem2("ip link set down $orig_cnet_iface");
mysystem2("ip addr del $ipandmaskbits dev $orig_cnet_iface");
mysystem2("ip addr flush dev $orig_cnet_iface");
addbrif($DOCKERCNET,$orig_cnet_iface);
}
#
# If the Docker network does not exist in Docker itself, but
# it *does* exist as a device, flush its IP addr since
# Docker insists on setting that itself.
#
if (!$dcnexists && -e "/sys/class/net/$DOCKERCNET") {
mysystem2("ip addr flush dev $DOCKERCNET");
}
#
# If the docker network isn't yet built, do that now.
#
if (!$dcnexists) {
TBDebugTimeStamp("creating bridged docker network $DOCKERCNET");
($code,$content) = getClient()->network_create_bridge(
$DOCKERCNET,"${VCNET_NET}/${VCNET_SLASHMASK}",$alias_ip,
$DOCKERCNET);
if ($code) {
fatal("failed to create bridged Docker $DOCKERCNET control net:".
" $content");
}
$dcnexists = 1;
}
#
# Always flush the bridge's Docker-imposed addr immediately,
# whether it existed or we created it.
#
mysystem("ip addr flush dev $DOCKERCNET");
#
# Set the $DOCKERCNET configuration to one that both we and
# Docker are happy with.
#
mysystem2("ip addr add $ipandmaskbits dev $DOCKERCNET");
if ($?) {
mysystem("ip addr replace $ipandmaskbits dev $DOCKERCNET");
}
mysystem("ip link set up $DOCKERCNET");
mysystem("ip link set up $orig_cnet_iface");
if ($defrouteiface eq $cnet_iface
|| $defrouteiface eq $orig_cnet_iface) {
mysystem("ip route replace default via $defroute");
}
mysystem("ip addr add $alias_ip/$alias_mask dev $DOCKERCNET".
" label $DOCKERCNET:1");
#
# Save the bridge as the real control net iface.
#
open(CONTROLIF,">$BOOTDIR/controlif");
print CONTROLIF "$DOCKERCNET\n";
close(CONTROLIF);
}
else {
#
# If this node is remote, then it gets a bridge without the
# control net.
#
mysystem("ip addr replace $alias_ip/$alias_mask dev $DOCKERCNET");
mysystem("ip link set up $DOCKERCNET");
}
}
#
# Now if the Docker control net still doesn't exist, create that.
#
if (!$dcnexists) {
if ($USE_MACVLAN_CNET) {
#
# Next, we create a docker macvlan network to front for the
# virt control net.
#
TBDebugTimeStamp("creating macvlan docker network $DOCKERCNET");
($code,$content) = getClient()->network_create_macvlan(
$DOCKERCNET,"${VCNET_NET}/${VCNET_SLASHMASK}",$alias_ip,
$cnet_iface);
if ($code) {
fatal("failed to create bridged Docker $DOCKERCNET control net:".
" $content");
}
}
else {
TBDebugTimeStamp("creating bridged docker network $DOCKERCNET");
($code,$content) = getClient()->network_create_bridge(
$DOCKERCNET,"${VCNET_NET}/${VCNET_SLASHMASK}",$alias_ip,
$DOCKERCNET);
if ($code) {
fatal("failed to create bridged Docker $DOCKERCNET control net:".
" $content");
}
}
}
#
# Mesh our iptables setup with docker's. This is nontrivial because
# Docker does one nasty thing: it continually forces its -j
# DOCKER-ISOLATION rule into the top of the FORWARD chain on
# significant operations (like creating a container). This has been
# much discussed but not fixed, so we have two strategies. First,
# we have a patched version of Docker that does not do this crazy
# crap; second, if that is not available, we disable its use of
# iptables and do all the stuff Docker would normally do that we
# actually need (a subset of what Docker normally does).
#
# We use the same basic strategy in either case: what we want to do
# is flow all packets on the control net bridge through our
# EMULAB-ISOLATION chain. But we do return to the DOCKER-ISOLATION
# chain so that Docker rules can affect other Docker networks.
#
mysystem2("$IPTABLES -N EMULAB-ISOLATION");
mysystem("$IPTABLES -F EMULAB-ISOLATION");
mysystem("$IPTABLES -A EMULAB-ISOLATION -j RETURN");
mysystem("$IPTABLES -I FORWARD -j EMULAB-ISOLATION");
#
# Also, Docker handles MASQUERADING for us by default. We don't
# want to turn off Docker's iptables (it's on or off) functionality,
# because people should be able to bring up Docker VMs manually if
# they want, using the default Docker host network (or one of the
# experiment networks, if they safely manage IP addr assignment).
# However, as discussed above, we have to turn it off if it's not
# our modified version. So we have to add the MASQ rule if iptables
# is off in Docker.
#
# If this is a local testbed node, we want to allow unroutable
# packets on the control net. So we have to add local control net
# exceptions ahead of Docker's default MASQ-all rules.
#
if (!$ISREMOTENODE) {
my (undef,undef,$ctlmask,undef,$ctlnet,undef,undef) = findControlNet();
mysystem("$IPTABLES -t nat -I POSTROUTING".
" -s ${VCNET_NET}/${VCNET_SLASHMASK}".
" -d ${VCNET_NET}/${VCNET_SLASHMASK} -j ACCEPT");
mysystem("$IPTABLES -t nat -I POSTROUTING".
" -s ${VCNET_NET}/${VCNET_SLASHMASK}".
" -d ${ctlnet}/${ctlmask} -j ACCEPT");
if (!$ISOURDOCKER) {
mysystem("$IPTABLES -t nat -A POSTROUTING".
" -s ${VCNET_NET}/${VCNET_SLASHMASK}".
" -j MASQUERADE");
# Also do the default docker0 bridge CIDR, since Docker
# won't be doing it and we want temp user containers to
# work.
mysystem("$IPTABLES -t nat -A POSTROUTING".
" -s $DOCKER_DEFAULT_BRIDGE_CIDR".
" -j MASQUERADE");
}
}
#
# XXX: antispoofing! Can't do it with macvlan control net though.
#
# We also choose not to use the style here; instead, we are
# draconian and drop everything that comes from the vnode that does
# not have its IP. We do that later.
#
# We want to change the below code not to DROP on the FORWARD chain
# by default, but rather to drop anything that comes from a vnode's
# cnet iface that is not sourced from its assigned control net IP.
#
if (0) {
mysystem("$IPTABLES -P FORWARD DROP");
mysystem("$IPTABLES -F FORWARD");
# This says to forward traffic across the bridge.
mysystem("$IPTABLES -A FORWARD ".
"-m physdev --physdev-in $cnet_iface -j ACCEPT");
}
# For tunnels
if ($USE_OPENVSWITCH) {
mysystem("$MODPROBE openvswitch");
}
else {
mysystem("$MODPROBE ip_gre");
}
# For VLANs
mysystem("$MODPROBE 8021q");
# We need this stuff for traffic shaping -- only root context can
# modprobe.
mysystem("$MODPROBE sch_netem");
mysystem("$MODPROBE sch_htb");
# start up open vswitch stuff.
if ($USE_OPENVSWITCH) {
# For tunnels
mysystem("$OVSSTART --delete-bridges start");
}
# For bandwidth constraints.
mysystem("$MODPROBE ifb");
# Create a DB to manage them.
my %MDB;
if (!dbmopen(%MDB, $IFBDB, 0660)) {
print STDERR "*** Could not create $IFBDB\n";
TBScriptUnlock();
return -1;
}
dbmclose(%MDB);
#
# Ensure that LVM is loaded in the kernel and ready.
#
if ($USE_LVM) {
# There are several reasons we might need a Docker restart in
# this LVM setup bit; they will be noted along the way, and we
# will restart if necessary.
my $needdockerrestart = 0;
#
# Sets up our PVs and VG ($VGNAME).
#
setupLVM();
#
# Figure out how big various volumes should be.
#
# If we are using the aufs storage backend for Docker, we want
# most of our space in $EXTRAFS (since /var/lib/docker gets
# symlinked there, our heaviest space usage may be there); in
# that case, we save a ~10%VG buffer of free space. Wild guess.
#
# If we are instead using the devicemapper direct-lvm backend,
# we need both $EXTRAFS and $INFOFS, but we also need a beefy
# thinpool for Docker. In this case, we use max(5GB,3%VG) LV
# for $INFOFS; use min(32GB,15%remainingVG) for the $EXTRAFS;
# then we provision the thin pool with 90% of the remaining
# space (i.e., 0.90*(totalVG - sizeof($EXTRAFS) -
# sizeof($INFOFS))). This results in at least some spare space
# in case some heavy usage happens, for autoextension of the
# thinpool. And we could even consider garbage-collecting
# context build dirs in $EXTRAFS and downsizing that so that the
# thin pool can grow more, for instance on a shared host, if
# necessary.
#
my ($extrasize,$infosize,$thinpoolsize) = (0,0,0);
my $vgsize = lvmVGSize($VGNAME);
my $remaining = $vgsize;
if (!$USE_DOCKER_LVM) {
# We will only create $EXTRAFS and $INFOFS.
if (0.03 * $remaining < 5) {
$infosize = 0.03 * $remaining;
}
else {
$infosize = 5;
}
$remaining -= $infosize;
$extrasize = 0.90 * $remaining;
$remaining -= $extrasize;
}
else {
# We will create $EXTRAFS and $INFOFS, as well as the Docker
# thin pool.
if (0.03 * $remaining < 5) {
$infosize = 0.03 * $remaining;
}
else {
$infosize = 5;
}
$remaining -= $infosize;
if (0.15 * $remaining < 32) {
$extrasize = 0.15 * $remaining;
}
else {
$extrasize = 32;
}
$remaining -= $extrasize;
$thinpoolsize = 0.90 * $remaining;
$remaining -= $thinpoolsize;
}
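#
# A worked example (numbers rounded, purely illustrative): on a 144GB
# VG with $USE_DOCKER_LVM set, the code above yields roughly a 4.3GB
# $INFOFS, a 21GB $EXTRAFS, a 107GB thin pool, and about 12GB of the VG
# left unallocated for thin pool autoextension.
#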
my $tmplvname;
if ($INFOFS =~ /\/(.*)$/) {
$tmplvname = $1;
}
if (!libvnode::lvExists($VGNAME,$tmplvname)) {
print "Creating container info FS ...\n";
if (createExtraFS($INFOFS, $VGNAME, "${infosize}G")) {
TBScriptUnlock();
return -1;
}
}
if ($EXTRAFS =~ /\/(.*)$/) {
$tmplvname = $1;
}
if (!libvnode::lvExists($VGNAME,$tmplvname)) {
print "Creating scratch FS ...\n";
my $already = 0;
if (-d $EXTRAFS) {
$already = 1;
mysystem("mv $EXTRAFS ${EXTRAFS}.bak");
}
if (createExtraFS($EXTRAFS, $VGNAME, "${extrasize}G")) {
TBScriptUnlock();
return -1;
}
if ($already) {
my @files = glob("${EXTRAFS}.bak/*");
foreach my $file (@files) {
my $base = basename($file);
mysystem("/bin/mv $file $EXTRAFS")
if (! -e "$EXTRAFS/$base");
}
mysystem("/bin/rm -rf ${EXTRAFS}.bak");
}
}
if ($USE_DOCKER_LVM && !libvnode::lvExists($VGNAME,"thinpool")) {
print "Creating Docker Thin Pool...\n";
#
# Docker wants a thinpool and a metadata pool. Size of the
# metadata pool cannot exceed 16GB. So we create that as
# min(16,0.01*$thinpoolsize).
#
my ($tps,$tpms) = (0,0);
if (0.01 * $thinpoolsize < 16) {
$tpms = 0.01 * $thinpoolsize;
}
else {
$tpms = 16;
}
$tps = $thinpoolsize - $tpms;
# XXX: --wipesignatures y ?
mysystem("lvcreate -n thinpool $VGNAME -L ${tps}G");
mysystem("lvcreate -n thinpoolmeta $VGNAME -L ${tpms}G");
mysystem("lvconvert -y --zero n -c 512K".
" --thinpool $VGNAME/thinpool".
" --poolmetadata $VGNAME/thinpoolmeta");
mkdir("/etc/lvm/profile");
open(FD,">/etc/lvm/profile/$VGNAME-thinpool.profile")
or fatal("could not open /etc/lvm/profile/$VGNAME-thinpool.profile: $@");
print FD "activation {\n".
" thin_pool_autoextend_threshold=90\n".
" thin_pool_autoextend_percent=10\n".
"}\n";
close(FD);
mysystem("lvchange --metadataprofile $VGNAME-thinpool".
" $VGNAME/thinpool");
mysystem("lvs -o+seg_monitor");
#
# Setup the Docker devicemapper direct-lvm storage backend.
# { "storage-driver": "devicemapper",
# "storage-opts": [
# "dm.thinpooldev=/dev/mapper/docker-thinpool",
# "dm.use_deferred_removal=true",
# "dm.use_deferred_deletion=true" ] }
#
my $origjsontext = '';
my $json = {};
if (-e "/etc/docker/daemon.json") {
open(FD,"/etc/docker/daemon.json")
or die("could not open /etc/docker/daemon.json: $!");
my @lines = <FD>;
close(FD);
$origjsontext = join("",@lines);
$json = decode_json($origjsontext);
}
# If it exists, just delete it; we only want valid stuff in here.
if (defined($json->{"storage-driver"})) {
delete($json->{"storage-driver"});
}
if (defined($json->{"storage-opts"})) {
delete($json->{"storage-opts"});
}
# Write our config.
# Don't restart docker; that happens at the end of $USE_LVM.
$needdockerrestart = 1;
$json->{"storage-driver"} = "devicemapper";
$json->{"storage-opts"} = [
"dm.thinpooldev=/dev/mapper/${VGNAME}-thinpool",
"dm.use_deferred_removal=true",
"dm.use_deferred_deletion=true"
];
TBDebugTimeStamp("Updating /etc/docker/daemon.json");
my $newjsontext = encode_json($json);
open(FD,">/etc/docker/daemon.json")
or die("could not write /etc/docker/daemon.json: $!");
print FD $newjsontext;
close(FD);
}
if (! -l $VMS) {
#
# We need this stuff to be sticky across reloads, so move it
# into an lvm. If we lose the lvm, well then we are screwed.
#
my @files = glob("$VMS/*");
foreach my $file (@files) {
my $base = basename($file);
mysystem("/bin/mv $file $INFOFS")
if (! -e "$INFOFS/$base");
}
mysystem("/bin/rm -rf $VMS");
mysystem("/bin/ln -s $INFOFS $VMS");
}
if (! -l '/var/lib/docker') {
# Make sure Docker is stopped before we do this, if it
# wasn't stopped above already!
mysystem2("systemctl stop docker.service");
$needdockerrestart = 1;
if ($?) {
warn("could not stop docker service before moving".
" /var/lib/docker to LVM; aborting!");
TBScriptUnlock();
return -1;
}
mysystem2("mount -t aufs | grep /var/lib/docker/");
if ($? == 0) {
warn("filesystems still mounted in /var/lib/docker; aborting!");
TBScriptUnlock();
return -1;
}
mkdir("$EXTRAFS/var.lib.docker");
#
# We need this stuff to be sticky across reloads, so move it
# into an lvm. If we lose the lvm, well then we are screwed.
#
my @files = glob("/var/lib/docker/*");
foreach my $file (@files) {
my $base = basename($file);
mysystem("/bin/mv $file $EXTRAFS/var.lib.docker")
if (! -e "$EXTRAFS/var.lib.docker/$base");
}
mysystem("/bin/rm -rf /var/lib/docker");
mysystem("/bin/ln -s $EXTRAFS/var.lib.docker /var/lib/docker");
}
if ($needdockerrestart) {
mysystem2("systemctl restart docker.service");
if ($?) {
warn("could not restart docker service after LVM setup; aborting!");
TBScriptUnlock();
return -1;
}
}
#
# Check the $DOCKERCNET again after LVM setup... if the move of
# /var/lib/docker fails, all Docker state (including
# $DOCKERCNET) will appear to have vanished!
#
TBDebugTimeStamp("checking docker network $DOCKERCNET after LVM move");
($code,$content,$resp) = getClient()->network_inspect($DOCKERCNET);
if ($code) {
fatal("$DOCKERCNET still does not appear as a Docker network;".
" something must have gone wrong in LVM setup!\n");
}
}
else {
mkdir($VMS);
mkdir($INFOFS);
mkdir($EXTRAFS);
}
#
# Make sure IP forwarding is enabled on the host
#
mysystem2("$SYSCTL -w net.ipv4.conf.all.forwarding=1");
#
# Increase socket buffer size for frisbee download of images.
#
mysystem2("$SYSCTL -w net.core.rmem_max=1048576");
mysystem2("$SYSCTL -w net.core.wmem_max=1048576");
#
# Need these to avoid overflowing the NAT tables.
#
mysystem2("$MODPROBE nf_conntrack");
if ($?) {
print STDERR "ERROR: could not load nf_conntrack module!\n";
TBScriptUnlock();
return -1;
}
mysystem2("$SYSCTL -w ".
" net.netfilter.nf_conntrack_generic_timeout=120");
mysystem2("$SYSCTL -w ".
" net.netfilter.nf_conntrack_tcp_timeout_established=54000");
mysystem2("$SYSCTL -w ".
" net.netfilter.nf_conntrack_max=131071");
mysystem2("echo 16384 > /sys/module/nf_conntrack/parameters/hashsize");
# These might fail on new kernels.
mysystem2("$SYSCTL -w ".
" net.ipv4.netfilter.ip_conntrack_generic_timeout=120");
mysystem2("$SYSCTL -w ".
" net.ipv4.netfilter.ip_conntrack_tcp_timeout_established=54000");
#
# Clone the emulab and pubsub src repos. Make other dirs.
#
mkdir($CONTEXTDIR);
if (! -d $EMULABSRC) {
mysystem("git clone https://gitlab.flux.utah.edu/emulab/emulab-devel".
" $EMULABSRC");
}
if (! -d $PUBSUBSRC) {
mysystem("git clone https://gitlab.flux.utah.edu/emulab/pubsub".
" $PUBSUBSRC");
}
if (! -d $RUNITSRC) {
mysystem("git clone https://gitlab.flux.utah.edu/emulab/runit".
" $RUNITSRC");
}
# We're done; mark it.
mysystem("touch $READYFILE");
TBDebugTimeStamp(" releasing global lock")
if ($lockdebug);
TBScriptUnlock();
return 0;
}
#
# Prepare any network stuff in the root context for a specific vnode.
# Run once at boot/create, or at reconfigure. For Docker, this consists
# of creating bridges and/or macvlans, configuring them as necessary,
# and binding them to Docker networks.
#
# NOTE: This function must clean up any side effects if it fails partway.
#
sub rootPreConfigNetwork($$$$)
{
my ($vnode_id, undef, $vnconfig, $private) = @_;
my ($code,$content,$resp);
TBDebugTimeStamp("rootPreConfigNetwork: grabbing global lock".
" $GLOBAL_CONF_LOCK")
if ($lockdebug);
if (TBScriptLock($GLOBAL_CONF_LOCK,
TBSCRIPTLOCK_INTERRUPTIBLE(), 900) != TBSCRIPTLOCK_OKAY()){
print STDERR "Could not get the global lock!\n";
return -1;
}
TBDebugTimeStamp(" got global lock")
if ($lockdebug);
#
# If we blocked, it would be because vnodes have come or gone,
# so we need to rebuild the maps.
#
# It is important that we do this once we have the global lock! Our
# cleanup code in bad: depends on us having the lock before we call
# this.
#
refreshNetworkDeviceMaps();
my $vmid;
if ($vnode_id =~ /^[-\w]+\-(\d+)$/) {
$vmid = $1;
}
else {
print STDERR "vz_rootPreConfigNetwork: bad vnode_id $vnode_id, aborting!";
goto badbad;
}
my @node_ifs = @{ $vnconfig->{'ifconfig'} };
my @node_lds = @{ $vnconfig->{'ldconfig'} };
#
# If we're using veths, figure out what bridges we need to make:
# we need a bridge for each physical iface that is a multiplex pipe,
# and one for each VTAG given PMAC=none (i.e., host containing both sides
# of a link, or an entire lan).
#
my %brs = ();
my $prefix;
if ($USE_MACVLAN) {
$prefix = "mv";
}
else {
$prefix = "br";
}
foreach my $ifc (@node_ifs) {
# XXX
#next if (!$ifc->{ISVIRT});
print "$vnode_id interface " . Dumper($ifc) . "\n"
if ($debug > 1);
#
# In the era of shared nodes, we cannot name the bridges
# using experiment local names (e.g., the link name).
# Bridges are now named after either the physical interface
# they are associated with or the "tag" if there is no physical
# interface.
#
my $brname;
my $physdev;
if ($ifc->{ITYPE} eq "loop") {
my $vtag = $ifc->{VTAG};
#
# No physical device. It's a loopback (trivial) link/lan
# All we need is a common bridge to put the veth ifaces into,
# or a dummy device to host the macvlan devices on.
#
$physdev = $brname = "${prefix}$vtag";
$brs{$brname}{ENCAP} = 0;
$brs{$brname}{SHORT} = 0;
}
elsif ($ifc->{ITYPE} eq "vlan") {
my $iface = $ifc->{IFACE};
my $vtag = $ifc->{VTAG};
my $vdev = "${iface}.${vtag}";
if (! -d "/sys/class/net/$vdev") {
mysystem2("$VLANCONFIG set_name_type DEV_PLUS_VID_NO_PAD");
mysystem2("$VLANCONFIG add $iface $vtag");
goto bad
if ($?);
mysystem2("$VLANCONFIG set_name_type VLAN_PLUS_VID_NO_PAD");
#
# We do not want the vlan device to have the same
# mac as the physical device, since that will confuse
# findif later.
#
my $bmac = fixupMac(GenFakeMac());
mysystem2("$IP link set $vdev address $bmac");
goto bad
if ($?);
mysystem2("$IFCONFIG $vdev up");
# XXX
#mysystem2("$ETHTOOL -K $vdev tso off gso off");
refreshNetworkDeviceMaps();
# XXX
# Another thing that seems to screw up, causing the ciscos
# to drop packets with an undersize error.
#mysystem2("$ETHTOOL -K $iface txvlan off");
}
# XXX
# Temporary, to get existing devices after upgrade.
#mysystem2("$ETHTOOL -K $vdev tso off gso off");
$physdev = $vdev;
$brname = $prefix . $vdev;
# We save this so we can garbage-collect it in vnodeDestroy.
# But we don't remove it here if there's a failure.
$private->{'vlandevs'}->{$brname} = $vdev;
$brs{$brname}{ENCAP} = 1;
$brs{$brname}{SHORT} = 0;
$brs{$brname}{PHYSDEV} = $vdev;
$brs{$brname}{IFC} = $ifc;
}
#
# These final two cases should only be ITYPE==veth .
# We will never see a veth on a shared node, thus they
# have already been created during the physnode config.
#
elsif ($ifc->{PMAC} eq "none") {
$physdev = $brname = $prefix . $ifc->{VTAG};
# if no PMAC, we don't need encap on the bridge
$brs{$brname}{ENCAP} = 0;
# count members below so we can figure out if this is a shorty
$brs{$brname}{MEMBERS} = 0;
}
else {
my $iface = findIface($ifc->{PMAC});
$physdev = $iface;
$brname = $prefix . $iface;
$brs{$brname}{ENCAP} = 1;
$brs{$brname}{SHORT} = 0;
$brs{$brname}{IFC} = $ifc;
$brs{$brname}{PHYSDEV} = $iface;
}
# Stash for later phase.
$ifc->{'PHYSDEV'} = $physdev
if (defined($physdev));
$ifc->{'BRIDGE'} = $brname
if (defined($brname));
#
# Docker networks require a subnet (and a gateway; i.e.
# https://github.com/docker/libnetwork/issues/1447#issuecomment-247368397).
# This gateway assumption appears built in to Docker at abstract
# levels, and thus would take significant patching to work
# around. So we don't do that. Instead we have a hack; see below.
#
# Anyway, we have to extract and save off the cidr/gateway bits so that
# when we create the Docker network, we have what we need.
#
# XXX: this of course won't work for shared nodes with
# overlapping exp net subnets! Docker/libnetwork has an
# incredibly limited network model; it's ridiculous.
#
if (exists($ifc->{IPMASK}) && exists($ifc->{IPADDR})) {
# Figure out the subnet for this network:
my $netaddr = inet_aton($ifc->{IPADDR}) & inet_aton($ifc->{IPMASK});
my $maskbits = 0;
foreach my $octet (split(/\./,$ifc->{IPMASK})) {
my $cval = int($octet);
for (my $i = 0; $i < 8; ++$i) {
$maskbits += $cval & 1;
$cval = $cval >> 1;
}
}
$brs{$brname}{CIDR} = inet_ntoa($netaddr) . "/$maskbits";
#
# NB XXX: Use the final non-broadcast address in the subnet as the
# gateway. (I considered using the penultimate address, to
# assume that some manually-assigning users will take the
# final non-broadcast address, but that is just a grosser
# hack -- we'll just document this). Obviously, if this
# address was used/assigned by Emulab or the user, that
# container will fail to boot! I could check this in the
# single experiment case, but I'm not sure how to check for
# a shared LAN. Anyway, we'll just document this too...
#
my $bcast = ~inet_aton($ifc->{IPMASK});
$brs{$brname}{GW} =
inet_ntoa($netaddr | pack("N",unpack("N",$bcast) - 1));
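#
# Worked example (hypothetical addresses): IPADDR 10.1.1.2 with
# IPMASK 255.255.255.0 gives netaddr 10.1.1.0 and maskbits 24, so
# CIDR "10.1.1.0/24"; the broadcast is 10.1.1.255, so the GW we
# hand to Docker is 10.1.1.254.
#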
}
else {
warn("Fatal: all Docker network interfaces *must* have an".
" IP address and subnet; aborting!");
goto bad;
}
}
#
# Make bridges and add phys ifaces.
#
# Or, in the macvlan case, create a dummy device if there is no
# underlying physdev to "host" the macvlan.
#
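#
# For a non-macvlan link on vlan device eth2.333, the loop below is
# roughly equivalent to this (hypothetical) shell sequence; the real
# work goes through addbr()/addbrif() and the Docker API wrappers,
# which presumably bind the new Docker network to our existing
# bridge rather than letting Docker create its own:
#
#   brctl addbr breth2.333
#   ip link set breth2.333 address <fresh fake MAC>
#   ifconfig breth2.333 0 up
#   brctl addif breth2.333 eth2.333
#   docker network create -d bridge \
#       -o com.docker.network.bridge.name=breth2.333 \
#       --subnet 10.1.1.0/24 --gateway 10.1.1.254 breth2.333
#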
foreach my $k (keys(%brs)) {
my $cidr = $brs{$k}{CIDR};
my $gw = $brs{$k}{GW};
if (!$USE_MACVLAN) {
#
# This bridge might be shared with other containers, so difficult
# to delete. This really only matters on shared nodes though, where
# bridges and vlans could stack up forever (or just a long time).
#
if (! -d "/sys/class/net/$k/bridge") {
addbr($k);
goto bad
if ($?);
#
# Bad feature of bridges; they take on the lowest numbered
# mac of the added interfaces (and it changes as interfaces
# are added and removed!). But the main point is that we end
# up with a bridge that has the same mac as a physical device
# and that screws up findIface(). But if we "assign" a mac
# address, it does not change and we know it will be unique.
#
my $bmac = fixupMac(GenFakeMac());
mysystem2("$IP link set $k address $bmac");
goto bad
if ($?);
}
# record bridge used
$private->{'physbridges'}->{$k} = $k;
# repetitions of this should not hurt anything
mysystem2("$IFCONFIG $k 0 up");
#
# Add a physical interface to the bridge if necessary.
#
if (exists($brs{$k}{PHYSDEV})) {
my $physdev = $brs{$k}{PHYSDEV};
#
# This interface should not be a member of another bridge.
# If it is, it is an error.
#
# Continuing the comment above, this bridge and this interface
# might be shared with other containers, so we cannot remove it
# unless it is the only one left.
#
my $obr = findBridge($physdev);
if (defined($obr) && $obr ne $k) {
# Avoid removing the device from the bridge if it
# is in the correct bridge.
delbrif($obr, $physdev);
goto bad
if ($?);
$obr = undef;
}
if (!defined($obr)) {
addbrif($k, $physdev);
goto bad
if ($?);
# rebuild hashes
makeBridgeMaps();
}
$private->{'physbridgeifaces'}->{$k}->{$physdev} = $physdev;
}
#
# Now that the bridge exists, make the Docker network atop it.
#
TBDebugTimeStamp("checking existence of docker network $k");
($code,$content,$resp) = getClient()->network_inspect($k);
if ($code) {
TBDebugTimeStamp("creating docker network $k");
($code,$content,$resp) = getClient()->network_create_bridge(
$k,$cidr,$gw,$k);
goto bad
if ($code);
}
$private->{'dockernets'}->{$k} = $k;
}
else {
my $basedev;
#
# If there's a physical device, build the macvlan atop
# that. Otherwise, need to create a dummy device to
# "host" the macvlan ports.
#
if (exists($brs{$k}{PHYSDEV})) {
$basedev = $brs{$k}{PHYSDEV};
}
else {
$basedev = $k;
if (! -d "/sys/class/net/$k") {
mysystem2("$IP link add name $basedev type dummy");
goto bad
if ($?);
}
# record dummy used
$private->{'dummys'}->{$k} = $basedev;
}
#
# Make the docker network if necessary.
#
TBDebugTimeStamp("checking existence of docker network $k");
($code,$content,$resp) = getClient()->network_inspect($k);
if ($code) {
# Now that the dummy device exists, make the Docker
# network atop it.
TBDebugTimeStamp("creating docker network $k");
($code,$content,$resp) = getClient()->network_create_macvlan(
$k,$cidr,$gw,$basedev);
goto bad
if ($code);
}
$private->{'dockernets'}->{$k} = $k;
}
}
#
# We can handle linkdelays in two combinations.
#
# First, if we're not using macvlans and are using bridges, we place
# a qdisc on the veth in the root context to handle egress shaping;
# and we bind an IFB device to the veth and redirect ingress packets
# to it, and place an egress qdisc on it, to handle ingress shaping.
#
# Second, the only way to do this with macvlans is to place the
# qdiscs *in* the container, and to move an IFB into the container's
# network namespace. This is only secure for shared vnodes IFF the
# container is unprivileged (not real root), and if it does not have
# CAP_NET_ADMIN -- given both those restrictions, the user cannot
# remove the traffic shaping inside the container. On a shared
# node, of course our containers are deprivileged, but they ought to
# have CAP_NET_ADMIN and CAP_NET_RAW.
#
# This is unfortunate; we would like to use macvlans; but we prefer
# to use the same mechanism for both the dedicated and shared vnode
# case -- thus we use bridges unless requested to use macvlans.
#
#
# IFBs are a little tricky. Once they are mapped into a container,
# we never get to see them again until the container is fully
# destroyed or until we explicitly unmap them from the container.
# We also want to hang onto them so we do not get into a situation
# where we stopped to take a disk image, and then cannot start
# again cause we ran out of resources (shared nodes). So, we have
# to look for IFBs that are already allocated to the
# container. See the allocate routines, which make use of the tag.
#
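#
# Sketch of what the generated shaping script amounts to for one
# shaped veth in the bridge case (hypothetical device names; the real
# commands are emitted by CreateShapingScripts):
#
#   # egress shaping directly on the root-context veth
#   tc qdisc add dev veth123 root handle 1: htb default 1
#   # ingress: redirect incoming packets to an IFB and shape there
#   ip link set ifb3 up
#   tc qdisc add dev veth123 handle ffff: ingress
#   tc filter add dev veth123 parent ffff: protocol ip u32 \
#       match u32 0 0 action mirred egress redirect dev ifb3
#   tc qdisc add dev ifb3 root handle 1: htb default 1
#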
if (@node_lds) {
my $ifbs = AllocateIFBs($vmid, \@node_lds, $private);
goto bad
if (!(defined($ifbs)));
foreach my $ldc (@node_lds) {
my $tag = "$vnode_id:" . $ldc->{'LINKNAME'};
my $ifb = pop(@$ifbs);
$private->{'ifbs'}->{$ifb} = $tag;
# Stash for later.
$ldc->{'IFB'} = $ifb;
}
CreateShapingScripts($vnode_id,$private,\@node_ifs,\@node_lds);
}
# Setup our routing stuff.
CreateRoutingScripts($vnode_id,$private);
TBDebugTimeStamp(" releasing global lock")
if ($lockdebug);
TBScriptUnlock();
#
# Let vnodesetup exit early, so that bootvnodes delays minimally per
# vnode. I guess we figure if we can get through this call, we've
# made it through the obvious failures in the overall preparatory
# code in mkvnode.pl.
#
if ($vsrelease eq "immediate") {
TBDebugTimeStamp("rootPreConfigNetwork: touching $VMS/$vnode_id/running");
mysystem2("touch $VMS/$vnode_id/running");
}
return 0;
bad:
#
# Unwind anything we did.
#
# Remove any Docker networks we would have used that are unused.
if (exists($private->{'dockernets'})) {
foreach my $name (keys(%{ $private->{'dockernets'} })) {
my @members = getDockerNetMemberIds($name);
if (@members == 0) {
TBDebugTimeStamp("removing docker network $name");
($code,) = getClient()->network_delete($name);
delete($private->{'dockernets'}->{$name})
if (!$code);
}
}
}
# Delete bridges we would have used that are unused. If only the
# physdevices we added to the bridge are in the bridge, remove them,
# then remove the bridge.
if (exists($private->{'physbridges'})) {
foreach my $brname (keys(%{ $private->{'physbridges'} })) {
my @ifaces = findBridgeIfaces($brname);
if (@ifaces == 0) {
TBDebugTimeStamp("removing unused $brname");
mysystem2("$IFCONFIG $brname down");
delbr($brname);
if (!$?) {
delete($private->{'physbridges'}->{$brname});
delete($private->{'physbridgeifaces'}->{$brname});
}
}
elsif (exists($private->{'physbridgeifaces'}->{$brname})) {
#
# Check for anything other than the physbridgeifaces we
# would have added to this bridge; if only those are in
# the bridge, remove them, then remove the bridge.
#
my %ifm = ();
foreach my $ifc (@ifaces) {
$ifm{$ifc} = 1;
}
foreach my $physiface (keys(%{$private->{'physbridgeifaces'}->{$brname}})) {
delete($ifm{$physiface})
if (exists($ifm{$physiface}));
}
# If only the physifaces were left in the bridge, nuke
# them all, then dump the bridge.
if (keys(%ifm) == 0) {
foreach my $ifc (@ifaces) {
TBDebugTimeStamp("removing $ifc from unused $brname");
delbrif($brname,$ifc);
delete($private->{'physbridgeifaces'}->{$brname}->{$ifc});
}
TBDebugTimeStamp("removing unused $brname");
mysystem2("$IFCONFIG $brname down");
delbr($brname);
if (!$?) {
delete($private->{'physbridges'}->{$brname});
delete($private->{'physbridgeifaces'}->{$brname});
}
}
}
}
}
# Delete the dummy macvlan thingies we would have used, if no
# one else is using them.
if (exists($private->{'dummys'})) {
foreach my $brname (keys(%{ $private->{'dummys'} })) {
my @mvs = findMacvlanIfaces($private->{'dummys'}->{$brname});
if (@mvs == 0) {
mysystem2("$IP link del dev $brname");
delete($private->{'dummys'}->{$brname})
if ($?);
}
}
}
# Delete any vlan devices we would have used, if no one else is
# using them (i.e. if they are not in a bridge, and are not a parent
# of any other macvlan devices).
if (exists($private->{'vlandevs'})) {
foreach my $brname (keys(%{ $private->{'vlandevs'} })) {
my $brv = findBridge($private->{'dummys'}->{$brname});
my @mvs = findMacvlanIfaces($private->{'dummys'}->{$brname});
if (!defined($brv) && @mvs == 0) {
mysystem2("$IP link del dev $brname");
delete($private->{'vlandevs'}->{$brname})
if ($?);
}
}
}
# This shouldn't matter, but let's be complete; we might've deleted
# some bridges and interfaces.
refreshNetworkDeviceMaps();
# Release the IFBs
ReleaseIFBs($vmid, $private)
if (exists($private->{'ifbs'}));
badbad:
TBScriptUnlock();
return -1;
}
#
# Create the basic context for the VM and give it a unique ID for identifying
# "internal" state. If $raref is set, then we are in a RELOAD state machine
# and need to walk the appropriate states.
#
sub vnodeCreate($$$$)
{
my ($vnode_id, undef, $vnconfig, $private) = @_;
my $attributes = $vnconfig->{'attributes'};
my $imagename = $vnconfig->{'image'};
my $inreload = defined($imagename) ? 1 : 0;
my $raref = $vnconfig->{'reloadinfo'};
my $vninfo = $private;
my %mounts = ();
my $imagemetadata;
my $lvname;
my $rc;
my $err = undef;
my ($code,$content,$resp);
my $vmid;
if ($vnode_id =~ /^[-\w]+\-(\d+)$/) {
$vmid = $1;
}
else {
fatal("vnodeCreate: bad vnode_id $vnode_id!");
}
$vninfo->{'vmid'} = $vmid;
my ($host_iface,$host_ip,$host_mask,$host_maskbits,$host_net,
$host_mac,$host_gw) = findControlNet();
if (defined($raref)) {
$raref = $raref->[0];
$inreload = 1;
}
#
# Figure out where/what we're pulling, and a username/password if
# necessary.
#
my ($user,$pass);
if ((!$imagename || $imagename =~ /^emulab-ops-emulab-ops-DOCKER-EXT/)
&& exists($attributes->{'DOCKER_EXTIMAGE'})) {
$imagename = $attributes->{'DOCKER_EXTIMAGE'};
if (exists($attributes->{'DOCKER_EXTUSER'})) {
$user = $attributes->{'DOCKER_EXTUSER'};
}
if (exists($attributes->{'DOCKER_EXTPASS'})) {
$pass = $attributes->{'DOCKER_EXTPASS'};
}
}
elsif ($inreload) {
# For local reloads, username is physical host shortname;
# password is the eventkey.
open(FD,"$BOOTDIR/nodeid")
or die("open($BOOTDIR/nodeid): $!");
$user = <FD>;
chomp($user);
close(FD);
open(FD,"$BOOTDIR/eventkey")
or die("open($BOOTDIR/eventkey): $!");
$pass = <FD>;
chomp($pass);
close(FD);
print "raref:" . Dumper($raref) . "\n";
if (!exists($raref->{"PATH"}) || !$raref->{"PATH"}) {
fatal("reload specified, but not external image, and no image PATH!");
}
$imagename = $raref->{"PATH"};
}
else {
$imagename = $defaultImage{'name'};
}
#
# XXX future optimization possibility.
#
# Try to be smart about holding the vnode creation lock which is not
# a single lock, but rather a small set of locks intended to limit
# concurrency in the vnode creation process. Specifically, if we grab
# a create_vnode lock and then block waiting for our image lock, then
# we might prevent someone else (using a different image) from making
# progress. So we could instead: grab a create_vnode lock, make a short
# attempt (5-10 seconds) to grab the image lock and, failing that, back
# off of the create_vnode lock, wait and then try the whole process again.
#
# The problem is that we may block again down in downloadOneImage when
# we try to grab the image lock exclusively. Not sure we can back all
# the way out easily in that case!
#
# This is also a bit of a de-optimization when we have a set of vnodes
# all using the same image. We just cause a bit of excess context
# switching in that (probably more common) case.
#
if (CreateVnodeLock() != 0) {
fatal("CreateVnodeLock()");
}
if ($inreload) {
# No real difference for us here; RELOADING has a longer timeout too.
libutil::setState("RELOADSETUP");
libutil::setState("RELOADING");
}
my ($newimagename,$newcreateargs,$newcmd,$newization);
$rc = setupImage($vnode_id,$vnconfig,$private,$imagename,$user,$pass,
\$newimagename,\$newcreateargs,\$newcmd,\$newization);
if ($rc) {
libutil::setState("RELOADFAILED");
fatal("Failed to setup $imagename for $vnode_id; aborting!");
}
$private->{'emulabization'} = $newization;
if ($inreload) {
libutil::setState("RELOADDONE");
# XXX why do we need to wait for this to take effect?
TBDebugTimeStamp("waiting 4 sec after asserting RELOADDONE...");
sleep(4);
#
# Finish off the state transitions as necessary.
#
libutil::setState("SHUTDOWN");
}
CreateVnodeUnlock();
#
# Make sure all the physical NFS mounts we're going to bind mount
# are in place.
#
addMounts($vnode_id,\%mounts);
#
# Start building the 'docker create' args.
# (NB: see note below about why we have to put the container on the
# network right away!)
#
my %args = ( "Tty" => JSON::PP::true,"Image" => $newimagename );
# XXX: I wonder if not all containers will want this, but who knows.
$args{'AttachStdin'} = JSON::PP::true;
$args{'AttachStdout'} = JSON::PP::true;
$args{'AttachStderr'} = JSON::PP::true;
$args{'OpenStdin'} = JSON::PP::true;
my @hostspairs = ();
genhostspairlist($vnode_id,\@hostspairs);
if (@hostspairs) {
$args{"HostConfig"}{"ExtraHosts"} = \@hostspairs;
}
#
# Add NFS mounts.
#
$args{"HostConfig"}{"Binds"} = [];
foreach my $path (values(%mounts)) {
my $bind = "${path}:${path}";
if ($NFS_MOUNTS_READONLY) {
$bind .= ":ro";
}
push(@{$args{"HostConfig"}{"Binds"}},$bind);
}
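#
# E.g., an NFS mount of /proj/testbed (hypothetical path) becomes the
# bind spec "/proj/testbed:/proj/testbed:ro" when $NFS_MOUNTS_READONLY
# is set, so the container sees it at the same path as the host.
#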
#
# Add some Emulab-specific mount points that contain information:
# /var/emulab/boot/tmcc (populated from the host's tmcc.<vnode_id> cache).
#
my $mntdir = CONFDIR()."/mountpoints";
for my $dir ("$mntdir","$mntdir/var.emulab",
"$mntdir/var.emulab/boot","$mntdir/var.emulab/boot/tmcc",
"$mntdir/var.emulab/db","$mntdir/var.emulab/logs",
"$mntdir/var.emulab/lock") {
mkdir($dir);
}
if ($newization >= DOCKER_EMULABIZE_CORE()) {
my ($boss_name,$boss_ip) = tmccbossinfo();
open(FD,">$mntdir/bossnode");
print FD "$boss_name\n";
close(FD);
push(@{$args{"HostConfig"}{"Binds"}},"$mntdir/bossnode:/etc/emulab/bossnode:ro");
}
# Populate the tmcc info.
mysystem2("rsync -a /var/emulab/boot/tmcc.$vnode_id/".
" $mntdir/var.emulab/boot/tmcc/");
push(@{$args{"HostConfig"}{"Binds"}},"$mntdir/var.emulab:/var/emulab:rw");
#
# Let the inside clientside know it is a GENVNODE(). NB: we do this
# as a read-only mount because the container removes it on reboot,
# and we don't want to have to rewrite it in time.
#
open(FD,">$mntdir/vmname")
or fatal("could not open $mntdir/vmname: $!");
print FD $vnode_id;
close(FD);
push(@{$args{"HostConfig"}{"Binds"}},
"$mntdir/vmname:/var/emulab/boot/vmname:ro");
#
# Tell the inside clientside which event server to use. NB: we do
# this as a read-only mount because the container removes it on
# reboot, and we don't want to have to rewrite it in time.
#
my $evip;
if (isRoutable($vnconfig->{'config'}->{'CTRLIP'})) {
$evip = $vnconfig->{'config'}->{'CTRLIP'};
}
else {
$evip = $host_ip;
}
open(FD,">$mntdir/localevserver")
or fatal("could not write $mntdir/localevserver: $!");
print FD "$evip";
close(FD);
push(@{$args{"HostConfig"}{"Binds"}},
"$mntdir/localevserver:/var/emulab/boot/localevserver:ro");
# Ugh, have to mount the certs into the container. We can't just
# mount over /etc/emulab entirely (well, we could, but it would not
# be safe; the clientside allows the user to persistently update
# stuff in that dir, so we don't want a mount over top what's in the
# image, even if it's writeable).
push(@{$args{"HostConfig"}{"Binds"}},
"/etc/emulab/client.pem:/etc/emulab/client.pem:ro");
push(@{$args{"HostConfig"}{"Binds"}},
"/etc/emulab/emulab.pem:/etc/emulab/emulab.pem:ro");
#
# We allow the server to tell us how many VCPUs to allocate to the
# guest.
#
my $cpus = 0;
if (exists($attributes->{'DOCKER_VCPUS'})
&& $attributes->{'DOCKER_VCPUS'} > 1) {
$cpus = $attributes->{'DOCKER_VCPUS'};
}
elsif (exists($attributes->{'VM_VCPUS'}) && $attributes->{'VM_VCPUS'} > 1) {
$cpus = $attributes->{'VM_VCPUS'};
}
if ($cpus > 0) {
#
# Docker on non-windows doesn't really support the notion of a
# whole VCPU (unless you pin specific CPUs to a container, which
# we don't want to do, cause it's more bookkeeping). So we
# emulate that with a combination of cpu period and cpu quota.
#
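# E.g., DOCKER_VCPUS=2 with the 100000us (100ms) period below allows
# 200000us of CPU time per period, i.e. two CPUs' worth of runtime.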
$args{"HostConfig"}{"CpuPeriod"} = 100000;
$args{"HostConfig"}{"CpuShares"} = 100000 * $cpus;
}
#
# Give the vnode some memory. The server usually tells us how much.
#
if (exists($attributes->{'DOCKER_MEMSIZE'})) {
# Better be MB. Docker wants bytes.
$args{"HostConfig"}{"Memory"} = \
$attributes->{'DOCKER_MEMSIZE'} * 1024 * 1024;
}
elsif (exists($attributes->{'VM_MEMSIZE'})) {
# Better be MB. Docker wants bytes.
$args{"HostConfig"}{"Memory"} = \
$attributes->{'VM_MEMSIZE'} * 1024 * 1024;
}
#
# Attach the node to the control network. NB: we would like to do
# this in vnodePreConfigControlNetwork, but with docker, if you
# specify --net=none as the initial "network", you cannot connect
# your container to any other networks after create. So we have to
# do that initial connection here!
#
my ($ctrlip,$ctrlmask) = ($vnconfig->{config}{CTRLIP},
$vnconfig->{config}{CTRLMASK});
my $ctrlmac = ipToMac($ctrlip);
my $ctrlnetwork = inet_ntoa(inet_aton($ctrlip) & inet_aton($ctrlmask));
my $fmac = fixupMac($ctrlmac);
my $maskbits = 0;
foreach my $octet (split(/\./,$ctrlmask)) {
my $cval = int($octet);
for (my $i = 0; $i < 8; ++$i) {
$maskbits += $cval & 1;
$cval = $cval >> 1;
}
}
#
# Need the domain, but there is no consistent way to get it. Ask tmcc for the
# boss node and parse out the domain.
#
my ($DOMAINNAME,$BOSSIP) = tmccbossinfo();
die("Could not get bossname from tmcc!")
if (!defined($DOMAINNAME));
if ($DOMAINNAME =~ /^[-\w]+\.(.*)$/) {
$DOMAINNAME = $1;
}
else {
$err = "Could not parse domain name!";
goto bad;
}
my ($pid, $eid, $vname) = check_nickname();
my $longdomain = "${eid}.${pid}.${DOMAINNAME}";
my $shortdomain = `cat /var/emulab/boot/mydomain`;
chomp($shortdomain);
my %cnetconfig = (
"IPAMConfig" => { "IPv4Address" => $ctrlip},
"MacAddress" => $fmac
);
$args{"NetworkingConfig"}{"EndpointsConfig"}{$DOCKERCNET} = \%cnetconfig;
$args{"Hostname"} = "$vname.$longdomain";
#
# NB XXX: apparently --dns-search *does* work, but not --dns, when
# you are using user-defined networks, or something. Anyway, Docker
# stuffs a 127.0.0.11 nameserver into /etc/resolv.conf even when
# --dns is specified, so until that changes, I just mount the host's
# resolv.conf into place. Kill me now...
#
$args{"HostConfig"}{"DnsSearch"} = [ $shortdomain ];
$args{"HostConfig"}{"Dns"} = [ $BOSSIP ];
push(@{$args{"HostConfig"}{"Binds"}},
"/etc/resolv.conf:/etc/resolv.conf:ro");
#
# Tell the clientside in the VM what kind of machine it is.
#
push(@{$args{"HostConfig"}{"Binds"}},
"/etc/emulab/genvmtype:/etc/emulab/genvmtype:ro");
#
# XXX: safe on shared hosts? Oh well, we have to have them.
#
$args{"HostConfig"}{"CapAdd"} = [ "NET_ADMIN","NET_BIND_SERVICE","NET_RAW" ];
$args{"HostConfig"}{"CgroupParent"} = $vnode_id;
# XXX: need to actually check to see if image has entrypoint/cmd,
# and maybe emulate that stuff with a wrapper script.
#
# Finally, add in any of the extra args from setupImage, by merging
# in the JSONish hashes into our config args. They cannot override
# the values we've already set (due to Hash::Merge's default policy
# of left-precedence).
#
if (defined($newcreateargs)) {
require Hash::Merge;
if ($debug) {
print STDERR "DEBUG: pre-merge args = ".Dumper(%args)."\n";
print STDERR "DEBUG: pre-merge newcreateargs = ".Dumper(%$newcreateargs)."\n";
}
%args = %{Hash::Merge::merge(\%args,$newcreateargs)};
if ($debug) {
print STDERR "DEBUG: merged createargs = ".Dumper(%args)."\n";
}
}
if (defined($newcmd)) {
require Hash::Merge;
%args = %{Hash::Merge::merge(\%args,$newcmd)};
if ($debug) {
print STDERR "DEBUG: merged createcmd = ".Dumper(%args)."\n";
}
}
if ($debug) {
print STDERR "container_create($vnode_id) args:\n".Dumper(%args)."\n";
}
#
# Kill off a capture that might be running for this container.
#
if (-x "$CAPTURE") {
my $rpid = captureRunning($vnode_id);
if ($rpid) {
print STDERR "WARNING: capture already running ($rpid)!?".
" Killing...\n";
kill("TERM", $rpid);
sleep(1);
}
}
#
# Go ahead and create.
#
TBDebugTimeStamp("creating docker container $vnode_id");
($code,$content,$resp) = getClient()->container_create($vnode_id,\%args);
if ($code) {
$err = "failed to create the container: $content";
goto bad;
}
#
# Finish off the state transitions as necessary.
#
if (defined($raref)) {
libutil::setState("SHUTDOWN");
}
return $vmid;
bad:
removeMounts($vnode_id);
fatal($err);
}
sub vnodePreConfig($$$$$){
my ($vnode_id, $vmid, $vnconfig, $private, $callback) = @_;
return 0;
}
#
# We already added the control net interface in vnodeCreate so that we
# could pass the network args via the docker create call. So now just
# create associated root context stuff like firewall rules and port
# forwards. We don't let Docker handle these port forwards because it
# is restrictive.
#
sub vnodePreConfigControlNetwork($$$$$$$$$$$$)
{
my ($vnode_id, $vmid, $vnconfig, $private,
$ip,$mask,$mac,$gw, $vname,$longdomain,$shortdomain,$bossip) = @_;
my $vninfo = $private;
# NB: the control net config is already associated with the
# container, so we have no devices to create nor configure; Docker
# will create them when it creates the container.
# Maybe allow routable control network.
my $isroutable = isRoutable($ip);
#my ($host_ip,$host_mask,$vmac) = hostControlNet();
my ($host_iface,$host_ip,$host_mask,$host_maskbits,$host_net,
$host_mac,$host_gw) = findControlNet();
my ($bossdomain,$boss_ip) = tmccbossinfo();
if (!$boss_ip) {
$boss_ip = `cat $BOOTDIR/bossip`;
chomp($boss_ip);
}
if (!$boss_ip) {
warn("could not find bossip anywhere; aborting!");
return -1;
}
my (undef,undef,undef,undef,@addrs) = gethostbyname("users");
if ($? || @addrs == 0) {
warn("could not resolve users.$bossdomain; aborting!");
return -1;
}
my $ops_ip = inet_ntoa($addrs[0]);
my $local_tmcd_port = $TMCD_PORT + $vmid;
#
# Set up the chains. We always create them, and if there is no
# firewall, they default to accept. This makes things easier in
# the control network script (emulab-cnet.pl).
#
# Do not worry if these fail; we will catch it below when we add
# the rules. Or I could look to see if the chains already exist,
# but why bother.
#
my @rules = ();
# Ick, iptables has a 28 character limit on chain names. But we have to
# be backwards compatible with existing chain names. See corresponding
# code in emulab-cnet.pl
my $IN_CHAIN = "IN_${vnode_id}";
my $OUT_CHAIN = "OUT_${vnode_id}";
if (length($IN_CHAIN) > 28) {
$IN_CHAIN = "I_${vnode_id}";
$OUT_CHAIN = "O_${vnode_id}";
}
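# E.g., "IN_" plus a 26-character vnode_id is 29 characters, which
# exceeds the limit, so we fall back to the shorter "I_"/"O_" prefixes.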
push(@rules, "-N $IN_CHAIN");
push(@rules, "-F $IN_CHAIN");
push(@rules, "-N $OUT_CHAIN");
push(@rules, "-F $OUT_CHAIN");
# Match existing dynamic rules as early as possible.
push(@rules, "-A $IN_CHAIN -m conntrack ".
"--ctstate RELATED,ESTABLISHED -j ACCEPT");
push(@rules, "-A $OUT_CHAIN -m conntrack ".
"--ctstate RELATED,ESTABLISHED -j ACCEPT");
# Do all the rules regardless of whether they fail
DoIPtablesNoFail(@rules);
# For the next set of rules we want to fail on first error
@rules = ();
if ($vnconfig->{'fwconfig'}->{'fwinfo'}->{'TYPE'} eq "none") {
if ($IPTABLES_PACKET_LOG) {
push(@rules, "-A $IN_CHAIN -j LOG ".
" --log-prefix 'IN-${vnode_id}: ' --log-level 5");
push(@rules, "-A $OUT_CHAIN -j LOG ".
" --log-prefix 'OUT-${vnode_id}: ' --log-level 5");
}
push(@rules, "-A $IN_CHAIN -j ACCEPT");
push(@rules, "-A $OUT_CHAIN -j ACCEPT");
}
else {
if ($IPTABLES_PACKET_LOG) {
push(@rules, "-A $IN_CHAIN -j LOG ".
" --log-prefix 'IN-${vnode_id}: ' --log-level 5");
push(@rules, "-A $OUT_CHAIN -j LOG ".
" --log-prefix 'OUT-${vnode_id}: ' --log-level 5");
}
#
# These rules allows the container to talk to the TMCC proxy.
# If you change this port, change InsertPostBootIptablesRules too.
#
push(@rules,
"-A $OUT_CHAIN -p tcp ".
"-d $host_ip --dport $local_tmcd_port ".
"-m conntrack --ctstate NEW -j ACCEPT");
push(@rules,
"-A $OUT_CHAIN -p udp ".
"-d $host_ip --dport $local_tmcd_port ".
"-m conntrack --ctstate NEW -j ACCEPT");
#
# Need to do some substitution first.
#
foreach my $rule (@{ $vnconfig->{'fwconfig'}->{'fwrules'} }) {
my $rulestr = $rule->{'RULE'};
$rulestr =~ s/\s+me\s+/ $ip /g;
$rulestr =~ s/\s+INSIDE\s+/ $OUT_CHAIN /g;
$rulestr =~ s/\s+OUTSIDE\s+/ $IN_CHAIN /g;
$rulestr =~ s/^iptables //;
push(@rules, $rulestr);
}
#
# For debugging, we want to log any packets that get to the bottom,
# since they are going to get dropped.
#
if ($IPTABLES_PACKET_LOG) {
push(@rules, "-A $IN_CHAIN -j LOG ".
" --log-prefix 'IN ${vnode_id}: ' --log-level 5");
push(@rules, "-A $OUT_CHAIN -j LOG ".
" --log-prefix 'OUT ${vnode_id}: ' --log-level 5");
}
}
# Add some global rules (i.e. that cannot simply be flushed by
# flushing one of the input/output chains for this vnode); and save
# them for later deletion so we don't have to reconstruct them
# later!
my @grules = ();
#
# Finally, either allow direct ssh into the container (if it was
# emulabized OR if user specifically requested direct ssh), or add
# this port to our alternate sshd-docker-exec service (if not
# emulabized or user requested ssh-attach).
#
if (exists($vnconfig->{'config'}->{'SSHDPORT'})) {
my $attributes = $vnconfig->{'attributes'};
my $emulabization = $attributes->{DOCKER_EMULABIZATION};
my $ssh_style = $attributes->{DOCKER_SSH_STYLE};
my $exec_shell = $attributes->{DOCKER_EXEC_SHELL};
if (defined($exec_shell) && $exec_shell =~ /^([\/\w\d\-_]+)$/) {
$exec_shell = $1;
}
else {
warn("malformed shell: $exec_shell ; defaulting to /bin/sh");
$exec_shell = '/bin/sh';
}
if (($emulabization ne DOCKER_EMULABIZE_NONE()
&& (!defined($ssh_style) || $ssh_style eq ''
|| $ssh_style eq 'direct'))
|| (defined($ssh_style) && $ssh_style eq 'direct')) {
if (!isRoutable($ip)) {
# Override the common/mkvnode.pl ssh portfw. We want the
# alt sshd port for this vnode to redirect from the public
# host to port 22 on the inside, not to the alt port on the
# inside, like mkvnode.pl assumes. Ugh.
push(@grules,
"-t nat -A PREROUTING -j DNAT -p tcp ".
"--dport $vnconfig->{config}->{SSHDPORT} -d $host_ip ".
"--to-destination $ip:22");
}
$private->{'ssh_style'} = 'direct';
}
else {
# Setup our docker exec via ssh.
addContainerToDockerExecSSH(
$vnode_id,$vnconfig->{config}->{SSHDPORT},$exec_shell);
$private->{'ssh_style'} = 'exec';
}
}
# Reroute tmcd calls to the proxy on the physical host
push(@grules,
"-t nat -A PREROUTING -j DNAT -p tcp ".
"--dport $TMCD_PORT -d $boss_ip -s $ip ".
"--to-destination $host_ip:$local_tmcd_port");
push(@grules,
"-t nat -A PREROUTING -j DNAT -p udp ".
"--dport $TMCD_PORT -d $boss_ip -s $ip ".
"--to-destination $host_ip:$local_tmcd_port");
# Reroute evproxy to use the local daemon.
push(@grules,
"-t nat -A PREROUTING -j DNAT -p tcp ".
"--dport $EVPROXY_PORT -d $ops_ip -s $ip ".
"--to-destination $host_ip:$EVPROXY_PORT");
push(@rules,@grules);
my @deleterules = ();
foreach my $rule (@grules) {
if ($rule =~ /^(-t \w+\s+)?(-[AIR]\s+)([A-Za-z][-A-Za-z0-9]*)\s+(.+)$/) {
push(@deleterules,"$1 -D $3 $4");
}
elsif ($rule =~ /^(-t \w+\s+)?(-[IR]\s+)([A-Za-z][-A-Za-z0-9]*)\s+\d+\s+(.+)$/) {
push(@deleterules,"$1 -D $3 $4");
}
}
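# E.g., an append rule like
#   -t nat -A PREROUTING -j DNAT -p tcp --dport 7777 -d 1.2.3.4 ...
# (hypothetical values) is stashed as the corresponding delete form
#   -t nat -D PREROUTING -j DNAT -p tcp --dport 7777 -d 1.2.3.4 ...
# so vnodeDestroy can remove it verbatim later.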
$private->{'preboot_iptables_rules'} = \@deleterules;
# Install the iptable rules
TBDebugTimeStamp("vnodePreConfigControlNetwork: installing iptables rules");
if (DoIPtables(@rules)) {
TBDebugTimeStamp(" failed to install iptables rules");
return -1;
}
TBDebugTimeStamp(" installed iptables rules");
return 0;
}
#
# Since we already did the work of figuring out which exp networks this
# container is connected to above in rootPreConfigNetwork, the only
# thing we have to handle in this function is the runtime stuff of
# figuring what if any traffic shaping is necessary, and setting up
# those devices.
#
sub vnodePreConfigExpNetwork($$$$)
{
my ($vnode_id, $vmid, $vnconfig, $private) = @_;
my $ifs = $vnconfig->{'ifconfig'};
my $lds = $vnconfig->{'ldconfig'};
my $tunnels = $vnconfig->{'tunconfig'};
#
# Since network config with Docker is persistent, we add the
# experiment net devices here, but have to check if any interfaces
# have been modified, removed, or freshly added, and handle the
# delta.
#
my $basetable;
my $elabifs = "";
my $elabroutes = "";
my %netif_strs = ();
foreach my $ifc (@$ifs) {
# XXX
#next if (!$ifc->{ISVIRT});
TBDebugTimeStamp("vnodePreConfigExpNetwork: $vnode_id interface ".
Dumper($ifc))
if ($debug > 1);
my $br = $ifc->{"BRIDGE"};
my $physdev = $ifc->{"PHYSDEV"};
my $ldinfo;
#
# Find associated delay info
#
foreach my $ld (@$lds) {
if ($ld->{"IFACE"} eq $ifc->{"MAC"}) {
$ldinfo = $ld;
}
}
#
# All the "hard" work (of creating bridges (or dummy devices for
# macvlan support), and docker networks) was done in
# rootPreConfigNetwork above. So all we have to do here is add this
# device and its configuration to the docker container we created
#
my $nfmac = $ifc->{MAC};
my $fmac = fixupMac($nfmac);
my ($ip,$mask) = ($ifc->{IPADDR},$ifc->{IPMASK});
my $network = inet_ntoa(inet_aton($ip) & inet_aton($mask));
my $maskbits = 0;
foreach my $octet (split(/\./,$mask)) {
my $cval = int($octet);
for (my $i = 0; $i < 8; ++$i) {
$maskbits += $cval & 1;
$cval = $cval >> 1;
}
}
#
# Before anything else, we add a network interface to the
# container for this exp net iface. (We have to do this with
# raw docker API access because as of 1.12.x, the docker CLI did
# not support fixing a MAC address via 'docker network connect
# ...'.) Anyway, first we must find the docker network ID.
#
TBDebugTimeStamp("connecting docker container $vnode_id to".
" network ".$ifc->{BRIDGE});
my ($code,$content,$resp) = getClient()->network_connect_container(
$ifc->{BRIDGE},$vnode_id,$ip,$maskbits,$fmac);
if ($code) {
fatal("Could not connect $vnode_id to $DOCKERCNET".
" ($code,$content); aborting!");
}
}
return 0;
}
sub vnodeConfigResources($$$$){
my ($vnode_id, $vmid, $vnconfig, $private) = @_;
my $attributes = $vnconfig->{'attributes'};
my $memory;
return 0;
}
sub vnodeConfigDevices($$$$)
{
my ($vnode_id, $vmid, $vnconfig, $private) = @_;
my $vninfo = $private;
return 0;
}
sub vnodeState($;$$$)
{
my ($vnode_id, $vmid, $vnconfig, $private) = @_;
my $err = 0;
my $out = VNODE_STATUS_UNKNOWN();
TBDebugTimeStamp("getting state info for docker container $vnode_id");
my ($code,$content) = getClient()->container_inspect($vnode_id);
if ($code) {
print STDERR "vnodeState: could not inspect container: $content ($code)!";
return ($code, $out);
}
my $json = $content;
my $jstate = $json->[0]->{'State'};
if ($jstate->{"Running"} == JSON::PP::true) {
$out = VNODE_STATUS_RUNNING();
}
elsif ($jstate->{"Restarting"} == JSON::PP::true) {
$out = VNODE_STATUS_BOOTING();
}
elsif ($jstate->{"Paused"} == JSON::PP::true) {
$out = VNODE_STATUS_PAUSED();
}
elsif ($jstate->{"Dead"} == JSON::PP::true) {
$out = VNODE_STATUS_STOPPED();
}
elsif ($jstate->{"Status"} eq "exited"
|| $jstate->{"Status"} eq "stopped") {
$out = VNODE_STATUS_STOPPED();
}
else {
# Else, it must be stopped!
$out = VNODE_STATUS_STOPPED();
}
return ($err, $out);
}
sub vnodeBootHook($$$$)
{
my ($vnode_id, $vmid, $vnconfig, $private) = @_;
my $vninfo = $private;
#
# If the image is not emulabized, it cannot be expected to send
# status, so we do it.
#
if ($private->{'emulabization'} eq DOCKER_EMULABIZE_NONE()) {
libutil::setState("TBSETUP");
}
#
# Start up our Docker-to-pty script for this container; the capture
# will attach to it. We always fire this off here; it cannot
# survive when the container reboots or shuts down.
#
my $PTYLINKFILE = "$VMDIR/$vnode_id/vnode.pty";
TBDebugTimeStamp("vnodeBootHook: starting container2pty;".
" symlink $PTYLINKFILE");
mysystem("$C2P $vnode_id $PTYLINKFILE &");
# Wait up to 10 seconds for $PTYLINKFILE to appear...
my $tries = 10;
while (! -e $PTYLINKFILE && $tries > 0) {
sleep(1);
$tries -= 1;
TBDebugTimeStamp("vnodeBootHook: waiting for $PTYLINKFILE...");
}
#
# Start a capture if there isn't one running.
#
if (-x "$CAPTURE") {
my $rpid = captureRunning($vnode_id);
if ($rpid == 0) {
captureStart($vnode_id,$PTYLINKFILE);
}
}
#
# This function is not yet part of the libvnode API, but our
# vnodeBoot and vnodeReboot functions call it.
#
# XXX: mkvnode.pl probably needs to call it when it notices a reboot
# (i.e., reboot within the container, or a docker restart).
#
#
# After boot or reboot, save off its network namespace (also removes
# an old namespace from a previous boot). This is very important
# because we want to move our own network devices into the namespace
# (which docker doesn't allow us to do), and if we move a device in,
# we have to remove it before we lose a handle to the namespace.
#
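# (Presumably bindNetNS does the usual trick of bind-mounting the
# container's /proc/<init-pid>/ns/net onto a stable path so that
# "ip netns"-style tooling can keep reaching the namespace even if
# the container's processes go away; that is an assumption about the
# helper, not something enforced here.)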
bindNetNS($vnode_id,$private);
#
# First, install our runtime iptables rules (those that depend on
# the veth, like antispoofing).
#
InsertPostBootIptablesRules($vnode_id,$vmid,$vnconfig,$private);
#
# Run the routing scripts we built earlier.
#
RunRoutingScripts($vnode_id,1);
#
# Run the shaping scripts we built earlier.
#
RunShapingScripts($vnode_id,1);
#
# If the image is not emulabized, it cannot be expected to send
# status, so we do it.
#
if ($private->{'emulabization'} eq DOCKER_EMULABIZE_NONE()) {
libutil::setState("ISUP");
}
return 0;
}
sub vnodeBoot($$$$)
{
my ($vnode_id, $vmid, $vnconfig, $private) = @_;
my $vninfo = $private;
# notify stated that we are about to boot. We need this transition for
# stated to do its thing; this state name is treated specially.
libutil::setState("BOOTING");
RunProxies($vnode_id,$vmid);
TBDebugTimeStamp("Starting vnode $vnode_id...");
my ($code,$content) = getClient()->container_start($vnode_id);
if ($code) {
print STDERR "container_start $vnode_id failed: $content ($code)\n";
return -1;
}
#
# We cannot wait until we can ping the node. We must immediately
# kick off our hooks so that container processes never send any
# packets prior to their installation. If this is too racy, we'll
# have to launch the container's init process a bit more carefully
# to coordinate this inside/outside dance.
#
TBDebugTimeStamp("Created container $vnode_id");
vnodeBootHook($vnode_id,$vmid,$vnconfig,$private);
#
# XXX originally we had "-t 5", but -t is not a timeout
# in Linux ping. So there was no timeout which resulted
# in sending all 5 pings at 1 second intervals and then
# waiting for the last one to not respond, a total of
# 6 seconds. So this loop of 10 tries took about 60 seconds.
#
# If we fix the option ("-w 5"), we still don't timeout after
# 5 seconds since in linux, we get a network error after 3
# seconds if the node is down. So ironically, we were closer
# to our timeout value with the wrong option!
#
# The worst part is in the common case where the node is
# up and responding: we wind up waiting a little over 4
# seconds til we get a response from 5 pings.
#
# So lets try fewer pings (1) so the successful case returns
# immediately, and account for the 3 second node down timeout
# by increasing the countdown to match the original ~60 seconds
# before giving up.
#
my $ip = $vnconfig->{"config"}{CTRLIP};
my $countdown = 8;
while ($countdown > 0) {
TBDebugTimeStamp("Pinging $ip for up to five seconds ...");
system("ping -q -c 1 -w 5 $ip > /dev/null 2>&1");
# Ping returns zero if any packets received.
if (! $?) {
TBDebugTimeStamp("Container $vnode_id is up");
return 0;
}
$countdown--;
last
if (checkForInterrupt());
}
#
# Tear it down and try again. Use vnodeHalt cause it protects
# itself with an alarm.
#
TBDebugTimeStamp("Container did not start, stopping for retry ...");
vnodeHalt($vnode_id, $vmid, $vnconfig, $private);
TBDebugTimeStamp("Container halted, waiting for it to stop ...");
$countdown = 10;
while ($countdown >= 0) {
sleep(5);
last
if (vnodeState($vnode_id,$vmid,$vnconfig,$private)
eq VNODE_STATUS_STOPPED());
$countdown--;
TBDebugTimeStamp("Container not stopped yet");
}
TBDebugTimeStamp("Container is stopped ($countdown)!");
return -1;
}
#
# Connects to the Docker events daemon to listen to events for this
# container. We have two paths to get here. First, if mkvnode.pl is
# signaled from our code, it might stop/restart/kill the vnode. Second,
# if the user runs a docker command (i.e. docker stop/restart/start), we
# want to be able to fire up our monitor again, and most importantly to
# re-run our boot hooks.
#
# For the second case, a 'docker restart' will result in
# kill,kill,die,stop,start,restart container events. A 'docker stop'
# results in kill,kill,die,stop events. For restart, We may not catch
# container death via sleep poll, of course -- but we still have to
# apply our runtime boot hooks (like moving devices into the container,
# applying traffic shaping, etc) -- and docker doesn't help us with
# runtime hooks. So we need to catch the restart event and run teardown
# and boot hooks. Problem is in this case, we don't actually know until
# the final restart event if this is a restart! How goofy. So either
# we wait in mkvnode.pl (vnodePoll) and see if it comes back, then run
# the boot hook; or we just always exit from mkvnode.pl, and have a
# central monitor daemon that looks for container start/stop events and
# fires off a mkvnode.pl monitor to run the boot hook. Probably the
# latter is most flexible, but maybe also wasted cycles or
# higher-latency... hard to know.
#
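#
# For illustration, the stream we read below is newline-delimited
# JSON; a 'docker restart' of a container shows up roughly as
# (abbreviated, other fields elided):
#
#   {"status":"kill","Type":"container","Actor":{"Attributes":{"name":"<vnode_id>"}},...}
#   {"status":"die", ...}
#   {"status":"stop", ...}
#   {"status":"start", ...}
#   {"status":"restart", ...}
#
# We only act on the 'die' and 'start' statuses.
#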
sub vnodePoll($$$$$$)
{
my ($vnode_id, $vmid, $vnconfig, $private, $statusref, $eventref) = @_;
reconnect:
if (!exists($private->{DOCKER_EVENT_FD})
|| !defined($private->{DOCKER_EVENT_FD})
|| !$private->{DOCKER_EVENT_FD}->opened()) {
TBDebugTimeStamp("connecting to docker event stream for $vnode_id");
my $ccmd = "$CURL -sN --unix-socket /var/run/docker.sock".
" -H \"Content-Type: application/json\"".
" --data-urlencode 'filters=\{\"type\":\[\"container\"\],".
" \"container\":\[\"${vnode_id}\"\]\}'".
' -G http:/events?filters={"type":["container"],"container":["'.$vnode_id.'"]}';
pipe($private->{DOCKER_EVENT_FD},WRITER)
or fatal("popen docker event stream $vnode_id: $@");
my $pid = fork();
if (!$pid) {
close(STDIN);
open(STDOUT,">&WRITER");
close($private->{DOCKER_EVENT_FD});
if (0) {
exec($ccmd);
}
else {
# This just prints JSON events to STDOUT by default,
# which is exactly what we want.
getClient()->monitor_events(
{"type"=>["container"],"container"=>["$vnode_id"]});
}
exit(-1);
}
# Parent continues.
close(WRITER);
$private->{DOCKER_EVENT_CHILD} = $pid;
$private->{DOCKER_EVENT_FD}->autoflush(1);
my $oldfh = select($private->{DOCKER_EVENT_FD});
$| = 1;
select($oldfh);
# We save off the last N events for an existing connection. If the
# connection drops, we dump the list.
$private->{DOCKER_EVENT_HISTORY} = [];
}
#my $comp = Array::Compare->new();
my $buf = '';
while (1) {
my $rc;
my $sel = IO::Select->new($private->{DOCKER_EVENT_FD});
my @ready = $sel->can_read(2);
if ($?) {
TBDebugTimeStamp("error in select: $!");
if (!$private->{DOCKER_EVENT_FD}->opened()) {
TBDebugTimeStamp("lost docker event stream connection;".
" reconnecting...");
delete($private->{DOCKER_EVENT_FD});
kill('KILL',$private->{DOCKER_EVENT_CHILD});
delete($private->{DOCKER_EVENT_CHILD});
goto reconnect;
}
}
if (!@ready) {
#TBDebugTimeStamp("nothing to read, continuing...");
next;
}
$rc = sysread($private->{DOCKER_EVENT_FD},$buf,4096,length($buf));
if (!defined($rc)) {
warn("sysread on event stream failed; aborting poll loop!");
return libgenvnode::VNODE_POLL_ERROR();
}
while (1) {
my $pos = index($buf,"\n");
if ($pos < 0) {
last;
}
my $line = substr($buf,0,$pos+1);
chomp($line);
$buf = substr($buf,$pos+1);
my $json = decode_json($line);
if (!exists($json->{"Type"}) || $json->{"Type"} ne "container"
|| !exists($json->{"Actor"}{"Attributes"}{"name"})
|| $json->{"Actor"}{"Attributes"}{"name"} ne $vnode_id) {
TBDebugTimeStamp("event $line not for us; ignoring!");
next;
}
if (!exists($json->{"status"})) {
# We only want status change events.
next;
}
TBDebugTimeStamp("$vnode_id status: $json->{status}".
" ($json->{time}.$json->{timeNano}");
#
# NB: when we make library calls below, we block signals
# temporarily so that the whole thing finishes.
# vnodesetup/mkvnode keep trying :).
#
my $status = $json->{"status"};
if ($status eq 'die') {
TBDebugTimeStamp("$vnode_id died; tearing down");
$rc = RunWithSignalsBlocked(\&vnodeTearDown,
$vnode_id,$vmid,$vnconfig,$private);
if ($rc) {
warn("vnodeTearDown failed; aborting poll loop with error!");
return libgenvnode::VNODE_POLL_ERROR();
}
}
elsif ($status eq 'start') {
TBDebugTimeStamp("$vnode_id started; running boot hooks");
$rc = RunWithSignalsBlocked(\&vnodeBootHook,
$vnode_id,$vmid,$vnconfig,$private);
if ($rc) {
warn("vnodeBootHook failed; aborting poll loop with error!");
return libgenvnode::VNODE_POLL_ERROR();
}
}
}
}
}
sub vnodePollCleanup($$$$)
{
my ($vnode_id, $vmid, $vnconfig, $private) = @_;
if (exists($private->{DOCKER_EVENT_FD})) {
kill('KILL',$private->{DOCKER_EVENT_CHILD});
if ($private->{DOCKER_EVENT_FD}->opened()) {
close($private->{DOCKER_EVENT_FD});
}
delete($private->{DOCKER_EVENT_CHILD});
delete($private->{DOCKER_EVENT_FD});
}
if (exists($private->{DOCKER_EVENT_HISTORY})) {
delete($private->{DOCKER_EVENT_HISTORY});
}
return 0;
}
sub vnodePostConfig($)
{
return 0;
}
sub rootPostConfig($)
{
return 0;
}
sub vnodeReboot($$$$)
{
my ($vnode_id, $vmid, $vnconfig, $private) = @_;
TBDebugTimeStamp("restarting vnode $vnode_id...");
my ($code,$content) = getClient()->container_restart($vnode_id);
if ($code) {
warn("container_restart($vnode_id) failed: $content ($code)\n");
return $code;
}
return vnodeBootHook($vnode_id, $vmid, $vnconfig, $private);
}
sub vnodeHalt($$$$)
{
my ($vnode_id, $vmid, $vnconfig, $private) = @_;
TBDebugTimeStamp("Stopping vnode $vnode_id...");
my ($code,$content) = getClient()->container_stop($vnode_id);
if ($code) {
warn("container_stop($vnode_id) failed: $content ($code)\n");
return $code;
}
return 0;
}
sub vnodeExec($$$$$)
{
my ($vnode_id, $vmid, $vnconfig, $private, $command) = @_;
TBDebugTimeStamp("Running command '$command' inside vnode $vnode_id...");
my ($code,$content) = getClient()->container_exec($vnode_id,$command);
if ($code) {
warn("container_exec($vnode_id) failed: $content ($code)\n");
return $code;
}
if (wantarray) {
return ($code,$content);
}
else {
return $code;
}
}
#
# Docker doesn't support mount/unmount.
#
sub vnodeUnmount($$$$)
{
my ($vnode_id, $vmid, $vnconfig, $private) = @_;
return undef;
}
#
# Remove the transient state, but not the disk. Basically, remove
# anything that happened in vnodeBoot and vnodeBootHook.
#
sub vnodeTearDown($$$$)
{
my ($vnode_id, $vmid, $vnconfig, $private) = @_;
# Lots of shared resources
TBDebugTimeStamp("vnodeTearDown: grabbing global lock $GLOBAL_CONF_LOCK")
if ($lockdebug);
if (TBScriptLock($GLOBAL_CONF_LOCK, 0, 900) != TBSCRIPTLOCK_OKAY()) {
print STDERR "Could not get the global lock after a long time!\n";
return -1;
}
TBDebugTimeStamp(" got global lock")
if ($lockdebug);
KillProxies($vnode_id,$vmid);
RemovePostBootIptablesRules($vnode_id,$vmid,$vnconfig,$private);
#
# Unwind anything we did in vnodeBootHook.
#
unbindNetNS($vnode_id,$private);
badbad:
TBDebugTimeStamp(" releasing global lock")
if ($lockdebug);
TBScriptUnlock();
return 0;
}
sub vnodeDestroy($$$$)
{
my ($vnode_id, $vmid, $vnconfig, $private) = @_;
my $vninfo = $private;
# Always do this.
return -1
if (vnodeTearDown($vnode_id, $vmid, $vnconfig, $private));
TBDebugTimeStamp("Removing vnode $vnode_id docker container...");
my ($code,$content) = getClient()->container_delete($vnode_id);
if ($code) {
print STDERR "container_delete $vnode_id failed: $content ($code)\n";
}
#
# Remove mounts.
#
removeMounts($vnode_id);
#
# Remove any global iptables rules (i.e., in chains other than our
# per-vnode special input/output chains).
#
if (exists($private->{'preboot_iptables_rules'})
&& @{$private->{'preboot_iptables_rules'}}) {
DoIPtables(@{$private->{'preboot_iptables_rules'}});
delete($private->{'preboot_iptables_rules'});
}
#
# If user wanted 'exec' ssh_style, remove this vnode from the
# private sshd.
#
if (exists($private->{'ssh_style'}) && $private->{'ssh_style'} eq 'exec') {
delete($private->{'ssh_style'});
removeContainerFromDockerExecSSH($vnode_id);
}
#
# Shutdown the capture now that it is gone. We leave the log around
# til next time this vnode comes back.
#
if (-x "$CAPTURE") {
my $LOGPATH = "$VMDIR/$vnode_id";
my $pidfile = "$LOGPATH/$vnode_id.pid";
my $pid = 0;
if (-r "$pidfile" && open(PID, "<$pidfile")) {
$pid = <PID>;
close(PID);
chomp($pid);
if ($pid =~ /^(\d+)$/ && $1 > 1) {
$pid = $1;
} else {
print STDERR "WARNING: bogus pid in capture pidfile ($pid)\n";
$pid = 0;
}
}
# XXX sanity: make sure pidfile matches reality
my $rpid = captureRunning($vnode_id);
if ($rpid == 0) {
print STDERR "WARNING: capture not running";
if ($pid > 0) {
print STDERR ", should have been pid $pid";
$pid = 0;
}
print STDERR "\n";
} elsif ($pid != $rpid) {
if ($pid == 0) {
print STDERR "WARNING: no recorded capture pid, ".
"but found process ($rpid)\n";
} else {
print STDERR "WARNING: recorded capture pid ($pid) ".
"does not match actual pid ($rpid)\n";
}
$pid = $rpid;
}
if ($pid > 0) {
kill("TERM", $pid);
}
}
# Kill the chains.
# Ick, iptables has a 28 character limit on chain names. But we have to
# be backwards compatible with existing chain names. See corresponding
# code in emulab-cnet.pl
my $IN_CHAIN = "IN_${vnode_id}";
my $OUT_CHAIN = "OUT_${vnode_id}";
if (length($IN_CHAIN) > 28) {
$IN_CHAIN = "I_${vnode_id}";
$OUT_CHAIN = "O_${vnode_id}";
}
DoIPtables("-F $IN_CHAIN");
DoIPtables("-X $IN_CHAIN");
DoIPtables("-F $OUT_CHAIN");
DoIPtables("-X $OUT_CHAIN");
#
# Unwind anything we did.
#
# Remove any Docker networks we would have used that are unused.
if (exists($private->{'dockernets'})) {
foreach my $name (keys(%{ $private->{'dockernets'} })) {
my @members = getDockerNetMemberIds($name);
if (@members == 0) {
TBDebugTimeStamp("Deleting empty docker network $name...");
($code) = getClient()->network_delete($name);
delete($private->{'dockernets'}->{$name})
if (!$code);
}
}
}
# Delete bridges we would have used that are unused. If only the
# physdevices we added to the bridge are in the bridge, remove them,
# then remove the bridge.
if (exists($private->{'physbridges'})) {
foreach my $brname (keys(%{ $private->{'physbridges'} })) {
my @ifaces = findBridgeIfaces($brname);
if (@ifaces == 0) {
TBDebugTimeStamp("removing unused $brname");
if (-e "/sys/class/net/$brname") {
mysystem2("$IFCONFIG $brname down");
delbr($brname);
}
if (!$?) {
delete($private->{'physbridges'}->{$brname});
delete($private->{'physbridgeifaces'}->{$brname});
}
}
elsif (exists($private->{'physbridgeifaces'}->{$brname})) {
#
# Check for anything other than the physbridgeifaces we
# would have added to this bridge; if only those are in
# the bridge, remove them, then remove the bridge.
#
my %ifm = ();
foreach my $ifc (@ifaces) {
$ifm{$ifc} = 1;
}
foreach my $physiface (keys(%{$private->{'physbridgeifaces'}->{$brname}})) {
delete($ifm{$physiface})
if (exists($ifm{$physiface}));
}
# If only the physifaces were left in the bridge, nuke
# them all, then dump the bridge.
if (keys(%ifm) == 0) {
foreach my $ifc (@ifaces) {
TBDebugTimeStamp("removing $ifc from unused $brname");
delbrif($brname,$ifc);
delete($private->{'physbridgeifaces'}->{$brname}->{$ifc});
}
TBDebugTimeStamp("removing unused $brname");
if (-e "/sys/class/net/$brname") {
mysystem2("$IFCONFIG $brname down");
delbr($brname);
}
if (!$?) {
delete($private->{'physbridges'}->{$brname});
delete($private->{'physbridgeifaces'}->{$brname});
}
}
}
}
}
# Delete the dummy macvlan thingies we would have used, if no
# one else is using them.
if (exists($private->{'dummys'})) {
foreach my $brname (keys(%{ $private->{'dummys'} })) {
my @mvs = findMacvlanIfaces($private->{'dummys'}->{$brname});
if (@mvs == 0) {
mysystem2("$IP link del dev $brname");
delete($private->{'dummys'}->{$brname})
if ($?);
}
}
}
# Delete any vlan devices we would have used, if no one else is
# using them (i.e. if they are not in a bridge, and are not a parent
# of any other macvlan devices).
if (exists($private->{'vlandevs'})) {
foreach my $brname (keys(%{ $private->{'vlandevs'} })) {
my $brv = findBridge($private->{'dummys'}->{$brname});
my @mvs = findMacvlanIfaces($private->{'dummys'}->{$brname});
if (!defined($brv) && @mvs == 0) {
mysystem2("$IP link del dev $brname");
delete($private->{'vlandevs'}->{$brname})
if ($?);
}
}
}
# This shouldn't matter, but let's be complete; we might've deleted
# some bridges and interfaces.
refreshNetworkDeviceMaps();
#
# We keep the IFBs until complete destruction. We do this cause we do
# not want to get into a situation where we stopped a container to do
# something like take a disk snapshot, and then not be able to
# restart it cause there are no more resources available (as might
# happen on a shared node).
#
ReleaseIFBs($vmid, $private)
if (exists($private->{'ifbs'}));
return 0;
}
##
## Utility and helper functions.
##
#
# Analyze an existing Docker image to extract image metadata,
# distro/version, and so on.
#
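#
# The in-container analysis script prints shell-style KEY=VALUE lines
# on stdout, e.g. (hypothetical values):
#
#   DIST=Ubuntu
#   TAG=16.04
#   MINTAG=16
#   EMULABIZATION=core
#
# which we fold into the caller's hash alongside the fields we pull
# from 'docker inspect'.
#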
sub analyzeImage($$)
{
my ($image,$rethash) = @_;
my $output;
my @outlines;
my ($code,$json,$resp,$retval);
TBDebugTimeStamp("analyzing image $image...");
TBDebugTimeStamp("inspecting image $image...");
($code,$json) = getClient()->image_inspect($image);
if ($code) {
warn("inspect $image failed -- attempting to continue anyway!");
}
else {
my $jstate;
if (ref($json) eq 'ARRAY') {
$jstate = $json->[0];
}
$jstate = $jstate->{'Config'};
if (exists($jstate->{'Cmd'})) {
$rethash->{DOCKER_CMD} = $jstate->{'Cmd'};
}
if (exists($jstate->{'Entrypoint'})) {
$rethash->{DOCKER_ENTRYPOINT} = $jstate->{'Entrypoint'};
}
if (exists($jstate->{'Env'})) {
$rethash->{DOCKER_ENV} = $jstate->{'Env'};
}
if (exists($jstate->{'WorkingDir'})) {
$rethash->{DOCKER_WORKINGDIR} = $jstate->{'WorkingDir'};
}
if (exists($jstate->{'ArgsEscaped'})) {
$rethash->{DOCKER_ARGSESCAPED} = int($jstate->{'ArgsEscaped'});
}
if (exists($jstate->{'Architecture'})) {
$rethash->{DOCKER_ARCH} = $jstate->{'Architecture'};
}
if (exists($jstate->{'User'})) {
$rethash->{DOCKER_USER} = $jstate->{'User'};
}
}
TBDebugTimeStamp("running analysis script for image $image...");
my $args = {
'HostConfig' => {
'Binds' => [ "/etc/emulab/docker/container-utils:/tmp/docker:ro" ]
}
};
my $tmpname = "analyzer-".int(rand(POSIX::INT_MAX));
our $buf = '';
($code,$json,$resp,$retval) = getClient()->container_run(
$tmpname,$image,['/tmp/docker/analyze.sh'],1,$args,
sub { $buf .= $_[0]; });
if ($code) {
warn("failed to run analysis script container $tmpname for $image");
return $code;
}
@outlines = split("\n",$buf);
for my $res (@outlines) {
if ($res =~ /^[a-zA-Z0-9_]*=[^=]*$/) {
chomp($res);
my ($key,$value,) = split('=',$res);
$rethash->{$key} = $value;
}
}
return 0;
}
sub pullImage($$$$;$)
{
my ($image,$user,$pass,$policy,$newref) = @_;
my ($code,$content);
if (SHAREDHOST()) {
if (defined($policy) && $policy ne DOCKER_PULLPOLICY_LATEST()) {
warn("forcing pull policy for image $image on sharedhost to".
" latest, instead of cached!\n");
$policy = DOCKER_PULLPOLICY_LATEST();
}
elsif (!defined($policy) || $policy eq '') {
$policy = DOCKER_PULLPOLICY_LATEST();
}
}
elsif (!defined($policy) || $policy eq '') {
$policy = DOCKER_PULLPOLICY_CACHED();
}
if ($policy eq DOCKER_PULLPOLICY_CACHED()) {
TBDebugTimeStamp("inspecting image $image...");
($code,$content) = getClient()->image_inspect($image);
if (!$code) {
return 0;
}
}
#
# We need to lock while messing with the image. But we can use
# shared lock so that others can proceed in parallel. We will have
# to promote to an exclusive lock if the image has to be changed.
#
my $imagelockname = ImageLockName($image);
TBDebugTimeStamp("grabbing image lock $imagelockname shared")
if ($lockdebug);
if (TBScriptLock($imagelockname,
TBSCRIPTLOCK_INTERRUPTIBLE()|TBSCRIPTLOCK_SHAREDLOCK(),
$MAXIMAGEWAIT) != TBSCRIPTLOCK_OKAY()) {
fatal("Could not get $imagelockname lock for $image!");
}
TBDebugTimeStamp(" got image lock $imagelockname for $image")
if ($lockdebug);
#
# Try one more time to inspect, and release the lock if we have it.
#
if ($policy eq DOCKER_PULLPOLICY_CACHED()) {
TBDebugTimeStamp("inspecting image $image...");
($code,$content) = getClient()->image_inspect($image);
if (!$code) {
TBDebugTimeStamp(" releasing image lock")
if ($lockdebug);
TBScriptUnlock();
return 0;
}
}
my $output = "";
my $retries = 10;
my $ret = 1;
while ($ret && $retries > 0) {
TBDebugTimeStamp("pulling image $image...");
($code,$content) = getClient()->image_pull($image,$user,$pass);
if ($code == 0) {
TBDebugTimeStamp("pull $image succeeded");
last;
}
my $ustr = "";
if (defined($user)) {
$ustr = " as user $user";
}
TBDebugTimeStamp("pull $image failed$ustr ($code, $content);" .
" sleeping and retrying...");
sleep(8);
$retries -= 1;
}
if ($code) {
TBDebugTimeStamp("failed to pull image $image!");
}
if ($code == 0 && defined($newref) && ref($content) eq 'ARRAY') {
for my $cc (@$content) {
next
if (!exists($cc->{'status'}));
if ($cc->{'status'} =~ /downloaded newer image/i) {
$$newref = 1;
last;
}
}
}
TBDebugTimeStamp(" releasing image lock")
if ($lockdebug);
TBScriptUnlock();
return $code;
}
sub emulabizeImage($;$$$$$$$$)
{
my ($image,$newimageref,$emulabization,$newzationref,$update,
$pullpolicy,$username,$password,$iattrsref) = @_;
my $rc;
my ($code,$content);
#
# We take a lock for the pull of the base image; and for
# emulabization, if any. To create an emulabized image, we pull the
# underlying image (if any), then inspect it using both docker
# inspect and our analyzer, then create a name based on the hash of
# our attributes for this node that affect the image build (and on
# the project/group) with the global lock held, then lock that
# image, then make it!
#
if (!defined($emulabization)) {
$emulabization = DOCKER_EMULABIZE_DEFAULT();
}
#
# If we're supposed to pull a new image, do it.
#
my $havenewbase = 0;
if (pullImage($image,$username,$password,$pullpolicy,\$havenewbase)) {
warn("failed to pull base Docker image $image");
return -1;
}
#
# Analyze the image to see what we'll need to do to it, if anything.
#
my %iattrs = ();
$rc = analyzeImage($image,\%iattrs);
if ($rc) {
warn("analysis of image $image failed; continuing as best we can!");
}
my ($dist,$tag,$mintag) =
($iattrs{'DIST'},$iattrs{'TAG'},$iattrs{'MINTAG'});
TBDebugTimeStamp("analyzed $image, attrs:\n".Dumper(%iattrs));
if (defined($iattrsref)) {
$$iattrsref = \%iattrs;
}
my $curzation = $iattrs{'EMULABIZATION'};
if (!defined($curzation) || $curzation eq '') {
$curzation = DOCKER_EMULABIZE_NONE();
}
#
# Do we need to Emulabize?
#
my $newzation = DOCKER_EMULABIZE_NONE();
my @levels = ();
if ($emulabization eq '' || $emulabization eq DOCKER_EMULABIZE_NONE()) {
# Nothing to do.
$emulabization = DOCKER_EMULABIZE_NONE();
}
elsif ($emulabization eq DOCKER_EMULABIZE_BASIC()
&& ($update || $curzation eq DOCKER_EMULABIZE_NONE())) {
#
# Need to come up to basic.
#
$newzation = DOCKER_EMULABIZE_BASIC();
@levels = (DOCKER_EMULABIZE_BASIC());
}
elsif ($emulabization eq DOCKER_EMULABIZE_CORE()
&& ($update
|| $curzation eq DOCKER_EMULABIZE_NONE()
|| $curzation eq DOCKER_EMULABIZE_BASIC())) {
#
# Need to come up to core.
#
$newzation = DOCKER_EMULABIZE_CORE();
@levels = (DOCKER_EMULABIZE_BASIC(),DOCKER_EMULABIZE_CORE());
}
elsif ($emulabization eq DOCKER_EMULABIZE_BUILDENV()
&& ($update
|| $curzation eq DOCKER_EMULABIZE_NONE()
|| $curzation eq DOCKER_EMULABIZE_BASIC()
|| $curzation eq DOCKER_EMULABIZE_CORE())) {
#
# Need to come up to buildenv.
#
$newzation = DOCKER_EMULABIZE_BUILDENV();
@levels = (DOCKER_EMULABIZE_BASIC(),DOCKER_EMULABIZE_BUILDENV());
}
elsif ($emulabization eq DOCKER_EMULABIZE_FULL()
&& ($update
|| $curzation eq DOCKER_EMULABIZE_NONE()
|| $curzation eq DOCKER_EMULABIZE_BASIC()
|| $curzation eq DOCKER_EMULABIZE_CORE()
|| $curzation eq DOCKER_EMULABIZE_BUILDENV())) {
#
# Need to come up to full.
#
$newzation = DOCKER_EMULABIZE_FULL();
@levels = (DOCKER_EMULABIZE_BASIC(),DOCKER_EMULABIZE_BUILDENV(),
DOCKER_EMULABIZE_FULL());
}
else {
# Nothing to do; just use existing base image.
$emulabization = DOCKER_EMULABIZE_NONE();
}
if ($newzation eq DOCKER_EMULABIZE_NONE()) {
if ($debug) {
print STDERR "DEBUG: image $image will not be emulabized".
" ($emulabization, $newzation)\n";
}
if (defined($newimageref)) {
$$newimageref = $image;
}
if (defined($newzationref)) {
$$newzationref = $emulabization;
}
return 0;
}
#
# Figure out the new image name and the context dir. Let the caller
# supply one in the $newimageref parameter, too.
#
my $newimage;
my $newimagecdirname;
if (!defined($newimageref) || !defined($$newimageref)
|| $$newimageref eq '') {
#
# We are going to make a new image; give it a name. For now, just
# give it the current name:tag (with ':' translated to '-') as the new
# name, then ':emulab-', then the level. We will retag it later with
# the real image name if they want to save it.
#
# XXX: Later, for shared nodes, need to ensure we can't be
# tricked into using a private image from the wrong experiment!
#
$newimage = $image;
$newimage =~ tr/:/-/;
$newimagecdirname = $newimage;
$newimage .= ":emulab-$newzation";
$newimagecdirname .= "--emulab-$newzation";
}
else {
$newimage = $$newimageref;
$newimagecdirname = "$newimage--emulab-$newzation";
$newimagecdirname =~ tr/:/-/;
}
#
# We have to lock here, to avoid races.
#
my $imagelockname = ImageLockName($newimage);
TBDebugTimeStamp("grabbing image lock $imagelockname writeable")
if ($lockdebug);
if (TBScriptLock($imagelockname,
TBSCRIPTLOCK_INTERRUPTIBLE(),
$MAXIMAGEWAIT) != TBSCRIPTLOCK_OKAY()) {
fatal("Could not get $imagelockname lock for $newimage!");
}
TBDebugTimeStamp(" got image lock $imagelockname for $newimage")
if ($lockdebug);
#
# Check to see if the image already exists, and if we need to
# (re)build it. If there's a new base, and we always want the
# latest, we have to rebuild. Else, if this image has the
# Emulab code, and wants the latest, and is out of date
# w.r.t. the cached source tree, we rebuild it too.
#
my $build = 0;
TBDebugTimeStamp("inspecting image $newimage...");
($code,$content) = getClient()->image_inspect($newimage);
if ($code) {
TBDebugTimeStamp("$newimage does not exist; building!");
$build = 1;
}
elsif ($pullpolicy eq DOCKER_PULLPOLICY_LATEST() && $havenewbase) {
TBDebugTimeStamp("building new version of $newimage".
" because the base image was updated!");
$build = 1;
}
elsif ($newzation ne DOCKER_EMULABIZE_NONE()
&& $newzation ne DOCKER_EMULABIZE_BASIC()
&& $pullpolicy eq DOCKER_PULLPOLICY_LATEST()) {
my $installedvers = $iattrs{'EMULABVERSION'};
my $currentvers = `cat $EMULABSRC/.git/refs/heads/master`;
chomp($currentvers);
if ($installedvers ne $currentvers) {
TBDebugTimeStamp("building new version of $newimage".
" because the Emulab src repo was updated".
" ($installedvers -> $currentvers)!");
$build = 1;
}
}
if ($build) {
if ($dist eq '' && $tag eq '' && $mintag eq '') {
warn("cannot emulabize image with unknown distro!");
goto badimage;
}
if (!(($mintag ne '' && -d "$DOCKERFILES/$mintag")
|| ($tag ne '' && -d "$DOCKERFILES/$tag")
|| ($dist ne '' && -d "$DOCKERFILES/$dist"))) {
warn("cannot emulabize image with unsupported auto-analyzed".
" tags $dist/$tag/$mintag!");
goto badimage;
}
#
# Ok, finally, start the build. Find all the Dockerfile
# frags and shell scripts, and generate a Dockerfile and a
# context directory. If that dir exists already, remove it.
# Build any artifacts first.
#
# To find the fragments, we go from most specific to least
# (i.e., mintag -> tag -> dist).
#
# NB: we always copy the $mintag,$tag,$dist,common subdirs
# of /etc/emulab/docker/dockerfiles into the context for the
# image, because we want a script in say ubuntu16 to be able
# to reference something in the common/ subdir.
#
my @copydirs = ();
foreach my $td ('common',$dist,$tag,$mintag) {
push(@copydirs,$td)
if (-d "$DOCKERFILES/$td");
}
my @dfiles = ();
my @runscripts = ();
my @artifactscripts = ();
my $cwd = getcwd();
chdir($DOCKERFILES);
for my $l ('prepare',@levels) {
for my $t ($mintag,$tag,$dist) {
my $found = 0;
if (-f "$t/Dockerfile-$l") {
push(@dfiles,"$t/Dockerfile-$l");
$found = 1;
}
if (-f "$t/$l.sh") {
push(@runscripts,"$t/$l.sh");
$found = 1;
}
if ($found) {
if (-f "$t/$l-artifacts.sh") {
push(@artifactscripts,"$t/$l-artifacts.sh");
}
#
# Ok, we found instructions for this level, so
# skip to the next level.
#
last;
}
}
}
#
# Now look for init-related goo. We install all inits that
# we know about that apply to this mintag/tag/dist/common.
#
for my $init ('runit','systemd','upstart','init') {
for my $t ($mintag,$tag,$dist) {
my $found = 0;
if (-f "$t/Dockerfile-$init") {
push(@dfiles,"$t/Dockerfile-$init");
$found = 1;
}
if (-f "$t/$init.sh") {
push(@runscripts,"$t/$init.sh");
$found = 1;
}
if ($found) {
if (-f "$t/$init-artifacts.sh") {
push(@artifactscripts,"$t/$init-artifacts.sh");
}
#
# Ok, we found instructions for this level, so
# skip to the next level.
#
last;
}
}
}
for my $l ('cleanup') {
for my $t ($mintag,$tag,$dist) {
my $found = 0;
if (-f "$t/$l.sh") {
push(@runscripts,"$t/$l.sh");
#
# Ok, we found instructions for this level, so
# skip to the next level.
#
last;
}
}
}
chdir($cwd);
#
# Ok, we create a context dir that has two things. First,
# it has an artifacts subdir. Dockerfile fragments are
# responsible to copy stuff from artifacts into place. The
# fs/ subdir is intended to be a root filesystem fragment.
# Anything in it is automatically copied to the image
# rootfs. The fs/ subdir is populated from $DOCKERFILES as
# follows. First, each mintag/tag/dist/common subdir in
# DOCKERFILES is copied into fs/etc/emulab/CONTEXT --
# excluding any fs subdir in the mintag/tag/dist/common
# subdirs. Those fs subdirs are copied into the primary fs
# subdir, *in reverse order* (so that the most specific can
# overwrite the least specific). This is the best way to
# minimize layers -- i.e., to have a single COPY
# instruction, and a single RUN instruction, for two layers
# total. Ugh!
#
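#
# As an illustrative sketch (assuming an image whose analysis yielded
# dist/tag/mintag of ubuntu/ubuntu16/ubuntu16.04), the context dir built
# below ends up looking roughly like:
#
#   $CONTEXTDIR/<newimagecdirname>/
#     Dockerfile
#     artifacts/                  (filled in by the *-artifacts.sh scripts)
#     fs/etc/ssh/ssh_host*        (copied from the host)
#     fs/etc/emulab/*.pem
#     fs/etc/emulab/CONTEXT/{common,ubuntu,ubuntu16,ubuntu16.04}/...
#     fs/...                      (merged fs/ subdirs, least to most specific)
#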
my $cdir = "$CONTEXTDIR/$newimagecdirname";
my $adir = "$cdir/artifacts";
my $hdir = "$cdir/fs";
mkdir($cdir);
mkdir($adir);
mkdir($hdir);
mkdir("$hdir/etc");
mkdir("$hdir/etc/ssh");
mkdir("$hdir/etc/emulab");
mkdir("$hdir/etc/emulab/CONTEXT");
mysystem2("rsync -a /etc/ssh/ssh_host* $hdir/etc/ssh/");
mysystem2("rsync -a /etc/emulab/*.pem $hdir/etc/emulab/");
for my $dir (@copydirs) {
mysystem2("rsync -a --exclude=$DOCKERFILES/$dir/fs".
" $DOCKERFILES/$dir $hdir/etc/emulab/CONTEXT/");
if (-d "$DOCKERFILES/$dir/fs") {
mysystem2("rsync -a $DOCKERFILES/$dir/fs/ $hdir/");
}
}
#
# Before we start setting up the new image Dockerfile, run
# all the artifact build scripts.
#
foreach my $ascript (@artifactscripts) {
my %args = ( 'Tty' => JSON::PP::true);
$args{'HostConfig'}{'Binds'} = [
"$hdir/etc/emulab/CONTEXT:/etc/emulab/CONTEXT:ro",
"$adir:/artifacts:rw",
"$EMULABSRC:/emulab:ro",
"$PUBSUBSRC:/pubsub:ro",
"$RUNITSRC:/runit:ro"
];
$args{'Env'} = [
"DESTDIR=/artifacts","EMULABSRC=/emulab","PUBSUBSRC=/pubsub",
"RUNITSRC=/runit","CONTEXT=/etc/emulab/CONTEXT"
];
$args{'Image'} = $image;
$args{'Cmd'} = ["/bin/sh","-c","cd \$CONTEXT && $ascript"];
my $tmpname = "artifact-".sha1_hex($image . rand(POSIX::INT_MAX));
TBDebugTimeStamp("creating artifact container $tmpname for".
" artifact script $ascript...");
($code,$content) = getClient()->container_create(
$tmpname,\%args);
if ($code) {
warn("failed to create image analysis container $tmpname".
" for image $image: $content ($code); aborting\n");
goto badimage;
}
TBDebugTimeStamp("starting artifact container $tmpname");
($code,$content) = getClient()->container_start($tmpname);
if ($code) {
warn("failed to start artifact container $tmpname".
" for image $image: $content ($code); aborting\n");
goto badimage;
}
open(our $fd,">$cdir-$tmpname.log");
sub log_printer {
my ($data,$foo,$resp) = @_;
print $data;
if (defined($fd)) {
print $fd $data;
}
}
# Purely for real-time logging purposes.
TBDebugTimeStamp("attaching to artifact container $tmpname;".
" stdout/stderr from container will follow...");
getClient()->container_attach($tmpname,1,1,0,1,1,1,\&log_printer);
close($fd);
TBDebugTimeStamp("waiting for artifact container $tmpname to stop");
($code,$content) = getClient()->container_wait($tmpname);
print STDERR "DEBUG: $content " . ref($content) . "\n";
if ($code) {
warn("failed to wait for artifact container $tmpname".
" for image $image: $content ($code); aborting\n");
goto badimage;
}
elsif (ref($content) eq 'ARRAY') {
foreach my $blurb (@$content) {
if (ref($blurb) eq 'HASH'
&& exists($blurb->{'StatusCode'})
&& $blurb->{'StatusCode'}) {
warn("image artifact container $tmpname,image $image".
" exited non-zero (".$blurb->{'StatusCode'}.");".
" aborting\n");
goto badimage;
}
}
}
elsif (ref($content) eq 'HASH'
&& exists($content->{'StatusCode'})
&& $content->{'StatusCode'}) {
warn("image artifact container $tmpname,image $image".
" exited non-zero (".$content->{'StatusCode'}.");".
" aborting\n");
goto badimage;
}
TBDebugTimeStamp("removing artifact container $tmpname");
($code,$content) = getClient()->container_delete($tmpname);
if ($code) {
warn("failed to delete artifact script container".
" $tmpname,image $image: $content ($code);".
" ignoring!\n");
}
}
my $dockerfile = "$cdir/Dockerfile";
open(DFD,">$dockerfile")
or fatal("could not open $dockerfile!");
#
# First, we are descended FROM the base image.
#
print DFD "FROM $image\n\n";
#
# Then, if this is emulabization core or full, add an
# ONBUILD instruction that runs our prepare script. And we
# *always* save off new versions of the master passwd files.
#
if ($emulabization eq DOCKER_EMULABIZE_CORE()
|| $emulabization eq DOCKER_EMULABIZE_FULL()) {
print DFD "ONBUILD RUN /usr/local/etc/emulab/prepare -M\n\n";
}
#
# Second, copy in all the Dockerfile fragments.
#
$cwd = getcwd();
chdir($DOCKERFILES);
foreach my $f (@dfiles) {
open(FD,"$f")
or fatal("could not open $f to copy into $dockerfile");
my @lines = <FD>;
close(FD);
print DFD join("",@lines)."\n\n";
}
chdir($cwd);
#
# Next create COPY and RUN commands.
#
print DFD "COPY fs/ /\n";
my $runcmd = "";
foreach my $ruc (@runscripts) {
my $dn = dirname($ruc);
my $bn = basename($ruc);
if ($runcmd ne '') {
$runcmd .= " && ";
}
#$runcmd .= "cd /tmp/$dn && ./$bn && cd /tmp";
$runcmd .= "cd /etc/emulab/CONTEXT && $ruc";
}
if ($runcmd ne '') {
$runcmd .= " && ";
}
$runcmd .= "mkdir -p /etc/emulab".
" && echo $newzation > /etc/emulab/emulabization-type";
#
# If we are updating the Emulabization *or* if we are
# Emulabizing for the first time, *always* overwrite the
# Emulab master passwd files with the image's real files.
# This ensures that the client-install only temporarily
# overwrites the Emulab master passwd files; *and* ensures
# we always use the image's files. We cannot trust the
# Emulab per-distro/per-version passwd files from git, since
# they may not match what's installed; this is not our
# image.
#
if ($update
|| $curzation eq '' || $curzation eq DOCKER_EMULABIZE_NONE()) {
$runcmd .= " && cp -pv /etc/passwd /etc/group /etc/shadow /etc/gshadow /etc/emulab";
}
print DFD "RUN /bin/sh -c '$runcmd'\n\n";
close(DFD);
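#
# At this point the generated Dockerfile is, roughly (an illustrative
# sketch for a "core" emulabization; fragment contents vary by distro):
#
#   FROM <base image>
#   ONBUILD RUN /usr/local/etc/emulab/prepare -M
#   <contents of the matched Dockerfile-* fragments>
#   COPY fs/ /
#   RUN /bin/sh -c 'cd /etc/emulab/CONTEXT && <level>.sh && <init>.sh \
#       && mkdir -p /etc/emulab \
#       && echo core > /etc/emulab/emulabization-type ...'
#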
# We could just send the bytes to the daemon (tar -C $cdir -c . |),
# but we want to store the file on disk for provenance.
my $tarfile = "$cdir-context-" . time() . ".tar";
mysystem2("tar -cf $tarfile -C $cdir .");
if ($?) {
warn("failed to build tar archive of context dir $cdir;".
" aborting!\n");
goto badimage;
}
TBDebugTimeStamp("building new image $newimage");
my $buf = '';
open(our $fd,">$cdir-build.log");
our $bytes = 0;
sub json_log_printer {
my ($data,$foo,$resp) = @_;
if ($resp->header("content-type") eq 'application/json') {
eval {
$data = decode_json($data);
};
if ($@) {
warn("build log_printer: $! $@ ($data)\n");
}
}
print $data;
print $fd $data;
$bytes += length($data);
}
($code,$content) = getClient()->image_build_from_tar_file(
$tarfile,$newimage,undef,undef,\&json_log_printer);
close($fd);
if ($code) {
warn("failed to build $newimage from $image: $content ($code)!");
goto badimage;
}
if ($bytes == 0) {
open(FD,">$cdir-build.log");
if (defined($content) && ref($content) eq 'ARRAY'
&& defined($content->[0]) && ref($content->[0]) eq 'HASH'
&& defined($content->[0]->{'stream'})) {
foreach my $bit (@$content) {
next
if (!defined($bit->{'stream'}));
print FD $bit->{'stream'};
}
}
elsif (defined(ref($content)) && ref($content) ne '') {
print FD Dumper($content);
}
else {
print FD $content;
}
close(FD);
}
}
#
# Unlock the emulabized image.
#
TBScriptUnlock();
if (defined($newimageref)) {
$$newimageref = $newimage;
}
if (defined($newzationref)) {
$$newzationref = $emulabization;
}
if (defined($iattrsref)) {
$$iattrsref = \%iattrs;
}
return 0;
badimage:
TBScriptUnlock();
return -1;
}
sub setupImage($$$$$$$$$$)
{
my ($vnode_id,$vnconfig,$private,$image,$username,$password,
$newimageref,$newcreateargsref,$newcmdref,$newzationref) = @_;
my $rc;
my $cwd;
my ($code,$content);
TBDebugTimeStamp("setting up image $image for $vnode_id...");
#
# Check the emulabization value before we do anything, to save work.
# We default to "basic" emulabization.
#
my $emulabization;
my $update = 0;
if (!exists($vnconfig->{'attributes'}->{DOCKER_EMULABIZATION})) {
$emulabization = $vnconfig->{'attributes'}->{DOCKER_EMULABIZATION} =
DOCKER_EMULABIZE_DEFAULT();
}
else {
$emulabization = $vnconfig->{'attributes'}->{DOCKER_EMULABIZATION};
if ($emulabization ne DOCKER_EMULABIZE_FULL()
&& $emulabization ne DOCKER_EMULABIZE_BUILDENV()
&& $emulabization ne DOCKER_EMULABIZE_CORE()
&& $emulabization ne DOCKER_EMULABIZE_BASIC()
&& $emulabization ne DOCKER_EMULABIZE_NONE()
&& $emulabization ne '') {
warn("invalid emulabization ($emulabization) specified for".
" $vnode_id/$image; aborting!");
return -1;
}
}
# Save this off for later reference.
if ($emulabization eq '') {
$emulabization = DOCKER_EMULABIZE_NONE();
}
$vnconfig->{'attributes'}->{DOCKER_EMULABIZATION} = $emulabization;
if (exists($vnconfig->{'attributes'}->{DOCKER_EMULABIZATION_UPDATE})) {
$update = $vnconfig->{'attributes'}->{DOCKER_EMULABIZATION_UPDATE};
}
#
# Pull the image, according to the policy. Force the pull policy to
# the latest for shared nodes.
#
if (SHAREDHOST()) {
$vnconfig->{'attributes'}->{DOCKER_PULLPOLICY} =
DOCKER_PULLPOLICY_LATEST();
}
my $pullpolicy;
if (exists($vnconfig->{'attributes'}->{DOCKER_PULLPOLICY})) {
$pullpolicy = $vnconfig->{'attributes'}->{DOCKER_PULLPOLICY};
}
if (!defined($pullpolicy)) {
$pullpolicy = DOCKER_PULLPOLICY_CACHED();
}
# Save off a read-only version of this, for convenience.
my %vnattrs = %{$vnconfig->{'attributes'}};
my $newimage;
my $iattrs;
$rc = emulabizeImage($image,\$newimage,$emulabization,$newzationref,$update,
$pullpolicy,$username,$password,\$iattrs);
if ($rc) {
warn("failed to emulabize image $image; aborting!\n");
return $rc;
}
#print "DEBUG: setupImage iattrs = ".Dumper($iattrs)."\n";
#print "DEBUG: setupImage ".$iattrs->{'DIST'}.",".$iattrs->{'TAG'}.",".$iattrs->{'MINTAG'}."\n";
my ($dist,$tag,$mintag) =
($iattrs->{'DIST'},$iattrs->{'TAG'},$iattrs->{'MINTAG'});
#
# If we're not emulabizing, we don't mess with the cmd or
# entrypoint, for now.
#
if ($emulabization eq DOCKER_EMULABIZE_NONE()) {
$$newimageref = $newimage;
$$newcreateargsref = {};
$$newcmdref = {};
return 0;
}
#
# Ok. Now figure out any changes to the 'docker create ...' command
# line that we might need -- i.e. to change the command, or env
# vars, or the container stop signal.
#
# If emulabized, they are using our init. If a shared host, we need
# to force runit. If a dedicated host, we can do whichever they
# want. Either way, if we made the image, we *always* specify a
# custom init path, and a custom stop signal. Thus, we always get
# the init we want.
#
# If not emulabized, but if we detect they are using a real init,
# and that init is systemd, AND that the host is *not* shared, we
# need to change the command, the stopsignal, set container=docker,
# mount the cgroupfs ro.
#
my $init = DOCKER_INIT_RUNIT();
if (((!exists($vnattrs{DOCKER_INIT}) && 0)
|| (exists($vnattrs{DOCKER_INIT})
&& $vnattrs{DOCKER_INIT} eq DOCKER_INIT_INSTALLED()))
&& exists($iattrs->{INITPROG})
&& $iattrs->{INITPROG} ne '') {
$init = $iattrs->{INITPROG};
}
elsif (exists($vnattrs{DOCKER_INIT})
&& $vnattrs{DOCKER_INIT} eq DOCKER_INIT_RUNIT()) {
$init = DOCKER_INIT_RUNIT();
}
else {
$init = DOCKER_INIT_RUNIT();
}
if (SHAREDHOST() && $init eq 'systemd') {
$init = DOCKER_INIT_RUNIT();
warn("forcing init from systemd to $init on sharedhost!");
}
#
# Now look for init-related docker create args and cmd. At this
# point, we know which init we are going to run, so we look only for
# that one. NB: each file must be a JSON dict of args to the
# /containers/create Docker Engine API call.
#
my %initargs = ();
my %initcmd = ();
$cwd = getcwd();
chdir($DOCKERFILES);
TBDebugTimeStamp("entering dir $DOCKERFILES");
for my $t ($mintag,$tag,$dist) {
next
if (!defined($t));
my $found = 0;
TBDebugTimeStamp("looking for $t/Dockercmd-$init");
if (-f "$t/Dockercmd-$init") {
TBDebugTimeStamp("found $t/Dockercmd-$init");
open(FD,"$t/Dockercmd-$init")
or fatal("could not open $t/Dockercmd-$init");
my @lines = <FD>;
close(FD);
my $jref;
eval {
$jref = decode_json(join('',@lines));
require Hash::Merge;
%initcmd = %{Hash::Merge::merge(\%initcmd,$jref)};
if ($debug) {
print STDERR "DEBUG: merged initcmd = ".Dumper(%initcmd)."\n";
}
};
if ($@) {
print STDERR "ERROR: invalid JSON in $t/Dockercmd-$init: $@\n";
goto badimage;
}
$found = 1;
}
TBDebugTimeStamp("looking for $t/Dockerargs-$init");
if (-f "$t/Dockerargs-$init") {
TBDebugTimeStamp("found $t/Dockerargs-$init");
open(FD,"$t/Dockerargs-$init")
or fatal("could not open $t/Dockerargs-$init");
my @lines = <FD>;
close(FD);
my $jref;
eval {
$jref = decode_json(join('',@lines));
require Hash::Merge;
%initargs = %{Hash::Merge::merge(\%initargs,$jref)};
if ($debug) {
print STDERR "DEBUG: merged initargs = ".Dumper(%initargs)."\n";
}
};
if ($@) {
print STDERR "ERROR: invalid JSON in $t/Dockerargs-$init: $@\n";
goto badimage;
}
$found = 1;
}
}
if (keys(%initcmd) == 0) {
chdir($cwd);
warn("could not assemble init command; bug!");
goto badimage;
}
chdir($cwd);
$$newimageref = $newimage;
$$newcreateargsref = \%initargs;
$$newcmdref = \%initcmd;
return 0;
badimage:
return -1;
}
#
# "Save" a docker (libcontainer) network namespace so that we can
# customize it (i.e., add/remove devices) without those devices
# disappearing into a black hole if Docker releases its filehandle to
# the netns. On Linux, the only way I know of to preserve a namespace
# for access once all its pids go away is the same way the iproute2
# package does it (bind mount /proc/PID/ns/net into
# /var/run/netns/<name> -- the ip command looks for netnses there).
# So we do the same thing: we bind mount the initial docker container
# pid's netns file onto that file; then later on, the ip command can
# actually be used on it.
#
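#
# Roughly, the equivalent shell sequence is (illustrative sketch; the
# pid comes from the container state we fetch below):
#
#   mkdir -p /var/run/netns
#   touch /var/run/netns/<vnode_id>
#   mount -o bind /proc/<pid>/ns/net /var/run/netns/<vnode_id>
#
# after which e.g. "ip netns exec <vnode_id> ip link show" works.
#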
sub bindNetNS($$)
{
my ($vnode_id,$private) = @_;
my ($code,$content);
my $cpid;
# First get the container pid:
($code,$content) = getClient()->container_state($vnode_id);
if ($code) {
warn("could not find init pid of container $vnode_id; aborting".
" ($content ($code))\n");
return -1;
}
$cpid = $content->{"Pid"};
chomp($cpid);
# Check to see if a stale mount exists for this container;
# delete if so.
if (-f "/var/run/netns/$vnode_id") {
TBDebugTimeStamp("removing stale network namespace for $vnode_id".
" prior to copy")
if ($debug);
unbindNetNS($vnode_id,$private);
}
# Now do the bind mount (after creating the runtime netns
# dir, and the file we'll mount to):
if (! -d "/var/run/netns") {
mkdir("/var/run/netns");
}
if (! -f "/var/run/netns/$vnode_id") {
open(MFD,">/var/run/netns/$vnode_id");
close(MFD);
}
# Grab the container pid; we need a pid to find the file
# representing the netns (which in docker/libcontainer world is only
# /proc/<pid>/ns/net):
($code,$content) = getClient()->container_state($vnode_id);
if ($code) {
warn("could not find init pid of container $vnode_id; aborting".
" ($content ($code))\n");
return -1;
}
$cpid = $content->{"Pid"};
chomp($cpid);
# Now do the bind mount:
mysystem2("mount -o bind /proc/$cpid/ns/net /var/run/netns/$vnode_id");
return $? >> 8;
}
# Move a network device from the root netns *into* a vnode (container) netns.
sub moveNetDeviceToNetNS($$$)
{
my ($vnode_id,$private,$dev) = @_;
mysystem2("ip link $dev set netns $vnode_id");
if (!$?) {
if (!exists($private->{"rawnetdevs"})) {
$private->{"rawnetdevs"} = {};
}
$private->{"rawnetdevs"}{"$dev"} = $dev;
}
return $? >> 8;
}
# Move a network device *from* a vnode (container) netns into the root netns.
sub moveNetDeviceFromNetNS($$$)
{
my ($vnode_id,$private,$dev) = @_;
if (!exists($private->{"rawnetdevs"})
or !exists($private->{"rawnetdevs"}{"$dev"})) {
warn("device $dev not in our data structures for $vnode_id;".
" attempting removal from netns anyway!");
}
mysystem2("ip netns exec $vnode_id ip link $dev set netns 1");
if ($?) {
warn("device $dev not in $vnode_id netns; removing from".
" our data structures anyway!");
}
delete($private->{"rawnetdevs"}{"$dev"});
return $? >> 8;
}
# Move our devices (if any) out of the netns into the root, umount the
# netns bind mount, and remove the mount point.
sub unbindNetNS($$)
{
my ($vnode_id,$private) = @_;
if (! -f "/var/run/netns/$vnode_id") {
warn("container $vnode_id does not appear to have a bound netns!");
return -1;
}
if (exists($private->{"rawnetdevs"})) {
my @devs = keys(%{$private->{"rawnetdevs"}});
foreach my $dev (@devs) {
moveNetDeviceFromNetNS($vnode_id,$private,$dev);
}
}
mysystem2("umount /var/run/netns/$vnode_id");
unlink("/var/run/netns/$vnode_id");
return 0;
}
#
# Returns a list of <hostname>:<IP> pairs (and <alias>:<IP> pairs) in the same
# order as Emulab's classic /etc/hosts generation (i.e. genhostsfile).
#
# XXX: note that this does not support the new /etc/hosts.{head,tail}
# stuff; to do that, we'd have to grab it out of the image, and we don't
# do that for now.
#
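#
# For example (illustrative), a tmcd hosts line like
#   NAME=node1 IP=10.1.1.2 ALIASES='node1-lan0'
# becomes the pairs "node1:10.1.1.2" and "node1-lan0:10.1.1.2", following
# the initial "localhost:127.0.0.1" and "loghost:127.0.0.1" entries.
#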
sub genhostspairlist($$)
{
my ($vnode_id,$rptr) = @_;
my @tmccresults;
#
# First see if we have a topo file; we can generate our own hosts
# file if we do, saving a lot of load on tmcd in big experiments.
#
my $mapfile = "$VMS/$vnode_id/hostmap";
if (genhostslistfromtopo($mapfile,\@tmccresults) < 0 &&
tmcc(TMCCCMD_HOSTS,undef,\@tmccresults) < 0) {
warn("Could not get hosts file from server!");
@$rptr = ();
return -1;
}
# If no results then return nothing.
if (!@tmccresults) {
@$rptr = ();
return 0;
}
#
# First, write a localhost line into the hosts file - we have to know the
# domain to use here, so that we can qualify the localhost entry
#
# XXX: getting the vnode's domain is harder for shared nodes, so just don't
# do this for now...
#
#my $hostname = `hostname`;
#my $ldomain;
#if ($hostname =~ /[^.]+\.(.+)/) {
# $ldomain = "localhost.$1:127.0.0.1";
#}
@$rptr = ( "localhost:127.0.0.1","loghost:127.0.0.1" );
#push(@$rptr,$ldomain)
# if (defined($ldomain));
#
# Now convert each hostname into hosts file representation and write
# it to the hosts file. For docker, we have to explode these out into
# : pairs; we can't set aliases.
#
my $pat = q(NAME=([-\w\.]+) IP=([0-9\.]*) ALIASES=\'([-\w\. ]*)\');
foreach my $str (@tmccresults) {
if ($str =~ /$pat/) {
my $name = $1;
my $ip = $2;
my $aliases = $3;
push(@$rptr,"${name}:${ip}");
foreach my $alias (split(/\s+/,$aliases)) {
push(@$rptr,"${alias}:${ip}");
}
}
else {
warn("Ignoring bad hosts line: $str");
}
}
return 0;
}
#
# For docker, we handle mounts at container creation by mounting
# everything necessary on the physical host, then bind-mounting it into
# the container (which docker does for us as it starts up the
# container). This function ensures necessary things are mounted on the
# host, then returns a list of bind-mounts that should be configured for
# the container.
#
# On a remote host, we do nothing. On a dedicated local host, we do the
# same thing that rc.mounts -j does -- we mount all NFS mounts already
# mounted on the physical host. However, on a shared local host, we do
# things slightly differently than rc.mounts -j would: since all our
# docker mounts must be bind mounts, we have to mount all those mounts
# in the host, track who's using them, and umount them as containers
# are destroyed and the refcnts go to zero.
#
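#
# The hash returned via the second arg maps each NFS remote spec to its
# local mount point, e.g. (illustrative) "fs.example.org:/q/proj/myproj"
# => "/proj/myproj"; the caller turns those paths into container bind
# mounts.
#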
my $MOUNTDB = "$VARDIR/db/mountdb";
my $MOUNTREFDB = "$VARDIR/db/mountrefdb";
sub addMounts($$)
{
my ($vnode_id,$retref,) = @_;
my $JAILDB = CONFDIR() . "/mountdb";
my $mountstr;
my %MDB;
my %MRDB;
my %JDB;
my ($mdb_open,$mrdb_open,$jdb_open) = (0,0,0);
my $ret;
#
# No mounts on remote nodes.
#
if (REMOTE()) {
%$retref = ();
return 0;
}
#
# If this is a shared node, we need to lock. Oh, whatever, let's
# just do it regardless...
#
TBDebugTimeStamp("setupMounts: grabbing lock $GLOBAL_MOUNT_LOCK")
if ($lockdebug);
if (TBScriptLock($GLOBAL_MOUNT_LOCK,
TBSCRIPTLOCK_INTERRUPTIBLE(), 900) != TBSCRIPTLOCK_OKAY()){
print STDERR "Could not get the mount lock!\n";
return -1;
}
TBDebugTimeStamp(" got mount lock")
if ($lockdebug);
#
# Open the main mount databases (the mount tracker, and the refcnter).
#
if (! dbmopen(%MDB, $MOUNTDB, 0644)) {
warn("Could not open $MOUNTDB!\n");
goto bad;
}
$mdb_open = 1;
if (! dbmopen(%MRDB, $MOUNTREFDB, 0644)) {
warn("Could not open $MOUNTREFDB!\n");
goto bad;
}
$mrdb_open = 1;
#
# First-time init code. Basically, if the phys host has any NFS
# volumes mounted, i.e., as it would for a dedicated or shared local
# host, we need to mark those in use if they're not already
# refcnt'd. This will ensure that they don't get umounted if no
# vnode is using them any longer. We really should do this in the
# global one-time init, but I'm here now.
#
while (my ($remote, $path) = each %MDB) {
if (!defined($MRDB{$remote})) {
$MRDB{$remote} = 1;
}
TBDebugTimeStamp("initialized global host mount $remote at $path")
if ($debug);
}
#
# We mount all the mounts that the physical node has mounted, into
# the jail/VM. Since this is not going to be updated (no support for
# updating mounts yet), we construct a copy of the DB in the jail
# directory so we know exactly what we did when it comes time to do
# the unmounts. In the meantime, the physical node might change its
# mount set, but we will not care (of course, if a dir is unmounted
# from the pnode, something is probably going to break).
#
my %MOUNTS = ();
if (! dbmopen(%JDB, $JAILDB, 0644)) {
warn("Could not create $JAILDB!\n");
goto bad;
}
$jdb_open = 1;
if (SHAREDHOST()) {
my @tmccresults;
if (tmcc(TMCCCMD_MOUNTS, undef, \@tmccresults) < 0) {
warn("Could not get mount info from server!\n");
goto bad;
}
foreach my $str (@tmccresults) {
if ($str =~ /^REMOTE=([-:\@\w\.\/]+) LOCAL=([-\@\w\.\/]+)/) {
$MOUNTS{$1} = $2;
}
else {
warn("Unparseable tmcd mount string '$str'!\n");
}
}
while (my ($remote, $path) = each %MOUNTS) {
if (defined($MRDB{$remote})) {
$MRDB{$remote} += 1;
TBDebugTimeStamp("$vnode_id using existing $remote")
if ($debug);
}
else {
if (! -e $path) {
if (! os_mkdir($path, "0770")) {
warning("Could not make directory $path");
next;
}
}
print STDOUT " Mounting $remote on $path\n";
if (system("$NFSMOUNT $remote $path")) {
warning("Could not $NFSMOUNT $remote on $path");
next;
}
TBDebugTimeStamp("$vnode_id using new $remote")
if ($debug);
$MRDB{$remote} = 1;
}
# Record the container as using this remote.
$JDB{$remote} = $path;
}
}
else {
while (my ($remote, $path) = each %MDB) {
$MOUNTS{$remote} = $path;
$MRDB{$remote} += 1;
TBDebugTimeStamp("$vnode_id using $remote")
if ($debug);
# Record the container as using this remote.
$JDB{$remote} = $path;
}
}
# Populate our retref hash:
%$retref = ();
while (my ($remote, $path) = each %JDB) {
$retref->{$remote} = $path;
}
$ret = 0;
out:
dbmclose(%JDB)
if ($jdb_open);
dbmclose(%MDB)
if ($mdb_open);
dbmclose(%MRDB)
if ($mrdb_open);
TBScriptUnlock();
return $ret;
bad:
$ret = -1;
goto out;
}
sub removeMounts($)
{
my ($vnode_id,) = @_;
my $JAILDB = CONFDIR() . "/mountdb";
my $mountstr;
my %MDB;
my %MRDB;
my %JDB;
my ($mdb_open,$mrdb_open,$jdb_open) = (0,0,0);
my $ret;
#
# No mounts on remote nodes.
#
if (REMOTE()) {
return 0;
}
#
# If this is a shared node, we need to lock. Oh, whatever, let's
# just do it regardless...
#
TBDebugTimeStamp("addMounts: grabbing lock $GLOBAL_MOUNT_LOCK")
if ($lockdebug);
if (TBScriptLock($GLOBAL_MOUNT_LOCK,
TBSCRIPTLOCK_INTERRUPTIBLE(), 900) != TBSCRIPTLOCK_OKAY()){
print STDERR "Could not get the mount lock!\n";
return -1;
}
TBDebugTimeStamp(" got mount lock")
if ($lockdebug);
#
# Open the main mount databases (the mount tracker, and the refcnter).
#
if (! dbmopen(%MDB, $MOUNTDB, 0644)) {
warn("Could not open $MOUNTDB!\n");
goto bad;
}
$mdb_open = 1;
if (! dbmopen(%MRDB, $MOUNTREFDB, 0644)) {
warn("Could not open $MOUNTREFDB!\n");
goto bad;
}
$mrdb_open = 1;
#
# Walk the copy of the mount DB that addMounts() stashed in the jail
# directory, decrement the refcnt for each mount this container was
# using, and umount (and forget) anything whose refcnt drops to zero.
#
my %MOUNTS = ();
if (! dbmopen(%JDB, $JAILDB, 0644)) {
warn("Could not create $JAILDB!\n");
goto bad;
}
$jdb_open = 1;
while (my ($remote, $path) = each %JDB) {
if (!defined($MRDB{$remote})) {
warn("$vnode_id had $remote mounted, but not in refcnt db;".
" skipping!\n");
next;
}
else {
$MRDB{$remote} -= 1;
TBDebugTimeStamp("reduced refcnt for $remote to ".$MRDB{$remote}.
" ($vnode_id)")
if ($debug);
if ($MRDB{$remote} == 0) {
TBDebugTimeStamp("$vnode_id: unmounting 0-refcnt mount $remote")
if ($debug);
system("umount $remote");
delete($MRDB{$remote});
delete($MDB{$remote});
}
}
}
$ret = 0;
out:
dbmclose(%JDB)
if ($jdb_open);
dbmclose(%MDB)
if ($mdb_open);
dbmclose(%MRDB)
if ($mrdb_open);
TBScriptUnlock();
return $ret;
bad:
$ret = -1;
goto out;
}
#
# This is almost directly out of rc.route, but there are several key
# differences. First, our setup is such that we install the routes from
# outside the container, because we might not have the `route` or `ip`
# binaries inside the container! Second, all our commands must be run
# in the container's netns, so all commands get prefixed with that.
# Finally, we don't support gated or ospf etc.
#
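#
# The generated CONFDIR()/routing.sh is, roughly (illustrative sketch):
#
#   case "$1" in
#     <srcip>) case "$2" in
#       up)   ip netns exec <vnode_id> <add each route for that srcip> ;;
#       down) ip netns exec <vnode_id> <del each route for that srcip> ;;
#     esac ;;
#     enable)         ip netns exec <vnode_id> <turn on IP forwarding> ;;
#     enable-routes)  <add all routes> ;;
#     disable-routes) <del all routes> ;;
#   esac
#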
sub CreateRoutingScripts($$)
{
my ($vnode_id,$private) = @_;
my @routes = ();
my $type = 0;
my %upmap = ();
my %downmap = ();
print STDOUT "Checking Testbed route configuration ... \n";
if (getrouterconfig(\@routes, \$type)) {
warn("Could not get router configuration from libsetup!");
return -1;
}
my $script = CONFDIR()."/routing.sh";
#
# Always generate a script file since other scripts depend on it,
# even if no routing was requested (ifconfig, tunnel config).
#
unlink($script);
if (!open(RC, ">$script")) {
fatal("Could not open $script: $!\n");
}
print RC "#!/bin/sh\n";
print RC "# auto-generated by $0, DO NOT EDIT\n";
if ($type eq "none") {
print RC "true\n";
close(RC);
chmod(0755, $script);
return 0;
}
#
# Now convert static route info into OS route commands
# Also check for use of gated/manual and remember it.
#
my $usegated = (($type eq "gated" || $type eq "ospf") ? 1 : 0);
if ($usegated) {
warn("gated/ospf style of routing not supported in Docker containers!");
return -1;
}
my $usemanual = (($type eq "manual" ||
$type eq "static" || $type eq "static-old") ? 1 : 0);
foreach my $rconfig (@routes) {
my $dip = $rconfig->{"IPADDR"};
my $rtype = $rconfig->{"TYPE"};
my $dmask = $rconfig->{"IPMASK"};
my $gate = $rconfig->{"GATEWAY"};
my $cost = $rconfig->{"COST"};
my $sip = $rconfig->{"SRCIPADDR"};
my $rcline;
if (! defined($upmap{$sip})) {
$upmap{$sip} = [];
$downmap{$sip} = [];
}
$rcline = os_routing_add_manual($rtype, $dip,
$dmask, $gate, $cost, undef);
push(@{$upmap{$sip}}, $rcline);
$rcline = os_routing_del_manual($rtype, $dip,
$dmask, $gate, $cost, undef);
push(@{$downmap{$sip}}, $rcline);
}
my $prefix = "ip netns exec $vnode_id ";
print RC "case \"\$1\" in\n";
foreach my $arg (keys(%upmap)) {
print RC " $arg)\n";
print RC " case \"\$2\" in\n";
print RC " up)\n";
foreach my $rcline (@{$upmap{$arg}}) {
print RC " ${prefix}$rcline\n";
}
print RC " ;;\n";
print RC " down)\n";
foreach my $rcline (@{$downmap{$arg}}) {
print RC " ${prefix}$rcline\n";
}
print RC " ;;\n";
print RC " esac\n";
print RC " ;;\n";
}
print RC " enable)\n";
#
# Turn on IP forwarding
#
print RC " ${prefix}" . os_routing_enable_forward() . "\n";
print RC " ;;\n";
#
# For convenience, allup and alldown.
#
print RC " enable-routes)\n";
foreach my $arg (keys(%upmap)) {
foreach my $rcline (@{$upmap{$arg}}) {
print RC " ${prefix}$rcline\n";
}
}
print RC " ;;\n";
print RC " disable-routes)\n";
foreach my $arg (keys(%downmap)) {
foreach my $rcline (@{$downmap{$arg}}) {
print RC " ${prefix}$rcline\n";
}
}
print RC " ;;\n";
print RC "esac\n";
print RC "exit 0\n";
close(RC);
chmod(0755, $script);
return 0;
}
sub RunRoutingScripts($$)
{
my ($vnode_id,$updown) = @_;
my $script = CONFDIR()."/routing.sh";
if (! -e $script) {
TBDebugTimeStamp("RunRoutingScripts: no $script file!")
if ($debug);
return undef;
}
else {
TBDebugTimeStamp("RunRoutingScripts: $script:")
if ($debug);
my $cmd = $script;
my @output;
my $ret;
if ($updown) {
@output = system("/bin/sh $cmd enable-routes");
$ret = $? >> 8;
push(@output,system("/bin/sh $cmd enable"));
$ret |= $? >> 8;
}
else {
@output = system("/bin/sh $cmd disable-routes");
$ret = $? >> 8;
}
TBDebugTimeStamp("ret = $ret\n".join("\n",@output))
if ($debug);
return $ret;
}
}
sub RunShapingScripts($$)
{
my ($vnode_id,$updown) = @_;
my $script = CONFDIR()."/shaping-";
if ($updown) {
$script .= "up.sh";
}
else {
$script .= "down.sh";
}
if (! -e $script) {
TBDebugTimeStamp("RunShapingScripts: no $script file!")
if ($debug);
return undef;
}
else {
TBDebugTimeStamp("RunShapingScripts: $script:")
if ($debug);
my @output = system("/bin/sh $script");
my $ret = $? >> 8;
TBDebugTimeStamp("ret = $ret\n".join("\n",@output))
if ($debug);
return $ret;
}
}
#
# Create scripts to enable and disable the endnodeshaping for a
# particular node. These scripts are not static -- they must figure out
# which veths the docker runtime has assigned to each vnode. They do
# this by running the `ip` command within the vnode's netns, finding the
# link associated with the vmac that is getting shaped, and then finding
# the peer device in the root context. At that point, they can setup
# traffic shaping in the root context. But it's a dynamic thing that
# changes each time the docker container boots.
#
# Also, note that we only support netem.
#
# Finally, if you set $ingress=0, that will disable output of ingress
# code EVEN IF it is a duplex link (i.e. we would normally use ingress
# shaping).
#
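#
# The generated shaping-up.sh is, roughly (illustrative sketch of the
# egress leg only; the ingress/IFB leg is analogous for duplex links):
#
#   IFIDX=`ip netns exec $vnodeid ip -br link show | sed ...$vmac...`
#   VETH=`ip link show | sed ...$IFIDX...`
#   tc qdisc add dev $VETH handle 1 root htb default 1
#   tc class add dev $VETH classid 1:1 parent 1 htb rate $bw ceil $bw
#   tc qdisc add dev $VETH handle 2 parent 1:1 netem drop $plr delay ${delay}us
#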
sub CreateShapingScripts($$$$;$)
{
my ($vnode_id,$private,$node_ifs,$node_lds,$ingress) = @_;
if (!defined($ingress)) {
$ingress = 1;
}
my $uscript = CONFDIR()."/shaping-up.sh";
my $dscript = CONFDIR()."/shaping-down.sh";
if (! open(UFILE, ">$uscript")) {
print STDERR "Error creating $uscript: $!\n";
return -1;
}
if (! open(DFILE, ">$dscript")) {
print STDERR "Error creating $dscript: $!\n";
return -1;
}
print UFILE "#!/bin/sh\n\n";
print DFILE "#!/bin/sh\n\n";
print UFILE "set -x\n\n";
print DFILE "set -x\n\n";
my @ucmds = ();
my @dcmds = ();
my $cstr;
$cstr = "vnodeid=$vnode_id";
push(@ucmds,$cstr);
push(@dcmds,$cstr);
foreach my $ifc (@$node_ifs) {
#
# Find associated delay info
#
my $ldinfo;
foreach my $ld (@$node_lds) {
if ($ld->{"IFACE"} eq $ifc->{"MAC"}) {
$ldinfo = $ld;
}
}
next
if (!defined($ldinfo));
my $type = $ldinfo->{TYPE};
my $linkname = $ldinfo->{LINKNAME};
#my $vnode = $ldinfo->{VNODE};
#my $inet = $ldinfo->{INET};
#my $mask = $ldinfo->{MASK};
my $pipeno = $ldinfo->{PIPE};
my $delay = $ldinfo->{DELAY};
my $bw = $ldinfo->{BW};
my $plr = $ldinfo->{PLR};
my $rpipeno = $ldinfo->{RPIPE};
my $rdelay = $ldinfo->{RDELAY};
my $rbw = $ldinfo->{RBW};
my $rplr = $ldinfo->{RPLR};
#
# Delays are floating point numbers (unit is ms). ipfw does not
# support floats, so apply a cheesy rounding function to convert
# to an integer (since perl does not have a builtin way to
# properly round a floating point number to an integer).
#
# NB: Linux doesn't support floats either, and wants usecs.
#
$delay = int($delay + 0.5) * 1000;
$rdelay = int($rdelay + 0.5) * 1000;
#
# Sweet! 'k' as in "kbit" means 1024, not 1000, to tc.
# Just spell it out as bits here, they can't screw that up!
#
$bw *= 1000;
$rbw *= 1000;
# Packet loss in netem is percent.
$plr *= 100;
#
# Set some variables enabling us to dynamically find the veth in
# the root context when this runs.
#
$cstr = "vip=$ifc->{IPADDR}";
push(@ucmds,$cstr);
push(@dcmds,$cstr);
$cstr = "vmac=".fixupMac($ifc->{MAC});
push(@ucmds,$cstr);
push(@dcmds,$cstr);
if ($ingress && $type eq 'duplex') {
$cstr = "ifb=$ldinfo->{IFB}";
push(@ucmds,$cstr);
push(@dcmds,$cstr);
}
$cstr = "bw=$bw\nplr=$plr\ndelay=$delay";
push(@ucmds,$cstr);
push(@dcmds,$cstr);
$cstr = "rbw=$rbw\nrplr=$rplr\nrdelay=$rdelay";
push(@ucmds,$cstr);
push(@dcmds,$cstr);
#
# Figure out the root context half of the veth peer for this
# docker container. Tricky because Docker doesn't expose this
# info.
#
$cstr = "IFIDX=`ip netns exec \$vnodeid ip -br link show".
" | sed -r -n -e \"s/^[^\@]+\@if([0-9]+).*\$vmac.*\$/\\1/p\"`";
push(@ucmds,$cstr);
push(@dcmds,$cstr);
$cstr = 'if [ "x$IFIDX" = "x" ]; then';
push(@ucmds,$cstr);
push(@dcmds,$cstr);
$cstr = " IFIDX=`ip netns exec \$vnodeid ip -br addr show".
" | sed -r -n -e \"s/^[^\@]+\@if([0-9]+).*\$vip.*\$/\\1/p\"`";
push(@ucmds,$cstr);
push(@dcmds,$cstr);
$cstr = 'fi';
push(@ucmds,$cstr);
push(@dcmds,$cstr);
$cstr = 'if [ "x$IFIDX" = "x" ]; then echo "ERROR: could not find iface $vmac $vnodeid!"; exit 1; fi';
push(@ucmds,$cstr);
push(@dcmds,$cstr);
$cstr = "VETH=`ip link show | sed -r -n -e \"s/^\$IFIDX: ([^\@]+)\@.*\$/\\1/p\"`";
push(@ucmds,$cstr);
push(@dcmds,$cstr);
$cstr = 'if [ "x$VETH" = "x" ]; then echo "ERROR: could not find host veth for container ifidx $IFIDX ($vmac $vnodeid)!"; exit 1; fi';
push(@ucmds,$cstr);
push(@dcmds,$cstr);
push(@ucmds,"\n");
push(@dcmds,"\n");
#
# Ok, finally get to the shaping. First we do the egress case:
#
push(@ucmds, 'tc qdisc del dev $VETH root');
push(@ucmds,'if [ ! $bw -eq 0 ]; then');
push(@ucmds, ' tc qdisc add dev $VETH handle 1 root htb default 1');
push(@ucmds, ' tc class add dev $VETH classid 1:1'.
' parent 1 htb rate $bw ceil $bw');
push(@ucmds,' tc qdisc add dev $VETH handle 2 parent 1:1'.
' netem drop $plr delay ${delay}us');
push(@ucmds,'else');
push(@ucmds,' tc qdisc add dev $VETH handle 1 root'.
' netem drop $plr delay ${delay}us');
push(@ucmds,'fi');
push(@dcmds, 'tc qdisc del dev $VETH root');
if ($ingress && $type eq 'duplex') {
push(@ucmds, 'ifconfig $ifb up');
push(@ucmds, 'tc qdisc del dev $ifb root');
push(@ucmds, 'tc qdisc add dev $VETH handle ffff: ingress');
push(@ucmds, 'tc filter add dev $VETH parent ffff: protocol ip'.
' u32 match u32 0 0 action mirred egress redirect dev $ifb');
push(@ucmds,'if [ ! $bw -eq 0 ]; then');
push(@ucmds,' tc qdisc add dev $ifb root handle 2: htb default 1');
push(@ucmds,' tc class add dev $ifb parent 2: classid 2:1'.
' htb rate $rbw ceil $rbw');
push(@ucmds,' tc qdisc add dev $ifb handle 3 parent 2:1'.
' netem drop $rplr delay ${rdelay}us');
push(@ucmds,'else');
push(@ucmds,' tc qdisc add dev $ifb handle 3 root'.
' netem drop $rplr delay ${rdelay}us');
push(@ucmds,'fi');
push(@dcmds, 'tc qdisc del dev $ifb root');
}
}
foreach my $cmd (@ucmds) {
print UFILE "$cmd\n";
}
print UFILE "exit 0\n";
foreach my $cmd (@dcmds) {
print DFILE "$cmd\n";
}
print DFILE "exit 0\n";
close(UFILE);
chmod(0554, $uscript);
close(DFILE);
chmod(0554, $dscript);
return 0;
}
sub RunProxies($$)
{
my ($vnode_id,$vmid) = @_;
my (undef,$boss_ip) = tmccbossinfo();
if (!$boss_ip) {
$boss_ip = `cat $BOOTDIR/bossip`;
chomp($boss_ip);
}
if (!$boss_ip) {
warn("could not find bossip anywhere; aborting!");
return -1;
}
my ($host_ip,$host_mask,$vmac) = hostControlNet();
# Each container gets a tmcc proxy running on another port. If this
# changes, make sure to update iptables rules elsewhere in
# SetupPostCreateIptables and SetupPostBootIptables.
my $local_tmcd_port = $TMCD_PORT + $vmid;
# Start a tmcc proxy (handles both TCP and UDP)
my $tmccpid = fork();
if ($tmccpid) {
# Give child a chance to react.
sleep(1);
# Make sure it is alive.
if (waitpid($tmccpid, &WNOHANG) == $tmccpid) {
print STDERR "$vnode_id: tmcc proxy failed to start\n";
return -1;
}
if (open(FD, ">/var/run/tmccproxy-$vnode_id.pid")) {
print FD "$tmccpid\n";
close(FD);
}
}
else {
POSIX::setsid();
# XXX make sure we can kill the proxy when done
local $SIG{TERM} = 'DEFAULT';
exec("$BINDIR/tmcc.bin -d -t 15 -n $vnode_id ".
" -X $host_ip:$local_tmcd_port -s $boss_ip -p $TMCD_PORT ".
" -o $LOGDIR/tmccproxy.$vnode_id.log");
die("Failed to exec tmcc proxy");
}
return 0;
}
sub KillProxies($$)
{
my ($vnode_id,$vmid) = @_;
if (-e "/var/run/tmccproxy-$vnode_id.pid") {
open(FD,"/var/run/tmccproxy-$vnode_id.pid")
or return -1;
my $pid = <FD>;
close(FD);
chomp($pid);
mysystem2("/bin/kill $pid");
my $rc = $? >> 8;
if ($rc == 0) {
unlink("/var/run/tmccproxy-$vnode_id.pid");
}
return $rc;
}
return 0;
}
sub findControlNetVethInfo($$$$;$)
{
my ($vnode_id,$vip,$vmac,$hostdevref,$hifidxref) = @_;
my $ifidx;
my $dev;
open(FD,"ip netns exec $vnode_id ip -br link show |");
while (!eof(FD)) {
my $line = <FD>;
chomp($line);
if ($line =~ /^[^\@]+\@if(\d+).*$vmac.*$/) {
$ifidx = $1;
last;
}
}
close(FD);
if (!$ifidx) {
open(FD,"ip netns exec $vnode_id ip -br addr show |");
while (!eof(FD)) {
my $line = <FD>;
if ($line =~ /^[^\@]+\@if(\d+).*$vip.*$/) {
$ifidx = $1;
last;
}
}
close(FD);
}
if (!$ifidx) {
warn("could not find host control net iface ifidx for $vnode_id!");
return -1;
}
open(FD,"ip link show |");
while (!eof(FD)) {
my $line = <FD>;
if ($line =~ /^$ifidx: ([^\@]+)\@.*$/) {
$dev = $1;
last;
}
}
close(FD);
if (!$dev) {
warn("could not find host veth iface for $vnode_id!");
return -1;
}
if (defined($hostdevref)) {
$$hostdevref = $dev;
}
if (defined($hifidxref)) {
$$hifidxref = $ifidx;
}
return 0;
}
sub InsertPostBootIptablesRules($$$$)
{
my ($vnode_id,$vmid,$vnconfig,$private) = @_;
# Maybe allow routable control network.
my @rules = ();
my $IN_CHAIN = "IN_${vnode_id}";
my $OUT_CHAIN = "OUT_${vnode_id}";
if (length($IN_CHAIN) > 28) {
$IN_CHAIN = "I_${vnode_id}";
$OUT_CHAIN = "O_${vnode_id}";
}
my ($vnode_ip,$vnode_mask) =
($vnconfig->{config}{CTRLIP},$vnconfig->{config}{CTRLMASK});
my $vnode_mac = fixupMac(ipToMac($vnode_ip));
#
# Send packets from the veth into our chains, and vice versa.
#
if (!$USE_MACVLAN_CNET) {
my ($veth,$ifidx);
if (findControlNetVethInfo($vnode_id,$vnode_ip,$vnode_mac,
\$veth,\$ifidx)) {
warn("could not find control net veth; aborting!");
return -1;
}
push(@rules,
"-I EMULAB-ISOLATION -m physdev --physdev-is-bridged".
" --physdev-in $veth -s $vnode_ip -j $OUT_CHAIN");
push(@rules,
"-I EMULAB-ISOLATION -m physdev --physdev-is-bridged".
" --physdev-out $veth -j $IN_CHAIN");
#
# Another wrinkle. We have to think about packets coming from
# the container and addressed to the physical host. Send them
# through OUTGOING chain for filtering, rather than adding
# another chain. We make sure there are appropriate rules in
# the OUTGOING chain to protect the host.
#
# XXX: We cannot use the input interface or bridge options, cause
# if the vnode_ip is unroutable, the packet appears to come from
# eth0, according to iptables logging. WTF!
#
push(@rules,
"-A INPUT -s $vnode_ip -j $OUT_CHAIN");
push(@rules,
"-A OUTPUT -d $vnode_ip -j ACCEPT");
}
else {
#
# XXX: obviously using the vnode's mac address is suboptimal,
# but it's all we have if we can't label packets coming from a
# cgroup.
#
push(@rules,
"-A EMULAB-ISOLATION -s $vnode_ip".
" -m mac --mac-source $vnode_mac -j $OUT_CHAIN");
push(@rules,
"-A FORWARD -d $vnode_ip -j $IN_CHAIN");
#
# Another wrinkle. We have to think about packets coming from
# the container and addressed to the physical host. Send them
# through OUTGOING chain for filtering, rather than adding
# another chain. We make sure there are appropriate rules in
# the OUTGOING chain to protect the host.
#
push(@rules,
"-A INPUT -s $vnode_ip".
" -m mac --mac-source $vnode_mac -j $OUT_CHAIN");
}
# Save for easy deletion later.
my @deleterules = ();
foreach my $rule (@rules) {
if ($rule =~ /^(-[AIR]\s+)([A-Za-z][-A-Za-z0-9]*)\s+(.+)$/) {
push(@deleterules,"-D $2 $3");
}
elsif ($rule =~ /^(-[IR]\s+)([A-Za-z][-A-Za-z0-9]*)\s+\d+\s+(.+)$/) {
push(@deleterules,"-D $2 $3");
}
}
if ($debug) {
TBDebugTimeStamp("scheduling runtime iptables rules for later".
" deletion\n:".join("\n",@deleterules));
}
$private->{'postboot_iptables_rules'} = \@deleterules;
print Dumper($private);
# Install the iptables rules
TBDebugTimeStamp("InsertPostBootIptablesRules: installing iptables rules");
if (DoIPtables(@rules)) {
TBDebugTimeStamp(" failed to install runtime iptables rules");
return -1;
}
TBDebugTimeStamp(" installed runtime iptables rules");
return 0;
}
sub RemovePostBootIptablesRules($$$$)
{
my ($vnode_id,$vmid,$vnconfig,$private) = @_;
# We simply remove whatever we added in InsertPostBootIptablesRules.
if (exists($private->{'postboot_iptables_rules'})) {
my @rules = @{$private->{'postboot_iptables_rules'}};
# Uninstall the iptables rules
TBDebugTimeStamp("RemovePostBootIptablesRules: removing iptables rules");
if (DoIPtables(@rules)) {
TBDebugTimeStamp(" failed to remove runtime iptables rules");
return -1;
}
TBDebugTimeStamp(" removed runtime iptables rules");
delete($private->{'postboot_iptables_rules'});
}
return 0;
}
#
# Return total MB of memory on the host.
#
sub hostMemory()
{
my $memtotal = `grep MemTotal /proc/meminfo`;
if ($memtotal =~ /^MemTotal:\s*(\d+)\s(\w+)/) {
my $num = $1;
my $type = $2;
if ($type eq "kB") {
$num /= 1024;
}
$num = int($num);
return $num;
}
die("Could not find host total memory!");
}
#
# Return MB of memory and number of cores on the host.
#
sub hostResources()
{
my $cpus = `grep processor /proc/cpuinfo | wc -l`;
if ($cpus =~ /^(\d+)/) {
$cpus = $1;
}
else {
die("Could not find number of CPUs for host!");
}
return (hostMemory(),$cpus);
}
#
# Return non-zero if host has swapped to disk.
#
# XXX beware all ye callers! Note that this returns non-zero if host has
# *ever* swapped, not just if it has swapped as a result of recent activity.
# So once a node swaps that first time, for any reason, this will return
# non-zero til the next boot.
#
sub hostSwapping()
{
my ($total,$free) = (0,0);
my @lines = `grep Swap /proc/meminfo`;
chomp(@lines);
foreach my $line (@lines) {
if ($line =~ /^SwapTotal:\s*(\d+)\s(\w+)/) {
my $num = $1;
my $type = $2;
if ($type eq "kB") {
$num /= 1024;
}
$total = int($num);
next;
}
if ($line =~ /^SwapFree:\s*(\d+)\s(\w+)/) {
my $num = $1;
my $type = $2;
if ($type eq "kB") {
$num /= 1024;
}
$free = int($num);
next;
}
}
return ($free < $total) ? 1 : 0;
}
#
# Construct and return the jail control net IP of the physical host.
#
sub hostControlNet()
{
#
# XXX we use a woeful hack to get a virtual control net address that
# is unique. I will assume that the control network is never bigger
# than a /16 and so just combine the top of the jail network with the
# lower half of the control network address.
#
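# For example (illustrative arithmetic only): with a jail net gateway of
# 172.16.0.1 and a control net IP of 155.98.36.10, we get a=172, b=31 and
# the low half 36.10, i.e. IP 172.31.36.10 and MAC 02:00:ac:1f:24:0a.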
my (undef,$vmask,$vgw) = findVirtControlNet();
my (undef, $ctrlip, $ctrlmask) = findControlNet();
my ($a,$b);
if ($vgw =~ /^(\d+)\.(\d+)\.(\d+)\.(\d+)$/) {
$a = $1;
$b = 31;
my $tmp = ~inet_aton("255.255.0.0") & inet_aton($ctrlip);
my $ipbase = inet_ntoa($tmp);
if ($ipbase =~ /^(\d+)\.(\d+)\.(\d+)\.(\d+)$/) {
my ($c,$d) = ($3,$4);
my ($m1,$m2,$m3,$m4) = (sprintf("%02x",$a),sprintf("%02x",$b),
sprintf("%02x",$c),sprintf("%02x",$d));
print STDERR "debug: $ipbase\n";
return ("$a.$b.$3.$4", $vmask, "02:00:$m1:$m2:$m3:$m4");
}
}
die("hostControlNet: could not create control net virtual IP");
}
#
# If there is a capture running for the indicated vnode, return the pid.
# Otherwise return 0.
#
# Note: we do not use the pidfile here! This is all about sanity checking.
#
sub captureRunning($)
{
my ($vnode_id) = @_;
my $LOGPATH = "$VMDIR/$vnode_id";
my $rpid = `pgrep -f '^$CAPTURE .*-l $LOGPATH $vnode_id'`;
if ($? == 0) {
chomp($rpid);
if ($rpid =~ /^(\d+)$/) {
return $1;
}
}
return 0;
}
sub captureStart($$)
{
my ($vnode_id,$ptyfile) = @_;
my $LOGPATH = "$VMDIR/$vnode_id";
my $acl = "$LOGPATH/$vnode_id.acl";
my $logfile = "$LOGPATH/$vnode_id.log";
my $pidfile = "$LOGPATH/$vnode_id.pid";
# unlink ACL file so that we know when capture has started
unlink($acl)
if (-e $acl);
# remove old log file before start
unlink($logfile)
if (-e $logfile);
# and old pid file
unlink($pidfile)
if (-e $pidfile);
TBDebugTimeStamp("captureStart: starting capture on pty symlink $ptyfile");
# XXX see start of file for meaning of the options
mysystem2("$CAPTURE $CAPTUREOPTS -l $LOGPATH $vnode_id $ptyfile");
#
# We need to report the ACL info to capserver via tmcc. But do not
# hang, use timeout. Also need to wait for the acl file, since
# capture is running in the background.
#
if (! $?) {
for (my $i = 0; $i < 10; $i++) {
last
if (-e $acl && -s $acl);
print "waiting 1 sec for capture ACL file...\n" if ($sleepdebug);
sleep(1);
}
if (! (-e $acl && -s $acl)) {
print STDERR "WARNING: $acl does not exist after 10 seconds; ".
"capture may not have started correctly.\n";
}
else {
if (mysystem2("$BINDIR/tmcc.bin -n $vnode_id -t 5 ".
" -f $acl tiplineinfo")) {
print STDERR "WARNING: could not report tiplineinfo; ".
"remote console connections may not work.\n";
}
}
} else {
print STDERR "WARNING: capture not started!\n";
}
}
# convert 123456 into 12:34:56
sub fixupMac($)
{
my ($x) = @_;
$x =~ s/(\w\w)/$1:/g;
chop($x);
return $x;
}
#
# Create a thin pool that uses most of the VG space.
#
# This is tricky if there are multiple PVs and they are different sizes.
# We cannot create the pool larger than M * N where M is the number of
# disks and N is the free space on the smallest disk.
#
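# For example (illustrative arithmetic, and assuming $POOL_FRAC is 0.75):
# with two PVs that have 400g and 500g free, smallest = 400 and total = 900,
# so the pool is int(2 * 400 * 0.75) = 600g (under the 900 - 50 = 850g cap
# and above the 100g minimum), and we run:
#   lvcreate -i2 -L 600g --type thin-pool --thinpool $POOL_NAME $VGNAME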
sub createThinPool($)
{
my ($devs) = @_;
#
# Find the PV with the least available space
#
my $smallest;
my $num = 0;
my $tsize = 0;
foreach my $dsize (`pvs --noheadings -o pv_free $devs`) {
if ($dsize =~ /(\d+\.\d+)([mgt])/i) {
$dsize = $1;
my $u = lc($2);
if ($u eq "m") {
$dsize /= 1000;
} elsif ($u eq "t") {
$dsize *= 1000;
}
$tsize += $dsize;
if (!defined($smallest) || $dsize < $smallest) {
$smallest = $dsize;
}
} else {
print STDERR "createThinPool: could not parse PV size '$dsize'\n";
return -1;
}
$num++;
}
#
# Arbitrary conventions:
# - don't use more than 80% of the smallest device
# - leave at least 50g total for others
# - pool should be at least 100g
#
my $poolsize = int($num * ($smallest * $POOL_FRAC));
if ($poolsize > ($tsize - 50)) {
$poolsize = $tsize - 50;
}
if ($poolsize < 100) {
print STDERR "createThinPool: ${poolsize}g is not enough space ".
"for a reasonably sized thin pool\n";
return -1;
}
# Try to make it
if (mysystem2("lvcreate -i$num -L ${poolsize}g ".
"--type thin-pool --thinpool $POOL_NAME $VGNAME")) {
print STDERR "createThinPool: could not create ${poolsize}g ".
"thin pool\n";
return -1;
}
return 0;
}
#
# Return size of volume group in (decimal, aka disk-manufacturer) GB.
#
sub lvmVGSize($)
{
my ($vg) = @_;
my $size = `vgs --noheadings -o size $vg`;
if ($size =~ /(\d+\.\d+)([mgt])/i) {
$size = $1;
my $u = lc($2);
if ($u eq "m") {
$size /= 1000;
} elsif ($u eq "t") {
$size *= 1000;
}
return $size;
}
die "libvnode_docker: cannot parse LVM volume group size";
}
#
# Deal with IFBs. We add and remove them dynamically, per-VM. They are
# named like ifb<vmid>-<idx>. We don't bother naming them per lanlink or
# anything; we might run into the 15-character limit too easily anyway.
#
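# For example (illustrative), a vnode with vmid 7 and two shaped links gets
# "ifb7-0" and "ifb7-1", each created on demand with "ip link add ifb7-0
# type ifb", recorded in $IFBDB, and deleted again in ReleaseIFBs().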
my $IFB_LOCK = "ifblock";
sub AllocateIFBs($$$)
{
my ($vmid, $node_lds, $private) = @_;
my @ifbs = ();
TBDebugTimeStamp("AllocateIFBs: grabbing global lock $IFB_LOCK")
if ($lockdebug);
if (TBScriptLock($IFB_LOCK, TBSCRIPTLOCK_INTERRUPTIBLE(),
1800) != TBSCRIPTLOCK_OKAY()) {
print STDERR "Could not get the global lock after a long time!\n";
return -1;
}
TBDebugTimeStamp(" got global lock")
if ($lockdebug);
my %MDB;
if (!dbmopen(%MDB, $IFBDB, 0660)) {
print STDERR "*** Could not create $IFBDB\n";
TBScriptUnlock();
return undef;
}
#
# We need an IFB for every ld; allocate them if they don't exist.
# If they *do* exist, just steal them and warn; nothing else would
# be using them except a stale vnode, and those should be garbage
# collected before the new vnode of the same node comes into
# existence.
#
my $needed = scalar(@$node_lds);
for (my $i = 0; $i < $needed; ++$i) {
my $iname = "ifb$vmid-$i";
if (defined($MDB{"$iname"}) && $MDB{"$iname"} eq "$vmid") {
if (-e "/sys/class/net/$iname") {
warn("$iname device already exists for linkdelay $i; stealing!");
}
else {
warn("$iname DB entry already exists for linkdelay $i; stealing!");
}
}
if (! -e "/sys/class/net/$iname") {
mysystem("ip link add $iname type ifb");
}
$MDB{"$iname"} = $vmid;
# Record ifb in use
$private->{'ifbs'}->{$iname} = $i;
push(@ifbs, $iname);
}
dbmclose(%MDB);
TBDebugTimeStamp(" releasing global lock")
if ($lockdebug);
TBScriptUnlock();
return \@ifbs;
}
sub ReleaseIFBs($$)
{
my ($vmid, $private) = @_;
TBDebugTimeStamp("ReleaseIFBs: grabbing global lock $IFB_LOCK")
if ($lockdebug);
if (TBScriptLock($IFB_LOCK, 0, 1800) != TBSCRIPTLOCK_OKAY()) {
print STDERR "Could not get the global lock after a long time!\n";
return -1;
}
TBDebugTimeStamp(" got global lock")
if ($lockdebug);
my %MDB;
if (!dbmopen(%MDB, $IFBDB, 0660)) {
print STDERR "*** Could not create $IFBDB\n";
TBScriptUnlock();
return -1;
}
if (exists($private->{'ifbs'})) {
for my $iname (keys(%{$private->{'ifbs'}})) {
mysystem("ip link del $iname");
delete($MDB{$iname});
}
}
#
# Make sure we have released everything assigned to this vmid.
#
my @leftoverdbkeys = ();
for my $iname (keys(%MDB)) {
if ($MDB{$iname} eq "$vmid") {
push(@leftoverdbkeys,$iname);
}
}
for my $iname (@leftoverdbkeys) {
delete($MDB{$iname});
}
dbmclose(%MDB);
TBDebugTimeStamp(" releasing global lock")
if ($lockdebug);
TBScriptUnlock();
delete($private->{'ifbs'});
return 0;
}
#
# Run a function with vnodesetup/mkvnode signals blocked.
#
sub RunWithSignalsBlocked($@) {
my ($funcref,@args) = @_;
#
# Block signals that could kill us in the middle of some important
# operation. This ensure that if we have to tear down in the middle
# of setting up, the state is consistent.
#
my $new_sigset = POSIX::SigSet->new(SIGHUP, SIGINT, SIGUSR1, SIGUSR2);
my $old_sigset = POSIX::SigSet->new;
if (! defined(sigprocmask(SIG_BLOCK, $new_sigset, $old_sigset))) {
print STDERR "sigprocmask (BLOCK) failed!\n";
}
my $rc = $funcref->(@args);
if (! defined(sigprocmask(SIG_SETMASK, $old_sigset))) {
print STDERR "sigprocmask (UNBLOCK) failed!\n";
}
return $rc;
}
#
# Helper function to run a shell command wrapped by a lock.
#
sub RunWithLock($$)
{
my ($token, $command) = @_;
my $lockref;
if (TBScriptLock($token, undef, 900, \$lockref) != TBSCRIPTLOCK_OKAY()) {
print STDERR "Could not get $token lock after a long time!\n";
return -1;
}
mysystem2($command);
my $status = $?;
print "waiting 1 sec after RunWithLock...\n" if ($sleepdebug);
sleep(1);
TBScriptUnlock($lockref);
return $status;
}
sub checkForInterrupt()
{
my $sigset = POSIX::SigSet->new;
sigpending($sigset);
# XXX Why aren't SIGRTMIN and SIGRTMAX defined in the POSIX module?
for (my $i = 1; $i < 50; $i++) {
if ($sigset->ismember($i)) {
print "checkForInterrupt: Signal $i is pending\n";
return 1;
}
}
return 0;
}
#
# We need to control how many simultaneous creates happen at once.
#
my $createvnode_lockref;
sub CreateVnodeLock()
{
my $tries = 1000;
# Figure out how many vnodeCreates we can support at once
setConcurrency(0);
while ($tries) {
for (my $i = 0; $i < $MAXCONCURRENT; $i++) {
my $token = "createvnode_${i}";
TBDebugTimeStamp("grabbing vnode lock $token")
if ($lockdebug);
my $locked = TBScriptLock($token, TBSCRIPTLOCK_NONBLOCKING(),
0, \$createvnode_lockref);
if ($locked == TBSCRIPTLOCK_OKAY()) {
TBDebugTimeStamp(" got vnode lock")
if ($lockdebug);
return 0;
}
return -1
if ($locked == TBSCRIPTLOCK_FAILED());
}
print "Still trying to get the create lock at " . time() . "\n"
if (($tries % 60) == 0);
return -1
if (checkForInterrupt());
sleep(4);
return -1
if (checkForInterrupt());
$tries--;
}
TBDebugTimeStamp("Could not get the createvnode lock after a long time!");
return -1;
}
sub CreateVnodeUnlock()
{
TBDebugTimeStamp(" releasing vnode lock")
if ($lockdebug);
TBScriptUnlock($createvnode_lockref);
}
sub CreateVnodeLockAll()
{
my @locks;
my $lockref;
# Determine the maximum concurrency
setConcurrency(1);
for (my $i = 0; $i < $MAXCONCURRENT; $i++) {
my $token = "createvnode_${i}";
if (TBScriptLock($token, TBSCRIPTLOCK_NONBLOCKING(), 0, \$lockref) ==
TBSCRIPTLOCK_OKAY()) {
push(@locks, $lockref);
}
else {
# Release all.
foreach my $ref (@locks) {
TBScriptUnlock($ref);
}
return undef;
}
}
return \@locks;
}
sub CreateVnodeUnlockAll($)
{
my ($plocks) = @_;
my @locks = @$plocks;
# Release all.
foreach my $ref (@locks) {
TBScriptUnlock($ref);
}
}
1;