Commit 96794781 authored by David Johnson's avatar David Johnson

Clientside Docker vnode support.

See clientside/tmcc/linux/docker/README.md for design notes.
See clientside/tmcc/linux/docker/dockerfiles/README.md for a description
of how we automatically Emulabize existing Docker images.

Also, this mostly fits within the existing vnodesetup path, but I did modify
mkvnode.pl to allow the libvnode backend to provide a vnodePoll wait
loop instead of the builtin vnodeState loop.
parent 3fe30c60
...@@ -70,7 +70,7 @@ use librc; ...@@ -70,7 +70,7 @@ use librc;
# Not all clients support this. # Not all clients support this.
# #
exit(0) exit(0)
if (MFS() || REMOTE() || JAILED() || INXENVM() || PLAB()); if (MFS() || REMOTE() || JAILED() || INXENVM() || INDOCKERVM() || PLAB());
# Protos. # Protos.
sub doboot(); sub doboot();
......
...@@ -67,7 +67,7 @@ my $pathname = $HOSTSFILE; # Default path from liblocsetup. ...@@ -67,7 +67,7 @@ my $pathname = $HOSTSFILE; # Default path from liblocsetup.
# Not all clients support this. # Not all clients support this.
# #
exit(0) exit(0)
if (MFS() || (REMOTE() && !(REMOTEDED() || PLAB() || JAILED()))); if (MFS() || (REMOTE() && !(REMOTEDED() || PLAB() || JAILED())) || INDOCKERVM());
# Protos. # Protos.
sub doboot(); sub doboot();
......
#!/usr/bin/perl -wT #!/usr/bin/perl -wT
# #
# Copyright (c) 2008-2014 University of Utah and the Flux Group. # Copyright (c) 2008-2014, 2017 University of Utah and the Flux Group.
# #
# {{{EMULAB-LICENSE # {{{EMULAB-LICENSE
# #
...@@ -30,6 +30,7 @@ use Exporter; ...@@ -30,6 +30,7 @@ use Exporter;
VNODE_STATUS_INIT VNODE_STATUS_STOPPING VNODE_STATUS_UNKNOWN VNODE_STATUS_INIT VNODE_STATUS_STOPPING VNODE_STATUS_UNKNOWN
VNODE_STATUS_MOUNTED VNODE_STATUS_MOUNTED
VNODE_PATH VNODE_PATH
VNODE_POLL_ERROR VNODE_POLL_STOP VNODE_POLL_CONTINUE
findVirtControlNet findVirtControlNet
); );
...@@ -42,8 +43,16 @@ sub VNODE_STATUS_MOUNTED() { return "mounted"; } ...@@ -42,8 +43,16 @@ sub VNODE_STATUS_MOUNTED() { return "mounted"; }
sub VNODE_STATUS_BOOTING() { return "booting"; } sub VNODE_STATUS_BOOTING() { return "booting"; }
sub VNODE_STATUS_INIT() { return "init"; } sub VNODE_STATUS_INIT() { return "init"; }
sub VNODE_STATUS_STOPPING(){ return "stopping"; } sub VNODE_STATUS_STOPPING(){ return "stopping"; }
sub VNODE_STATUS_PAUSED(){ return "paused"; }
sub VNODE_STATUS_UNKNOWN() { return "unknown"; } sub VNODE_STATUS_UNKNOWN() { return "unknown"; }
#
# Valid constants that can be returned by vnodePoll.
#
# mkvnode.pl's BackendVnodePoll() loop compares the backend's return
# value against these numerically:
#   VNODE_POLL_ERROR    (-1) - the loop logs the status/event and stops.
#   VNODE_POLL_STOP     (1)  - the loop stops polling.
#   VNODE_POLL_CONTINUE (0)  - the loop calls vnodePoll again.
#
sub VNODE_POLL_ERROR() { return -1 }
sub VNODE_POLL_STOP() { return 1; }
sub VNODE_POLL_CONTINUE() { return 0; }
# VM path stuff # VM path stuff
my $VMPATH = "$VARDIR/vminfo"; my $VMPATH = "$VARDIR/vminfo";
sub VNODE_PATH(;$) { sub VNODE_PATH(;$) {
......
...@@ -62,7 +62,7 @@ use Exporter; ...@@ -62,7 +62,7 @@ use Exporter;
TMGATEDCONFIG TMSYNCSERVER TMKEYHASH TMNODEID TMNODEUUID TMEVENTKEY TMGATEDCONFIG TMSYNCSERVER TMKEYHASH TMNODEID TMNODEUUID TMEVENTKEY
TMCREATOR TMSWAPPER TMFWCONFIG TMGENVNODECONFIG TMCREATOR TMSWAPPER TMFWCONFIG TMGENVNODECONFIG
TMSTORAGEMAP TMDISKINFO TMEXTRAFS TMSTORAGEMAP TMDISKINFO TMEXTRAFS
INXENVM INVZVM INXENVM INVZVM INDOCKERVM
); );
# Must come after package declaration! # Must come after package declaration!
...@@ -562,6 +562,7 @@ sub setFSRVTYPE($) { ...@@ -562,6 +562,7 @@ sub setFSRVTYPE($) {
# #
sub INXENVM() { return ($ingenvnode && GENVNODETYPE() eq "xen"); } sub INXENVM() { return ($ingenvnode && GENVNODETYPE() eq "xen"); }
sub INVZVM() { return ($ingenvnode && GENVNODETYPE() eq "openvz"); } sub INVZVM() { return ($ingenvnode && GENVNODETYPE() eq "openvz"); }
# True when $ingenvnode is set and GENVNODETYPE() reports "docker".
sub INDOCKERVM(){ return ($ingenvnode && GENVNODETYPE() eq "docker"); }
# #
# Reset to a moderately clean state. # Reset to a moderately clean state.
......
...@@ -84,10 +84,14 @@ use libvnode; ...@@ -84,10 +84,14 @@ use libvnode;
# Helpers # Helpers
sub MyFatal($); sub MyFatal($);
sub hasLibOp($);
sub safeLibOp($$$;@); sub safeLibOp($$$;@);
sub CleanupVM(); sub CleanupVM();
sub TearDownStaleVM(); sub TearDownStaleVM();
sub StoreState(); sub StoreState();
sub ReadState();
sub BackendVnodePoll();
sub DefaultVnodePoll();
# Locals # Locals
my $CTRLIPFILE = "/var/emulab/boot/myip"; my $CTRLIPFILE = "/var/emulab/boot/myip";
...@@ -223,7 +227,7 @@ foreach my $type (@nodetypes) { ...@@ -223,7 +227,7 @@ foreach my $type (@nodetypes) {
} }
$libops{$type}{'init'}->(); $libops{$type}{'init'}->();
# need to do this for each type encountered. # need to do this for each type encountered.
TBDebugTimeStampWithDate("starting $type rootPreConfig()"); TBDebugTimeStampWithDate("starting $type rootPreConfig()");
$libops{$type}{'rootPreConfig'}->($BOSSIP); $libops{$type}{'rootPreConfig'}->($BOSSIP);
TBDebugTimeStampWithDate("finished $type rootPreConfig()"); TBDebugTimeStampWithDate("finished $type rootPreConfig()");
...@@ -758,7 +762,14 @@ if (defined(VNCONFIG('SSHDPORT')) && VNCONFIG('SSHDPORT') ne "" && ...@@ -758,7 +762,14 @@ if (defined(VNCONFIG('SSHDPORT')) && VNCONFIG('SSHDPORT') ne "" &&
# it running in its new context. Still, lets protect it with a timer # it running in its new context. Still, lets protect it with a timer
# since it might get hung up inside and we do not want to get stuck here. # since it might get hung up inside and we do not want to get stuck here.
# #
my $needschildmon;
if (!$ISXENVM) { if (!$ISXENVM) {
$needschildmon = 1;
}
else {
$needschildmon = 0;
}
if ($needschildmon) {
my $childpid = fork(); my $childpid = fork();
if ($childpid) { if ($childpid) {
my $timedout = 0; my $timedout = 0;
...@@ -791,12 +802,23 @@ if (!$ISXENVM) { ...@@ -791,12 +802,23 @@ if (!$ISXENVM) {
print STDERR "*** ERROR: vnodeBoot failed\n"; print STDERR "*** ERROR: vnodeBoot failed\n";
exit(1); exit(1);
} }
# NB: store the state, so that vnodeBoot too has writable $private!
if (StoreState()) {
MyFatal("Could not store container state to disk");
}
exit(0); exit(0);
} }
} }
elsif (safeLibOp('vnodeBoot', 1, 1)) { elsif (safeLibOp('vnodeBoot', 1, 1)) {
MyFatal("$vnodeid container startup failed."); MyFatal("$vnodeid container startup failed.");
} }
if ($needschildmon) {
# NB: before continuing, read the state stored in the child above
# after vnodeBoot!
if (ReadState()) {
MyFatal("Could not read container state from disk after vnodeBoot");
}
}
if (safeLibOp('vnodePostConfig', 1, 1)) { if (safeLibOp('vnodePostConfig', 1, 1)) {
MyFatal("vnodePostConfig failed"); MyFatal("vnodePostConfig failed");
} }
...@@ -818,61 +840,132 @@ if (StoreState()) { ...@@ -818,61 +840,132 @@ if (StoreState()) {
mysystem("touch $RUNNING_FILE"); mysystem("touch $RUNNING_FILE");
$running = 1; $running = 1;
#
# Hand control to the polling loop. A backend that supplies a vnodePoll
# op drives the loop itself via BackendVnodePoll(); every other backend
# gets the builtin vnodeState loop in DefaultVnodePoll(). Whichever loop
# runs, once it returns we exit with the result of CleanupVM().
#
{
    my $pollloop = hasLibOp("vnodePoll")
	? \&BackendVnodePoll : \&DefaultVnodePoll;
    $pollloop->();
}
exit(CleanupVM());
#
# Invoke the backend to poll the vnode for status changes that mkvnode
# should/must respond to. This means that honoring the
# vnodesetup/mkvnode semantics is now in the hands of the backend, if it
# wants. For instance, the backend can choose to allow this mkvnode
# monitor to continue waiting even if the vnode is stopped for long
# periods of time.
#
# (More recently, other backends (Docker) require that we catch VM state
# transitions more frequently than the default vnodeState loop allows.
# Note the special case in that loop: a 15-second recheck to see
# if a Xen VM was rebooted from the inside, and ends up restarting
# successfully. To handle these kinds of special cases, it's no problem
# to allow backends to control the loop; if we are interrupted via
# signal, and are supposed to be cleaning = 1 or whatever, we just don't
# call vnodePoll again (and just call vnodeState a final couple times),
# as in the original loop. As long as backends don't override our
# signal handlers, we're good to follow the original semantics of
# vnodesetup/mkvnode. We can also modify the semantics slightly,
# i.e. to allow the mkvnode monitor to hang around even if the vnode is
# down (like if the user manually invokes `docker stop`).)
#
#
# Call the backend's vnodePoll op repeatedly until it returns
# VNODE_POLL_STOP or VNODE_POLL_ERROR; any other value means "poll again".
#
# The backend receives the vnode id, vm id, config hash, its private
# state, and references to $status and $event. Here those two are only
# used in debug messages.
#
# There is no sleep between iterations.
# NOTE(review): this presumably relies on the backend's vnodePoll blocking
# until something happens -- confirm for any new backend.
#
# If vnodePoll dies, fatal() is called with the error. The normal loop
# exits return nothing useful; the caller runs CleanupVM() afterwards.
#
sub BackendVnodePoll()
{
    while (1) {
	my ($status,$event) = ('','');
	my $ret = eval {
	    $libops{$vmtype}{'vnodePoll'}->($vnodeid, $vmid,
					    \%vnconfig, $vnstate->{'private'},
					    \$status,\$event);
	};
	my $err = $@;
	if ($err) {
	    fatal("*** ERROR: vnodePoll: $err\n");
	    # Only reached if fatal() returns.
	    return (-1,$err);
	}

	#
	# A backend that returns nothing would trip "uninitialized value"
	# warnings in the numeric comparisons below. Keep the historical
	# behavior (neither STOP nor ERROR matched, so keep polling), but
	# say so and compare against a real number.
	#
	if (!defined($ret)) {
	    TBDebugTimeStamp("vnodePoll returned no value;".
			     " treating as continue");
	    $ret = libgenvnode::VNODE_POLL_CONTINUE();
	}

	if ($ret == libgenvnode::VNODE_POLL_STOP()) {
	    TBDebugTimeStamp("vnodePoll told us to stop polling; cleaning up!");
	    last;
	}
	elsif ($ret == libgenvnode::VNODE_POLL_ERROR()) {
	    # $err is always empty here (a die was handled above), so
	    # report the status/event the backend handed back instead.
	    TBDebugTimeStamp("vnodePoll returned an error; cleaning up!".
			     " status=$status, event=$event");
	    last;
	}
	else {
	    TBDebugTimeStamp("vnodePoll told us to continue polling;".
			     " status=$status, event=$event");
	}
    }
}
#
# The default polling implementation.
# #
# This loop is to catch when the container stops. We used to run a sleep # This loop is to catch when the container stops. We used to run a sleep
# inside and wait for it to exit, but that is not portable across the # inside and wait for it to exit, but that is not portable across the
# backends, and the return value did not indicate how it exited. So, lets # backends, and the return value did not indicate how it exited. So, lets
# just loop, asking for the status every few seconds. # just loop, asking for the status every few seconds.
# #
# XXX Turn off debugging during this loop to keep the log file from growing. sub DefaultVnodePoll()
# {
TBDebugTimeStampsOff() # XXX Turn off debugging during this loop to keep the log file from
if ($debug); # growing.
TBDebugTimeStampsOff()
if ($debug);
while (1) { while (1) {
sleep(5); sleep(5);
# #
# If the container exits, either it rebooted from the inside or # If the container exits, either it rebooted from the inside or
# the physical node is rebooting, or we are actively trying to kill # the physical node is rebooting, or we are actively trying to kill
# it cause our parent (vnodesetup) told us to. In all cases, we just # it cause our parent (vnodesetup) told us to. In all cases, we just
# exit and let the parent decide what to do. # exit and let the parent decide what to do.
# #
my ($ret,$err) = safeLibOp('vnodeState', 0, 0); my ($ret,$err) = safeLibOp('vnodeState', 0, 0);
if ($err) { if ($err) {
fatal("*** ERROR: vnodeState: $err\n"); fatal("*** ERROR: vnodeState: $err\n");
} }
if ($ret ne VNODE_STATUS_RUNNING()) { if ($ret ne VNODE_STATUS_RUNNING()) {
print "Container is no longer running.\n"; print "Container is no longer running.\n";
if (!$cleaning) { if (!$cleaning) {
# #
# Rebooted from inside, but not cause we told it to, so # Rebooted from inside, but not cause we told it to, so
# leave intact. # leave intact.
# #
# But before we fold, lets wait a moment and check again # But before we fold, lets wait a moment and check again
# since in XEN, the user can type reboot, which causes the # since in XEN, the user can type reboot, which causes the
# domain to disappear for a while. We do not want to be # domain to disappear for a while. We do not want to be
# fooled by that. Halt is another issue; if the user halts # fooled by that. Halt is another issue; if the user halts
# from inside the container it is never coming back and the # from inside the container it is never coming back and the
# user has screwed himself. Need to restart from the frontend. # user has screwed himself. Need to restart from the frontend.
# #
sleep(15); sleep(15);
($ret,$err) = safeLibOp('vnodeState', 0, 0); ($ret,$err) = safeLibOp('vnodeState', 0, 0);
if ($err) { if ($err) {
fatal("*** ERROR: vnodeState: $err\n"); fatal("*** ERROR: vnodeState: $err\n");
} }
if ($ret eq VNODE_STATUS_RUNNING()) { if ($ret eq VNODE_STATUS_RUNNING()) {
print "Container has restarted itself.\n"; print "Container has restarted itself.\n";
next; next;
}
$leaveme = $LEAVEME_REBOOT;
} }
$leaveme = $LEAVEME_REBOOT; last;
} }
last;
} }
TBDebugTimeStampsOn()
if ($debug);
} }
TBDebugTimeStampsOn()
if ($debug);
exit(CleanupVM());
# #
# Teardown a container. This should not be used if the mkvnode process # Teardown a container. This should not be used if the mkvnode process
...@@ -1062,6 +1155,15 @@ sub MyFatal($) ...@@ -1062,6 +1155,15 @@ sub MyFatal($)
# #
# Helpers: # Helpers:
# #
#
# Does the current backend ($vmtype) provide the named libvnode op?
# Returns 1 if $libops{$vmtype}{$op} holds a defined value, else 0.
#
sub hasLibOp($) {
    my ($opname) = @_;

    # A defined slot necessarily exists, so one defined() test covers both.
    return (defined($libops{$vmtype}{$opname}) ? 1 : 0);
}
sub safeLibOp($$$;@) { sub safeLibOp($$$;@) {
my ($op,$autolog,$autoerr,@args) = @_; my ($op,$autolog,$autoerr,@args) = @_;
...@@ -1128,3 +1230,18 @@ sub StoreState() ...@@ -1128,3 +1230,18 @@ sub StoreState()
} }
return 0; return 0;
} }
#
# Reload $vnstate from "$VNDIR/vnode.state" using Storable::retrieve.
#
# Returns 0 on success and -1 on failure. On failure $vnstate is left
# untouched. Storable reports serious problems by dying and ordinary
# I/O errors by returning undef, so both are checked; otherwise an undef
# result would silently wipe the in-memory state and be reported as
# success.
#
sub ReadState()
{
    # Read the state from disk.
    print "Reading state from disk ...\n"
	if ($debug);

    my $statefile = "$VNDIR/vnode.state";
    my $newstate  = eval { Storable::retrieve($statefile); };
    if ($@) {
	print STDERR "$@";
	return -1;
    }
    if (!defined($newstate)) {
	print STDERR "Could not retrieve state from $statefile\n";
	return -1;
    }

    # Only replace the live state once we know we have a good copy.
    $vnstate = $newstate;
    return 0;
}
...@@ -336,7 +336,7 @@ sub doboot() ...@@ -336,7 +336,7 @@ sub doboot()
# #
# This stuff is run regardless of reservation status. # This stuff is run regardless of reservation status.
# #
if (-x "$RCDIR/rc.ipod" && ! WINDOWS()) { if (-x "$RCDIR/rc.ipod" && ! WINDOWS() && ! INDOCKERVM()) {
print("Setting up Ping of Death\n"); print("Setting up Ping of Death\n");
# This is allowed to fail by default; ipod might not be supported. # This is allowed to fail by default; ipod might not be supported.
if (!exists($manifest{'rc.ipod'}) if (!exists($manifest{'rc.ipod'})
......
...@@ -461,3 +461,24 @@ openvz-guest-pack: $(OPENVZGUEST_TEMPLATE) ...@@ -461,3 +461,24 @@ openvz-guest-pack: $(OPENVZGUEST_TEMPLATE)
@if [ -e "$(OPENVZGUEST)" ]; then \ @if [ -e "$(OPENVZGUEST)" ]; then \
cp -fp $(OPENVZGUEST_TEMPLATE) $(OPENVZGUEST); \ cp -fp $(OPENVZGUEST_TEMPLATE) $(OPENVZGUEST); \
fi fi
#
# Install the Docker vnode support: the common vnode scripts and
# libvnode modules go into $(BINDIR), "docker" is written to
# $(ETCDIR)/genvmtype, and the docker scripts/container-utils/dockerfiles
# trees are mirrored under $(ETCDIR)/docker. rsync runs with --delete, so
# files removed from the source tree are removed from the installed copy.
#
docker-install: dir-install
	$(INSTALL) -m 755 $(SRCDIR)/../common/vnodesetup $(BINDIR)/
	$(INSTALL) -m 755 $(SRCDIR)/../common/mkvnode.pl $(BINDIR)/
	$(INSTALL) -m 755 $(SRCDIR)/../common/libutil.pm $(BINDIR)/
	$(INSTALL) -m 755 $(SRCDIR)/../common/bootvnodes $(BINDIR)/
	$(INSTALL) -m 755 $(SRCDIR)/libvnode.pm $(BINDIR)/
	$(INSTALL) -m 755 $(SRCDIR)/docker/libvnode_docker.pm $(BINDIR)/
	$(INSTALL) -m 755 $(SRCDIR)/vnodectl $(BINDIR)/
	echo "docker" > $(ETCDIR)/genvmtype
	$(INSTALL) -m 755 -o root -g $(DIRGROUP) -d $(ETCDIR)/docker
	$(INSTALL) -m 755 -o root -g $(DIRGROUP) -d $(ETCDIR)/docker/scripts
	$(INSTALL) -m 755 -o root -g $(DIRGROUP) -d $(ETCDIR)/docker/container-utils
	$(INSTALL) -m 755 -o root -g $(DIRGROUP) -d $(ETCDIR)/docker/dockerfiles
	rsync -a --delete $(SRCDIR)/docker/scripts/ $(ETCDIR)/docker/scripts/
	rsync -a --delete $(SRCDIR)/docker/container-utils/ $(ETCDIR)/docker/container-utils/
	rsync -a --delete $(SRCDIR)/docker/dockerfiles/ $(ETCDIR)/docker/dockerfiles/
# $(INSTALL) -m 755 $(SRCDIR)/docker/analyze.sh $(ETCDIR)/docker/
# $(INSTALL) -m 755 $(SRCDIR)/docker/analyze-image.sh $(ETCDIR)/docker/
docker-guest-install:
This is a detailed discussion and list of design decisions made during
the addition of Docker as a supported Emulab vnode type.
Overview
--------
Emulab client-side Docker vnode support relies on the libvnode
mechanism. Thus, the vast majority of the code is in libvnode_docker.pm .
In Emulab, vnodes are launched on vhost boot, rc.bootsetup runs
bootvnodes which runs a vnodesetup process for each vnode. On Linux,
for Emulab vnode types that are built using the libvnode abstraction,
vnodesetup calls mkvnode.pl, which invokes all the libvnode library
calls to build, boot, shutdown, and destroy the vnode, as necessary
during its lifecycle. mkvnode.pl acts as a monitor. See further detail
in the "Container Lifecycle" section below.
Container Lifecycle
-------------------
What happens when user types reboot or shutdown? Those don't tend to
work in Docker containers :). We're not going to force them to work in
this first version; it would require some inside/outside collaboration.
The main helpful thing we can do is allow the user to interact with
Docker CLI tools in the normal way (at least on dedicated vhosts) --
i.e., to type `docker restart pcvmXXX-Y`, and have all the Emulab stuff
that was setup in mkvnode.pl and libvnode_docker.pm continue to work.
This is harder than it seems, since Docker provides no access to the
default container runtime's (runc) hook support! What a bad decision;
it causes us tremendous pain.
The way vnodes are launched in Emulab has traditionally been that on
vhost boot, rc.bootsetup runs bootvnodes which runs a vnodesetup process
for each vnode. On Linux, for Emulab vnode types that are built using
the libvnode abstraction, vnodesetup calls mkvnode.pl, which invokes all
the libvnode library calls to build, boot, shutdown, and destroy the
vnode, as necessary during its lifecycle.
Typically, mkvnode.pl acts as a "monitor"/watchdog process. Emulab
control software will signal it with USR1/2 to tell it to shutdown or
restart; any other signal to mkvnode.pl (like INT) will result in the
vnode being halted and destroyed. Usually, the expectation of
mkvnode.pl is that if the vnode dies unilaterally (i.e. is not signaled
nor rebooted), mkvnode.pl should die as well (i.e. the
libvnode::vnodeState call that is invoked by mkvnode.pl every ~5 seconds
will eventually return VNODE_HALTED() or whatever, and mkvnode.pl will
return).
However, to support the Docker libvnode backend, we instead introduced
libvnode::vnodePoll(), which (if it exists) mkvnode.pl will call in a
loop until it is told specifically to stop polling. The Docker
implementation of libvnode::vnodePoll() subscribes to the JSON event
stream for the container, and is thus able to catch events like
uncommanded container death; docker-CLI-commanded restart or stop; and
it continues to run forever, never returning to mkvnode.pl unless there
is an error. This is necessary for Emulab (libvnode_docker.pm) to hook
the shutdown and startup of a container, so we can do extra
configuration of it. When we see a "die", we undo our boot hooks.
When we see a "restart", we redo our boot hooks.
In the second version of this, we will further inform this event-based,
asynchronous mechanism with an inside-outside collaboration. If the
initscripts inside see that this container is being launched by Emulab,
they will wait until the container has been "hooked" before proceeding
to boot; and work similarly on shutdown.
We also ensure that neither dockerd nor the vhost distro autostart
containers on boot; we want to start them, so we can re-create all the
associated goo, like network devices and tc qdiscs etc; and monitor
their execution via the docker event stream. We could relax this later
by adding a central daemon that listens for all container events, and
invokes mkvnode.pl for the corresponding operation, if it's not already
running. Needs more thought; mkvnode.pl assumes it is in charge right now.
Container Boot
--------------
For now, we do not attempt to emulate the built-in image entrypoint or
command; we simply override them (on the belief that most uses will
choose generic Linux distro images whose default command is usually
/bin/bash or similar).
[TODO] Later, we will support command emulation, so that images that
really want to launch only httpd or whatever will still work.
Deployment/Orchestration
------------------------
We don't support 'docker compose' or the multi-host overlays, etc. We
are the orchestrator! If a user wants to do a docker compose or swarm or
whatever, they can do that on the vnode host; they just better not use
our control net. They would have to intend to do that; the default
docker bridge is a private network.
Emulabization
-------------
We do support booting unmodified, external Docker images from the main
registry in containers. There are several drawbacks to this, however.
Emulab cannot manage their lifecycle nor enable any features for these
containers, other than what can be done from the outside (i.e., network,
traffic shaping, firewalling, NFS mounts); other features must be done
from the inside (event sys, program agents, linktest, user
accounts/keys, startup commands). Well, startup commands we *could* do
from the outside, but what's the point? The image already has a builtin
command, or the user would provide one. They already don't want to use
our mechanisms for content deployment; they just want to use our
container and network deployment mechanism.
(See also the "Init Strategy" discussion; it is part of the motivation
for Emulabizing images.)
So -- we also support (and hope that people will primarily use)
automatic Emulabization. The user can bring their own image (based on
Ubuntu, Centos (and later Fedora and Debian)), and we will automatically
Emulabize it to the level they choose. We support several different
levels:
none: we do not alter the image at all!
basic: install only openssh, rsyslog, runit, and several runit initscripts
core: basic + install a custom-build of the clientside, using a
temporary buildenv of the image, but only installing the DESTDIR
clientside binaries/fs stuff; also install a bunch of packages the
clientside stuff needs.
buildenv: basic + install all build tools for clientside, and
install the clientside, plus the runtime binaries the clientside needs.
full: buildenv + packages to make the image closer to a normal Emulab
disk image.
This gives the user maximum flexibility -- do they want a small image
that's not messed with at all (or is minimally messed with, as in the
basic case); or do they want something more and more featureful that
supports the core set of Emulab features. Size matters, right? We
invest significant time to ensure that whatever we do add is added in as
few layers as possible.