Commit 0910c65c authored by Leigh B. Stoller's avatar Leigh B. Stoller
Browse files

First cut at porting our jail setup to linux vservers. Most of the

changes are on the client side where I took mkjail and retargeted it
to vservers (called it mkvserver.pl, clever eh?) in the linux
directory. The real time sink was understanding how vservers work, how
they boot, how they die, how they handle signals, etc. Very interesting
and very bizarre. Anyway, this first cut is done with the version 2.2
vserver code which does not virtualize the network stack or even the
loopback device, so I pretty much ignored the experimental network and
the host routine stuff. So, in your NS file you can now do this:

	set ns [new Simulator]
	set v0 [$ns node]
	set v1 [$ns node]

	tb-set-hardware $v0 pcvm
	tb-set-hardware $v1 pcvm
	tb-set-node-os $v0 FC-VSERVER
	tb-set-node-os $v1 FC-VSERVER

As you can see, I am using the osid to indicate jails vs
vservers. There are some small changes in assign_wrapper that use the
nextosid of the osid to map to the actual osid to install on the
hosting node. If you try to collocate a jail and a vserver assign will
refuse, cause we use features and desires for the osids. Sweet.

Oh, the ssh button in the web interface does not work yet cause the page
assumes that local virtnodes can bind to port 22 in each vserver, but
that will not work yet.
parent 9deae976
......@@ -395,7 +395,6 @@ sub nodetypeistype($) { return exists($node_types{$_[0]}); }
sub nodetypetype($) { return $node_types{$_[0]}->type(); }
sub nodetypeclass($) { return $node_types{$_[0]}->class(); }
sub nodedelayosid($) { return $node_types{$_[0]}->delay_osid(); }
sub nodejailosid($) { return $node_types{$_[0]}->jail_osid(); }
sub nodedefaultosid($) { return $node_types{$_[0]}->default_osid(); }
sub nodetypeisremote($) { return $node_types{$_[0]}->isremotenode(); }
sub nodetypeisvirt($) { return $node_types{$_[0]}->isvirtnode(); }
......@@ -411,6 +410,7 @@ sub nodetypesimcap($) { return $node_types{$_[0]}->simnode_capacity(); }
my %osids = ();
sub osidpath($) { return $osids{$_[0]}->{"path"}; }
sub osidos($) { return $osids{$_[0]}->{"OS"}; }
sub osidhaspath($) { my $path = osidpath($_[0]);
return (defined $path) && ($path ne "")};
sub osidnextosid($) {
......@@ -2985,7 +2985,7 @@ sub InitPnode($$)
$osid = $sim_osid;
}
else {
$osid = nodejailosid(physnodetype($pnode));
$osid = nodejailosid($pnode);
}
my $cmdline = TBGetOSBootCmd(osidnextosid($osid),
......@@ -3048,7 +3048,7 @@ sub InitPnode($$)
$osid = $jail_osid;
}
else {
$osid = nodejailosid(physnodetype($pnode));
$osid = nodejailosid($pnode);
}
$expt_stats{"jailnodes"} += 1;
#
......@@ -3058,7 +3058,9 @@ sub InitPnode($$)
$role = TBDB_RSRVROLE_VIRTHOST;
# XXX Must have routing on jail hosting nodes. Change me.
$routertype = TBDB_ROUTERTYPE_MANUAL;
$cmdline = "/kernel.jail";
if (osidos($osid) eq "FreeBSD") {
$cmdline = "/kernel.jail";
}
$cmdline_role = "vnodehost";
}
fatal("No OSID is defined for internal node $vname!")
......@@ -3878,8 +3880,9 @@ sub LoadPhysInfo()
# Get paths from os_info, so that we can identify OSKit/MFS OSes, basically
# those which do not load a disk image
#
$query_result = DBQueryFatal("select osid, path, nextosid from os_info");
while (my ($osid, $path, $nextosid) = $query_result->fetchrow()) {
$query_result =
DBQueryFatal("select osid,path,nextosid,OS from os_info");
while (my ($osid, $path, $nextosid, $OS) = $query_result->fetchrow()) {
$osids{$osid} = {};
if ($path) {
$osids{$osid}->{"path"} = $path;
......@@ -3887,6 +3890,7 @@ sub LoadPhysInfo()
if (defined($nextosid)) {
$osids{$osid}->{"nextosid"} = TBResolveNextOSID($osid,$pid,$eid);
}
$osids{$osid}->{"OS"} = $OS;
}
}
sub interfacespeedmbps($$) {
......@@ -5661,3 +5665,26 @@ sub LoadCurrent()
printdb "Old Reserved Nodes: " . join(" ", keys %oldreservednodes) . "\n";
}
#
# This is special. Look at the osid of the virtnodes on this pnode and
# map to a suitable osid using the nextosid field. This overloads nextosid
# to some extent ...
#
sub nodejailosid($)
{
    my $pnode = shift;

    #
    # Assign (via osid features/desires) has already guaranteed that every
    # vnode placed on this pnode asked for the same osid, so the first
    # vnode in the list stands in for all of them.
    #
    my ($first_vnode) = @{$virtnodes{$pnode}};
    my $vnode_osid    = virtnodeosid($first_vnode);

    # The osid for the hosting node is the nextosid of the vnode's osid.
    my $host_osid     = osidnextosid($vnode_osid);

    printdb "Mapping jail osid to $vnode_osid ($host_osid) on $pnode\n";
    return $host_osid;
}
......@@ -87,6 +87,11 @@ if ($UID != 0) {
" Must be root to run this script!\n");
}
# We need to know this below.
my $sysname = `uname -s`;
chomp($sysname);
my $islinux = ($sysname eq "Linux");
#
# Put this into the background and log its output. We *must* do this cause
# we do not want to halt the boot if the testbed is down!
......@@ -233,7 +238,7 @@ if ($fakejails) {
# This will fail if it already exists. Keep going on any failure though.
#
if (!REMOTE()) {
system("mkextrafs.pl $vndir");
system("mkextrafs.pl " . ($islinux ? "/vservers" : $vndir));
}
#
......@@ -243,23 +248,27 @@ if (! -e "/local") {
system("ln -s " . LOCALROOTFS() . " /local");
}
#
# Make sure enough vn devices exist
#
for (my $i = 0;
$i < scalar(keys(%newvnodelist)) + scalar(keys(%curvnodelist)); $i++) {
my $dev = "vn${i}";
if (! -e "/dev/${dev}c") {
system("(cd /dev; ./MAKEDEV $dev)");
if (!$islinux) {
#
# Make sure enough vn devices exist
#
for (my $i = 0;
$i < scalar(keys(%newvnodelist)) + scalar(keys(%curvnodelist));
$i++) {
my $dev = "vn${i}";
if (! -e "/dev/${dev}c") {
system("(cd /dev; ./MAKEDEV $dev)");
}
}
}
#
# XXX tweak IP interrupt queue size to accomodate up to 8 x 100Mb trivial links
# XXX turn on local hack to retry on NFS EACCES errors (thanks mountd!)
#
system("sysctl net.inet.ip.intr_queue_maxlen=128 >/dev/null 2>&1");
system("sysctl vfs.nfs.eacces_retry_enable=1 >/dev/null 2>&1");
#
# XXX tweak IP interrupt queue size to accomodate up to 8 x 100Mb
# trivial links XXX turn on local hack to retry on NFS EACCES
# errors (thanks mountd!)
#
system("sysctl net.inet.ip.intr_queue_maxlen=128 >/dev/null 2>&1");
system("sysctl vfs.nfs.eacces_retry_enable=1 >/dev/null 2>&1");
}
#
# XXX grossed out yet? Try this one: the mount command will HUP mountd
......@@ -278,11 +287,13 @@ foreach my $vnode (sort(keys(%newvnodelist))) {
bootvnode($vnode, "boot", $newvnodelist{$vnode});
}
my $PIDFILE = "/var/run/progagent.pid";
if (!$islinux) {
my $PIDFILE = "/var/run/progagent.pid";
foreach my $vnode (keys(%newvnodelist)) {
system("rtprio 15 -`cat $vndir/$vnode/root/$PIDFILE`")
if (-e "$vndir/$vnode/root/$PIDFILE");
foreach my $vnode (keys(%newvnodelist)) {
system("rtprio 15 -`cat $vndir/$vnode/root/$PIDFILE`")
if (-e "$vndir/$vnode/root/$PIDFILE");
}
}
exit(0);
......@@ -794,8 +794,16 @@ sub removeconfdir($)
sub hackwaitandexit()
{
my $now = time();
my $goofy = CONFDIR() . "/root/var/run/emulab-watchdog.pid";
my $goofy;
my $count = 60;
# The first case is for our own (non-plab) vservers.
if (-e "/vservers") {
$goofy = "/vservers/$vnodeid/var/run/emulab-watchdog.pid";
}
else {
$goofy = CONFDIR() . "/root/var/run/emulab-watchdog.pid";
}
while ($count--) {
sleep(1);
......
#
# EMULAB-COPYRIGHT
# Copyright (c) 2000-2007 University of Utah and the Flux Group.
# Copyright (c) 2000-2008 University of Utah and the Flux Group.
# All rights reserved.
#
......@@ -32,6 +32,7 @@ SYSETCDIR ?= $(DESTDIR)/etc
ETCDIR ?= $(DESTDIR)$(CLIENT_ETCDIR)
BINDIR ?= $(DESTDIR)$(CLIENT_BINDIR)
VARDIR ?= $(DESTDIR)$(CLIENT_VARDIR)
VSDIR ?= $(BINDIR)/vserver
RCDIR ?= $(SYSETCDIR)/rc.d
INSTALL ?= /usr/bin/install -c
COMMON ?= $(SRCDIR)/../common
......@@ -41,7 +42,7 @@ DEFRUNLVLDIR ?= $(RCDIR)/rc3.d
RRCDIR ?= /etc/rc.d
install client-install: common-install etc-install \
sup-install script-install bin-install
sup-install script-install bin-install vserver-install
@echo "Remember to install the PEM files if necessary"
simple-install: common-install script-install bin-install
......@@ -154,3 +155,9 @@ sfs-install:
# create ifcfg-eth? files
ifcfgs: $(SRCDIR)/mkifcfgs $(SRCDIR)/ifcfg.template
$(SRCDIR)/mkifcfgs $(SRCDIR)/ifcfg.template
vserver-install: dir-install
$(INSTALL) -m 755 -o root -g root -d $(VSDIR)
$(INSTALL) -m 755 $(SRCDIR)/rc.invserver $(VSDIR)/rc.invserver
$(INSTALL) -m 755 $(SRCDIR)/mkvserver.pl $(BINDIR)/mkvserver.pl
-ln -sf $(BINDIR)/mkvserver.pl $(BINDIR)/mkjail.pl
#!/usr/bin/perl -w
#
# EMULAB-COPYRIGHT
# Copyright (c) 2008 University of Utah and the Flux Group.
# All rights reserved.
#
# Kernel, jail, netstat, route, ifconfig, ipfw, header files.
#
use strict;
use English;
use Getopt::Std;
use Fcntl;
use IO::Handle;
use Socket;
use Fcntl ':flock';
# Drag in path stuff so we can find emulab stuff. Also untaints path.
BEGIN { require "/etc/emulab/paths.pm"; import emulabpaths; }
use libsetup qw(REMOTE LOCALROOTFS TMTOPOMAP TMLTMAP TMLTPMAP
TBDebugTimeStamp);
use libtmcc;
#
# Questions:
#
#
# Create a jailed environment. There are some stub files stored in
# /etc/jail that are copied into the jail.
#
#
# Print the command line synopsis and exit with an error status.
#
sub usage()
{
    my $synopsis = "Usage: mkvserver.pl [-V] [-s] [-i <ipaddr>] [-p <pid>] " .
                   "[-h <hostname>] <vnodeid>\n";

    print($synopsis);
    exit(-1);
}
my $optlist = "Vi:p:e:sh:";
#
# Only real root can run this script.
#
if ($UID) {
die("Must be root to run this script!\n");
}
#
# Catch ^C and exit with error.
#
# Set when we are taken down by USR1; its consumer is not in this part of
# the file.
my $leaveme = 0;

#
# Signal handler: ignore any further signals, note a USR1, and bail out
# through fatal().
#
sub handler ($) {
    my $signame = shift;

    # Do not let another signal interrupt us while we are going down.
    $SIG{$_} = 'IGNORE'
        foreach ('INT', 'USR1', 'TERM', 'HUP');

    $leaveme = 1
        if ($signame eq 'USR1');

    fatal("Caught a SIG${signame}! Killing the vserver ...");
}

# INT, USR1 and HUP all go through the handler; TERM is simply ignored.
$SIG{$_} = \&handler
    foreach ('INT', 'USR1', 'HUP');
$SIG{TERM} = 'IGNORE';
#
# Turn on autoflush so output is written immediately instead of buffered
#
STDOUT->autoflush(1);
STDERR->autoflush(1);
# XXX
my $JAILCNET = "172.16.0.0";
my $JAILCNETMASK = "255.240.0.0";
#
# Locals
#
my $JAILPATH = "/var/emulab/jails";
my $ETCVSERVER = "/usr/local/etc/emulab/vserver";
my $VSERVER = "/usr/sbin/vserver";
my $VSERVERDIR = "/vservers";
my $JAILCONFIG = "jailconfig";
my @ROOTCPDIRS = ("etc", "root");
my @ROOTMKDIRS = ("dev", "tmp", "var", "usr", "proc", "users", "lib",
"bin", "sbin", "home");
my @ROOTMNTDIRS = ("bin", "sbin", "usr", "lib");
my @EMUVARDIRS = ("logs", "db", "jails", "boot", "lock");
my $IP;
my $IPMASK;
my $PID;
my $VDIR;
my $idnumber;
my $jailhostname;
my $jailpid;
my $debug = 1;
my @mntpoints = ();
my $USEVCNETROUTES = 0;
my @controlroutes = ();
my $interactive = 0;
my $cleaning = 0;
# This stuff is passed from tmcd, which we parse into a config string
# and an option set.
my %jailconfig = ();
my $sshdport = 50000; # Bogus default, good for testing.
my $jailflags = 0;
my @jailips = (); # List of jail IPs (for routing table).
#
# Protos
#
sub mkvserver($);
sub upvserver($);
sub LoopMount($$);
sub PreparePhysNode();
sub fatal($);
sub getjailconfig($);
sub setjailoptions();
sub mysystem($);
sub cleanup();
#
# Parse command arguments. Once we return from getopts, all that should be
# left are the required arguments.
#
my %options = ();

# Bad switches, or anything other than exactly one vnodeid, is a usage error.
usage()
    if (! getopts($optlist, \%options));
usage()
    if (@ARGV != 1);

my $vnodeid = $ARGV[0];

#
# Untaint the arguments. The vnodeid also serves as the default jail
# hostname.
#
die("Tainted argument $vnodeid!\n")
    unless ($vnodeid =~ /^([-\w\/\.]+)$/);
$vnodeid = $jailhostname = $1;

$interactive = 1
    if (defined($options{'s'}));
TBDebugTimeStamp("mkjail starting to do real work");
#
# Get the parent IP.
#
my $hostname = `hostname`;
my $hostip;

# Untaint and strip newline.
if ($hostname =~ /^([-\w\.]+)$/) {
    $hostname = $1;

    #
    # gethostbyname() returns an empty list when the name does not resolve,
    # and inet_ntoa() croaks when handed an undefined address. Only convert
    # when an address actually came back; otherwise leave $hostip undefined
    # and say so, rather than dying with an unrelated looking error.
    #
    my (undef,undef,undef,undef,@ipaddrs) = gethostbyname($hostname);
    if (@ipaddrs) {
        $hostip = inet_ntoa($ipaddrs[0]);
    }
    else {
        warn("Could not resolve an IP address for $hostname!\n");
    }
}
#
# If no IP, then it defaults to our hostname's IP, *if* none is provided
# by the jail configuration file. That check is later.
#
# -i: IP address for the jail; must be a dotted quad.
if (defined($options{'i'})) {
    $IP = $options{'i'};
    die("Tainted argument $IP!\n")
        unless ($IP =~ /^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})$/);
    $IP = $1;
}

# -p: only word characters, '-' and '@' are accepted.
if (defined($options{'p'})) {
    $PID = $options{'p'};
    die("Tainted argument $PID.")
        unless ($PID =~ /^([-\@\w]+)$/);
    $PID = $1;
}

# -h: overrides the jail hostname that defaulted to the vnodeid.
if (defined($options{'h'})) {
    $jailhostname = $options{'h'};
    die("Tainted argument $jailhostname.")
        unless ($jailhostname =~ /^([-\w\.]+)$/);
    $jailhostname = $1;
}
#
# In most cases, the $vnodeid directory will have been created by the caller,
# and a config file possibly dropped in.
# When debugging, we have to create it here.
#
chdir($JAILPATH)
    or die("Could not chdir to $JAILPATH: $!\n");

if (-e $vnodeid) {
    # The directory is already there, so look for a config inside it.
    TBDebugTimeStamp("mkjail getting jail config");
    getjailconfig("$JAILPATH/$vnodeid");
}
else {
    # Nothing there yet: create the directory; there is no config to read.
    mkdir($vnodeid, 0770)
        or fatal("Could not mkdir $vnodeid in $JAILPATH: $!");
}
# Name of the physical control network interface, as reported by the
# control_interface program.
chomp(my $phys_cnet_if = `control_interface`);

#
# See if special options supported, and if so setup args as directed.
#
TBDebugTimeStamp("mkjail setting jail options");
setjailoptions();

# Do some prep stuff on the physical node if this is the first vserver.
PreparePhysNode();

if ($debug) {
    print("Setting up jail for $vnodeid using $IP\n");
}

# The vserver's filesystem lives under the common vservers directory.
$VDIR = "$VSERVERDIR/$vnodeid";

#
# Create the vserver, or pick up where we left off if its directory is
# already there.
#
if (! -e $VDIR) {
    #
    # Create the root filesystem.
    #
    TBDebugTimeStamp("Creating vserver");
    mkvserver($vnodeid);
}
else {
    #
    # Try to pick up where we left off.
    #
    TBDebugTimeStamp("mkjail restoring root fs");
    upvserver($vnodeid);
}
TBDebugTimeStamp("mkjail done with root fs");
#
# Start the vserver. If all goes well, this will exit cleanly, with the
# vserver running in its new context. Still, lets protect it with a timer
# since it might get hung up inside and we do not want to get stuck here.
#
my $childpid = fork();

#
# fork() returns undef on failure. Without this check a failed fork falls
# into the child branch below, and the exec() there would replace this
# script itself rather than a child of it.
#
if (!defined($childpid)) {
    fatal("mkvserver: could not fork to start vserver $vnodeid: $!");
}
if ($childpid) {
    #
    # Parent: wait for the child, but send it a TERM if it has not
    # finished within 30 seconds.
    #
    local $SIG{ALRM} = sub { kill("TERM", $childpid); };
    alarm 30;
    waitpid($childpid, 0);
    alarm 0;

    #
    # If failure then cleanup.
    #
    if ($?) {
        fatal("mkvserver: $vnodeid vserver startup exited with $?");
    }
}
else {
    #
    # Child: TERM is ignored at the top of the script; restore the default
    # so the parent's timeout can actually kill us.
    #
    $SIG{TERM} = 'DEFAULT';

    TBDebugTimeStamp("mkvserver: starting the vserver");
    exec("$VSERVER $vnodeid start");
    die("*** $0:\n".
        "    exec failed to start the vserver!\n");
}
#
# If this file does not exist, the inner setup failed somehow. Stop now.
#
fatal("vserver did not appear to set up properly. Exiting ...")
    if (! -e "$VDIR/$BOOTDIR/vrunning");

print "vserver for $vnodeid started. Waiting ...\n";

$jailpid = fork();

#
# fork() returns undef on failure. Without this check a failed fork falls
# into the child branch below, and the exec() there would replace this
# script itself rather than a child of it.
#
if (!defined($jailpid)) {
    fatal("mkvserver: could not fork to watch vserver $vnodeid: $!");
}
if ($jailpid) {
    #
    # We do not really care about the exit status of the jail, we just want
    # to know when it dies inside.
    #
    while (1) {
        my $kidpid = waitpid(-1, 0);

        if ($kidpid < 0) {
            # Interrupted by a signal; just wait again.
            next
                if ($!{EINTR});

            #
            # Nothing left to wait for. Stop instead of spinning forever
            # on the same error. The watcher is gone, so forget its pid
            # (presumably what cleanup() keys on; confirm against cleanup).
            #
            print("waitpid failed ($!); assuming the vserver is gone.\n");
            undef($jailpid);
            last;
        }
        if ($kidpid == $jailpid) {
            undef($jailpid);
            last;
        }
        print("Unknown child $kidpid exited with status $?!\n");
    }
}
else {
    #
    # Child: TERM is ignored at the top of the script; restore the default
    # before turning into the long sleep inside the vserver.
    #
    $SIG{TERM} = 'DEFAULT';

    exec("$VSERVER $vnodeid exec sleep 1000000");
    die("*** $0:\n".
        "    exec failed to start the jail!\n");
}
print "vserver for $vnodeid has died. Cleaning up ...\n";
cleanup();
exit(0);
#
# Create a root filesystem for the vserver.
#
sub mkvserver($)
{
my ($vnodeid) = @_;
my $interface;
if (defined($IP)) {
system("/usr/lib/util-vserver/mask2prefix $JAILCNETMASK");
my $prefix = $? >> 8;
$interface = "${vnodeid}=${phys_cnet_if}:${IP}/${prefix}";
}
else {
$interface = "nodev:0.0.0.0/0";
}
# Create the skeleton vserver. It will be mostly empty.
mysystem("$VSERVER $vnodeid build --force -m skeleton ".
"--hostname $jailhostname --interface $interface ".
"--flags persistent");
# The filesystem for the vserver lands here.
my $vdir = $VDIR;
# The configuration directory is here.
my $cdir = "/etc/vservers/$vnodeid";
#
# Copy in the top level directories.
#
foreach my $dir (@ROOTCPDIRS) {
mysystem("rsync -a /$dir $vdir");
}
TBDebugTimeStamp("mkvserver: Copying root cp dirs done!");
#
# Set vserver "capabilities".
#
# Allows binding to TCP/UDP sockets below 1024
mysystem("echo 'NET_BIND_SERVICE' > $cdir/bcapabilities");
#
# Clean out some stuff from /etc.
#
mysystem("/bin/rm -rf $vdir/etc/rc.d/rc*.d/*");
#
# Make some other directories that are needed in /root.
#
foreach my $dir (@ROOTMKDIRS) {
if (! -e "$vdir/$dir") {
mkdir("$vdir/$dir", 0755) or
fatal("Could not mkdir '$dir' in $vdir: $!");
}
}
TBDebugTimeStamp("mkvserver: Creating root mkdir dirs done!");
#
# Mount (read-only) these other directories to save space.
#
foreach my $dir (@ROOTMNTDIRS) {
LoopMount("/$dir", "$vdir/$dir");
}
TBDebugTimeStamp("mkvserver: Mounting root dirs done!");
# /tmp is special.
mysystem("chmod 1777 $vdir/tmp");
#
# Duplicate the /var hierarchy without the contents.
#
open(VARDIRS, "find /var -type d -print |")
or fatal("Could not start find on /var");
while (<VARDIRS>) {
my $dir = $_;
chomp($dir);
mysystem("rsync -dlptgoD $dir $vdir$dir");
}
close(VARDIRS);
#
# Get a list of all the plain files and create zero length versions