#!/usr/bin/perl -wT
#
# EMULAB-COPYRIGHT
# Copyright (c) 2008-2010 University of Utah and the Flux Group.
# All rights reserved.
#
# Implements the libvnode API for OpenVZ support in Emulab.
#
package libvnode_openvz;
use Exporter;
@ISA    = "Exporter";

# Every vz_* entry point is exported by default.
@EXPORT = qw( vz_init vz_setDebug
              vz_rootPreConfig vz_rootPreConfigNetwork vz_rootPostConfig
              vz_vnodeCreate vz_vnodeDestroy vz_vnodeState
              vz_vnodeBoot vz_vnodeHalt vz_vnodeReboot
              vz_vnodePreConfig vz_vnodeUnmount
              vz_vnodePreConfigControlNetwork vz_vnodePreConfigExpNetwork
              vz_vnodeConfigResources vz_vnodeConfigDevices
              vz_vnodePostConfig vz_vnodeExec
            );

# Dispatch table: operation name => implementing vz_* routine.
%ops = ( 'init' => \&vz_init,
         'setDebug' => \&vz_setDebug,
         'rootPreConfig' => \&vz_rootPreConfig,
         'rootPreConfigNetwork' => \&vz_rootPreConfigNetwork,
         'rootPostConfig' => \&vz_rootPostConfig,
         'vnodeCreate' => \&vz_vnodeCreate,
         'vnodeDestroy' => \&vz_vnodeDestroy,
         'vnodeState' => \&vz_vnodeState,
         'vnodeBoot' => \&vz_vnodeBoot,
         'vnodeHalt' => \&vz_vnodeHalt,
         'vnodeUnmount' => \&vz_vnodeUnmount,
         'vnodeReboot' => \&vz_vnodeReboot,
         'vnodeExec' => \&vz_vnodeExec,
         'vnodePreConfig' => \&vz_vnodePreConfig,
         'vnodePreConfigControlNetwork' => \&vz_vnodePreConfigControlNetwork,
         'vnodePreConfigExpNetwork' => \&vz_vnodePreConfigExpNetwork,
         'vnodeConfigResources' => \&vz_vnodeConfigResources,
         'vnodeConfigDevices' => \&vz_vnodeConfigDevices,
         'vnodePostConfig' => \&vz_vnodePostConfig,
    );


use strict;
use English;
use Data::Dumper;
use Socket;

# Pull in libvnode
require "/etc/emulab/paths.pm"; import emulabpaths;
use libvnode;
use libtestbed;

#
# Turn off line buffering on output
#
$| = 1;

#
# Load the OS independent support library. It will load the OS dependent
# library and initialize itself.
#

# Image used by vz_vnodeCreate when the caller does not name one.
my $defaultImage = "emulab-default";

# Nonzero means container filesystems live on LVM volumes; vz_init clears it
# when a ".nolvm" marker file is present.
my $DOLVM = 1;

# Name of the script lock that serializes root-context configuration.
my $GLOBAL_CONF_LOCK = "vzconf";

# Container status strings.
sub VZSTAT_RUNNING() { return "running"; }
sub VZSTAT_STOPPED() { return "stopped"; }
sub VZSTAT_MOUNTED() { return "mounted"; }

# External programs.
my $VZCTL  = "/usr/sbin/vzctl";
my $VZLIST = "/usr/sbin/vzlist";
my $IFCONFIG = "/sbin/ifconfig";
my $ROUTE = "/sbin/route";
my $BRCTL = "/usr/sbin/brctl";
my $IPTABLES = "/sbin/iptables";
my $MODPROBE = "/sbin/modprobe";
my $RMMOD = "/sbin/rmmod";
my $VLANCONFIG = "/sbin/vconfig";

my $VZRC   = "/etc/init.d/vz";
my $MKEXTRAFS = "/usr/local/etc/emulab/mkextrafs.pl";

my $CTRLIPFILE = "/var/emulab/boot/myip";
# DBM file mapping IMQ device index => owning node ("" when free).
my $IMQDB      = "/var/emulab/db/imqdb";
# Number of IMQ devices pre-created when the imq module is loaded.
my $MAXIMQ     = 64;

my $CONTROL_IFNUM  = 999;
my $CONTROL_IFDEV  = "eth${CONTROL_IFNUM}";
my $EXP_BASE_IFNUM = 0;

my $debug = 0;

# XXX needs lifting up
my $JAILCTRLNET = "172.16.0.0";
my $JAILCTRLNETMASK = "255.240.0.0";

#
# Helpers.
#
# Forward declarations (with prototypes) for routines whose definitions are
# not in this part of the file.
# NOTE(review): findBridge() is called in vz_rootPreConfigNetwork but has no
# forward declaration here -- presumably defined with the other helpers;
# confirm.
#
# Callers unpack six values from this:
# (iface, ip, netmask, maskbits, network, mac).
sub findControlNet();
# Called from vz_init and again after taking the lock in
# vz_rootPreConfigNetwork; the latter also re-runs makeBridgeMaps() after
# removing an interface from a bridge ("rebuild hashes").
sub makeIfaceMaps();
sub makeBridgeMaps();
# Called with an interface's PMAC value; the result is used as a device name.
sub findIface($);
sub findMac($);
# Called as editContainerConfigFile($vmid, \%lines).
sub editContainerConfigFile($$);

# Each takes one argument; vmstatus() is called with the numeric $vmid and its
# result is compared against 'running' / 'stopped' / 'mounted'.
sub vmexists($);
sub vmstatus($);
sub vmrunning($);
sub vmstopped($);

#
# Initialize the lib (and don't use BEGIN so we can do reinit).
#
# Rebuilds the interface and bridge maps, then turns LVM support off when
# any of the ".nolvm" marker files exists.  The single argument (pnode id)
# is accepted but not used.  Always returns 0.
#
sub vz_init {
    my ($pnode_id,) = @_;

    makeIfaceMaps();
    makeBridgeMaps();

    #
    # Turn off LVM if already using a /vz mount.
    #
    if (-e "/vz/.nolvm" || -e "/vz.save/.nolvm" || -e "/.nolvm") {
        $DOLVM = 0;
        mysystem("/sbin/dmsetup remove_all");
    }
    return 0;
}

#
# Prepare the root context.  Run once at boot.
#
# Under the global config lock this sets up the /vz filesystem (LVM volume
# group or a plain extra filesystem), starts the vz init script, loads the
# kernel modules the containers need, hooks our script into vznet.conf and
# seeds the IMQ device database.  Completion is recorded in
# /var/run/openvz.ready so later calls return immediately.
#
# Returns 0 on success (or when there is nothing to do) and -1 on error.
# The lock is released on every return path.
#
sub vz_rootPreConfig {
    #
    # Only want to do this once, so use file in /var/run, which
    # is cleared at boot.
    #
    return 0
        if (-e "/var/run/openvz.ready");

    if ((my $locked = TBScriptLock($GLOBAL_CONF_LOCK,
                                   TBSCRIPTLOCK_GLOBALWAIT(), 900))
        != TBSCRIPTLOCK_OKAY()) {
        return 0
            if ($locked == TBSCRIPTLOCK_IGNORE());
        print STDERR "Could not get the vzinit lock after a long time!\n";
        return -1;
    }
    # we must have the lock, so if we need to return right away, unlock
    if (-e "/var/run/openvz.ready") {
        TBScriptUnlock();
        return 0;
    }

    # make sure filesystem is setup
    if ($DOLVM) {
        # be ready to snapshot later on...
        # If the kernel config cannot be read, $snapshot stays "n" and we
        # report the lack of snapshot support below.
        my $snapshot = "n";
        if (open(my $cfg, "-|", "gunzip", "-c", "/proc/config.gz")) {
            while (my $line = <$cfg>) {
                if ($line =~ /^CONFIG_DM_SNAPSHOT=([yYmM])/) {
                    $snapshot = $1;
                    last;
                }
            }
            close($cfg);
        }
        if ($snapshot eq 'n' || $snapshot eq 'N') {
            print STDERR "ERROR: this kernel does not support LVM snapshots!\n";
            TBScriptUnlock();
            return -1;
        }
        elsif ($snapshot eq 'm' || $snapshot eq 'M') {
            mysystem("$MODPROBE dm-snapshot");
        }

        # Nonzero exit from grep: no "openvz" volume group yet; build one.
        if (system(q{vgs | grep -E -q '^[ ]+openvz.*$'})) {
            my $blockdevs = "";
            my %devs = libvnode::findSpareDisks();
            my $totalSize = 0;
            foreach my $dev (keys(%devs)) {
                if (defined($devs{$dev}{"size"})) {
                    # whole disk is spare
                    $blockdevs .= " /dev/$dev";
                    $totalSize += $devs{$dev}{"size"};
                }
                else {
                    # only some partitions are spare
                    foreach my $part (keys(%{$devs{$dev}})) {
                        $blockdevs .= " /dev/${dev}${part}";
                        $totalSize += $devs{$dev}{$part}{"size"};
                    }
                }
            }

            if ($blockdevs eq '') {
                die "findSpareDisks found no disks, can't use LVM!\n";
            }

            mysystem("pvcreate $blockdevs");
            mysystem("vgcreate openvz $blockdevs");

            # XXX eventually could move this into its own logical volume, but
            # we don't ever know how many images we'll have to store.
            mysystem("$VZRC stop");
            mysystem("rm -rf /vz")
                if (-e "/vz");
            mysystem("mkdir /vz");
            mysystem("cp -pR /vz.save/* /vz/");
        }

        # make sure our volumes are active -- they seem to become inactive
        # across reboots
        mysystem("vgchange -a y openvz");
    }
    else {
        # No /vz entry in fstab yet: build the extra filesystem and mark it
        # so vz_init keeps LVM off from now on.
        if (system(q{grep -q '^/dev/.*/vz.*$' /etc/fstab})) {
            mysystem("$VZRC stop");
            mysystem("rm -rf /vz")
                if (-e "/vz");
            mysystem("mkdir /vz");
            mysystem("$MKEXTRAFS -f /vz");
            mysystem("cp -pR /vz.save/* /vz/");
            mysystem("touch /vz/.nolvm");
        }
        if (system(q{mount | grep -q 'on /vz'})) {
            mysystem("mount /vz");
        }
    }

    # We need to increase the size of the net.core.netdev_max_backlog
    # sysctl var in the root context; not sure to what amount, or exactly
    # why though.  Perhaps there is too much contention when handling enqueued
    # packets on the veths?
    mysystem("sysctl -w net.core.netdev_max_backlog=2048");

    # make sure the initscript is going...
    # (stdout first, then stderr onto it; the previous "2&>1" spelling was
    # not a valid redirect.)
    if (system("$VZRC status > /dev/null 2>&1")) {
        mysystem("$VZRC start");
    }

    # get rid of this simple container device support
    if (!system('lsmod | grep -q vznetdev')) {
        system("$RMMOD vznetdev");
    }

    # this is what we need for veths
    mysystem("$MODPROBE vzethdev");

    # For tunnels
    mysystem("$MODPROBE ip_gre");

    # For VLANs
    mysystem("$MODPROBE 8021q");

    # we need this stuff for traffic shaping -- only root context can
    # modprobe, for now.
    mysystem("$MODPROBE sch_plr");
    mysystem("$MODPROBE sch_delay");
    mysystem("$MODPROBE sch_htb");

    # make sure our network hooks are called
    if (system('grep -q -e EXTERNAL_SCRIPT /etc/vz/vznet.conf')) {
        if (! -e '/etc/vz/vznet.conf') {
            open(my $vznet, ">", "/etc/vz/vznet.conf")
                or die "could not open /etc/vz/vznet.conf: $!";
            print $vznet "#!/bin/bash\n";
            print $vznet "\n";
            close($vznet);
        }
        mysystem("echo 'EXTERNAL_SCRIPT=\"/usr/local/etc/emulab/vznetinit-elab.sh\"' >> /etc/vz/vznet.conf");
    }

    #
    # XXX all this network config stuff should be done in PreConfigNetwork,
    # but we can't rmmod the IMQ module to change the config, so no point.
    #

    # Ug, pre-create a bunch of imq devices, since adding new ones
    # does not work right yet.
    mysystem("$MODPROBE imq numdevs=$MAXIMQ");
    mysystem("$MODPROBE ipt_IMQ");

    # Create a DB to manage them.
    my %MDB;
    if (!dbmopen(%MDB, $IMQDB, 0660)) {
        print STDERR "*** Could not create $IMQDB\n";
        # Do not leave the global lock held on the way out.
        TBScriptUnlock();
        return -1;
    }
    # Every slot starts out free (""); existing entries are preserved.
    for (my $i = 0; $i < $MAXIMQ; $i++) {
        $MDB{"$i"} = ""
            if (!exists($MDB{"$i"}));
    }
    dbmclose(%MDB);

    mysystem("touch /var/run/openvz.ready");
    TBScriptUnlock();
    return 0;
}

#
# Prepare any network stuff in the root context on a global basis.  Run once
# at boot, or at reconfigure.  For openvz, this consists of creating bridges
# and configuring them as necessary.
#
# Arguments: ($node_ifs, $node_ifsets, $node_lds).  $node_ifs maps a node to
# a list of interface hashes (ISVIRT, ITYPE, VTAG, IFACE, PMAC are read);
# $node_lds maps a node to a list of link hashes (TYPE is read);
# $node_ifsets is not used here.
#
# Runs entirely under the global config lock.  Returns 0 on success and -1
# if the lock or the IMQ database cannot be obtained, or if there are not
# enough free IMQ devices.
#
sub vz_rootPreConfigNetwork {
    my ($node_ifs,$node_ifsets,$node_lds) = @_;

    if (TBScriptLock($GLOBAL_CONF_LOCK, 0, 900) != TBSCRIPTLOCK_OKAY()) {
        print STDERR "Could not get the vznetwork lock after a long time!\n";
        return -1;
    }

    # Do this again after lock.
    makeIfaceMaps();
    makeBridgeMaps();

    # setup forwarding on ctrl net -- NOTE that iptables setup to do NAT
    # actually happens per vnode now.
    my ($iface,$ip,$netmask,$maskbits,$network,$mac) = findControlNet();
    mysystem("echo 1 > /proc/sys/net/ipv4/conf/$iface/forwarding");
    # XXX only needed for fake mac hack, which should go away someday
    mysystem("echo 1 > /proc/sys/net/ipv4/conf/$iface/proxy_arp");

    # figure out what bridges we need to make:
    # we need a bridge for each physical iface that is a multiplex pipe,
    # and one for each VTAG given PMAC=none (i.e., host containing both sides
    # of a link, or an entire lan).
    my %brs = ();
    foreach my $node (keys(%$node_ifs)) {
        foreach my $ifc (@{$node_ifs->{$node}}) {
            next if (!$ifc->{ISVIRT});

            if ($ifc->{ITYPE} eq "loop") {
                #
                # No physical device. Its a loopback (trivial) link/lan
                # All we need is a common bridge to put the veth ifaces into.
                #
                my $brname = "br" . $ifc->{VTAG};
                $brs{$brname}{ENCAP} = 0;
                $brs{$brname}{SHORT} = 0;
            }
            elsif ($ifc->{ITYPE} eq "vlan") {
                my $pdev = $ifc->{IFACE};
                my $vtag = $ifc->{VTAG};
                my $vdev = "${pdev}.${vtag}";

                # Name the new device "<iface>.<vtag>", then restore the
                # naming scheme.  Failures are ignored (e.g. device exists).
                system("$VLANCONFIG set_name_type DEV_PLUS_VID_NO_PAD");
                system("$VLANCONFIG add $pdev $vtag");
                system("$VLANCONFIG set_name_type VLAN_PLUS_VID_NO_PAD");
                system("$IFCONFIG $vdev up");

                my $brname = "pbr$vdev";
                $brs{$brname}{ENCAP} = 1;
                $brs{$brname}{SHORT} = 0;
                $brs{$brname}{PHYSDEV} = $vdev;
            }
            elsif ($ifc->{PMAC} eq "none") {
                my $brname = "br" . $ifc->{VTAG};
                # if no PMAC, we don't need encap on the bridge
                $brs{$brname}{ENCAP} = 0;
                # count up the members so we can figure out if this is a shorty
                # NOTE(review): the first member sets the count to 0, so
                # MEMBERS holds (members - 1) and the "== 2" test below
                # fires for three members, not two -- confirm intent.
                if (!exists($brs{$brname}{MEMBERS})) {
                    $brs{$brname}{MEMBERS} = 0;
                }
                else {
                    $brs{$brname}{MEMBERS}++;
                }
            }
            else {
                my $pdev = findIface($ifc->{PMAC});
                my $brname = "pbr$pdev";
                $brs{$brname}{ENCAP} = 1;
                $brs{$brname}{SHORT} = 0;
                $brs{$brname}{PHYSDEV} = $pdev;
            }
        }
    }

    # actually make bridges and add phys ifaces
    foreach my $k (keys(%brs)) {
        # postpass to setup SHORT if only two members and no PMAC
        if (exists($brs{$k}{MEMBERS})) {
            $brs{$k}{SHORT} = ($brs{$k}{MEMBERS} == 2) ? 1 : 0;
            $brs{$k}{MEMBERS} = undef;
        }

        # building bridges is an important activity
        if (! -d "/sys/class/net/$k/bridge") {
            mysystem("$BRCTL addbr $k");
        }
        # repetitions of this should not hurt anything
        mysystem("$IFCONFIG $k 0 up");

        # XXX here we would normally config the bridge to encapsulate or
        # act in short mode

        if (exists($brs{$k}{PHYSDEV})) {
            my $physdev = $brs{$k}{PHYSDEV};

            # make sure this iface isn't already part of another bridge; if it
            # it is, remove it from there first and add to this bridge.
            my $obr = findBridge($physdev);
            if (defined($obr)) {
                mysystem("$BRCTL delif " . $obr . " " . $physdev);
                # rebuild hashes
                makeBridgeMaps();
            }
            mysystem("$BRCTL addif $k $physdev");
        }
    }

    # Use the IMQDB to reserve the devices to the container. We have the lock.
    my %MDB;
    if (!dbmopen(%MDB, $IMQDB, 0660)) {
        print STDERR "*** Could not create $IMQDB\n";
        TBScriptUnlock();
        return -1;
    }
    # $i is the next slot to examine; it carries over from node to node.
    my $i = 0;
    foreach my $node (keys(%$node_lds)) {
        foreach my $ldc (@{$node_lds->{$node}}) {
            next
                if ($ldc->{"TYPE"} ne 'duplex');

            # Claim the next slot that is free or already ours.  Track
            # success explicitly: $i == $MAXIMQ is also what we see right
            # after successfully claiming the very last slot.
            my $claimed = 0;
            while ($i < $MAXIMQ) {
                my $current = $MDB{"$i"};

                if (!defined($current) ||
                    $current eq "" || $current eq $node) {
                    $MDB{"$i"} = $node;
                    $claimed = 1;
                }
                $i++;
                last
                    if ($claimed);
            }
            if (!$claimed) {
                print STDERR "*** No more IMQs\n";
                dbmclose(%MDB);
                TBScriptUnlock();
                return -1;
            }
        }
        # Clear anything else this node is using; no longer needed.
        for (my $j = $i; $j < $MAXIMQ; $j++) {
            my $current = $MDB{"$j"};

            if (!defined($current)) {
                $MDB{"$j"} = $current = "";
            }
            if ($current eq $node) {
                $MDB{"$j"} = "";
            }
        }
    }
    dbmclose(%MDB);

    TBScriptUnlock();
    return 0;
}

#
# Root-context post-configuration hook.  Nothing to do for OpenVZ at
# present; always returns 0.
#
sub vz_rootPostConfig {
    # Locking, if this ever does something?
    return 0;
}

#
# Create an OpenVZ container to host a vnode.  Should be called only once.
#
# Arguments:
#   $vnode_id        - vnode name; must end in "-<digits>", and those digits
#                      become the container id.
#   $image           - image name (defaults to $defaultImage when empty).
#   $reload_args_ref - optional hash ref; when given, the image is
#                      (re)downloaded first.  Only IMAGEMTIME is read here;
#                      the whole ref is handed to libvnode::downloadImage.
#
# Returns the container id on success, -1 if the per-image lock cannot be
# obtained.  A malformed vnode id is reported via fatal().
#
sub vz_vnodeCreate {
    my ($vnode_id,$image,$reload_args_ref) = @_;

    my $vmid;
    if ($vnode_id =~ /^\w+\d+\-(\d+)$/) {
        $vmid = $1;
    }
    else {
        fatal("vz_vnodeCreate: bad vnode_id $vnode_id!");
    }

    if (!defined($image) || $image eq '') {
        $image = $defaultImage;
    }

    my $imagelockpath = "/var/emulab/run/openvz.image.${image}.ready";
    my $imagelockname = "vzimage.${image}";
    my $imagepath = "/vz/template/cache/${image}.tar.gz";

    if (defined($reload_args_ref)) {
        my %reload_args = %$reload_args_ref;

        # Tell stated via tmcd
        libvnode::setState("RELOADSETUP");

        #
        # So, we are reloading this vnode (and maybe others).  Need to grab
        # the global lock for this image, check if we really need to download
        # the image based on the mtime for the currently cached image (if there
        # is one), if there is old image state, move out of the way, then
        # download the new image.  State to move out of the way for an old
        # image is the ready file, the image file, lvm "root" devices that we
        # previously had built still-live VMs out of (we need to rename them),
        # and finally, garbage collecting unused "root" devices.
        #
        # Note that we need to be really careful with the last item -- we
        # only GC if our create has happened successfully, and we take the
        # global image GC lock to do so.  This may race due to the nature
        # of global locks and result in not all old devices getting reaped,
        # but oh well.  Best effort for now.
        #
        if (TBScriptLock($imagelockname, TBSCRIPTLOCK_GLOBALWAIT(), 1800)
            != TBSCRIPTLOCK_OKAY()) {
            print STDERR "Could not get the $imagelockname lock after a long time!\n";
            return -1;
        }

        # do we have the right image file already?
        my $incache = 0;
        if (-e $imagepath) {
            my $mtime = (stat($imagepath))[9];
            if ("$mtime" eq $reload_args{"IMAGEMTIME"}) {
                $incache = 1;
            }
            else {
                print "mtimes for $imagepath differ: local $mtime, server " .
                    $reload_args{"IMAGEMTIME"} . "\n";
                unlink($imagepath);
            }
        }

        if (!$incache) {
            if ($DOLVM) {
                # did we create an lvm device for the old image at some point?
                # (i.e., does the image lock file exist?) and is there still
                # a logical device for this image?
                if (-e $imagelockpath &&
                    !system("lvdisplay /dev/openvz/$image > /dev/null 2>&1")) {
                    #
                    # Pick a numeric suffix that no existing volume uses.
                    # The listing must be captured with backticks; system()
                    # only returns the exit status.
                    #
                    my %taken = ();
                    foreach my $line (`lvs --noheadings`) {
                        if ($line =~ /^\s*([-_\d\w]+)\.(\d+)\s+openvz/) {
                            $taken{int($2)} = 1;
                        }
                    }
                    my $rand;
                    do {
                        $rand = int(rand(100000));
                    } while (exists($taken{$rand}));

                    # rename nicely works even when snapshots exist
                    mysystem("lvrename /dev/openvz/$image" .
                             " /dev/openvz/$image.$rand");

                    # now we can remove the readyfile
                    unlink($imagelockpath);
                }
            }
            elsif (-e $imagelockpath) {
                # now we can remove the readyfile
                unlink($imagelockpath);
            }
        }

        # Tell stated via tmcd
        libvnode::setState("RELOADING");

        if (!$incache) {
            # Now we just download the file, then let create do its normal thing
            # NOTE(review): the result of downloadImage is not checked --
            # confirm whether it reports failure via its return value.
            libvnode::downloadImage($imagepath,0,$reload_args_ref);

            # reload has finished, file is written... so let's set its mtime
            utime(time(),$reload_args{"IMAGEMTIME"},$imagepath);
        }

        TBScriptUnlock();
    }

    my $createArg = "";
    if (TBScriptLock($imagelockname, TBSCRIPTLOCK_GLOBALWAIT(), 1800)
        != TBSCRIPTLOCK_OKAY()) {
        print STDERR "Could not get the $imagelockname lock after a long time!\n";
        return -1;
    }
    if ($DOLVM) {
        # All sizes are in MB.
        my $MIN_ROOT_LVM_VOL_SIZE = 1024;
        my $MAX_ROOT_LVM_VOL_SIZE = 8 * 1024;
        my $MIN_SNAPSHOT_VOL_SIZE = 512;
        my $MAX_SNAPSHOT_VOL_SIZE = $MAX_ROOT_LVM_VOL_SIZE;

        # XXX size our snapshots to assume 50 VMs on the node.
        my $MAX_NUM_VMS = 50;

        # figure out how big our volumes should be based on the volume
        # group size
        my $vgSize;
        my $rootSize = $MAX_ROOT_LVM_VOL_SIZE;
        my $snapSize = $MAX_SNAPSHOT_VOL_SIZE;

        open(my $vgfd, "vgdisplay openvz |")
            or die "popen(vgdisplay openvz): $!";
        while (my $line = <$vgfd>) {
            chomp($line);
            if ($line =~ /^\s+VG Size\s+(\d+[\.\d]*)\s+(\w+)/) {
                # convert to MB
                if ($2 eq "GB") {    $vgSize = $1 * 1024; }
                elsif ($2 eq "TB") { $vgSize = $1 * 1024 * 1024; }
                elsif ($2 eq "PB") { $vgSize = $1 * 1024 * 1024 * 1024; }
                elsif ($2 eq "MB") { $vgSize = $1 + 0; }
                elsif ($2 eq "KB") { $vgSize = $1 / 1024; }
                last;
            }
        }
        close($vgfd);

        # If the size could not be parsed we stay at the maximums.
        if (defined($vgSize)) {
            # per-VM share of the volume group
            $vgSize /= $MAX_NUM_VMS;

            if ($vgSize < $MIN_ROOT_LVM_VOL_SIZE) {
                $rootSize = int($MIN_ROOT_LVM_VOL_SIZE);
            }
            elsif ($vgSize < $MAX_ROOT_LVM_VOL_SIZE) {
                $rootSize = int($vgSize);
            }
            if ($vgSize < $MIN_SNAPSHOT_VOL_SIZE) {
                $snapSize = int($MIN_SNAPSHOT_VOL_SIZE);
            }
            elsif ($vgSize < $MAX_SNAPSHOT_VOL_SIZE) {
                $snapSize = int($vgSize);
            }
        }

        print STDERR "Using LVM with root size $rootSize MB, snapshot size $snapSize MB.\n";

        # Build the image's base volume unless the ready file says it exists.
        if (! -e $imagelockpath) {
            print "Creating LVM core logical device for image $image\n";

            # ok, create the lvm logical volume for this image.
            mysystem("lvcreate -L${rootSize}M -n $image openvz");
            mysystem("mkfs -t ext3 /dev/openvz/$image");
            mysystem("mkdir -p /tmp/mnt/$image");
            mysystem("mount /dev/openvz/$image /tmp/mnt/$image");
            mysystem("mkdir -p /tmp/mnt/$image/root /tmp/mnt/$image/private");
            mysystem("tar -xzf $imagepath -C /tmp/mnt/$image/private");
            mysystem("umount /tmp/mnt/$image");

            # ok, we're done
            mysystem("mkdir -p /var/emulab/run");
            mysystem("touch $imagelockpath");
        }
        TBScriptUnlock();

        # Now take a snapshot of this image's logical device
        mysystem("lvcreate -s -L${snapSize}M -n $vnode_id /dev/openvz/$image");
        mysystem("mkdir -p /mnt/$vnode_id");
        mysystem("mount /dev/openvz/$vnode_id /mnt/$vnode_id");

        $createArg = "--private /mnt/$vnode_id/private" .
            " --root /mnt/$vnode_id/root --nofs yes";
    }
    else {
        TBScriptUnlock();
    }

    if (defined($reload_args_ref)) {
        # Tell stated via tmcd
        libvnode::setState("RELOADDONE");
        sleep(4);
        libvnode::setState("SHUTDOWN");
    }

    # build the container
    mysystem("$VZCTL create $vmid --ostemplate $image $createArg");

    # make sure bootvnodes actually starts things up on boot, not openvz
    mysystem("$VZCTL set $vmid --onboot no --name $vnode_id --save");

    # set some resource limits:
    my %deflimits = ( "diskinodes" => "unlimited:unlimited",
                      "diskspace" => "unlimited:unlimited",
                      "numproc" => "unlimited:unlimited",
                      "numtcpsock" => "unlimited:unlimited",
                      "numothersock" => "unlimited:unlimited",
                      "vmguarpages" => "unlimited:unlimited",
                      "kmemsize" => "unlimited:unlimited",
                      "tcpsndbuf" => "unlimited:unlimited",
                      "tcprcvbuf" => "unlimited:unlimited",
                      "othersockbuf" => "unlimited:unlimited",
                      "dgramrcvbuf" => "unlimited:unlimited",
                      "oomguarpages" => "unlimited:unlimited",
                      "lockedpages" => "unlimited:unlimited",
                      "privvmpages" => "unlimited:unlimited",
                      "shmpages" => "unlimited:unlimited",
                      "numfile" => "unlimited:unlimited",
                      "numflock" => "unlimited:unlimited",
                      "numpty" => "unlimited:unlimited",
                      "numsiginfo" => "unlimited:unlimited",
                      #"dcachesize" => "unlimited:unlimited",
                      "numiptent" => "unlimited:unlimited",
                      "physpages" => "unlimited:unlimited",
                      #"cpuunits" => "unlimited",
                      "cpulimit" => "0",
                      "cpus" => "unlimited",
                      "meminfo" => "none",
        );
    # Sorted only so the generated command line is reproducible.
    my $savestr = join(" ", map { "--$_ $deflimits{$_}" }
                       sort(keys(%deflimits)));
    mysystem("$VZCTL set $vmid $savestr --save");

    # XXX give them cap_net_admin inside containers... necessary to set
    # txqueuelen on devices inside the container.  This may have other
    # undesireable side effects, but need it for now.
    mysystem("$VZCTL set $vmid --capability net_admin:on --save");

    #
    # Make some directories in case the guest doesn't have them -- the elab
    # mount and umount vz scripts need them to be there!
    #
    # Without LVM the private area is keyed by the container id we created
    # the container with (as the PreConfig routines also assume), not by the
    # vnode name.
    #
    my $privroot = "/vz/private/$vmid";
    if ($DOLVM) {
        $privroot = "/mnt/$vnode_id/private";
    }
    mysystem("mkdir -p $privroot/var/emulab/boot/");

    # NOTE: we can't ever umount the LVM logical device because vzlist can't
    # return status appropriately if a VM's root and private areas don't
    # exist.  So /mnt/$vnode_id deliberately stays mounted.

    return $vmid;
}

#
# Destroy a vnode's container, its LVM snapshot (when LVM is in use) and
# its IMQ reservations.
#
# Arguments: ($vnode_id, $vmid).  Returns 0 on success, -1 on failure.
#
sub vz_vnodeDestroy {
    my ($vnode_id,$vmid) = @_;

    if ($DOLVM) {
        mysystem("umount /mnt/$vnode_id");
        mysystem("lvremove -f /dev/openvz/$vnode_id");
    }
    mysystem("$VZCTL destroy $vnode_id");
    return -1
        if ($?);

    #
    # Clear the IMQ reservations. Must lock since IMQDB is a shared
    # resource.
    #
    if (TBScriptLock($GLOBAL_CONF_LOCK, 0, 900) != TBSCRIPTLOCK_OKAY()) {
        print STDERR "Could not get the vzpreconfig lock after a long time!\n";
        return -1;
    }
    my %MDB;
    if (!dbmopen(%MDB, $IMQDB, 0660)) {
        print STDERR "*** Could not open $IMQDB\n";
        TBScriptUnlock();
        return -1;
    }
    for (my $i = 0; $i < $MAXIMQ; $i++) {
        my $owner = $MDB{"$i"};

        # A slot missing from the DB reads back as undef; it is not ours.
        next
            if (!defined($owner) || $owner ne $vnode_id);
        $MDB{"$i"} = "";
    }
    dbmclose(%MDB);
    TBScriptUnlock();
    return 0;
}

#
# Run a command inside a vnode's container via "vzctl exec2".
#
# Arguments: ($vnode_id, $vmid, $command); $vmid is not used.
# Returns the raw wait status ($?) left by system().
#
sub vz_vnodeExec {
    my ($vnode_id,$vmid,$command) = @_;

    # Note: do not use mysystem here since that exits.
    system("$VZCTL exec2 $vnode_id $command");

    return $?;
}

#
# Report the state of a vnode's container.
#
# Arguments: ($vnode_id, $vmid).  Returns one of the VNODE_STATUS_*()
# values; VNODE_STATUS_UNKNOWN() when vmstatus() gives nothing or
# something unrecognized.
#
sub vz_vnodeState {
    my ($vnode_id,$vmid) = @_;

    # Sometimes if the underlying filesystems are not mounted, we might get
    # no status even though the vnode has been created (currently, this will
    # only happen with LVM)... since the openvz utils seem to need to see the
    # vnode filesystem in order to work properly, which is sensible).
    if ($DOLVM) {
        if (-e "/etc/vz/conf/${vmid}.conf" && -e "/dev/openvz/$vnode_id"
            && ! -e "/mnt/$vnode_id/private") {
            print "Trying to mount LVM logical device for vnode $vnode_id: ";
            mysystem("mount /dev/openvz/$vnode_id /mnt/$vnode_id");
            print "done.\n";
        }
    }

    my $status = vmstatus($vmid);
    return VNODE_STATUS_UNKNOWN()
        if (!defined($status));

    return VNODE_STATUS_RUNNING()
        if ($status eq 'running');
    return VNODE_STATUS_STOPPED()
        if ($status eq 'stopped');
    return VNODE_STATUS_MOUNTED()
        if ($status eq 'mounted');

    return VNODE_STATUS_UNKNOWN();
}

#
# Start a vnode's container.
#
# Arguments: ($vnode_id, $vmid); $vmid is not used.  Returns 0.
#
sub vz_vnodeBoot {
    my ($vnode_id,$vmid) = @_;

    if ($DOLVM) {
        # Plain system(): the result is deliberately not checked
        # (presumably because the volume may already be mounted).
        system("mount /dev/openvz/$vnode_id /mnt/$vnode_id");
    }

    mysystem("$VZCTL start $vnode_id");

    return 0;
}

sub vz_vnodeHalt {
    # Stop the container for $vnode_id; $vmid is accepted but not used.
    my ($vnode_id,$vmid) = @_;
    my $stopcmd = "$VZCTL stop $vnode_id";

    mysystem($stopcmd);
    return 0;
}

#
# Unmount a vnode's container filesystem via "vzctl umount".
#
# Arguments: ($vnode_id, $vmid); $vmid is not used.  Returns 0.
#
sub vz_vnodeUnmount {
    my ($vnode_id,$vmid) = @_;

    mysystem("$VZCTL umount $vnode_id");

    return 0;
}

#
# Restart a vnode's container via "vzctl restart".
#
# Arguments: ($vnode_id, $vmid); $vmid is not used.  Returns 0.
#
sub vz_vnodeReboot {
    my ($vnode_id,$vmid) = @_;

    mysystem("$VZCTL restart $vnode_id");

    return 0;
}

#
# Pre-boot configuration of a vnode's container.
#
# Syncs the container's NETDEV list with the IMQ devices reserved for it in
# the IMQ database, makes sure the container filesystem is mounted, and then
# runs $callback with the container's private root directory as its only
# argument (serialized under the global config lock).
#
# Arguments: ($vnode_id, $vmid, $callback).
# Returns the callback's return value, or -1 if a lock or the IMQ database
# could not be obtained.
#
sub vz_vnodePreConfig {
    my ($vnode_id,$vmid,$callback) = @_;

    # Make sure we're mounted so that vzlist and friends work; see NOTE about
    # mounting LVM logical devices above.
    if ($DOLVM) {
        system("mount /dev/openvz/$vnode_id /mnt/$vnode_id");
    }

    #
    # Look and see if this node already has imq devs mapped into it -- if
    # those match the ones in the IMQDB, do nothing, else fixup. Must lock
    # since IMQDB is a shared resource.
    #
    if (TBScriptLock($GLOBAL_CONF_LOCK, 0, 900) != TBSCRIPTLOCK_OKAY()) {
        print STDERR "Could not get the vzpreconfig lock after a long time!\n";
        return -1;
    }
    my %MDB;
    if (!dbmopen(%MDB, $IMQDB, 0660)) {
        print STDERR "*** Could not open $IMQDB\n";
        TBScriptUnlock();
        return -1;
    }
    # Devices the IMQ database says belong to this vnode.
    my %wanted = ();
    for (my $i = 0; $i < $MAXIMQ; $i++) {
        my $owner = $MDB{"$i"};

        next
            if (!defined($owner) || $owner ne $vnode_id);

        $wanted{"imq$i"} = 1;
    }
    dbmclose(%MDB);
    TBScriptUnlock();

    #
    # Devices the container config currently lists.  Read the file directly:
    # the old sed one-liner in backticks had its \1 backreference turned
    # into a control character by Perl's string interpolation.
    #
    my $conffile = "/etc/vz/conf/${vmid}.conf";
    my $existing = "";
    if (open(my $conf, "<", $conffile)) {
        while (my $line = <$conf>) {
            # last definition wins
            if ($line =~ /^\s*NETDEV="(.*)"/) {
                $existing = $1;
            }
        }
        close($conf);
    }
    else {
        print STDERR "Could not read $conffile: $!\n";
    }
    my %mapped = ();
    foreach my $dev (split(/[\s,]+/, $existing)) {
        $mapped{$dev} = 1
            if ($dev ne "");
    }

    #
    # Add what is reserved but not mapped, drop what is mapped but no longer
    # reserved, and leave already-correct mappings alone.  (The previous
    # undef/0/1 marker scheme deleted the already-correct ones as well,
    # because undef == 0 is true.)
    #
    foreach my $dev (keys(%wanted)) {
        mysystem("$VZCTL set $vnode_id --netdev_add $dev --save")
            if (!exists($mapped{$dev}));
    }
    foreach my $dev (keys(%mapped)) {
        mysystem("$VZCTL set $vnode_id --netdev_del $dev --save")
            if (!exists($wanted{$dev}));
    }

    #
    # Make sure container is mounted before calling the callback.
    #
    my $status = vmstatus($vmid);
    my $didmount = 0;
    if (!defined($status) ||
        ($status ne 'running' && $status ne 'mounted')) {
        mysystem("$VZCTL mount $vnode_id");
        $didmount = 1;
    }
    my $privroot = "/vz/private/$vmid";
    if ($DOLVM) {
        $privroot = "/mnt/$vnode_id/private";
    }
    # Serialize the callback. Sucks. iptables.
    if (TBScriptLock($GLOBAL_CONF_LOCK, 0, 900) != TBSCRIPTLOCK_OKAY()) {
        print STDERR "Could not get callback lock after a long time!\n";
        # Undo our own mount; plain system() so the -1 still reaches the
        # caller even if the umount fails.
        if ($didmount) {
            system("$VZCTL umount $vnode_id");
        }
        return -1;
    }
    my $ret = $callback->("$privroot");
    TBScriptUnlock();
    if ($didmount) {
        mysystem("$VZCTL umount $vnode_id");
    }
    return $ret;
}

#
# Preconfigure the control net interface; special case of vnodeConfigInterfaces.
#
sub vz_vnodePreConfigControlNetwork {
    my ($vnode_id,$vmid,$ip,$mask,$mac,$gw,
	$vname,$longdomain,$shortdomain,$bossip) = @_;

    #
    # Arguments:
    #   $vnode_id    - container name given to vzctl/mount; also written to
    #                  the vmname and realname boot files.
    #   $vmid        - container id; selects /vz/private/$vmid, the veth
    #                  device name and the container config file.
    #   $ip, $mask   - dotted-quad control net address and netmask for the
    #                  container.
    #   $mac         - control net MAC, run through macAddSep() before use.
    #   $gw          - written to the routerip boot file.
    #   $vname, $longdomain - combined into HOSTNAME; $longdomain is DOMAIN.
    #   $shortdomain - search domain for resolv.conf.
    #   $bossip      - written to the bossip boot file and used as the
    #                  nameserver.
    #
    # Returns 0 on success and -1 if the lock cannot be taken or the iptables
    # rules cannot be installed; dies if a file cannot be written.
    #

    # Small helper: (over)write one file inside the container and die if it
    # cannot be opened or flushed.  $what is the short name used in the
    # error message.
    my $writefile = sub {
	my ($path, $what, @content) = @_;

	open(my $fh, '>', $path)
	    or die "vz_vnodePreConfigControlNetwork: could not open $what for $vnode_id: $!";
	print $fh @content;
	close($fh)
	    or die "vz_vnodePreConfigControlNetwork: could not write $what for $vnode_id: $!";
    };

    # setup iptables on real ctrl net
    my ($ciface,$cip,$cnetmask,$cmaskbits,$cnetwork,$cmac) = findControlNet();

    # Network and broadcast addresses of the container's control net.
    my @ipa   = map { int($_); } split(/\./,$ip);
    my @maska = map { int($_); } split(/\./,$mask);
    my $net   = join('.', map { $ipa[$_] & $maska[$_] } (0 .. 3));
    my $bcast = join('.', map { $ipa[$_] | (~$maska[$_] & 0xff) } (0 .. 3));

    #
    # Have to serialize iptables access. Silly locking problem in the kernel.
    #
    if (TBScriptLock($GLOBAL_CONF_LOCK, 0, 900) != TBSCRIPTLOCK_OKAY()) {
	print STDERR "PreConfigControlNetwork: ".
	    "Could not get the lock after a long time!\n";
	return -1;
    }

    # If the SNAT rule is there, probably we're good.  List numerically (-n)
    # so the source column is always an address and no name lookups happen
    # while we hold the lock; escape the dots so grep matches them literally.
    (my $netre = $net) =~ s/\./\\./g;
    if (system("$IPTABLES -t nat -n -L POSTROUTING" .
	       " | grep -q -e '^SNAT.* ${netre}'")) {
	if (system("$MODPROBE ip_nat") ||
	    system("$IPTABLES -t nat -A POSTROUTING" .
		   " -s $net/$mask" .
		   " -d $cnetwork/$cnetmask -j ACCEPT") ||
	    system("$IPTABLES -t nat -A POSTROUTING" .
		   " -s $net/$mask" .
		   " -d $net/$mask -j ACCEPT") ||
	    system("$IPTABLES -t nat -A POSTROUTING" .
		   " -s $net/$mask" .
		   " -o $ciface -j SNAT --to-source $cip")) {
	    print STDERR "Could not PreConfigControlNetwork iptables\n";
	    TBScriptUnlock();
	    return -1;
	}
    }
    TBScriptUnlock();

    # Make sure we're mounted so that vzlist and friends work; see NOTE about
    # mounting LVM logical devices above.
    if ($DOLVM) {
	system("mount /dev/openvz/$vnode_id /mnt/$vnode_id");
    }

    my $privroot = "/vz/private/$vmid";
    if ($DOLVM) {
	$privroot = "/mnt/$vnode_id/private";
    }

    # add the control net iface
    my $cnet_veth = "veth${vmid}.${CONTROL_IFNUM}";
    my $cnet_mac = macAddSep($mac);
    my $ext_vethmac = $cnet_mac;
    if ($ext_vethmac =~ /^(00:00)(.*)$/) {
	$ext_vethmac = "00:01$2";
    }

    #
    # we have to hack the VEID.conf file BEFORE calling --netif_add ... --save
    # below so that when the custom script is run against our changes, it does
    # the right thing!
    #
    my %lines = ( 'ELABCTRLIP' => '"' . $ip . '"',
		  'ELABCTRLDEV' => '"' . $cnet_veth . '"' );
    editContainerConfigFile($vmid,\%lines);

    # note that we don't assign a mac to the CT0 part of the veth pair --
    # openvz does that automagically
    mysystem("$VZCTL set $vnode_id" .
	     " --netif_add ${CONTROL_IFDEV},$cnet_mac,$cnet_veth,$ext_vethmac --save");

    #
    # Make sure container is mounted
    #
    my $status = vmstatus($vmid);
    my $didmount = 0;
    if ($status ne 'running' && $status ne 'mounted') {
	if ($DOLVM) {
	    system("mount /dev/openvz/$vnode_id /mnt/$vnode_id");
	}
	mysystem("$VZCTL mount $vnode_id");
	# Remember that we did the mount so it is undone before returning.
	$didmount = 1;
    }

    my $netscripts = "$privroot/etc/sysconfig/network-scripts";

    #
    # Setup lo
    #
    $writefile->("$netscripts/ifcfg-lo", "ifcfg-lo",
		 "DEVICE=lo\n",
		 "IPADDR=127.0.0.1\n",
		 "NETMASK=255.0.0.0\n",
		 "NETWORK=127.0.0.0\n",
		 "BROADCAST=127.255.255.255\n",
		 "ONBOOT=yes\n",
		 "NAME=loopback\n");

    # remove any regular control net junk
    unlink("$netscripts/ifcfg-eth99");

    #
    # setup the control net iface in the FS ...
    #
    $writefile->("$netscripts/ifcfg-${CONTROL_IFDEV}",
		 "ifcfg-${CONTROL_IFDEV}",
		 "DEVICE=${CONTROL_IFDEV}\n",
		 "IPADDR=$ip\n",
		 "NETMASK=$mask\n",
		 "NETWORK=$net\n",
		 "BROADCAST=$bcast\n",
		 "ONBOOT=yes\n");

    #
    # setup routes.
    # HUGE NOTE: we *have* to use the /<bits> form, not the /<netmask> form
    # for now, since our iproute version is old.
    #
    $writefile->("$netscripts/route-${CONTROL_IFDEV}",
		 "route-${CONTROL_IFDEV}",
		 "$cnetwork/$cmaskbits dev ${CONTROL_IFDEV}\n",
		 "0.0.0.0/0 via $cip\n");

    #
    # ... and make sure it gets brought up on boot:
    # XXX: yes, this would blow away anybody's changes, but don't care now.
    #
    $writefile->("$privroot/etc/sysconfig/network", "sysconfig/network",
		 "NETWORKING=yes\n",
		 "HOSTNAME=$vname.$longdomain\n",
		 "DOMAIN=$longdomain\n",
		 "NOZEROCONF=yes\n");

    #
    # dhclient-exit-hooks normally writes this stuff on linux, so we'd better
    # do it here.
    #
    my $mybootdir = "$privroot/var/emulab/boot/";
    my @bootfiles = (
	# tells bootsetup that we are a GENVNODE...
	[ 'vmname',        $vnode_id ],
	# ...and that our event server is the proxy in the phys host
	[ 'localevserver', $cip ],
	[ 'myip',          $ip ],
	[ 'mynetmask',     $mask ],
	[ 'routerip',      $gw ],
	[ 'controlif',     ${CONTROL_IFDEV} ],
	[ 'realname',      $vnode_id ],
	[ 'bossip',        $bossip ],
    );
    foreach my $bootfile (@bootfiles) {
	my ($fname, $value) = @$bootfile;
	$writefile->("$mybootdir/$fname", $fname, "$value\n");
    }

    #
    # Let's not hang ourselves before we start
    #
    $writefile->("$privroot/etc/resolv.conf", "resolv.conf",
		 "nameserver $bossip\n",
		 "search $shortdomain\n");

    #
    # XXX Ugh, this is icky, but it avoids a second mount in PreConfig().
    # Want to copy all the tmcd config info from root context into the
    # container.
    #
    mysystem("cp -R /var/emulab/boot/tmcc.$vnode_id $mybootdir/");

    if ($didmount) {
	mysystem("$VZCTL umount $vnode_id");
    }

    return 0;
}

#
# Preconfigures experimental interfaces in the vnode before its first boot.
#
sub vz_vnodePreConfigExpNetwork {
    my ($vnode_id,$vmid,$ifs,$lds,$tunnels) = @_;

    #
    # Arguments:
    #   $vnode_id - container name given to vzctl/mount.
    #   $vmid     - container id; names the veth devices and the config file.
    #   $ifs      - array ref of interface hashes; the keys read here are
    #               ISVIRT, ID, ITYPE, IFACE, VTAG, PMAC and VMAC.
    #   $lds      - not used by this function.
    #   $tunnels  - hash ref whose values are tunnel hashes; the keys read
    #               here are tunnel_style, tunnel_srcip, tunnel_dstip,
    #               tunnel_ip, tunnel_ipmask and tunnel_unit.
    #
    # Returns 0 on success, -1 if the tunnel list cannot be read or a
    # tunnel/route command fails.
    #

    # Tolerate callers that have no interfaces or tunnels to hand over.
    $ifs     = [] if (!defined($ifs));
    $tunnels = {} if (!defined($tunnels));

    # Make sure we're mounted so that vzlist and friends work; see NOTE about
    # mounting LVM logical devices above.
    if ($DOLVM) {
	system("mount /dev/openvz/$vnode_id /mnt/$vnode_id");
    }

    my $elabifs = "";
    my $elabroutes = "";
    my %netif_strs = ();
    foreach my $ifc (@$ifs) {
	next if (!$ifc->{ISVIRT});

	#
	# Add to ELABIFS for addition to conf file (for runtime config by
	# external custom script)
	#
	my $veth = "veth$vmid.$ifc->{ID}";
	my $br;

	if ($ifc->{ITYPE} eq "vlan") {
	    my $iface = $ifc->{IFACE};
	    my $vtag  = $ifc->{VTAG};
	    my $vdev  = "${iface}.${vtag}";
	    $br = "pbr$vdev";
	}
	elsif ($ifc->{PMAC} eq "none" || $ifc->{ITYPE} eq "loop") {
	    $br = "br" . $ifc->{VTAG};
	}
	else {
	    my $iface = findIface($ifc->{PMAC});
	    $br = "pbr$iface";
	}
	if ($elabifs ne '') {
	    $elabifs .= ';';
	}
	$elabifs .= "$veth,$br";

	#
	# The ethX naming sucks, but hopefully it ensures unique, *easily
	# reconfigurable* (i.e., without a local map file) naming for veths.
	#
	my $eth = "eth" . $ifc->{VTAG};
	my $ethmac = macAddSep($ifc->{VMAC});
	my $vethmac = $ethmac;
	if ($vethmac =~ /^(00:00)(.*)$/) {
	    $vethmac = "00:01$2";
	}

	#
	# Save for later calling, since we need to hack the
	# config file BEFORE calling --netif_add so the custom postconfig
	# script does the right thing.
	# Also store up the current set of netifs so we can delete any that
	# might have been old!
	#
	$netif_strs{$eth} = "$eth,$ethmac,$veth,$vethmac";
    }

    if (%$tunnels) {
	#
	# Get current list.
	#
	my $ipfh;
	if (! open($ipfh, '-|', "/sbin/ip tunnel show")) {
	    print STDERR "Could not start /sbin/ip\n";
	    return -1;
	}
	my %gre2ip = ();
	my %ip2gre = ();

	while (my $line = <$ipfh>) {
	    if ($line =~ /^(gre\d*):.*remote\s*([\d\.]*)\s*local\s*([\d\.]*)/) {
		$gre2ip{$1} = "$2:$3";
		$ip2gre{"$2:$3"} = $1;
	    }
	}
	if (!close($ipfh)) {
	    print STDERR "Could not get tunnel list\n";
	    return -1;
	}

	foreach my $tunnel (values(%$tunnels)) {
	    next
		if ($tunnel->{"tunnel_style"} ne "gre");

	    my $srchost  = $tunnel->{"tunnel_srcip"};
	    my $dsthost  = $tunnel->{"tunnel_dstip"};
	    my $inetip   = $tunnel->{"tunnel_ip"};
	    my $mask     = $tunnel->{"tunnel_ipmask"};
	    my $unit     = $tunnel->{"tunnel_unit"};
	    my $gre;

	    if (exists($ip2gre{"$dsthost:$srchost"})) {
		$gre = $ip2gre{"$dsthost:$srchost"};
	    }
	    else {
		# Pick the lowest greN (N >= 1) that is not already listed;
		# counting existing tunnels can land on a name in use when
		# the numbering has holes.
		my $n = 1;
		$n++ while (exists($gre2ip{"gre$n"}));
		$gre = "gre$n";

		mysystem2("/sbin/ip tunnel add $gre mode gre ".
			 "local $srchost remote $dsthost ttl 64");
		return -1
		    if ($?);
		mysystem2("/sbin/ifconfig $gre 0 up");
		return -1
		    if ($?);

		$ip2gre{"$dsthost:$srchost"} = $gre;
		$gre2ip{$gre} = "$dsthost:$srchost";
	    }

	    # Route the tunnel's network through the gre device.  The prefix
	    # length is the number of set bits in the tunnel netmask, so it
	    # always agrees with the network address computed from it.
	    my $packedmask = inet_aton($mask);
	    my $net  = inet_ntoa(inet_aton($inetip) & $packedmask);
	    my $bits = unpack("%32b*", $packedmask);
	    mysystem2("/sbin/ip route replace $net/$bits dev $gre");
	    return -1
		if ($?);

	    my $veth = "veth$vmid.tun$unit";
	    my $eth  = "gre$unit";

	    $netif_strs{$eth} = "$eth,,$veth";
	    if ($elabifs ne '') {
		$elabifs .= ';';
	    }
	    # Leave bridge blank; see vznetinit-elab.sh. It does stuff.
	    $elabifs .= "$veth,";

	    # Route.
	    if ($elabroutes ne '') {
		$elabroutes .= ';';
	    }
	    $elabroutes .= "$veth,$inetip";
	}
    }

    #
    # Wait until end to do a single edit for all ifs, since they're all
    # smashed into a single config file var
    #
    my %edits = ( 'ELABIFS'    => '"' . $elabifs . '"',
		  'ELABROUTES' => '"' . $elabroutes . '"');
    editContainerConfigFile($vmid,\%edits);

    #
    # Ok, add (and delete stale) veth devices!
    # Grab current ones first.
    #
    my @current = ();
    my $conffile = "/etc/vz/conf/$vmid.conf";
    open(my $cfh, '<', $conffile)
	or die "could not open $conffile for read: $!";
    my @netiflines = grep { $_ =~ /^NETIF/ } <$cfh>;
    close($cfh);
    if (@netiflines) {
	# always take the last one :-)
	my $netifs = $netiflines[-1];
	if ($netifs =~ /NETIF="(.*)"/) {
	    $netifs = $1;
	}
	foreach my $nif (split(/;/,$netifs)) {
	    if ($nif =~ /ifname=([\w\d\-]+)/) {
		# don't delete the control net device!
		next if ($1 eq $CONTROL_IFDEV);

		push @current, $1;
	    }
	}
    }

    # delete
    foreach my $eth (@current) {
	if (!exists($netif_strs{$eth})) {
	    mysystem("$VZCTL set $vnode_id --netif_del $eth --save");
	}
    }
    # add/modify
    foreach my $eth (keys(%netif_strs)) {
	mysystem("$VZCTL set $vnode_id --netif_add $netif_strs{$eth} --save");
    }

    return 0;
}

#
# Resource configuration step of the vnode API.  Nothing is done here for
# OpenVZ; any arguments are ignored and 0 is always returned.
#
sub vz_vnodeConfigResources {
    return 0;
}

#
# Device configuration step of the vnode API.  Nothing is done here for
# OpenVZ; any arguments are ignored and 0 is always returned.
#
sub vz_vnodeConfigDevices {
    return 0;
}

#
# Post-configuration step of the vnode API.  Takes the vnode name and the
# container id, does nothing with them, and always returns 0.
#
sub vz_vnodePostConfig {
    my ($vnode_id, $vmid) = @_;

    return 0;
}

#
# Record the debug setting in this module's $debug and hand the same value
# on to libvnode::setDebug().
#
sub vz_setDebug($) {
    $debug = shift;
    libvnode::setDebug($debug);
}

##
## Bunch of helper functions.
##

#
# Edit an openvz container config file -- add a little emulab header and some
# vars to signal customization.  After that, change/add any lines indicated by
# the key/val pairs in the hash (sensible since the config file is intended to
# be slurped up by shells or something).
#
sub editContainerConfigFile($$) {
    my ($vmid,$edlines) = @_;

    #
    # Arguments:
    #   $vmid    - container id; the file edited is /etc/vz/conf/$vmid.conf.
    #   $edlines - hash ref of KEY => value to set.  Values are written
    #              verbatim, so string values must arrive already quoted.
    #
    # Returns 0; dies if the file cannot be read or written.
    #
    my $conffile = "/etc/vz/conf/$vmid.conf";

    open(my $rfh, '<', $conffile)
	or die "could not open $conffile: $!";
    my @lines = <$rfh>;
    close($rfh);

    # Terminate an unterminated last line so nothing we append below ends up
    # glued onto it.
    if (@lines && $lines[-1] !~ /\n\z/) {
	$lines[-1] .= "\n";
    }

    if (!grep(/^ELABCUSTOM/,@lines)) {
	push(@lines,
	     "\n",
	     "#\n",
	     "# Emulab hooks\n",
	     "#\n",
	     "CONFIG_CUSTOMIZED=\"yes\"\n",
	     "ELABCUSTOM=\"yes\"\n");
    }

    #
    # Rewrite every existing assignment of a requested key.  A key counts as
    # handled as soon as it is seen, even when its value is already right;
    # otherwise it would be appended again on every call, and a stale
    # duplicate further down the file would win when the file is sourced.
    #
    my %seen = ();
    foreach my $line (@lines) {
	# note that if the value is a string, the quotes have to be sent
	# in from caller!
	if ($line =~ /^([^#][^=]+)=(.*)$/) {
	    my ($k, $v) = ($1, $2);
	    next
		if (!exists($edlines->{$k}));

	    $seen{$k} = 1;
	    if ($v ne $edlines->{$k}) {
		$line = "$k=$edlines->{$k}\n";
	    }
	}
    }

    # Append the keys that were not in the file at all (sorted so the result
    # does not depend on hash ordering).
    foreach my $k (sort(keys(%$edlines))) {
	next
	    if ($seen{$k});
	push(@lines, "$k=$edlines->{$k}\n");
    }

    open(my $wfh, '>', $conffile)
	or die "could not open $conffile for writing: $!";
    print $wfh @lines;
    close($wfh)
	or die "could not write $conffile: $!";

    return 0;
}

#
# Returns 1 if running "$VZLIST <id>" exits with status zero, 0 otherwise.
#
sub vmexists($) {
    my ($ctid) = @_;

    my $rc = system("$VZLIST $ctid");
    return ($rc ? 0 : 1);
}

#
# Return the status word that "$VZLIST <id>" prints for the container: the
# third whitespace-separated column of the output line that starts with the
# id.  Returns undef if no such line is printed; dies if the command cannot
# be started.
#
sub vmstatus($) {
    my $id = shift;

    # Lexical handle and a named line variable, so neither a global
    # filehandle nor the caller's $_ gets clobbered.
    open(my $pfd, '-|', "$VZLIST $id")
	or die "could not exec $VZLIST: $!";

    my $status = undef;
    while (my $line = <$pfd>) {
	# \Q..\E: match the id literally, whatever characters it contains.
	if ($line =~ /^\s+\Q$id\E\s+[^\s]+\s+(\w+)/) {
	    $status = $1;
	    last;
	}
    }
    close($pfd);

    return $status;
}

#
# Returns 1 if vmstatus() reports VZSTAT_RUNNING for the container, else 0.
#
sub vmrunning($) {
    my $id = shift;

    # vmstatus() returns undef when the container is not listed; check for
    # that first so the comparison does not warn about an uninitialized value.
    my $status = vmstatus($id);
    return 1
	if (defined($status) && $status eq VZSTAT_RUNNING());
    return 0;
}

#
# Returns 1 if vmstatus() reports VZSTAT_STOPPED for the container, else 0.
#
sub vmstopped($) {
    my $id = shift;

    # vmstatus() returns undef when the container is not listed; check for
    # that first so the comparison does not warn about an uninitialized value.
    my $status = vmstatus($id);
    return 1
	if (defined($status) && $status eq VZSTAT_STOPPED());
    return 0;
}

# what can I say?
1;