libvnode_xen.pm 143 KB
Newer Older
Jon Rafkind's avatar
Jon Rafkind committed
1
#!/usr/bin/perl -wT
2
#
3
# Copyright (c) 2008-2015 University of Utah and the Flux Group.
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# 
# {{{EMULAB-LICENSE
# 
# This file is part of the Emulab network testbed software.
# 
# This file is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
# 
# This file is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public
# License for more details.
# 
# You should have received a copy of the GNU Affero General Public License
# along with this file.  If not, see <http://www.gnu.org/licenses/>.
# 
# }}}
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#
# Implements the libvnode API for Xen support in Emulab.
#
# Note that there is no distinguished first or last call of this library
# in the current implementation.  Every vnode creation (through mkvnode.pl)
# will invoke all the root* and vnode* functions.  It is up to us to make
# sure that "one time" operations really are executed only once.
#
# TODO:
# + Clear out old, incorrect state in /var/lib/xend.
#   Maybe have to do this when tearing down (killing) vnodes.
#
# + Make more robust, little turds of state still get left around
#   that wreak havoc on reboot.
#
# + Support image loading.
#
Jon Rafkind's avatar
Jon Rafkind committed
40
41
42
43
44
45
package libvnode_xen;
use Exporter;
@ISA    = "Exporter";
@EXPORT = qw( init setDebug rootPreConfig
              rootPreConfigNetwork rootPostConfig
	      vnodeCreate vnodeDestroy vnodeState
46
	      vnodeBoot vnodePreBoot vnodeHalt vnodeReboot
47
	      vnodeUnmount
Jon Rafkind's avatar
Jon Rafkind committed
48
49
	      vnodePreConfig vnodePreConfigControlNetwork
              vnodePreConfigExpNetwork vnodeConfigResources
Leigh B Stoller's avatar
Leigh B Stoller committed
50
              vnodeConfigDevices vnodePostConfig vnodeExec vnodeTearDown VGNAME
Jon Rafkind's avatar
Jon Rafkind committed
51
	    );
52
use vars qw($VGNAME);
Jon Rafkind's avatar
Jon Rafkind committed
53
54
55
56
57
58
59
60

%ops = ( 'init' => \&init,
         'setDebug' => \&setDebug,
         'rootPreConfig' => \&rootPreConfig,
         'rootPreConfigNetwork' => \&rootPreConfigNetwork,
         'rootPostConfig' => \&rootPostConfig,
         'vnodeCreate' => \&vnodeCreate,
         'vnodeDestroy' => \&vnodeDestroy,
61
	 'vnodeTearDown' => \&vnodeTearDown,
Jon Rafkind's avatar
Jon Rafkind committed
62
63
64
         'vnodeState' => \&vnodeState,
         'vnodeBoot' => \&vnodeBoot,
         'vnodeHalt' => \&vnodeHalt,
65
# XXX needs to be implemented
66
         'vnodeUnmount' => \&vnodeUnmount,
Jon Rafkind's avatar
Jon Rafkind committed
67
         'vnodeReboot' => \&vnodeReboot,
68
# XXX needs to be implemented
69
         'vnodeExec' => \&vnodeExec,
Jon Rafkind's avatar
Jon Rafkind committed
70
71
72
73
74
75
76
77
         'vnodePreConfig' => \&vnodePreConfig,
         'vnodePreConfigControlNetwork' => \&vnodePreConfigControlNetwork,
         'vnodePreConfigExpNetwork' => \&vnodePreConfigExpNetwork,
         'vnodeConfigResources' => \&vnodeConfigResources,
         'vnodeConfigDevices' => \&vnodeConfigDevices,
         'vnodePostConfig' => \&vnodePostConfig,
       );

Jon Rafkind's avatar
Jon Rafkind committed
78

79
80
81
82
83
84
85
use strict;
use English;
use Data::Dumper;
use Socket;
use File::Basename;
use File::Path;
use File::Copy;
86
use File::Temp;
87
88

# Pull in libvnode
89
BEGIN { require "/etc/emulab/paths.pm"; import emulabpaths; }
90
91
use libutil;
use libgenvnode;
92
93
use libvnode;
use libtestbed;
94
use libsetup;
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109

#
# Turn off line buffering on output
#
$| = 1;

#
# Load the OS independent support library. It will load the OS dependent
# library and initialize itself. 
# 

##
## Standard utilities and files section
##

110
my $BRCTL = "brctl";
111
my $IFCONFIG = "/sbin/ifconfig";
Leigh B Stoller's avatar
Leigh B Stoller committed
112
my $ETHTOOL = "/sbin/ethtool";
113
114
115
116
117
my $ROUTE = "/sbin/route";
my $SYSCTL = "/sbin/sysctl";
my $VLANCONFIG = "/sbin/vconfig";
my $MODPROBE = "/sbin/modprobe";
my $DHCPCONF_FILE = "/etc/dhcpd.conf";
118
my $NEW_DHCPCONF_FILE = "/etc/dhcp/dhcpd.conf";
119
my $RESTOREVM	= "$BINDIR/restorevm.pl";
Leigh B Stoller's avatar
Leigh B Stoller committed
120
my $LOCALIZEIMG	= "$BINDIR/localize_image";
121
my $IPTABLES	= "/sbin/iptables";
Leigh B Stoller's avatar
Leigh B Stoller committed
122
123
124
125
my $IPBIN	= "/sbin/ip";
my $NETSTAT     = "/bin/netstat";
my $IMAGEZIP    = "/usr/local/bin/imagezip";
my $IMAGEUNZIP  = "/usr/local/bin/imageunzip";
126
my $IMAGEDUMP   = "/usr/local/bin/imagedump";
127
my $XM          = "/usr/sbin/xm";
128
my $debug  = 0;
Mike Hibler's avatar
Mike Hibler committed
129
my $lockdebug = 0;
130

131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#
# Serial console handling. We fire up a capture per active vnode.
# We use a fine assortment of capture options:
#
#	-i: standalone mode, don't try to contact capserver directly
#	-l: (added later) set directory where log, ACL, and pid files are kept.
#	-C: use a circular buffer to capture activity while no user
#	    is connected. This gets dumped to the user when they connect.
#	-X: (added later) run in "Xen mode" on the given domain.
#	    Monitors the pty exported by xenconsoled. Note that the
#	    specific pty can change when a domain reboots; capture
#	    deals with this.
#	-R: Retry interval of 2 seconds. When capture is disconnected
#	    from the pty (due to domain reboot/shutdowns), this is how
#	    long we wait between attempts to reconnect.
#
my $CAPTURE     = "/usr/local/sbin/capture-nossl";
my $CAPTUREOPTS	= "-i -C -R 2000";

Mike Hibler's avatar
Mike Hibler committed
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
#
# Create a thin pool with the name $POOL_NAME using not more
# than $POOL_FRAC of any disk.
# 
my $usethin = 1;
my $POOL_NAME = "disk-pool";
my $POOL_FRAC = 0.75;

#
# If set to one, we will destroy a golden disk when no vnode disks
# are derived from it. Otherwise, we leave it around and it must be
# explicitly GCed by some yet-to-be-written daemon. 
#
my $REAP_GDS = 0;

#
# Flags for allocating LVs
#
# Every flag carries an empty prototype so that all four parse the same
# way: as zero-argument terms that never swallow a following expression
# (e.g. "ALLOC_PREFERINPOOL + 1" is 3 + 1, not a call with argument +1).
#
sub ALLOC_NOPOOL()	 { return 0; }
sub ALLOC_INPOOL()	 { return 1; }
sub ALLOC_PREFERNOPOOL() { return 2; }
sub ALLOC_PREFERINPOOL() { return 3; }
172

173
174
175
176
177
178
179
180
##
## Randomly chosen convention section
##

# global lock
my $GLOBAL_CONF_LOCK = "xenconf";

# default image to load on logical disks
181
182
# Just symlink /boot/vmlinuz-xenU and /boot/initrd-xenU
# to the kernel and ramdisk you want to use by default.
183
my %defaultImage = (
184
185
186
187
188
189
    'name'      => "emulab-ops-emulab-ops-XEN-STD",
    'kernel'    => "/boot/vmlinuz-xenU",
    'ramdisk'   => "/boot/initrd-xenU",
    'OSVERSION' => "any",
    'PARTOS'    => "Linux",
    'ISPACKAGE' => 0,
190
191
    'PART'      => 2,
    'BOOTPART'  => 2,
192
193
194
);

# where all our config files go
195
196
my $VMS    = "/var/emulab/vms";
my $VMDIR  = "$VMS/vminfo";
197
198
my $XENDIR = "/var/xen";

199
# Extra space for capture/restore.
200
my $EXTRAFS = "/capture";
201

202
# Extra space for image metadata between reloads.
Leigh B Stoller's avatar
Leigh B Stoller committed
203
my $METAFS = "/metadata";
204
205
# Accessor so that code outside this file can ask for the image metadata
# filesystem path held in the file-scoped $METAFS.
sub METAFS()  { return $METAFS; }
Leigh B Stoller's avatar
Leigh B Stoller committed
206

207
208
209
# Extra space for vminfo (/var/emulab/vms) between reloads.
my $INFOFS = "/vminfo";

210
211
# Xen LVM volume group name. Accessible outside this file.
$VGNAME = "xen-vg";
Leigh B Stoller's avatar
Leigh B Stoller committed
212
213
# Accessor so that code outside this file can ask for the Xen LVM volume
# group name held in the package variable $VGNAME.
sub VGNAME()  { return $VGNAME; }
214
215
216
217
218
219
220
221

##
## Indefensible, arbitrary constant section
##

# Minimum memory for dom0
my $MIN_MB_DOM0MEM = 256;

Mike Hibler's avatar
Mike Hibler committed
222
#
223
# Minimum acceptable size (in GB) of LVM VG for domUs.
Mike Hibler's avatar
Mike Hibler committed
224
225
226
227
228
229
#
# XXX we used to calculate this in terms of anticipated maximum number
# of vnodes and minimum vnode images size, blah, blah. Now we just pick
# a value that allows us to use a pc3000 node with a single 144GB disk!
#
my $XEN_MIN_VGSIZE = 120;
230

231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
#
# When loading an Emulab partition image, we use a compressed version of our
# standard MBR layout:
#
# MBR 1 or 2 FreeBSD:
#    P1: 6GB (XEN_LDSIZE) offset at 63, OS goes here
#    P2: 1MB (XEN_EMPTYSIZE), as small as we can make it
#    P3: 1GB (XEN_SWAPSIZE), standard MBR2 swap size
# MBR 1 or 2 Linux:
#    P1: 1MB (XEN_EMPTYSIZE), as small as we can make it
#    P2: 6GB (XEN_LDSIZE) offset at 63, OS goes here
#    P3: 1GB (XEN_SWAPSIZE), standard MBR2 swap size
# MBR 3:
#    P1: 16GB (XEN_LDSIZE_3) offset at 2048, standard OS partition
#    P2: 1MB (XEN_EMPTYSIZE), as small as we can make it
#    P3: 1GB (XEN_SWAPSIZE), standard MBR2 swap size
#
248
249
250
251
252
# P4 is sized based on what the user told us. If they do not specify
# XEN_EXTRA, then we default to 1G (XEN_EXTRASIZE). We need enough
# space here to support uses of mkextrafs in the clientside (e.g., for
# "no nfs" experiments where local homedirs are created).
#
253
254
255
256
257
258
# Sizes below are in 1K blocks.
#
my $XEN_LDSIZE    =  6152895;
my $XEN_LDSIZE_3  = 16777216;
my $XEN_SWAPSIZE  =  1048576;
my $XEN_EMPTYSIZE =     1024;
259
my $XEN_EXTRASIZE =  1048576;
260

Leigh B Stoller's avatar
Leigh B Stoller committed
261
262
263
264
265
266
267
268
269
270
271
# IFBs
my $IFBDB      = "/var/emulab/db/ifbdb";
# Kernel auto-creates only two! Sheesh, why a fixed limit?
my $MAXIFB     = 1024;

# Route tables for tunnels
my $RTDB           = "/var/emulab/db/rtdb";
my $RTTABLES       = "/etc/iproute2/rt_tables";
# Temporary; later kernel version increases this.
my $MAXROUTETTABLE = 255;

272
273
274
# Striping
my $STRIPE_COUNT   = 1;

275
276
277
# Whether or not to use only unpartitioned (unused) disks to form the Xen VG.
my $LVM_FULLDISKONLY = 0;

278
279
# Whether or not to use partitions only when they are big.
my $LVM_ONLYLARGEPARTS = 1;
280
281
282
283
my $LVM_LARGEPARTPCT = 8;

# In general, you only want to use one partition per disk since we stripe.
my $LVM_ONEPARTPERDISK = 1;
284

285
286
287
288
# Use openvswitch for gre tunnels.
my $OVSCTL   = "/usr/local/bin/ovs-vsctl";
my $OVSSTART = "/usr/local/share/openvswitch/scripts/ovs-ctl";

289
290
my $ISREMOTENODE = REMOTEDED();
my $BRIDGENAME   = "xenbr0";
291
my $VIFROUTING   = ((-e "$ETCDIR/xenvifrouting") ? 1 : 0);
292

293
294
my $TMCD_PORT	 = 7777;

295
296
297
298
299
#
# Information about the running Xen hypervisor
#
my %xeninfo = ();

300
301
302
303
# Local functions
sub findRoot();
sub copyRoot($$);
sub createRootDisk($);
304
sub createAuxDisk($$);
305
306
307
308
309
310
sub replace_hacks($);
sub disk_hacks($);
sub configFile($);
sub domain0Memory();
sub totalMemory();
sub hostIP($);
311
sub createDHCP();
312
313
sub addDHCP($$$$);
sub subDHCP($$);
314
sub restartDHCP();
315
316
317
sub formatDHCP($$$);
sub fixupMac($);
sub createControlNetworkScript($$$);
318
sub createExpNetworkScript($$$$$$$$);
319
sub createTunnelScript($$$$$);
Leigh B Stoller's avatar
Leigh B Stoller committed
320
sub createExpBridges($$$);
321
322
323
324
325
326
sub destroyExpBridges($$);
sub domainStatus($);
sub domainExists($);
sub addConfig($$$);
sub createXenConfig($$);
sub readXenConfig($);
327
sub lookupXenConfig($$);
328
sub getXenInfo();
Leigh B Stoller's avatar
Leigh B Stoller committed
329
330
331
332
333
sub AllocateIFBs($$$);
sub InitializeRouteTable();
sub AllocateRouteTable($);
sub LookupRouteTable($);
sub FreeRouteTable($);
Mike Hibler's avatar
Mike Hibler committed
334
sub downloadOneImage($$$);
335
sub captureRunning($);
336
337
338

#
# Fill in the file-level %xeninfo hash from the "key : value" lines
# printed by "$XM info". Dies if the command cannot be started.
# Lines that do not look like "key : value" are ignored.
#
sub getXenInfo()
{
    # Lexical handle: the old bareword handle "XM" was easy to confuse
    # with the $XM command path.
    open(my $infofh, "-|", "$XM info")
        or die "getXenInfo: could not run '$XM info': $!";

    while (my $line = <$infofh>) {
	chomp($line);
	#
	# Only record the pair when the pattern actually matched;
	# otherwise $1/$2 would be left over from an earlier line
	# (or be undefined on the first line).
	#
	if ($line =~ /^(\S+)\s*:\s+(.*)$/) {
	    $xeninfo{$1} = $2;
	}
    }

    close($infofh);
}
350
351
352
353
354
355
356

#
# Library initialization: rebuild the interface and bridge maps, pick the
# Xen toolstack command, cache the hypervisor info and, when
# /var/run/xen.ready exists, compute the LVM stripe count.
# The physical node id argument is accepted but not used here.
# Always returns 0.
#
sub init($)
{
    my ($pnode_id) = @_;

    makeIfaceMaps();
    makeBridgeMaps();

    # Use xl instead of xm when /etc/default/xen says so.
    my $stackcfg = `grep TOOLSTACK /etc/default/xen`;
    $XM = "/usr/sbin/xl"
	if ($stackcfg =~ /xl$/);

    getXenInfo();

    # Compute the stripe count for new lvms.
    $STRIPE_COUNT = computeStripeSize($VGNAME)
	if (-e "/var/run/xen.ready");

    return 0;
}

371
372
373
374
375
376
377
378
#
# Set the debug level for this library, pass it on to libvnode, and
# announce it when it is non-zero.
#
sub setDebug($)
{
    ($debug) = @_;

    libvnode::setDebug($debug);
    if ($debug) {
	print "libvnode_xen: debug=$debug\n";
    }
}

379
380
381
382
383
384
385
#
# Return the name of the script lock guarding an image. An undefined
# image name maps to the default image's name.
#
sub ImageLockName($)
{
    my $name = shift;

    $name = $defaultImage{'name'}
	if (!defined($name));

    return "xenimage." . $name;
}
386
387
388
389
390
391
#
# Return the logical volume name used for an image: the image name with
# an "image+" prefix.
#
sub ImageLVName($)
{
    my $name = shift;

    return "image+${name}";
}
392

393
394
395
396
#
# Called on each vnode, but should only be executed once per boot.
# We use a file in /var/run (cleared on reboots) to ensure this.
#
# Argument: the boss IP address, used as the destination of the
# metadata-server DNAT rule.
# Returns 0 on success, when the setup has already been done, or when
# TBScriptLock() reports TBSCRIPTLOCK_IGNORE; returns -1 on failure.
#
sub rootPreConfig($)
{
    my $bossip = shift;

    #
    # Already set up since boot; nothing to do. Return before any lock
    # handling so that we never call TBScriptUnlock() (or claim to hold
    # the lock in the debug log) for a lock we did not take.
    #
    return 0
	if (-e "/var/run/xen.ready");

    #
    # Haven't been called yet, grab the lock and double check that someone
    # didn't do it while we were waiting.
    #
    TBDebugTimeStamp("rootPreConfig: grabbing global lock $GLOBAL_CONF_LOCK")
	if ($lockdebug);
    my $locked = TBScriptLock($GLOBAL_CONF_LOCK,
			      TBSCRIPTLOCK_GLOBALWAIT(), 900);
    if ($locked != TBSCRIPTLOCK_OKAY()) {
	return 0
	    if ($locked == TBSCRIPTLOCK_IGNORE());
	print STDERR "Could not get the xeninit lock after a long time!\n";
	return -1;
    }
    TBDebugTimeStamp("  got global lock")
	if ($lockdebug);
    if (-e "/var/run/xen.ready") {
	TBDebugTimeStamp("  releasing global lock")
	    if ($lockdebug);
        TBScriptUnlock();
        return 0;
    }
    
    print "Configuring root vnode context\n";

    #
    # For compatibility with existing (physical host) Emulab images,
    # the physical host provides DHCP info for the vnodes. We manage
    # the dhcpd.conf file here. See below. 
    #
    # Note that we must first add an alias to the control net bridge so
    # that we (the physical host) are in the same subnet as the vnodes,
    # otherwise dhcpd will fail.
    #
    my ($alias_iface, $alias_ip, $alias_mask);

    #
    # Locally, we just need to add the alias to the control interface
    # (which might be a bridge).
    # 
    if (!$ISREMOTENODE) {
	my ($cnet_iface) = findControlNet();

	#
	# We use xen's antispoofing when constructing the guest control net
	# interfaces. This is most useful on a shared host, but no
	# harm in doing it all the time.
	#
	mysystem("$IPTABLES -P FORWARD DROP");
	mysystem("$IPTABLES -F FORWARD");
	# This says to forward traffic across the bridge.
	mysystem("$IPTABLES -A FORWARD ".
		 "-m physdev --physdev-in $cnet_iface -j ACCEPT");
	
	if ($VIFROUTING) {
	    mysystem("echo 1 >/proc/sys/net/ipv4/conf/$cnet_iface/proxy_arp");
	    mysystem("echo 1 >/proc/sys/net/ipv4/ip_forward");
	    # This is for arping -A to work. See emulab-cnet.pl
	    mysystem("echo 1 >/proc/sys/net/ipv4/ip_nonlocal_bind");
	}

	# Set up for metadata server for ec2 support
	print "Setting up redirection for meta server...\n";
	mysystem("$IPBIN addr add 169.254.169.254/32 ".
		 "   scope global dev $cnet_iface");
	mysystem("$IPTABLES -t nat -A PREROUTING -d 169.254.169.254/32 " .
		 "   -p tcp -m tcp --dport 80 -j DNAT ".
		 "   --to-destination ${bossip}:8787");
    }
    else {
	if (!existsBridge($BRIDGENAME)) {
	    if (mysystem2("$BRCTL addbr $BRIDGENAME")) {
		TBScriptUnlock();
		return -1;
	    }
	    #
	    # We do not set the mac address; we want it to take
	    # on the address of the attached vif interfaces so that
	    # arp works. This is quite kludgy of course, but otherwise
	    # the arp comes into the bridge interface and then kernel
	    # drops it. There is a brouter (ebtables) work around
	    # but not worth worrying about. 
	    #
	}
	(undef,$alias_mask,$alias_ip) = findVirtControlNet();
	$alias_iface = $BRIDGENAME;

	if (system("ifconfig $alias_iface | grep -q 'inet addr'")) {
	    print "Creating $alias_iface alias...\n";
	    mysystem("ifconfig $alias_iface $alias_ip netmask $alias_mask");
	}
    }

    # For tunnels
    mysystem("$MODPROBE openvswitch");
    mysystem("$OVSSTART --delete-bridges start");

    # For bandwidth constraints.
    mysystem("$MODPROBE ifb numifbs=$MAXIFB");

    # Create a DB to manage them. 
    my %MDB;
    if (!dbmopen(%MDB, $IFBDB, 0660)) {
	print STDERR "*** Could not create $IFBDB\n";
	TBScriptUnlock();
	return -1;
    }
    for (my $i = 0; $i < $MAXIFB; $i++) {
	$MDB{"$i"} = ""
	    if (!defined($MDB{"$i"}));
    }
    dbmclose(%MDB);
    
    #
    # Ensure that LVM is loaded in the kernel and ready.
    #
    print "Enabling LVM...\n"
	if ($debug);

    # We assume our kernels support this.
    mysystem2("$MODPROBE dm-snapshot");
    if ($?) {
	print STDERR "ERROR: could not load snaphot module!\n";
	TBScriptUnlock();
	return -1;
    }

    #
    # Make sure pieces are at least 5GiB.
    #
    my %devs = libvnode::findSpareDisks(5 * 1024);

    #
    # Turn on write caching. Hacky. 
    # XXX note we do not use the returned "path" here as we need to
    # change the setting on all devices, not just the whole disk devices.
    #
    foreach my $dev (keys(%devs)) {
	# only mess with the disks we are going to use
	if (exists($devs{$dev}{"size"}) || $LVM_FULLDISKONLY == 0) {
	    mysystem2("hdparm -W1 /dev/$dev");
	}
    }

    #
    # See if our LVM volume group for VMs exists and create it if not.
    #
    my $vg = `vgs | grep $VGNAME`;
    if ($vg !~ /^\s+${VGNAME}\s/) {
	print "Creating volume group...\n"
	    if ($debug);

	#
	# Total up potential maximum size
	#
	my $maxtotalSize = 0;
	my $sizeThreshold = 0;
	foreach my $dev (keys(%devs)) {
	    if (defined($devs{$dev}{"size"})) {
		$maxtotalSize += $devs{$dev}{"size"};
	    } else {
		foreach my $part (keys(%{$devs{$dev}})) {
		    $maxtotalSize += $devs{$dev}{$part}{"size"};
		}
	    }
	}
	if ($maxtotalSize > 0) {
	    $sizeThreshold = int($maxtotalSize * $LVM_LARGEPARTPCT / 100.0);
	}

	#
	# Find available devices of sufficient size, prepare them,
	# and incorporate them into a volume group.
	#
	my $totalSize = 0;
	my @blockdevs = ();
	foreach my $dev (keys(%devs)) {
	    #
	    # Whole disk is available, use it.
	    #
	    if (defined($devs{$dev}{"size"})) {
		push(@blockdevs, $devs{$dev}{"path"});
		$totalSize += $devs{$dev}{"size"};
		next;
	    }

	    #
	    # Disk contains partitions that are available.
	    #
	    my ($lpsize,$lppath);
	    foreach my $part (keys(%{$devs{$dev}})) {
		my $psize = $devs{$dev}{$part}{"size"};
		my $ppath = $devs{$dev}{$part}{"path"};

		#
		# XXX one way to avoid using the system disk, just ignore
		# all partition devices. However, in cases where the
		# remainder of the system disk represents the majority of
		# the available space (e.g., Utah d710s), this is a bad
		# idea.
		#
		if ($LVM_FULLDISKONLY) {
		    print STDERR
			"WARNING: not using partition $ppath for LVM\n";
		    next;
		}

		#
		# XXX Another heuristic to try to weed out the system
		# disk whenever feasible: if a partition device represents
		# less than some percentage of the max possible space,
		# avoid it. At Utah this one is tuned (8%) to avoid using
		# left over space on the system disk of d820s (which have
		# six other larger drives) while using it on the pc3000s
		# and d710s.
		#
		if ($LVM_ONLYLARGEPARTS && $psize < $sizeThreshold) {
		    print STDERR "WARNING: not using $ppath for LVM (too small)\n";
		    next;
		}

		#
		# XXX If we are only going to use one partition per disk,
		# record the largest one we find here. This check will
		# filter out the small "other OS" partition (3-6GB) in
		# favor of the larger "rest of the disk" partition.
		#
		if ($LVM_ONEPARTPERDISK) {
		    if (!defined($lppath) || $psize > $lpsize) {
			$lppath = $ppath;
			$lpsize = $psize;
		    }
		    next;
		}

		#
		# It ran the gauntlet of feeble filters, use it!
		#
		push(@blockdevs, $ppath);
		$totalSize += $psize;
	    }
	    if ($LVM_ONEPARTPERDISK && defined($lppath)) {
		push(@blockdevs, $lppath);
		$totalSize += $lpsize;
	    }
	}
	if (@blockdevs == 0) {
	    print STDERR "ERROR: findSpareDisks found no disks for LVM!\n";
	    TBScriptUnlock();
	    return -1;
	}
		    
	my $blockdevstr = join(' ', sort @blockdevs);
	mysystem("pvcreate $blockdevstr");
	mysystem("vgcreate $VGNAME $blockdevstr");

	my $size = lvmVGSize($VGNAME);
	if ($size < $XEN_MIN_VGSIZE) {
	    print STDERR "WARNING: physical disk space below the desired ".
		" minimum value ($size < $XEN_MIN_VGSIZE), expect trouble.\n";
	}

	#
	# Create an image pool for golden images.
	# If this fails, we just don't use thin volumes!
	#
	if ($usethin && createThinPool($blockdevstr)) {
	    print STDERR "WARNING: could not create a thin pool, ".
		"disabling golden image support\n";
	    $usethin = 0;
	}
    }
    $STRIPE_COUNT = computeStripeSize($VGNAME);
    
    #
    # Make sure our volumes are active -- they seem to become inactive
    # across reboots
    #
    mysystem("vgchange -a y $VGNAME");

    print "Creating dhcp.conf skeleton...\n"
        if ($debug);
    createDHCP();

    print "Creating scratch FS ...\n";
    if (createExtraFS($EXTRAFS, $VGNAME, "25G")) {
	TBScriptUnlock();
	return -1;
    }
    print "Creating image metadata FS ...\n";
    if (createExtraFS($METAFS, $VGNAME, "1G")) {
	TBScriptUnlock();
	return -1;
    }
    print "Creating container info FS ...\n";
    if (createExtraFS($INFOFS, $VGNAME, "3G")) {
	TBScriptUnlock();
	return -1;
    }
    if (! -l $VMS) {
	#
	# We need this stuff to be sticky across reloads, so move it
	# into an lvm. If we lose the lvm, well then we are screwed.
	#
	my @files = glob("$VMS/*");
	foreach my $file (@files) {
	    my $base = basename($file);
	    mysystem("/bin/mv $file $INFOFS")
		if (! -e "$INFOFS/$base");
	}
	mysystem("/bin/rm -rf $VMS");
	mysystem("/bin/ln -s $INFOFS $VMS");
    }

    if (InitializeRouteTables()) {
	print STDERR "*** Could not initialize routing table DB\n";
	TBScriptUnlock();
	return -1;
    }

    #
    # Make sure IP forwarding is enabled on the host
    #
    mysystem2("$SYSCTL -w net.ipv4.conf.all.forwarding=1");

    #
    # Increase socket buffer size for frisbee download of images.
    #
    mysystem2("$SYSCTL -w net.core.rmem_max=1048576");
    mysystem2("$SYSCTL -w net.core.wmem_max=1048576");

    #
    # Need these to avoid overflowing the NAT tables.
    #
    mysystem2("$MODPROBE nf_conntrack");
    if ($?) {
	print STDERR "ERROR: could not load nf_conntrack module!\n";
	TBScriptUnlock();
	return -1;
    }
    mysystem2("$SYSCTL -w ".
	     "  net.netfilter.nf_conntrack_generic_timeout=120");
    mysystem2("$SYSCTL -w ".
	     "  net.netfilter.nf_conntrack_tcp_timeout_established=54000");
    mysystem2("$SYSCTL -w ".
	     "  net.netfilter.nf_conntrack_max=131071");
    mysystem2("echo 16384 > /sys/module/nf_conntrack/parameters/hashsize");
 
    # These might fail on new kernels.  
    mysystem2("$SYSCTL -w ".
	      " net.ipv4.netfilter.ip_conntrack_generic_timeout=120");
    mysystem2("$SYSCTL -w ".
	      " net.ipv4.netfilter.ip_conntrack_tcp_timeout_established=54000");

    # Mark the one-time setup as done for this boot.
    mysystem("touch /var/run/xen.ready");
    TBDebugTimeStamp("  releasing global lock")
	if ($lockdebug);
    TBScriptUnlock();
    return 0;
}

763
#
# Per-vnode network setup in the root context, done under the global
# configuration lock: make sure a dhcpd.conf exists, make sure the
# control net alias exists on local (non-remote) nodes, and rebuild the
# interface and bridge maps.
#
# Arguments: ($vnode_id, <ignored>, $vnconfig, $private); none of them
# is currently consulted.
# Returns 0 on success, -1 if the global lock could not be obtained.
#
sub rootPreConfigNetwork($$$$)
{
    #
    # The interface/link lists in $vnconfig used to be copied into locals
    # here but were never used; dereferencing them could only serve to
    # die when a key was missing, so they are no longer touched.
    #
    my ($vnode_id, undef, $vnconfig, $private) = @_;

    TBDebugTimeStamp("rootPreConfigNetwork: grabbing global lock $GLOBAL_CONF_LOCK")
	if ($lockdebug);
    if (TBScriptLock($GLOBAL_CONF_LOCK, 0, 900) != TBSCRIPTLOCK_OKAY()) {
	print STDERR "Could not get the global lock after a long time!\n";
	return -1;
    }
    TBDebugTimeStamp("  got global lock")
	if ($lockdebug);

    createDHCP()
	if (! -e $DHCPCONF_FILE && ! -e $NEW_DHCPCONF_FILE);

    if (!$ISREMOTENODE) {
	my ($cnet_iface) = findControlNet();
	my ($alias_ip,$alias_mask) = domain0ControlNet();
	my $alias_iface = "$cnet_iface:1";

	# Only create the alias if it does not already have an address.
	if (system("ifconfig $alias_iface | grep -q 'inet addr'")) {
	    print "Creating $alias_iface alias...\n";
	    mysystem("ifconfig $alias_iface $alias_ip netmask $alias_mask");
	}
    }

    #
    # If we blocked, it would be because vnodes have come or gone,
    # so we need to rebuild the maps.
    #
    makeIfaceMaps();
    makeBridgeMaps();

    TBDebugTimeStamp("  releasing global lock")
	if ($lockdebug);
    TBScriptUnlock();
    return 0;
}

808
809
#
# Nothing to do here; the single argument is ignored and 0 is always
# returned.
#
sub rootPostConfig($) { return 0; }

813
814
#
# Create the basic context for the VM and give it a unique ID for identifying
815
816
# "internal" state.  If $raref is set, then we are in a RELOAD state machine
# and need to walk the appropriate states.
817
#
818
sub vnodeCreate($$$$)
819
{
820
    my ($vnode_id, undef, $vnconfig, $private) = @_;
821
    my $attributes = $vnconfig->{'attributes'};
822
    my $imagename = $vnconfig->{'image'};
Leigh B Stoller's avatar
Leigh B Stoller committed
823
    my $raref = $vnconfig->{'reloadinfo'};
824
    my $vninfo = $private;
825
    my %image = %defaultImage;
Leigh B Stoller's avatar
Leigh B Stoller committed
826
    my $imagemetadata;
827
    my $lvname;
828
    my $inreload = 0;
Mike Hibler's avatar
Mike Hibler committed
829
    my $dothinlv = doingThinLVM();
830
831

    my $vmid;
832
    if ($vnode_id =~ /^[-\w]+\-(\d+)$/) {
833
834
835
836
837
	$vmid = $1;
    }
    else {
	fatal("xen_vnodeCreate: bad vnode_id $vnode_id!");
    }
838
839
    $vninfo->{'vmid'} = $vmid;

Leigh B Stoller's avatar
Leigh B Stoller committed
840
    if (CreateVnodeLock() != 0) {
841
	fatal("CreateVnodeLock()");
Leigh B Stoller's avatar
Leigh B Stoller committed
842
843
844
845
846
847
    }

    #
    # We need to lock while messing with the image. But we can use
    # shared lock so that others can proceed in parallel. We will have
    # to promote to an exclusive lock if the image has to be changed.
848
    #
849
    my $imagelockname = ImageLockName($imagename);
Mike Hibler's avatar
Mike Hibler committed
850
851
    TBDebugTimeStamp("grabbing image lock $imagelockname shared")
	if ($lockdebug);
Leigh B Stoller's avatar
Leigh B Stoller committed
852
    if (TBScriptLock($imagelockname, TBSCRIPTLOCK_SHAREDLOCK(), 1800)
853
854
855
	!= TBSCRIPTLOCK_OKAY()) {
	fatal("Could not get $imagelockname lock after a long time!");
    }
Mike Hibler's avatar
Mike Hibler committed
856
857
    TBDebugTimeStamp("  got image lock")
	if ($lockdebug);
858

859
860
861
862
    #
    # No image specified, use a default based on the dom0 OS.
    #
    if (!defined($imagename)) {
863
864
	$lvname = $image{'name'};
	
865
866
867
868
869
870
	#
	# Setup the default image now.
	# XXX right now this is a hack where we just copy the dom0
	# filesystem and clone (snapshot) that.
	#
	$imagename = $defaultImage{'name'};
871
872
	print STDERR "xen_vnodeCreate: ".
	    "no image specified, using default ('$imagename')\n";
873

Leigh B Stoller's avatar
Leigh B Stoller committed
874
875
876
	# Okay to fail if image does not exist yet.
	LoadImageMetadata($imagename, \$imagemetadata);

877
	$lvname = ImageLVName($imagename);
878
	if (!lvmFindVolume($lvname) && !defined($imagemetadata)) {
Leigh B Stoller's avatar
Leigh B Stoller committed
879
880
881
882
	    
	    #
	    # Need an exclusive lock for this.
	    #
Mike Hibler's avatar
Mike Hibler committed
883
884
	    TBDebugTimeStamp("  releasing image lock")
		if ($lockdebug);
Leigh B Stoller's avatar
Leigh B Stoller committed
885
	    TBScriptUnlock();	    
Mike Hibler's avatar
Mike Hibler committed
886
887
	    TBDebugTimeStamp("grabbing image lock $imagelockname exclusive")
		if ($lockdebug);
Leigh B Stoller's avatar
Leigh B Stoller committed
888
889
	    if (TBScriptLock($imagelockname, undef, 1800)
		!= TBSCRIPTLOCK_OKAY()) {
890
		fatal("Could not get $imagelockname write lock ".
Leigh B Stoller's avatar
Leigh B Stoller committed
891
892
		      "after a long time!");
	    }
Mike Hibler's avatar
Mike Hibler committed
893
894
	    TBDebugTimeStamp("  got image lock")
		if ($lockdebug);
Leigh B Stoller's avatar
Leigh B Stoller committed
895
	    # And now check again in case someone else snuck in.
896
	    if (!lvmFindVolume($lvname) && createRootDisk($imagename)) {
Leigh B Stoller's avatar
Leigh B Stoller committed
897
898
899
900
901
		TBScriptUnlock();
		fatal("xen_vnodeCreate: ".
		      "cannot find create root disk for default image");
	    }
	    # And back to a shared lock.
Mike Hibler's avatar
Mike Hibler committed
902
903
	    TBDebugTimeStamp("  releasing image lock")
		if ($lockdebug);
904
	    TBScriptUnlock();
Mike Hibler's avatar
Mike Hibler committed
905
906
	    TBDebugTimeStamp("grabbing image lock $imagelockname shared")
		if ($lockdebug);
Leigh B Stoller's avatar
Leigh B Stoller committed
907
908
909
910
911
912
	    if (TBScriptLock($imagelockname, TBSCRIPTLOCK_SHAREDLOCK(), 1800)
		!= TBSCRIPTLOCK_OKAY()) {
		fatal("Could not get $imagelockname lock back ".
		      "after a long time!");
	    }
	    $imagemetadata = undef;
913
	}
914
    }
915
916
917
918
919
    elsif (!defined($raref)) {
	#
	# Boot existing image. The base volume has to exist, since we do
	# not have any reload info to get it.
	#
920
	$lvname = ImageLVName($imagename);
921
	if (!lvmFindVolume($lvname)) {
922
	    TBScriptUnlock();
923
924
925
	    fatal("xen_vnodeCreate: ".
		  "cannot find logical volume for $lvname, and no reload info");
	}
926
    }
927
    else {
928
	$lvname = ImageLVName($imagename);
929
930
931
	$inreload = 1;

	print STDERR "xen_vnodeCreate: loading image '$imagename'\n";
932

933
	# Tell stated we are getting ready for a reload
934
	libutil::setState("RELOADSETUP");
935
936
937
938
939
940
941

	#
	# Immediately drop into RELOADING before calling createImageDisk as
	# that is the place where any image will be downloaded from the image
	# server and we want that download to take place in the longer timeout
	# period afforded by the RELOADING state.
	#
942
	libutil::setState("RELOADING");
943

Mike Hibler's avatar
Mike Hibler committed
944
	if (createImageDisk($imagename, $vnode_id, $raref, $dothinlv)) {
945
	    TBScriptUnlock();
946
947
948
	    fatal("xen_vnodeCreate: ".
		  "cannot create logical volume for $imagename");
	}
949
950
    }

Leigh B Stoller's avatar
Leigh B Stoller committed
951
952
953
954
955
956
957
958
959
960
961
    #
    # Load this from disk.
    #
    if (!defined($imagemetadata)) {
	if (LoadImageMetadata($imagename, \$imagemetadata)) {
	    TBScriptUnlock();
	    fatal("xen_vnodeCreate: ".
		  "cannot load image metadata for $imagename");
	}
    }

962
    #
963
    # See if the image is really a package.
964
    #
Leigh B Stoller's avatar
Leigh B Stoller committed
965
966
967
968
969
    if (exists($imagemetadata->{'ISPACKAGE'}) && $imagemetadata->{'ISPACKAGE'}){
	my $imagepath = lvmVolumePath($lvname);
	# In case of reboot.
	mysystem("mkdir -p /mnt/$imagename")
	    if (! -e "/mnt/$imagename");
970
971
	mysystem("mount $imagepath /mnt/$imagename")
	    if (! -e "/mnt/$imagename/.mounted");
972

973
974
975
976
977
	mysystem2("$RESTOREVM -t $VMDIR/$vnode_id $vnode_id /mnt/$imagename");
	if ($?) {
	    TBScriptUnlock();
	    fatal("xen_vnodeCreate: ".
		  "cannot restore logical volumes from $imagename");
978
	}
979
980
981
982
	if ($inreload) {
	    libutil::setState("RELOADDONE");
	    sleep(4);
	}
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
	
	#
	# All of the lvms are created and a new xm.conf created.
	# Read that xm.conf in so we can figure out what lvms we
	# need to delete later (recreate the disks array). 
	#
	my $conf = configFile($vnode_id);
	my $aref = readXenConfig($conf);
	if (!$aref) {
	    TBScriptUnlock();
	    fatal("xen_vnodeCreate: ".
		  "Cannot read restored config file from $conf");
	}
	$vninfo->{'cffile'} = $aref;
	
	my $disks = parseXenDiskInfo($vnode_id, $aref);
	if (!defined($disks)) {
	    TBScriptUnlock();
For faster browsing, not all history is shown. View entire blame