libvnode_xen.pm 113 KB
Newer Older
Jon Rafkind's avatar
Jon Rafkind committed
1
#!/usr/bin/perl -wT
2
#
3
# Copyright (c) 2008-2014 University of Utah and the Flux Group.
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# 
# {{{EMULAB-LICENSE
# 
# This file is part of the Emulab network testbed software.
# 
# This file is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
# 
# This file is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public
# License for more details.
# 
# You should have received a copy of the GNU Affero General Public License
# along with this file.  If not, see <http://www.gnu.org/licenses/>.
# 
# }}}
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#
# Implements the libvnode API for Xen support in Emulab.
#
# Note that there is no distinguished first or last call of this library
# in the current implementation.  Every vnode creation (through mkvnode.pl)
# will invoke all the root* and vnode* functions.  It is up to us to make
# sure that "one time" operations really are executed only once.
#
# TODO:
# + Clear out old, incorrect state in /var/lib/xend.
#   Maybe have to do this when tearing down (killing) vnodes.
#
# + Make more robust, little turds of state still get left around
#   that wreak havoc on reboot.
#
# + Support image loading.
#
Jon Rafkind's avatar
Jon Rafkind committed
40
41
42
43
44
45
package libvnode_xen;
use Exporter;
@ISA    = "Exporter";
@EXPORT = qw( init setDebug rootPreConfig
              rootPreConfigNetwork rootPostConfig
	      vnodeCreate vnodeDestroy vnodeState
46
	      vnodeBoot vnodePreBoot vnodeHalt vnodeReboot
47
	      vnodeUnmount
Jon Rafkind's avatar
Jon Rafkind committed
48
49
	      vnodePreConfig vnodePreConfigControlNetwork
              vnodePreConfigExpNetwork vnodeConfigResources
Leigh B Stoller's avatar
Leigh B Stoller committed
50
              vnodeConfigDevices vnodePostConfig vnodeExec vnodeTearDown VGNAME
Jon Rafkind's avatar
Jon Rafkind committed
51
	    );
52
use vars qw($VGNAME);
Jon Rafkind's avatar
Jon Rafkind committed
53
54
55
56
57
58
59
60

%ops = ( 'init' => \&init,
         'setDebug' => \&setDebug,
         'rootPreConfig' => \&rootPreConfig,
         'rootPreConfigNetwork' => \&rootPreConfigNetwork,
         'rootPostConfig' => \&rootPostConfig,
         'vnodeCreate' => \&vnodeCreate,
         'vnodeDestroy' => \&vnodeDestroy,
61
	 'vnodeTearDown' => \&vnodeTearDown,
Jon Rafkind's avatar
Jon Rafkind committed
62
63
64
         'vnodeState' => \&vnodeState,
         'vnodeBoot' => \&vnodeBoot,
         'vnodeHalt' => \&vnodeHalt,
65
# XXX needs to be implemented
66
         'vnodeUnmount' => \&vnodeUnmount,
Jon Rafkind's avatar
Jon Rafkind committed
67
         'vnodeReboot' => \&vnodeReboot,
68
# XXX needs to be implemented
69
         'vnodeExec' => \&vnodeExec,
Jon Rafkind's avatar
Jon Rafkind committed
70
71
72
73
74
75
76
77
         'vnodePreConfig' => \&vnodePreConfig,
         'vnodePreConfigControlNetwork' => \&vnodePreConfigControlNetwork,
         'vnodePreConfigExpNetwork' => \&vnodePreConfigExpNetwork,
         'vnodeConfigResources' => \&vnodeConfigResources,
         'vnodeConfigDevices' => \&vnodeConfigDevices,
         'vnodePostConfig' => \&vnodePostConfig,
       );

Jon Rafkind's avatar
Jon Rafkind committed
78

79
80
81
82
83
84
85
use strict;
use English;
use Data::Dumper;
use Socket;
use File::Basename;
use File::Path;
use File::Copy;
86
use File::Temp;
87
88

# Pull in libvnode
89
BEGIN { require "/etc/emulab/paths.pm"; import emulabpaths; }
90
91
use libutil;
use libgenvnode;
92
93
use libvnode;
use libtestbed;
94
use libsetup;
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109

#
# Turn off line buffering on output
#
$| = 1;

#
# Load the OS independent support library. It will load the OS dependent
# library and initialize itself. 
# 

##
## Standard utilities and files section
##

110
my $BRCTL = "brctl";
111
my $IFCONFIG = "/sbin/ifconfig";
Leigh B Stoller's avatar
Leigh B Stoller committed
112
my $ETHTOOL = "/sbin/ethtool";
113
114
115
116
117
my $ROUTE = "/sbin/route";
my $SYSCTL = "/sbin/sysctl";
my $VLANCONFIG = "/sbin/vconfig";
my $MODPROBE = "/sbin/modprobe";
my $DHCPCONF_FILE = "/etc/dhcpd.conf";
118
my $NEW_DHCPCONF_FILE = "/etc/dhcp/dhcpd.conf";
119
my $RESTOREVM	= "$BINDIR/restorevm.pl";
Leigh B Stoller's avatar
Leigh B Stoller committed
120
my $LOCALIZEIMG	= "$BINDIR/localize_image";
121
my $IPTABLES	= "/sbin/iptables";
Leigh B Stoller's avatar
Leigh B Stoller committed
122
123
124
125
my $IPBIN	= "/sbin/ip";
my $NETSTAT     = "/bin/netstat";
my $IMAGEZIP    = "/usr/local/bin/imagezip";
my $IMAGEUNZIP  = "/usr/local/bin/imageunzip";
126
my $IMAGEDUMP   = "/usr/local/bin/imagedump";
127
my $XM          = "/usr/sbin/xm";
128
my $debug  = 0;
129
130
131
132
133
134
135
136
137

##
## Randomly chosen convention section
##

# global lock
my $GLOBAL_CONF_LOCK = "xenconf";

# default image to load on logical disks
138
139
# Just symlink /boot/vmlinuz-xenU and /boot/initrd-xenU
# to the kernel and ramdisk you want to use by default.
140
my %defaultImage = (
141
142
143
144
145
146
    'name'      => "emulab-ops-emulab-ops-XEN-STD",
    'kernel'    => "/boot/vmlinuz-xenU",
    'ramdisk'   => "/boot/initrd-xenU",
    'OSVERSION' => "any",
    'PARTOS'    => "Linux",
    'ISPACKAGE' => 0,
147
148
    'PART'      => 2,
    'BOOTPART'  => 2,
149
150
151
);

# where all our config files go
152
153
my $VMS    = "/var/emulab/vms";
my $VMDIR  = "$VMS/vminfo";
154
155
my $XENDIR = "/var/xen";

156
# Extra space for capture/restore.
157
my $EXTRAFS = "/capture";
158

159
# Extra space for image metadata between reloads.
Leigh B Stoller's avatar
Leigh B Stoller committed
160
161
my $METAFS = "/metadata";

162
163
164
# Extra space for vminfo (/var/emulab/vms) between reloads.
my $INFOFS = "/vminfo";

165
166
# Xen LVM volume group name. Accessible outside this file.
$VGNAME = "xen-vg";
Leigh B Stoller's avatar
Leigh B Stoller committed
167
168
# Accessor so that code outside this file can ask for the volume group name.
sub VGNAME()  { return $VGNAME; }
169
170
171
172
173
174

##
## Indefensible, arbitrary constant section
##

# Maximum vnodes per physical host, used to size memory and disks
175
my $MAX_VNODES = 32;
176
177

# Minimum GB of disk per vnode
178
my $MIN_GB_DISK = 6;
179
180
181
182
183
184
185
186
187
188

# Minimum MB of memory per vnode
my $MIN_MB_VNMEM = 64;

# Minimum memory for dom0
my $MIN_MB_DOM0MEM = 256;

# Minimum acceptable size (in GB) of LVM VG for domUs.
my $XEN_MIN_VGSIZE = ($MAX_VNODES * $MIN_GB_DISK);

189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
#
# When loading an Emulab partition image, we use a compressed version of our
# standard MBR layout:
#
# MBR 1 or 2 FreeBSD:
#    P1: 6GB (XEN_LDSIZE) offset at 63, OS goes here
#    P2: 1MB (XEN_EMPTYSIZE), as small as we can make it
#    P3: 1GB (XEN_SWAPSIZE), standard MBR2 swap size
# MBR 1 or 2 Linux:
#    P1: 1MB (XEN_EMPTYSIZE), as small as we can make it
#    P2: 6GB (XEN_LDSIZE) offset at 63, OS goes here
#    P3: 1GB (XEN_SWAPSIZE), standard MBR2 swap size
# MBR 3:
#    P1: 16GB (XEN_LDSIZE_3) offset at 2048, standard OS partition
#    P2: 1MB (XEN_EMPTYSIZE), as small as we can make it
#    P3: 1GB (XEN_SWAPSIZE), standard MBR2 swap size
#
# P4 is sized based on what the user told us.
# Sizes below are in 1K blocks.
#
my $XEN_LDSIZE    =  6152895;
my $XEN_LDSIZE_3  = 16777216;
my $XEN_SWAPSIZE  =  1048576;
my $XEN_EMPTYSIZE =     1024;
213

Leigh B Stoller's avatar
Leigh B Stoller committed
214
215
216
217
218
219
220
221
222
223
224
# IFBs
my $IFBDB      = "/var/emulab/db/ifbdb";
# Kernel auto-creates only two! Sheesh, why a fixed limit?
my $MAXIFB     = 1024;

# Route tables for tunnels
my $RTDB           = "/var/emulab/db/rtdb";
my $RTTABLES       = "/etc/iproute2/rt_tables";
# Temporary; later kernel version increases this.
my $MAXROUTETTABLE = 255;

225
226
227
# Whether or not to use only unpartitioned (unused) disks to form the Xen VG.
my $LVM_FULLDISKONLY = 0;

Leigh B Stoller's avatar
Leigh B Stoller committed
228
229
230
# LVM snapshots suck.
my $DOSNAP = 0;

231
232
233
234
# Use openvswitch for gre tunnels.
my $OVSCTL   = "/usr/local/bin/ovs-vsctl";
my $OVSSTART = "/usr/local/share/openvswitch/scripts/ovs-ctl";

235
236
my $ISREMOTENODE = REMOTEDED();
my $BRIDGENAME   = "xenbr0";
237
my $VIFROUTING   = ((-e "$ETCDIR/xenvifrouting") ? 1 : 0);
238

239
240
my $TMCD_PORT	 = 7777;

241
242
243
244
245
#
# Information about the running Xen hypervisor
#
my %xeninfo = ();

246
247
248
249
# Local functions
sub findRoot();
sub copyRoot($$);
sub createRootDisk($);
250
sub createAuxDisk($$);
251
252
253
254
255
256
sub replace_hacks($);
sub disk_hacks($);
sub configFile($);
sub domain0Memory();
sub totalMemory();
sub hostIP($);
257
sub createDHCP();
258
259
sub addDHCP($$$$);
sub subDHCP($$);
260
sub restartDHCP();
261
262
263
sub formatDHCP($$$);
sub fixupMac($);
sub createControlNetworkScript($$$);
264
sub createExpNetworkScript($$$$$$$$);
265
sub createTunnelScript($$$$$);
Leigh B Stoller's avatar
Leigh B Stoller committed
266
sub createExpBridges($$$);
267
268
269
270
271
272
sub destroyExpBridges($$);
sub domainStatus($);
sub domainExists($);
sub addConfig($$$);
sub createXenConfig($$);
sub readXenConfig($);
273
sub lookupXenConfig($$);
274
sub getXenInfo();
Leigh B Stoller's avatar
Leigh B Stoller committed
275
276
277
278
279
sub AllocateIFBs($$$);
sub InitializeRouteTable();
sub AllocateRouteTable($);
sub LookupRouteTable($);
sub FreeRouteTable($);
280
281
282

#
# Run "$XM info" and record every "key : value" line it prints in the
# file-level %xeninfo hash.  Dies if the command cannot be started.
# Lines that do not look like "key : value" are ignored.
#
# Returns the result of closing the pipe.
#
sub getXenInfo()
{
    # List-form pipe open with a lexical handle; no shell is involved.
    open(my $xmfh, "-|", $XM, "info")
        or die "getXenInfo: could not run '$XM info': $!";

    while (my $line = <$xmfh>) {
        chomp($line);

        #
        # Only trust $1/$2 when the match succeeded.  After a failed match
        # they still hold the captures of the last successful one, which
        # would silently overwrite an earlier key with a stale value.
        #
        next
            unless ($line =~ /^(\S+)\s*:\s+(.*)$/);

        $xeninfo{$1} = $2;
    }

    close($xmfh);
}
294
295
296
297
298
299
300

#
# Library initialization: rebuild the interface and bridge maps, choose
# the Xen toolstack binary, and cache the output of "$XM info".
#
# The single argument (the physical node id) is accepted but not used here.
# Always returns 0.
#
sub init($)
{
    my ($pnode_id) = @_;

    makeIfaceMaps();
    makeBridgeMaps();

    # Switch from xm to xl when /etc/default/xen says so.
    my $toolstack = `grep TOOLSTACK /etc/default/xen`;
    $XM = "/usr/sbin/xl"
        if ($toolstack =~ /xl$/);

    getXenInfo();

    return 0;
}

311
312
313
314
315
316
317
318
#
# Set the debug level for this library and pass it on to libvnode.
# Announces the new level on stdout when it is non-zero.
#
sub setDebug($)
{
    ($debug) = @_;

    libvnode::setDebug($debug);

    print("libvnode_xen: debug=$debug\n")
        if ($debug);
}

319
320
321
322
323
324
325
326
#
# Return the name of the lock that guards the given image.  An undefined
# image name maps to the lock for the default image.
#
sub ImageLockName($)
{
    my ($imagename) = @_;

    $imagename = $defaultImage{'name'}
        unless (defined($imagename));

    return "xenimage." . $imagename;
}

327
328
329
330
#
# Called on each vnode, but should only be executed once per boot.
# We use a file in /var/run (cleared on reboots) to ensure this.
#
# Argument: the IP address of boss, used as the destination of the
# metadata-server redirection rule.
#
# Returns 0 on success (or when the setup was already done, or the lock
# call said to ignore), -1 on failure.  The global config lock is held
# for the duration of the setup and released on every return path that
# follows a successful lock.
#
sub rootPreConfig($)
{
    my $bossip = shift;

    #
    # Already set up since the last boot; nothing to do.  We have not
    # taken the lock on this path, so we must not release it either.
    #
    if (-e "/var/run/xen.ready") {
        return 0;
    }

    #
    # Haven't been called yet, grab the lock and double check that someone
    # didn't do it while we were waiting.
    #
    my $locked = TBScriptLock($GLOBAL_CONF_LOCK,
                              TBSCRIPTLOCK_GLOBALWAIT(), 900);
    if ($locked != TBSCRIPTLOCK_OKAY()) {
        return 0
            if ($locked == TBSCRIPTLOCK_IGNORE());
        print STDERR "Could not get the xeninit lock after a long time!\n";
        return -1;
    }
    if (-e "/var/run/xen.ready") {
        TBScriptUnlock();
        return 0;
    }

    print "Configuring root vnode context\n";

    #
    # For compatibility with existing (physical host) Emulab images,
    # the physical host provides DHCP info for the vnodes. We manage
    # the dhcpd.conf file here. See below.
    #
    # Note that we must first add an alias to the control net bridge so
    # that we (the physical host) are in the same subnet as the vnodes,
    # otherwise dhcpd will fail.
    #
    my ($alias_iface, $alias_ip, $alias_mask);

    #
    # Locally, we just need to add the alias to the control interface
    # (which might be a bridge).
    #
    if (!$ISREMOTENODE) {
        my ($cnet_iface) = findControlNet();

        #
        # We use xen's antispoofing when constructing the guest control net
        # interfaces. This is most useful on a shared host, but no
        # harm in doing it all the time.
        #
        mysystem("$IPTABLES -P FORWARD DROP");
        mysystem("$IPTABLES -F FORWARD");
        # This says to forward traffic across the bridge.
        mysystem("$IPTABLES -A FORWARD ".
                 "-m physdev --physdev-in $cnet_iface -j ACCEPT");

        if ($VIFROUTING) {
            mysystem("echo 1 >/proc/sys/net/ipv4/conf/$cnet_iface/proxy_arp");
            mysystem("echo 1 >/proc/sys/net/ipv4/ip_forward");
            # This is for arping -A to work. See emulab-cnet.pl
            mysystem("echo 1 >/proc/sys/net/ipv4/ip_nonlocal_bind");
        }

        # Set up for metadata server for ec2 support
        print "Setting up redirection for meta server...\n";
        mysystem("$IPBIN addr add 169.254.169.254/32 ".
                 "   scope global dev $cnet_iface");
        mysystem("$IPTABLES -t nat -A PREROUTING -d 169.254.169.254/32 " .
                 "   -p tcp -m tcp --dport 80 -j DNAT ".
                 "   --to-destination ${bossip}:8787");
    }
    else {
        if (!existsBridge($BRIDGENAME)) {
            if (mysystem2("$BRCTL addbr $BRIDGENAME")) {
                TBScriptUnlock();
                return -1;
            }
            #
            # We do not set the mac address; we want it to take
            # on the address of the attached vif interfaces so that
            # arp works. This is quite kludgy of course, but otherwise
            # the arp comes into the bridge interface and then kernel
            # drops it. There is a brouter (ebtables) work around
            # but not worth worrying about.
            #
        }
        (undef,$alias_mask,$alias_ip) = findVirtControlNet();
        $alias_iface = $BRIDGENAME;

        if (system("ifconfig $alias_iface | grep -q 'inet addr'")) {
            print "Creating $alias_iface alias...\n";
            mysystem("ifconfig $alias_iface $alias_ip netmask $alias_mask");
        }
    }

    # For tunnels
    mysystem("$MODPROBE openvswitch");
    mysystem("$OVSSTART --delete-bridges start");

    # For bandwidth constraints.
    mysystem("$MODPROBE ifb numifbs=$MAXIFB");

    # Create a DB to manage them.
    my %MDB;
    if (!dbmopen(%MDB, $IFBDB, 0660)) {
        print STDERR "*** Could not create $IFBDB\n";
        TBScriptUnlock();
        return -1;
    }
    # Only add missing slots; existing entries are left untouched.
    for (my $i = 0; $i < $MAXIFB; $i++) {
        $MDB{"$i"} = ""
            if (!defined($MDB{"$i"}));
    }
    dbmclose(%MDB);

    #
    # Ensure that LVM is loaded in the kernel and ready.
    #
    print "Enabling LVM...\n"
        if ($debug);

    # We assume our kernels support this.
    mysystem2("$MODPROBE dm-snapshot");
    if ($?) {
        print STDERR "ERROR: could not load snaphot module!\n";
        TBScriptUnlock();
        return -1;
    }

    #
    # Make sure pieces are at least a GiB.
    #
    my %devs = libvnode::findSpareDisks(1 * 1024);

    #
    # Turn on write caching. Hacky.
    #
    foreach my $dev (keys(%devs)) {
        mysystem2("hdparm -W1 /dev/$dev");
    }

    #
    # See if our LVM volume group for VMs exists and create it if not.
    #
    my $vg = `vgs | grep $VGNAME`;
    if ($vg !~ /^\s+${VGNAME}\s/) {
        print "Creating volume group...\n"
            if ($debug);

        #
        # Find available devices of sufficient size, prepare them,
        # and incorporate them into a volume group.
        #
        my $blockdevs = "";
        my $totalSize = 0;
        foreach my $dev (keys(%devs)) {
            if (defined($devs{$dev}{"size"})) {
                # A whole disk.
                $blockdevs .= " /dev/$dev";
                $totalSize += $devs{$dev}{"size"};
            }
            elsif ($LVM_FULLDISKONLY == 0) {
                # Otherwise take the individual entries under the disk.
                foreach my $part (keys(%{$devs{$dev}})) {
                    $blockdevs .= " /dev/${dev}${part}";
                    $totalSize += $devs{$dev}{$part}{"size"};
                }
            }
        }
        if ($blockdevs eq '') {
            print STDERR "ERROR: findSpareDisks found no disks for LVM!\n";
            TBScriptUnlock();
            return -1;
        }

        mysystem("pvcreate $blockdevs");
        mysystem("vgcreate $VGNAME $blockdevs");

        my $size = lvmVGSize($VGNAME);
        if ($size < $XEN_MIN_VGSIZE) {
            print STDERR "WARNING: physical disks not big enough to support".
                " $MAX_VNODES VMs ($size < $XEN_MIN_VGSIZE)\n";
        }
    }

    #
    # Make sure our volumes are active -- they seem to become inactive
    # across reboots
    #
    mysystem("vgchange -a y $VGNAME");

    print "Creating dhcp.conf skeleton...\n"
        if ($debug);
    createDHCP();

    print "Creating scratch FS ...\n";
    if (createExtraFS($EXTRAFS, $VGNAME, "25G")) {
        TBScriptUnlock();
        return -1;
    }
    print "Creating image metadata FS ...\n";
    if (createExtraFS($METAFS, $VGNAME, "1G")) {
        TBScriptUnlock();
        return -1;
    }
    print "Creating container info FS ...\n";
    if (createExtraFS($INFOFS, $VGNAME, "3G")) {
        TBScriptUnlock();
        return -1;
    }
    if (! -l $VMS) {
        #
        # We need this stuff to be sticky across reloads, so move it
        # into an lvm. If we lose the lvm, well then we are screwed.
        #
        my @files = glob("$VMS/*");
        foreach my $file (@files) {
            my $base = basename($file);
            mysystem("/bin/mv $file $INFOFS")
                if (! -e "$INFOFS/$base");
        }
        mysystem("/bin/rm -rf $VMS");
        mysystem("/bin/ln -s $INFOFS $VMS");
    }

    if (InitializeRouteTables()) {
        print STDERR "*** Could not initialize routing table DB\n";
        TBScriptUnlock();
        return -1;
    }

    #
    # Need these to avoid overflowing the NAT tables.
    #
    mysystem2("$MODPROBE nf_conntrack");
    if ($?) {
        print STDERR "ERROR: could not load nf_conntrack module!\n";
        TBScriptUnlock();
        return -1;
    }
    mysystem("$SYSCTL -w ".
             "  net.netfilter.nf_conntrack_generic_timeout=120");
    mysystem("$SYSCTL -w ".
             "  net.netfilter.nf_conntrack_tcp_timeout_established=54000");
    mysystem("$SYSCTL -w ".
             "  net.netfilter.nf_conntrack_max=131071");
    mysystem("echo 16384 > /sys/module/nf_conntrack/parameters/hashsize");

    # These might fail on new kernels.
    mysystem2("$SYSCTL -w ".
              " net.ipv4.netfilter.ip_conntrack_generic_timeout=120");
    mysystem2("$SYSCTL -w ".
              " net.ipv4.netfilter.ip_conntrack_tcp_timeout_established=54000");

    # Mark the one-time setup as done before letting anyone else in.
    mysystem("touch /var/run/xen.ready");
    TBScriptUnlock();
    return 0;
}

585
#
# Per-vnode network setup in the root context, done under the global
# config lock: make sure a dhcpd.conf exists, make sure the control net
# alias exists on a local (non-remote) node, and rebuild the interface
# and bridge maps.
#
# Arguments are (vnode_id, <ignored>, vnconfig, private); none of them
# is consulted by the code below.
#
# Returns 0 on success, -1 if the global lock could not be obtained.
#
sub rootPreConfigNetwork($$$$)
{
    my ($vnode_id, undef, $vnconfig, $private) = @_;

    if (TBScriptLock($GLOBAL_CONF_LOCK, 0, 900) != TBSCRIPTLOCK_OKAY()) {
        print STDERR "Could not get the global lock after a long time!\n";
        return -1;
    }

    # Only build the skeleton when neither known dhcpd.conf exists.
    createDHCP()
        if (! -e $DHCPCONF_FILE && ! -e $NEW_DHCPCONF_FILE);

    if (!$ISREMOTENODE) {
        my ($cnet_iface) = findControlNet();
        my ($alias_ip,$alias_mask) = domain0ControlNet();
        my $alias_iface = "$cnet_iface:1";

        # Create the alias only when it has no address configured yet.
        if (system("ifconfig $alias_iface | grep -q 'inet addr'")) {
            print "Creating $alias_iface alias...\n";
            mysystem("ifconfig $alias_iface $alias_ip netmask $alias_mask");
        }
    }

    #
    # If we blocked, it would be because vnodes have come or gone,
    # so we need to rebuild the maps.
    #
    makeIfaceMaps();
    makeBridgeMaps();

    TBScriptUnlock();
    return 0;
}

624
625
#
# Root-context hook run after configuration.  There is nothing to do
# for Xen, so this just reports success.
#
sub rootPostConfig($)
{
    # Nothing to do; the argument is ignored.
    return 0;
}

629
630
#
# Create the basic context for the VM and give it a unique ID for identifying
631
632
# "internal" state.  If $raref is set, then we are in a RELOAD state machine
# and need to walk the appropriate states.
633
#
634
sub vnodeCreate($$$$)
635
{
636
    my ($vnode_id, undef, $vnconfig, $private) = @_;
637
    my $attributes = $vnconfig->{'attributes'};
638
    my $imagename = $vnconfig->{'image'};
Leigh B Stoller's avatar
Leigh B Stoller committed
639
    my $raref = $vnconfig->{'reloadinfo'};
640
    my $vninfo = $private;
641
    my %image = %defaultImage;
Leigh B Stoller's avatar
Leigh B Stoller committed
642
    my $imagemetadata;
643
    my $lvname;
644
    my $inreload = 0;
645
646

    my $vmid;
647
    if ($vnode_id =~ /^[-\w]+\-(\d+)$/) {
648
649
650
651
652
	$vmid = $1;
    }
    else {
	fatal("xen_vnodeCreate: bad vnode_id $vnode_id!");
    }
653
654
655
    $vninfo->{'vmid'} = $vmid;

    #
Leigh B Stoller's avatar
Leigh B Stoller committed
656
657
658
    # 
    #
    if (CreateVnodeLock() != 0) {
659
	fatal("CreateVnodeLock()");
Leigh B Stoller's avatar
Leigh B Stoller committed
660
661
662
663
664
665
    }

    #
    # We need to lock while messing with the image. But we can use
    # shared lock so that others can proceed in parallel. We will have
    # to promote to an exclusive lock if the image has to be changed.
666
    #
667
    my $imagelockname = ImageLockName($imagename);
Leigh B Stoller's avatar
Leigh B Stoller committed
668
    if (TBScriptLock($imagelockname, TBSCRIPTLOCK_SHAREDLOCK(), 1800)
669
670
671
	!= TBSCRIPTLOCK_OKAY()) {
	fatal("Could not get $imagelockname lock after a long time!");
    }
672

673
674
675
676
    #
    # No image specified, use a default based on the dom0 OS.
    #
    if (!defined($imagename)) {
677
678
	$lvname = $image{'name'};
	
679
680
681
682
683
684
	#
	# Setup the default image now.
	# XXX right now this is a hack where we just copy the dom0
	# filesystem and clone (snapshot) that.
	#
	$imagename = $defaultImage{'name'};
685
686
	print STDERR "xen_vnodeCreate: ".
	    "no image specified, using default ('$imagename')\n";
687

Leigh B Stoller's avatar
Leigh B Stoller committed
688
689
690
	# Okay to fail if image does not exist yet.
	LoadImageMetadata($imagename, \$imagemetadata);

691
	$lvname = "image+" . $imagename;
Leigh B Stoller's avatar
Leigh B Stoller committed
692
693
694
695
696
697
698
699
700
	if (!findLVMLogicalVolume($lvname) &&
	    !defined($imagemetadata)) {
	    
	    #
	    # Need an exclusive lock for this.
	    #
	    TBScriptUnlock();	    
	    if (TBScriptLock($imagelockname, undef, 1800)
		!= TBSCRIPTLOCK_OKAY()) {
701
		fatal("Could not get $imagelockname write lock ".
Leigh B Stoller's avatar
Leigh B Stoller committed
702
703
704
705
706
707
708
709
710
		      "after a long time!");
	    }
	    # And now check again in case someone else snuck in.
	    if (!findLVMLogicalVolume($lvname) && createRootDisk($imagename)) {
		TBScriptUnlock();
		fatal("xen_vnodeCreate: ".
		      "cannot find create root disk for default image");
	    }
	    # And back to a shared lock.
711
	    TBScriptUnlock();
Leigh B Stoller's avatar
Leigh B Stoller committed
712
713
714
715
716
717
	    if (TBScriptLock($imagelockname, TBSCRIPTLOCK_SHAREDLOCK(), 1800)
		!= TBSCRIPTLOCK_OKAY()) {
		fatal("Could not get $imagelockname lock back ".
		      "after a long time!");
	    }
	    $imagemetadata = undef;
718
	}
719
    }
720
721
722
723
724
    elsif (!defined($raref)) {
	#
	# Boot existing image. The base volume has to exist, since we do
	# not have any reload info to get it.
	#
Leigh B Stoller's avatar
Leigh B Stoller committed
725
	$lvname = "image+" . $imagename;
726
	if (!findLVMLogicalVolume($lvname)) {
727
	    TBScriptUnlock();
728
729
730
	    fatal("xen_vnodeCreate: ".
		  "cannot find logical volume for $lvname, and no reload info");
	}
731
    }
732
    else {
Leigh B Stoller's avatar
Leigh B Stoller committed
733
	$lvname = "image+" . $imagename;
734
735
736
	$inreload = 1;

	print STDERR "xen_vnodeCreate: loading image '$imagename'\n";
737

738
	# Tell stated we are getting ready for a reload
739
	libutil::setState("RELOADSETUP");
740
741
742
743
744
745
746

	#
	# Immediately drop into RELOADING before calling createImageDisk as
	# that is the place where any image will be downloaded from the image
	# server and we want that download to take place in the longer timeout
	# period afforded by the RELOADING state.
	#
747
	libutil::setState("RELOADING");
748

749
	if (createImageDisk($imagename, $vnode_id, $raref)) {
750
	    TBScriptUnlock();
751
752
753
	    fatal("xen_vnodeCreate: ".
		  "cannot create logical volume for $imagename");
	}
754
755
    }

Leigh B Stoller's avatar
Leigh B Stoller committed
756
757
758
759
760
761
762
763
764
765
766
    #
    # Load this from disk.
    #
    if (!defined($imagemetadata)) {
	if (LoadImageMetadata($imagename, \$imagemetadata)) {
	    TBScriptUnlock();
	    fatal("xen_vnodeCreate: ".
		  "cannot load image metadata for $imagename");
	}
    }

767
    #
768
    # See if the image is really a package.
769
    #
Leigh B Stoller's avatar
Leigh B Stoller committed
770
771
772
773
774
    if (exists($imagemetadata->{'ISPACKAGE'}) && $imagemetadata->{'ISPACKAGE'}){
	my $imagepath = lvmVolumePath($lvname);
	# In case of reboot.
	mysystem("mkdir -p /mnt/$imagename")
	    if (! -e "/mnt/$imagename");
775
776
	mysystem("mount $imagepath /mnt/$imagename")
	    if (! -e "/mnt/$imagename/.mounted");
777

778
779
780
781
782
	mysystem2("$RESTOREVM -t $VMDIR/$vnode_id $vnode_id /mnt/$imagename");
	if ($?) {
	    TBScriptUnlock();
	    fatal("xen_vnodeCreate: ".
		  "cannot restore logical volumes from $imagename");
783
	}
784
785
786
787
	if ($inreload) {
	    libutil::setState("RELOADDONE");
	    sleep(4);
	}
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
	
	#
	# All of the lvms are created and a new xm.conf created.
	# Read that xm.conf in so we can figure out what lvms we
	# need to delete later (recreate the disks array). 
	#
	my $conf = configFile($vnode_id);
	my $aref = readXenConfig($conf);
	if (!$aref) {
	    TBScriptUnlock();
	    fatal("xen_vnodeCreate: ".
		  "Cannot read restored config file from $conf");
	}
	$vninfo->{'cffile'} = $aref;
	
	my $disks = parseXenDiskInfo($vnode_id, $aref);
	if (!defined($disks)) {
	    TBScriptUnlock();
	    fatal("xen_vnodeCreate: Could not restore disk info from $conf");
	}
	$private->{'disks'} = $disks;
809
810
811
812
813
814
	#
	# We want to support extra disk space on this path, but we cannot
	# just stick into the 4th partition like we do below, but have to
	# add an extra disk instead. But to do that we have to look at the
	# disks we just parsed and see what the highest lettered drive is.
	#
815
816
817
818
819
	if (exists($attributes->{'XEN_EXTRAFS'})) {
	    my $dsize   = $attributes->{'XEN_EXTRAFS'};
	    my $auxchar = ord('c');
	    my @stanzas = ();
	    
820
	    my $dpre = "xvd";
821
822
	    foreach my $disk (keys(%{$private->{'disks'}})) {
		my ($lvname,$vndisk,$vdisk) = @{$private->{'disks'}->{$disk}};
823
824
825
826
827
		if ($vdisk =~ /^(sd)(\w)$/ || $vdisk =~ /^(xvd)(\w)$/ ||
		    $vdisk =~ /^(hd)(\w)$/) {
		    $dpre = $1;
		    $auxchar = ord($2)
			if (ord($2) > $auxchar);
828
829
830
831
		}
		# Generate a new set of stanzas. see below.
		push(@stanzas, "'phy:$vndisk,$vdisk,w'");
	    }
832
	    my $vdisk = $dpre .	chr($auxchar);
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
	    my $auxlvname = "${vnode_id}.${vdisk}";
	    
	    if (!findLVMLogicalVolume($auxlvname)) {
		if (createAuxDisk($auxlvname, $dsize . "G")) {
		    fatal("libvnode_xen: could not create aux disk: $vdisk");
		}
	    }
	    my $vndisk = lvmVolumePath($auxlvname);
	    my $stanza = "'phy:$vndisk,$vdisk,w'";
	    $private->{'disks'}->{$auxlvname} = [$auxlvname, $vndisk, $vdisk];
	    push(@stanzas, $stanza);

	    #
	    # Replace the existing line in the conf file. 
	    #
	    addConfig($vninfo, "disk = [" . join(",", @stanzas) . "]", 2);
849
850
851

	    # Cause we have no idea.
	    $private->{'os'} = "other";
852
	}
853
	
854
	TBScriptUnlock();
Leigh B Stoller's avatar
Leigh B Stoller committed
855
	CreateVnodeUnlock();
856
	goto done;
857
858
    }

859
    #
Leigh B Stoller's avatar
Leigh B Stoller committed
860
    # We get the OS and version from loadinfo.
861
    #
862
    my $vdiskprefix = "sd";	# yes, this is right for FBSD too
863
    my $ishvm = 0;
864
    my $os;
Leigh B Stoller's avatar
Leigh B Stoller committed
865
866
    
    if ($imagemetadata->{'PARTOS'} =~ /freebsd/i) {
867
	$os = "FreeBSD";
868
869
870
871
872

	# XXX we assume that all 10.0 and above will be PVHVM
	if ($imagemetadata->{'OSVERSION'} >= 10) {
	    $ishvm = 1;
	}
873
874
    }
    else {
875
	$os = "Linux";
876

Leigh B Stoller's avatar
Leigh B Stoller committed
877
	if ($xeninfo{xen_major} >= 4) {
878
	    $vdiskprefix = "xvd";
879
	}
880
    }
881
    $private->{'os'} = $os;
882
    $private->{'ishvm'} = $ishvm;
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901

    # All of the disk stanzas for the config file.
    my @alldisks = ();
    # Cache the config file, but will read it later.
    $private->{'disks'} = {};

    #
    # The root disk.
    #
    my $rootvndisk = lvmVolumePath($vnode_id);

    #
    # Since we may have (re)loaded a new image for this vnode, check
    # and make sure the vnode snapshot disk is associated with the
    # correct image.  Otherwise destroy the current vnode LVM so it
    # will get correctly associated below.
    #
    if (findLVMLogicalVolume($vnode_id)) {
	my $olvname = findLVMOrigin($vnode_id);
Leigh B Stoller's avatar
Leigh B Stoller committed
902
903
904
	if (defined($raref) ||
	    ($olvname ne "" && $olvname ne $lvname)) {
	    RunWithLock("kpartx", "kpartx -dv $rootvndisk");
Leigh B Stoller's avatar
Leigh B Stoller committed
905
	    if (mysystem2("lvremove -f $VGNAME/$vnode_id")) {
906
907
908
909
910
911
912
		TBScriptUnlock();
		fatal("xen_vnodeCreate: ".
		      "could not destroy old disk for $vnode_id");
	    }
	}
    }

913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
    #
    # Figure out what slice the image is going in. It might be a whole
    # disk image though, so need to figure out what partition to boot.
    # Otherwise we force single slice images into its partition, and
    # put a swap partition after it. Lastly, if an extra disk partition
    # was requested, put that after the swap partition. This will allow
    # the user to take a whole disk image snapshot and load it on a physical
    # node later. 
    #
    print Dumper($imagemetadata);
    my $loadslice  = $imagemetadata->{'PART'};
    my $bootslice  = $loadslice;
    my $rootvdisk  = "${vdiskprefix}a";
    my $rootstanza = "phy:$rootvndisk,${vdiskprefix}a,w";
    push(@alldisks, "'$rootstanza'");

929
930
931
932
    #
    # Create the snapshot LVM.
    #
    if (!findLVMLogicalVolume($vnode_id)) {
933
934
935
936
937
	#
	# Need to create a new disk for the container. But lets see
	# if we have a disk cached. We still have the imagelock at
	# this point.
	#
Leigh B Stoller's avatar
Leigh B Stoller committed
938
939
940
941
942
943
	# Ick, this has to be done under an exclusive lock, but we
	# are currently running under a shared lock. We cannot drop
	# the shared lock though (and flock does promotion by drop
	# and relock). So, need to take another lock if we find
	# cached files.
	#
944
945
946
947
948
949
950
	if (my (@files) = glob("/dev/$VGNAME/_C_${imagename}_*")) {
	    #
	    # Grab the first file and rename it. It becomes ours.
	    # Then drop the lock.
	    #
	    my $file = $files[0];
	    if (mysystem2("lvrename $file $rootvndisk")) {
Leigh B Stoller's avatar
Leigh B Stoller committed
951
		TBScriptUnlock();
952
		fatal("libvnode_xen: could not rename cache file");
Leigh B Stoller's avatar
Leigh B Stoller committed
953
954
955
	    }
	}
	else {
956
957
958
959
960
961
962
963
	    my $extrafs = 
		(exists($attributes->{'XEN_EXTRAFS'}) ?
		 $attributes->{'XEN_EXTRAFS'} : undef);
		 
	    if (CreatePrimaryDisk($lvname,
				  $imagemetadata, $vnode_id, $extrafs)) {
		TBScriptUnlock();
		fatal("libvnode_xen: could not clone $lvname");
964
	    }
965
966
967
968
	    if ($inreload) {
		libutil::setState("RELOADDONE");
		sleep(5);
		
969
		#
970
971
972
		# We have to ask what partition to boot, since the
		# that info does not come across in the loadinfo, and
		# we cannot ask until RELOADDONE is sent. 
973
		#
Leigh B Stoller's avatar
Leigh B Stoller committed
974
		if ($loadslice == 0 && !exists($imagemetadata->{'BOOTPART'})) {
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
		    my @tmp;

		    if (getbootwhat(\@tmp) || !scalar(@tmp) ||
			!exists($tmp[0]->{"WHAT"}) ||
			$tmp[0]->{"WHAT"} !~ /^\d*$/) {
			print STDERR Dumper(\@tmp);
			TBScriptUnlock();
			fatal("libvnode_xen: could not get bootwhat info");
		    }
		    $bootslice = $tmp[0]->{"WHAT"};
		    #
		    # Store it back into the metadata for next time.
		    #
		    $imagemetadata->{'BOOTPART'} = $bootslice;
		    StoreImageMetadata($imagename, $imagemetadata);
990
		}
Leigh B Stoller's avatar
Leigh B Stoller committed
991
	    }
992
	}
993
994
995
996
997
998
999
	if ($loadslice == 0) {
	    $bootslice = $imagemetadata->{'BOOTPART'};
	}
	#
	# Need to create mapper entries so we can mount the
	# boot filesystem later, for slicefix.
	#
Leigh B Stoller's avatar
Leigh B Stoller committed
1000
	if (RunWithLock("kpartx", "kpartx -av $rootvndisk")) {
For faster browsing, not all history is shown. View entire blame