mkvnode.pl 30.2 KB
Newer Older
1
2
#!/usr/bin/perl -w
#
3
# Copyright (c) 2009-2017 University of Utah and the Flux Group.
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# 
# {{{EMULAB-LICENSE
# 
# This file is part of the Emulab network testbed software.
# 
# This file is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
# 
# This file is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public
# License for more details.
# 
# You should have received a copy of the GNU Affero General Public License
# along with this file.  If not, see <http://www.gnu.org/licenses/>.
# 
# }}}
23
#
24
25
26
27
28
29
30
31
32
33
34
35
36

#
# This is the top-level vnode creation script, called via the vnodesetup
# wrapper.  It is os independent, calling into routines defined
# in liblocsetup or elsewhere for os-dependent functionality.  Libraries
# contained in modules named like libvnode_<type>.pm are hooked in to
# obtain setup operations that are specific to the vnode type.
#
# This script was specific to Linux host environments, but has been modified
# to be used under FreeBSD for certain vnode-like containers.  Eventually
# all vnode/jail/etc. setups under any host OS should flow through this.
#

37
38
39
40
41
use strict;
use Getopt::Std;
use English;
use Errno;
use POSIX qw(strftime);
42
43
use POSIX qw(:sys_wait_h);
use POSIX qw(:signal_h);
Leigh B Stoller's avatar
Leigh B Stoller committed
44
use POSIX qw(setsid);
45
use Data::Dumper;
46
47
use Storable;
use vars qw($vnstate);
48
49
50

#
# Print the command line synopsis and exit with status 1. The synopsis
# lists every switch accepted by the option list ("dcs"), not just -d.
#
sub usage()
{
    print "Usage: mkvnode [-d] [-c | -s] vnodeid\n" .
          "  -d   Debug mode.\n" .
	  "  -c   Cleanup stale container\n".
	  "  -s   Show state for container\n".
          "";
    exit(1);
}
58
# Switch letters handed to getopts().
my $optlist = "dcs";
# Debugging starts out enabled.
my $debug = 1;
# Set by the -c and -s switches respectively.
my ($cleanup, $showstate) = (0, 0);
# The vnode we operate on; taken from the command line.
my $vnodeid;

#
# Turn off line buffering on output
#
$| = 1;

# Drag in path stuff so we can find emulab stuff.
BEGIN { require "/etc/emulab/paths.pm"; import emulabpaths; }

#
# Load the OS independent support library. It will load the OS dependent
# library and initialize itself. 
# 
use libsetup;
use libtmcc;
78
use libutil;
79
80
use libtestbed;
    
81
82
# Pull in vnode stuff
use libgenvnode;
83
84
85
86
use libvnode;

# Forward declarations of helper routines.
sub MyFatal($);
sub safeLibOp($$$;@);
sub CleanupVM();
sub TearDownStaleVM();
sub StoreState();

# File-scope state.
my $CTRLIPFILE = "/var/emulab/boot/myip";
my $VMPATH     = "/var/emulab/vms/vminfo";
my $VNDIR;
my ($leaveme, $running, $cleaning, $rebooting, $reload) = (0, 0, 0, 0, 0);
my ($vmid, $vmtype, $ret, $err);
my $ISXENVM = (GENVNODETYPE() eq "xen") ? 1 : 0;

# Values that $leaveme can take.
my ($LEAVEME_REBOOT, $LEAVEME_HALT) = (0x1, 0x2);

108
109
110
111
112
113
114
115
#
# Process the command line. Once getopts() has consumed the switches,
# exactly one argument must remain: the vnode id.
#
my %options = ();
usage()
    if (! getopts($optlist, \%options));

$debug   = 1 if (defined($options{"d"}));
$cleanup = 1 if (defined($options{"c"}));
if (defined($options{"s"})) {
    # Showing state also switches debugging off.
    ($showstate, $debug) = (1, 0);
}
if (@ARGV != 1) {
    usage();
}
($vnodeid) = @ARGV;
$VNDIR = "$VMPATH/$vnodeid";
131
132
133
134
135
136
137
138
139

#
# Only root may run this.
#
die("*** $0:\n".
    "    Must be root to run this script!\n")
    unless ($UID == 0);

#
# Handle the VIFROUTING flag from the server. It is an attribute of the
# physical host, so look it up before the vnode id is handed to libsetup
# below. This will go away at some point.
#
my %attributes = ();
if (getnodeattributes(\%attributes)) {
    die("*** $0:\n".
	"Could not get node attributes");
}
# Gack, tell backend network scripts.
system("touch $ETCDIR/xenvifrouting")
    if (exists($attributes{"xenvifrouting"}));

# Tell the library what vnode we are messing with.
libsetup_setvnodeid($vnodeid);

# Can only be computed once the vnode id has been set above.
my $RUNNING_FILE = CONFDIR() . "/running";

#
# Debug timestamps, if wanted.
#
TBDebugTimeStampsOn()
    if ($debug);

#
# The marker file is absent the first time through; take that as the cue
# to remove old state files, then create the marker.
#
unless (-e "/var/run/mkvnode.ready") {
    system("rm -f $VARDIR/vms/*/vnode.state");
    system("touch /var/run/mkvnode.ready");
}

#
# XXX: only a single vnode type per physical node is supported for now.
# That is bad, but it is the current assumption. The node type is also
# assumed, since there is only pcvm. Later this needs to come from tmcd
# so we know which lib to load.
#
my @nodetypes = ( GENVNODETYPE() );

#
# We need the domain, but there is no consistent way to get it. Ask tmcc
# for the boss node and strip the leading host component off its name.
#
my ($DOMAINNAME,$BOSSIP) = tmccbossinfo();
if (!defined($DOMAINNAME)) {
    die("Could not get bossname from tmcc!");
}
my ($domainpart) = ($DOMAINNAME =~ /^[-\w]+\.(.*)$/);
if (!defined($domainpart)) {
    die("Could not parse domain name!");
}
$DOMAINNAME = $domainpart;

die "Bad bossip '$BOSSIP' from bossinfo!"
    unless ($BOSSIP =~ /^\d+\.\d+\.\d+\.\d+$/);

202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
#
# Pull in each package implementing the libvnode API and keep its ops
# table under the type name, so that the packages (hopefully) do not
# step on our namespace too much.
#
my %libops = ();
foreach my $type (@nodetypes) {
    #
    # The type name is interpolated into a string eval below, so refuse
    # anything that is not a plain identifier rather than carrying on
    # with the unchecked value.
    #
    if ($type =~ /^([\w\d\-]+)$/) {
	$type = $1;
    }
    else {
	die "Illegal characters in vnode type '$type'";
    }
    # load lib and initialize it
    my %ops;
    eval "use libvnode_$type; %ops = %libvnode_${type}::ops";
    if ($@) {
	die "while trying to load 'libvnode_$type': $@";
    }
    # Deliberately disabled; flip the 0 to dump the ops table.
    if (0 && $debug) {
	print "%ops($type):\n" . Dumper(\%ops);
    }
    $libops{$type} = \%ops;
    if ($debug) {
	$libops{$type}{'setDebug'}->($debug);
    }
    $libops{$type}{'init'}->();

    # need to do this for each type encountered.
    TBDebugTimeStampWithDate("starting $type rootPreConfig()");
    $libops{$type}{'rootPreConfig'}->($BOSSIP);
    TBDebugTimeStampWithDate("finished $type rootPreConfig()");
}
if ($debug) {
    print "GENVNODETYPE " . GENVNODETYPE() . "\n";
    # Dump by reference so the hash is shown as one structure.
    print "libops:\n" . Dumper(\%libops);
}


237
238
239
240
241
242
243
244
245
246
#
# This holds the container state set up by the library. There is state
# added here, and state added in the library ("private"). We locally
# redefine this below, so it cannot be a lexical.
#
# NOTE: There should be NO state in here that needs to survive reboot.
#       We just remove them all when rebooting. See above.
#
$vnstate = { "private" => {} };

#
# Quickie way to show the state (-s): dump the stored state, ask the
# library for the container status, print it and exit.
#
if ($showstate) {
    if (! -e "$VNDIR/vnode.info") {
	fatal("No vnode.info file for $vnodeid");
    }
    my $str = `cat $VNDIR/vnode.info`;
    ($vmid, $vmtype, undef) = ($str =~ /^(\d*) (\w*) ([-\w]*)$/);
    # A failed match leaves everything undefined; do not hand that to
    # the library op below.
    if (!defined($vmid)) {
	fatal("Could not parse $VNDIR/vnode.info");
    }

    my $tmp = eval { Storable::retrieve("$VNDIR/vnode.state"); };
    if ($@) {
	fatal("$@");
    }
    print Dumper($tmp);

    # So the lib op works.
    $vnstate = $tmp;

    ($ret,$err) = safeLibOp('vnodeState', 1, 0);
    if ($err) {
	fatal("Failed to get status for existing container: $err");
    }
    if ($ret eq VNODE_STATUS_UNKNOWN()) {
	print "Cannot determine status container $vmid.\n";
    }
    print "Domain is $ret\n";
    exit(0);
}

277
278
279
280
#
# Usually the caller has already made the vnodeid directory, and may
# have dropped a config file into it. When debugging that has not
# happened, so make whatever is missing here.
#
unless (-e $VMPATH) {
    mkdir($VMPATH, 0770)
	or fatal("Could not mkdir $VMPATH: $!");
}
if (!chdir($VMPATH)) {
    die("Could not chdir to $VMPATH: $!\n");
}
unless (-e $vnodeid) {
    mkdir($vnodeid, 0770)
	or fatal("Could not mkdir $vnodeid in $VMPATH: $!");
}
293
294
295
296
297
298
299
300
#
# Description of the container, handed to the library routines. Every
# slot other than the vnode id starts out undefined.
#
my %vnconfig = ( "vnodeid" => $vnodeid );
foreach my $slot (qw(config ifconfig ldconfig tunconfig attributes
		     environment storageconfig fwconfig)) {
    $vnconfig{$slot} = undef;
}

# Look up a single key in the "config" slot of %vnconfig.
sub VNCONFIG($)
{
    my ($key) = @_;

    return $vnconfig{'config'}->{$key};
}

#
# Cleanup was requested (-c). Refuse while the manager's pid file is
# still around; otherwise tear down the stale container and exit with
# the result. This has to come after the definitions above.
#
if ($cleanup) {
    # This path is in vnodesetup.
    my $pidfile = "/var/run/tbvnode-${vnodeid}.pid";

    if (! -e $pidfile) {
	exit(TearDownStaleVM());
    }
    print STDERR "Manager process still running. Use that instead.\n";
    print STDERR "If the manager is really dead, first rm $pidfile.\n";
    exit(1);
}

#
# Now we can start doing something useful.
#
my ($pid, $eid, $vname) = check_nickname();
my $nodeuuid = getnodeuuid();
if (!defined($nodeuuid)) {
    # Fall back to the vnode id when there is no uuid.
    $nodeuuid = $vnodeid;
}

#
# Fetch every piece of configuration and file it in %vnconfig.
#
my %tmp;
my @tmp;
my $tmp;
my %attrs;
my %envvars;
my $fwinfo;
my @fwrules;
my @fwhosts;

if (getgenvnodeconfig(\%tmp)) {
    fatal("Could not get vnode config for $vnodeid");
}
$vnconfig{"config"} = \%tmp;

if (getifconfig(\@tmp)) {
    fatal("getifconfig($vnodeid): $!");
}
# @tmp is reused below, so store a copy.
$vnconfig{"ifconfig"} = [ @tmp ];

if (getlinkdelayconfig(\@tmp)) {
    fatal("getlinkdelayconfig($vnodeid): $!");
}
$vnconfig{"ldconfig"} = [ @tmp ];

if (gettunnelconfig(\$tmp)) {
    fatal("gettunnelconfig($vnodeid): $!");
}
$vnconfig{"tunconfig"} = $tmp;

if (getnodeattributes(\%attrs)) {
    fatal("getnodeattributes($vnodeid): $!");
}
$vnconfig{"attributes"} = \%attrs;

if (getstorageconfig(\@tmp)) {
    fatal("getstorageconfig($vnodeid): $!");
}
$vnconfig{"storageconfig"} = [ @tmp ];

if (getenvvars(\%envvars)) {
    fatal("getenvvars(): $!");
}
$vnconfig{"environment"} = \%envvars;

if (getfwconfig(\$fwinfo, \@fwrules, \@fwhosts)) {
    fatal("getfwconfig(): $!");
}
$vnconfig{"fwconfig"} = {
    "fwinfo"  => $fwinfo,
    "fwrules" => \@fwrules,
    "fwhosts" => \@fwhosts,
};

377
378
379
380
381
#
# Turn an image specifier into a name without comma or colon, since LVM
# names cannot include either: "a,b,c" becomes "a-b-c" and "a,b,c:N"
# becomes "a-b-c-N". Returns undef for anything else, including undef.
#
my $lvmimagename = sub {
    my ($spec) = @_;

    return undef
	if (!defined($spec));
    if ($spec =~ /^([-\w]+),([-\w]+),([-\w\.]+)$/) {
	return "$1-$2-$3";
    }
    if ($spec =~ /^([-\w]+),([-\w]+),([^:]+):(\d+)$/) {
	return "$1-$2-$3-$4";
    }
    return undef;
};

#
# See if we 1) are supposed to be "booting" into the reload mfs, and 2) if
# we have loadinfo.  Need both to reload!
#
fatal("getbootwhat($vnodeid): $!")
    if (getbootwhat(\@tmp));
if (scalar(@tmp) && exists($tmp[0]->{"WHAT"})) {
    if ($tmp[0]->{"WHAT"} =~ /frisbee-pcvm/) {
	#
	# Ok, we're reloading, using the fake frisbee pcvm mfs.
	#
	$reload = 1;

	fatal("getloadinfo($vnodeid): $!")
	    if (getloadinfo(\@tmp));
	if (!scalar(@tmp)) {
	    fatal("vnode $vnodeid in reloading, but got no loadinfo!");
	}
	#
	# Loadinfo can now be a list, when loading deltas. Actually, I suppose
	# we could support loading multiple partitions, but other stuff would
	# have to change for that to work, so not going there right now.
	#
	$vnconfig{"reloadinfo"} = \@tmp;
	#
	# But the image we eventually boot is in jailconfig.
	#
	my $image = $lvmimagename->(VNCONFIG('IMAGENAME'));
	if (defined($image)) {
	    $vnconfig{"image"} = $image;
	}
	else {
	    fatal("vnode $vnodeid in reloading, but got bogus IMAGENAME " .
		   VNCONFIG('IMAGENAME') . " from jailconf!");
	}
	#
	# Apply the same transform to each loadinfo so that we do not have
	# to duplicate it in the library.
	#
	foreach my $ref (@tmp) {
	    my $name = $lvmimagename->($ref->{'IMAGEID'});
	    if (defined($name)) {
		$ref->{'IMAGENAME'} = $name;
	    }
	    else {
		fatal("Bad IMAGEID in loadinfo");
	    }
	}
    }
    elsif ($tmp[0]->{"WHAT"} =~ /^\d*$/) {
	#
	# We are using bootwhat for a much different purpose than intended.
	# It tells us a partition number, but that is meaningless. Look at
	# the jailconfig to see what image should boot. That image better
	# be resident already. An IMAGENAME that does not parse is not an
	# error here; "image" is simply left unset.
	#
	my $image = $lvmimagename->(VNCONFIG('IMAGENAME'));
	$vnconfig{"image"} = $image
	    if (defined($image));
    }
    else {
	# The library will boot the default, whatever that is.
    }
}

if ($debug) {
    print "VN Config:\n";
    print Dumper(\%vnconfig);
}

457
458
459
460
461
462
#
# Signal handler; vnodesetup can send us signals. USR1 means halt, USR2
# means reboot, anything else means the container is being killed. The
# handler ends by calling MyFatal().
#
sub handler ($) {
    my ($signame) = @_;

    print STDERR "mkvnode ($PID) caught a SIG${signame}!\n";

    # No more interruptions during teardown.
    @SIG{qw(INT USR1 USR2 HUP)} = ('IGNORE') x 4;

    # Signals that leave the container behind, with the word used for
    # them in the messages below.
    my %leaving = (
	'USR1' => [ $LEAVEME_HALT,   "halted" ],
	'USR2' => [ $LEAVEME_REBOOT, "rebooted" ],
    );
    my $str = "killed";
    if (exists($leaving{$signame})) {
	($leaveme, $str) = @{ $leaving{$signame} };
    }

    #
    # XXX this is a woeful hack for vnodesetup. At the end of rebootvnode,
    # vnodesetup calls hackwaitandexit, which essentially waits for a vnode
    # to be well on its way back up before returning. That call was
    # apparently added for the lighter-weight "reconfigure a vnode" (as
    # opposed to reboot it) path, but it makes reboot of a vnode behave
    # differently from reboot of a pnode, which returns as soon as the
    # node stops responding (i.e., when it goes down, not when it comes
    # back up). Why do I care? Because Xen vnodes cannot always "reboot"
    # under the current semantics in less than 30 seconds, which is the
    # timeout in libreboot.
    #
    # Touching the "running" file here forces hackwaitandexit to return
    # when the vnode is shut down in Xen (or OpenVZ), which matches the
    # pnode semantics more closely while leaving the BSD jail case (which
    # doesn't use this code) alone. This obviously needs to be revisited.
    #
    if ($leaveme && -e "$RUNNING_FILE") {
	mysystem("touch $RUNNING_FILE");
    }

    print STDERR "Container is being $str\n";
    MyFatal("Container has been $str by $signame");
}

505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
#
# A vnode.info file means we are rebooting an existing container. It may
# however be stale or aborted (one that failed to set up or tear down)
# and got left behind. Shared nodes are another wrinkle: the node uuid
# tells us whether this is a different logical pcvm with the same name,
# which has to be destroyed before setting up.
#
if (-e "$VNDIR/vnode.info") {
    my $teardown = 0;
    my $uuid;

    my $info = `cat $VNDIR/vnode.info`;
    ($vmid, $vmtype, $uuid) = ($info =~ /^(\d*) (\w*) ([-\w]*)$/);

    # Consistency check: the per-vmid file must name this vnode.
    if (! -e "$VMPATH/vnode.$vmid") {
	fatal("No matching file: $VMPATH/vnode.$vmid");
    }
    my $owner = `cat $VMPATH/vnode.$vmid`;
    chomp($owner);
    fatal("Inconsistent vnodeid in $VMPATH/vnode.$vmid")
	if ($owner ne $vnodeid);

    if ($uuid ne $nodeuuid) {
	print "UUID mismatch; tearing down stale vnode $vnodeid\n";
	$teardown = 1;
    }
    elsif ($reload) {
	print "Reload requested, tearing down old vnode\n";
	$teardown = 1;
    }
    else {
	# We (might) need this to discover the state. The override only
	# lasts for this block.
	local $vnstate = { "private" => {} };

	if (-e "$VNDIR/vnode.state") {
	    my $saved = eval { Storable::retrieve("$VNDIR/vnode.state"); };
	    if (!$@) {
		$vnstate->{'private'} = $saved->{'private'};
	    }
	    else {
		print STDERR "$@";
		$teardown = 1;
	    }
	}
	($ret,$err) = safeLibOp('vnodeState', 1, 0);
	fatal("Failed to get status for existing container: $err")
	    if ($err);

	if ($ret eq VNODE_STATUS_UNKNOWN()) {
	    print "Cannot determine status container $vmid. Deleting ...\n";
	    $teardown = 1;
	}
	elsif ($ret eq VNODE_STATUS_MOUNTED()) {
	    print("vnode $vnodeid still mounted. Unmounting then restarting\n");
	    $teardown = 1;
	    $leaveme  = $LEAVEME_REBOOT;
	}
	elsif ($ret ne VNODE_STATUS_STOPPED()) {
	    fatal("vnode $vnodeid not stopped, not booting!");
	}
    }

    if (!$teardown) {
	$rebooting = 1;
    }
    else {
	if (TearDownStaleVM()) {
	    #
	    # This really sucks. We have to be careful that the caller
	    # (vnodesetup) does not remove the data directory, or else
	    # we will not be able to come back here next time for cleanup.
	    #
	    print STDERR "Could not tear down stale container\n";
	    exit(1);
	}
	# See MOUNTED case above; leaveme was set there to keep the
	# container file systems, and has to be cleared again now.
	$leaveme = 0;
    }
}

Leigh B Stoller's avatar
Leigh B Stoller committed
586
587
588
#
# Install handlers *after* the stale container teardown, since that sets
# them to IGNORE while it runs.
#
# TERM is ignored since we want our caller to catch it first and then
# send it down to us.
#
$SIG{TERM} = 'IGNORE';
#
# USR1: Halt container and exit. Tear down transient state, leave disk.
# USR2: Halt container and exit. Leave all state intact (we are rebooting).
# HUP:  Halt container and exit. Tear down all state including disk.
# INT:  shares the same handler.
#
@SIG{qw(USR1 USR2 HUP INT)} = (\&handler) x 4;

#
# Initial pre config for the experimental network. Make sure the required
# devices and whatever else can be allocated before going any further.
#
TBDebugTimeStampWithDate("starting rootPreConfigNetwork()");
$ret = eval {
    my $type = GENVNODETYPE();

    $libops{$type}{'rootPreConfigNetwork'}->($vnodeid, undef,
					      \%vnconfig,
					      $vnstate->{'private'});
};
my $preconfigerr = $@;
if ($ret || $preconfigerr) {
    if ($preconfigerr) {
	print STDERR $preconfigerr;
    }
    # The library is required to clean up after itself on failure, so
    # we can just exit without worrying about cleanup.
    fatal("rootPreConfigNetwork failed!");
}
TBDebugTimeStampWithDate("finished rootPreConfigNetwork()");

622
623
624
625
626
627
628
if (-e "$VNDIR/vnode.info") {
    #
    # Existing container: restore only selected keys from the saved
    # state (for vnodepreconfig) and throw away the private data.
    #
    if (-e "$VNDIR/vnode.state") {
	my $saved = eval { Storable::retrieve("$VNDIR/vnode.state"); };
	if ($@) {
	    print STDERR "$@";
	}
	else {
	    foreach my $key (qw(os rootpartition ishvm)) {
		next
		    if (!exists($saved->{$key}));
		$vnstate->{'private'}->{$key} = $saved->{$key};
	    }
	}
    }
}
else {
    #
    # XXX XXX XXX: need to get this from tmcd!
    # NOTE: we first put the type into vndb so that the create call can go!
    #
    $vmtype = GENVNODETYPE();

    ($ret,$err) = safeLibOp('vnodeCreate',0,0);
    MyFatal("vnodeCreate failed: $err")
	if ($err);
    $vmid = $ret;

    # Record the id, type and uuid, plus the vmid to vnodeid mapping.
    mysystem("echo '$vmid $vmtype $nodeuuid' > $VNDIR/vnode.info");
    mysystem("echo '$vnodeid' > $VMPATH/vnode.$vmid");

    # bootvnodes wants this to be here...
    mysystem("mkdir -p /var/emulab/jails/$vnodeid");
}

# This state structure is saved to disk for TearDown and Reboot.
$vnstate->{"vmid"}   = $vmid;
$vnstate->{"vmtype"} = $vmtype;
$vnstate->{"uuid"}   = $nodeuuid;

# Copy these out of the private area so they are saved for reboot.
foreach my $key (qw(os rootpartition ishvm)) {
    if (exists($vnstate->{'private'}->{$key})) {
	$vnstate->{$key} = $vnstate->{'private'}->{$key};
    }
}

# Store the state to disk.
MyFatal("Could not store container state to disk")
    if (StoreState());

678
679
# Use the configured control net MAC if there is one, else derive it
# from the control net IP.
my $cnet_mac = VNCONFIG('CTRLMAC');
if (!defined($cnet_mac)) {
    $cnet_mac = ipToMac(VNCONFIG('CTRLIP'));
}

# The external control net IP is read from a file and must be a dotted
# quad.
my $ext_ctrlip = `cat $CTRLIPFILE`;
chomp($ext_ctrlip);
unless ($ext_ctrlip =~ /^(\d+)\.(\d+)\.(\d+)\.(\d+)$/) {
    # cannot/should not really go on if this happens.
    MyFatal("error prior to vnodePreConfigControlNetwork($vnodeid): " .
	    " could not find valid ip in $CTRLIPFILE!");
}
my $longdomain = "${eid}.${pid}.${DOMAINNAME}";

689
690
691
692
693
694
695
#
# Callback handed to the vnodePreConfig op, for doing things to the
# container before it boots. $path is used as the prefix for the files
# changed here. Always returns 0.
#
sub callback($)
{
    my ($path) = @_;

    #
    # Set up the port(s) sshd listens on. When the vnode has its own IP,
    # listen on 22 as well as the per-vnode port.
    #
    if (defined(VNCONFIG('SSHDPORT')) && VNCONFIG('SSHDPORT') ne "") {
	my $sshdport = VNCONFIG('SSHDPORT');
	my @lines    = ("# EmulabJail", "Port $sshdport");

	push(@lines, "Port 22")
	    if (VNCONFIG('CTRLIP') ne $ext_ctrlip);
	push(@lines, "# EndEmulabJail");

	# One append per line, in order.
	foreach my $line (@lines) {
	    mysystem2("echo '$line' >> $path/etc/ssh/sshd_config");
	}
    }
    # Localize the timezone.
    mysystem2("cp -fp /etc/localtime $path/etc");

    return 0;
}

716
# OP: preconfig
MyFatal("vnodePreConfig failed")
    if (safeLibOp('vnodePreConfig', 1, 1, \&callback));

# OP: control net preconfig
MyFatal("vnodePreConfigControlNetwork failed")
    if (safeLibOp('vnodePreConfigControlNetwork', 1, 1,
		  VNCONFIG('CTRLIP'), VNCONFIG('CTRLMASK'), $cnet_mac,
		  $ext_ctrlip, $vname, $longdomain, $DOMAINNAME, $BOSSIP));

# OP: exp net preconfig, then resources, then devices.
foreach my $op (qw(vnodePreConfigExpNetwork
		   vnodeConfigResources
		   vnodeConfigDevices)) {
    if (safeLibOp($op, 1, 1)) {
	MyFatal("$op failed");
    }
}

#
# Route to inner ssh, but not if the IP is routable, no need to.
#
if (defined(VNCONFIG('SSHDPORT')) && VNCONFIG('SSHDPORT') ne "" &&
    !isRoutable(VNCONFIG('CTRLIP'))) {
    my $rule = {
	'ext_ip'   => $ext_ctrlip,
	'ext_port' => VNCONFIG('SSHDPORT'),
	'int_ip'   => VNCONFIG('CTRLIP'),
	'int_port' => VNCONFIG('SSHDPORT'),
	'protocol' => "tcp",
    };

    # Remember the rule only if it was installed.
    if (libvnode::forwardPort($rule) == 0) {
	$vnstate->{'sshd_iprule'} = $rule;
    }
}

756
#
# Start the container. If all goes well, this will exit cleanly, with
# it running in its new context. Still, lets protect it with a timer
# since it might get hung up inside and we do not want to get stuck here.
#
# For non-Xen types the boot op runs in a forked child, which the parent
# waits on for up to 180 seconds. Xen goes through safeLibOp() directly.
#
if (!$ISXENVM) {
    my $childpid = fork();
    if (!defined($childpid)) {
	#
	# fork() failed. Without this check the undefined value would
	# send the parent down the child branch below, where it would
	# run the boot op itself and then exit.
	#
	MyFatal("$vnodeid container startup failed: could not fork: $!");
    }
    elsif ($childpid) {
	my $timedout = 0;
	local $SIG{ALRM} = sub { kill("TERM", $childpid); $timedout = 1; };
	alarm 180;
	waitpid($childpid, 0);
	alarm 0;

	#
	# If failure then cleanup.
	#
	if ($? || $timedout) {
	    MyFatal("$vnodeid container startup ".
		    ($timedout ? "timed out." : "failed."));
	}
    }
    else {
	#
	# We want to call this as clean as possible.
	#
	$SIG{TERM} = 'DEFAULT';
	$SIG{INT}  = 'DEFAULT';
	$SIG{USR1} = 'DEFAULT';
	$SIG{USR2} = 'DEFAULT';
	$SIG{HUP}  = 'DEFAULT';
	POSIX::setsid();

	if ($libops{$vmtype}{"vnodeBoot"}->($vnodeid, $vmid,
					    \%vnconfig, $vnstate->{'private'})){
	    print STDERR "*** ERROR: vnodeBoot failed\n";
	    exit(1);
	}
	exit(0);
    }
}
elsif (safeLibOp('vnodeBoot', 1, 1)) {
    MyFatal("$vnodeid container startup failed.");
}
800
MyFatal("vnodePostConfig failed")
    if (safeLibOp('vnodePostConfig', 1, 1));

# XXX: need to do this for each type encountered!
TBDebugTimeStampWithDate("starting $vmtype rootPostConfig()");
$libops{$vmtype}{'rootPostConfig'}->();
TBDebugTimeStampWithDate("finished $vmtype rootPostConfig()");

print "VN State:\n", Dumper($vnstate)
    if ($debug);

# Store the state to disk.
MyFatal("Could not store container state to disk")
    if (StoreState());

# This is for vnodesetup
mysystem("touch $RUNNING_FILE");
$running = 1;
820
821

#
# Watch for the container stopping. We used to run a sleep inside it and
# wait for that to exit, but that is not portable across the backends
# and the return value did not say how it exited. So just poll the
# status every few seconds instead.
#
# XXX Turn off debugging during this loop to keep the log file from growing.
#
if ($debug) {
    TBDebugTimeStampsOff();
}

while (1) {
    sleep(5);

    #
    # If the container exits, either it rebooted from the inside or
    # the physical node is rebooting, or we are actively trying to kill
    # it cause our parent (vnodesetup) told us to. In all cases, we just
    # exit and let the parent decide what to do.
    #
    my ($state, $staterr) = safeLibOp('vnodeState', 0, 0);
    fatal("*** ERROR: vnodeState: $staterr\n")
	if ($staterr);
    next
	if ($state eq VNODE_STATUS_RUNNING());

    print "Container is no longer running.\n";
    last
	if ($cleaning);

    #
    # Rebooted from inside, but not cause we told it to, so leave it
    # intact.
    #
    # Before we fold, wait a moment and check again: in XEN the user can
    # type reboot, which makes the domain disappear for a while, and we
    # do not want to be fooled by that. Halt is another issue; if the
    # user halts from inside the container it is never coming back and
    # the user has screwed himself. Need to restart from the frontend.
    #
    sleep(15);
    ($state, $staterr) = safeLibOp('vnodeState', 0, 0);
    fatal("*** ERROR: vnodeState: $staterr\n")
	if ($staterr);
    if ($state eq VNODE_STATUS_RUNNING()) {
	print "Container has restarted itself.\n";
	next;
    }
    $leaveme = $LEAVEME_REBOOT;
    last;
}
if ($debug) {
    TBDebugTimeStampsOn();
}
exit(CleanupVM());
876
877

#
# Tear down a container whose manager (vnodesetup, mkvnode) process is
# gone, so that what it left behind gets cleaned up. Do not use this
# while the mkvnode process is still running; use vnodesetup instead.
#
# Returns 0 on success and -1 on failure.
#
sub TearDownStaleVM()
{
    fatal("TearDownStaleVM: no vnode.info file for $vnodeid")
	unless (-e "$VNDIR/vnode.info");

    my $info = `cat $VNDIR/vnode.info`;
    ($vmid, $vmtype, undef) = ($info =~ /^(\d*) (\w*) ([-\w]*)$/);

    #
    # Work on a local copy of the state so the outer version is not
    # overwritten; just a precaution. A missing state file is not an
    # error, we carry on with the empty state.
    #
    local $vnstate = { "private" => {} };
    my $statefile  = "$VNDIR/vnode.state";

    if (-e $statefile) {
	$vnstate = eval { Storable::retrieve($statefile); };
	if ($@) {
	    print STDERR "$@";
	    return -1;
	}
	print "vnstate:\n", Dumper($vnstate)
	    if ($debug);
    }

    # No interruptions during stale teardown.
    @SIG{qw(INT USR1 USR2 HUP)} = ('IGNORE') x 4;

    #
    # When the cleanup fails, write the state back to disk so that any
    # changes made along the way are captured.
    #
    if (CleanupVM()) {
	StoreState();
	return -1;
    }
    @SIG{qw(INT USR1 USR2 HUP)} = ('DEFAULT') x 4;

    return 0;
}

#
# Clean things up.
#
936
sub CleanupVM()
{
    #
    # Tear down the vnode: remove the sshd port forward, kill the tmcc
    # proxy, halt or unmount the container if needed, and then either
    # tear down the transient state ($leaveme set to halt/reboot) or
    # destroy the container and remove its bookkeeping files.
    #
    # Returns 0 on success (or when there is nothing to do) and -1 when
    # a library operation fails. Dies if called while a cleanup is
    # already marked as in progress.
    #
    # NOTE(review): $cleaning is only cleared on the full-destroy success
    # path at the bottom; every early return leaves it set, so a later
    # call in the same process will die. Confirm that is intended.
    #
    if ($cleaning) {
	die("*** $0:\n".
	    "    Oops, already cleaning!\n");
    }
    $cleaning = 1;

    # If the container was never built, there is nothing to do.
    return 0
	if (! -e "$VNDIR/vnode.info" || !defined($vmid));

    if (exists($vnstate->{'sshd_iprule'})) {
	my $ref = $vnstate->{'sshd_iprule'};
	libvnode::removePortForward($ref);
	# Update new state.
	delete($vnstate->{'sshd_iprule'});
	StoreState();
    }

    #
    # The tmcc proxy causes teardown problems, no idea why.
    # It used to be killed off from the unmount script, but let's
    # do it here.
    #
    my $PROXYPID = "/var/run/tmccproxy.${vnodeid}.pid";
    if (-e $PROXYPID) {
	my $ppid;

	# Read the pid file directly instead of going through a shell.
	if (open(my $pidfh, '<', $PROXYPID)) {
	    $ppid = <$pidfh>;
	    close($pidfh);
	}
	#
	# Only a positive integer is acceptable. Anything else must never
	# reach kill(): zero or a negative value would signal an entire
	# process group, and a non-numeric value makes kill() croak.
	# The capture also untaints the value.
	#
	if (defined($ppid) && $ppid =~ /^\s*([1-9]\d*)\s*$/) {
	    $ppid = $1;

	    if (kill('TERM', $ppid) == 0) {
		print"*** ERROR: Could not kill(TERM) proxy process $ppid: $!\n";
	    }
	    else {
		unlink($PROXYPID);
	    }
	}
	else {
	    print "*** ERROR: No valid proxy pid in $PROXYPID, ".
		"not killing anything\n";
	}
    }

    # if not halted, try that first
    # (autoerr is off here, so $ret carries the status, not an error code)
    my ($ret,$err) = safeLibOp('vnodeState', 1, 0);
    if ($err) {
	print STDERR "*** ERROR: vnodeState: ".
	    "failed to cleanup $vnodeid: $err\n";
	return -1;
    }
    if ($ret eq VNODE_STATUS_RUNNING()) {
	print STDERR "cleanup: $vnodeid not stopped, trying to halt it.\n";
	($ret,$err) = safeLibOp('vnodeHalt', 1, 1);
	if ($err) {
	    print STDERR "*** ERROR: vnodeHalt: ".
		"failed to halt $vnodeid: $err\n";
	    return -1;
	}
    }
    elsif ($ret eq VNODE_STATUS_MOUNTED()) {
	print STDERR "cleanup: $vnodeid is mounted, trying to unmount it.\n";
	($ret,$err) = safeLibOp('vnodeUnmount', 1, 1);
	if ($err) {
	    print STDERR "*** ERROR: vnodeUnmount: ".
		"failed to unmount $vnodeid: $err\n";
	    return -1;
	}
    }

    if ($leaveme) {
	if ($leaveme == $LEAVEME_HALT || $leaveme == $LEAVEME_REBOOT) {
	    #
	    # When halting, the disk state is left, but the transient state
	    # is removed since it will get reconstructed later if the vnode
	    # is restarted. This avoids leaking a bunch of stuff in case the
	    # vnode never starts up again. We of course leave the disk, but
	    # that will eventually get cleaned up if the pcvm is reused for
	    # a future experiment.
	    #
	    # XXX Reboot should be different; there is no reason to tear
	    # down the transient state, but we do not handle that yet.
	    # Not hard to add though.
	    #
	    ($ret,$err) = safeLibOp('vnodeTearDown', 1, 1);
	    # Always store in case some progress was made.
	    StoreState();
	    if ($err) {
		print STDERR "*** ERROR: failed to teardown $vnodeid: $err\n";
		return -1;
	    }
	}
	# Any other $leaveme value: leave the container exactly as it is.
	return 0;
    }

    # now destroy
    ($ret,$err) = safeLibOp('vnodeDestroy', 1, 1);
    if ($err) {
	print STDERR "*** ERROR: failed to destroy $vnodeid: $err\n";
	return -1;
    }
    # The container is gone; remove the files that describe it.
    unlink("$VNDIR/vnode.info");
    unlink("$VNDIR/vnode.state");
    unlink("$VMPATH/vnode.$vmid");
    $cleaning = 0;
    return 0;
}
    
#
# Print error and exit.
#
sub MyFatal($)
{
    my ($reason) = @_;

    #
    # A reboot that never reached the running state must not destroy
    # the container, since that might lose user data. Mark it as a
    # reboot so that cleanup leaves the container in place.
    #
    if ($rebooting && !$running) {
	$leaveme = $LEAVEME_REBOOT;
    }

    # Turn on timestamps for the cleanup pass when debugging.
    if ($debug) {
	TBDebugTimeStampsOn();
    }

    # Clean up (return value ignored), then exit with the caller's message.
    CleanupVM();

    die("*** $0:\n    $reason\n");
}

#
# Helpers:
#
1065
1066
sub safeLibOp($$$;@) {
    #
    # Invoke the vnode library operation $op for the current $vmtype with
    # signals blocked, trapping any exception it throws.
    #
    #   $op       Name of the operation in the %libops dispatch table.
    #   $autolog  Accepted for interface compatibility; currently has no
    #             effect.
    #   $autoerr  When true, a true return value from the operation is
    #             treated as a failure exit code.
    #   @args     Extra arguments passed through to the operation.
    #
    # Returns ($ret) on success, (-1, $err) if the operation died, and
    # ($ret, $err) if $autoerr is set and the operation returned true.
    # Callers typically do: my ($ret,$err) = safeLibOp(...);
    #
    my ($op,$autolog,$autoerr,@args) = @_;

    # join() on an empty list yields '', so no special case is needed.
    my $sargs = join(',', @args);

    TBDebugTimeStampWithDate("starting $vmtype $op($sargs)")
	if ($debug);

    #
    # Block signals that could kill us in the middle of a library call.
    # Might be better to do this down in the library, but this is an
    # easier place to do it. This ensures that if we have to tear down
    # in the middle of setting up, the state is consistent.
    #
    my $new_sigset = POSIX::SigSet->new(SIGHUP, SIGINT, SIGUSR1, SIGUSR2);
    my $old_sigset = POSIX::SigSet->new;
    my $blocked    = defined(sigprocmask(SIG_BLOCK, $new_sigset, $old_sigset));
    if (!$blocked) {
	print STDERR "sigprocmask (BLOCK) failed!\n";
    }

    my $ret = eval {
	$libops{$vmtype}{$op}->($vnodeid, $vmid,
				\%vnconfig, $vnstate->{'private'}, @args);
    };
    # Capture $@ right away, before anything else can clobber it.
    my $err = $@;

    #
    # Restore the previous mask only if we actually changed it. When the
    # block above failed, $old_sigset is still the empty set, and
    # installing it would unblock signals the caller had blocked.
    #
    if ($blocked && ! defined(sigprocmask(SIG_SETMASK, $old_sigset))) {
	print STDERR "sigprocmask (UNBLOCK) failed!\n";
    }

    # The operation threw an exception.
    if ($err) {
	TBDebugTimeStampWithDate("failed $vmtype $op($sargs): $err")
	    if ($debug);
	return (-1,$err);
    }

    # The operation returned a true value and the caller wants that
    # reported as an error.
    if ($autoerr && $ret) {
	$err = "$op($vnodeid) failed with exit code $ret!";
	TBDebugTimeStampWithDate("failed $vmtype $op($sargs): exited with $ret")
	    if ($debug);
	return ($ret,$err);
    }

    TBDebugTimeStampWithDate("finished $vmtype $op($sargs)")
	if ($debug);

    return $ret;
}
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130

sub StoreState()
{
    #
    # Write $vnstate to $VNDIR/vnode.state using Storable.
    # Returns 0 on success, -1 if the state could not be written.
    #
    print "Storing state to disk ...\n"
	if ($debug);

    my $statefile = "$VNDIR/vnode.state";
    my $ret = eval { Storable::store($vnstate, $statefile); };
    if ($@) {
	# Serious errors are thrown as exceptions by Storable.
	print STDERR "$@";
	return -1;
    }
    if (!defined($ret)) {
	# Storable::store() reports I/O problems by returning undef
	# rather than dying, so this has to be checked separately.
	print STDERR "*** ERROR: Could not store state to $statefile\n";
	return -1;
    }
    return 0;
}