mkvnode.pl 25.3 KB
Newer Older
1
2
3
#!/usr/bin/perl -w
#
# EMULAB-COPYRIGHT
Leigh B Stoller's avatar
Leigh B Stoller committed
4
# Copyright (c) 2009-2012 University of Utah and the Flux Group.
5
6
7
8
9
10
11
# All rights reserved.
#
use strict;
use Getopt::Std;
use English;
use Errno;
use POSIX qw(strftime);
12
13
use POSIX qw(:sys_wait_h);
use POSIX qw(:signal_h);
14
use Data::Dumper;
15
16
use Storable;
use vars qw($vnstate);
17
18
19
20
21
22

#
# The corollary to mkjail.pl in the freebsd directory ...
#
#
# Print the command line synopsis and exit with status 1.
# The synopsis lists every flag accepted by $optlist ("dcs").
#
sub usage()
{
    print "Usage: mkvnode [-d] [-c] [-s] vnodeid\n" .
          "  -d   Debug mode.\n" .
          "  -c   Cleanup stale container\n" .
          "  -s   Show state for container\n" .
          "";
    exit(1);
}
30
my $optlist  = "dcs";
31
my $debug    = 1;
32
33
my $cleanup  = 0;
my $showstate= 0;
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
my $vnodeid;

#
# Turn off line buffering on output
#
$| = 1;

# Drag in path stuff so we can find emulab stuff.
BEGIN { require "/etc/emulab/paths.pm"; import emulabpaths; }

#
# Load the OS independent support library. It will load the OS dependent
# library and initialize itself. 
# 
use libsetup;
use libtmcc;
use libtestbed;
51
use liblocsetup;
52
53
54
55
56
57
    
# Pull in libvnode
use libvnode;

# Helpers
sub MyFatal($);
58
59
60
61
sub safeLibOp($$$;@);
sub CleanupVM();
sub TearDownStaleVM();
sub StoreState();
62
63
64

# Locals
my $CTRLIPFILE = "/var/emulab/boot/myip";
65
my $VMPATH     = "/var/emulab/vms/vminfo";
66
my $IPTABLES   = "/sbin/iptables";
67
68
69
70
71
72
73
74
75
76
77
78
my $VNDIR;
my $leaveme    = 0;
my $running    = 0;
my $cleaning   = 0;
my $rebooting  = 0;
my $reload     = 0;
my ($vmid,$vmtype,$ret,$err);

# Flags for leaveme.
my $LEAVEME_REBOOT = 0x1;
my $LEAVEME_HALT   = 0x2;

79
80
81
82
83
84
85
86
#
# Parse command arguments. Once we return from getopts, all that should be
# left are the required arguments.
#
my %options = ();
if (! getopts($optlist, \%options)) {
    usage();
}
87
88
89
90
91
92
93
94
95
if (defined($options{"d"})) {
    $debug = 1;
}
if (defined($options{"c"})) {
    $cleanup = 1;
}
if (defined($options{"s"})) {
    $showstate = 1;
    $debug   = 0;
96
97
98
}
usage()
    if (@ARGV != 1);
99

100
$vnodeid = $ARGV[0];
101
$VNDIR   = "$VMPATH/$vnodeid";
102
103
104
105
106
107
108
109
110
111
112
113

#
# Must be root.
# 
if ($UID != 0) {
    die("*** $0:\n".
	"    Must be root to run this script!\n");
}

# Tell the library what vnode we are messing with.
libsetup_setvnodeid($vnodeid);

114
115
116
# Can set this after above line. 
my $RUNNING_FILE = CONFDIR() . "/running";

117
118
119
120
121
122
123
#
# Turn on debug timestamps if desired.
#
if ($debug) {
    TBDebugTimeStampsOn();
}

124
125
126
127
128
129
130
131
#
# Remove old state files at boot.
#
if (! -e "/var/run/mkvnode.ready") {
    # First run since boot (the marker file does not exist yet): throw
    # away saved container state, none of which is meant to survive a
    # reboot.
    #
    # StoreState() writes to $VMPATH/<vnodeid>/vnode.state, so that
    # location must be cleared as well. The original glob is kept
    # unchanged in case state files also exist under it.
    system("rm -f $VARDIR/vms/*/vnode.state");
    system("rm -f $VMPATH/*/vnode.state");
    system("touch /var/run/mkvnode.ready");
}

132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#
# XXX: for now, support only a single vnode type per phys node.  This is bad,
# but it's the current assumption.  For now, we also assume the nodetype since
# we only have pcvm.  Later, we need to get this info from tmcd so we know 
# which lib to load.
#
my @nodetypes = ( GENVNODETYPE() );

#
# We go through this crap so that we can pull in multiple packages implementing
# the libvnode API so they (hopefully) won't step on our namespace too much.
#
my %libops = ();
foreach my $type (@nodetypes) {
    # Untaint the type name; it is interpolated into the string eval below.
    if ($type =~ /^([\w\d\-]+)$/) {
        $type = $1;
    }
    # Load the backend library and copy out its operations table.
    my %ops;
    eval "use libvnode_$type; %ops = %libvnode_${type}::ops";
    if ($@) {
        die "while trying to load 'libvnode_$type': $@";
    }
    if (0 && $debug) {
        # Pass a reference so Dumper shows one hash, not a flattened list.
        print "%ops($type):\n" . Dumper(\%ops);
    }
    $libops{$type} = \%ops;
    if ($debug) {
        $libops{$type}{'setDebug'}->(1);
    }
    $libops{$type}{'init'}->();

    # This must be done for each type encountered, so dispatch on the
    # loop variable rather than on GENVNODETYPE().
    TBDebugTimeStampWithDate("starting $type rootPreConfig()");
    $libops{$type}{'rootPreConfig'}->();
    TBDebugTimeStampWithDate("finished $type rootPreConfig()");
}
if ($debug) {
    print "GENVNODETYPE " . GENVNODETYPE() . "\n";
    print "libops:\n" . Dumper(%libops);
}

#
# Need the domain, but no consistent way to do it. Ask tmcc for the
# boss node and parse out the domain. 
#
178
my ($DOMAINNAME,$BOSSIP) = tmccbossinfo();
179
180
181
182
183
184
185
186
187
188
die("Could not get bossname from tmcc!")
    if (!defined($DOMAINNAME));

if ($DOMAINNAME =~ /^[-\w]+\.(.*)$/) {
    $DOMAINNAME = $1;
}
else {
    die("Could not parse domain name!");
}
if ($BOSSIP !~ /^\d+\.\d+\.\d+\.\d+$/) {
189
    die "Bad bossip '$BOSSIP' from bossinfo!";
190
191
}

192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
#
# Quickie way to show the state.
#
if ($showstate) {
    if (! -e "$VNDIR/vnode.info") {
	fatal("No vnode.info file for $vnodeid");
    }
    if (! -e "$VNDIR/vnode.state") {
	fatal("no vnode.state file for $vnodeid");
    }
    my $tmp = eval { Storable::retrieve("$VNDIR/vnode.state"); };
    if ($@) {
	fatal("$@");
    }
    print Dumper($tmp);
    exit(0);
}

210
211
212
213
#
# In most cases, the vnodeid directory will have been created by the
# caller, and a config file possibly dropped in.  When debugging, we
# have to create it here.
214
215
216
217
218
#
if (! -e $VMPATH) {
    mkdir($VMPATH, 0770) or
	fatal("Could not mkdir $VMPATH: $!");
}
219
220
221
222
223
224
225
chdir($VMPATH) or
    die("Could not chdir to $VMPATH: $!\n");

if (! -e $vnodeid) {
    mkdir($vnodeid, 0770) or
	fatal("Could not mkdir $vnodeid in $VMPATH: $!");
}
226
227
228
229
230
231
232
233
#
# The container description for the library routines. 
#
my %vnconfig = ( "vnodeid"   => $vnodeid,
                 "config"    => undef,
		 "ifconfig"  => undef,
		 "ldconfig"  => undef,
		 "tunconfig" => undef,
234
		 "attributes"=> undef,
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
);
# Look up a single key in the "config" section of %vnconfig.
sub VNCONFIG($)
{
    my ($key) = @_;

    return $vnconfig{'config'}->{$key};
}

#
# If cleanup requested, make sure the manager process is not running
# Must do this after the stuff above is defined.
#
if ($cleanup) {
    # This path is in vnodesetup. 
    my $pidfile = "/var/run/tbvnode-${vnodeid}.pid";
    if (-e $pidfile) {
	print STDERR "Manager process still running. Use that instead.\n";
	print STDERR "If the manager is really dead, first rm $pidfile.\n";
	exit(1);
    }
    exit(TearDownStaleVM());
}

#
# This holds the container state set up by the library. There is state
# added here, and state added in the library ("private"). We locally
# redefine this below, so cannot be a lexical.
#
# NOTE: There should be NO state in here that needs to survive reboot.
#       We just remove them all when rebooting. See above.
#
$vnstate = { "private" => {} };
262

263
264
265
#
# Now we can start doing something useful.
#
266
my ($pid, $eid, $vname) = check_nickname();
267
268
my $nodeuuid = getnodeuuid();
$nodeuuid = $vnodeid if (!defined($nodeuuid));
269

270
271
272
273
274
275
#
# Get all the config stuff we need.
#
my %tmp;
my @tmp;
my $tmp;
276
my %attrs;
277

278
279
280
fatal("Could not get vnode config for $vnodeid")
    if (getgenvnodeconfig(\%tmp));
$vnconfig{"config"} = \%tmp;
281
282

fatal("getifconfig($vnodeid): $!")
283
284
    if (getifconfig(\@tmp));
$vnconfig{"ifconfig"} = [ @tmp ];
285
286

fatal("getlinkdelayconfig($vnodeid): $!") 
287
288
    if (getlinkdelayconfig(\@tmp));
$vnconfig{"ldconfig"} = [ @tmp ];
289

Leigh B. Stoller's avatar
Leigh B. Stoller committed
290
fatal("gettunnelconfig($vnodeid): $!")
291
292
    if (gettunnelconfig(\$tmp));
$vnconfig{"tunconfig"} = $tmp;
Leigh B. Stoller's avatar
Leigh B. Stoller committed
293

294
295
296
297
fatal("getnodeattributes($vnodeid): $!")
    if (getnodeattributes(\%attrs));
$vnconfig{"attributes"} = \%attrs;

298
if ($debug) {
299
300
    print "VN Config:\n";
    print Dumper(\%vnconfig);
301
302
}

303
304
305
306
307
#
# see if we 1) are supposed to be "booting" into the reload mfs, and 2) if
# we have loadinfo.  Need both to reload!
#
fatal("getbootwhat($vnodeid): $!") 
308
    if (getbootwhat(\@tmp));
309
310
if (scalar(@tmp) && exists($tmp[0]->{"WHAT"})) {
    if ($tmp[0]->{"WHAT"} =~ /frisbee-pcvm/) {
311
312
313
314
	#
	# Ok, we're reloading, using the fake frisbee pcvm mfs.
	#
	$reload = 1;
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
	
	fatal("getloadinfo($vnodeid): $!") 
	    if (getloadinfo(\@tmp));
	if (!scalar(@tmp)) {
	    fatal("vnode $vnodeid in reloading, but got no loadinfo!");
	}
	else {
	    if ($tmp[0]->{"IMAGEID"} =~ /^([-\d\w]+),([-\d\w]+),([-\d\w]+)$/) {
		$vnconfig{"reloadinfo"} = $tmp[0];
		$vnconfig{"image"}      = "$1-$2-$3";
	    }
	    else {
		fatal("vnode $vnodeid in reloading, but got bogus IMAGEID " . 
		      $tmp[0]->{"IMAGEID"} . " from loadinfo!");
	    }
	}
    }
332
333
334
335
336
337
338
339
340
341
342
343
344
345
    elsif ($tmp[0]->{"WHAT"} =~ /^\d*$/) {
	#
        # We are using bootwhat for a much different purpose than intended.
	# It tells us a partition number, but that is meaningless. Look at
	# the jailconfig to see what image should boot. That image better
	# be resident already. 
	#
	if (VNCONFIG('IMAGENAME') =~ /^([-\w]+),([-\w]+),([-\w]+)$/) {
	    $vnconfig{"image"}      = "$1-$2-$3";
	}
    }
    else {
	# The library will boot the default, whatever that is.
    }
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
}

#
# Install a signal handler. We can get signals from vnodesetup.
#
#
# Signal handler. Records how much of the container should be left
# behind ($leaveme) based on which signal arrived, then hands off to
# MyFatal(), which cleans up and dies.
#
#   USR1  -> $LEAVEME_HALT   ("halted")
#   USR2  -> $LEAVEME_REBOOT ("rebooted")
#   other -> $leaveme is left as it was ("killed")
#
sub handler ($) {
    my ($signame) = @_;

    # Shut out further signals while we tear things down.
    foreach my $sig ('INT', 'USR1', 'USR2', 'HUP') {
        $SIG{$sig} = 'IGNORE';
    }

    my $str;
    if ($signame eq 'USR1') {
        ($leaveme, $str) = ($LEAVEME_HALT, "halted");
    }
    elsif ($signame eq 'USR2') {
        ($leaveme, $str) = ($LEAVEME_REBOOT, "rebooted");
    }
    else {
        $str = "killed";
    }

    #
    # XXX Hack for vnodesetup. At the end of rebootvnode, vnodesetup
    # calls hackwaitandexit, which waits until the vnode is well on its
    # way back up. That makes reboot of a vnode behave differently from
    # reboot of a pnode, which returns as soon as the node goes down.
    # Xen vnodes cannot always "reboot" in that sense within the 30
    # second timeout in libreboot.
    #
    # Touching the "running" file here forces hackwaitandexit to return
    # once the vnode has been shut down (Xen or OpenVZ), which is closer
    # to the pnode semantics. The BSD jail case does not use this code.
    # This needs to be revisited.
    #
    if ($leaveme && -e $RUNNING_FILE) {
        mysystem("touch $RUNNING_FILE");
    }

    MyFatal("mkvnode ($PID) caught a SIG${signame}! container $str");
}

393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
#
# If this file exists, we are rebooting an existing container. But
# need to check if its a stale or aborted container (one that failed
# to setup or teardown) and got left behind. Another wrinkle is shared
# nodes, so we use the node uuid to determine if its another logical
# pcvm with the same name, and needs to be destroyed before setting up.
#
if (-e "$VNDIR/vnode.info") {
    my $uuid;
    my $teardown = 0;

    my $str = `cat $VNDIR/vnode.info`;
    ($vmid, $vmtype, $uuid) = ($str =~ /^(\d*) (\w*) ([-\w]*)$/);

    # Consistency check.
    fatal("No matching file: $VMPATH/vnode.$vmid")
	if (! -e "$VMPATH/vnode.$vmid");
    $str = `cat $VMPATH/vnode.$vmid`;
    chomp($str);
    if ($str ne $vnodeid) {
	fatal("Inconsistent vnodeid in $VMPATH/vnode.$vmid");
    }

    if ($uuid ne $nodeuuid) {
	print "UUID mismatch; tearing down stale vnode $vnodeid\n";
	$teardown = 1;
    }
    elsif ($reload) {
	print "Reload requested, tearing down old vnode\n";
	$teardown = 1;
423
424
    }
    else {
425
426
427
	($ret,$err) = safeLibOp('vnodeState', 1, 0);
	if ($err) {
	    fatal("Failed to get status for existing container: $err");
428
	}
429
430
431
432
433
434
	if ($ret eq VNODE_STATUS_UNKNOWN()) {
	    print "Cannot determine status container $vmid. Deleting ...\n";
	    $teardown = 1;
	}
	elsif ($ret ne VNODE_STATUS_STOPPED()) {
	    fatal("vnode $vnodeid not stopped, not booting!");
435
436
	}
    }
437
438
439
440
441
442
443
    if ($teardown) {
	TearDownStaleVM() == 0
	    or fatal("Could not tear down stale container");
    }
    else {
	$rebooting = 1;
    }
444
445
}

446
#
Leigh B Stoller's avatar
Leigh B Stoller committed
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
# Another wrinkle; tagged vlans might not be setup yet when we get
# here, and we have to know those tags before we can proceed. We
# need to spin, but with signals enabled since we do not want to
# wait forever. Okay to get a signal and die at this point. 
#
if (0 && @{ $vnconfig{'ifconfig'} }) {
  again:
    foreach my $ifc (@{ $vnconfig{'ifconfig'} }) {
	my $lan = $ifc->{LAN};
	
	next
	    if ($ifc->{ITYPE} ne "vlan");

	# got the tag.
	next
	    if ($ifc->{VTAG});

	# no tag, wait and ask again.
	print STDERR
	    "$lan does not have a tag yet. Waiting, then asking again ...\n";

	sleep(5);

	my @tmp = ();
	fatal("getifconfig($vnodeid): $!")
	    if (getifconfig(\@tmp));
	$vnconfig{"ifconfig"} = [ @tmp ];

	# Just look through everything again; simple. 
	goto again;
    }
}

#
# Install handlers *after* the stale container teardown, since we set
# them to IGNORE during the teardown.
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
# 
# Ignore TERM since we want our caller to catch it first and then send
# it down to us. 
#
$SIG{TERM} = 'IGNORE';
# Halt container and exit. Tear down transient state, leave disk.
$SIG{USR1} = \&handler;
# Halt container and exit. Leave all state intact (we are rebooting).
$SIG{USR2} = \&handler;
# Halt container and exit. Tear down all state including disk.
$SIG{HUP}  = \&handler;
$SIG{INT}  = \&handler;

#
# Initial pre config for the experimental network. We want to make sure
# we can allocate the required devices and whatever else before going
# any further. 
500
#
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
TBDebugTimeStampWithDate("starting rootPreConfigNetwork()");
$ret = eval {
    $libops{GENVNODETYPE()}{'rootPreConfigNetwork'}->($vnodeid, undef,
	\%vnconfig, $vnstate->{'private'});
};
if ($ret || $@) {
    print STDERR $@
	if ($@);
    
    # If this fails, we require the library to clean up after itself
    # so that we can just exit without worrying about cleanup.
    fatal("rootPreConfigNetwork failed!");
}
TBDebugTimeStampWithDate("finished rootPreConfigNetwork()");

516
517
518
519
520
521
522
if (! -e "$VNDIR/vnode.info") {
    #
    # XXX XXX XXX: need to get this from tmcd!
    # NOTE: we first put the type into vndb so that the create call can go!
    #
    $vmtype = GENVNODETYPE();

523
    ($ret,$err) = safeLibOp('vnodeCreate',0,0);
524
525
526
527
528
    if ($err) {
	MyFatal("vnodeCreate failed");
    }
    $vmid = $ret;

529
    mysystem("echo '$vmid $vmtype $nodeuuid' > $VNDIR/vnode.info");
530
    mysystem("echo '$vnodeid' > $VMPATH/vnode.$vmid");
531
532
533

    # bootvnodes wants this to be here...
    mysystem("mkdir -p /var/emulab/jails/$vnodeid");
534
}
535
536
537
538
539
540
541
# This state structure is saved to disk for TearDown.
$vnstate->{"vmid"}   = $vmid;
$vnstate->{"vmtype"} = $vmtype;
$vnstate->{"uuid"}   = $nodeuuid;
# Store the state to disk.
if (StoreState()) {
    MyFatal("Could not store container state to disk");
542
543
}

544
545
my $cnet_mac = (defined(VNCONFIG('CTRLMAC')) ?
		VNCONFIG('CTRLMAC') : ipToMac(VNCONFIG('CTRLIP')));
546
547
548
549
550
551
552
553
554
my $ext_ctrlip = `cat $CTRLIPFILE`;
chomp($ext_ctrlip);
if ($ext_ctrlip !~ /^(\d+)\.(\d+)\.(\d+)\.(\d+)$/) {
    # cannot/should not really go on if this happens.
    MyFatal("error prior to vnodePreConfigControlNetwork($vnodeid): " . 
	    " could not find valid ip in $CTRLIPFILE!");
}
my $longdomain = "${eid}.${pid}.${DOMAINNAME}";

555
556
557
558
559
560
561
#
# Call back to do things to the container before it boots.
#
#
# Callback handed to the vnodePreConfig library operation, to adjust
# files inside the container before it boots.
#
#   $path - root directory of the container filesystem.
#
# Always returns 0.
#
sub callback($)
{
    my ($path) = @_;
    my $sshd_config = "$path/etc/ssh/sshd_config";

    #
    # Set up sshd port to listen on. If the vnode has its own IP
    # then listen on both 22 and the per-vnode port.
    #
    # The "EmulabJail" marker guards against appending the Port lines
    # more than once. The grep must look at the container's
    # sshd_config, so the path is built in Perl and passed in list form;
    # a single-quoted command string would leave $path for the shell to
    # expand (to nothing) and test the host's own /etc/ssh/sshd_config.
    #
    if (system("grep", "-q", "-e", "EmulabJail", $sshd_config)) {
        if (defined(VNCONFIG('SSHDPORT')) && VNCONFIG('SSHDPORT') ne "") {
            my $sshdport = VNCONFIG('SSHDPORT');

            system("echo '# EmulabJail' >> $path/etc/ssh/sshd_config");
            system("echo 'Port $sshdport' >> $path/etc/ssh/sshd_config");
            if (VNCONFIG('CTRLIP') ne $ext_ctrlip) {
                system("echo 'Port 22' >> $path/etc/ssh/sshd_config");
            }
        }
    }
    # Localize the timezone.
    system("cp -fp /etc/localtime $path/etc");

    return 0;
}

583
# OP: preconfig
584
if (safeLibOp('vnodePreConfig', 1, 1, \&callback)) {
585
586
587
588
    MyFatal("vnodePreConfig failed");
}

# OP: control net preconfig
589
590
591
if (safeLibOp('vnodePreConfigControlNetwork',1,1,
	      VNCONFIG('CTRLIP'),
	      VNCONFIG('CTRLMASK'),$cnet_mac,
592
593
594
595
596
	      $ext_ctrlip,$vname,$longdomain,$DOMAINNAME,$BOSSIP)) {
    MyFatal("vnodePreConfigControlNetwork failed");
}

# OP: exp net preconfig
597
if (safeLibOp('vnodePreConfigExpNetwork', 1, 1)) {
598
599
    MyFatal("vnodePreConfigExpNetwork failed");
}
600
if (safeLibOp('vnodeConfigResources', 1, 1)) {
601
602
    MyFatal("vnodeConfigResources failed");
}
603
if (safeLibOp('vnodeConfigDevices', 1, 1)) {
604
605
606
    MyFatal("vnodeConfigDevices failed");
}

607
#
608
# Route to inner ssh, but not if the IP is routable, no need to.
609
#
610
611
if (defined(VNCONFIG('SSHDPORT')) && VNCONFIG('SSHDPORT') ne "" &&
    !isRoutable(VNCONFIG('CTRLIP'))) {
612
613
    my $sshdport = VNCONFIG('SSHDPORT');
    my $ctrlip   = VNCONFIG('CTRLIP');
614
615

    # Retry a few times cause of iptables locking stupidity.
616
    for (my $i = 0; $i < 10; $i++) {
617
618
619
	system("$IPTABLES -v -t nat -A PREROUTING -p tcp -d $ext_ctrlip ".
	       "--dport $sshdport -j DNAT ".
	       "--to-destination $ctrlip:$sshdport");
620
621
622
623
624
625
626
627
628
	
	if ($? == 0) {
	    my $ref = {};
	    $ref->{'port'}       = $sshdport;
	    $ref->{'ctrlip'}     = $ctrlip;
	    $ref->{'ext_ctrlip'} = $ext_ctrlip;
	    $vnstate->{'sshd_iprule'} = $ref;
	    last;
	}
629
630
	sleep(2);
    }
631
632
}

633
634
635
636
637
638
639
#
# Start the container. If all goes well, this will exit cleanly, with the
# container running in its new context. Still, let's protect it with a timer
# since it might get hung up inside and we do not want to get stuck here.
#
my $childpid = fork();
if (!defined($childpid)) {
    # fork() returns undef on failure. Without this check the parent
    # would fall into the child branch below, boot the container itself
    # and then exit.
    MyFatal("$vnodeid container startup failed: could not fork: $!");
}
if ($childpid) {
    # Parent: wait for the boot child, but give up after 30 seconds.
    my $timedout = 0;
    local $SIG{ALRM} = sub { kill("TERM", $childpid); $timedout = 1; };
    alarm 30;
    waitpid($childpid, 0);
    alarm 0;

    #
    # If failure then cleanup.
    #
    if ($? || $timedout) {
        MyFatal("$vnodeid container startup failed or timed out");
    }
}
else {
    # Child: restore default signal dispositions (TERM in particular, so
    # the parent's timeout can kill us), then boot the container.
    $SIG{TERM} = 'DEFAULT';
    $SIG{INT}  = 'DEFAULT';
    $SIG{USR1} = 'DEFAULT';
    $SIG{USR2} = 'DEFAULT';
    $SIG{HUP}  = 'DEFAULT';

    if (safeLibOp('vnodeBoot', 1, 1)) {
        print STDERR "*** ERROR: vnodeBoot failed\n";
        exit(1);
    }
    exit(0);
}
666
if (safeLibOp('vnodePostConfig', 1, 1)) {
667
668
669
    MyFatal("vnodePostConfig failed");
}
# XXX: need to do this for each type encountered!
670
TBDebugTimeStampWithDate("starting $vmtype rootPostConfig()");
671
$libops{$vmtype}{'rootPostConfig'}->();
672
673
674
675
676
677
TBDebugTimeStampWithDate("finished $vmtype rootPostConfig()");

if ($debug) {
    print "VN State:\n";
    print Dumper($vnstate);
}
678

679
680
681
682
# Store the state to disk.
if (StoreState()) {
    MyFatal("Could not store container state to disk");
}
683
# This is for vnodesetup
684
mysystem("touch $RUNNING_FILE");
685
$running = 1;
686
687

#
688
689
690
691
# This loop is to catch when the container stops. We used to run a sleep
# inside and wait for it to exit, but that is not portable across the
# backends, and the return value did not indicate how it exited. So, lets
# just loop, asking for the status every few seconds. 
692
#
693
694
695
696
# XXX Turn off debugging during this loop to keep the log file from growing.
#
TBDebugTimeStampsOff()
    if ($debug);
697

698
699
700
while (1) {
    sleep(5);
    
701
    #
702
703
704
705
    # If the container exits, either it rebooted from the inside or
    # the physical node is rebooting, or we are actively trying to kill
    # it cause our parent (vnodesetup) told us to. In all cases, we just
    # exit and let the parent decide what to do. 
706
    #
707
708
709
710
711
712
713
714
715
716
717
    my ($ret,$err) = safeLibOp('vnodeState', 0, 0);
    if ($err) {
	fatal("*** ERROR: vnodeState: $err\n");
    }
    if ($ret ne VNODE_STATUS_RUNNING()) {
	print "Container is no longer running.\n";
	# Rebooted from inside, but not cause we told it to, so leave intact.
	$leaveme = $LEAVEME_REBOOT
	    if (!$cleaning);
	last;
    }
718
}
719
720
721
TBDebugTimeStampsOn()
    if ($debug);
exit(CleanupVM());
722
723

#
724
725
726
727
# Teardown a container. This should not be used if the mkvnode process
# is still running; use vnodesetup instead. This is just for the case
# that the manager (vnodesetup,mkvnode) process is gone and the turds
# need to be cleaned up.
728
#
729
730
731
732
733
734
735
#
# Tear down a container whose manager process is gone, using the
# vnode.info and (if present) vnode.state files left in $VNDIR.
#
# Sets the file-level $vmid and $vmtype from vnode.info.
# Returns 0 on success, -1 on failure. Calls fatal() if there is no
# vnode.info file at all.
#
sub TearDownStaleVM()
{
    if (! -e "$VNDIR/vnode.info") {
        fatal("TearDownStaleVM: no vnode.info file for $vnodeid");
    }
    my $str = `cat $VNDIR/vnode.info`;

    #
    # A vnode.info that does not parse must be reported as a failure.
    # Otherwise $vmid ends up undefined, CleanupVM() decides there is
    # nothing to do, and we would claim success without cleaning up.
    #
    if ($str =~ /^(\d*) (\w*) ([-\w]*)$/) {
        ($vmid, $vmtype) = ($1, $2);
    }
    else {
        print STDERR "TearDownStaleVM: could not parse $VNDIR/vnode.info\n";
        return -1;
    }

    #
    # Load the state. Use a local so that we do not overwrite
    # the outer version. Just a precaution.
    #
    # The state might not exist, but we proceed anyway.
    #
    local $vnstate = { "private" => {} };

    if (-e "$VNDIR/vnode.state") {
        $vnstate = eval { Storable::retrieve("$VNDIR/vnode.state"); };
        if ($@) {
            print STDERR "$@";
            return -1;
        }
        # retrieve() can also return undef without dying (I/O error).
        if (!defined($vnstate)) {
            print STDERR "Could not retrieve $VNDIR/vnode.state: $!\n";
            return -1;
        }
        if ($debug) {
            print "vnstate:\n";
            print Dumper($vnstate);
        }
    }

    # No interruptions during stale teardown.
    $SIG{INT}  = 'IGNORE';
    $SIG{USR1} = 'IGNORE';
    $SIG{USR2} = 'IGNORE';
    $SIG{HUP}  = 'IGNORE';

    #
    # If we fail to cleanup, store the state back to disk so that we
    # capture any changes.
    #
    if (CleanupVM()) {
        StoreState();
        return -1;
    }
    $SIG{INT}  = 'DEFAULT';
    $SIG{USR1} = 'DEFAULT';
    $SIG{USR2} = 'DEFAULT';
    $SIG{HUP}  = 'DEFAULT';

    return 0;
}

#
# Clean things up.
#
782
#
# Stop the container and remove what was set up for it.
#
# Steps, in order: remove the sshd DNAT rule recorded in $vnstate, kill
# the tmcc proxy, halt or unmount the container if needed, then either
# tear down transient state (when $leaveme is set) or destroy the
# container and remove its vnode.info / vnode.state / vnode.<vmid> files.
#
# Returns 0 on success (or when there is nothing to do), -1 on failure.
# Dies if called while a cleanup is already in progress.
#
sub CleanupVM()
{
    if ($cleaning) {
        die("*** $0:\n".
            "    Oops, already cleaning!\n");
    }
    $cleaning = 1;

    # If the container was never built, there is nothing to do.
    return 0
        if (! -e "$VNDIR/vnode.info" || !defined($vmid));

    if (exists($vnstate->{'sshd_iprule'})) {
        my $ref = $vnstate->{'sshd_iprule'};
        my $sshdport    = $ref->{'port'};
        my $ctrlip      = $ref->{'ctrlip'};
        my $ext_ctrlip  = $ref->{'ext_ctrlip'};

        # Retry a few times cause of iptables locking stupidity.
        for (my $i = 0; $i < 10; $i++) {
            system("$IPTABLES -v -t nat -D PREROUTING -p tcp -d $ext_ctrlip ".
                   "--dport $sshdport -j DNAT ".
                   "--to-destination $ctrlip:$sshdport");
            last
                if ($? == 0);
            sleep(2);
        }
        # Update new state.
        delete($vnstate->{'sshd_iprule'});
        StoreState();
    }

    #
    # The tmcc proxy causes teardown problems, no idea why.
    # It used to be killed off from the unmount script, but lets
    # do it here.
    #
    my $PROXYPID = "/var/run/tmccproxy.${vnodeid}.pid";
    if (-e $PROXYPID) {
        my $ppid = `cat $PROXYPID`;
        chomp($ppid);
        #
        # Untaint, and insist on a positive integer. An empty or garbage
        # pid file would otherwise numify to 0, and kill() with pid 0
        # signals every process in our own process group.
        #
        if ($ppid =~ /^\s*(\d+)\s*$/ && $1 > 0) {
            $ppid = $1;
            if (kill('TERM', $ppid) == 0) {
                print"*** ERROR: Could not kill(TERM) proxy process $ppid: $!\n";
            }
            else {
                unlink($PROXYPID);
            }
        }
        else {
            print "*** ERROR: Bad pid '$ppid' in $PROXYPID; ".
                "not killing proxy\n";
        }
    }

    # if not halted, try that first
    my ($ret,$err) = safeLibOp('vnodeState', 1, 0);
    if ($err) {
        print STDERR "*** ERROR: vnodeState: ".
            "failed to cleanup $vnodeid: $err\n";
        return -1;
    }
    if ($ret eq VNODE_STATUS_RUNNING()) {
        print STDERR "cleanup: $vnodeid not stopped, trying to halt it.\n";
        ($ret,$err) = safeLibOp('vnodeHalt', 1, 1);
        if ($err) {
            print STDERR "*** ERROR: vnodeHalt: ".
                "failed to halt $vnodeid: $err\n";
            return -1;
        }
    }
    elsif ($ret eq VNODE_STATUS_MOUNTED()) {
        print STDERR "cleanup: $vnodeid is mounted, trying to unmount it.\n";
        ($ret,$err) = safeLibOp('vnodeUnmount', 1, 1);
        if ($err) {
            print STDERR "*** ERROR: vnodeUnmount: ".
                "failed to unmount $vnodeid: $err\n";
            return -1;
        }
    }
    if ($leaveme) {
        if ($leaveme == $LEAVEME_HALT || $leaveme == $LEAVEME_REBOOT) {
            #
            # When halting, the disk state is left, but the transient state
            # is removed since it will get reconstructed later if the vnode
            # is restarted. This avoids leaking a bunch of stuff in case the
            # vnode never starts up again. We of course leave the disk, but
            # that will eventually get cleaned up if the pcvm is reused for
            # a future experiment.
            #
            # XXX Reboot should be different; there is no reason to tear
            # down the transient state, but we do not handle that yet.
            # Not hard to add though.
            #
            ($ret,$err) = safeLibOp('vnodeTearDown', 1, 1);
            # Always store in case some progress was made.
            StoreState();
            if ($err) {
                print STDERR "*** ERROR: failed to teardown $vnodeid: $err\n";
                return -1;
            }
        }
        return 0;
    }

    # now destroy
    ($ret,$err) = safeLibOp('vnodeDestroy', 1, 1);
    if ($err) {
        print STDERR "*** ERROR: failed to destroy $vnodeid: $err\n";
        return -1;
    }
    unlink("$VNDIR/vnode.info");
    unlink("$VNDIR/vnode.state");
    unlink("$VMPATH/vnode.$vmid");
    # Only a complete destroy clears the flag.
    $cleaning = 0;
    return 0;
}
    
#
# Print error and exit.
#
#
# Clean up the container, then die with the given message.
#
sub MyFatal($)
{
    my ($msg) = @_;

    #
    # A container that was being rebooted but never got as far as
    # running must not be destroyed; that might lose user data.
    #
    if ($rebooting && !$running) {
        $leaveme = $LEAVEME_REBOOT;
    }

    # Timestamps may have been switched off for the status loop.
    if ($debug) {
        TBDebugTimeStampsOn();
    }

    CleanupVM();
    die("*** $0:\n    $msg\n");
}

#
# Helpers:
#
923
924
#
# Call one operation from the backend library for $vmtype, with
# HUP/INT/USR1/USR2 blocked for the duration of the call.
#
#   $op      - name of the operation in the library's ops table.
#   $autolog - accepted, but currently has no effect.
#   $autoerr - when true, a true return value from the operation is
#              treated as a failure.
#   @args    - extra arguments passed through to the operation.
#
# Returns (-1, $error) if the operation died, ($ret, $error) if
# $autoerr is set and the operation returned a true value, and just
# $ret otherwise.
#
sub safeLibOp($$$;@) {
    my ($op, $autolog, $autoerr, @args) = @_;

    # Printable form of the extra arguments, for the debug timestamps.
    my $sargs = (@args > 0 ? join(',', @args) : '');

    if ($debug) {
        TBDebugTimeStampWithDate("starting $vmtype $op($sargs)");
    }

    #
    # Hold off the signals that would otherwise send us into teardown
    # in the middle of a library call, so that the state is consistent
    # if we do have to tear down part way through setup.
    #
    my $new_sigset = POSIX::SigSet->new(SIGHUP, SIGINT, SIGUSR1, SIGUSR2);
    my $old_sigset = POSIX::SigSet->new;
    print STDERR "sigprocmask (BLOCK) failed!\n"
        unless (defined(sigprocmask(SIG_BLOCK, $new_sigset, $old_sigset)));

    my $ret = eval {
        $libops{$vmtype}{$op}->($vnodeid, $vmid,
                                \%vnconfig, $vnstate->{'private'}, @args);
    };
    # Grab the error before anything else can overwrite $@.
    my $err = $@;

    print STDERR "sigprocmask (UNBLOCK) failed!\n"
        unless (defined(sigprocmask(SIG_SETMASK, $old_sigset)));

    # The operation died.
    if ($err) {
        if ($debug) {
            TBDebugTimeStampWithDate("failed $vmtype $op($sargs): $err");
        }
        return (-1,$err);
    }

    # The operation returned a failure code and the caller asked us to
    # turn that into an error.
    if ($autoerr && $ret) {
        $err = "$op($vnodeid) failed with exit code $ret!";
        if ($debug) {
            TBDebugTimeStampWithDate("failed $vmtype $op($sargs): exited with $ret");
        }
        return ($ret,$err);
    }

    if ($debug) {
        TBDebugTimeStampWithDate("finished $vmtype $op($sargs)");
    }
    return $ret;
}
975
976
977
978
979
980
981
982
983
984
985
986
987
988

#
# Write $vnstate to $VNDIR/vnode.state with Storable.
#
# Returns 0 on success, -1 on failure (after printing the reason to
# STDERR).
#
sub StoreState()
{
    # Store the state to disk.
    print "Storing state to disk ...\n"
        if ($debug);

    my $stored = eval { Storable::store($vnstate, "$VNDIR/vnode.state"); };
    if ($@) {
        print STDERR "$@";
        return -1;
    }
    #
    # store() reports I/O problems by returning undef rather than by
    # dying, so the return value has to be checked as well as $@.
    #
    if (!$stored) {
        print STDERR "Could not store state to $VNDIR/vnode.state: $!\n";
        return -1;
    }
    return 0;
}