mkvnode.pl 26 KB
Newer Older
1
2
#!/usr/bin/perl -w
#
Leigh B Stoller's avatar
Leigh B Stoller committed
3
# Copyright (c) 2009-2012 University of Utah and the Flux Group.
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# 
# {{{EMULAB-LICENSE
# 
# This file is part of the Emulab network testbed software.
# 
# This file is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
# 
# This file is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public
# License for more details.
# 
# You should have received a copy of the GNU Affero General Public License
# along with this file.  If not, see <http://www.gnu.org/licenses/>.
# 
# }}}
23
24
25
26
27
28
#
use strict;
use Getopt::Std;
use English;
use Errno;
use POSIX qw(strftime);
29
30
use POSIX qw(:sys_wait_h);
use POSIX qw(:signal_h);
31
use Data::Dumper;
32
33
use Storable;
use vars qw($vnstate);
34
35
36
37
38
39

#
# The corollary to mkjail.pl in the freebsd directory ...
#
sub usage()
{
    #
    # Print the synopsis plus a one-line summary of every option accepted
    # by $optlist, then exit with status 1. Never returns.
    #
    print "Usage: mkvnode [-d] [-c] [-s] vnodeid\n" .
          "  -d   Debug mode.\n" .
          "  -c   Cleanup stale container\n" .
          "  -s   Show state for container\n";
    exit(1);
}
47
my $optlist  = "dcs";
48
my $debug    = 1;
49
50
my $cleanup  = 0;
my $showstate= 0;
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
my $vnodeid;

#
# Turn off line buffering on output
#
$| = 1;

# Drag in path stuff so we can find emulab stuff.
BEGIN { require "/etc/emulab/paths.pm"; import emulabpaths; }

#
# Load the OS independent support library. It will load the OS dependent
# library and initialize itself. 
# 
use libsetup;
use libtmcc;
use libtestbed;
68
use liblocsetup;
69
70
71
72
73
74
    
# Pull in libvnode
use libvnode;

# Helpers
sub MyFatal($);
75
76
77
78
sub safeLibOp($$$;@);
sub CleanupVM();
sub TearDownStaleVM();
sub StoreState();
79
80
81

# Locals
my $CTRLIPFILE = "/var/emulab/boot/myip";
82
my $VMPATH     = "/var/emulab/vms/vminfo";
83
my $IPTABLES   = "/sbin/iptables";
84
85
86
87
88
89
90
91
92
93
94
95
my $VNDIR;
my $leaveme    = 0;
my $running    = 0;
my $cleaning   = 0;
my $rebooting  = 0;
my $reload     = 0;
my ($vmid,$vmtype,$ret,$err);

# Flags for leaveme.
my $LEAVEME_REBOOT = 0x1;
my $LEAVEME_HALT   = 0x2;

96
97
98
99
100
101
102
103
#
# Parse command arguments. Once we return from getopts, all that should be
# left are the required arguments.
#
my %options = ();
if (! getopts($optlist, \%options)) {
    usage();
}
104
105
106
107
108
109
110
111
112
if (defined($options{"d"})) {
    $debug = 1;
}
if (defined($options{"c"})) {
    $cleanup = 1;
}
if (defined($options{"s"})) {
    $showstate = 1;
    $debug   = 0;
113
114
115
}
usage()
    if (@ARGV != 1);
116

117
$vnodeid = $ARGV[0];
118
$VNDIR   = "$VMPATH/$vnodeid";
119
120
121
122
123
124
125
126
127
128
129
130

#
# Must be root.
# 
if ($UID != 0) {
    die("*** $0:\n".
	"    Must be root to run this script!\n");
}

# Tell the library what vnode we are messing with.
libsetup_setvnodeid($vnodeid);

131
132
133
# Can set this after above line. 
my $RUNNING_FILE = CONFDIR() . "/running";

134
135
136
137
138
139
140
#
# Turn on debug timestamps if desired.
#
if ($debug) {
    TBDebugTimeStampsOn();
}

141
142
143
144
145
146
147
148
#
# Remove old state files at boot.
#
if (! -e "/var/run/mkvnode.ready") {
    system("rm -f $VARDIR/vms/*/vnode.state");
    system("touch /var/run/mkvnode.ready");
}

149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
#
# XXX: for now, support only a single vnode type per phys node.  This is bad,
# but it's the current assumption.  For now, we also assume the nodetype since
# we only have pcvm.  Later, we need to get this info from tmcd so we know
# which lib to load.
#
my @nodetypes = ( GENVNODETYPE() );

#
# We go through this crap so that we can pull in multiple packages implementing
# the libvnode API so they (hopefully) won't step on our namespace too much.
#
my %libops = ();
foreach my $type (@nodetypes) {
    #
    # The type name is spliced into a string eval below, so insist that it
    # is a plain identifier-like token (this also untaints it). Anything
    # else is refused rather than handed to eval.
    #
    if ($type =~ /^([\w\d\-]+)$/) {
        $type = $1;
    }
    else {
        die "Bad vnode type '$type'; refusing to load a library for it";
    }

    # Load the per-type library and take a copy of its ops table.
    my %ops;
    eval "use libvnode_$type; %ops = %libvnode_${type}::ops";
    if ($@) {
        die "while trying to load 'libvnode_$type': $@";
    }
    $libops{$type} = \%ops;

    # Propagate our debug setting into the library, then initialize it.
    if ($debug) {
        $libops{$type}{'setDebug'}->(1);
    }
    $libops{$type}{'init'}->();

    #
    # Need to do this for each type encountered, so dispatch through the
    # type being processed in this iteration.
    #
    TBDebugTimeStampWithDate("starting $type rootPreConfig()");
    $libops{$type}{'rootPreConfig'}->();
    TBDebugTimeStampWithDate("finished $type rootPreConfig()");
}
if ($debug) {
    print "GENVNODETYPE " . GENVNODETYPE() . "\n";
    print "libops:\n" . Dumper(%libops);
}

#
# Need the domain, but no consistent way to do it. Ask tmcc for the
# boss node and parse out the domain. 
#
195
my ($DOMAINNAME,$BOSSIP) = tmccbossinfo();
196
197
198
199
200
201
202
203
204
205
die("Could not get bossname from tmcc!")
    if (!defined($DOMAINNAME));

if ($DOMAINNAME =~ /^[-\w]+\.(.*)$/) {
    $DOMAINNAME = $1;
}
else {
    die("Could not parse domain name!");
}
if ($BOSSIP !~ /^\d+\.\d+\.\d+\.\d+$/) {
206
    die "Bad bossip '$BOSSIP' from bossinfo!";
207
208
}

209
210
211
212
213
214
215
216
217
218
#
# This holds the container state set up by the library. There is state
# added here, and state added in the library ("private"). We locally
# redefine this below, so cannot be a lexical.
#
# NOTE: There should be NO state in here that needs to survive reboot.
#       We just remove them all when rebooting. See above.
#
$vnstate = { "private" => {} };

219
220
221
222
223
224
225
226
227
228
#
# Quickie way to show the state.
#
if ($showstate) {
    if (! -e "$VNDIR/vnode.info") {
	fatal("No vnode.info file for $vnodeid");
    }
    if (! -e "$VNDIR/vnode.state") {
	fatal("no vnode.state file for $vnodeid");
    }
229
230
231
    my $str = `cat $VNDIR/vnode.info`;
    ($vmid, $vmtype, undef) = ($str =~ /^(\d*) (\w*) ([-\w]*)$/);
    
232
233
234
235
236
    my $tmp = eval { Storable::retrieve("$VNDIR/vnode.state"); };
    if ($@) {
	fatal("$@");
    }
    print Dumper($tmp);
237
238
239
240
241
242
243
244
245
246
247
248

    # So the lib op works.
    $vnstate = $tmp;

    ($ret,$err) = safeLibOp('vnodeState', 1, 0);
    if ($err) {
	fatal("Failed to get status for existing container: $err");
    }
    if ($ret eq VNODE_STATUS_UNKNOWN()) {
	print "Cannot determine status container $vmid.\n";
    }
    print "Domain is $ret\n";
249
250
251
    exit(0);
}

252
253
254
255
#
# In most cases, the vnodeid directory will have been created by the
# caller, and a config file possibly dropped in.  When debugging, we
# have to create it here.
256
257
258
259
260
#
if (! -e $VMPATH) {
    mkdir($VMPATH, 0770) or
	fatal("Could not mkdir $VMPATH: $!");
}
261
262
263
264
265
266
267
chdir($VMPATH) or
    die("Could not chdir to $VMPATH: $!\n");

if (! -e $vnodeid) {
    mkdir($vnodeid, 0770) or
	fatal("Could not mkdir $vnodeid in $VMPATH: $!");
}
268
269
270
271
272
273
274
275
#
# The container description for the library routines. 
#
my %vnconfig = ( "vnodeid"   => $vnodeid,
                 "config"    => undef,
		 "ifconfig"  => undef,
		 "ldconfig"  => undef,
		 "tunconfig" => undef,
276
		 "attributes"=> undef,
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
);
sub VNCONFIG($) { return $vnconfig{'config'}->{$_[0]}; }

#
# If cleanup requested, make sure the manager process is not running
# Must do this after the stuff above is defined.
#
if ($cleanup) {
    # This path is in vnodesetup. 
    my $pidfile = "/var/run/tbvnode-${vnodeid}.pid";
    if (-e $pidfile) {
	print STDERR "Manager process still running. Use that instead.\n";
	print STDERR "If the manager is really dead, first rm $pidfile.\n";
	exit(1);
    }
    exit(TearDownStaleVM());
}

#
# Now we can start doing something useful.
#
298
my ($pid, $eid, $vname) = check_nickname();
299
300
my $nodeuuid = getnodeuuid();
$nodeuuid = $vnodeid if (!defined($nodeuuid));
301

302
303
304
305
306
307
#
# Get all the config stuff we need.
#
my %tmp;
my @tmp;
my $tmp;
308
my %attrs;
309

310
311
312
fatal("Could not get vnode config for $vnodeid")
    if (getgenvnodeconfig(\%tmp));
$vnconfig{"config"} = \%tmp;
313
314

fatal("getifconfig($vnodeid): $!")
315
316
    if (getifconfig(\@tmp));
$vnconfig{"ifconfig"} = [ @tmp ];
317
318

fatal("getlinkdelayconfig($vnodeid): $!") 
319
320
    if (getlinkdelayconfig(\@tmp));
$vnconfig{"ldconfig"} = [ @tmp ];
321

Leigh B. Stoller's avatar
Leigh B. Stoller committed
322
fatal("gettunnelconfig($vnodeid): $!")
323
324
    if (gettunnelconfig(\$tmp));
$vnconfig{"tunconfig"} = $tmp;
Leigh B. Stoller's avatar
Leigh B. Stoller committed
325

326
327
328
329
fatal("getnodeattributes($vnodeid): $!")
    if (getnodeattributes(\%attrs));
$vnconfig{"attributes"} = \%attrs;

330
if ($debug) {
331
332
    print "VN Config:\n";
    print Dumper(\%vnconfig);
333
334
}

335
336
337
338
339
#
# see if we 1) are supposed to be "booting" into the reload mfs, and 2) if
# we have loadinfo.  Need both to reload!
#
fatal("getbootwhat($vnodeid): $!") 
340
    if (getbootwhat(\@tmp));
341
342
if (scalar(@tmp) && exists($tmp[0]->{"WHAT"})) {
    if ($tmp[0]->{"WHAT"} =~ /frisbee-pcvm/) {
343
344
345
346
	#
	# Ok, we're reloading, using the fake frisbee pcvm mfs.
	#
	$reload = 1;
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
	
	fatal("getloadinfo($vnodeid): $!") 
	    if (getloadinfo(\@tmp));
	if (!scalar(@tmp)) {
	    fatal("vnode $vnodeid in reloading, but got no loadinfo!");
	}
	else {
	    if ($tmp[0]->{"IMAGEID"} =~ /^([-\d\w]+),([-\d\w]+),([-\d\w]+)$/) {
		$vnconfig{"reloadinfo"} = $tmp[0];
		$vnconfig{"image"}      = "$1-$2-$3";
	    }
	    else {
		fatal("vnode $vnodeid in reloading, but got bogus IMAGEID " . 
		      $tmp[0]->{"IMAGEID"} . " from loadinfo!");
	    }
	}
    }
364
365
366
367
368
369
370
371
372
373
374
375
376
377
    elsif ($tmp[0]->{"WHAT"} =~ /^\d*$/) {
	#
	# We are using bootwhat for a much different purpose than intended.
	# It tells us a partition number, but that is meaningless. Look at
	# the jailconfig to see what image should boot. That image better
	# be resident already. 
	#
	if (VNCONFIG('IMAGENAME') =~ /^([-\w]+),([-\w]+),([-\w]+)$/) {
	    $vnconfig{"image"}      = "$1-$2-$3";
	}
    }
    else {
	# The library will boot the default, whatever that is.
    }
378
379
380
381
382
383
384
385
}

#
# Install a signal handler. We can get signals from vnodesetup.
#
sub handler ($) {
    my ($signame) = @_;

    #
    # Handler for the signals installed on this process (USR1, USR2, HUP
    # and INT). Records how the container should be left in $leaveme and
    # then bails out through MyFatal(), so this does not return normally.
    #
    print STDERR "mkvnode ($PID) caught a SIG${signame}!\n";

    # No more interruptions during teardown.
    foreach my $sig ('INT', 'USR1', 'USR2', 'HUP') {
        $SIG{$sig} = 'IGNORE';
    }

    #
    # USR1 means halt and USR2 means reboot; any other signal leaves
    # $leaveme alone and is reported as a plain kill.
    #
    my %action_for = (
        'USR1' => [ $LEAVEME_HALT,   "halted"   ],
        'USR2' => [ $LEAVEME_REBOOT, "rebooted" ],
    );
    my $str = "killed";
    if (exists($action_for{$signame})) {
        ($leaveme, $str) = @{ $action_for{$signame} };
    }

    #
    # XXX this is a woeful hack for vnodesetup.  At the end of rebootvnode,
    # vnodesetup calls hackwaitandexit which essentially waits for a vnode
    # to be well on the way back up before it returns.  That call was
    # apparently added for the lighter-weight "reconfigure a vnode" (as
    # opposed to reboot it) path, but it makes the semantics of reboot on
    # a vnode different than on a pnode, where reboot returns as soon as
    # the node stops responding (i.e., when it goes down and not when it
    # comes back up).  This matters because Xen vnodes cannot always
    # "reboot" under the current semantics in less than 30 seconds, which
    # is the timeout in libreboot.
    #
    # Touching the "running" file here forces hackwaitandexit to return
    # when the vnode is shutdown in Xen (or OpenVZ), more closely matching
    # the pnode semantics while leaving the BSD jail case (which does not
    # use this code) alone.  This obviously needs to be revisited.
    #
    if ($leaveme && -e "$RUNNING_FILE") {
        mysystem("touch $RUNNING_FILE");
    }

    print STDERR "Container is being $str\n";
    MyFatal("Container has been $str by $signame");
}

428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
#
# If this file exists, we are rebooting an existing container. But
# need to check if its a stale or aborted container (one that failed
# to setup or teardown) and got left behind. Another wrinkle is shared
# nodes, so we use the node uuid to determine if its another logical
# pcvm with the same name, and needs to be destroyed before setting up.
#
if (-e "$VNDIR/vnode.info") {
    my $uuid;
    my $teardown = 0;

    my $str = `cat $VNDIR/vnode.info`;
    ($vmid, $vmtype, $uuid) = ($str =~ /^(\d*) (\w*) ([-\w]*)$/);

    # Consistency check.
    fatal("No matching file: $VMPATH/vnode.$vmid")
	if (! -e "$VMPATH/vnode.$vmid");
    $str = `cat $VMPATH/vnode.$vmid`;
    chomp($str);
    if ($str ne $vnodeid) {
	fatal("Inconsistent vnodeid in $VMPATH/vnode.$vmid");
    }

    if ($uuid ne $nodeuuid) {
	print "UUID mismatch; tearing down stale vnode $vnodeid\n";
	$teardown = 1;
    }
    elsif ($reload) {
	print "Reload requested, tearing down old vnode\n";
	$teardown = 1;
458
459
    }
    else {
460
461
462
463
464
465
466
467
468
469
470
471
472
	# We (might) need this to discover the state. 
	local $vnstate = { "private" => {} };
	
	if (-e "$VNDIR/vnode.state") {
	    my $tmp = eval { Storable::retrieve("$VNDIR/vnode.state"); };
	    if ($@) {
		print STDERR "$@";
		$teardown = 1;
	    }
	    else {
		$vnstate->{'private'} = $tmp->{'private'};
	    }
	}
473
474
475
	($ret,$err) = safeLibOp('vnodeState', 1, 0);
	if ($err) {
	    fatal("Failed to get status for existing container: $err");
476
	}
477
478
479
480
481
482
	if ($ret eq VNODE_STATUS_UNKNOWN()) {
	    print "Cannot determine status container $vmid. Deleting ...\n";
	    $teardown = 1;
	}
	elsif ($ret ne VNODE_STATUS_STOPPED()) {
	    fatal("vnode $vnodeid not stopped, not booting!");
483
484
	}
    }
485
486
487
488
489
490
491
    if ($teardown) {
	TearDownStaleVM() == 0
	    or fatal("Could not tear down stale container");
    }
    else {
	$rebooting = 1;
    }
492
493
}

Leigh B Stoller's avatar
Leigh B Stoller committed
494
495
496
#
# Install handlers *after* stale container teardown, since we set
# them to IGNORE during the teardown.
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
# 
# Ignore TERM since we want our caller to catch it first and then send
# it down to us. 
#
$SIG{TERM} = 'IGNORE';
# Halt container and exit. Tear down transient state, leave disk.
$SIG{USR1} = \&handler;
# Halt container and exit. Leave all state intact (we are rebooting).
$SIG{USR2} = \&handler;
# Halt container and exit. Tear down all state including disk.
$SIG{HUP}  = \&handler;
$SIG{INT}  = \&handler;

#
# Initial pre config for the experimental network. We want to make sure
# we can allocate the required devices and whatever else before going
# any further. 
514
#
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
TBDebugTimeStampWithDate("starting rootPreConfigNetwork()");
$ret = eval {
    $libops{GENVNODETYPE()}{'rootPreConfigNetwork'}->($vnodeid, undef,
	\%vnconfig, $vnstate->{'private'});
};
if ($ret || $@) {
    print STDERR $@
	if ($@);
    
    # If this fails, we require the library to clean up after itself
    # so that we can just exit without worrying about cleanup.
    fatal("rootPreConfigNetwork failed!");
}
TBDebugTimeStampWithDate("finished rootPreConfigNetwork()");

530
531
532
533
534
535
536
if (! -e "$VNDIR/vnode.info") {
    #
    # XXX XXX XXX: need to get this from tmcd!
    # NOTE: we first put the type into vndb so that the create call can go!
    #
    $vmtype = GENVNODETYPE();

Leigh B Stoller's avatar
Leigh B Stoller committed
537
    ($ret,$err) = safeLibOp('vnodeCreate',0,0);
538
539
540
541
542
    if ($err) {
	MyFatal("vnodeCreate failed");
    }
    $vmid = $ret;

543
    mysystem("echo '$vmid $vmtype $nodeuuid' > $VNDIR/vnode.info");
544
    mysystem("echo '$vnodeid' > $VMPATH/vnode.$vmid");
545
546
547

    # bootvnodes wants this to be here...
    mysystem("mkdir -p /var/emulab/jails/$vnodeid");
548
}
549
550
551
552
553
554
555
# This state structure is saved to disk for TearDown.
$vnstate->{"vmid"}   = $vmid;
$vnstate->{"vmtype"} = $vmtype;
$vnstate->{"uuid"}   = $nodeuuid;
# Store the state to disk.
if (StoreState()) {
    MyFatal("Could not store container state to disk");
556
557
}

558
559
my $cnet_mac = (defined(VNCONFIG('CTRLMAC')) ?
		VNCONFIG('CTRLMAC') : ipToMac(VNCONFIG('CTRLIP')));
560
561
562
563
564
565
566
567
568
my $ext_ctrlip = `cat $CTRLIPFILE`;
chomp($ext_ctrlip);
if ($ext_ctrlip !~ /^(\d+)\.(\d+)\.(\d+)\.(\d+)$/) {
    # cannot/should not really go on if this happens.
    MyFatal("error prior to vnodePreConfigControlNetwork($vnodeid): " . 
	    " could not find valid ip in $CTRLIPFILE!");
}
my $longdomain = "${eid}.${pid}.${DOMAINNAME}";

569
570
571
572
573
574
575
#
# Call back to do things to the container before it boots.
#
sub callback($)
{
    my ($path) = @_;

    #
    # Called back with the path of the container filesystem so we can
    # adjust files inside it before it boots. Always returns 0.
    #
    my $sshd_config = "$path/etc/ssh/sshd_config";

    #
    # Set up sshd port to listen on. If the vnode has its own IP
    # then listen on both 22 and the per-vnode port.
    #
    # Only do this once per container: skip it when our marker line is
    # already in the container's sshd_config. The path has to be
    # interpolated by Perl (a single-quoted command would leave "$path"
    # for the shell, which would then look at the host's own /etc/ssh
    # file), so run grep in list form with the full path as an argument.
    #
    if (system("grep", "-q", "-e", "EmulabJail", $sshd_config)) {
        if (defined(VNCONFIG('SSHDPORT')) && VNCONFIG('SSHDPORT') ne "") {
            my $sshdport = VNCONFIG('SSHDPORT');

            mysystem2("echo '# EmulabJail' >> $path/etc/ssh/sshd_config");
            mysystem2("echo 'Port $sshdport' >> $path/etc/ssh/sshd_config");
            if (VNCONFIG('CTRLIP') ne $ext_ctrlip) {
                mysystem2("echo 'Port 22' >> $path/etc/ssh/sshd_config");
            }
        }
    }

    # Localize the timezone.
    mysystem2("cp -fp /etc/localtime $path/etc");

    return 0;
}

597
# OP: preconfig
598
if (safeLibOp('vnodePreConfig', 1, 1, \&callback)) {
599
600
601
602
    MyFatal("vnodePreConfig failed");
}

# OP: control net preconfig
603
604
605
if (safeLibOp('vnodePreConfigControlNetwork',1,1,
	      VNCONFIG('CTRLIP'),
	      VNCONFIG('CTRLMASK'),$cnet_mac,
606
607
608
609
610
	      $ext_ctrlip,$vname,$longdomain,$DOMAINNAME,$BOSSIP)) {
    MyFatal("vnodePreConfigControlNetwork failed");
}

# OP: exp net preconfig
611
if (safeLibOp('vnodePreConfigExpNetwork', 1, 1)) {
612
613
    MyFatal("vnodePreConfigExpNetwork failed");
}
614
if (safeLibOp('vnodeConfigResources', 1, 1)) {
615
616
    MyFatal("vnodeConfigResources failed");
}
617
if (safeLibOp('vnodeConfigDevices', 1, 1)) {
618
619
620
    MyFatal("vnodeConfigDevices failed");
}

621
#
622
# Route to inner ssh, but not if the IP is routable, no need to.
623
#
624
625
if (defined(VNCONFIG('SSHDPORT')) && VNCONFIG('SSHDPORT') ne "" &&
    !isRoutable(VNCONFIG('CTRLIP'))) {
626
627
    my $sshdport = VNCONFIG('SSHDPORT');
    my $ctrlip   = VNCONFIG('CTRLIP');
628
629

    # Retry a few times cause of iptables locking stupidity.
630
    for (my $i = 0; $i < 10; $i++) {
631
632
633
	system("$IPTABLES -v -t nat -A PREROUTING -p tcp -d $ext_ctrlip ".
	       "--dport $sshdport -j DNAT ".
	       "--to-destination $ctrlip:$sshdport");
634
635
636
637
638
639
640
641
642
	
	if ($? == 0) {
	    my $ref = {};
	    $ref->{'port'}       = $sshdport;
	    $ref->{'ctrlip'}     = $ctrlip;
	    $ref->{'ext_ctrlip'} = $ext_ctrlip;
	    $vnstate->{'sshd_iprule'} = $ref;
	    last;
	}
643
644
	sleep(2);
    }
645
646
}

647
648
649
650
651
652
653
#
# Start the container. If all goes well, this will exit cleanly, with the
# container running in its new context. Still, let's protect it with a timer
# since it might get hung up inside and we do not want to get stuck here.
#
my $childpid = fork();
if (!defined($childpid)) {
    #
    # fork() returns undef on failure. Without this check the parent would
    # fall into the child branch below, boot the container itself and then
    # exit instead of going on to monitor it.
    #
    MyFatal("$vnodeid: could not fork to boot container: $!");
}
if ($childpid) {
    # Parent: wait for the boot, but give up on it after 30 seconds.
    my $timedout = 0;
    local $SIG{ALRM} = sub { kill("TERM", $childpid); $timedout = 1; };
    alarm 30;
    waitpid($childpid, 0);
    alarm 0;

    #
    # If failure then cleanup.
    #
    if ($? || $timedout) {
        MyFatal("$vnodeid container startup failed or timed out");
    }
}
else {
    #
    # Child: restore default signal dispositions (the parent ignores TERM
    # and catches the rest) so the timeout kill above can take effect.
    #
    $SIG{TERM} = 'DEFAULT';
    $SIG{INT}  = 'DEFAULT';
    $SIG{USR1} = 'DEFAULT';
    $SIG{USR2} = 'DEFAULT';
    $SIG{HUP}  = 'DEFAULT';

    if (safeLibOp('vnodeBoot', 1, 1)) {
        print STDERR "*** ERROR: vnodeBoot failed\n";
        exit(1);
    }
    exit(0);
}
680
if (safeLibOp('vnodePostConfig', 1, 1)) {
681
682
683
    MyFatal("vnodePostConfig failed");
}
# XXX: need to do this for each type encountered!
684
TBDebugTimeStampWithDate("starting $vmtype rootPostConfig()");
685
$libops{$vmtype}{'rootPostConfig'}->();
686
687
688
689
690
691
TBDebugTimeStampWithDate("finished $vmtype rootPostConfig()");

if ($debug) {
    print "VN State:\n";
    print Dumper($vnstate);
}
692

693
694
695
696
# Store the state to disk.
if (StoreState()) {
    MyFatal("Could not store container state to disk");
}
697
# This is for vnodesetup
698
mysystem("touch $RUNNING_FILE");
699
$running = 1;
700
701

#
702
703
704
705
# This loop is to catch when the container stops. We used to run a sleep
# inside and wait for it to exit, but that is not portable across the
# backends, and the return value did not indicate how it exited. So, lets
# just loop, asking for the status every few seconds. 
706
#
707
708
709
710
# XXX Turn off debugging during this loop to keep the log file from growing.
#
TBDebugTimeStampsOff()
    if ($debug);
711

712
713
714
while (1) {
    sleep(5);
    
715
    #
716
717
718
719
    # If the container exits, either it rebooted from the inside or
    # the physical node is rebooting, or we are actively trying to kill
    # it cause our parent (vnodesetup) told us to. In all cases, we just
    # exit and let the parent decide what to do. 
720
    #
721
722
723
724
725
726
727
728
729
730
731
    my ($ret,$err) = safeLibOp('vnodeState', 0, 0);
    if ($err) {
	fatal("*** ERROR: vnodeState: $err\n");
    }
    if ($ret ne VNODE_STATUS_RUNNING()) {
	print "Container is no longer running.\n";
	# Rebooted from inside, but not cause we told it to, so leave intact.
	$leaveme = $LEAVEME_REBOOT
	    if (!$cleaning);
	last;
    }
732
}
733
734
735
TBDebugTimeStampsOn()
    if ($debug);
exit(CleanupVM());
736
737

#
738
739
740
741
# Teardown a container. This should not be used if the mkvnode process
# is still running; use vnodesetup instead. This is just for the case
# that the manager (vnodesetup,mkvnode) process is gone and the turds
# need to be cleaned up.
742
#
743
744
745
746
747
748
749
sub TearDownStaleVM()
{
    #
    # Rebuild $vmid/$vmtype and the saved state from the files under
    # $VNDIR, then run CleanupVM() with INT/USR1/USR2/HUP ignored.
    # Returns 0 on success and -1 on failure.
    #
    fatal("TearDownStaleVM: no vnode.info file for $vnodeid")
        if (! -e "$VNDIR/vnode.info");

    my $info = `cat $VNDIR/vnode.info`;
    ($vmid, $vmtype, undef) = ($info =~ /^(\d*) (\w*) ([-\w]*)$/);

    #
    # Load the state. Use a local so that we do not overwrite
    # the outer version. Just a precaution.
    #
    # The state might not exist, but we proceed anyway.
    #
    local $vnstate = { "private" => {} };

    if (-e "$VNDIR/vnode.state") {
        $vnstate = eval { Storable::retrieve("$VNDIR/vnode.state"); };
        if ($@) {
            print STDERR "$@";
            return -1;
        }
        if ($debug) {
            print "vnstate:\n";
            print Dumper($vnstate);
        }
    }

    my @guarded = ('INT', 'USR1', 'USR2', 'HUP');

    # No interruptions during stale teardown.
    foreach my $sig (@guarded) {
        $SIG{$sig} = 'IGNORE';
    }

    #
    # If we fail to cleanup, store the state back to disk so that we
    # capture any changes.
    #
    if (CleanupVM()) {
        StoreState();
        return -1;
    }

    # Teardown worked; put the default dispositions back.
    foreach my $sig (@guarded) {
        $SIG{$sig} = 'DEFAULT';
    }

    return 0;
}

#
# Clean things up.
#
796
sub CleanupVM()
{
    #
    # Undo what was set up for the container: remove the sshd DNAT rule,
    # kill the tmcc proxy, stop the container if needed, and then either
    # tear down transient state ($leaveme set) or destroy the container
    # and its bookkeeping files. Returns 0 on success and -1 on failure.
    # Dies if called while a cleanup is already in progress.
    #
    if ($cleaning) {
        die("*** $0:\n".
            "    Oops, already cleaning!\n");
    }
    $cleaning = 1;

    #
    # If the container was never built, there is nothing to do. Clear the
    # in-progress flag on the way out; otherwise a later call (say from
    # MyFatal) would die with "already cleaning" instead of cleaning up.
    #
    if (! -e "$VNDIR/vnode.info" || !defined($vmid)) {
        $cleaning = 0;
        return 0;
    }

    if (exists($vnstate->{'sshd_iprule'})) {
        my $ref = $vnstate->{'sshd_iprule'};
        my $sshdport    = $ref->{'port'};
        my $ctrlip      = $ref->{'ctrlip'};
        my $ext_ctrlip  = $ref->{'ext_ctrlip'};

        # Retry a few times cause of iptables locking stupidity.
        for (my $i = 0; $i < 10; $i++) {
            system("$IPTABLES -v -t nat -D PREROUTING -p tcp -d $ext_ctrlip ".
                   "--dport $sshdport -j DNAT ".
                   "--to-destination $ctrlip:$sshdport");
            last
                if ($? == 0);
            sleep(2);
        }
        # Update new state.
        delete($vnstate->{'sshd_iprule'});
        StoreState();
    }

    #
    # The tmcc proxy causes teardown problems, no idea why.
    # It used to be killed off from the unmount script, but let's
    # do it here.
    #
    my $PROXYPID = "/var/run/tmccproxy.${vnodeid}.pid";
    if (-e $PROXYPID) {
        my $ppid = `cat $PROXYPID`;
        chomp($ppid);
        #
        # Untaint, and insist on a positive integer. kill() treats 0 and
        # negative numbers as process groups, so an empty or garbled pid
        # file must not reach it.
        #
        if ($ppid =~ /^\s*(\d+)\s*$/ && $1 > 0) {
            $ppid = $1;
            if (kill('TERM', $ppid) == 0) {
                print"*** ERROR: Could not kill(TERM) proxy process $ppid: $!\n";
            }
            else {
                unlink($PROXYPID);
            }
        }
        else {
            print "*** ERROR: Bad pid '$ppid' in $PROXYPID; ".
                "not killing proxy\n";
        }
    }

    # if not halted, try that first
    my ($ret,$err) = safeLibOp('vnodeState', 1, 0);
    if ($err) {
        print STDERR "*** ERROR: vnodeState: ".
            "failed to cleanup $vnodeid: $err\n";
        return -1;
    }
    if ($ret eq VNODE_STATUS_RUNNING()) {
        print STDERR "cleanup: $vnodeid not stopped, trying to halt it.\n";
        ($ret,$err) = safeLibOp('vnodeHalt', 1, 1);
        if ($err) {
            print STDERR "*** ERROR: vnodeHalt: ".
                "failed to halt $vnodeid: $err\n";
            return -1;
        }
    }
    elsif ($ret eq VNODE_STATUS_MOUNTED()) {
        print STDERR "cleanup: $vnodeid is mounted, trying to unmount it.\n";
        ($ret,$err) = safeLibOp('vnodeUnmount', 1, 1);
        if ($err) {
            print STDERR "*** ERROR: vnodeUnmount: ".
                "failed to unmount $vnodeid: $err\n";
            return -1;
        }
    }
    if ($leaveme) {
        if ($leaveme == $LEAVEME_HALT || $leaveme == $LEAVEME_REBOOT) {
            #
            # When halting, the disk state is left, but the transient state
            # is removed since it will get reconstructed later if the vnode
            # is restarted. This avoids leaking a bunch of stuff in case the
            # vnode never starts up again. We of course leave the disk, but
            # that will eventually get cleaned up if the pcvm is reused for
            # a future experiment.
            #
            # XXX Reboot should be different; there is no reason to tear
            # down the transient state, but we do not handle that yet.
            # Not hard to add though.
            #
            ($ret,$err) = safeLibOp('vnodeTearDown', 1, 1);
            # Always store in case some progress was made.
            StoreState();
            if ($err) {
                print STDERR "*** ERROR: failed to teardown $vnodeid: $err\n";
                return -1;
            }
        }
        return 0;
    }

    # now destroy
    ($ret,$err) = safeLibOp('vnodeDestroy', 1, 1);
    if ($err) {
        print STDERR "*** ERROR: failed to destroy $vnodeid: $err\n";
        return -1;
    }
    # The container is gone, so drop the files that describe it.
    unlink("$VNDIR/vnode.info");
    unlink("$VNDIR/vnode.state");
    unlink("$VMPATH/vnode.$vmid");
    $cleaning = 0;
    return 0;
}
    
#
# Print error and exit.
#
sub MyFatal($)
{
    my ($msg) = @_;

    #
    # Clean up the container via CleanupVM() and then die with $msg.
    # Never returns.
    #
    # If rebooting but never got a chance to run, we do not want
    # to kill off the container. Might lose user data.
    #
    if ($rebooting && !$running) {
        $leaveme = $LEAVEME_REBOOT;
    }

    # The monitoring loop switches timestamps off; turn them back on.
    if ($debug) {
        TBDebugTimeStampsOn();
    }

    CleanupVM();
    die("*** $0:\n    $msg\n");
}

#
# Helpers:
#
937
938
sub safeLibOp($$$;@) {
    my ($op,$autolog,$autoerr,@args) = @_;

    #
    # Invoke library operation $op for the current $vmtype with HUP, INT,
    # USR1 and USR2 blocked, passing the vnode id, vm id, config, private
    # state and any extra @args.
    #
    #   $autolog  accepted for the callers' benefit; nothing is done with
    #             it here.
    #   $autoerr  when true, a true return value from the op is treated
    #             as a failure.
    #
    # On failure returns the list (code, error string); on success returns
    # just the op's return value.
    #
    my $sargs = (@args > 0 ? join(',', @args) : '');

    if ($debug) {
        TBDebugTimeStampWithDate("starting $vmtype $op($sargs)");
    }

    #
    # Block signals that could kill us in the middle of a library call.
    # Might be better to do this down in the library, but this is an
    # easier place to do it. This ensures that if we have to tear down
    # in the middle of setting up, the state is consistent.
    #
    my $blocked  = POSIX::SigSet->new(SIGHUP, SIGINT, SIGUSR1, SIGUSR2);
    my $previous = POSIX::SigSet->new;
    print STDERR "sigprocmask (BLOCK) failed!\n"
        if (! defined(sigprocmask(SIG_BLOCK, $blocked, $previous)));

    my $ret = eval {
        $libops{$vmtype}{$op}->($vnodeid, $vmid,
                                \%vnconfig, $vnstate->{'private'}, @args);
    };
    # Grab the error now, before anything else can clobber $@.
    my $err = $@;

    print STDERR "sigprocmask (UNBLOCK) failed!\n"
        if (! defined(sigprocmask(SIG_SETMASK, $previous)));

    # The op died.
    if ($err) {
        if ($debug) {
            TBDebugTimeStampWithDate("failed $vmtype $op($sargs): $err");
        }
        return (-1,$err);
    }

    # The op returned a true value and the caller wants that to be an error.
    if ($autoerr && $ret) {
        $err = "$op($vnodeid) failed with exit code $ret!";
        if ($debug) {
            TBDebugTimeStampWithDate("failed $vmtype $op($sargs): exited with $ret");
        }
        return ($ret,$err);
    }

    if ($debug) {
        TBDebugTimeStampWithDate("finished $vmtype $op($sargs)");
    }
    return $ret;
}
989
990
991
992
993
994
995
996
997
998
999
1000

sub StoreState()
{
    # Store the state to disk.
    print "Storing state to disk ...\n"
	if ($debug);
    
    my $ret = eval { Storable::store($vnstate, "$VNDIR/vnode.state"); };
    if ($@) {
	print STDERR "$@";
	return -1;
    }
For faster browsing, not all history is shown. View entire blame