#!/usr/bin/perl -w
#
# EMULAB-COPYRIGHT
# Copyright (c) 2004-2012 University of Utah and the Flux Group.
# All rights reserved.
#
# TODO: ntpinfo table.
#       Current source directory? From where?
#
use English;
use File::Temp ();
use Getopt::Std;

# Load the Testbed support stuff.
use lib "@prefix@/lib";
use libdb;
use libtestbed;
use libtblog;
use Experiment;
use User;
use Lan;

22
23
24
25
26
#
# Do things necessary for setting up inner elab experiment. 
#
#
# Print the command-line synopsis to STDOUT and exit with status -1.
# Never returns.
#
sub usage()
{
    print STDOUT "Usage: elabinelab [-d] [-g] [-u] pid eid\n";
    print STDOUT "       elabinelab [-d] [-k | -f] pid eid\n";
    print STDOUT "       elabinelab [-d] -r pid eid [node ...]\n";

    exit(-1);
}
# Option letters accepted by getopts; all of them are plain flags.
my $optlist  = "dgkfurP";

# Flag defaults, overridden by the command-line parsing further down.
# NOTE(review): $debug already defaults to 1, so -d changes nothing and
# the "exit(($debug ? 0 : -1))" failure paths always exit 0 -- confirm
# that this is intended.
my $debug    = 1;
my $verbose  = 0;
my $killmode = 0;
my $fwboot   = 0;
my $dbgooonly= 0;
my $update   = 0;
my $remove   = 0;

#
# XXX experimental speed hacks.
#     $inparallel    reboots all server in parallel (rather than serially)
#                    after setup
#     $restartnodes  uses a new bootinfo RESTART command to quickly move
#                    inner nodes from control of outer boss to inner boss
#                    avoiding all node reboots
#
my $inparallel = 1;
my $restartnodes = 0;

sub DumpDBGoo();

#
# Configure variables
#
my $TB          = "@prefix@";
my $TBOPS       = "@TBOPSEMAIL@";
my $CONTROL     = "@USERNODE@";
my $DBNAME      = "@TBDBNAME@";
my $TBOPSPID    = TBOPSPID();
my $SSH         = "$TB/bin/sshtb";
my $SCP         = "/usr/bin/scp";
my $nodereboot  = "$TB/bin/node_reboot";
my $noderestart = "$TB/sbin/bootinfosend -R";
my $makeconf    = "$TB/sbin/dhcpd_makeconf";
my $nodewait    = "$TB/sbin/node_statewait";
my $snmpit      = "$TB/bin/snmpit";
my $osselect    = "$TB/bin/os_select";

# Protos
sub TearDownEmulab();
sub RemoveNodes();
sub UpdateEmulab();

# un-taint path
delete @ENV{qw(IFS CDPATH ENV BASH_ENV)};
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin:/usr/site/bin';

#
# Enable autoflush on the currently selected output handle, so output is
# written immediately instead of being buffered.
#
$| = 1;

#
# We don't want to run this script unless it's the real version, which
# runs with an effective uid of root.
#
die("*** $0:\n".
    "    Must be root! Maybe its a development version?\n")
    if ($EUID != 0);

# Locals
# Real uid of the invoking user; restored after each stretch that sets
# $UID to 0.
my $SAVEUID     = $UID;
my $workdir;
my $expdir;
# node_id => inner_elab_role, filled in from the reserved table.
my %noderoles   = ();
my $opsnode;
my $bossnode;
my $fsnode;
my $routernode;
# Nodes whose role is 'node' (the inner experimental nodes).
my @expnodes    = ();
my $query_result;
my $inner_experiment;
my $inner_nsfile;

#
# Parse command arguments. Once we return from getopts, all that should be
# left are the required arguments.
#
%options = ();
if (! getopts($optlist, \%options)) {
    usage();
}
if (defined($options{"g"})) {
    $dbgooonly = 1;
}
if (defined($options{"d"})) {
    $debug = 1;
}
if (defined($options{"k"})) {
    $killmode = 1;
}
if (defined($options{"f"})) {
    $fwboot = 1;
}
if (defined($options{"u"})) {
    $update = 1;
}
if (defined($options{"r"})) {
    $remove = 1;
}
if (defined($options{"P"})) {
    $inparallel = 1;
}

#
# Both pid and eid are required. Anything after them (the node list of
# the -r form shown in usage) is left untouched in @ARGV.
#
if (@ARGV < 2) {
    usage();
}
my ($pid,$eid) = @ARGV;

#
# Untaint the arguments.
#
if ($pid =~ /^([-\w]+)$/) {
    $pid = $1;
}
else {
    die("Tainted argument $pid!\n");
}
if ($eid =~ /^([-\w]+)$/) {
    $eid = $1;
}
else {
    die("Tainted argument $eid!\n");
}
#
# Map pid/eid to the experiment object; nothing else can proceed without it.
#
my $experiment = Experiment->Lookup($pid, $eid);
if (!defined($experiment)) {
    die("*** $0:\n".
        "    Could not map $pid/$eid to its object!\n");
}
$workdir = TBExptWorkDir($pid, $eid);
$expdir = PROJROOT() . "/$pid/exp/$eid";

# Build Logfile names
my $opslogfile  = "$workdir/opsnode.log";
my $fslogfile   = "$workdir/fsnode.log";
my $bosslogfile = "$workdir/bossnode.log";

#
# Verify user and get his DB uid.
#
my $this_user = User->ThisUser();
if (! defined($this_user)) {
    tbdie("You ($UID) do not exist!");
}
my $user_uid   = $this_user->uid();
my $user_name  = $this_user->name();
my $user_email = $this_user->email();

TBDebugTimeStampsOn();
#
# Get elabinelab status to make sure, and to see if we need to fire off
# an experiment inside once its setup.
#
my $elabinelab           = $experiment->elabinelab();
my $elabinelab_eid       = $experiment->elabinelab_eid();
my $elabinelab_nosetup   = $experiment->elabinelab_nosetup();
my $elabinelab_singlenet = $experiment->elabinelab_singlenet();

# Nothing to do unless the experiment is marked as an elabinelab.
exit(0)
    if (!$elabinelab);

#
# See if the experiment is firewalled. TBExptFirewall fills in the
# firewall node ($firewall) and its type ($fwtype) through the references.
#
my $firewall;
my $fwtype;
my $firewalled = TBExptFirewall($pid, $eid, \$firewall, undef, undef, \$fwtype);
#
# Presetup; turn off firewall.
#
# In -f mode this is all the script does: insert an allow-everything rule
# at position 1 on the firewall node and exit.
#
if ($fwboot) {
    exit(0)
        if (!$firewalled);

    print "Turning off firewall rules on $firewall\n";
    # Real uid must be root for the ssh to the firewall node.
    $UID = 0;
    if ($fwtype =~ /^iptables/) {
        system("$SSH -host $firewall iptables -I FORWARD 1 -j ACCEPT");
    } else {
        system("$SSH -host $firewall ipfw add 1 allow all from any to any");
    }
    if ($?) {
        die("*** $0:\n".
            "    Error turning off firewall rules ($firewall)!\n");
    }
    exit(0);
}
#
# If we are going to start an inner experiment, grab the stuff we need
# from the DB and save it.
#
if (defined($elabinelab_eid)) {
    $inner_experiment = Experiment->Lookup($pid, $elabinelab_eid);
    die("*** $0:\n".
        "    No such experiment in DB for $pid/$elabinelab_eid\n")
        if (!defined($inner_experiment));

    # GetNSFile stores the NS text through the reference; 0 means success.
    $inner_experiment->GetNSFile(\$inner_nsfile) == 0 or
        die("*** $0:\n".
            "    Could not get NS file for $inner_experiment\n");

    die("*** $0:\n".
        "    No nsfile in DB for $inner_experiment")
        if (!defined($inner_nsfile) || $inner_nsfile eq "");
}

#
# Get the role for each node.
#
# Records every reserved node's inner_elab_role in %noderoles and
# remembers which node plays boss, router, ops (or ops+fs) and fs. Plain
# 'node' roles are collected in @expnodes.
#
$query_result =
    DBQueryFatal("select r.node_id,r.inner_elab_role from reserved as r ".
                 "where r.pid='$pid' and r.eid='$eid'");
while (my ($node_id,$role) = $query_result->fetchrow_array()) {
    # Like, the firewall node.
    next
        if (!defined($role));

    $noderoles{$node_id} = $role;
    $bossnode = $node_id
        if ($role =~ /^boss/);
    $routernode = $node_id
        if ($role eq 'router');
    $opsnode = $node_id
        if ($role eq 'ops' || $role eq 'ops+fs');
    $fsnode = $node_id
        if ($role eq 'fs');
    push(@expnodes, $node_id)
        if ($role eq 'node');
}

#
# Tear down an inner emulab, remove nodes from it, or update it. Each of
# these modes exits with the status its handler returns; otherwise fall
# through to the normal setup path.
#
if ($killmode) {
    exit(TearDownEmulab());
}
elsif ($remove) {
    exit(RemoveNodes());
}
elsif ($update) {
    exit(UpdateEmulab());
}

#
# Get elabinelab info. If this is a container for an actual experiment,
# then need to fire off the experiment once the inner emulab is ready to
# go.
#
TBDebugTimeStamp("Dumping DB state");
DumpDBGoo();
# With -g the DB dump is all that was asked for.
exit(0)
    if ($dbgooonly);

#
# For SSH and SCP below
#
$UID = 0;

#
# The firewall should be off at this point; called from os_setup with -f.
#
#
# This is temporary. I think I will switch this over to grabbing the latest
# version from the web server.
#
# XXX ugh, copy over a newer mkextrafs.pl as well (one that supports -2).
# XXX ughII, we only copy over a FreeBSD version, this will break a Linux boss.
#
# A copy of rc.mkelab in the experiment directory takes precedence over
# the installed one.
my $mkelab = "$TB/etc/rc.mkelab";
if (-e "$expdir/rc.mkelab") {
    $mkelab = "$expdir/rc.mkelab";
}
my $mkextrafs = "";
if (-e "$TB/etc/mkextrafs.pl") {
    $mkextrafs = "$TB/etc/mkextrafs.pl";
}
print "Copying $mkelab $mkextrafs to ${bossnode}";
print "/${opsnode}"
    if (defined($opsnode));
print "/${fsnode}"
    if (defined($fsnode));
print "\n";

# Use the configured scp binary ($SCP) rather than whatever PATH finds.
# As before, copy failures are not checked here.
system("$SCP $mkelab $mkextrafs ${bossnode}:/tmp");
foreach my $server ($opsnode, $fsnode) {
    system("$SCP $mkelab $mkextrafs ${server}:/tmp")
        if (defined($server));
}
#
# Run rc.mkelab on the fs, ops and boss nodes, in that order, logging
# each run to its own file in the work directory. The fs and ops nodes
# are optional; the boss node is always set up. On the first failure,
# mail the log to the user (Cc testbed ops) and exit.
#
foreach my $server (["fs",   $fsnode,   $fslogfile],
                    ["ops",  $opsnode,  $opslogfile],
                    ["boss", $bossnode, $bosslogfile]) {
    my ($role, $node, $logfile) = @{$server};

    next
        if ($role ne "boss" && !defined($node));

    TBDebugTimeStamp("Setting up ${role}node");
    print "Setting up ${role}node on $node\n";
    system("$SSH -host $node /tmp/rc.mkelab -s -d > $logfile 2>&1");
    next
        if (!$?);

    # Drop back to the invoking user before sending mail.
    $UID = $SAVEUID;
    SENDMAIL("$user_name <$user_email>",
             "ElabInElab Failure: $pid/$eid",
             "Error building the $role node ($node)",
             $TBOPS,
             "Cc: $TBOPS",
             $logfile);
    print STDERR "*** $0:\n".
                 "    Error building the ${role}node ($node)!\n";
    exit(($debug ? 0 : -1));
}
if ($verbose) {
    # Send these log files off now so that we can look at them.
    # Only servers that actually exist are named in the message and have
    # their log attached.
    my (@roles, @nodes, @logs);
    foreach my $server (["fs",   $fsnode,   $fslogfile],
                        ["ops",  $opsnode,  $opslogfile],
                        ["boss", $bossnode, $bosslogfile]) {
        my ($role, $node, $logfile) = @{$server};

        next
            if (!defined($node));
        push(@roles, $role);
        push(@nodes, $node);
        push(@logs,  $logfile);
    }
    SENDMAIL("$user_name <$user_email>",
             "ElabInElab Setup Log: $pid/$eid",
             "Logs for building " . join("/", @roles) .
             " (" . join("/", @nodes) . ")",
             $TBOPS,
             "Cc: $TBOPS",
             @logs);
}
$UID  = $SAVEUID;

# Run as real user for the next few scripts, which are setuid.
$EUID = $UID;

# When the experiment asks for no setup, jump straight to re-enabling
# the firewall.
goto skipsetup
    if ($elabinelab_nosetup);
#
# Restart DHCPD, but first mark the nodes as being ready to boot inside
# the inner emulab, so that dhcpd_makeconf knows what nodes to change
# the entries for.
#
DBQueryFatal("update reserved set inner_elab_boot=1 ".
             "where pid='$pid' and eid='$eid'");

print "Regenerating DHCPD config file and restarting daemon.\n";
system("$makeconf -i -r");
if ($?) {
    die("*** $0:\n".
        "    Failed to reconfig/restart DHCPD.\n");
}
#
# Reboot the inner servers and wait for them to come back (-w), either
# all in one node_reboot call or one at a time in fs, ops, boss order.
#
if ($inparallel) {
    my $nodes = "$bossnode";
    $nodes .= " $opsnode"
        if (defined($opsnode));
    $nodes .= " $fsnode"
        if (defined($fsnode));
    print "Rebooting servers ($nodes).\n";
    TBDebugTimeStamp("Rebooting servers");
    system("$nodereboot -w $nodes");
    if ($?) {
        print STDERR "*** $0:\n".
            "    Error rebooting the servers ($nodes)!\n";
        exit(($debug ? 0 : -1));
    }
} else {
    # fs and ops are optional; boss is always rebooted.
    foreach my $server (["fs",   $fsnode],
                        ["ops",  $opsnode],
                        ["boss", $bossnode]) {
        my ($role, $node) = @{$server};

        next
            if ($role ne "boss" && !defined($node));

        # Reboot this server and wait for it to come back.
        print "Rebooting ${role}node ($node).\n";
        TBDebugTimeStamp("Rebooting ${role}node");
        system("$nodereboot -w $node");
        if ($?) {
            print STDERR "*** $0:\n".
                "    Error rebooting the ${role}node ($node)!\n";
            exit(($debug ? 0 : -1));
        }
    }
}
# Back to effective root now that the reboots are done.
$EUID = 0;
# Reboot the experimental nodes. They will come up inside the inner elab.
# DO NOT WAIT! They are not going to report ISUP from this point on.
if (@expnodes) {
    #
    # First we try the magic pxeboot restart.
    # The nodes should still be in PXEWAIT, so we send them a restart
    # to make them re-DHCP.  This should get them quickly reparented to
    # the inner boss.
    #
    # If this doesn't work, we fall back on rebooting the nodes.
    #
    if ($restartnodes) {
        TBDebugTimeStamp("Redirecting experimental nodes to inner boss");
        my $stat = 0;
        # Run as real user again.
        $EUID = $UID;
        foreach my $node (@expnodes) {
            $stat = system("$noderestart $node");
            # Stop at the first failure; the fallback reboots all nodes.
            last if ($stat);
        }
        $EUID = 0;
        if ($stat) {
            tbwarn("Node restart failed ($stat), falling back to reboot.");
            goto rebootnodes;
        }

        #
        # Ssh into inner boss and use a utility script to determine
        # when the nodes have reported in and are in PXEWAIT (part of the
        # inner elab). Note the short timeout, since this operation should
        # be virtually instantaneous.
        #
        print "Waiting for nodes to restart and join the inner emulab.\n";
        TBDebugTimeStamp("Waiting for inner nodes to restart");
        $UID  = 0;
        $stat = system("$SSH -host $bossnode ".
                       "/usr/testbed/sbin/node_statewait -t 15 -a");
        $UID  = $SAVEUID;
        if ($stat) {
            tbwarn("Error ($stat) waiting for nodes to restart, falling back to reboot.");
            goto rebootnodes;
        }

        goto restartworked;
    }

rebootnodes:
    print "Rebooting inner experimental nodes.\n";
    TBDebugTimeStamp("Rebooting experimental nodes");
    # Run as real user again.
    $EUID = $UID;
    system("$nodereboot -b @expnodes");
    if ($?) {
        die("*** $0:\n".
            "    Error rebooting the expnodes (@expnodes)!\n");
    }
    $EUID = 0;

    #
    # Instead, we ssh into the node and use a utility script to determine
    # when the nodes have rebooted and are in PXEWAIT (part of the inner elab).
    #
    # Run as real root for ssh.
    $UID  = 0;

    print "Waiting for nodes to reboot and join the inner emulab.\n";
    TBDebugTimeStamp("Waiting for inner nodes to reboot");
    system("$SSH -host $bossnode /usr/testbed/sbin/node_statewait -t 180 -a");
    if ($?) {
        print STDERR "*** $0:\n".
                     "    Error waiting for inner nodes to join!\n";
        exit(($debug ? 0 : -1));
    }
    $UID  = $SAVEUID;

restartworked:
    #
    # To avoid confusion later (with swapmod, which wants them to be ISUP),
    # and so the web interface does not show the nodes as down, set the
    # state to ISUP.
    #
    foreach my $node (@expnodes) {
        TBSetNodeEventState($node, TBDB_NODESTATE_ISUP());
    }
}

#
# Fire off inner elab experiment.
#
if (defined($elabinelab_eid)) {
    # Formatted to make batchexp happy.
    my $nsfilename = "/tmp/$pid-$elabinelab_eid-$$.nsfile";

    #
    # Write NS file to temp file so we can send it over.
    #
    # The file is created by File::Temp (exclusive create, unpredictable
    # name) instead of at a guessable /tmp path, since this process is
    # privileged and /tmp is world-writable. tempfile() croaks on
    # failure, so trap that to keep the original error message.
    #
    my ($nsfh, $tmpnsfile) = eval {
        local $SIG{__DIE__};
        File::Temp::tempfile("elabinelab-XXXXXXXX",
                             DIR => "/tmp", SUFFIX => ".ns");
    };
    die("*** $0:\n".
        "    Could not write ns code to tmp file!\n")
        if (!defined($nsfh));

    print $nsfh $inner_nsfile;
    print $nsfh "\n";
    # Buffered write errors only surface at close.
    if (!close($nsfh)) {
        unlink($tmpnsfile);
        die("*** $0:\n".
            "    Could not write ns code to tmp file!\n");
    }

    #
    # Copy the file over.
    #
    $UID = 0;
    print "Sending NS file to inner bossnode ($bossnode).\n";
    system("cat $tmpnsfile | $SSH -host $bossnode '(cat > $nsfilename)'");
    if ($?) {
        # Do not leave the temp file behind on failure.
        unlink($tmpnsfile);
        die("*** $0:\n".
            "    Could not copy ns code to inner boss ($bossnode)!\n");
    }

    #
    # Now run batchexp on the node as the user. If firewalled, experiment
    # must start async (cause we have to turn the firewall back on).
    #
    my $optarg = ($firewalled ? "" : "-w");

    print "Starting experiment $pid/$elabinelab_eid on inner emulab.\n";
    TBDebugTimeStamp("Starting inner experiment");
    # As before, the exit status of batchexp is not checked.
    system("$SSH -host $bossnode ".
           " 'sudo -u $user_uid /usr/testbed/bin/batchexp ".
           "  -q -i $optarg -S \"ElabInElab Experiment\" ".
           "  -L \"ElabInElab ElabInElab\" -E \"ElabInElab Experiment\" ".
           "  -p $pid -e $elabinelab_eid $nsfilename'");

    $UID = $SAVEUID;
    unlink($tmpnsfile);
}
skipsetup:

#
# Turn the firewall back on.
#
# XXX If this fails, we have to do something much stronger! We do not want
# nodes coming up and starting something if the firewall is not active.
# Maybe hit the panic button from here (turning off the control network).
#
if ($firewalled) {
    my $cmd;

    # Delete rule 1, the allow-everything rule inserted by the -f presetup.
    if ($fwtype =~ /^iptables/) {
        $cmd = "$SSH -host $firewall iptables -D FORWARD 1";
    } else {
        $cmd = "$SSH -host $firewall ipfw delete 1";
    }
    print "Turning firewall back on\n";
    $UID = 0;
    system($cmd);
    if ($?) {
        # One retry before giving up.
        print STDERR "*** Error turning back on firewall rules ($firewall)!\n".
                     "    Will retry again.\n";
        system($cmd);
        if ($?) {
            die("*** $0:\n".
                "    Error turning back on firewall rules! Retry failed.\n");
        }
    }
    $UID = $SAVEUID;
}

TBDebugTimeStamp("ElabInElab setup done");
exit(0);

633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
#
# Dump parts of the DB that are needed for inner elab to run. The idea
# is to create a set of files named by the table name. Note that mysqld
# cannot write to the project tree cause of directory permissions. Put the
# files into the workdir for now, and then copy them over.
#
sub DumpDBGoo()
{
    my $statedir = "$workdir/elabinelab";

    if (-d $statedir) {
	system("rm -rf $statedir");
    }
    mkdir($statedir, 0777) or
	die("*** $0:\n".
	    "    Could not mkdir $statedir\n");
    
    chmod(0777, $statedir) or
	die("*** $0:\n".
	    "    Could not chmod $statedir\n");

654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
    #
    # No place else for this; when ops is a vm on boss, need to generate
    # a proper IP that is routable on the control network.
    #
    my $attributes = $experiment->GetElabInElabAttrs();
    die("*** $0:\n".
	"    Could not get elabinelab attributes\n")
	if (!defined($attributes));

    if (exists($attributes->{'CONFIG_OPSVM'}) && $attributes->{'CONFIG_OPSVM'}){
	#
	# Need to assign an IP to the ops node for the jail.
	#
	my $pnode = Node->Lookup($bossnode);
	die("*** $0:\n".
	    "    Could not lookup $bossnode\n")
	    if (!defined($pnode));
671
	my ($ip,$ipmask) = $pnode->GetJailIP();
672
673
674
675
676
677
678
679
680
681
	die("*** $0:\n".
	    "    Could not generate an IP for OPS jail\n")
	    if (!defined($ip));
	
	print "Setting the IP for OPS jail to $ip\n";
	$experiment->SetElabInElabAttr("boss", "OPSIP", $ip);
	$experiment->SetElabInElabAttr("ops",  "OPSIP", $ip);
	$experiment->SetElabInElabAttr("fs",   "OPSIP", $ip);
    }

682
683
684
    #
    # These tables are dumped completely.
    #
685
    my @FULLTABLES = ("node_types", "node_type_attributes", "interface_types",
686
687
		      "interface_capabilities",
		      "switch_paths", "switch_stack_types", "switch_stacks",
Timothy Stack's avatar
   
Timothy Stack committed
688
689
		      "node_type_features", "node_types_auxtypes", "osid_map",
		      "os_boot_cmd");
690
691
692

    #
    # These tables are dumped by role (node/ops). For each one dump the table
693
694
    # as is, unless its the fs or ops node. For those we want to change the
    # node_id to "fs" or "ops" and their type to ops.
695
    #
696
697
698
    my @NODETABLES = ("node_auxtypes", "node_status", "nodes", 
		      "node_rusage", "node_hostkeys", "node_activity",
		      "interface_state");
699
700
701
702

    #
    # These tables are dumped by project ID.
    #
703
    my @PROJTABLES = ("projects", "groups");
704
705

    #
706
    # These tables are dumped by user ID (for the project members).
707
    #
708
    my @USERTABLES = ("users", "user_pubkeys");
709
710
711

    foreach my $table (@FULLTABLES) {
	unlink("$statedir/$table");
712
713
714
715
716
	DBQueryWarn("create temporary table temp_${table} ".
		    "select t.* from $table as t")
	    or die("*** $0:\n".
		   "    Could not dump table $table\n");

717
718
719
720
721
	#
	# Reduce the delay capacity by one if we are using one of
	# the experimental interfaces as an inner control network.
	#
	if ($table eq "node_type_attributes" && !$elabinelab_singlenet) {
722
	    my $attributes_result =
723
		DBQueryFatal("select type,attrvalue from temp_${table} ".
724
725
726
727
			     "where attrkey='delay_capacity'");

	    while (my ($ntype,$value) = $attributes_result->fetchrow_array()) {
		my $newvalue = $value - 1;
728
729
730

		next
		    if ($newvalue < 0);
731
732
733
		
		DBQueryFatal("update temp_${table} set ".
			     "   attrvalue='$newvalue' ".
734
735
			     "where type='$ntype' and ".
			     "      attrkey='delay_capacity'");
736
	    }
737
738
	}

739
740
741
742
743
	# filter out community strings
	if ($table eq "switch_stacks" || $table eq "switch_stack_types") {
	    DBQueryFatal("update temp_${table} set snmp_community=NULL");
	}

744
	DBQueryWarn("select * from temp_$table ".
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
		    "into outfile '$statedir/$table'")
	    or die("*** $0:\n".
		   "    Could not dump table $table\n");
    }

    foreach my $table (@NODETABLES) {
	unlink("$statedir/$table");
	#
	# Create a temporary table.
	#
	DBQueryWarn("create temporary table temp_${table} ".
		    "select t.* from reserved as r ".
		    "left join $table as t on t.node_id=r.node_id ".
		    "left join virt_nodes as v on v.vname=r.vname and ".
		    "     v.pid=r.pid and v.eid=r.eid ".
		    "where r.pid='$pid' and r.eid='$eid' and ".
		    "      t.node_id is not null and ".
Mike Hibler's avatar
Mike Hibler committed
762
		    "      v.inner_elab_role in ('node','fs','ops','ops+fs')")
763
764
765
	    or die("*** $0:\n".
		   "    Could not create temporary table temp_$table\n");
	#
766
767
	# Rename the fs and ops node in each table. For the nodes table,
	# there is a bunch of other stuff to do.
768
	#
769
770
771
	DBQueryFatal("update temp_${table} set node_id='fs' ".
		     "where node_id='$fsnode'")
	    if (defined($fsnode));
772
	DBQueryFatal("update temp_${table} set node_id='ops' ".
773
774
		     "where node_id='$opsnode'")
	    if (defined($opsnode));
775
776

	if ($table eq "nodes") {
777
	    DBQueryFatal("update temp_nodes set ".
778
			 " type='ops', ".
779
			 " phys_nodeid=node_id, ".
780
781
			 " role='ctrlnode', ".
			 " op_mode='OPSNODEBSD' ".
782
			 "where node_id in ('fs','ops')");
783

784
785
786
787
788
	    # Also add the nodes that correspond to infrastructure switches
	    DBQueryFatal("insert into temp_nodes ".
			 "select distinct n.* from switch_stacks as s ".
			 "left join nodes as n on s.node_id=n.node_id ".
			 "where stack_id not like 'ExpStack%'");
789
790

	    # Clear any node reservations on the inside
791
	    DBQueryFatal("update temp_nodes set ".
792
			 " reserved_pid=null where reserved_pid is not null");
793
794
795

	    # Put the inner nodes into "limbo" so they DTRT when restarted
	    if ($restartnodes) {
796
		DBQueryFatal("update temp_nodes set".
797
798
799
800
801
802
			     "  op_mode='PXEKERNEL',next_op_mode='',".
			     "  eventstate='". TBDB_NODESTATE_PXELIMBO . "',".
			     "  temp_boot_osid=NULL,next_boot_osid=NULL,".
			     "  osid=NULL".
			     " where role='testnode'");
	    }
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
	}
    
	DBQueryWarn("select * from temp_$table ".
		    "into outfile '$statedir/$table'")
	    or die("*** $0:\n".
		   "    Could not dump table $table\n");
    }

    foreach my $table (@PROJTABLES) {
	unlink("$statedir/$table");
	DBQueryWarn("select * from $table ".
		    "where pid='$pid' ".
		    "into outfile '$statedir/$table'")
	    or die("*** $0:\n".
		   "    Could not dump table $table\n");
    }
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
    #
    # Cleared versions of the project/group stats tables.
    #
    unlink("$statedir/project_stats");
    DBQueryFatal("create temporary table temp_project_stats ".
		 "like project_stats");
    DBQueryFatal("insert into temp_project_stats (pid,pid_idx) ".
		 "select pid,pid_idx from project_stats ".
		 "where pid='$pid'");
    DBQueryFatal("select * from temp_project_stats ".
		 "into outfile '$statedir/project_stats'");

    unlink("$statedir/group_stats");
    DBQueryFatal("create temporary table temp_group_stats ".
		 "like group_stats");
    DBQueryFatal("insert into temp_group_stats ".
		 "  (pid,pid_idx,gid,gid_idx,gid_uuid) ".
		 "select pid,pid_idx,gid,gid_idx,gid_uuid ".
		 "   from group_stats ".
		 "where pid='$pid'");
    DBQueryFatal("select * from temp_group_stats ".
		 "into outfile '$statedir/group_stats'");
841

842
843
844
845
846
847
848
849
850
851
852
853
    #
    # Special case the group and user policy tables. Not sure what to
    # really do about this; should there be any restrictions inside the
    # inner elab?
    #
    unlink("$statedir/group_policies");
    DBQueryWarn("select * from group_policies ".
		"where pid='$pid' or pid='+' or pid='-' ".
		"into outfile '$statedir/group_policies'")
	or die("*** $0:\n".
	       "    Could not dump table group_policies\n");

854
855
    foreach my $table (@USERTABLES) {
	unlink("$statedir/$table");
856
857

	DBQueryWarn("create temporary table temp_$table ".
858
		    "select distinct t.* from group_membership as gm ".
859
860
		    "left join users as u on u.uid_idx=gm.uid_idx ".
		    "left join $table as t on t.uid_idx=u.uid_idx ".
861
862
		    "where (gm.pid='$pid' or ".
		    "       gm.pid='" . TBOPSPID() . "') and gm.gid=gm.pid ".
863
		    " and t.uid_idx is not NULL and ".
864
865
866
867
		    " u.status='" . USERSTATUS_ACTIVE() . "'")
	    or die("*** $0:\n".
		   "    Could not create table temp_$table\n");

868
869
870
871
872
	# Clean up ... these are created in the inner elab.
	DBQueryFatal("delete from temp_${table} ".
		     "where uid='elabman' or uid='elabckup' or ".
		     "      uid='operator'");

873
	if ($table eq "users") {
874
	    my $creator_uid = $experiment->creator();
875
876
877
	    
	    DBQueryFatal("update temp_${table} set ".
			 " admin=1 ".
878
			 "where uid='$creator_uid'");
879
880

	    #
881
	    # Save time; force all other users to start out
882
883
884
885
886
	    # frozen since most users in the project do not ever
	    # actually log in. 
	    #
	    DBQueryFatal("update temp_${table} set ".
			 " status='" . USERSTATUS_FROZEN() . "' ".
887
			 "where uid!='$creator_uid'");
888
889
890
	}

	DBQueryWarn("select * from temp_$table ".
891
892
		    "into outfile '$statedir/$table'")
	    or die("*** $0:\n".
893
		   "    Could not dump table temp_$table\n");
894
    }
895
896
897
898
899
900
901
902
903
904
905
906
    #
    # We want a cleared stats table, so do it here.
    #
    DBQueryFatal("create temporary table temp_user_stats ".
		 "like user_stats");
    DBQueryFatal("insert into temp_user_stats ".
		 "  (uid,uid_idx,uid_uuid) ".
		 "select uid,uid_idx,uid_uuid from temp_users");
    DBQueryWarn("select * from temp_user_stats ".
		"into outfile '$statedir/user_stats'")
	or die("*** $0:\n".
	       "    Could not dump table temp_user_stats\n");
907
908
909

    # The group_membership is also special.
    DBQueryWarn("select gm.* from group_membership as gm ".
910
		"left join users as u on u.uid_idx=gm.uid_idx ".
911
		"where (gm.pid='$pid' or ".
912
		"       (gm.pid='" . TBOPSPID() . "' and gm.pid=gm.gid)) and ".
913
		" u.status='" . USERSTATUS_ACTIVE() . "' and ".
914
915
		" gm.uid!='elabman' and gm.uid!='elabckup' and ".
		" gm.uid!='operator' ".
916
917
918
919
920
		"into outfile '$statedir/group_membership'")
	or die("*** $0:\n".
	       "    Could not dump table group_membership\n");

    #
Mike Hibler's avatar
Mike Hibler committed
921
    # Initial images; note that these images are not going to exist inside!
922
923
    # Note that we exclude any encrypted images because we cannot expose
    # the encryption keys that they would need to decode the images!
924
925
    # 
    DBQueryWarn("select * from images ".
926
927
		"where (pid='$pid' or (pid='$TBOPSPID' and global=1)) ".
		"  and decryption_key is null ".
928
929
930
931
932
933
934
935
936
937
938
939
940
941
		"into outfile '$statedir/images'")
	or die("*** $0:\n".
	       "    Could not dump table images\n");
	    
    DBQueryWarn("create temporary table temp_os_info ".
		"select * from os_info ".
		"where pid='$pid' or (pid='$TBOPSPID' and shared=1)")
	or die("*** $0:\n".
	       "    Could not create table temp_os_info\n");

    # Ack. The MFS paths have a hardcoded "boss" in them, but that is going
    # to resolve incorrectly to an inner control IP, which will not work
    # from the pxeboot kernel since it uses the outer control network.
    # Just remove the host spec; pxeboot will do the right thing.
942
    my $query_result =
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
	DBQueryFatal("select osid,path from temp_os_info ".
		     "where path like '%:%'");
    
    while (my ($osid,$hostpath) = $query_result->fetchrow_array()) {
	my ($host,$path) = $hostpath =~ /^(.*):(.*)$/;

	DBQueryFatal("update temp_os_info set path='$path' where osid='$osid'");
    }

    DBQueryWarn("select * from temp_os_info ".
		"into outfile '$statedir/os_info'")
	or die("*** $0:\n".
	       "    Could not dump table os_info\n");
	    
    DBQueryWarn("select o.* from osidtoimageid as o ".
		"left join images as i on i.imageid=o.imageid ".
		"where i.pid='$pid' or (i.pid='$TBOPSPID' and i.global=1) ".
		"into outfile '$statedir/osidtoimageid'")
	or die("*** $0:\n".
	       "    Could not dump table osidtoimageid\n");
963
964
965
966
967
968
969
970
971
	    
    # Subosids.  Only take the mapping to parents for those children that
    # are in the e-in-e pid or are in emulab-ops and are shared.
    DBQueryWarn("select osm.* from os_submap as osm ".
		"left join os_info as osi on osm.osid=osi.osid ".
		"where osi.pid='$pid' or (osi.pid='$TBOPSPID' and osi.shared=1) ".
		"into outfile '$statedir/os_submap'")
	or die("*** $0:\n".
	       "    Could not dump table os_submap\n");
972
973
974
975
976
977
978
979
980
981
982
983

    #
    # interfaces table. Need to tag the interfaces being used as the control
    # network with the proper tag, so that they do not appear to be
    # experimental interfaces in the inner emulab. Use a temp table again.
    #
    DBQueryWarn("create temporary table temp_interfaces ".
		"select t.* from reserved as r ".
		"left join interfaces as t on t.node_id=r.node_id ".
		"left join virt_nodes as v on v.vname=r.vname and ".
		"     v.pid=r.pid and v.eid=r.eid ".
		"where r.pid='$pid' and r.eid='$eid' and ".
Mike Hibler's avatar
Mike Hibler committed
984
		"      v.inner_elab_role in ('node','ops','fs','ops+fs')")
985
986
987
	or die("*** $0:\n".
	       "    Could not create temporary table temp_interfaces\n");

988
    if (! $elabinelab_singlenet) {
989
990
991
992
993
994
995
	# First, mark the real control network as "other" to avoid it being
	# thought of as the control network!
	DBQueryWarn("update temp_interfaces ".
		    "set role='" . TBDB_IFACEROLE_OUTER_CONTROL() . "' " .
		    "where role='" . TBDB_IFACEROLE_CONTROL() . "'")
	    or die("*** $0:\n".
		   "    Could not delete control ifaces from temp_interfaces\n");
996

997
998
999
1000
1001
1002
	DBQueryWarn("update temp_interfaces set ".
		    " role='" . TBDB_IFACEROLE_CONTROL() . "' " .
		    "where IP!='' and role='" . TBDB_IFACEROLE_EXPERIMENT() . "'")
	    or die("*** $0:\n".
		   "    Could not update roles in temp_interfaces\n");
    }
1003

1004
1005
1006
1007
1008
1009
1010
    # And rename the fs/ops nodes as above.
    if (defined($fsnode)) {
	DBQueryWarn("update temp_interfaces set node_id='fs' ".
		    "where node_id='$fsnode'")
	    or die("*** $0:\n".
		   "    Could not fs node_id in temp_interfaces\n");
    }
1011
1012
1013
1014
1015
1016
    if (defined($opsnode)) {
	DBQueryWarn("update temp_interfaces set node_id='ops' ".
		    "where node_id='$opsnode'")
	    or die("*** $0:\n".
		   "    Could not ops node_id in temp_interfaces\n");
    }
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034

    # Also add the interfaces that correspond to the "trunk" wires.
    DBQueryFatal("insert into temp_interfaces ".
		 "select distinct i.* from wires as w ".
		 "left join interfaces as i on w.node_id1=i.node_id or ".
		 "     w.node_id2=i.node_id ".
		 "where w.type='Trunk'");

    DBQueryWarn("select * from temp_interfaces ".
		"into outfile '$statedir/interfaces'")
	or die("*** $0:\n".
	       "    Could not dump table interfaces\n");

    # And the wires table. Strip out the control wires; not needed.
    DBQueryWarn("create temporary table temp_wires ".
		"select t.* from reserved as r ".
		"left join virt_nodes as v on v.vname=r.vname and ".
		"     v.pid=r.pid and v.eid=r.eid ".
1035
		"left join wires as t on t.node_id1=r.node_id ".
1036
		($elabinelab_singlenet == 0 ? " and t.type='Node' " : " ") .
1037
		"where r.pid='$pid' and r.eid='$eid' and ".
Mike Hibler's avatar
Mike Hibler committed
1038
		"      v.inner_elab_role in ('node','ops','fs','ops+fs') ")
1039
1040
1041
	or die("*** $0:\n".
	       "    Could not create temporary table temp_wires\n");

1042
1043
1044
1045
1046
1047
1048
    # And rename the fs/ops node as above.
    if (defined($fsnode)) {
	DBQueryWarn("update temp_wires set node_id1='fs' ".
		    "where node_id1='$fsnode'")
	    or die("*** $0:\n".
		   "    Could not fs node_id in temp_wires\n");
    }
1049
1050
1051
1052
1053
1054
    if (defined($opsnode)) {
	DBQueryWarn("update temp_wires set node_id1='ops' ".
		    "where node_id1='$opsnode'")
	    or die("*** $0:\n".
		   "    Could not ops node_id in temp_wires\n");
    }
1055

1056
    if (! $elabinelab_singlenet) {
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
	# But we need to take out the wires that are being used as the
	# inner control network, or at least mark them as Control.
	$query_result =
	    DBQueryWarn("select node_id,card,port from temp_interfaces ".
			"where role='" . TBDB_IFACEROLE_CONTROL() . "' ");

	while (my ($node_id,$card,$port) = $query_result->fetchrow_array()) {
	    DBQueryWarn("update temp_wires set type='Control' ".
			"where node_id1='$node_id' and card1=$card and ".
			"      port1=$port");
	}
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
    }
    # Okay, now add the "trunk" wires in without any alteration.
    DBQueryWarn("insert into temp_wires ".
		"select * from wires where type='Trunk'") 
	or die("*** $0:\n".
	       "    Could not add trunk lines to temp_wires\n");

    DBQueryWarn("select * from temp_wires ".
		"into outfile '$statedir/wires'")
	or die("*** $0:\n".
	       "    Could not dump table wires\n");

    #
1081
1082
    # Ack, we need to create a reservation for the fs and ops nodes,
    # or else they will look free and will not be able to check in.
1083
1084
1085
1086
1087
1088
    #
    DBQueryWarn("create temporary table temp_reserved ".
		"select r.* from reserved as r ".
		"left join virt_nodes as v on v.vname=r.vname and ".
		"     v.pid=r.pid and v.eid=r.eid ".
		"where r.pid='$pid' and r.eid='$eid' ".
Mike Hibler's avatar
Mike Hibler committed
1089
		"      and v.inner_elab_role in ('fs','ops','ops+fs')")
1090
1091
	or die("*** $0:\n".
	       "    Could not create temporary table temp_reserved\n");
1092
1093
1094
1095
    if (defined($fsnode)) {
	DBQueryWarn("update temp_reserved set ".
		    "   node_id='fs', ".
		    "   pid='$TBOPSPID', ".
1096
1097
		    "   eid='opsnodes', ".
		    "   exptidx=1 ".
1098
1099
1100
1101
		    "where node_id='$fsnode'")
	    or die("*** $0:\n".
		   "    Could not update temporary table temp_reserved\n");
    }
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
    if (defined($opsnode)) {
	DBQueryWarn("update temp_reserved set ".
		    "   node_id='ops', ".
		    "   pid='$TBOPSPID', ".
		    "   eid='opsnodes', ".
		    "   exptidx=1 ".
		    "where node_id='$opsnode'")
	    or die("*** $0:\n".
		   "    Could not update temporary table temp_reserved\n");
    }
1112
1113
1114
1115
1116
    DBQueryWarn("select * from temp_reserved ".
		"into outfile '$statedir/reserved'")
	or die("*** $0:\n".
	       "    Could not dump table reserved\n");

Mike Hibler's avatar
Mike Hibler committed
1117
    # Copy tiplines table for all nodes so web form gives us a console icon!
1118
    DBQueryWarn("select t.tipname,t.node_id,'',t.disabled,0,0,NULL ".
Mike Hibler's avatar
Mike Hibler committed
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
		"from reserved as r ".
		"left join virt_nodes as v on v.vname=r.vname and ".
		"     v.pid=r.pid and v.eid=r.eid ".
		"left join tiplines as t on t.node_id=r.node_id ".
		"where r.pid='$pid' and r.eid='$eid' and ".
		"      v.inner_elab_role='node' ".
		"into outfile '$statedir/tiplines'")
	or die("*** $0:\n".
	       "    Could not dump table tiplines\n");

1129
1130
1131
1132
1133
    #
    # Dump the DB schema too, so we can check in the inner Elab that this data
    # is compatible with the sql/database-create.sql schema file there, *before*
    # loading it into the db.  Added/removed columns would misalign row data.
    #
Russ Fish's avatar
Russ Fish committed
1134
1135
1136
1137
1138
1139
1140
1141
1142
    my $schemafile = "$expdir/outer_db_schema";
    system("rm -f $schemafile")
	if (-f $schemafile);
    my $isvers5     = system("mysql -V | egrep -q -s 'Distrib 5.'") == 0;
    my $extraopts   = ($isvers5 ? "--skip-quote-names" : "");
    #
    # XXX: Requires that mysqldump be in caller's $PATH - probably an OK
    # assumption, but maybe not always
    #
1143
1144
    my $mysqldump   = "mysqldump -d $extraopts $DBNAME " .
	"@FULLTABLES @NODETABLES @PROJTABLES @USERTABLES";
Russ Fish's avatar
Russ Fish committed
1145
    system("$mysqldump 2> /dev/null > $schemafile");
1146

1147
    #
1148
    # Tar up the directory and send it over to (real) ops.
1149
1150
1151
    #
    $UID = 0;
    system("tar cf - -C $statedir . | ".
Kirk Webb's avatar
   
Kirk Webb committed
1152
	   "   gzip | $SSH -F /dev/null -host $CONTROL ".
1153
	   "   '(cat > $expdir/dbstate.tar.gz)'");
1154
1155
1156
1157
1158
1159
1160
1161
    if ($?) {
	die("*** $0:\n".
	    "    Could not create dbstate.tar.gz\n");
    }
    $UID = $SAVEUID;
    return 0;
}

1162
#
1163
1164
# Tear down an inner Emulab as cleanly as possible to avoid power cycling
# nodes.
1165
1166
1167
1168
1169
1170
# 
sub TearDownEmulab()
{
    #
    # Takes no arguments; works on file-scope state ($pid, $eid, $bossnode,
    # $firewalled, @expnodes, $SAVEUID and the tool paths $makeconf,
    # $osselect, $SSH, $nodewait).
    #
    # Returns 0 on every path that returns. Dies if the DHCPD config cannot
    # be regenerated; the DBQueryFatal() calls are presumably fatal on a
    # failed query as well (helper is defined elsewhere). All other failures
    # are reported on STDERR and treated as non-fatal.
    #
    # NOTE(review): on the paths that reach the final node wait, this returns
    # with $EUID set to the real uid rather than 0. Left as is, since callers
    # may depend on it -- confirm.
    #

    # These are handed to the inner boss over ssh, so presumably they are
    # paths on the inner boss rather than local ones. $nodereboot shadows
    # the file-level variable of the same name.
    my $tbdir      = "/usr/testbed";
    my $wap        = "$tbdir/sbin/withadminprivs";
    my $nodereboot = "$tbdir/bin/node_reboot";

    #
    # We want to rebuild the DHCPD file so that when we reboot the inner nodes
    # they come back to the outer emulab. We cannot just free the nodes, cause
    # then the reload daemon might beat us to it, and end up power cycling the
    # nodes, and that would be bad. So, munge the DB and clear the "role" slot
    # for inner nodes.
    #
    DBQueryFatal("update reserved set inner_elab_role=NULL,inner_elab_boot=0 ".
		 "where pid='$pid' and eid='$eid'");

    #
    # XXX Failure at this point will leave things in an inconsistent state
    # cause we have just munged the reserved table. Since we were trying
    # to swap out the experiment, I think this will be okay. Wait and see.
    #
    return 0
	if (!defined($bossnode));

    #
    # Now regen the DHCPD file.
    #
    # Run as real user since script is setuid.
    $EUID = $UID;

    print "Regenerating DHCPD config file and restarting daemon.\n";
    system("$makeconf -i -r");
    if ($?) {
	die("*** $0:\n".
	    "    Failed to reconfig/restart DHCPD.\n");
    }
    $EUID = 0;

    #
    # Kill inner vlans table entries; this is the table that maps
    # inner to outer vlans. We do not care about that anymore since
    # all of the vlans are going to be torn down (using the outer
    # ids).
    #
    DBQueryFatal("delete from elabinelab_vlans ".
		 "where pid='$pid' and eid='$eid'");

    #
    # If firewalled, just return now since all nodes will be powered
    # off anyway.
    #
    if ($firewalled) {
	print "Skipping clean shutdown since experiment is firewalled.\n";
	return 0;
    }

    #
    # When the nodes reboot, we want them to do something reasonable. We
    # have no idea what is loaded on the disk, so they should go into an
    # MFS and wait, but then a bunch of nodes will all try to load the big
    # MFS at once, and that could wreak havoc. So, clear the boot osids
    # so they go into PXEWAIT.
    #
    if (@expnodes) {
	system("$osselect -w @expnodes");
	if ($?) {
	    print STDERR "*** $0:\n".
		         "    Could not clear bootinfo for inner nodes!\n".
			 "    Continuing anyway.\n";
	}
    }

    #
    # SSH in and kill the inner DHCPD daemon so that it does not reply
    # to rebooting nodes along the inner control network.
    #
    # The real uid is flipped to root for the ssh commands below; every
    # exit from here on must put it back to $SAVEUID.
    #
    $UID = 0;

    print "Killing DHCPD on inner boss ($bossnode)\n";
    system("$SSH -host $bossnode /usr/local/etc/rc.d/2.dhcpd.sh stop");
    if ($?) {
	#
	# This error is non-fatal. If DHCPD cannot be killed, then the inner
	# boss is scrogged or never set up properly. Just return and let
	# the nodes get power cycled (if need be). At some point we need a
	# state machine to control this setup stuff.
	#
	print STDERR "*** $0:\n".
	             "    Could not stop DHCPD on inner bossnode ($bossnode)!\n".
		     "    Continuing anyway; outer boss will use power cycle.\n";
	# Restore the real uid; previously this path left it as root.
	$UID = $SAVEUID;
	return 0;
    }

    #
    # Now we ask inner boss to reboot all of the testnodes. Maybe need an
    # option to node_reboot, but for now just pass them on the command line.
    #
    if (! @expnodes) {
	$UID = $SAVEUID;
	return 0;
    }

    print "Asking inner boss ($bossnode) to reboot inner nodes\n";
    system("$SSH -host $bossnode $wap $nodereboot -b @expnodes");
    if ($?) {
	#
	# This error is non-fatal; Outer boss will just resort to power cycle.
	#
	print STDERR "*** $0:\n".
	             "    Could not reboot some inner nodes!\n".
		     "    Continuing anyway; outer boss will use power cycle.\n";
    }
    $UID = $SAVEUID;

    #
    # Now we wait for them to reach PXEWAIT. Again, use our utility script
    # instead of stated stuff.
    #
    $EUID = $UID;
    print "Waiting for inner nodes to reach PXEWAIT\n";
    system("$nodewait @expnodes");
    if ($?) {
	#
	# This error is non-fatal; Outer boss will just resort to power cycle.
	#
	print STDERR "*** $0:\n".
	             "    Some machines did not reboot properly!\n".
		     "    Continuing anyway; outer boss will use power cycle.\n";
    }
    return 0;
}
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306

#
# Remove nodes from an inner Emulab.
# 
sub RemoveNodes()
{
    my $tbdir      = "/usr/testbed";
    my $wap        = "$tbdir/sbin/withadminprivs";
    my $nodereboot = "$tbdir/bin/node_reboot";
    my $deletenode = "$tbdir/sbin/deletenode";
1307
    my $creator    = $experiment->creator();
1308
    my @nodes	   = ();
1309
    my $paniced    = 0;
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322

    #
    # If firewalled, check to see if paniced. Right now that means the nodes
    # are going to be powered off, so need to do the clean shutdown dance.
    # 
    if ($firewalled) {
	TBExptGetPanicBit($pid, $eid, \$paniced);
    }

    #
    # Actually, this should not even happen; a paniced experiment cannot be
    # modified at all.
    #
Mike Hibler's avatar