os_setup.in 16.5 KB
Newer Older
1
#!/usr/bin/perl -wT
Leigh B. Stoller's avatar
Leigh B. Stoller committed
2
3
4

#
# EMULAB-COPYRIGHT
5
# Copyright (c) 2000-2003 University of Utah and the Flux Group.
Leigh B. Stoller's avatar
Leigh B. Stoller committed
6
7
8
# All rights reserved.
#

9
use English;
10
use Getopt::Std;
Leigh B. Stoller's avatar
Leigh B. Stoller committed
11
require 'ctime.pl';
12

13
#
14
15
16
17
# Reboot the nodes in an experiment. The nodes table will already contain
# all the information. This script deals with possible disk reloading,
# rebooting, and waiting for nodes to come back alive before allowing
# experiment creation to continue.
18
#
19
# TODO: Reload disk images.
20
# 
21
# usage: os_setup <pid> <eid>
22
#
23
24
#
# Print the command line synopsis on STDERR and exit with status -1.
# Does not return. The -d flag is the only option in the getopts list
# and turns on debugging output.
#
sub usage()
{
    print STDERR "Usage: os_setup [-d] <pid> <eid>\n";
    exit(-1);
}
28
# Option letters handed to getopts; only -d (debug output) is accepted.
my $optlist = "d";

#
# Configure variables
#
# NOTE(review): the @...@ tokens look like autoconf placeholders that are
# substituted when this .in template is configured -- confirm.
#
my $TB       = "@prefix@";
my $DBNAME   = "@TBDBNAME@";
my $TBOPS    = "@TBOPSEMAIL@";
my $TESTMODE = @TESTMODE@;
# Boot files named in the nodes table must live under this directory.
my $TFTP     = "/tftpboot";
38

39
40
41
42
43
44
45
#
# Testbed Support libraries
#
use lib "@prefix@/lib";
use libdb;
use libtestbed;

46
# External programs this script runs.
my $nodereboot  = "$TB/bin/node_reboot";
my $os_load     = "$TB/bin/os_load";
my $vnode_setup = "$TB/sbin/vnode_setup";
my $osselect    = "$TB/bin/os_select";

my $dbg         = 0;    # Set to 1 by the -d option.
my $failed      = 0;    # Failure count; becomes the exit status.
my %nodes       = ();   # Physical nodes we wait for after rebooting.
my %vnodes      = ();   # Virtual node => jailed flag.
my %osids       = ();   # Node => OSID it is set up to boot.
my %canfail     = ();   # Node => 1 if its failure is nonfatal.
my $db_result;
my @row;

#
# Ah, Frisbee works so lets do auto reloading for nodes that do not have
# the proper OS loaded on it. This will be a hash of lists; for each
# imageid, a list of the nodes to pass to os_load for that imageid.
#
my %reloads     = ();
# Nodes that only need a plain reboot; SetupReload removes a node from
# here when it schedules a reload for it instead.
my %reboots     = ();
my $doautoload  = 1;
# When set, a node last reserved by a different project is reloaded
# even if the wanted OSID is already on its disk.
my $dolastload  = 1;
68
    
69
70
71
72
73
74
# Running under -T: pin PATH to a fixed list of directories and drop the
# shell-related environment variables before any external command is run.
delete @ENV{qw(IFS CDPATH ENV BASH_ENV)};
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';

# Flush the selected output handle after every write so our messages are
# not held back in a buffer.
$OUTPUT_AUTOFLUSH = 1;

75
76
77
78
79
80
81
82
83
84
85
#
# Parse command arguments. Once we return from getopts, all that should be
# left are the required arguments.
#
my %options = ();
if (! getopts($optlist, \%options)) {
    usage();
}
if (@ARGV != 2) {
    usage();
}
if (defined($options{"d"})) {
    $dbg = 1;
}

my $pid = $ARGV[0];
my $eid = $ARGV[1];

#
# Untaint args. Only word characters, '-' and '@' are accepted; the
# captured copy is what we use from here on, since both values end up in
# SQL queries and shell command lines.
#
if ($pid =~ /^([-\@\w]+)$/) {
    $pid = $1;
}
else {
    die("Bad data in pid: $pid.");
}
if ($eid =~ /^([-\@\w]+)$/) {
    $eid = $1;
}
else {
    die("Bad data in eid: $eid.");
}

109
#
# Figure out who called us. Only root (UID 0), people with admin status
# in the DB, or users for whom TBExptAccessCheck grants TB_EXPT_MODIFY on
# this experiment can run this script.
#
if ($UID && !TBAdmin($UID) &&
    !TBExptAccessCheck($UID, $pid, $eid, TB_EXPT_MODIFY)) {
    die("*** $0:\n".
        "    You do not have permission to swap this experiment!\n");
}

TBDebugTimeStamp("os_setup started");

121
#
# Get the set of nodes, as well as the nodes table information for them.
# The extra l.pid column is the pid recorded in last_reservation for the
# node; the loop below reads it as $row{'pid'}.
#
$db_result =
    DBQueryFatal("select n.*,l.pid from nodes as n ".
                 "left join reserved as r on n.node_id=r.node_id ".
                 "left join last_reservation as l on n.node_id=l.node_id ".
                 "where r.pid='$pid' and r.eid='$eid'");

# Nothing to set up; this is not treated as an error (exit status 0).
if ($db_result->numrows < 1) {
    print "There are no nodes in experiment '$eid' in project '$pid'.\n";
    exit;
}

135
#
# Walk every node reserved to the experiment. For each one: classify it
# as physical or virtual, verify that the boot files, deltas, RPMs and
# tarfiles named in its nodes table row exist, and decide whether it can
# simply be rebooted (%reboots) or must have a disk image loaded first
# (%reloads, via SetupReload). Any inconsistency is fatal.
#
while (my %row = $db_result->fetchhash()) {
    my $node     = $row{'node_id'};
    my $osid     = $row{'def_boot_osid'};
    my $type     = $row{'type'};
    my $bootpath = 0;
    my $jailnode = 0;
    my $jailed;

    # True when the node was last reserved by a different project and
    # last-load cleaning is enabled; such a node gets a fresh image even
    # if the wanted OS is already on its disk.
    my $needclean = ($dolastload &&
                     defined($row{'pid'}) && $row{'pid'} ne $pid);

    #
    # VIRTNODE HACK: Virtual nodes are special. Jailed vnodes can do quite
    # a bit, and so run them through the checks below. Non-jailed vnodes
    # are only recorded in %vnodes.
    #
    if (TBIsNodeVirtual($node, \$jailed)) {
        $vnodes{$node} = $jailed;
        if (! $jailed) {
            next;
        }
        $jailnode = 1;
    }
    else {
        $nodes{$node}   = $node;
        $reboots{$node} = 1;
    }
    $osids{$node} = $osid;

    #
    # Make sure the files specified in the paths exist. We mount the
    # user tftp directory on boss node, so we can ignore the IP address,
    # and just check the path directly. Only a default boot path counts
    # as "the user supplied a boot path" for the OSID checks below.
    #
    foreach my $field ('def_boot_path', 'next_boot_path') {
        my $path = $row{$field};

        if (!defined($path) || $path eq "") {
            next;
        }

        # Strip a leading "<ip>:" if there is one.
        if ($path =~ /^[0-9\.]+:(\/.*)$/) {
            $path = $1;
        }

        # Path must begin with $TFTP. $TFTP already starts with a slash,
        # so it is matched literally rather than behind another "/".
        if ($path !~ /^\Q$TFTP\E\//) {
            die("*** File $path for node $node must reside in $TFTP\n");
        }
        if (! -f $path) {
            die("*** File $path for node $node does not exist!");
        }
        if ($field eq 'def_boot_path') {
            $bootpath = 1;
        }
    }

    #
    # XXX - Check for existence of the delta files. We do this here
    # cause its easier than looking for a failure later, when the node
    # tries to install the delta. Not a general solution though. Needs
    # more thought. An undefined column is treated as an empty list.
    #
    foreach my $delta (split(":",
                             defined($row{'deltas'}) ? $row{'deltas'} : "")) {
        if (! -f $delta) {
            die("*** Delta file $delta for node $node does not exist!");
        }
    }

    #
    # XXX - Ditto for RPMs.
    #
    foreach my $rpm (split(":",
                           defined($row{'rpms'}) ? $row{'rpms'} : "")) {
        if (! -f $rpm) {
            die("*** RPM $rpm for node $node does not exist!");
        }
    }

    #
    # XXX - Ditto for tarfiles. Each entry is "<dir> <tarfile>"; only the
    # tarfile is checked here.
    #
    foreach my $tarspec (split(":",
                               defined($row{'tarballs'}) ?
                               $row{'tarballs'} : "")) {
        my (undef, $tar) = split(" ", $tarspec);

        if (! -f $tar) {
            die("*** Tarfile $tar for node $node does not exist!");
        }
    }

    #
    # If there is a path specified, then we don't worry anymore about it.
    # The user must know what is going on. The OSID might have a path
    # associated with it, which means the same thing; we don't worry about
    # it.
    #
    if (!$bootpath && !$jailnode) {
        #
        # These checks are not necessary if the front end and web page
        # are doing the right thing, but lets be careful anyway.
        #
        if (! $osid) {
            die("*** $node has no bootpath and no def_boot_osid set!\n");
        }

        #
        # Grab the info for this OSID. This is part of the image check.
        #
        my $osid_result =
            DBQueryFatal("select * from os_info where osid='$osid'");

        if ($osid_result->numrows == 0) {
            die("*** No such OSID $osid is defined!\n");
        }

        my %osid_row = $osid_result->fetchhash();

        #
        # If there is an actual path, its an OSKit kernel not an image.
        #
        if (! defined($osid_row{'path'}) || $osid_row{'path'} eq "") {
            #
            # Not an OSKit kernel.
            # Make sure this OSID is actually loaded on the machine.
            #
            my $p_result =
                DBQueryFatal("select * from partitions ".
                             "where node_id='$node' and osid='$osid'");

            if ($p_result->numrows != 0) {
                #
                # OSID is loaded, but might need to be cleaned.
                #
                if ($needclean) {
                    SetupReload($node, $osid, $type);
                }
            }
            elsif (defined($osid_row{'version'}) &&
                   $osid_row{'version'} ne "") {
                #
                # User wants a specific version of an OS, but its not
                # loaded on the machine.
                #
                SetupReload($node, $osid, $type);
            }
            else {
                #
                # A non-specific version. Try to map it to an OSID that
                # is on one of the node's partitions and has the same OS
                # as the one asked for.
                #
                my $o_result =
                    DBQueryFatal("select o1.* from os_info as o1 ".
                                 "left join partitions as p ".
                                 "  on o1.osid=p.osid ".
                                 "left join os_info as o2 ".
                                 "  on o2.OS=o1.OS ".
                                 "where p.node_id='$node' ".
                                 "  and o2.osid='$osid'");

                if ($o_result->numrows == 0) {
                    die("*** $0:\n".
                        "    No mapping can be made for $osid on $node!\n".
                        "    Perhaps the disk needs reloading?\n");
                }

                my %o_row  = $o_result->fetchhash();
                my $n_osid = $o_row{'osid'};

                print "Mapping $osid on $node to $n_osid.\n";

                if ($needclean) {
                    SetupReload($node, $n_osid, $type);
                }
                else {
                    system("$osselect $n_osid $node") and
                        die("*** Could not set boot OS to ".
                            "$n_osid for $node\n");
                }
                $osids{$node} = $n_osid;
            }
        }
    }

    #
    # Set the canfail bit. Currently only nodes of type "dnard" are
    # canfail=1. Will come from DB at some point.
    #
    $canfail{$node} = ($type eq "dnard") ? 1 : 0;

    print STDERR "$node - $osids{$node} - $canfail{$node}\n"
        if $dbg;
}
360

361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
#
# Jailed virtual nodes boot whatever their hosting physical node boots.
# This runs after the loop over the experiment's nodes so that
# $osids{<physnode>} is already filled in. We rely on the fact that the
# user is not allowed to set the OS for jailed nodes or for the physnodes
# that are hosting jailed nodes, and the node_types table has the right
# stuff. Non-jailed vnodes are left alone.
#
foreach my $vnode (grep { $vnodes{$_} } keys(%vnodes)) {
    my $pnode;

    if (! TBPhysNodeID($vnode, \$pnode)) {
        die("*** $0:\n".
            "    Cannot determine phys_nodeid for $vnode!\n");
    }

    # Copy the host's OSID onto the vnode, both in the DB (through
    # os_select) and in our own table.
    my $host_osid = $osids{$pnode};

    if (system("$osselect $host_osid $vnode")) {
        die("*** Could not set boot OS to $host_osid for $vnode\n");
    }
    $osids{$vnode} = $host_osid;
}

389
#
# We need to issue the reboots and the reloads in parallel. Each command
# is run in its own child (see ForkCmd); the command line doubles as the
# key under which the child's process id is remembered, so a failure can
# be reported by name. Nothing is run in TESTMODE.
#
TBDebugTimeStamp("rebooting/reloading started");
if (!$TESTMODE) {
    my %children = ();

    # One node_reboot for all of the nodes that only need rebooting.
    if (keys(%reboots)) {
        my $cmd = "$nodereboot " . join(" ", keys(%reboots));

        $children{$cmd} = ForkCmd($cmd);
    }

    # One os_load per image, started five seconds apart.
    foreach my $imageid ( keys(%reloads) ) {
        my @list = @{ $reloads{$imageid} };
        my $cmd  = "$os_load -m $imageid @list";

        sleep(5);
        $children{$cmd} = ForkCmd($cmd);
    }

    # Collect every child; a nonzero wait status counts as one failure.
    foreach my $cmd ( keys(%children) ) {
        my $childpid = $children{$cmd};

        waitpid($childpid, 0);
        if ($?) {
            $failed++;
            print "*** Failed: $cmd\n";
        }
    }
}
TBDebugTimeStamp("rebooting/reloading finished");
sleep(2);
423

424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
#
# XXX What happens if something above fails? We could exit, but some nodes
# that *are* rebooting would be caught in the middle. For the nodes that
# were reloaded, we can check the state right away (and avoid the wait
# below as well); they should be in the ISUP state when os_load is
# finished.  If not, thats a failure and we can save some time below.  For
# plain reboot failures, nothing to do but find out below after the wait.
# I do not want to exit right away cause we might end up with a lot more
# power cycles since the nodes are very likely to be in a non responsive
# state if just rebooted!
#
foreach my $imageid ( keys(%reloads) ) {
    foreach my $node ( @{ $reloads{$imageid} } ) {
        my $state;

        if (!TBGetNodeEventState($node, \$state)) {
            print "*** Error getting event state for $node!\n";
            $failed++;
            delete($nodes{$node});
            # $state was never filled in; do not go on to compare it, or
            # this node would be counted as failed a second time.
            next;
        }
        if ($state ne TBDB_NODESTATE_ISUP) {
            print "*** Not waiting for $node since its reload failed!\n";
            $failed++;
            delete($nodes{$node});
        }
    }
}
# Remaining nodes we need to wait for.
my @nodelist = keys(%nodes);

456
#
# Now lets wait for them to come back alive. Set up a retry list though
# so that we can give each node at least one second chance. Avoids
# pointless experiment failures.
#
if (@nodelist) {
    print "Waiting for local testbed nodes to finish rebooting ...\n";
}

my %retries;    # Node => reboot retries it has left.
my %waitstart;  # Node => time() at which its current wait began.
foreach my $node ( @nodelist ) {
    $retries{$node}   = 1;
    $waitstart{$node} = time;
}

TBDebugTimeStamp("Local node waiting started");
while ( @nodelist ) {
    my $node   = shift(@nodelist);
    my $wstart = $waitstart{$node};

    # A false return is taken to mean the node reached ISUP.
    # NOTE(review): (60*7) is presumably the wait limit in seconds,
    # measured from $wstart -- confirm against TBNodeStateWait.
    if (!TBNodeStateWait($node, TBDB_NODESTATE_ISUP, $wstart, (60*7))) {
        print "$node is alive and well\n";
        SetNodeBootStatus($node, NODEBOOTSTATUS_OKAY);
        next;
    }

    # Not up. Use the node's retry, if it has one left: reboot it and
    # put it back at the end of the list with a fresh start time.
    if ($retries{$node}) {
        $retries{$node} -= 1;

        print "*** Rebooting $node and waiting again ...\n";

        if (system("$nodereboot $node") == 0) {
            push(@nodelist, $node);
            $waitstart{$node} = time;
            next;
        }
        # Fall through on failure.
    }

    SetNodeBootStatus($node, NODEBOOTSTATUS_FAILED);

    print "*** WARNING: $node may be down.\n".
          "    This has been reported to testbed-ops.\n";

    if ($canfail{$node}) {
        # Nonfatal: send mail to the user, with testbed-ops on the Cc
        # line, and carry on without counting a failure.
        my ($user) = getpwuid($UID);

        SENDMAIL($user, "Node $node is down",
                 "Node $node in pid/eid $pid/$eid appears to be dead.\n\n".
                 "Your experiment will continue to run since this failure\n".
                 "is nonfatal, although you might encounter other problems\n".
                 "if your experiment depends explicitly on this node.\n".
                 "You should terminate this experiment if it cannot ".
                 "tolerate this failure.\n\n".
                 "Testbed Operations has also been notified so they can ".
                 "investigate.\n\n".
                 "Thanks\n".
                 "Testbed Operations\n",
                 0,
                 "Cc: $TBOPS");

        print "*** Continuing with experiment setup anyway ...\n";
        next;
    }

    # Reserve it to down experiment.
    MarkNodeDown($node);

    # Send mail to testbed-ops about it
    SENDMAIL($TBOPS, "Node $node is down",
             "Node $node in pid/eid $pid/$eid appears to be dead.\n\n".
             "$node has been taken out of the pool until this matter ".
             "is resolved.\n");

    print "*** Experiment will be terminated automatically.\n";
    $failed++;
}
TBDebugTimeStamp("Local node waiting finished");
536

537
538
539
540
541
542
#
# Now deal with virtual nodes.
#
# We do this in a sub script since nodes are not owned by the user
# and so must be setuid root so that ssh will work.
#
# XXX - Don't bother if something above failed. A waste of time and
# usually leads to cascading errors.
#
my @vnodelist = keys(%vnodes);

if ($failed && @vnodelist) {
    print "*** Skipping virtual node setup since there were previous ".
        "failures!\n";
}
elsif (@vnodelist) {
    print "Setting up virtual testbed nodes ...\n";
    system("$vnode_setup $pid $eid");

    if ($?) {
        die("*** $0:\n".
            "    Vnode setup failed!\n");
    }

    # The wait for every vnode starts now.
    foreach my $node (@vnodelist) {
        $waitstart{$node} = time;
    }

    print "Waiting for virtual testbed nodes to finish setting up ...\n";

    TBDebugTimeStamp("Virtual node waiting started");
    while ( @vnodelist ) {
        my $node   = shift(@vnodelist);
        my $wstart = $waitstart{$node};

        # A false return is taken to mean the vnode reached ISUP. Unlike
        # the physical nodes, vnodes get no reboot retry.
        if (!TBNodeStateWait($node, TBDB_NODESTATE_ISUP, $wstart, (60*3))) {
            print "$node is alive and well\n";
            SetNodeBootStatus($node, NODEBOOTSTATUS_OKAY);
            next;
        }

        SetNodeBootStatus($node, NODEBOOTSTATUS_FAILED);

        print "*** WARNING: $node may be down.\n".
              "    This has been reported to testbed-ops.\n";

        if ($canfail{$node}) {
            # Nonfatal: send mail to the user, with testbed-ops on the
            # Cc line, and carry on without counting a failure.
            my ($user) = getpwuid($UID);

            SENDMAIL($user, "Virtual Node $node is down",
                 "Node $node in pid/eid $pid/$eid appears to be dead.\n\n".
                 "Your experiment will continue to run since this failure\n".
                 "is nonfatal, although you might encounter other problems\n".
                 "if your experiment depends explicitly on this node.\n".
                 "You should terminate this experiment if it cannot ".
                 "tolerate this failure.\n\n".
                 "Testbed Operations has also been notified so they can ".
                 "investigate.\n\n".
                 "Thanks\n".
                 "Testbed Operations\n",
                 0,
                 "Cc: $TBOPS");

            print "*** Continuing with experiment setup anyway ...\n";
            next;
        }

        # Reserve it to down experiment.
        # MarkNodeDown($node);

        # Send mail to testbed-ops about it
        SENDMAIL($TBOPS, "Virtual Node $node is down",
                 "Virtual node $node in pid/eid $pid/$eid appears to be ".
                 "unresponsive.\n\n");

        print "*** Experiment will be terminated automatically.\n";
        $failed++;
    }
    TBDebugTimeStamp("Virtual node waiting finished");
}

619
print "OS Setup Done!\n";
TBDebugTimeStamp("os_setup finished");
# The exit status is the number of failures. Only the low eight bits of
# an exit status reach the parent, so cap it at 255; otherwise exactly
# 256 failures would be reported as success.
exit(($failed > 255) ? 255 : $failed);
622

623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
#
# Look up the imageid that the osidtoimageid table gives for an OSID on a
# node type.
#
# usage: TBMapOSIDtoImageID(char *osid, char *type)
#        returns the imageid from the first matching row;
#        returns 0 if there is no matching row.
#
sub TBMapOSIDtoImageID($$)
{
    my ($osid, $type) = @_;

    my $sql    = "select imageid from osidtoimageid ".
                 "where type='$type' and osid='$osid'";
    my $result = DBQueryFatal($sql);

    return 0
        if ($result->numrows == 0);

    # Only the first column of the first row is of interest.
    my @fields = $result->fetchrow_array();

    return $fields[0];
}

642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
#
# Schedule a disk reload for a node, provided an image can be found for
# the OSID on that node type; dies otherwise.
#
# The node is appended to the list kept in %reloads under the imageid
# (a hash of lists, one list of nodes per image) and is taken out of
# %reboots.
#
# usage: SetupReload(char *node, char *osid, char *type)
#
sub SetupReload($$$)
{
    my ($node, $osid, $type) = @_;
    my $imageid = TBMapOSIDtoImageID($osid, $type);

    if (! $imageid) {
        die("*** $0:\n".
            "    No image can be found for $osid on $node!\n");
    }

    # push creates the list the first time an imageid is seen.
    push(@{ $reloads{$imageid} }, $node);
    delete $reboots{$node};
}

665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
#
# Fork a process to run a command. Return the pid to wait on.
#
# usage: ForkCmd(char *cmd)
#        In the parent, returns the process id of the child; dies if the
#        fork itself fails.
#        The child runs $cmd through system() and exits. Its exit status
#        is the command's exit code, or 1 if the command could not be
#        started or was killed by a signal, so the parent always sees a
#        nonzero wait status when the command did not succeed.
#
sub ForkCmd($) {
    my ($cmd) = @_;
    my $mypid = fork();

    # fork returns undef on failure. That must not be mistaken for the
    # zero returned in the child, or the parent itself would run the
    # command below and then exit.
    if (!defined($mypid)) {
        die("*** $0:\n".
            "    Could not fork to run '$cmd': $!\n");
    }
    if ($mypid) {
        return $mypid;
    }

    # Child from here on.
    if ($dbg) {
        print STDERR "Forking command: $cmd\n";
    }

    system($cmd);
    my $status = $?;

    # -1 means the command could not be run at all; a nonzero low seven
    # bits means it died from a signal. In both cases the exit code in
    # the high byte cannot be trusted to be nonzero.
    if ($status == -1 || ($status & 127)) {
        exit(1);
    }
    exit($status >> 8);
}